xoxys.prometheus/defaults/main.yml
2021-05-29 13:45:35 +02:00

126 lines
3.9 KiB
YAML

---
# Prometheus version; quoted so it is always loaded as a string.
prometheus_version: "2.27.0"

# Account settings. The home directory and the group are both derived from
# the user name unless overridden.
prometheus_user: "prometheus_adm"
prometheus_user_home: "/home/{{ prometheus_user }}"
prometheus_group: "{{ prometheus_user }}"

# Both lists are empty by default. Presumably additional groups for the user
# and extra OS packages to install - confirm against the role's tasks.
prometheus_extra_groups: []
prometheus_packages: []
# Directory layout. The config and data directories hang off
# prometheus_base_dir; the rules and file_sd directories hang off
# prometheus_config_dir.
prometheus_base_dir: "/opt/prometheus"
prometheus_config_dir: "{{ prometheus_base_dir }}/conf"
prometheus_rules_dir: "{{ prometheus_config_dir }}/rules"
prometheus_file_sd_dir: "{{ prometheus_config_dir }}/file_sd"
prometheus_data_dir: "{{ prometheus_base_dir }}/data"

# Web endpoint: listen address in host:port form, and the external URL,
# which is empty by default.
prometheus_web_listen_address: "0.0.0.0:9090"
prometheus_web_external_url: ""
# @var prometheus_web_config:description: See official [documentation](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).
prometheus_web_config:
  # All three sections default to empty mappings.
  tls_server_config: {}
  http_server_config: {}
  basic_auth_users: {}
# Retention limits: a duration and a size, both given as strings.
prometheus_storage_retention: "30d"
prometheus_storage_retention_size: "0"
# @var prometheus_storage_retention_size:description: >
# __[EXPERIMENTAL]__ Maximum number of bytes that can be stored for blocks.
# Units supported: KB, MB, GB, TB, PB.
# @end
prometheus_config_flags_extra: []
# @var prometheus_config_flags_extra:example: >
# prometheus_config_flags_extra:
#   - name: storage.tsdb.retention
#     value: 15d
#   - name: alertmanager.timeout
#     value: 10s
# @end
prometheus_alertmanager_config: []
# @var prometheus_alertmanager_config:example: >
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: alertmanager/
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"
# @end
prometheus_alert_relabel_configs: []
# @var prometheus_alert_relabel_configs:example: >
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica
# @end
prometheus_global:
  # Durations are plain strings; quoted to match the other duration
  # defaults in this file.
  scrape_interval: "15s"
  scrape_timeout: "10s"
  evaluation_interval: "15s"
prometheus_remote_write: []
# @var prometheus_remote_write:example: >
# prometheus_remote_write:
#   - url: https://dev.kausal.co/prom/push
#     basic_auth:
#       password: FOO
# @end
prometheus_remote_read: []
# @var prometheus_remote_read:example: >
# prometheus_remote_read:
#   - url: https://demo.cloudalchemy.org:9201/read
#     basic_auth:
#       password: FOO
# @end
prometheus_external_labels:
  # Resolves to the first defined value of ansible_fqdn, ansible_host and
  # inventory_hostname, in that order.
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
prometheus_targets: []
# @var prometheus_targets:example: >
# prometheus_targets:
#   - name: node
#     config:
#       - targets:
#           - localhost:9100
#         labels:
#           env: test
# @end
prometheus_scrape_configs:
  # Job that scrapes the Prometheus server itself.
  - job_name: "prometheus"
    # Path component of prometheus_web_external_url followed by "/metrics".
    # Trailing slashes are stripped first, so an external URL ending in
    # ".../prometheus/" yields "/prometheus/metrics" rather than
    # "/prometheus//metrics". An empty external URL still gives "/metrics".
    metrics_path: "{{ prometheus_web_external_url | urlsplit('path') | regex_replace('/+$', '') }}/metrics"
    static_configs:
      - targets:
          # Host: first defined value of ansible_fqdn, ansible_host, then the
          # literal "localhost". Port: the part of
          # prometheus_web_listen_address after the last ":", so the job
          # follows a customised listen port instead of a fixed 9090.
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:{{ prometheus_web_listen_address.split(':')[-1] }}"
# Glob for rule files, located under prometheus_rules_dir.
prometheus_alert_rules_files:
  - "{{ prometheus_rules_dir }}/*.rules"

# Globs for static target files. These are relative paths; presumably they
# are resolved by the role's file lookup - confirm against the tasks.
prometheus_static_targets_files:
  - "prometheus/targets/*.yml"
  - "prometheus/targets/*.json"
# Default alerting rules.
prometheus_alert_rules:
  # Always-true expression (vector(1)); see the description text below.
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      # Literal block scalar, trailing newline stripped: the value keeps the
      # line breaks exactly as written here.
      description: |-
        This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing. For example the
        "DeadMansSnitch" integration in PagerDuty.
      summary: "Ensure entire alerting pipeline is functional"
  # Fires when "up == 0" has held for 5 minutes.
  - alert: InstanceDown
    expr: "up == 0"
    for: 5m
    labels:
      severity: critical
    annotations:
      # The raw/endraw tags keep Ansible from rendering the Prometheus label
      # placeholders; they must reach the rule file unchanged.
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} down{% endraw %}"

# Empty by default; presumably merged with prometheus_alert_rules by the
# role - confirm against the rules template.
prometheus_alert_rules_extra: []