xoxys.prometheus/defaults/main.yml

138 lines
4.4 KiB
YAML

---
# Prometheus version (presumably the release the role installs - confirm
# against the role's tasks). Quoted so it can never be read as a number.
prometheus_version: "2.27.0"

# User name; the home directory and the group default to values derived
# from it.
prometheus_user: "prometheus_adm"
prometheus_user_home: "/home/{{ prometheus_user }}"
prometheus_group: "{{ prometheus_user }}"
prometheus_extra_groups: []
prometheus_packages: []

# Directory layout: conf/ and data/ sit below the base directory, while
# rules/ and file_sd/ sit below the config directory.
prometheus_base_dir: "/opt/prometheus"
prometheus_config_dir: "{{ prometheus_base_dir }}/conf"
prometheus_rules_dir: "{{ prometheus_config_dir }}/rules"
prometheus_file_sd_dir: "{{ prometheus_config_dir }}/file_sd"
prometheus_data_dir: "{{ prometheus_base_dir }}/data"
prometheus_read_only_dirs: []
# Bind address and port (loopback only by default).
prometheus_web_bind_ip: "127.0.0.1"
prometheus_web_bind_port: 61000
# Derived from prometheus_web_bind_port so that overriding the port alone
# keeps the URL in sync; renders to "http://localhost:61000/" by default.
# NOTE(review): the scheme stays "http" even when TLS is enabled - override
# this variable in that case.
prometheus_web_external_url: "http://localhost:{{ prometheus_web_bind_port }}/"

# TLS is off by default. The *_path values live below
# prometheus_base_dir/tls; the *_source values are bare file names
# (presumably looked up on the controller - confirm against the role's tasks).
prometheus_web_tls_enabled: false
prometheus_web_tls_cert_path: "{{ prometheus_base_dir }}/tls/certs/mycert.pem"
prometheus_web_tls_key_path: "{{ prometheus_base_dir }}/tls/private/mykey.pem"
prometheus_web_tls_cert_source: "mycert.pem"
prometheus_web_tls_key_source: "mykey.pem"
# @var prometheus_web_http_server:description: See official [documentation](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).
# @var prometheus_web_http_server: $ "_unset_"
# @var prometheus_web_basic_auth_users:description: See official [documentation](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).
# @var prometheus_web_basic_auth_users: $ "_unset_"
prometheus_log_level: "error"
# Two retention settings: a duration ("30d") and a size ("0").
# NOTE(review): "0" presumably means no size limit - confirm against the
# Prometheus storage documentation.
prometheus_storage_retention: "30d"
prometheus_storage_retention_size: "0"
# @var prometheus_storage_retention_size:description: >
# __[EXPERIMENTAL]__ Maximum number of bytes that can be stored for blocks.
# Units supported: KB, MB, GB, TB, PB.
# @end
prometheus_config_flags_extra: []
# Empty by default. Each entry is a mapping with a `name` and a `value`
# key, as shown in the example below.
# @var prometheus_config_flags_extra:example: >
# prometheus_config_flags_extra:
#   - name: alertmanager.timeout
#     value: 10s
# @end
prometheus_alertmanager_config: []
# Empty by default. The example below shows one list entry with nested
# `basic_auth` and `static_configs` sections.
# @var prometheus_alertmanager_config:example: >
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: alertmanager/
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"
# @end
prometheus_alert_relabel_configs: []
# Empty by default. Each entry is one relabel rule, as in the example below.
# @var prometheus_alert_relabel_configs:example: >
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica
# @end
prometheus_global:
  # Three duration settings; presumably rendered into the `global` section
  # of the generated Prometheus configuration - confirm against the role's
  # config template.
  scrape_interval: "15s"
  scrape_timeout: "10s"
  evaluation_interval: "15s"
prometheus_remote_write: []
# Empty by default. Each entry carries a `url` and, optionally, nested
# settings such as `basic_auth`, as in the example below.
# @var prometheus_remote_write:example: >
# prometheus_remote_write:
#   - url: https://dev.kausal.co/prom/push
#     basic_auth:
#       password: FOO
# @end
prometheus_remote_read: []
# Empty by default. Each entry carries a `url` and, optionally, nested
# settings such as `basic_auth`, as in the example below.
# @var prometheus_remote_read:example: >
# prometheus_remote_read:
#   - url: https://demo.cloudalchemy.org:9201/read
#     basic_auth:
#       password: FOO
# @end
prometheus_external_labels:
  # Fallback chain for the label value: ansible_fqdn, then ansible_host,
  # then inventory_hostname.
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
prometheus_targets: []
# Empty by default. Each entry has a `name` and a `config` list; every
# `config` item groups `targets` with the `labels` applied to them, as in
# the example below.
# @var prometheus_targets:example: >
# prometheus_targets:
#   - name: node
#     config:
#       - targets:
#           - localhost:9100
#         labels:
#           env: test
# @end
prometheus_scrape_configs:
  # Default job: a single static target built from this host's own name.
  - job_name: "prometheus"
    # Prefix "/metrics" with the path component of the external URL, unless
    # that path is empty or a single character such as "/".
    metrics_path: "{{ prometheus_web_external_url | urlsplit('path') if (prometheus_web_external_url | urlsplit('path')) | length > 1 else '' }}/metrics"
    static_configs:
      - targets:
          # The port follows prometheus_web_bind_port instead of a hard-coded
          # 61000, so this job keeps working when the port is overridden.
          # NOTE(review): the default bind address is 127.0.0.1 - confirm
          # the host name used here resolves to an address that is served.
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:{{ prometheus_web_bind_port }}"
# Relative glob patterns for rule files and static target files
# (presumably resolved on the controller - confirm against the role's tasks).
prometheus_alert_rules_files:
  - "prometheus/rules/*.rules"
prometheus_static_targets_files:
  - "prometheus/targets/*.yml"
  - "prometheus/targets/*.json"
prometheus_alert_rules:
  # Always-firing rule; per its own description it exists to prove that the
  # alerting pipeline works end to end.
  - alert: "Watchdog"
    expr: "vector(1)"
    for: "10m"
    labels:
      severity: "warning"
    annotations:
      description: >-
        This is an alert meant to ensure that the entire alerting pipeline is functional.
        This alert is always firing, therefore it should always be firing in Alertmanager
        and always fire against a receiver. There are integrations with various notification
        mechanisms that send a notification when this alert is not firing.
      summary: "Ensure entire alerting pipeline is functional"
  # Critical alert for any target whose `up` value has been 0 for 5 minutes.
  # The raw/endraw markers keep Jinja from expanding the `$labels`
  # placeholders.
  - alert: "InstanceDown"
    expr: "up == 0"
    for: "5m"
    labels:
      severity: "critical"
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} down{% endraw %}"
# Empty by default (presumably merged with prometheus_alert_rules by the
# role - confirm against the rules template).
prometheus_alert_rules_extra: []