---
# Variables for deploying Prometheus. Lines starting with "# @var" / "# @end"
# are documentation annotations attached to the named variable; keep them
# next to the variable they describe.

# Release and service account.
prometheus_version: "2.27.0"
prometheus_user: "prometheus_adm"
prometheus_user_home: "/home/{{ prometheus_user }}"
prometheus_group: "{{ prometheus_user }}"
prometheus_extra_groups: []
prometheus_packages: []

# Directory layout; every path below derives from prometheus_base_dir.
prometheus_base_dir: "/opt/prometheus"
prometheus_config_dir: "{{ prometheus_base_dir }}/conf"
prometheus_rules_dir: "{{ prometheus_config_dir }}/rules"
prometheus_file_sd_dir: "{{ prometheus_config_dir }}/file_sd"
prometheus_data_dir: "{{ prometheus_base_dir }}/data"
prometheus_read_only_dirs: []

# Web endpoint.
prometheus_web_bind_ip: "127.0.0.1"
prometheus_web_bind_port: 9090
prometheus_web_external_url: ""

# @var prometheus_web_config:description: See official [documentation](https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md).
prometheus_web_config:
  tls_server_config: {}
  http_server_config: {}
  basic_auth_users: {}

# Storage retention.
prometheus_storage_retention: "30d"
prometheus_storage_retention_size: "0"
# @var prometheus_storage_retention_size:description: >
# __[EXPERIMENTAL]__ Maximum number of bytes that can be stored for blocks.
# Units supported: KB, MB, GB, TB, PB.
# @end

prometheus_config_flags_extra: []
# @var prometheus_config_flags_extra:example: >
# prometheus_config_flags_extra:
#   - name: storage.tsdb.retention
#     value: 15d
#   - name: alertmanager.timeout
#     value: 10s
# @end

prometheus_alertmanager_config: []
# @var prometheus_alertmanager_config:example: >
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: alertmanager/
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"
# @end

prometheus_alert_relabel_configs: []
# @var prometheus_alert_relabel_configs:example: >
# prometheus_alert_relabel_configs:
#   - action: labeldrop
#     regex: replica
# @end

prometheus_global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s

prometheus_remote_write: []
# @var prometheus_remote_write:example: >
# prometheus_remote_write:
#   - url: https://dev.kausal.co/prom/push
#     basic_auth:
#       password: FOO
# @end

prometheus_remote_read: []
# @var prometheus_remote_read:example: >
# prometheus_remote_read:
#   - url: https://demo.cloudalchemy.org:9201/read
#     basic_auth:
#       password: FOO
# @end

prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"

prometheus_targets: []
# @var prometheus_targets:example: >
# prometheus_targets:
#   - name: node
#     config:
#       - targets:
#           - localhost:9100
#         labels:
#           env: test
# @end

# Self-scrape job. The metrics path is prefixed with the path component of
# prometheus_web_external_url. The target port follows
# prometheus_web_bind_port instead of a literal 9090, so the job keeps
# matching when the port is overridden.
# NOTE(review): assumes prometheus_web_bind_port is the port Prometheus
# actually listens on - confirm against the service template.
prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "{{ prometheus_web_external_url | urlsplit('path') }}/metrics"
    static_configs:
      - targets:
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:{{ prometheus_web_bind_port }}"

prometheus_alert_rules_files:
  - "prometheus/rules/*.rules"

prometheus_static_targets_files:
  - "prometheus/targets/*.yml"
  - "prometheus/targets/*.json"

# Built-in alert rules. The "{% raw %}" wrappers stop Jinja from rendering the
# "{{ $labels.* }}" placeholders, so they are emitted verbatim.
prometheus_alert_rules:
  - alert: Watchdog
    expr: vector(1)
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        This is an alert meant to ensure that the entire alerting pipeline
        is functional. This alert is always firing, therefore it should
        always be firing in Alertmanager and always fire against a receiver.
        There are integrations with various notification mechanisms that
        send a notification when this alert is not firing.
      summary: "Ensure entire alerting pipeline is functional"
  - alert: InstanceDown
    expr: "up == 0"
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} down{% endraw %}"

prometheus_alert_rules_extra: []