Compare commits
7 Commits
fc65595bce
...
transcaffe
Author | SHA1 | Date | |
---|---|---|---|
e707dbc31c
|
|||
f84fed4867
|
|||
23fdcb107f
|
|||
6382f2af7e
|
|||
0b5092f34d
|
|||
97ee8826ba
|
|||
aebcd29475
|
@ -7,6 +7,10 @@ metrics or alerting.
|
||||
|
||||
## Roles
|
||||
|
||||
- [`alertmanager`](roles/alertmanager/README.md): Runs prometheus'
|
||||
alertmanager for receiving alerts from prometheus and routing them
|
||||
to the correct configured receivers.
|
||||
|
||||
- [`matrix-alertmanager`](roles/matrix-alertmanager/README.md): An
|
||||
alertmanager receiver which posts alerts to a configured matrix channel
|
||||
using alertmanager's webhooks.
|
||||
|
@ -3,12 +3,11 @@ name: observability
|
||||
version: 0.0.1
|
||||
readme: README.md
|
||||
authors:
|
||||
- Johanna Dorothea Reichmann <transcaffeine@finallycoffee.eu>
|
||||
- transcaffeine <transcaffeine@finally.coffee>
|
||||
description: Various ansible roles useful for automating infrastructure
|
||||
dependencies:
|
||||
"community.docker": "^1.10.0"
|
||||
license:
|
||||
- CNPLv7+
|
||||
license_file: LICENSE.md
|
||||
build_ignore:
|
||||
- '*.tar.gz'
|
||||
repository: https://git.finally.coffee/finallycoffee/observability
|
||||
|
3
meta/runtime.yml
Normal file
3
meta/runtime.yml
Normal file
@ -0,0 +1,3 @@
|
||||
---
|
||||
|
||||
requires_ansible: ">=2.12"
|
6
playbooks/loki.yml
Normal file
6
playbooks/loki.yml
Normal file
@ -0,0 +1,6 @@
|
||||
---
|
||||
- name: Install loki
|
||||
hosts: "{{ loki_hosts | default('loki') }}"
|
||||
become: true
|
||||
roles:
|
||||
- role: finallycoffee.observability.loki
|
10
roles/alertmanager/README.md
Normal file
10
roles/alertmanager/README.md
Normal file
@ -0,0 +1,10 @@
|
||||
# `finallycoffee.observability.alertmanager` ansible role
|
||||
|
||||
## Description
|
||||
|
||||
This role configures and runs prometheus alertmanager in a docker container.
|
||||
|
||||
The config file is templated on the host and persisted in `alertmanager_config_file`.
|
||||
|
||||
The alertmanager config can be passed by setting `alertmanager_config`, which expects the same yaml
|
||||
format as the "normal" alertmanager config file (with top-level keys `global`, `route` and `receivers`).
|
40
roles/alertmanager/defaults/main.yml
Normal file
40
roles/alertmanager/defaults/main.yml
Normal file
@ -0,0 +1,40 @@
|
||||
---
|
||||
|
||||
alertmanager_user: alertmanager
|
||||
alertmanager_version: 0.25.0
|
||||
alertmanager_base_path: /opt/alertmanager
|
||||
alertmanager_config_path: "{{ alertmanager_base_path }}/config"
|
||||
alertmanager_config_file: "{{ alertmanager_config_path }}/alertmanager.yml"
|
||||
alertmanager_data_path: "{{ alertmanager_base_path }}/data"
|
||||
|
||||
alertmanager_container_name: alertmanager
|
||||
alertmanager_container_image_name: alertmanager
|
||||
alertmanager_container_image_namespace: prometheus/
|
||||
alertmanager_container_image_registry: quay.io
|
||||
|
||||
alertmanager_container_image_repository: >-
|
||||
{{
|
||||
(container_registries[alertmanager_container_image_registry] | default(alertmanager_container_image_registry))
|
||||
+ '/' + (alertmanager_container_image_namespace | default(''))
|
||||
+ alertmanager_container_image_name
|
||||
}}
|
||||
alertmanager_container_image_reference: >-
|
||||
{{
|
||||
alertmanager_container_image_repository + ':'
|
||||
+ (alertmanager_container_image_tag | default('v' + alertmanager_version))
|
||||
}}
|
||||
|
||||
alertmanager_container_image_force_pull: "{{ alertmanager_container_image_tag is defined }}"
|
||||
|
||||
alertmanager_container_default_volumes:
|
||||
- "{{ alertmanager_config_file }}:/etc/alertmanager/alertmanager.yml:ro"
|
||||
- "{{ alertmanager_data_path }}:/alertmanager:rw"
|
||||
alertmanager_container_volumes: >-
|
||||
{{ alertmanager_container_default_volumes
|
||||
+ alertmanager_container_extra_volumes | default([]) }}
|
||||
alertmanager_container_restart_policy: "unless-stopped"
|
||||
|
||||
alertmanager_config:
|
||||
global: {}
|
||||
route: {}
|
||||
receivers: []
|
8
roles/alertmanager/handlers/main.yml
Normal file
8
roles/alertmanager/handlers/main.yml
Normal file
@ -0,0 +1,8 @@
|
||||
---
|
||||
|
||||
- name: Ensure alertmanager is restarted
|
||||
community.docker.docker_container:
|
||||
name: "{{ alertmanager_container_name }}"
|
||||
state: started
|
||||
restart: true
|
||||
listen: restart-alertmanager
|
51
roles/alertmanager/tasks/main.yml
Normal file
51
roles/alertmanager/tasks/main.yml
Normal file
@ -0,0 +1,51 @@
|
||||
---
|
||||
|
||||
- name: Ensure alertmanager user '{{ alertmanager_user }}' exists
|
||||
ansible.builtin.user:
|
||||
name: "{{ alertmanager_user }}"
|
||||
state: present
|
||||
system: true
|
||||
register: alertmanager_user_info
|
||||
|
||||
- name: Ensure mounts are created
|
||||
ansible.builtin.file:
|
||||
dest: "{{ item.path }}"
|
||||
state: directory
|
||||
owner: "{{ item.owner | default(alertmanager_user_info.uid | default(alertmanager_user)) }}"
|
||||
# A per-item `group` override wins; otherwise fall back to the group reported for the registered user, then to the user name.
group: "{{ item.group | default(alertmanager_user_info.group | default(alertmanager_user)) }}"
|
||||
mode: "{{ item.mode | default('0755') }}"
|
||||
loop:
|
||||
- path: "{{ alertmanager_base_path }}"
|
||||
- path: "{{ alertmanager_data_path }}"
|
||||
- path: "{{ alertmanager_config_path }}"
|
||||
|
||||
- name: Ensure config file is templated
|
||||
ansible.builtin.copy:
|
||||
dest: "{{ alertmanager_config_file }}"
|
||||
content: "{{ alertmanager_config | to_nice_yaml }}"
|
||||
owner: "{{ alertmanager_user_info.uid | default(alertmanager_user) }}"
|
||||
# Group ownership of the config file: group reported for the registered user, else the user name.
group: "{{ alertmanager_user_info.group | default(alertmanager_user) }}"
|
||||
mode: "0640"
|
||||
notify:
|
||||
- restart-alertmanager
|
||||
|
||||
- name: Ensure container image is present on host
|
||||
community.docker.docker_image:
|
||||
name: "{{ alertmanager_container_image_reference }}"
|
||||
state: present
|
||||
source: pull
|
||||
force_source: "{{ alertmanager_container_image_force_pull | bool }}"
|
||||
|
||||
- name: Ensure container '{{ alertmanager_container_name }}' is running
|
||||
community.docker.docker_container:
|
||||
name: "{{ alertmanager_container_name }}"
|
||||
image: "{{ alertmanager_container_image_reference }}"
|
||||
env: "{{ alertmanager_container_env | default(omit) }}"
|
||||
user: "{{ alertmanager_user_info.uid | default(alertmanager_user) }}"
|
||||
ports: "{{ alertmanager_container_ports | default(omit) }}"
|
||||
volumes: "{{ alertmanager_container_volumes | default(omit) }}"
|
||||
networks: "{{ alertmanager_container_networks | default(omit) }}"
|
||||
purge_networks: "{{ alertmanager_container_purge_networks | default(omit) }}"
|
||||
etc_hosts: "{{ alertmanager_container_etc_hosts | default(omit) }}"
|
||||
restart_policy: "{{ alertmanager_container_restart_policy }}"
|
||||
state: started
|
24
roles/cadvisor/README.md
Normal file
24
roles/cadvisor/README.md
Normal file
@ -0,0 +1,24 @@
|
||||
# `finallycoffee.observability.cadvisor` ansible role
|
||||
|
||||
## Overview
|
||||
|
||||
Deploys [cadvisor](https://github.com/google/cadvisor/), a daemon
|
||||
for collecting and exporting information about running (docker)
|
||||
containers in a docker container.
|
||||
|
||||
## Configuration
|
||||
|
||||
In order to scrape `/metrics` of running containers, it is recommended
|
||||
to expose the default port of cadvisor to the host using
|
||||
```yaml
|
||||
cadvisor_container_ports:
|
||||
- "127.0.0.1:8080:8080"
|
||||
```
|
||||
so that cadvisor metrics are exposed at `http://127.0.0.1:8080/metrics`.
|
||||
|
||||
### Enabling/Disabling collection of metrics
|
||||
|
||||
By setting `cadvisor_disabled_metrics`, the collection of metrics
|
||||
can be disabled. The default list of disabled metrics is quite extensive,
|
||||
so when enabling a disabled-by-default metric, it is recommended to
|
||||
use `cadvisor_force_enable_metrics` instead, as it's empty by default.
|
53
roles/cadvisor/defaults/main.yml
Normal file
53
roles/cadvisor/defaults/main.yml
Normal file
@ -0,0 +1,53 @@
|
||||
---
|
||||
|
||||
cadvisor_version: 0.45.0
|
||||
|
||||
cadvisor_container_name: cadvisor
|
||||
cadvisor_container_image_name: gcr.io/cadvisor/cadvisor
|
||||
cadvisor_container_image_tag: ~
|
||||
cadvisor_container_image_ref: >-
|
||||
{{ cadvisor_container_image_name }}:{{ cadvisor_container_image_tag | default('v' + cadvisor_version, True) }}
|
||||
cadvisor_container_volumes: >-
|
||||
{{ cadvisor_container_base_volumes + cadvisor_container_extra_volumes | default([], True) }}
|
||||
cadvisor_container_extra_volumes: ~
|
||||
cadvisor_container_env: ~
|
||||
cadvisor_container_labels: "{{ cadvisor_container_base_labels | combine(cadvisor_container_extra_labels) }}"
|
||||
cadvisor_container_extra_labels: {}
|
||||
cadvisor_container_ports: ~
|
||||
cadvisor_container_networks: ~
|
||||
cadvisor_container_etc_hosts: ~
|
||||
cadvisor_container_devices: [ "/dev/kmsg:/dev/kmsg:rwm" ]
|
||||
# Passed to the container task's `privileged` option; written as a canonical YAML boolean.
cadvisor_container_privileged: true
|
||||
cadvisor_container_pid_mode: "host"
|
||||
cadvisor_container_userns_mode: "host"
|
||||
cadvisor_container_capabilities: ~
|
||||
cadvisor_container_restart_policy: unless-stopped
|
||||
cadvisor_container_command: >-2
|
||||
{{ ["--docker_only=false"]
|
||||
+ (["--disable_metrics=" + cadvisor_disabled_metrics | join( ',' )]
|
||||
if cadvisor_disabled_metrics | default(false, True) else [])
|
||||
+ (["--enable_metrics=" + cadvisor_force_enable_metrics | join( ',' )]
|
||||
if cadvisor_force_enable_metrics | default(false, True) else [])
|
||||
}}
|
||||
cadvisor_container_base_labels:
|
||||
version: "{{ cadvisor_version }}"
|
||||
cadvisor_container_base_volumes:
|
||||
- "/:/rootfs:ro"
|
||||
- "/var/run:/var/run:ro"
|
||||
- "/sys:/sys:ro"
|
||||
- "/var/lib/docker/:/var/lib/docker:ro"
|
||||
- "/dev/disk/:/dev/disk:ro"
|
||||
|
||||
cadvisor_disabled_metrics:
|
||||
- advtcp
|
||||
- cpu_topology
|
||||
- cpuset
|
||||
- hugetlb
|
||||
- memory_numa
|
||||
- process
|
||||
- referenced_memory
|
||||
- resctrl
|
||||
- sched
|
||||
- tcp
|
||||
- udp
|
||||
cadvisor_force_enable_metrics: []
|
26
roles/cadvisor/tasks/main.yml
Normal file
26
roles/cadvisor/tasks/main.yml
Normal file
@ -0,0 +1,26 @@
|
||||
---
|
||||
|
||||
- name: Ensure container image is present
|
||||
# Fully qualified module name, matching the alertmanager and loki roles.
community.docker.docker_image:
|
||||
name: "{{ cadvisor_container_image_ref }}"
|
||||
state: present
|
||||
source: pull
|
||||
# Force a re-pull whenever an explicit image tag is configured. A plain `| bool` cast would map tag strings such as "latest" to false.
force_source: "{{ cadvisor_container_image_tag | default('', true) | string | length > 0 }}"
|
||||
|
||||
- name: Ensure cadvisor container is running
|
||||
# Fully qualified module name, matching the alertmanager and loki roles.
community.docker.docker_container:
|
||||
name: "{{ cadvisor_container_name }}"
|
||||
image: "{{ cadvisor_container_image_ref }}"
|
||||
env: "{{ cadvisor_container_env | default(omit, True) }}"
|
||||
ports: "{{ cadvisor_container_ports | default(omit, True) }}"
|
||||
labels: "{{ cadvisor_container_labels }}"
|
||||
devices: "{{ cadvisor_container_devices }}"
|
||||
volumes: "{{ cadvisor_container_volumes }}"
|
||||
networks: "{{ cadvisor_container_networks | default(omit, True) }}"
|
||||
etc_hosts: "{{ cadvisor_container_etc_hosts | default(omit, True) }}"
|
||||
privileged: "{{ cadvisor_container_privileged }}"
|
||||
command: "{{ cadvisor_container_command }}"
|
||||
pid_mode: "{{ cadvisor_container_pid_mode | default(omit, True) }}"
|
||||
userns_mode: "{{ cadvisor_container_userns_mode | default(omit, True) }}"
|
||||
restart_policy: "{{ cadvisor_container_restart_policy }}"
|
||||
state: started
|
14
roles/loki/README.md
Normal file
14
roles/loki/README.md
Normal file
@ -0,0 +1,14 @@
|
||||
# `finallycoffee.observability.loki` ansible role
|
||||
|
||||
## Overview
|
||||
|
||||
Runs [loki](https://github.com/grafana/loki) in a docker container.
|
||||
|
||||
## Configuration
|
||||
|
||||
Listens on port `3100` by default; this can be changed using `loki_config_server_http_listen_port` / `loki_config_server_http_listen_address`.
|
||||
|
||||
### Required configuration
|
||||
|
||||
Loki's storage config can be provided in `loki_config_storage_config`,
|
||||
the schema configs can be provided in `loki_config_schema_config_configs`.
|
45
roles/loki/defaults/main.yml
Normal file
45
roles/loki/defaults/main.yml
Normal file
@ -0,0 +1,45 @@
|
||||
---
|
||||
loki_user: loki
|
||||
loki_version: "2.9.1"
|
||||
loki_base_path: "/opt/loki"
|
||||
loki_config_path: "{{ loki_base_path }}/config"
|
||||
loki_config_file: "{{ loki_config_path }}/config.yml"
|
||||
loki_state: present
|
||||
loki_container_state: >-
|
||||
{{ (loki_state == 'present') | ternary('started', 'absent') }}
|
||||
loki_run_user: "{{ loki_user_info.uid | default(loki_user) }}"
|
||||
loki_run_group: "{{ loki_user_info.group | default(loki_user) }}"
|
||||
loki_all_host_directories: >-
|
||||
{{ loki_required_host_directories + loki_host_directories | default([]) }}
|
||||
loki_required_host_directories:
|
||||
- path: "{{ loki_base_path }}"
|
||||
- path: "{{ loki_config_path }}"
|
||||
|
||||
loki_container_name: loki
|
||||
loki_container_image_name: "loki"
|
||||
loki_container_image_namespace: "grafana"
|
||||
loki_container_image_registry: "docker.io"
|
||||
loki_container_image_tag: ~
|
||||
loki_container_image: >-
|
||||
{{ loki_container_image_registry
|
||||
+ (('/' + loki_container_image_namespace)
|
||||
if loki_container_image_namespace | default(false, true) else '')
|
||||
+ '/' + loki_container_image_name
|
||||
+ ':' + (loki_container_image_tag | default(loki_version, true))
|
||||
}}
|
||||
loki_container_env: {}
|
||||
loki_container_base_volumes:
|
||||
- "{{ loki_config_file }}:/etc/loki/local-config.yaml:ro"
|
||||
loki_container_all_volumes: >-2
|
||||
{{ loki_container_base_volumes + loki_container_volumes | default([]) }}
|
||||
loki_container_all_labels: >-2
|
||||
{{ loki_container_base_labels | combine(loki_container_labels | default({})) }}
|
||||
loki_container_restart_policy: "unless-stopped"
|
||||
|
||||
loki_config_target: "all"
|
||||
loki_config_auth_enabled: false
|
||||
loki_config_server_http_listen_port: 3100
|
||||
loki_config_server_http_listen_address: 127.0.0.1
|
||||
loki_config_storage_config: {}
|
||||
loki_config_schema_config_configs: []
|
||||
# Rendered as `limits_config` in the generated loki config; that section is a mapping, so default to an empty one.
loki_config_limits_config: {}
|
10
roles/loki/handlers/main.yml
Normal file
10
roles/loki/handlers/main.yml
Normal file
@ -0,0 +1,10 @@
|
||||
---
|
||||
- name: Ensure loki is reloaded
|
||||
listen: loki_reload
|
||||
community.docker.docker_container:
|
||||
name: "{{ loki_container_name }}"
|
||||
state: "started"
|
||||
restart: true
|
||||
force_kill: true
|
||||
kill_signal: "HUP"
|
||||
when: "loki_container_state in ['started', 'present']"
|
56
roles/loki/tasks/main.yml
Normal file
56
roles/loki/tasks/main.yml
Normal file
@ -0,0 +1,56 @@
|
||||
---
|
||||
- name: Ensure loki user '{{ loki_user }}' is {{ loki_state }}
|
||||
ansible.builtin.user:
|
||||
name: "{{ loki_user }}"
|
||||
state: "{{ loki_state }}"
|
||||
system: true
|
||||
create_home: false
|
||||
register: loki_user_info
|
||||
|
||||
- name: Ensure loki host directories are {{ loki_state }}
|
||||
ansible.builtin.file:
|
||||
path: "{{ item.path }}"
|
||||
state: >-
|
||||
{{ (loki_state == 'present') | ternary('directory', 'absent') }}
|
||||
owner: "{{ item.owner | default(loki_run_user) }}"
|
||||
group: "{{ item.group | default(loki_run_group) }}"
|
||||
mode: "{{ item.mode | default('0755') }}"
|
||||
loop: "{{ loki_all_host_directories }}"
|
||||
|
||||
- name: Ensure loki configuration file is templated
|
||||
ansible.builtin.copy:
|
||||
content: "{{ loki_final_config | to_nice_yaml(width=10000, indent=2) }}"
|
||||
dest: "{{ loki_config_file }}"
|
||||
owner: "{{ loki_run_user }}"
|
||||
group: "{{ loki_run_group }}"
|
||||
mode: "0640"
|
||||
notify: loki_reload
|
||||
|
||||
- name: Ensure loki container image '{{ loki_container_image }}' is {{ loki_state }}
|
||||
community.docker.docker_image:
|
||||
name: "{{ loki_container_image }}"
|
||||
state: "{{ loki_state }}"
|
||||
source: "pull"
|
||||
force_source: >-
|
||||
{{ loki_container_image_force_source
|
||||
| default(loki_container_image_tag | default('', true) | string | length > 0) }}
|
||||
register: loki_container_image_info
|
||||
until: loki_container_image_info is success
|
||||
retries: 3
|
||||
delay: 5
|
||||
|
||||
- name: Ensure loki container '{{ loki_container_name }}' is {{ loki_container_state }}
|
||||
community.docker.docker_container:
|
||||
name: "{{ loki_container_name }}"
|
||||
image: "{{ loki_container_image }}"
|
||||
env: "{{ loki_container_env }}"
|
||||
user: "{{ loki_run_user }}:{{ loki_run_group }}"
|
||||
ports: "{{ loki_container_ports | default(omit, true) }}"
|
||||
labels: "{{ loki_container_all_labels }}"
|
||||
volumes: "{{ loki_container_all_volumes }}"
|
||||
networks: "{{ loki_container_networks | default(omit, true) }}"
|
||||
purge_networks: "{{ loki_container_purge_networks | default(omit, true) }}"
|
||||
etc_hosts: "{{ loki_container_etc_hosts | default(omit, true) }}"
|
||||
hostname: "{{ loki_container_hostname | default(omit, true) }}"
|
||||
restart_policy: "{{ loki_container_restart_policy }}"
|
||||
state: "{{ loki_container_state }}"
|
14
roles/loki/vars/main.yml
Normal file
14
roles/loki/vars/main.yml
Normal file
@ -0,0 +1,14 @@
|
||||
loki_default_config:
|
||||
target: "{{ loki_config_target }}"
|
||||
auth_enabled: "{{ loki_config_auth_enabled }}"
|
||||
server:
|
||||
http_listen_address: "{{ loki_config_server_http_listen_address }}"
|
||||
http_listen_port: "{{ loki_config_server_http_listen_port }}"
|
||||
storage_config: "{{ loki_config_storage_config }}"
|
||||
limits_config: "{{ loki_config_limits_config }}"
|
||||
schema_config:
|
||||
configs: "{{ loki_config_schema_config_configs }}"
|
||||
loki_final_config: >-
|
||||
{{ loki_default_config | combine(loki_config | default({}), recursive=True) }}
|
||||
loki_container_base_labels:
|
||||
version: "{{ loki_version }}"
|
Reference in New Issue
Block a user