Compare commits
7 Commits
fc65595bce
...
transcaffe
Author | SHA1 | Date | |
---|---|---|---|
e707dbc31c
|
|||
f84fed4867
|
|||
23fdcb107f
|
|||
6382f2af7e
|
|||
0b5092f34d
|
|||
97ee8826ba
|
|||
aebcd29475
|
@ -7,6 +7,10 @@ metrics or alerting.
|
|||||||
|
|
||||||
## Roles
|
## Roles
|
||||||
|
|
||||||
|
- [`alertmanager`](roles/alertmanager/README.md): Runs prometheus'
|
||||||
|
alertmanager for receiving alerts from prometheus and routing them
|
||||||
|
to the correct configured receivers.
|
||||||
|
|
||||||
- [`matrix-alertmanager`](roles/matrix-alertmanager/README.md): An alert-
|
- [`matrix-alertmanager`](roles/matrix-alertmanager/README.md): An alert-
|
||||||
manager receiver which posts alerts to a configured matrix channel
|
manager receiver which posts alerts to a configured matrix channel
|
||||||
using alertmanager's webhooks.
|
using alertmanager's webhooks.
|
||||||
|
@ -3,12 +3,11 @@ name: observability
|
|||||||
version: 0.0.1
|
version: 0.0.1
|
||||||
readme: README.md
|
readme: README.md
|
||||||
authors:
|
authors:
|
||||||
- Johanna Dorothea Reichmann <transcaffeine@finallycoffee.eu>
|
- transcaffeine <transcaffeine@finally.coffee>
|
||||||
description: Various ansible roles useful for automating infrastructure
|
description: Various ansible roles useful for automating infrastructure
|
||||||
dependencies:
|
dependencies:
|
||||||
"community.docker": "^1.10.0"
|
"community.docker": "^1.10.0"
|
||||||
license:
|
license_file: LICENSE.md
|
||||||
- CNPLv7+
|
|
||||||
build_ignore:
|
build_ignore:
|
||||||
- '*.tar.gz'
|
- '*.tar.gz'
|
||||||
repository: https://git.finally.coffee/finallycoffee/observability
|
repository: https://git.finally.coffee/finallycoffee/observability
|
||||||
|
3
meta/runtime.yml
Normal file
3
meta/runtime.yml
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
requires_ansible: ">=2.12"
|
6
playbooks/loki.yml
Normal file
6
playbooks/loki.yml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
---
|
||||||
|
- name: Install loki
|
||||||
|
hosts: "{{ loki_hosts | default('loki') }}"
|
||||||
|
become: true
|
||||||
|
roles:
|
||||||
|
- role: finallycoffee.observability.loki
|
10
roles/alertmanager/README.md
Normal file
10
roles/alertmanager/README.md
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# `finallycoffee.observability.alertmanager` ansible role
|
||||||
|
|
||||||
|
## Description
|
||||||
|
|
||||||
|
This role configures and runs prometheus alertmanager in a docker container.
|
||||||
|
|
||||||
|
The config file is templated on the host and persisted in `alertmanager_config_file`.
|
||||||
|
|
||||||
|
The alertmanager config can be passed by setting `alertmanager_config`, which expects the same yaml
|
||||||
|
format as the "normal" alertmanager config file (with top-level keys `global`, `route` and `receivers`).
|
40
roles/alertmanager/defaults/main.yml
Normal file
40
roles/alertmanager/defaults/main.yml
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
alertmanager_user: alertmanager
|
||||||
|
alertmanager_version: 0.25.0
|
||||||
|
alertmanager_base_path: /opt/alertmanager
|
||||||
|
alertmanager_config_path: "{{ alertmanager_base_path }}/config"
|
||||||
|
alertmanager_config_file: "{{ alertmanager_config_path }}/alertmanager.yml"
|
||||||
|
alertmanager_data_path: "{{ alertmanager_base_path }}/data"
|
||||||
|
|
||||||
|
alertmanager_container_name: alertmanager
|
||||||
|
alertmanager_container_image_name: alertmanager
|
||||||
|
alertmanager_container_image_namespace: prometheus/
|
||||||
|
alertmanager_container_image_registry: quay.io
|
||||||
|
|
||||||
|
alertmanager_container_image_repository: >-
|
||||||
|
{{
|
||||||
|
(container_registries[alertmanager_container_image_registry] | default(alertmanager_container_image_registry))
|
||||||
|
+ '/' + (alertmanager_container_image_namespace | default(''))
|
||||||
|
+ alertmanager_container_image_name
|
||||||
|
}}
|
||||||
|
alertmanager_container_image_reference: >-
|
||||||
|
{{
|
||||||
|
alertmanager_container_image_repository + ':'
|
||||||
|
+ (alertmanager_container_image_tag | default('v' + alertmanager_version))
|
||||||
|
}}
|
||||||
|
|
||||||
|
alertmanager_container_image_force_pull: "{{ alertmanager_container_image_tag is defined }}"
|
||||||
|
|
||||||
|
alertmanager_container_default_volumes:
|
||||||
|
- "{{ alertmanager_config_file }}:/etc/alertmanager/alertmanager.yml:ro"
|
||||||
|
- "{{ alertmanager_data_path }}:/alertmanager:rw"
|
||||||
|
alertmanager_container_volumes: >-
|
||||||
|
{{ alertmanager_container_default_volumes
|
||||||
|
+ alertmanager_container_extra_volumes | default([]) }}
|
||||||
|
alertmanager_container_restart_policy: "unless-stopped"
|
||||||
|
|
||||||
|
alertmanager_config:
|
||||||
|
global: {}
|
||||||
|
route: {}
|
||||||
|
receivers: []
|
8
roles/alertmanager/handlers/main.yml
Normal file
8
roles/alertmanager/handlers/main.yml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
- name: Ensure alertmanager is restarted
|
||||||
|
community.docker.docker_container:
|
||||||
|
name: "{{ alertmanager_container_name }}"
|
||||||
|
state: started
|
||||||
|
restart: true
|
||||||
|
listen: restart-alertmanager
|
51
roles/alertmanager/tasks/main.yml
Normal file
51
roles/alertmanager/tasks/main.yml
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
- name: Ensure alertmanager user '{{ alertmanager_user }}' exists
|
||||||
|
ansible.builtin.user:
|
||||||
|
name: "{{ alertmanager_user }}"
|
||||||
|
state: present
|
||||||
|
system: true
|
||||||
|
register: alertmanager_user_info
|
||||||
|
|
||||||
|
- name: Ensure mounts are created
|
||||||
|
ansible.builtin.file:
|
||||||
|
dest: "{{ item.path }}"
|
||||||
|
state: directory
|
||||||
|
owner: "{{ item.owner | default(alertmanager_user_info.uid | default(alertmanager_user)) }}"
|
||||||
|
group: "{{ item.group | default(alertmanager_user_info.group | default(alertmanager_user)) }}"
|
||||||
|
mode: "{{ item.mode | default('0755') }}"
|
||||||
|
loop:
|
||||||
|
- path: "{{ alertmanager_base_path }}"
|
||||||
|
- path: "{{ alertmanager_data_path }}"
|
||||||
|
- path: "{{ alertmanager_config_path }}"
|
||||||
|
|
||||||
|
- name: Ensure config file is templated
|
||||||
|
ansible.builtin.copy:
|
||||||
|
dest: "{{ alertmanager_config_file }}"
|
||||||
|
content: "{{ alertmanager_config | to_nice_yaml }}"
|
||||||
|
owner: "{{ alertmanager_user_info.uid | default(alertmanager_user) }}"
|
||||||
|
group: "{{ alertmanager_user_info.group | default(alertmanager_user) }}"
|
||||||
|
mode: "0640"
|
||||||
|
notify:
|
||||||
|
- restart-alertmanager
|
||||||
|
|
||||||
|
- name: Ensure container image is present on host
|
||||||
|
community.docker.docker_image:
|
||||||
|
name: "{{ alertmanager_container_image_reference }}"
|
||||||
|
state: present
|
||||||
|
source: pull
|
||||||
|
force_source: "{{ alertmanager_container_image_force_pull | bool }}"
|
||||||
|
|
||||||
|
- name: Ensure container '{{ alertmanager_container_name }}' is running
|
||||||
|
community.docker.docker_container:
|
||||||
|
name: "{{ alertmanager_container_name }}"
|
||||||
|
image: "{{ alertmanager_container_image_reference }}"
|
||||||
|
env: "{{ alertmanager_container_env | default(omit) }}"
|
||||||
|
user: "{{ alertmanager_user_info.uid | default(alertmanager_user) }}"
|
||||||
|
ports: "{{ alertmanager_container_ports | default(omit) }}"
|
||||||
|
volumes: "{{ alertmanager_container_volumes | default(omit) }}"
|
||||||
|
networks: "{{ alertmanager_container_networks | default(omit) }}"
|
||||||
|
purge_networks: "{{ alertmanager_container_purge_networks | default(omit) }}"
|
||||||
|
etc_hosts: "{{ alertmanager_container_etc_hosts | default(omit) }}"
|
||||||
|
restart_policy: "{{ alertmanager_container_restart_policy }}"
|
||||||
|
state: started
|
24
roles/cadvisor/README.md
Normal file
24
roles/cadvisor/README.md
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# `finallycoffee.observability.cadvisor` ansible role
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Deploys [cadvisor](https://github.com/google/cadvisor/), a daemon
|
||||||
|
for collecting and exporting information about running (docker)
|
||||||
|
containers in a docker container.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
In order to scrape `/metrics` of running containers, it is recommended
|
||||||
|
to expose the default port of cadvisor to the host using
|
||||||
|
```yaml
|
||||||
|
cadvisor_container_ports:
|
||||||
|
- "127.0.0.1:8080:8080"
|
||||||
|
```
|
||||||
|
so that cadvisor metrics are exposed at `http://127.0.0.1:8080/metrics`.
|
||||||
|
|
||||||
|
### Enabling/Disabling collection of metrics
|
||||||
|
|
||||||
|
By setting `cadvisor_disabled_metrics`, the collection of metrics
|
||||||
|
can be disabled. The default list of disabled metrics is quite extensive,
|
||||||
|
so when enabling a disabled-by-default metric, it is recommended to
|
||||||
|
use `cadvisor_force_enable_metrics` instead, as it's empty by default.
|
53
roles/cadvisor/defaults/main.yml
Normal file
53
roles/cadvisor/defaults/main.yml
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
cadvisor_version: 0.45.0
|
||||||
|
|
||||||
|
cadvisor_container_name: cadvisor
|
||||||
|
cadvisor_container_image_name: gcr.io/cadvisor/cadvisor
|
||||||
|
cadvisor_container_image_tag: ~
|
||||||
|
cadvisor_container_image_ref: >-
|
||||||
|
{{ cadvisor_container_image_name }}:{{ cadvisor_container_image_tag | default('v' + cadvisor_version, True) }}
|
||||||
|
cadvisor_container_volumes: >-
|
||||||
|
{{ cadvisor_container_base_volumes + cadvisor_container_extra_volumes | default([], True) }}
|
||||||
|
cadvisor_container_extra_volumes: ~
|
||||||
|
cadvisor_container_env: ~
|
||||||
|
cadvisor_container_labels: "{{ cadvisor_container_base_labels | combine(cadvisor_container_extra_labels) }}"
|
||||||
|
cadvisor_container_extra_labels: {}
|
||||||
|
cadvisor_container_ports: ~
|
||||||
|
cadvisor_container_networks: ~
|
||||||
|
cadvisor_container_etc_hosts: ~
|
||||||
|
cadvisor_container_devices: [ "/dev/kmsg:/dev/kmsg:rwm" ]
|
||||||
|
cadvisor_container_privileged: yes
|
||||||
|
cadvisor_container_pid_mode: "host"
|
||||||
|
cadvisor_container_userns_mode: "host"
|
||||||
|
cadvisor_container_capabilities: ~
|
||||||
|
cadvisor_container_restart_policy: unless-stopped
|
||||||
|
cadvisor_container_command: >-2
|
||||||
|
{{ ["--docker_only=false"]
|
||||||
|
+ (["--disable_metrics=" + cadvisor_disabled_metrics | join( ',' )]
|
||||||
|
if cadvisor_disabled_metrics | default(false, True) else [])
|
||||||
|
+ (["--enable_metrics=" + cadvisor_force_enable_metrics | join( ',' )]
|
||||||
|
if cadvisor_force_enable_metrics | default(false, True) else [])
|
||||||
|
}}
|
||||||
|
cadvisor_container_base_labels:
|
||||||
|
version: "{{ cadvisor_version }}"
|
||||||
|
cadvisor_container_base_volumes:
|
||||||
|
- "/:/rootfs:ro"
|
||||||
|
- "/var/run:/var/run:ro"
|
||||||
|
- "/sys:/sys:ro"
|
||||||
|
- "/var/lib/docker/:/var/lib/docker:ro"
|
||||||
|
- "/dev/disk/:/dev/disk:ro"
|
||||||
|
|
||||||
|
cadvisor_disabled_metrics:
|
||||||
|
- advtcp
|
||||||
|
- cpu_topology
|
||||||
|
- cpuset
|
||||||
|
- hugetlb
|
||||||
|
- memory_numa
|
||||||
|
- process
|
||||||
|
- referenced_memory
|
||||||
|
- resctrl
|
||||||
|
- sched
|
||||||
|
- tcp
|
||||||
|
- udp
|
||||||
|
cadvisor_force_enable_metrics: []
|
26
roles/cadvisor/tasks/main.yml
Normal file
26
roles/cadvisor/tasks/main.yml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
- name: Ensure container image is present
|
||||||
|
docker_image:
|
||||||
|
name: "{{ cadvisor_container_image_ref }}"
|
||||||
|
state: present
|
||||||
|
source: pull
|
||||||
|
force_source: "{{ cadvisor_container_image_tag|default(False, True) | bool }}"
|
||||||
|
|
||||||
|
- name: Ensure cadvisor container is running
|
||||||
|
docker_container:
|
||||||
|
name: "{{ cadvisor_container_name }}"
|
||||||
|
image: "{{ cadvisor_container_image_ref }}"
|
||||||
|
env: "{{ cadvisor_container_env | default(omit, True) }}"
|
||||||
|
ports: "{{ cadvisor_container_ports | default(omit, True) }}"
|
||||||
|
labels: "{{ cadvisor_container_labels }}"
|
||||||
|
devices: "{{ cadvisor_container_devices }}"
|
||||||
|
volumes: "{{ cadvisor_container_volumes }}"
|
||||||
|
networks: "{{ cadvisor_container_networks | default(omit, True) }}"
|
||||||
|
etc_hosts: "{{ cadvisor_container_etc_hosts | default(omit, True) }}"
|
||||||
|
privileged: "{{ cadvisor_container_privileged }}"
|
||||||
|
command: "{{ cadvisor_container_command }}"
|
||||||
|
pid_mode: "{{ cadvisor_container_pid_mode | default(omit, True) }}"
|
||||||
|
userns_mode: "{{ cadvisor_container_userns_mode | default(omit, True) }}"
|
||||||
|
restart_policy: "{{ cadvisor_container_restart_policy }}"
|
||||||
|
state: started
|
14
roles/loki/README.md
Normal file
14
roles/loki/README.md
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# `finallycoffee.observability.loki` ansible role
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Runs [loki](https://github.com/grafana/loki) in a docker container.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Listens on port `3100` by default; the port and listen address can be changed using `loki_config_server_http_listen_port` / `loki_config_server_http_listen_address`.
|
||||||
|
|
||||||
|
### Required configuration
|
||||||
|
|
||||||
|
Loki's storage config can be provided in `loki_config_storage_config`,
|
||||||
|
the schema configs can be provided in `loki_config_schema_config_configs`.
|
45
roles/loki/defaults/main.yml
Normal file
45
roles/loki/defaults/main.yml
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
---
|
||||||
|
loki_user: loki
|
||||||
|
loki_version: "2.9.1"
|
||||||
|
loki_base_path: "/opt/loki"
|
||||||
|
loki_config_path: "{{ loki_base_path }}/config"
|
||||||
|
loki_config_file: "{{ loki_config_path }}/config.yml"
|
||||||
|
loki_state: present
|
||||||
|
loki_container_state: >-
|
||||||
|
{{ (loki_state == 'present') | ternary('started', 'absent') }}
|
||||||
|
loki_run_user: "{{ loki_user_info.uid | default(loki_user) }}"
|
||||||
|
loki_run_group: "{{ loki_user_info.group | default(loki_user) }}"
|
||||||
|
loki_all_host_directories: >-
|
||||||
|
{{ loki_required_host_directories + loki_host_directories | default([]) }}
|
||||||
|
loki_required_host_directories:
|
||||||
|
- path: "{{ loki_base_path }}"
|
||||||
|
- path: "{{ loki_config_path }}"
|
||||||
|
|
||||||
|
loki_container_name: loki
|
||||||
|
loki_container_image_name: "loki"
|
||||||
|
loki_container_image_namespace: "grafana"
|
||||||
|
loki_container_image_registry: "docker.io"
|
||||||
|
loki_container_image_tag: ~
|
||||||
|
loki_container_image: >-
|
||||||
|
{{ loki_container_image_registry
|
||||||
|
+ (('/' + loki_container_image_namespace)
|
||||||
|
if loki_container_image_namespace | default(false, true) else '')
|
||||||
|
+ '/' + loki_container_image_name
|
||||||
|
+ ':' + (loki_container_image_tag | default(loki_version, true))
|
||||||
|
}}
|
||||||
|
loki_container_env: {}
|
||||||
|
loki_container_base_volumes:
|
||||||
|
- "{{ loki_config_file }}:/etc/loki/local-config.yaml:ro"
|
||||||
|
loki_container_all_volumes: >-2
|
||||||
|
{{ loki_container_base_volumes + loki_container_volumes | default([]) }}
|
||||||
|
loki_container_all_labels: >-2
|
||||||
|
{{ loki_container_base_labels | combine(loki_container_labels | default({})) }}
|
||||||
|
loki_container_restart_policy: "unless-stopped"
|
||||||
|
|
||||||
|
loki_config_target: "all"
|
||||||
|
loki_config_auth_enabled: false
|
||||||
|
loki_config_server_http_listen_port: 3100
|
||||||
|
loki_config_server_http_listen_address: 127.0.0.1
|
||||||
|
loki_config_storage_config: {}
|
||||||
|
loki_config_schema_config_configs: []
|
||||||
|
loki_config_limits_config: {}
|
10
roles/loki/handlers/main.yml
Normal file
10
roles/loki/handlers/main.yml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
---
|
||||||
|
- name: Ensure loki is reloaded
|
||||||
|
listen: loki_reload
|
||||||
|
community.docker.docker_container:
|
||||||
|
name: "{{ loki_container_name }}"
|
||||||
|
state: "started"
|
||||||
|
restart: true
|
||||||
|
force_kill: true
|
||||||
|
kill_signal: "HUP"
|
||||||
|
when: "loki_container_state in ['started', 'present']"
|
56
roles/loki/tasks/main.yml
Normal file
56
roles/loki/tasks/main.yml
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
---
|
||||||
|
- name: Ensure loki user '{{ loki_user }}' is {{ loki_state }}
|
||||||
|
ansible.builtin.user:
|
||||||
|
name: "{{ loki_user }}"
|
||||||
|
state: "{{ loki_state }}"
|
||||||
|
system: true
|
||||||
|
create_home: false
|
||||||
|
register: loki_user_info
|
||||||
|
|
||||||
|
- name: Ensure loki host directories are {{ loki_state }}
|
||||||
|
ansible.builtin.file:
|
||||||
|
path: "{{ item.path }}"
|
||||||
|
state: >-
|
||||||
|
{{ (loki_state == 'present') | ternary('directory', 'absent') }}
|
||||||
|
owner: "{{ item.owner | default(loki_run_user) }}"
|
||||||
|
group: "{{ item.group | default(loki_run_group) }}"
|
||||||
|
mode: "{{ item.mode | default('0755') }}"
|
||||||
|
loop: "{{ loki_all_host_directories }}"
|
||||||
|
|
||||||
|
- name: Ensure loki configuration file is templated
|
||||||
|
ansible.builtin.copy:
|
||||||
|
content: "{{ loki_final_config | to_nice_yaml(width=10000, indent=2) }}"
|
||||||
|
dest: "{{ loki_config_file }}"
|
||||||
|
owner: "{{ loki_run_user }}"
|
||||||
|
group: "{{ loki_run_group }}"
|
||||||
|
mode: "0640"
|
||||||
|
notify: loki_reload
|
||||||
|
|
||||||
|
- name: Ensure loki container image '{{ loki_container_image }}' is {{ loki_state }}
|
||||||
|
community.docker.docker_image:
|
||||||
|
name: "{{ loki_container_image }}"
|
||||||
|
state: "{{ loki_state }}"
|
||||||
|
source: "pull"
|
||||||
|
force_source: >-
|
||||||
|
{{ loki_container_image_force_source
|
||||||
|
| default(loki_container_image_tag | default(false, true)) }}
|
||||||
|
register: loki_container_image_info
|
||||||
|
until: loki_container_image_info is success
|
||||||
|
retries: 3
|
||||||
|
delay: 5
|
||||||
|
|
||||||
|
- name: Ensure loki container '{{ loki_container_name }}' is {{ loki_container_state }}
|
||||||
|
community.docker.docker_container:
|
||||||
|
name: "{{ loki_container_name }}"
|
||||||
|
image: "{{ loki_container_image }}"
|
||||||
|
env: "{{ loki_container_env }}"
|
||||||
|
user: "{{ loki_run_user }}:{{ loki_run_group }}"
|
||||||
|
ports: "{{ loki_container_ports | default(omit, true) }}"
|
||||||
|
labels: "{{ loki_container_all_labels }}"
|
||||||
|
volumes: "{{ loki_container_all_volumes }}"
|
||||||
|
networks: "{{ loki_container_networks | default(omit, true) }}"
|
||||||
|
purge_networks: "{{ loki_container_purge_networks | default(omit, true) }}"
|
||||||
|
etc_hosts: "{{ loki_container_etc_hosts | default(omit, true) }}"
|
||||||
|
hostname: "{{ loki_container_hostname | default(omit, true) }}"
|
||||||
|
restart_policy: "{{ loki_container_restart_policy }}"
|
||||||
|
state: "{{ loki_container_state }}"
|
14
roles/loki/vars/main.yml
Normal file
14
roles/loki/vars/main.yml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
loki_default_config:
|
||||||
|
target: "{{ loki_config_target }}"
|
||||||
|
auth_enabled: "{{ loki_config_auth_enabled }}"
|
||||||
|
server:
|
||||||
|
http_listen_address: "{{ loki_config_server_http_listen_address }}"
|
||||||
|
http_listen_port: "{{ loki_config_server_http_listen_port }}"
|
||||||
|
storage_config: "{{ loki_config_storage_config }}"
|
||||||
|
limits_config: "{{ loki_config_limits_config }}"
|
||||||
|
schema_config:
|
||||||
|
configs: "{{ loki_config_schema_config_configs }}"
|
||||||
|
loki_final_config: >-
|
||||||
|
{{ loki_default_config | combine(loki_config | default({}), recursive=True) }}
|
||||||
|
loki_container_base_labels:
|
||||||
|
version: "{{ loki_version }}"
|
Reference in New Issue
Block a user