7 Commits

21 changed files with 366 additions and 3 deletions

View File

@ -7,6 +7,10 @@ metrics or alerting.
## Roles

- [`alertmanager`](roles/alertmanager/README.md): Runs prometheus'
  alertmanager for receiving alerts from prometheus and routing them
  to the correct configured receivers.

- [`matrix-alertmanager`](roles/matrix-alertmanager/README.md): An alert-
  manager receiver which posts alerts to a configured matrix channel
  using alertmanagers' webhooks.

View File

@ -3,12 +3,11 @@ name: observability
version: 0.0.1 version: 0.0.1
readme: README.md readme: README.md
authors: authors:
- Johanna Dorothea Reichmann <transcaffeine@finallycoffee.eu> - transcaffeine <transcaffeine@finally.coffee>
description: Various ansible roles useful for automating infrastructure description: Various ansible roles useful for automating infrastructure
dependencies: dependencies:
"community.docker": "^1.10.0" "community.docker": "^1.10.0"
license: license_file: LICENSE.md
- CNPLv7+
build_ignore: build_ignore:
- '*.tar.gz' - '*.tar.gz'
repository: https://git.finally.coffee/finallycoffee/observability repository: https://git.finally.coffee/finallycoffee/observability

3
meta/runtime.yml Normal file
View File

@ -0,0 +1,3 @@
---
# Version constraint on ansible itself that this collection declares.
requires_ansible: '>=2.12'

6
playbooks/loki.yml Normal file
View File

@ -0,0 +1,6 @@
---
# Applies the loki role with privilege escalation.
# Target hosts come from `loki_hosts`; when that variable is not set the
# pattern falls back to `loki`.
- name: Install loki
  become: true
  hosts: "{{ loki_hosts | default('loki') }}"
  roles:
    - role: finallycoffee.observability.loki

View File

@ -0,0 +1,10 @@
# `finallycoffee.observability.alertmanager` ansible role
## Description
This role configures and runs prometheus alertmanager in a docker container.
The config file is templated on the host and persisted in `alertmanager_config_file`.
The alertmanager config can be passed by setting `alertmanager_config`, which expects the same yaml
format as the "normal" alertmanager config file (with top-level keys `global`, `route` and `receivers`).

View File

@ -0,0 +1,40 @@
---
# Default variables of the alertmanager role.

# User name handed to the role's user task.
alertmanager_user: "alertmanager"
# Version used to derive the default image tag ('v' + version).
alertmanager_version: "0.25.0"

# Host paths: config and data live below the base path.
alertmanager_base_path: "/opt/alertmanager"
alertmanager_config_path: "{{ alertmanager_base_path }}/config"
alertmanager_config_file: "{{ alertmanager_config_path }}/alertmanager.yml"
alertmanager_data_path: "{{ alertmanager_base_path }}/data"

# Container name and image coordinates.
alertmanager_container_name: "alertmanager"
alertmanager_container_image_name: "alertmanager"
# The namespace carries its own trailing slash; it is concatenated directly
# in front of the image name below.
alertmanager_container_image_namespace: "prometheus/"
alertmanager_container_image_registry: "quay.io"
# Registry is looked up in `container_registries` first and used verbatim
# when no such entry exists.
alertmanager_container_image_repository: >-
  {{
    (container_registries[alertmanager_container_image_registry] | default(alertmanager_container_image_registry))
    + '/' + (alertmanager_container_image_namespace | default(''))
    + alertmanager_container_image_name
  }}
# `alertmanager_container_image_tag` wins over the version-derived tag.
alertmanager_container_image_reference: >-
  {{
    alertmanager_container_image_repository + ':'
    + (alertmanager_container_image_tag | default('v' + alertmanager_version))
  }}
# True exactly when an explicit image tag variable is defined.
alertmanager_container_image_force_pull: "{{ alertmanager_container_image_tag is defined }}"

# Volumes: the defaults below plus `alertmanager_container_extra_volumes`
# (treated as an empty list when undefined).
alertmanager_container_default_volumes:
  - "{{ alertmanager_config_file }}:/etc/alertmanager/alertmanager.yml:ro"
  - "{{ alertmanager_data_path }}:/alertmanager:rw"
alertmanager_container_volumes: >-
  {{ alertmanager_container_default_volumes
  + alertmanager_container_extra_volumes | default([]) }}
alertmanager_container_restart_policy: "unless-stopped"

# Serialized with `to_nice_yaml` into `alertmanager_config_file` by the tasks.
alertmanager_config:
  global: {}
  route: {}
  receivers: []

View File

@ -0,0 +1,8 @@
---
# Handler subscribed to the `restart-alertmanager` topic: asks the docker
# module to restart the named container and leave it in the started state.
- name: Ensure alertmanager is restarted
  listen: restart-alertmanager
  community.docker.docker_container:
    name: "{{ alertmanager_container_name }}"
    restart: true
    state: started

View File

@ -0,0 +1,51 @@
---
# Creates the system user; the registered result is consulted by the
# following tasks for ownership and for the container's run user.
- name: Ensure alertmanager user '{{ alertmanager_user }}' exists
  register: alertmanager_user_info
  ansible.builtin.user:
    name: "{{ alertmanager_user }}"
    system: true
    state: present
# Creates the base, data and config directories on the host.
# Each loop item may override `owner`, `group` and `mode`; otherwise the
# uid/gid registered by the user task are used (falling back to the user
# name when the registered value is unavailable) and the mode is 0755.
- name: Ensure mounts are created
  ansible.builtin.file:
    dest: "{{ item.path }}"
    state: directory
    owner: "{{ item.owner | default(alertmanager_user_info.uid | default(alertmanager_user)) }}"
    # The group override is read from `item.group`, not `item.owner`.
    group: "{{ item.group | default(alertmanager_user_info.group | default(alertmanager_user)) }}"
    mode: "{{ item.mode | default('0755') }}"
  loop:
    - path: "{{ alertmanager_base_path }}"
    - path: "{{ alertmanager_data_path }}"
    - path: "{{ alertmanager_config_path }}"
# Renders `alertmanager_config` as YAML into the config file and notifies
# the restart handler when the file changes.
# Owner and group come from the registered user result, falling back to the
# user name; the file is not world-readable (0640).
- name: Ensure config file is templated
  ansible.builtin.copy:
    dest: "{{ alertmanager_config_file }}"
    content: "{{ alertmanager_config | to_nice_yaml }}"
    owner: "{{ alertmanager_user_info.uid | default(alertmanager_user) }}"
    group: "{{ alertmanager_user_info.group | default(alertmanager_user) }}"
    mode: "0640"
  notify:
    - restart-alertmanager
# Pulls the alertmanager image; the pull is forced when
# `alertmanager_container_image_force_pull` evaluates to true.
- name: Ensure container image is present on host
  community.docker.docker_image:
    name: "{{ alertmanager_container_image_reference }}"
    source: pull
    force_source: "{{ alertmanager_container_image_force_pull | bool }}"
    state: present
# Starts the alertmanager container as the uid registered by the user task.
# Every `default(omit)` argument is left out entirely unless the matching
# variable is defined.
- name: Ensure container '{{ alertmanager_container_name }}' is running
  community.docker.docker_container:
    name: "{{ alertmanager_container_name }}"
    image: "{{ alertmanager_container_image_reference }}"
    state: started
    restart_policy: "{{ alertmanager_container_restart_policy }}"
    user: "{{ alertmanager_user_info.uid | default(alertmanager_user) }}"
    env: "{{ alertmanager_container_env | default(omit) }}"
    volumes: "{{ alertmanager_container_volumes | default(omit) }}"
    ports: "{{ alertmanager_container_ports | default(omit) }}"
    networks: "{{ alertmanager_container_networks | default(omit) }}"
    purge_networks: "{{ alertmanager_container_purge_networks | default(omit) }}"
    etc_hosts: "{{ alertmanager_container_etc_hosts | default(omit) }}"

24
roles/cadvisor/README.md Normal file
View File

@ -0,0 +1,24 @@
# `finallycoffee.observability.cadvisor` ansible role
## Overview
Deploys [cadvisor](https://github.com/google/cadvisor/), a daemon
for collecting and exporting information about running (docker)
containers in a docker container.
## Configuration
In order to scrape `/metrics` of running containers, it is recommended
to expose the default port of cadvisor to the host using
```yaml
cadvisor_container_ports:
  - "127.0.0.1:8080:8080"
```
so that cadvisor metrics are exposed at `http://127.0.0.1:8080/metrics`.
### Enabling/Disabling collection of metrics
By setting `cadvisor_disabled_metrics`, the collection of metrics
can be disabled. The default list of disabled metrics is quite extensive,
so when enabling a disabled-by-default metric, it is recommended to
use `cadvisor_force_enable_metrics` instead, as it's empty by default.

View File

@ -0,0 +1,53 @@
---
# Default variables of the cadvisor role.

# Version used for the default image tag ('v' + version) and the `version`
# container label.
cadvisor_version: "0.45.0"

cadvisor_container_name: cadvisor
cadvisor_container_image_name: "gcr.io/cadvisor/cadvisor"
# Null means: derive the tag from `cadvisor_version`.
cadvisor_container_image_tag: ~
cadvisor_container_image_ref: >-
  {{ cadvisor_container_image_name }}:{{ cadvisor_container_image_tag | default('v' + cadvisor_version, True) }}

# Base volumes plus optional extra volumes (null/empty counts as none).
cadvisor_container_volumes: >-
  {{ cadvisor_container_base_volumes + cadvisor_container_extra_volumes | default([], True) }}
cadvisor_container_extra_volumes: ~
cadvisor_container_env: ~

# Base labels combined with the extra labels.
cadvisor_container_labels: "{{ cadvisor_container_base_labels | combine(cadvisor_container_extra_labels) }}"
cadvisor_container_extra_labels: {}

cadvisor_container_ports: ~
cadvisor_container_networks: ~
cadvisor_container_etc_hosts: ~
cadvisor_container_devices:
  - "/dev/kmsg:/dev/kmsg:rwm"
# Canonical YAML boolean (was the YAML 1.1 spelling `yes`).
cadvisor_container_privileged: true
cadvisor_container_pid_mode: "host"
cadvisor_container_userns_mode: "host"
cadvisor_container_capabilities: ~
cadvisor_container_restart_policy: unless-stopped

# Command line as a list: always `--docker_only=false`, plus
# `--disable_metrics=<csv>` / `--enable_metrics=<csv>` when the respective
# list is non-empty.
cadvisor_container_command: >-
  {{ ["--docker_only=false"]
  + (["--disable_metrics=" + cadvisor_disabled_metrics | join( ',' )]
  if cadvisor_disabled_metrics | default(false, True) else [])
  + (["--enable_metrics=" + cadvisor_force_enable_metrics | join( ',' )]
  if cadvisor_force_enable_metrics | default(false, True) else [])
  }}

cadvisor_container_base_labels:
  version: "{{ cadvisor_version }}"
cadvisor_container_base_volumes:
  - "/:/rootfs:ro"
  - "/var/run:/var/run:ro"
  - "/sys:/sys:ro"
  - "/var/lib/docker/:/var/lib/docker:ro"
  - "/dev/disk/:/dev/disk:ro"

# Metric groups passed to `--disable_metrics`.
cadvisor_disabled_metrics:
  - advtcp
  - cpu_topology
  - cpuset
  - hugetlb
  - memory_numa
  - process
  - referenced_memory
  - resctrl
  - sched
  - tcp
  - udp
# Metric groups passed to `--enable_metrics`; empty by default.
cadvisor_force_enable_metrics: []

View File

@ -0,0 +1,26 @@
---
# Pulls the cadvisor image.
# A pull is forced whenever an explicit, non-empty image tag is configured.
# The emptiness check replaces `| bool`, which evaluates to false for any
# tag string that is not boolean-like (e.g. `latest`) and therefore never
# forced the pull.
- name: Ensure container image is present
  community.docker.docker_image:
    name: "{{ cadvisor_container_image_ref }}"
    state: present
    source: pull
    force_source: "{{ cadvisor_container_image_tag | default('', true) | string | length > 0 }}"
# Starts the cadvisor container.
# The module is addressed by its fully qualified collection name, as in the
# other roles of this collection. Arguments using `default(omit, True)` are
# left out when the variable is unset, null or empty.
- name: Ensure cadvisor container is running
  community.docker.docker_container:
    name: "{{ cadvisor_container_name }}"
    image: "{{ cadvisor_container_image_ref }}"
    env: "{{ cadvisor_container_env | default(omit, True) }}"
    ports: "{{ cadvisor_container_ports | default(omit, True) }}"
    labels: "{{ cadvisor_container_labels }}"
    devices: "{{ cadvisor_container_devices }}"
    volumes: "{{ cadvisor_container_volumes }}"
    networks: "{{ cadvisor_container_networks | default(omit, True) }}"
    etc_hosts: "{{ cadvisor_container_etc_hosts | default(omit, True) }}"
    privileged: "{{ cadvisor_container_privileged }}"
    # `cadvisor_container_capabilities` is declared in the role defaults;
    # pass it through so that setting it has an effect.
    capabilities: "{{ cadvisor_container_capabilities | default(omit, True) }}"
    command: "{{ cadvisor_container_command }}"
    pid_mode: "{{ cadvisor_container_pid_mode | default(omit, True) }}"
    userns_mode: "{{ cadvisor_container_userns_mode | default(omit, True) }}"
    restart_policy: "{{ cadvisor_container_restart_policy }}"
    state: started

14
roles/loki/README.md Normal file
View File

@ -0,0 +1,14 @@
# `finallycoffee.observability.loki` ansible role
## Overview
Runs [loki](https://github.com/grafana/loki) in a docker container.
## Configuration
Listens on port `3100` by default; the port and listen address can be changed using `loki_config_server_http_listen_port` / `loki_config_server_http_listen_address`.
### Required configuration
Loki's storage config can be provided in `loki_config_storage_config`,
the schema configs can be provided in `loki_config_schema_config_configs`.

View File

@ -0,0 +1,45 @@
---
# Default variables of the loki role.

loki_user: "loki"
loki_version: "2.9.1"

# Host paths.
loki_base_path: "/opt/loki"
loki_config_path: "{{ loki_base_path }}/config"
loki_config_file: "{{ loki_config_path }}/config.yml"

# `present` maps to a started container, anything else to an absent one.
loki_state: "present"
loki_container_state: >-
  {{ (loki_state == 'present') | ternary('started', 'absent') }}

# uid/gid taken from the registered user task result, falling back to the
# user name.
loki_run_user: "{{ loki_user_info.uid | default(loki_user) }}"
loki_run_group: "{{ loki_user_info.group | default(loki_user) }}"

# Required directories plus optional `loki_host_directories`.
loki_all_host_directories: >-
  {{ loki_required_host_directories + loki_host_directories | default([]) }}
loki_required_host_directories:
  - path: "{{ loki_base_path }}"
  - path: "{{ loki_config_path }}"

# Container name and image coordinates.
loki_container_name: "loki"
loki_container_image_name: "loki"
loki_container_image_namespace: "grafana"
loki_container_image_registry: "docker.io"
# Null means: use `loki_version` as the tag.
loki_container_image_tag: ~
# registry[/namespace]/name:tag - the namespace segment is skipped when the
# namespace is unset or empty.
loki_container_image: >-
  {{ loki_container_image_registry
  + (('/' + loki_container_image_namespace)
  if loki_container_image_namespace | default(false, true) else '')
  + '/' + loki_container_image_name
  + ':' + (loki_container_image_tag | default(loki_version, true))
  }}

loki_container_env: {}
loki_container_base_volumes:
  - "{{ loki_config_file }}:/etc/loki/local-config.yaml:ro"
# Base volumes/labels merged with the optional user-supplied ones.
loki_container_all_volumes: >-
  {{ loki_container_base_volumes + loki_container_volumes | default([]) }}
loki_container_all_labels: >-
  {{ loki_container_base_labels | combine(loki_container_labels | default({})) }}
loki_container_restart_policy: "unless-stopped"

# Values inserted into the generated loki configuration.
loki_config_target: "all"
loki_config_auth_enabled: false
loki_config_server_http_listen_port: 3100
loki_config_server_http_listen_address: "127.0.0.1"
loki_config_storage_config: {}
loki_config_schema_config_configs: []
# Rendered as the `limits_config` section of the loki configuration.
# That section is a mapping of settings, so the empty default is a mapping
# (like `loki_config_storage_config`) and not a list.
loki_config_limits_config: {}

View File

@ -0,0 +1,10 @@
---
# Handler subscribed to `loki_reload`: restarts the loki container through
# the docker module with `force_kill` and `HUP` as the kill signal.
# It is skipped unless the desired container state is `started` or `present`.
- name: Ensure loki is reloaded
  when: "loki_container_state in ['started', 'present']"
  listen: loki_reload
  community.docker.docker_container:
    name: "{{ loki_container_name }}"
    restart: true
    force_kill: true
    kill_signal: "HUP"
    state: "started"

56
roles/loki/tasks/main.yml Normal file
View File

@ -0,0 +1,56 @@
---
# Manages the system user (no home directory is created). The registered
# result is referenced as `loki_user_info` by the role defaults.
- name: Ensure loki user '{{ loki_user }}' is {{ loki_state }}
  register: loki_user_info
  ansible.builtin.user:
    name: "{{ loki_user }}"
    system: true
    create_home: false
    state: "{{ loki_state }}"
# Creates (state `present`) or removes (any other state) every directory in
# `loki_all_host_directories`. Items may carry their own `owner`, `group`
# and `mode`; the run user/group and 0755 are used otherwise.
- name: Ensure loki host directories are {{ loki_state }}
  loop: "{{ loki_all_host_directories }}"
  ansible.builtin.file:
    path: "{{ item.path }}"
    owner: "{{ item.owner | default(loki_run_user) }}"
    group: "{{ item.group | default(loki_run_group) }}"
    mode: "{{ item.mode | default('0755') }}"
    state: >-
      {{ (loki_state == 'present') | ternary('directory', 'absent') }}
# Serializes `loki_final_config` to YAML (2-space indent, width 10000) into
# the config file and notifies the reload handler when the file changes.
- name: Ensure loki configuration file is templated
  notify: loki_reload
  ansible.builtin.copy:
    dest: "{{ loki_config_file }}"
    content: "{{ loki_final_config | to_nice_yaml(width=10000, indent=2) }}"
    owner: "{{ loki_run_user }}"
    group: "{{ loki_run_group }}"
    mode: "0640"
# Pulls (or removes, depending on `loki_state`) the loki image, retrying up
# to 3 times with a 5 second delay until the module call succeeds.
# `force_source` is `loki_container_image_force_source` when defined and
# otherwise true exactly when an explicit, non-empty image tag is set.
# The fallback is computed as a real boolean; the tag string itself (e.g.
# `latest`) must not be handed to the boolean `force_source` option.
- name: Ensure loki container image '{{ loki_container_image }}' is {{ loki_state }}
  community.docker.docker_image:
    name: "{{ loki_container_image }}"
    state: "{{ loki_state }}"
    source: "pull"
    force_source: >-
      {{ loki_container_image_force_source
      | default((loki_container_image_tag | default('', true) | string | length) > 0) }}
  register: loki_container_image_info
  until: loki_container_image_info is success
  retries: 3
  delay: 5
# Brings the loki container into `loki_container_state`, running as
# `<run user>:<run group>`. Arguments using `default(omit, true)` are left
# out when the variable is unset, null or empty.
- name: Ensure loki container '{{ loki_container_name }}' is {{ loki_container_state }}
  community.docker.docker_container:
    name: "{{ loki_container_name }}"
    image: "{{ loki_container_image }}"
    state: "{{ loki_container_state }}"
    restart_policy: "{{ loki_container_restart_policy }}"
    user: "{{ loki_run_user }}:{{ loki_run_group }}"
    env: "{{ loki_container_env }}"
    labels: "{{ loki_container_all_labels }}"
    volumes: "{{ loki_container_all_volumes }}"
    ports: "{{ loki_container_ports | default(omit, true) }}"
    hostname: "{{ loki_container_hostname | default(omit, true) }}"
    etc_hosts: "{{ loki_container_etc_hosts | default(omit, true) }}"
    networks: "{{ loki_container_networks | default(omit, true) }}"
    purge_networks: "{{ loki_container_purge_networks | default(omit, true) }}"

14
roles/loki/vars/main.yml Normal file
View File

@ -0,0 +1,14 @@
# Role-internal variables of the loki role.

# Configuration skeleton assembled from the `loki_config_*` variables.
loki_default_config:
  target: "{{ loki_config_target }}"
  auth_enabled: "{{ loki_config_auth_enabled }}"
  server:
    http_listen_address: "{{ loki_config_server_http_listen_address }}"
    http_listen_port: "{{ loki_config_server_http_listen_port }}"
  storage_config: "{{ loki_config_storage_config }}"
  limits_config: "{{ loki_config_limits_config }}"
  schema_config:
    configs: "{{ loki_config_schema_config_configs }}"

# The skeleton recursively combined with the optional `loki_config` mapping;
# this is what the tasks write to the configuration file.
loki_final_config: >-
  {{ loki_default_config | combine(loki_config | default({}), recursive=True) }}

# Labels always attached to the container.
loki_container_base_labels:
  version: "{{ loki_version }}"