Add systemd-healthcheck to Synapse systemd service in an effort to increase reliability (of Synapse-dependent services)

Previously, we had a 10-second magical delay.

Now we first do a healthcheck to figure out when it really is up.
Then, we do the same 10-second magical delay to account for the time it
may take for a reverse-proxy (like Traefik) to pick up Synapse's routes.
This commit is contained in:
Slavi Pantaleev
2026-02-11 23:32:33 +02:00
parent f8815c0bb9
commit 59e70b8ca9
2 changed files with 36 additions and 8 deletions

View File

@@ -358,13 +358,37 @@ matrix_synapse_goofys_systemd_required_services_list_default: "{{ [devture_syste
# Additional entries for the goofys systemd required-services list; both are empty by default.
# NOTE(review): presumably `_auto` is filled in by the playbook and `_custom` by the user,
# and both are combined with the `_default` list — confirm where these are consumed.
matrix_synapse_goofys_systemd_required_services_list_auto: []
matrix_synapse_goofys_systemd_required_services_list_custom: []
# Controls the post-start health check in the matrix-synapse systemd service.
#
# When enabled, ExecStartPost polls Synapse's /health endpoint via `docker exec` + `curl`,
# keeping the service in "activating (start-post)" state until Synapse is ready.
# This gives the homeserver the chance to fully start, so that the various services
# that depend on it may only start after the homeserver is up and running
# (see also `matrix_synapse_systemd_required_services_list` and `matrix_synapse_systemd_wanted_services_list`).
# Services with After=matrix-synapse.service will properly wait.
#
# A fixed sleep still runs as a separate post-start step. It is controlled by
# `matrix_synapse_systemd_service_post_start_delay_seconds`, which can be set to 0 to remove the delay.
matrix_synapse_systemd_healthcheck_enabled: true
# Maximum number of health check attempts before the check gives up and exits with a failure.
matrix_synapse_systemd_healthcheck_max_retries: 60
# Number of seconds to sleep between two consecutive health check attempts.
# With the defaults, the sleeps alone add up to 60 seconds before the check gives up.
matrix_synapse_systemd_healthcheck_interval_seconds: 1
# Upper bound (in seconds) for a single health check request.
# Without it, a connection that is accepted but never answered would block the polling loop
# indefinitely and the retry limit would never be reached.
matrix_synapse_systemd_healthcheck_request_timeout_seconds: 5
# The command used for the health check in ExecStartPost.
# Uses `docker exec` + `curl` (available in the Synapse container image) to poll /health.
# We intentionally don't rely on Docker's built-in container HEALTHCHECK (polling via `docker inspect`),
# because its check interval (default: 15s) is too slow for our startup needs, and lowering it
# would add unnecessary overhead for the entire container lifetime. We only need fast polling at startup.
#
# How the command behaves:
# - it makes up to `matrix_synapse_systemd_healthcheck_max_retries` attempts;
# - each attempt runs `curl -f` inside the container, so an HTTP error response or a connection
#   failure counts as "not ready"; curl's own output is discarded;
# - the first successful attempt exits with 0; otherwise the loop sleeps for
#   `matrix_synapse_systemd_healthcheck_interval_seconds` and tries again;
# - when all attempts are used up, it exits with 1.
#
# This is a folded scalar (`>-`): every line below must keep the same indentation,
# so that the lines are joined with spaces into the single command line systemd expects.
matrix_synapse_systemd_healthcheck_command: >-
  {{ devture_systemd_docker_base_host_command_sh }} -c
  'for i in $(seq 1 {{ matrix_synapse_systemd_healthcheck_max_retries }}); do
  echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: checking readiness via /health..";
  {{ devture_systemd_docker_base_host_command_docker }} exec matrix-synapse
  curl -fSs --max-time {{ matrix_synapse_systemd_healthcheck_request_timeout_seconds }} http://localhost:8008/health > /dev/null 2>&1 && echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: passed" && exit 0;
  echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: not ready yet, retrying in {{ matrix_synapse_systemd_healthcheck_interval_seconds }}s..";
  sleep {{ matrix_synapse_systemd_healthcheck_interval_seconds }};
  done; echo "[Attempt $i/{{ matrix_synapse_systemd_healthcheck_max_retries }}] Synapse systemd health check: failed after {{ matrix_synapse_systemd_healthcheck_max_retries }} attempts"; exit 1'
# Controls how long (in seconds) to sleep for as the last post-start step of the systemd service.
# The sleep runs after the systemd health check (if `matrix_synapse_systemd_healthcheck_enabled`),
# and it is also applied when the health check is disabled.
#
# Even after Synapse is healthy, the reverse proxy (e.g. Traefik) needs time to discover
# the container and register its routes. Traefik's `providers.providersThrottleDuration`
# (default: 2s; see https://doc.traefik.io/traefik/providers/overview/#providersthrottleduration)
# adds a delay before applying new configuration from Docker events, meaning routes
# typically become available ~2-3 seconds after a container starts.
# Without this delay, services depending on Synapse may encounter 404 errors
# when connecting through the reverse proxy.
#
# This can be set to 0 to remove the delay (the sleep is only emitted for values greater than 0).
matrix_synapse_systemd_service_post_start_delay_seconds: 10
# Path to the Python site-packages directory inside the Synapse container.
# NOTE(review): presumably has to follow the Python version shipped in the container image — confirm when upgrading.
matrix_synapse_in_container_python_packages_path: "/usr/local/lib/python3.13/site-packages"

View File

@@ -69,8 +69,12 @@ ExecStartPre={{ devture_systemd_docker_base_host_command_docker }} network conne
# Start the matrix-synapse container and stay attached to it.
ExecStart={{ devture_systemd_docker_base_host_command_docker }} start --attach matrix-synapse
{% if matrix_synapse_systemd_healthcheck_enabled %}
# Keep the unit in its start-post phase until Synapse passes its health check.
# There is no `-` prefix here, so a failed health check is not ignored by systemd.
ExecStartPost={{ matrix_synapse_systemd_healthcheck_command }}
{% endif %}
{% if matrix_synapse_systemd_service_post_start_delay_seconds > 0 %}
# Fixed grace period, run exactly once after the step(s) above.
# The `-` prefix tells systemd to ignore a failure of this command.
ExecStartPost=-{{ matrix_host_command_sleep }} {{ matrix_synapse_systemd_service_post_start_delay_seconds }}
{% endif %}
# Stop the container, allowing it the configured grace time. Errors are deliberately swallowed
# (stderr discarded, `|| true`, `-` prefix), so stopping does not report a failure.
ExecStop=-{{ devture_systemd_docker_base_host_command_sh }} -c '{{ devture_systemd_docker_base_host_command_docker }} stop -t {{ devture_systemd_docker_base_container_stop_grace_time_seconds }} matrix-synapse 2>/dev/null || true'