---
# Deploys a heartbeat ("push") healthcheck on every host:
#   1. On the control node, create/update an Uptime Kuma push monitor for the
#      host (grouped under "<hostname> - infra") and obtain its push token.
#   2. On the host, install a small Bash script that pings the push URL and a
#      systemd service + timer that runs it periodically.
#
# Variables expected from the included vars files (not defined here):
#   subdomains.uptime_kuma, root_domain, service_settings.ntfy.topic,
#   uptime_kuma_username, uptime_kuma_password.
- name: Deploy System Healthcheck Monitoring
  hosts: all
  become: true
  vars_files:
    - ../infra_vars.yml
    - ../services_config.yml
    - ../infra_secrets.yml

  vars:
    # How often the host sends a heartbeat ping (systemd timer period).
    healthcheck_interval_seconds: 60
    # Used as the push monitor's interval in Uptime Kuma: the monitor is
    # considered failing when no ping arrives within this many seconds.
    healthcheck_timeout_seconds: 90
    # Number of retries before Uptime Kuma alerts.
    healthcheck_retries: 1
    monitoring_script_dir: /opt/system-healthcheck
    monitoring_script_path: "{{ monitoring_script_dir }}/system_healthcheck.sh"
    log_file: "{{ monitoring_script_dir }}/system_healthcheck.log"
    systemd_service_name: system-healthcheck

    # Uptime Kuma configuration (derived from services_config.yml and
    # infra_secrets.yml).
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"

  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "system-healthcheck-{{ host_name.stdout }}"
        monitor_friendly_name: "System Healthcheck: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    # The helper is written to the control node and executed there with the
    # same Python interpreter that runs Ansible. It requires the
    # uptime_kuma_api package to be importable by that interpreter.
    # NOTE: `content` is Jinja-templated by Ansible, so the Python source must
    # not contain doubled braces.
    - name: Create Uptime Kuma monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_healthcheck_monitor.py
        content: |
          #!/usr/bin/env python3
          """Create or update the Uptime Kuma push monitor for one host.

          Positional arguments:
              api_url, username, group_name, monitor_name,
              monitor_description, interval, retries, [ntfy_topic]

          The password is read from the UPTIME_KUMA_PASSWORD environment
          variable so that it never shows up in the process list.

          On success a single JSON object is printed on stdout with the keys
          monitor_id, push_token, group_name, group_id and monitor_name.
          """
          import json
          import os
          import sys

          from uptime_kuma_api import UptimeKumaApi


          def find_monitor(monitors, name, monitor_type=None):
              """Return the first monitor called `name` (optionally of a given type)."""
              for candidate in monitors:
                  if candidate.get('name') != name:
                      continue
                  if monitor_type is not None and candidate.get('type') != monitor_type:
                      continue
                  return candidate
              return None


          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              group_name = sys.argv[3]
              monitor_name = sys.argv[4]
              monitor_description = sys.argv[5]
              interval = int(sys.argv[6])
              retries = int(sys.argv[7])
              ntfy_topic = sys.argv[8] if len(sys.argv) > 8 else "alerts"

              password = os.environ.get('UPTIME_KUMA_PASSWORD', '')
              if not password:
                  sys.exit("UPTIME_KUMA_PASSWORD environment variable is not set")

              api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)
              try:
                  api.login(username, password)

                  monitors = api.get_monitors()

                  # Attach the ntfy notification for this topic, if one exists.
                  notifications = api.get_notifications()
                  wanted_notification = f'ntfy ({ntfy_topic})'
                  notification_id_list = {}
                  for notification in notifications:
                      if notification.get('name') == wanted_notification:
                          notification_id_list[notification['id']] = True
                          break

                  # Find or create the group the monitor lives in.
                  group = find_monitor(monitors, group_name, 'group')
                  if not group:
                      api.add_monitor(type='group', name=group_name)
                      # Re-fetch to get the full group object including its id.
                      monitors = api.get_monitors()
                      group = find_monitor(monitors, group_name, 'group')
                  if not group:
                      sys.exit(f"Could not find or create monitor group '{group_name}'")

                  monitor_data = dict(
                      type='push',
                      name=monitor_name,
                      parent=group['id'],
                      interval=interval,
                      # Normal mode: receiving pings means healthy.
                      upsideDown=False,
                      maxretries=retries,
                      description=monitor_description,
                      notificationIDList=notification_id_list,
                  )

                  # Find or create/update the push monitor.
                  existing_monitor = find_monitor(monitors, monitor_name)
                  if existing_monitor:
                      api.edit_monitor(existing_monitor['id'], **monitor_data)
                  else:
                      api.add_monitor(**monitor_data)

                  # Re-fetch to get the full monitor object including pushToken.
                  monitor = find_monitor(api.get_monitors(), monitor_name)
                  if not monitor:
                      sys.exit(f"Monitor '{monitor_name}' not found after create/update")

                  result = dict(
                      monitor_id=monitor['id'],
                      push_token=monitor['pushToken'],
                      group_name=group_name,
                      group_id=group['id'],
                      monitor_name=monitor_name,
                  )
                  print(json.dumps(result))
              finally:
                  # Always close the connection, including on errors.
                  api.disconnect()


          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: false

    # argv form: every value is passed as exactly one argument, regardless of
    # spaces or quote characters it contains. The password travels through the
    # environment instead of the command line.
    - name: Run Uptime Kuma monitor setup script
      command:
        argv:
          - "{{ ansible_playbook_python }}"
          - /tmp/setup_uptime_kuma_healthcheck_monitor.py
          - "{{ uptime_kuma_api_url }}"
          - "{{ uptime_kuma_username }}"
          - "{{ uptime_kuma_monitor_group }}"
          - "{{ monitor_name }}"
          - "{{ monitor_friendly_name }} - Regular healthcheck ping every {{ healthcheck_interval_seconds }}s"
          - "{{ healthcheck_timeout_seconds | string }}"
          - "{{ healthcheck_retries | string }}"
          - "{{ ntfy_topic }}"
      environment:
        UPTIME_KUMA_PASSWORD: "{{ uptime_kuma_password }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: false
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL and monitor ID as facts
      set_fact:
        uptime_kuma_healthcheck_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"

    - name: Install required packages for healthcheck monitoring
      package:
        name:
          - curl
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    # The script embeds the push URL (including the monitor's push token), so
    # it is readable and executable by root only; the service runs as root.
    - name: Create system healthcheck script
      copy:
        dest: "{{ monitoring_script_path }}"
        content: |
          #!/bin/bash

          # System Healthcheck Script
          # Sends regular heartbeat pings to Uptime Kuma
          # This ensures the system is running and able to communicate

          LOG_FILE="{{ log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_healthcheck_push_url }}"
          HOSTNAME=$(hostname)

          # Function to log messages
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Function to send healthcheck ping to Uptime Kuma
          send_healthcheck() {
              local uptime_seconds=$(awk '{print int($1)}' /proc/uptime)
              local uptime_days=$((uptime_seconds / 86400))
              local uptime_hours=$(((uptime_seconds % 86400) / 3600))
              local uptime_minutes=$(((uptime_seconds % 3600) / 60))

              local message="System healthy - Uptime: ${uptime_days}d ${uptime_hours}h ${uptime_minutes}m"

              log_message "Sending healthcheck ping: $message"

              # Send push notification to Uptime Kuma with status=up
              encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')
              # --max-time bounds the request so a stalled connection cannot
              # keep the oneshot service (and therefore the timer) hanging.
              response=$(curl -s --max-time 30 -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)

              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Healthcheck ping sent successfully (HTTP $http_code)"
              else
                  log_message "ERROR: Failed to send healthcheck ping (HTTP $http_code)"
                  return 1
              fi
          }

          # Main healthcheck logic
          main() {
              log_message "Starting system healthcheck for $HOSTNAME"

              # Send healthcheck ping
              if send_healthcheck; then
                  log_message "Healthcheck completed successfully"
              else
                  log_message "ERROR: Healthcheck failed"
                  exit 1
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: '0700'

    # Ordered after network-online.target because the script needs a working
    # network connection to reach Uptime Kuma.
    - name: Create systemd service for system healthcheck
      copy:
        dest: "/etc/systemd/system/{{ systemd_service_name }}.service"
        content: |
          [Unit]
          Description=System Healthcheck Monitor
          Wants=network-online.target
          After=network-online.target

          [Service]
          Type=oneshot
          ExecStart={{ monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    # AccuracySec defaults to 1min, which would allow each run to be delayed by
    # up to a minute and push the gap between pings past the alert window, so
    # it is pinned to 1s. Persistent= is omitted: it only affects OnCalendar=
    # timers and has no effect on monotonic ones.
    - name: Create systemd timer for system healthcheck
      copy:
        dest: "/etc/systemd/system/{{ systemd_service_name }}.timer"
        content: |
          [Unit]
          Description=Run System Healthcheck every {{ healthcheck_interval_seconds }}s
          Requires={{ systemd_service_name }}.service

          [Timer]
          OnBootSec=30sec
          OnUnitActiveSec={{ healthcheck_interval_seconds }}sec
          AccuracySec=1s

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Reload systemd daemon
      systemd:
        daemon_reload: true

    - name: Enable and start system healthcheck timer
      systemd:
        name: "{{ systemd_service_name }}.timer"
        enabled: true
        state: started

    # Runs the script once immediately so a broken deployment fails the play
    # instead of surfacing later as a missed heartbeat.
    - name: Test system healthcheck script
      command: "{{ monitoring_script_path }}"
      register: script_test
      changed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "System healthcheck script failed to execute properly"

    - name: Display monitor information
      debug:
        msg: |
          ✓ System healthcheck monitoring deployed successfully!

          Monitor Name: {{ monitor_friendly_name }}
          Monitor Group: {{ uptime_kuma_monitor_group }}
          Healthcheck Interval: Every {{ healthcheck_interval_seconds }} seconds
          Timeout: {{ healthcheck_timeout_seconds }} seconds
          Retries: {{ healthcheck_retries }}

          The system will send a heartbeat ping every {{ healthcheck_interval_seconds }} seconds.
          Uptime Kuma will alert if no ping is received within {{ healthcheck_timeout_seconds }} seconds (retries: {{ healthcheck_retries }}).

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_healthcheck_monitor.py
        state: absent
      delegate_to: localhost
      become: false