diff --git a/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml b/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml index cb72328..4ff0ed4 100644 --- a/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml +++ b/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml @@ -170,499 +170,3 @@ fail: msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state" when: "'ONLINE' not in final_zfs_status.stdout" - -- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs - hosts: nodito - become: true - vars_files: - - ../../infra_vars.yml - - ../../services_config.yml - - ../../infra_secrets.yml - - nodito_vars.yml - - vars: - zfs_check_interval_seconds: 86400 # 24 hours - zfs_check_timeout_seconds: 90000 # ~25 hours (interval + buffer) - zfs_check_retries: 1 - zfs_monitoring_script_dir: /opt/zfs-monitoring - zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh" - zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log" - zfs_systemd_health_service_name: zfs-health-monitor - zfs_systemd_scrub_service_name: zfs-monthly-scrub - uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}" - ntfy_topic: "{{ service_settings.ntfy.topic }}" - - tasks: - - name: Validate Uptime Kuma configuration - assert: - that: - - uptime_kuma_api_url is defined - - uptime_kuma_api_url != "" - - uptime_kuma_username is defined - - uptime_kuma_username != "" - - uptime_kuma_password is defined - - uptime_kuma_password != "" - fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set" - - - name: Get hostname for monitor identification - command: hostname - register: host_name - changed_when: false - - - name: Set monitor name and group based on hostname - set_fact: - monitor_name: "zfs-health-{{ host_name.stdout }}" - monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}" - uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra" - - - name: Create Uptime Kuma ZFS health monitor setup script - copy: - dest: /tmp/setup_uptime_kuma_zfs_monitor.py - content: | - #!/usr/bin/env python3 - import sys - import json - from uptime_kuma_api import UptimeKumaApi - - def main(): - api_url = sys.argv[1] - username = sys.argv[2] - password = sys.argv[3] - group_name = sys.argv[4] - monitor_name = sys.argv[5] - monitor_description = sys.argv[6] - interval = int(sys.argv[7]) - retries = int(sys.argv[8]) - ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts" - - api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0) - api.login(username, password) - - # Get all monitors - monitors = api.get_monitors() - - # Get all notifications and find ntfy notification - notifications = api.get_notifications() - ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None) - notification_id_list = {} - if ntfy_notification: - notification_id_list[ntfy_notification['id']] = True - - # Find or create group - group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None) - if not group: - group_result = api.add_monitor(type='group', name=group_name) - # Refresh to get the full group object with id - monitors = api.get_monitors() - group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None) - - # Find or create/update push monitor - existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None) - - monitor_data = { - 'type': 'push', - 'name': monitor_name, - 'parent': group['id'], - 'interval': 
interval, - 'upsideDown': False, # Normal heartbeat mode: receiving pings = healthy - 'maxretries': retries, - 'description': monitor_description, - 'notificationIDList': notification_id_list - } - - if existing_monitor: - monitor = api.edit_monitor(existing_monitor['id'], **monitor_data) - # Refresh to get the full monitor object with pushToken - monitors = api.get_monitors() - monitor = next((m for m in monitors if m.get('name') == monitor_name), None) - else: - monitor_result = api.add_monitor(**monitor_data) - # Refresh to get the full monitor object with pushToken - monitors = api.get_monitors() - monitor = next((m for m in monitors if m.get('name') == monitor_name), None) - - # Output result as JSON - result = { - 'monitor_id': monitor['id'], - 'push_token': monitor['pushToken'], - 'group_name': group_name, - 'group_id': group['id'], - 'monitor_name': monitor_name - } - print(json.dumps(result)) - - api.disconnect() - - if __name__ == '__main__': - main() - mode: '0755' - delegate_to: localhost - become: no - - - name: Run Uptime Kuma ZFS monitor setup script - command: > - {{ ansible_playbook_python }} - /tmp/setup_uptime_kuma_zfs_monitor.py - "{{ uptime_kuma_api_url }}" - "{{ uptime_kuma_username }}" - "{{ uptime_kuma_password }}" - "{{ uptime_kuma_monitor_group }}" - "{{ monitor_name }}" - "{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}" - "{{ zfs_check_timeout_seconds }}" - "{{ zfs_check_retries }}" - "{{ ntfy_topic }}" - register: monitor_setup_result - delegate_to: localhost - become: no - changed_when: false - - - name: Parse monitor setup result - set_fact: - monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}" - - - name: Set push URL and monitor ID as facts - set_fact: - uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}" - uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}" - - - name: Install required packages for ZFS monitoring - package: - name: - - curl - - jq - state: present - - - name: Create monitoring script directory - file: - path: "{{ zfs_monitoring_script_dir }}" - state: directory - owner: root - group: root - mode: '0755' - - - name: Create ZFS health monitoring script - copy: - dest: "{{ zfs_monitoring_script_path }}" - content: | - #!/bin/bash - - # ZFS Pool Health Monitoring Script - # Checks ZFS pool health using JSON output and sends heartbeat to Uptime Kuma if healthy - # If any issues detected, does NOT send heartbeat (triggers timeout alert) - - LOG_FILE="{{ zfs_log_file }}" - UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}" - POOL_NAME="{{ zfs_pool_name }}" - HOSTNAME=$(hostname) - - # Function to log messages - log_message() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE" - } - - # Function to check pool health using JSON output - check_pool_health() { - local pool="$1" - local issues_found=0 - - # Get pool status as JSON - local pool_json - pool_json=$(zpool status -j "$pool" 2>&1) - - if [ $? 
-ne 0 ]; then - log_message "ERROR: Failed to get pool status for $pool" - log_message " -> $pool_json" - return 1 - fi - - # Check 1: Pool state must be ONLINE - local pool_state - pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state') - - if [ "$pool_state" != "ONLINE" ]; then - log_message "ISSUE: Pool state is $pool_state (expected ONLINE)" - issues_found=1 - else - log_message "OK: Pool state is ONLINE" - fi - - # Check 2: Check all vdevs and devices for non-ONLINE states - local bad_states - bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" ' - .pools[$pool].vdevs[] | - .. | objects | - select(.state? and .state != "ONLINE") | - "\(.name // "unknown"): \(.state)" - ' 2>/dev/null) - - if [ -n "$bad_states" ]; then - log_message "ISSUE: Found devices not in ONLINE state:" - echo "$bad_states" | while read -r line; do - log_message " -> $line" - done - issues_found=1 - else - log_message "OK: All devices are ONLINE" - fi - - # Check 3: Check for resilvering in progress - local scan_function scan_state - scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"') - scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"') - - if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then - local resilver_progress - resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"') - log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed" - issues_found=1 - fi - - # Check 4: Check for read/write/checksum errors on all devices - # Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison - local devices_with_errors - devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" ' - .pools[$pool].vdevs[] | - .. | objects | - select(.name? 
and ((.read_errors // "0" | tonumber) > 0 or (.write_errors // "0" | tonumber) > 0 or (.checksum_errors // "0" | tonumber) > 0)) | - "\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)" - ' 2>/dev/null) - - if [ -n "$devices_with_errors" ]; then - log_message "ISSUE: Found devices with I/O errors:" - echo "$devices_with_errors" | while read -r line; do - log_message " -> $line" - done - issues_found=1 - else - log_message "OK: No read/write/checksum errors detected" - fi - - # Check 5: Check for scan errors (from last scrub/resilver) - local scan_errors - scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"') - - if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then - log_message "ISSUE: Last scan reported $scan_errors errors" - issues_found=1 - else - log_message "OK: No scan errors" - fi - - return $issues_found - } - - # Function to get last scrub info for status message - get_scrub_info() { - local pool="$1" - local pool_json - pool_json=$(zpool status -j "$pool" 2>/dev/null) - - local scan_func scan_state scan_start - scan_func=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"') - scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"') - scan_start=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.start_time // ""') - - if [ "$scan_func" = "SCRUB" ] && [ "$scan_state" = "SCANNING" ]; then - echo "scrub in progress (started $scan_start)" - elif [ "$scan_func" = "SCRUB" ] && [ -n "$scan_start" ]; then - echo "last scrub: $scan_start" - else - echo "no scrub history" - fi - } - - # Function to send heartbeat to Uptime Kuma - send_heartbeat() { - local message="$1" - - log_message "Sending heartbeat to Uptime Kuma: $message" - - # URL encode the message - local encoded_message - encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g') - - local response http_code - response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1) - http_code=$(echo "$response" | tail -n1) - - if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then - log_message "Heartbeat sent successfully (HTTP $http_code)" - return 0 - else - log_message "ERROR: Failed to send heartbeat (HTTP $http_code)" - return 1 - fi - } - - # Main health check logic - main() { - log_message "==========================================" - log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME" - - # Run all health checks - if check_pool_health "$POOL_NAME"; then - # All checks passed - send heartbeat - local scrub_info - scrub_info=$(get_scrub_info "$POOL_NAME") - - local message="Pool $POOL_NAME healthy ($scrub_info)" - send_heartbeat "$message" - - log_message "Health check completed: ALL OK" - exit 0 - else - # Issues found - do NOT send heartbeat (will trigger timeout alert) - log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat" - log_message "Uptime Kuma will alert after timeout due to missing heartbeat" - exit 1 - fi - } - - # Run main function - main - owner: root - group: root - mode: '0755' - - - name: Create systemd service for ZFS health monitoring - copy: - dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service" - content: | - [Unit] - Description=ZFS Pool Health Monitor - After=zfs.target network.target - - [Service] - Type=oneshot - 
ExecStart={{ zfs_monitoring_script_path }} - User=root - StandardOutput=journal - StandardError=journal - - [Install] - WantedBy=multi-user.target - owner: root - group: root - mode: '0644' - - - name: Create systemd timer for daily ZFS health monitoring - copy: - dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer" - content: | - [Unit] - Description=Run ZFS Pool Health Monitor daily - Requires={{ zfs_systemd_health_service_name }}.service - - [Timer] - OnBootSec=5min - OnUnitActiveSec={{ zfs_check_interval_seconds }}sec - Persistent=true - - [Install] - WantedBy=timers.target - owner: root - group: root - mode: '0644' - - - name: Create systemd service for ZFS monthly scrub - copy: - dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service" - content: | - [Unit] - Description=ZFS Monthly Scrub for {{ zfs_pool_name }} - After=zfs.target - - [Service] - Type=oneshot - ExecStart=/sbin/zpool scrub {{ zfs_pool_name }} - User=root - StandardOutput=journal - StandardError=journal - - [Install] - WantedBy=multi-user.target - owner: root - group: root - mode: '0644' - - - name: Create systemd timer for monthly ZFS scrub - copy: - dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer" - content: | - [Unit] - Description=Run ZFS Scrub on last day of every month at 4:00 AM - Requires={{ zfs_systemd_scrub_service_name }}.service - - [Timer] - OnCalendar=*-*~01 04:00:00 - Persistent=true - - [Install] - WantedBy=timers.target - owner: root - group: root - mode: '0644' - - - name: Reload systemd daemon - systemd: - daemon_reload: yes - - - name: Enable and start ZFS health monitoring timer - systemd: - name: "{{ zfs_systemd_health_service_name }}.timer" - enabled: yes - state: started - - - name: Enable and start ZFS monthly scrub timer - systemd: - name: "{{ zfs_systemd_scrub_service_name }}.timer" - enabled: yes - state: started - - - name: Test ZFS health monitoring script - command: "{{ zfs_monitoring_script_path }}" - register: script_test - changed_when: false - - - name: Verify script execution - assert: - that: - - script_test.rc == 0 - fail_msg: "ZFS health monitoring script failed - check pool health" - - - name: Display monitoring configuration - debug: - msg: | - ✓ ZFS Pool Health Monitoring deployed successfully! 
- - Monitor Name: {{ monitor_friendly_name }} - Monitor Group: {{ uptime_kuma_monitor_group }} - Pool Name: {{ zfs_pool_name }} - - Health Check: - - Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours) - - Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours) - - Script: {{ zfs_monitoring_script_path }} - - Log: {{ zfs_log_file }} - - Service: {{ zfs_systemd_health_service_name }}.service - - Timer: {{ zfs_systemd_health_service_name }}.timer - - Monthly Scrub: - - Schedule: Last day of month at 4:00 AM - - Service: {{ zfs_systemd_scrub_service_name }}.service - - Timer: {{ zfs_systemd_scrub_service_name }}.timer - - Conditions monitored: - - Pool state (must be ONLINE) - - Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL) - - Resilver status (alerts if resilvering) - - Read/Write/Checksum errors - - Scrub errors - - - name: Clean up temporary Uptime Kuma setup script - file: - path: /tmp/setup_uptime_kuma_zfs_monitor.py - state: absent - delegate_to: localhost - become: no diff --git a/ansible/infra/nodito/34_nut_ups_setup_playbook.yml b/ansible/infra/nodito/34_nut_ups_setup_playbook.yml deleted file mode 100644 index 02468d5..0000000 --- a/ansible/infra/nodito/34_nut_ups_setup_playbook.yml +++ /dev/null @@ -1,569 +0,0 @@ -- name: Setup NUT (Network UPS Tools) for CyberPower UPS - hosts: nodito_host - become: true - vars_files: - - ../../infra_vars.yml - - nodito_vars.yml - - nodito_secrets.yml - - tasks: - # ------------------------------------------------------------------ - # Installation - # ------------------------------------------------------------------ - - name: Install NUT packages - apt: - name: - - nut - - nut-client - - nut-server - state: present - update_cache: true - - # ------------------------------------------------------------------ - # Verify UPS is detected - # ------------------------------------------------------------------ - - name: Check if UPS is detected via USB - shell: lsusb | grep -i cyber - register: lsusb_output - changed_when: false - failed_when: false - - - name: Display USB detection result - debug: - msg: "{{ lsusb_output.stdout | default('UPS not detected via USB - ensure it is plugged in') }}" - - - name: Fail if UPS not detected - fail: - msg: "CyberPower UPS not detected via USB. Ensure the USB cable is connected." 
- when: lsusb_output.rc != 0 - - - name: Reload udev rules for USB permissions - shell: | - udevadm control --reload-rules - udevadm trigger --subsystem-match=usb --action=add - changed_when: true - - - name: Verify USB device has nut group permissions - shell: | - BUS_DEV=$(lsusb | grep -i cyber | grep -oP 'Bus \K\d+|Device \K\d+' | tr '\n' '/' | sed 's/\/$//') - if [ -n "$BUS_DEV" ]; then - BUS=$(echo $BUS_DEV | cut -d'/' -f1) - DEV=$(echo $BUS_DEV | cut -d'/' -f2) - ls -la /dev/bus/usb/$BUS/$DEV - else - echo "UPS device not found" - exit 1 - fi - register: usb_permissions - changed_when: false - - - name: Display USB permissions - debug: - msg: "{{ usb_permissions.stdout }} (should show 'root nut', not 'root root')" - - - name: Scan for UPS with nut-scanner - command: nut-scanner -U - register: nut_scanner_output - changed_when: false - failed_when: false - - - name: Display nut-scanner result - debug: - msg: "{{ nut_scanner_output.stdout_lines }}" - - # ------------------------------------------------------------------ - # Configuration files - # ------------------------------------------------------------------ - - name: Configure NUT mode (standalone) - copy: - dest: /etc/nut/nut.conf - content: | - # Managed by Ansible - MODE=standalone - owner: root - group: nut - mode: "0640" - notify: Restart NUT services - - - name: Configure UPS device - copy: - dest: /etc/nut/ups.conf - content: | - # Managed by Ansible - [{{ ups_name }}] - driver = {{ ups_driver }} - port = {{ ups_port }} - desc = "{{ ups_desc }}" - offdelay = {{ ups_offdelay }} - ondelay = {{ ups_ondelay }} - owner: root - group: nut - mode: "0640" - notify: Restart NUT services - - - name: Configure upsd to listen on localhost - copy: - dest: /etc/nut/upsd.conf - content: | - # Managed by Ansible - LISTEN 127.0.0.1 3493 - owner: root - group: nut - mode: "0640" - notify: Restart NUT services - - - name: Configure upsd users - copy: - dest: /etc/nut/upsd.users - content: | - # Managed by Ansible - [{{ ups_user }}] - password = {{ ups_password }} - upsmon master - owner: root - group: nut - mode: "0640" - notify: Restart NUT services - - - name: Configure upsmon - copy: - dest: /etc/nut/upsmon.conf - content: | - # Managed by Ansible - MONITOR {{ ups_name }}@localhost 1 {{ ups_user }} {{ ups_password }} master - - MINSUPPLIES 1 - SHUTDOWNCMD "/sbin/shutdown -h +0" - POLLFREQ 5 - POLLFREQALERT 5 - HOSTSYNC 15 - DEADTIME 15 - POWERDOWNFLAG /etc/killpower - - # Notifications - NOTIFYMSG ONLINE "UPS %s on line power" - NOTIFYMSG ONBATT "UPS %s on battery" - NOTIFYMSG LOWBATT "UPS %s battery is low" - NOTIFYMSG FSD "UPS %s: forced shutdown in progress" - NOTIFYMSG COMMOK "Communications with UPS %s established" - NOTIFYMSG COMMBAD "Communications with UPS %s lost" - NOTIFYMSG SHUTDOWN "Auto logout and shutdown proceeding" - NOTIFYMSG REPLBATT "UPS %s battery needs replacing" - - # Log all events to syslog - NOTIFYFLAG ONLINE SYSLOG - NOTIFYFLAG ONBATT SYSLOG - NOTIFYFLAG LOWBATT SYSLOG - NOTIFYFLAG FSD SYSLOG - NOTIFYFLAG COMMOK SYSLOG - NOTIFYFLAG COMMBAD SYSLOG - NOTIFYFLAG SHUTDOWN SYSLOG - NOTIFYFLAG REPLBATT SYSLOG - owner: root - group: nut - mode: "0640" - notify: Restart NUT services - - # ------------------------------------------------------------------ - # Verify late-stage shutdown script - # ------------------------------------------------------------------ - - name: Verify nutshutdown script exists - stat: - path: /lib/systemd/system-shutdown/nutshutdown - register: nutshutdown_script - - - name: Warn if nutshutdown 
script is missing - debug: - msg: "WARNING: /lib/systemd/system-shutdown/nutshutdown not found. UPS may not cut power after shutdown." - when: not nutshutdown_script.stat.exists - - # ------------------------------------------------------------------ - # Services - # ------------------------------------------------------------------ - - name: Enable and start NUT driver enumerator - systemd: - name: nut-driver-enumerator - enabled: true - state: started - - - name: Enable and start NUT server - systemd: - name: nut-server - enabled: true - state: started - - - name: Enable and start NUT monitor - systemd: - name: nut-monitor - enabled: true - state: started - - # ------------------------------------------------------------------ - # Verification - # ------------------------------------------------------------------ - - name: Wait for NUT services to stabilize - pause: - seconds: 3 - - - name: Verify NUT can communicate with UPS - command: upsc {{ ups_name }}@localhost - register: upsc_output - changed_when: false - failed_when: upsc_output.rc != 0 - - - name: Display UPS status - debug: - msg: "{{ upsc_output.stdout_lines }}" - - - name: Get UPS status summary - shell: | - echo "Status: $(upsc {{ ups_name }}@localhost ups.status 2>/dev/null)" - echo "Battery: $(upsc {{ ups_name }}@localhost battery.charge 2>/dev/null)%" - echo "Runtime: $(upsc {{ ups_name }}@localhost battery.runtime 2>/dev/null)s" - echo "Load: $(upsc {{ ups_name }}@localhost ups.load 2>/dev/null)%" - register: ups_summary - changed_when: false - - - name: Display UPS summary - debug: - msg: "{{ ups_summary.stdout_lines }}" - - - name: Verify low battery thresholds - shell: | - echo "Runtime threshold: $(upsc {{ ups_name }}@localhost battery.runtime.low 2>/dev/null)s" - echo "Charge threshold: $(upsc {{ ups_name }}@localhost battery.charge.low 2>/dev/null)%" - register: thresholds - changed_when: false - - - name: Display low battery thresholds - debug: - msg: "{{ thresholds.stdout_lines }}" - - handlers: - - name: Restart NUT services - systemd: - name: "{{ item }}" - state: restarted - loop: - - nut-driver-enumerator - - nut-server - - nut-monitor - - -- name: Setup UPS Heartbeat Monitoring with Uptime Kuma - hosts: nodito - become: true - vars_files: - - ../../infra_vars.yml - - ../../services_config.yml - - ../../infra_secrets.yml - - nodito_vars.yml - - nodito_secrets.yml - - vars: - ups_heartbeat_interval_seconds: 60 - ups_heartbeat_timeout_seconds: 120 - ups_heartbeat_retries: 1 - ups_monitoring_script_dir: /opt/ups-monitoring - ups_monitoring_script_path: "{{ ups_monitoring_script_dir }}/ups_heartbeat.sh" - ups_log_file: "{{ ups_monitoring_script_dir }}/ups_heartbeat.log" - ups_systemd_service_name: ups-heartbeat - uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}" - ntfy_topic: "{{ service_settings.ntfy.topic }}" - - tasks: - - name: Validate Uptime Kuma configuration - assert: - that: - - uptime_kuma_api_url is defined - - uptime_kuma_api_url != "" - - uptime_kuma_username is defined - - uptime_kuma_username != "" - - uptime_kuma_password is defined - - uptime_kuma_password != "" - fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set" - - - name: Get hostname for monitor identification - command: hostname - register: host_name - changed_when: false - - - name: Set monitor name and group based on hostname - set_fact: - monitor_name: "ups-{{ host_name.stdout }}" - monitor_friendly_name: "UPS Status: {{ host_name.stdout }}" - uptime_kuma_monitor_group: 
"{{ host_name.stdout }} - infra" - - - name: Create Uptime Kuma UPS monitor setup script - copy: - dest: /tmp/setup_uptime_kuma_ups_monitor.py - content: | - #!/usr/bin/env python3 - import sys - import json - from uptime_kuma_api import UptimeKumaApi - - def main(): - api_url = sys.argv[1] - username = sys.argv[2] - password = sys.argv[3] - group_name = sys.argv[4] - monitor_name = sys.argv[5] - monitor_description = sys.argv[6] - interval = int(sys.argv[7]) - retries = int(sys.argv[8]) - ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts" - - api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0) - api.login(username, password) - - monitors = api.get_monitors() - notifications = api.get_notifications() - - ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None) - notification_id_list = {} - if ntfy_notification: - notification_id_list[ntfy_notification['id']] = True - - group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None) - if not group: - api.add_monitor(type='group', name=group_name) - monitors = api.get_monitors() - group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None) - - existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None) - - monitor_data = { - 'type': 'push', - 'name': monitor_name, - 'parent': group['id'], - 'interval': interval, - 'upsideDown': False, # Normal heartbeat mode: receiving pings = healthy - 'maxretries': retries, - 'description': monitor_description, - 'notificationIDList': notification_id_list - } - - if existing_monitor: - api.edit_monitor(existing_monitor['id'], **monitor_data) - monitors = api.get_monitors() - monitor = next((m for m in monitors if m.get('name') == monitor_name), None) - else: - api.add_monitor(**monitor_data) - monitors = api.get_monitors() - monitor = next((m for m in monitors if m.get('name') == monitor_name), None) - - result = { - 'monitor_id': monitor['id'], - 'push_token': monitor['pushToken'], - 'group_name': group_name, - 'group_id': group['id'], - 'monitor_name': monitor_name - } - print(json.dumps(result)) - - api.disconnect() - - if __name__ == '__main__': - main() - mode: '0755' - delegate_to: localhost - become: no - - - name: Run Uptime Kuma UPS monitor setup script - command: > - {{ ansible_playbook_python }} - /tmp/setup_uptime_kuma_ups_monitor.py - "{{ uptime_kuma_api_url }}" - "{{ uptime_kuma_username }}" - "{{ uptime_kuma_password }}" - "{{ uptime_kuma_monitor_group }}" - "{{ monitor_name }}" - "{{ monitor_friendly_name }} - Alerts when UPS goes on battery or loses communication" - "{{ ups_heartbeat_timeout_seconds }}" - "{{ ups_heartbeat_retries }}" - "{{ ntfy_topic }}" - register: monitor_setup_result - delegate_to: localhost - become: no - changed_when: false - - - name: Parse monitor setup result - set_fact: - monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}" - - - name: Set push URL as fact - set_fact: - uptime_kuma_ups_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}" - - - name: Install required packages for UPS monitoring - package: - name: - - curl - state: present - - - name: Create monitoring script directory - file: - path: "{{ ups_monitoring_script_dir }}" - state: directory - owner: root - group: root - mode: '0755' - - - name: Create UPS heartbeat monitoring script - copy: - dest: "{{ ups_monitoring_script_path }}" - content: | - #!/bin/bash - - # UPS Heartbeat Monitoring 
Script - # Sends heartbeat to Uptime Kuma only when UPS is on mains power - # When on battery or communication lost, no heartbeat is sent (triggers timeout alert) - - LOG_FILE="{{ ups_log_file }}" - UPTIME_KUMA_URL="{{ uptime_kuma_ups_push_url }}" - UPS_NAME="{{ ups_name }}" - - log_message() { - echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE" - } - - send_heartbeat() { - local message="$1" - - local encoded_message - encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g; s/%/%25/g') - - local response http_code - response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1) - http_code=$(echo "$response" | tail -n1) - - if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then - log_message "Heartbeat sent: $message (HTTP $http_code)" - return 0 - else - log_message "ERROR: Failed to send heartbeat (HTTP $http_code)" - return 1 - fi - } - - main() { - local status charge runtime load - - status=$(upsc ${UPS_NAME}@localhost ups.status 2>/dev/null) - - if [ -z "$status" ]; then - log_message "ERROR: Cannot communicate with UPS - NOT sending heartbeat" - exit 1 - fi - - charge=$(upsc ${UPS_NAME}@localhost battery.charge 2>/dev/null) - runtime=$(upsc ${UPS_NAME}@localhost battery.runtime 2>/dev/null) - load=$(upsc ${UPS_NAME}@localhost ups.load 2>/dev/null) - - if [[ "$status" == *"OL"* ]]; then - local message="UPS on mains (charge=${charge}% runtime=${runtime}s load=${load}%)" - send_heartbeat "$message" - exit 0 - else - log_message "UPS not on mains power (status=$status) - NOT sending heartbeat" - exit 1 - fi - } - - main - owner: root - group: root - mode: '0755' - - - name: Create systemd service for UPS heartbeat - copy: - dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.service" - content: | - [Unit] - Description=UPS Heartbeat Monitor - After=network.target nut-monitor.service - - [Service] - Type=oneshot - ExecStart={{ ups_monitoring_script_path }} - User=root - StandardOutput=journal - StandardError=journal - - [Install] - WantedBy=multi-user.target - owner: root - group: root - mode: '0644' - - - name: Create systemd timer for UPS heartbeat - copy: - dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.timer" - content: | - [Unit] - Description=Run UPS Heartbeat Monitor every {{ ups_heartbeat_interval_seconds }} seconds - Requires={{ ups_systemd_service_name }}.service - - [Timer] - OnBootSec=1min - OnUnitActiveSec={{ ups_heartbeat_interval_seconds }}sec - Persistent=true - - [Install] - WantedBy=timers.target - owner: root - group: root - mode: '0644' - - - name: Reload systemd daemon - systemd: - daemon_reload: yes - - - name: Enable and start UPS heartbeat timer - systemd: - name: "{{ ups_systemd_service_name }}.timer" - enabled: yes - state: started - - - name: Test UPS heartbeat script - command: "{{ ups_monitoring_script_path }}" - register: script_test - changed_when: false - - - name: Verify script execution - assert: - that: - - script_test.rc == 0 - fail_msg: "UPS heartbeat script failed - check UPS status and communication" - - - name: Display monitoring configuration - debug: - msg: - - "UPS Monitoring configured successfully" - - "" - - "NUT Configuration:" - - " UPS Name: {{ ups_name }}" - - " UPS Description: {{ ups_desc }}" - - " Off Delay: {{ ups_offdelay }}s (time after shutdown before UPS cuts power)" - - " On Delay: {{ ups_ondelay }}s (time after mains returns before UPS restores power)" - - "" - - "Uptime Kuma Monitoring:" - - " Monitor Name: {{ 
monitor_friendly_name }}" - - " Monitor Group: {{ uptime_kuma_monitor_group }}" - - " Push URL: {{ uptime_kuma_ups_push_url }}" - - " Heartbeat Interval: {{ ups_heartbeat_interval_seconds }}s" - - " Timeout: {{ ups_heartbeat_timeout_seconds }}s" - - "" - - "Scripts and Services:" - - " Script: {{ ups_monitoring_script_path }}" - - " Log: {{ ups_log_file }}" - - " Service: {{ ups_systemd_service_name }}.service" - - " Timer: {{ ups_systemd_service_name }}.timer" - - - name: Clean up temporary Uptime Kuma setup script - file: - path: /tmp/setup_uptime_kuma_ups_monitor.py - state: absent - delegate_to: localhost - become: no diff --git a/ansible/infra/nodito/nodito_vars.yml b/ansible/infra/nodito/nodito_vars.yml index c0002f3..f9e6b0d 100644 --- a/ansible/infra/nodito/nodito_vars.yml +++ b/ansible/infra/nodito/nodito_vars.yml @@ -17,12 +17,3 @@ zfs_pool_name: "proxmox-tank-1" zfs_disk_1: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN0Z" # First disk for RAID 1 mirror zfs_disk_2: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN2P" # Second disk for RAID 1 mirror zfs_pool_mountpoint: "/var/lib/vz" - -# UPS Configuration (CyberPower CP900EPFCLCD via USB) -ups_name: cyberpower -ups_desc: "CyberPower CP900EPFCLCD" -ups_driver: usbhid-ups -ups_port: auto -ups_user: counterweight -ups_offdelay: 120 # Seconds after shutdown before UPS cuts outlet power -ups_ondelay: 30 # Seconds after mains returns before UPS restores outlet power
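
The removed zfs-monthly-scrub timer relied on the systemd calendar expression '*-*~01 04:00:00', where the '~' counts days back from the end of the month, so it fires on the last day of each month at 04:00. A quick way to sanity-check that expression, and to see whether the health-check and scrub timers from the deleted playbook are still installed on a host, is standard systemd tooling (unit names taken from the deleted playbook; nothing here is specific to this repository):

  # Show the normalized form and the next elapse times of the calendar spec the
  # zfs-monthly-scrub timer used (last day of each month at 04:00).
  systemd-analyze calendar '*-*~01 04:00:00'

  # Check whether the timers defined by the deleted playbook are still present.
  systemctl list-timers --all | grep -E 'zfs-(health-monitor|monthly-scrub)' || echo "timers not installed"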
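
The deleted zfs_health_monitor.sh keyed its Uptime Kuma heartbeat on checks read from 'zpool status -j': the pool state must be ONLINE and no device may report read, write, or checksum errors. Below is a minimal standalone sketch of those two core checks, assuming an OpenZFS release with JSON status output and jq installed, as the original script did; the pool name comes from nodito_vars.yml.

  #!/bin/bash
  # Minimal sketch of the health test performed by the removed zfs_health_monitor.sh:
  # pool state must be ONLINE and no device may show read/write/checksum errors.
  # Assumes 'zpool status -j' (OpenZFS JSON output) and jq, as the original script did.
  set -euo pipefail

  POOL="proxmox-tank-1"   # zfs_pool_name from nodito_vars.yml

  pool_json=$(zpool status -j "$POOL")

  state=$(echo "$pool_json" | jq -r --arg pool "$POOL" '.pools[$pool].state')
  if [ "$state" != "ONLINE" ]; then
      echo "Pool $POOL is $state (expected ONLINE)" >&2
      exit 1
  fi

  # Error counters arrive as strings in the JSON output, hence the tonumber conversion.
  errors=$(echo "$pool_json" | jq -r --arg pool "$POOL" '
      .pools[$pool].vdevs[] | .. | objects
      | select(.name? and ((.read_errors // "0" | tonumber) > 0
                        or (.write_errors // "0" | tonumber) > 0
                        or (.checksum_errors // "0" | tonumber) > 0))
      | "\(.name): read=\(.read_errors) write=\(.write_errors) cksum=\(.checksum_errors)"')

  if [ -n "$errors" ]; then
      printf 'Devices with I/O errors:\n%s\n' "$errors" >&2
      exit 1
  fi

  echo "Pool $POOL healthy"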
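
The removed ups_heartbeat.sh URL-encoded its status message with a chain of sed substitutions, and that variant applied 's/%/%25/g' last, which re-encodes the percent signs produced by the earlier substitutions. A hedged sketch of the same heartbeat is shown below, letting curl's built-in '--data-urlencode' do the encoding instead. The push URL and token are placeholders (a real push URL comes from the Uptime Kuma push monitor that the deleted setup script created); the UPS name matches the ups_name value removed from nodito_vars.yml.

  #!/bin/bash
  # Sketch of the heartbeat logic from the removed ups_heartbeat.sh, with curl
  # handling URL encoding instead of hand-rolled sed substitutions.
  # KUMA_PUSH_URL is a placeholder; substitute the push URL of the actual monitor.
  KUMA_PUSH_URL="https://uptime-kuma.example.com/api/push/EXAMPLETOKEN"
  UPS_NAME="cyberpower"   # ups_name removed from nodito_vars.yml

  status=$(upsc "${UPS_NAME}@localhost" ups.status 2>/dev/null)
  charge=$(upsc "${UPS_NAME}@localhost" battery.charge 2>/dev/null)

  if [[ "$status" == *OL* ]]; then
      # --get with --data-urlencode appends properly encoded parameters to the query string.
      curl -fsS --get \
          --data-urlencode "status=up" \
          --data-urlencode "msg=UPS on mains (charge=${charge}%)" \
          "$KUMA_PUSH_URL"
  else
      echo "UPS not on mains (status=${status:-no reply}); heartbeat withheld" >&2
      exit 1
  fi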