Compare commits: c6795dc581 ... 08281ce349 (2 commits)

| SHA1 |
|---|
| 08281ce349 |
| fe321050c1 |

3 changed files with 1074 additions and 0 deletions
@@ -170,3 +170,499 @@
      fail:
        msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state"
      when: "'ONLINE' not in final_zfs_status.stdout"

- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs
  hosts: nodito
  become: true
  vars_files:
    - ../../infra_vars.yml
    - ../../services_config.yml
    - ../../infra_secrets.yml
    - nodito_vars.yml

  vars:
    zfs_check_interval_seconds: 86400  # 24 hours
    zfs_check_timeout_seconds: 90000   # ~25 hours (interval + buffer)
    zfs_check_retries: 1
    zfs_monitoring_script_dir: /opt/zfs-monitoring
    zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh"
    zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log"
    zfs_systemd_health_service_name: zfs-health-monitor
    zfs_systemd_scrub_service_name: zfs-monthly-scrub
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"

  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "zfs-health-{{ host_name.stdout }}"
        monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    - name: Create Uptime Kuma ZFS health monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_zfs_monitor.py
        content: |
          #!/usr/bin/env python3
          import sys
          import json
          from uptime_kuma_api import UptimeKumaApi

          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              retries = int(sys.argv[8])
              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"

              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
              api.login(username, password)

              # Get all monitors
              monitors = api.get_monitors()

              # Get all notifications and find ntfy notification
              notifications = api.get_notifications()
              ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
              notification_id_list = {}
              if ntfy_notification:
                  notification_id_list[ntfy_notification['id']] = True

              # Find or create group
              group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
              if not group:
                  group_result = api.add_monitor(type='group', name=group_name)
                  # Refresh to get the full group object with id
                  monitors = api.get_monitors()
                  group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)

              # Find or create/update push monitor
              existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              monitor_data = {
                  'type': 'push',
                  'name': monitor_name,
                  'parent': group['id'],
                  'interval': interval,
                  'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
                  'maxretries': retries,
                  'description': monitor_description,
                  'notificationIDList': notification_id_list
              }

              if existing_monitor:
                  monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              else:
                  monitor_result = api.add_monitor(**monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              # Output result as JSON
              result = {
                  'monitor_id': monitor['id'],
                  'push_token': monitor['pushToken'],
                  'group_name': group_name,
                  'group_id': group['id'],
                  'monitor_name': monitor_name
              }
              print(json.dumps(result))

              api.disconnect()

          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: no

    - name: Run Uptime Kuma ZFS monitor setup script
      command: >
        {{ ansible_playbook_python }}
        /tmp/setup_uptime_kuma_zfs_monitor.py
        "{{ uptime_kuma_api_url }}"
        "{{ uptime_kuma_username }}"
        "{{ uptime_kuma_password }}"
        "{{ uptime_kuma_monitor_group }}"
        "{{ monitor_name }}"
        "{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}"
        "{{ zfs_check_timeout_seconds }}"
        "{{ zfs_check_retries }}"
        "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: no
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL and monitor ID as facts
      set_fact:
        uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"

    - name: Install required packages for ZFS monitoring
      package:
        name:
          - curl
          - jq
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ zfs_monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create ZFS health monitoring script
      copy:
        dest: "{{ zfs_monitoring_script_path }}"
        content: |
          #!/bin/bash

          # ZFS Pool Health Monitoring Script
          # Checks ZFS pool health using JSON output and sends heartbeat to Uptime Kuma if healthy
          # If any issues detected, does NOT send heartbeat (triggers timeout alert)

          LOG_FILE="{{ zfs_log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}"
          POOL_NAME="{{ zfs_pool_name }}"
          HOSTNAME=$(hostname)

          # Function to log messages
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Function to check pool health using JSON output
          check_pool_health() {
              local pool="$1"
              local issues_found=0

              # Get pool status as JSON
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>&1)

              if [ $? -ne 0 ]; then
                  log_message "ERROR: Failed to get pool status for $pool"
                  log_message " -> $pool_json"
                  return 1
              fi

              # Check 1: Pool state must be ONLINE
              local pool_state
              pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state')

              if [ "$pool_state" != "ONLINE" ]; then
                  log_message "ISSUE: Pool state is $pool_state (expected ONLINE)"
                  issues_found=1
              else
                  log_message "OK: Pool state is ONLINE"
              fi

              # Check 2: Check all vdevs and devices for non-ONLINE states
              local bad_states
              bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.state? and .state != "ONLINE") |
                  "\(.name // "unknown"): \(.state)"
              ' 2>/dev/null)

              if [ -n "$bad_states" ]; then
                  log_message "ISSUE: Found devices not in ONLINE state:"
                  echo "$bad_states" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: All devices are ONLINE"
              fi

              # Check 3: Check for resilvering in progress
              local scan_function scan_state
              scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')

              if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then
                  local resilver_progress
                  resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"')
                  log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed"
                  issues_found=1
              fi

              # Check 4: Check for read/write/checksum errors on all devices
              # Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison
              local devices_with_errors
              devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.name? and ((.read_errors // "0" | tonumber) > 0 or (.write_errors // "0" | tonumber) > 0 or (.checksum_errors // "0" | tonumber) > 0)) |
                  "\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)"
              ' 2>/dev/null)

              if [ -n "$devices_with_errors" ]; then
                  log_message "ISSUE: Found devices with I/O errors:"
                  echo "$devices_with_errors" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: No read/write/checksum errors detected"
              fi

              # Check 5: Check for scan errors (from last scrub/resilver)
              local scan_errors
              scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"')

              if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then
                  log_message "ISSUE: Last scan reported $scan_errors errors"
                  issues_found=1
              else
                  log_message "OK: No scan errors"
              fi

              return $issues_found
          }

          # Function to get last scrub info for status message
          get_scrub_info() {
              local pool="$1"
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>/dev/null)

              local scan_func scan_state scan_start
              scan_func=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              scan_start=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.start_time // ""')

              if [ "$scan_func" = "SCRUB" ] && [ "$scan_state" = "SCANNING" ]; then
                  echo "scrub in progress (started $scan_start)"
              elif [ "$scan_func" = "SCRUB" ] && [ -n "$scan_start" ]; then
                  echo "last scrub: $scan_start"
              else
                  echo "no scrub history"
              fi
          }

          # Function to send heartbeat to Uptime Kuma
          send_heartbeat() {
              local message="$1"

              log_message "Sending heartbeat to Uptime Kuma: $message"

              # URL encode the message
              local encoded_message
              encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')

              local response http_code
              response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)

              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Heartbeat sent successfully (HTTP $http_code)"
                  return 0
              else
                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
                  return 1
              fi
          }

          # Main health check logic
          main() {
              log_message "=========================================="
              log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME"

              # Run all health checks
              if check_pool_health "$POOL_NAME"; then
                  # All checks passed - send heartbeat
                  local scrub_info
                  scrub_info=$(get_scrub_info "$POOL_NAME")

                  local message="Pool $POOL_NAME healthy ($scrub_info)"
                  send_heartbeat "$message"

                  log_message "Health check completed: ALL OK"
                  exit 0
              else
                  # Issues found - do NOT send heartbeat (will trigger timeout alert)
                  log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat"
                  log_message "Uptime Kuma will alert after timeout due to missing heartbeat"
                  exit 1
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: '0755'

    - name: Create systemd service for ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Pool Health Monitor
          After=zfs.target network.target

          [Service]
          Type=oneshot
          ExecStart={{ zfs_monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for daily ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Pool Health Monitor daily
          Requires={{ zfs_systemd_health_service_name }}.service

          [Timer]
          OnBootSec=5min
          OnUnitActiveSec={{ zfs_check_interval_seconds }}sec
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd service for ZFS monthly scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Monthly Scrub for {{ zfs_pool_name }}
          After=zfs.target

          [Service]
          Type=oneshot
          ExecStart=/sbin/zpool scrub {{ zfs_pool_name }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for monthly ZFS scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Scrub on last day of every month at 4:00 AM
          Requires={{ zfs_systemd_scrub_service_name }}.service

          [Timer]
          OnCalendar=*-*~01 04:00:00
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Reload systemd daemon
      systemd:
        daemon_reload: yes

    - name: Enable and start ZFS health monitoring timer
      systemd:
        name: "{{ zfs_systemd_health_service_name }}.timer"
        enabled: yes
        state: started

    - name: Enable and start ZFS monthly scrub timer
      systemd:
        name: "{{ zfs_systemd_scrub_service_name }}.timer"
        enabled: yes
        state: started

    - name: Test ZFS health monitoring script
      command: "{{ zfs_monitoring_script_path }}"
      register: script_test
      changed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "ZFS health monitoring script failed - check pool health"

    - name: Display monitoring configuration
      debug:
        msg: |
          ✓ ZFS Pool Health Monitoring deployed successfully!

          Monitor Name: {{ monitor_friendly_name }}
          Monitor Group: {{ uptime_kuma_monitor_group }}
          Pool Name: {{ zfs_pool_name }}

          Health Check:
          - Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours)
          - Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours)
          - Script: {{ zfs_monitoring_script_path }}
          - Log: {{ zfs_log_file }}
          - Service: {{ zfs_systemd_health_service_name }}.service
          - Timer: {{ zfs_systemd_health_service_name }}.timer

          Monthly Scrub:
          - Schedule: Last day of month at 4:00 AM
          - Service: {{ zfs_systemd_scrub_service_name }}.service
          - Timer: {{ zfs_systemd_scrub_service_name }}.timer

          Conditions monitored:
          - Pool state (must be ONLINE)
          - Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL)
          - Resilver status (alerts if resilvering)
          - Read/Write/Checksum errors
          - Scrub errors

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_zfs_monitor.py
        state: absent
      delegate_to: localhost
      become: no
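A quick way to sanity-check the deployed units on the host once this play has run (a sketch, assuming the default variable values above, i.e. unit names zfs-health-monitor and zfs-monthly-scrub, pool proxmox-tank-1, and the log under /opt/zfs-monitoring):

    # Confirm both timers are registered and see their next run times
    systemctl list-timers 'zfs-*'

    # Trigger a health check on demand and inspect its output
    systemctl start zfs-health-monitor.service
    journalctl -u zfs-health-monitor.service -n 20
    tail -n 20 /opt/zfs-monitoring/zfs_health_monitor.log

    # Show what the "last day of month" OnCalendar expression resolves to
    systemd-analyze calendar '*-*~01 04:00:00'

    # Inspect the raw JSON the monitoring script parses with jq
    zpool status -j proxmox-tank-1 | jq '.pools'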
ansible/infra/nodito/34_nut_ups_setup_playbook.yml  (new file, 569 lines added)

@@ -0,0 +1,569 @@
- name: Setup NUT (Network UPS Tools) for CyberPower UPS
  hosts: nodito_host
  become: true
  vars_files:
    - ../../infra_vars.yml
    - nodito_vars.yml
    - nodito_secrets.yml

  tasks:
    # ------------------------------------------------------------------
    # Installation
    # ------------------------------------------------------------------
    - name: Install NUT packages
      apt:
        name:
          - nut
          - nut-client
          - nut-server
        state: present
        update_cache: true

    # ------------------------------------------------------------------
    # Verify UPS is detected
    # ------------------------------------------------------------------
    - name: Check if UPS is detected via USB
      shell: lsusb | grep -i cyber
      register: lsusb_output
      changed_when: false
      failed_when: false

    - name: Display USB detection result
      debug:
        msg: "{{ lsusb_output.stdout | default('UPS not detected via USB - ensure it is plugged in') }}"

    - name: Fail if UPS not detected
      fail:
        msg: "CyberPower UPS not detected via USB. Ensure the USB cable is connected."
      when: lsusb_output.rc != 0

    - name: Reload udev rules for USB permissions
      shell: |
        udevadm control --reload-rules
        udevadm trigger --subsystem-match=usb --action=add
      changed_when: true

    - name: Verify USB device has nut group permissions
      shell: |
        BUS_DEV=$(lsusb | grep -i cyber | grep -oP 'Bus \K\d+|Device \K\d+' | tr '\n' '/' | sed 's/\/$//')
        if [ -n "$BUS_DEV" ]; then
          BUS=$(echo $BUS_DEV | cut -d'/' -f1)
          DEV=$(echo $BUS_DEV | cut -d'/' -f2)
          ls -la /dev/bus/usb/$BUS/$DEV
        else
          echo "UPS device not found"
          exit 1
        fi
      register: usb_permissions
      changed_when: false

    - name: Display USB permissions
      debug:
        msg: "{{ usb_permissions.stdout }} (should show 'root nut', not 'root root')"

    - name: Scan for UPS with nut-scanner
      command: nut-scanner -U
      register: nut_scanner_output
      changed_when: false
      failed_when: false

    - name: Display nut-scanner result
      debug:
        msg: "{{ nut_scanner_output.stdout_lines }}"

    # ------------------------------------------------------------------
    # Configuration files
    # ------------------------------------------------------------------
    - name: Configure NUT mode (standalone)
      copy:
        dest: /etc/nut/nut.conf
        content: |
          # Managed by Ansible
          MODE=standalone
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    - name: Configure UPS device
      copy:
        dest: /etc/nut/ups.conf
        content: |
          # Managed by Ansible
          [{{ ups_name }}]
            driver = {{ ups_driver }}
            port = {{ ups_port }}
            desc = "{{ ups_desc }}"
            offdelay = {{ ups_offdelay }}
            ondelay = {{ ups_ondelay }}
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    - name: Configure upsd to listen on localhost
      copy:
        dest: /etc/nut/upsd.conf
        content: |
          # Managed by Ansible
          LISTEN 127.0.0.1 3493
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    - name: Configure upsd users
      copy:
        dest: /etc/nut/upsd.users
        content: |
          # Managed by Ansible
          [{{ ups_user }}]
            password = {{ ups_password }}
            upsmon master
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    - name: Configure upsmon
      copy:
        dest: /etc/nut/upsmon.conf
        content: |
          # Managed by Ansible
          MONITOR {{ ups_name }}@localhost 1 {{ ups_user }} {{ ups_password }} master

          MINSUPPLIES 1
          SHUTDOWNCMD "/sbin/shutdown -h +0"
          POLLFREQ 5
          POLLFREQALERT 5
          HOSTSYNC 15
          DEADTIME 15
          POWERDOWNFLAG /etc/killpower

          # Notifications
          NOTIFYMSG ONLINE "UPS %s on line power"
          NOTIFYMSG ONBATT "UPS %s on battery"
          NOTIFYMSG LOWBATT "UPS %s battery is low"
          NOTIFYMSG FSD "UPS %s: forced shutdown in progress"
          NOTIFYMSG COMMOK "Communications with UPS %s established"
          NOTIFYMSG COMMBAD "Communications with UPS %s lost"
          NOTIFYMSG SHUTDOWN "Auto logout and shutdown proceeding"
          NOTIFYMSG REPLBATT "UPS %s battery needs replacing"

          # Log all events to syslog
          NOTIFYFLAG ONLINE SYSLOG
          NOTIFYFLAG ONBATT SYSLOG
          NOTIFYFLAG LOWBATT SYSLOG
          NOTIFYFLAG FSD SYSLOG
          NOTIFYFLAG COMMOK SYSLOG
          NOTIFYFLAG COMMBAD SYSLOG
          NOTIFYFLAG SHUTDOWN SYSLOG
          NOTIFYFLAG REPLBATT SYSLOG
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    # ------------------------------------------------------------------
    # Verify late-stage shutdown script
    # ------------------------------------------------------------------
    - name: Verify nutshutdown script exists
      stat:
        path: /lib/systemd/system-shutdown/nutshutdown
      register: nutshutdown_script

    - name: Warn if nutshutdown script is missing
      debug:
        msg: "WARNING: /lib/systemd/system-shutdown/nutshutdown not found. UPS may not cut power after shutdown."
      when: not nutshutdown_script.stat.exists

    # ------------------------------------------------------------------
    # Services
    # ------------------------------------------------------------------
    - name: Enable and start NUT driver enumerator
      systemd:
        name: nut-driver-enumerator
        enabled: true
        state: started

    - name: Enable and start NUT server
      systemd:
        name: nut-server
        enabled: true
        state: started

    - name: Enable and start NUT monitor
      systemd:
        name: nut-monitor
        enabled: true
        state: started

    # ------------------------------------------------------------------
    # Verification
    # ------------------------------------------------------------------
    - name: Wait for NUT services to stabilize
      pause:
        seconds: 3

    - name: Verify NUT can communicate with UPS
      command: upsc {{ ups_name }}@localhost
      register: upsc_output
      changed_when: false
      failed_when: upsc_output.rc != 0

    - name: Display UPS status
      debug:
        msg: "{{ upsc_output.stdout_lines }}"

    - name: Get UPS status summary
      shell: |
        echo "Status: $(upsc {{ ups_name }}@localhost ups.status 2>/dev/null)"
        echo "Battery: $(upsc {{ ups_name }}@localhost battery.charge 2>/dev/null)%"
        echo "Runtime: $(upsc {{ ups_name }}@localhost battery.runtime 2>/dev/null)s"
        echo "Load: $(upsc {{ ups_name }}@localhost ups.load 2>/dev/null)%"
      register: ups_summary
      changed_when: false

    - name: Display UPS summary
      debug:
        msg: "{{ ups_summary.stdout_lines }}"

    - name: Verify low battery thresholds
      shell: |
        echo "Runtime threshold: $(upsc {{ ups_name }}@localhost battery.runtime.low 2>/dev/null)s"
        echo "Charge threshold: $(upsc {{ ups_name }}@localhost battery.charge.low 2>/dev/null)%"
      register: thresholds
      changed_when: false

    - name: Display low battery thresholds
      debug:
        msg: "{{ thresholds.stdout_lines }}"

  handlers:
    - name: Restart NUT services
      systemd:
        name: "{{ item }}"
        state: restarted
      loop:
        - nut-driver-enumerator
        - nut-server
        - nut-monitor


- name: Setup UPS Heartbeat Monitoring with Uptime Kuma
  hosts: nodito
  become: true
  vars_files:
    - ../../infra_vars.yml
    - ../../services_config.yml
    - ../../infra_secrets.yml
    - nodito_vars.yml
    - nodito_secrets.yml

  vars:
    ups_heartbeat_interval_seconds: 60
    ups_heartbeat_timeout_seconds: 120
    ups_heartbeat_retries: 1
    ups_monitoring_script_dir: /opt/ups-monitoring
    ups_monitoring_script_path: "{{ ups_monitoring_script_dir }}/ups_heartbeat.sh"
    ups_log_file: "{{ ups_monitoring_script_dir }}/ups_heartbeat.log"
    ups_systemd_service_name: ups-heartbeat
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"

  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "ups-{{ host_name.stdout }}"
        monitor_friendly_name: "UPS Status: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    - name: Create Uptime Kuma UPS monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_ups_monitor.py
        content: |
          #!/usr/bin/env python3
          import sys
          import json
          from uptime_kuma_api import UptimeKumaApi

          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              retries = int(sys.argv[8])
              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"

              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
              api.login(username, password)

              monitors = api.get_monitors()
              notifications = api.get_notifications()

              ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
              notification_id_list = {}
              if ntfy_notification:
                  notification_id_list[ntfy_notification['id']] = True

              group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
              if not group:
                  api.add_monitor(type='group', name=group_name)
                  monitors = api.get_monitors()
                  group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)

              existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              monitor_data = {
                  'type': 'push',
                  'name': monitor_name,
                  'parent': group['id'],
                  'interval': interval,
                  'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
                  'maxretries': retries,
                  'description': monitor_description,
                  'notificationIDList': notification_id_list
              }

              if existing_monitor:
                  api.edit_monitor(existing_monitor['id'], **monitor_data)
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              else:
                  api.add_monitor(**monitor_data)
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              result = {
                  'monitor_id': monitor['id'],
                  'push_token': monitor['pushToken'],
                  'group_name': group_name,
                  'group_id': group['id'],
                  'monitor_name': monitor_name
              }
              print(json.dumps(result))

              api.disconnect()

          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: no

    - name: Run Uptime Kuma UPS monitor setup script
      command: >
        {{ ansible_playbook_python }}
        /tmp/setup_uptime_kuma_ups_monitor.py
        "{{ uptime_kuma_api_url }}"
        "{{ uptime_kuma_username }}"
        "{{ uptime_kuma_password }}"
        "{{ uptime_kuma_monitor_group }}"
        "{{ monitor_name }}"
        "{{ monitor_friendly_name }} - Alerts when UPS goes on battery or loses communication"
        "{{ ups_heartbeat_timeout_seconds }}"
        "{{ ups_heartbeat_retries }}"
        "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: no
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL as fact
      set_fact:
        uptime_kuma_ups_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"

    - name: Install required packages for UPS monitoring
      package:
        name:
          - curl
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ ups_monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create UPS heartbeat monitoring script
      copy:
        dest: "{{ ups_monitoring_script_path }}"
        content: |
          #!/bin/bash

          # UPS Heartbeat Monitoring Script
          # Sends heartbeat to Uptime Kuma only when UPS is on mains power
          # When on battery or communication lost, no heartbeat is sent (triggers timeout alert)

          LOG_FILE="{{ ups_log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_ups_push_url }}"
          UPS_NAME="{{ ups_name }}"

          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          send_heartbeat() {
              local message="$1"

              # URL encode the message (escape % first so the escapes added below are not double-encoded)
              local encoded_message
              encoded_message=$(printf '%s\n' "$message" | sed 's/%/%25/g; s/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')

              local response http_code
              response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)

              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Heartbeat sent: $message (HTTP $http_code)"
                  return 0
              else
                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
                  return 1
              fi
          }

          main() {
              local status charge runtime load

              status=$(upsc ${UPS_NAME}@localhost ups.status 2>/dev/null)

              if [ -z "$status" ]; then
                  log_message "ERROR: Cannot communicate with UPS - NOT sending heartbeat"
                  exit 1
              fi

              charge=$(upsc ${UPS_NAME}@localhost battery.charge 2>/dev/null)
              runtime=$(upsc ${UPS_NAME}@localhost battery.runtime 2>/dev/null)
              load=$(upsc ${UPS_NAME}@localhost ups.load 2>/dev/null)

              if [[ "$status" == *"OL"* ]]; then
                  local message="UPS on mains (charge=${charge}% runtime=${runtime}s load=${load}%)"
                  send_heartbeat "$message"
                  exit 0
              else
                  log_message "UPS not on mains power (status=$status) - NOT sending heartbeat"
                  exit 1
              fi
          }

          main
        owner: root
        group: root
        mode: '0755'

    - name: Create systemd service for UPS heartbeat
      copy:
        dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.service"
        content: |
          [Unit]
          Description=UPS Heartbeat Monitor
          After=network.target nut-monitor.service

          [Service]
          Type=oneshot
          ExecStart={{ ups_monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for UPS heartbeat
      copy:
        dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.timer"
        content: |
          [Unit]
          Description=Run UPS Heartbeat Monitor every {{ ups_heartbeat_interval_seconds }} seconds
          Requires={{ ups_systemd_service_name }}.service

          [Timer]
          OnBootSec=1min
          OnUnitActiveSec={{ ups_heartbeat_interval_seconds }}sec
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Reload systemd daemon
      systemd:
        daemon_reload: yes

    - name: Enable and start UPS heartbeat timer
      systemd:
        name: "{{ ups_systemd_service_name }}.timer"
        enabled: yes
        state: started

    - name: Test UPS heartbeat script
      command: "{{ ups_monitoring_script_path }}"
      register: script_test
      changed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "UPS heartbeat script failed - check UPS status and communication"

    - name: Display monitoring configuration
      debug:
        msg:
          - "UPS Monitoring configured successfully"
          - ""
          - "NUT Configuration:"
          - "  UPS Name: {{ ups_name }}"
          - "  UPS Description: {{ ups_desc }}"
          - "  Off Delay: {{ ups_offdelay }}s (time after shutdown before UPS cuts power)"
          - "  On Delay: {{ ups_ondelay }}s (time after mains returns before UPS restores power)"
          - ""
          - "Uptime Kuma Monitoring:"
          - "  Monitor Name: {{ monitor_friendly_name }}"
          - "  Monitor Group: {{ uptime_kuma_monitor_group }}"
          - "  Push URL: {{ uptime_kuma_ups_push_url }}"
          - "  Heartbeat Interval: {{ ups_heartbeat_interval_seconds }}s"
          - "  Timeout: {{ ups_heartbeat_timeout_seconds }}s"
          - ""
          - "Scripts and Services:"
          - "  Script: {{ ups_monitoring_script_path }}"
          - "  Log: {{ ups_log_file }}"
          - "  Service: {{ ups_systemd_service_name }}.service"
          - "  Timer: {{ ups_systemd_service_name }}.timer"

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_ups_monitor.py
        state: absent
      delegate_to: localhost
      become: no

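To spot-check the UPS setup and the heartbeat loop on the host (a sketch, assuming the variable values from the vars file below, i.e. UPS name cyberpower and unit name ups-heartbeat):

    # UPS should report OL (on line) while on mains power
    upsc cyberpower@localhost ups.status
    upsc cyberpower@localhost battery.charge
    upsc cyberpower@localhost battery.runtime

    # Heartbeat timer should be active and firing every 60 seconds
    systemctl list-timers ups-heartbeat.timer
    journalctl -u ups-heartbeat.service -n 10
    tail -n 10 /opt/ups-monitoring/ups_heartbeat.log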
@@ -17,3 +17,12 @@ zfs_pool_name: "proxmox-tank-1"
zfs_disk_1: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN0Z" # First disk for RAID 1 mirror
zfs_disk_2: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN2P" # Second disk for RAID 1 mirror
zfs_pool_mountpoint: "/var/lib/vz"

# UPS Configuration (CyberPower CP900EPFCLCD via USB)
ups_name: cyberpower
ups_desc: "CyberPower CP900EPFCLCD"
ups_driver: usbhid-ups
ups_port: auto
ups_user: counterweight
ups_offdelay: 120 # Seconds after shutdown before UPS cuts outlet power
ups_ondelay: 30 # Seconds after mains returns before UPS restores outlet power
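Once the driver is running, the configured delays can usually be read back from the UPS to confirm they were applied (a sketch; with usbhid-ups these delays typically surface as the standard NUT variables ups.delay.shutdown and ups.delay.start, though the exact names exposed can vary by model and firmware):

    # Should echo back roughly the offdelay / ondelay values set in ups.conf
    upsc cyberpower@localhost ups.delay.shutdown
    upsc cyberpower@localhost ups.delay.start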