thingies

ups playbook
monitor zfs
2026-02-08 18:22:31 +01:00 · 2026-01-11 22:43:27 +01:00 · 2026-01-04 23:19:19 +01:00
8 changed files with 1509 additions and 0 deletions
--- a/ansible/infra/910_docker_playbook.yml
+++ b/ansible/infra/910_docker_playbook.yml
@ -25,6 +25,7 @@
        name:
          - ca-certificates
          - curl
+          - gnupg
        state: present

    - name: Create directory for Docker GPG key
--- a/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml
+++ b/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml
@ -170,3 +170,499 @@
      fail:
        msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state"
      when: "'ONLINE' not in final_zfs_status.stdout"
+
+- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs
+  hosts: nodito
+  become: true
+  vars_files:
+    - ../../infra_vars.yml
+    - ../../services_config.yml
+    - ../../infra_secrets.yml
+    - nodito_vars.yml
+
+  vars:
+    # Uptime Kuma endpoint and ntfy topic, assembled from variables that are
+    # not defined in this play (presumably supplied by the vars_files above).
+    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
+    ntfy_topic: "{{ service_settings.ntfy.topic }}"
+
+    # Scheduling: the systemd timer repeats every "interval"; the "timeout"
+    # value is what gets passed to the monitor setup script as the push
+    # monitor's interval.
+    zfs_check_interval_seconds: 86400  # 24 hours
+    zfs_check_timeout_seconds: 90000   # ~25 hours (interval + buffer)
+    zfs_check_retries: 1
+
+    # Locations of the health-check script and its log on the target host.
+    zfs_monitoring_script_dir: /opt/zfs-monitoring
+    zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh"
+    zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log"
+
+    # Base names of the systemd service/timer pairs created below.
+    zfs_systemd_health_service_name: zfs-health-monitor
+    zfs_systemd_scrub_service_name: zfs-monthly-scrub
+  tasks:
+    # Abort early unless the API URL and both credentials are present and non-empty.
+    - name: Validate Uptime Kuma configuration
+      assert:
+        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"
+        that:
+          - uptime_kuma_api_url is defined and uptime_kuma_api_url != ""
+          - uptime_kuma_username is defined and uptime_kuma_username != ""
+          - uptime_kuma_password is defined and uptime_kuma_password != ""
+
+    # Read-only lookup of the target's hostname; never reported as a change.
+    - name: Get hostname for monitor identification
+      command:
+        cmd: hostname
+      changed_when: false
+      register: host_name
+
+    # Derive the monitor's identifiers and its group name from that hostname.
+    - name: Set monitor name and group based on hostname
+      set_fact:
+        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"
+        monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}"
+        monitor_name: "zfs-health-{{ host_name.stdout }}"
+
+    # Writes a helper script on the control node (delegate_to: localhost).
+    # The script logs in to Uptime Kuma, makes sure the group and the push
+    # monitor exist (creating or updating them), and prints one JSON object
+    # with the monitor id, push token and group details on stdout.
+    - name: Create Uptime Kuma ZFS health monitor setup script
+      copy:
+        dest: /tmp/setup_uptime_kuma_zfs_monitor.py
+        content: |
+          #!/usr/bin/env python3
+          """Create or update an Uptime Kuma push monitor and print its details as JSON.
+
+          Positional arguments: api_url username password group_name monitor_name
+          monitor_description interval retries [ntfy_topic]
+          """
+          import sys
+          import json
+          from uptime_kuma_api import UptimeKumaApi
+
+
+          def find_monitor(monitors, name, monitor_type=None):
+              """Return the first monitor called `name` (optionally of `monitor_type`), else None."""
+              for candidate in monitors:
+                  if candidate.get('name') != name:
+                      continue
+                  if monitor_type is not None and candidate.get('type') != monitor_type:
+                      continue
+                  return candidate
+              return None
+
+
+          def main():
+              api_url = sys.argv[1]
+              username = sys.argv[2]
+              password = sys.argv[3]
+              group_name = sys.argv[4]
+              monitor_name = sys.argv[5]
+              monitor_description = sys.argv[6]
+              interval = int(sys.argv[7])
+              retries = int(sys.argv[8])
+              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
+
+              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
+              try:
+                  api.login(username, password)
+
+                  monitors = api.get_monitors()
+
+                  # Attach the ntfy notification when one with the expected name exists
+                  notifications = api.get_notifications()
+                  ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
+                  notification_id_list = {}
+                  if ntfy_notification:
+                      notification_id_list[ntfy_notification['id']] = True
+
+                  # Find or create group
+                  group = find_monitor(monitors, group_name, 'group')
+                  if not group:
+                      api.add_monitor(type='group', name=group_name)
+                      # Refresh to get the full group object with id
+                      monitors = api.get_monitors()
+                      group = find_monitor(monitors, group_name, 'group')
+                  if not group:
+                      sys.exit(f"Uptime Kuma group '{group_name}' could not be found or created")
+
+                  # Find or create/update push monitor
+                  existing_monitor = find_monitor(monitors, monitor_name)
+
+                  monitor_data = {
+                      'type': 'push',
+                      'name': monitor_name,
+                      'parent': group['id'],
+                      'interval': interval,
+                      'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
+                      'maxretries': retries,
+                      'description': monitor_description,
+                      'notificationIDList': notification_id_list
+                  }
+
+                  if existing_monitor:
+                      api.edit_monitor(existing_monitor['id'], **monitor_data)
+                  else:
+                      api.add_monitor(**monitor_data)
+
+                  # Refresh to get the full monitor object with pushToken
+                  monitor = find_monitor(api.get_monitors(), monitor_name)
+                  if not monitor:
+                      sys.exit(f"Uptime Kuma monitor '{monitor_name}' could not be found after create/update")
+
+                  # Output result as JSON
+                  result = {
+                      'monitor_id': monitor['id'],
+                      'push_token': monitor['pushToken'],
+                      'group_name': group_name,
+                      'group_id': group['id'],
+                      'monitor_name': monitor_name
+                  }
+                  print(json.dumps(result))
+              finally:
+                  # Always close the API session, including on errors and sys.exit()
+                  api.disconnect()
+
+          if __name__ == '__main__':
+              main()
+        mode: '0755'
+      delegate_to: localhost
+      become: no
+
+    # Runs the helper on the control node with the playbook's Python
+    # interpreter. Its stdout (a JSON object) is registered for the next task.
+    # no_log keeps the command line, which contains the Uptime Kuma password,
+    # out of Ansible's output and logs; disable it temporarily when debugging
+    # a failure of this task.
+    - name: Run Uptime Kuma ZFS monitor setup script
+      command: >
+        {{ ansible_playbook_python }}
+        /tmp/setup_uptime_kuma_zfs_monitor.py
+        "{{ uptime_kuma_api_url }}"
+        "{{ uptime_kuma_username }}"
+        "{{ uptime_kuma_password }}"
+        "{{ uptime_kuma_monitor_group }}"
+        "{{ monitor_name }}"
+        "{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}"
+        "{{ zfs_check_timeout_seconds }}"
+        "{{ zfs_check_retries }}"
+        "{{ ntfy_topic }}"
+      register: monitor_setup_result
+      delegate_to: localhost
+      become: no
+      changed_when: false
+      no_log: true
+
+    # Decode the JSON object printed by the setup script.
+    - name: Parse monitor setup result
+      set_fact:
+        monitor_info_parsed: >-
+          {{ monitor_setup_result.stdout | from_json }}
+
+    # Build the push endpoint from the API base URL and the monitor's token.
+    - name: Set push URL and monitor ID as facts
+      set_fact:
+        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
+        uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
+
+    # curl and jq are both invoked by the health-check script created below.
+    - name: Install required packages for ZFS monitoring
+      package:
+        state: present
+        name:
+          - curl
+          - jq
+
+    - name: Create monitoring script directory
+      file:
+        state: directory
+        path: "{{ zfs_monitoring_script_dir }}"
+        mode: "0755"
+        owner: root
+        group: root
+
+    - name: Create ZFS health monitoring script
+      copy:
+        dest: "{{ zfs_monitoring_script_path }}"
+        content: |
+          #!/bin/bash
+
+          # ZFS Pool Health Monitoring Script
+          # Reads the pool status as JSON and pings Uptime Kuma only when every check passes.
+          # When a problem is found no ping is sent, so the push monitor times out and alerts.
+
+          LOG_FILE="{{ zfs_log_file }}"
+          UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}"
+          POOL_NAME="{{ zfs_pool_name }}"
+          HOSTNAME=$(hostname)
+
+          # Append one timestamped line to the log file
+          log_message() {
+              printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >> "$LOG_FILE"
+          }
+          
+          # Function to check pool health using JSON output
+          # Argument: $1 = pool name.
+          # Returns 0 when every check passes and 1 otherwise. A check whose jq
+          # query fails is counted as an issue instead of silently passing.
+          check_pool_health() {
+              local pool="$1"
+              local issues_found=0
+
+              # Get pool status as JSON
+              local pool_json
+              if ! pool_json=$(zpool status -j "$pool" 2>&1); then
+                  log_message "ERROR: Failed to get pool status for $pool"
+                  log_message "  -> $pool_json"
+                  return 1
+              fi
+
+              # Check 1: Pool state must be ONLINE
+              # (a jq failure leaves pool_state empty, which is reported as an issue)
+              local pool_state
+              pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state')
+
+              if [ "$pool_state" != "ONLINE" ]; then
+                  log_message "ISSUE: Pool state is $pool_state (expected ONLINE)"
+                  issues_found=1
+              else
+                  log_message "OK: Pool state is ONLINE"
+              fi
+
+              # Check 2: Check all vdevs and devices for non-ONLINE states
+              local bad_states
+              if ! bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" '
+                  .pools[$pool].vdevs[] |
+                  .. | objects |
+                  select(.state? and .state != "ONLINE") |
+                  "\(.name // "unknown"): \(.state)"
+              ' 2>&1); then
+                  log_message "ISSUE: Could not evaluate device states (jq failed): $bad_states"
+                  issues_found=1
+              elif [ -n "$bad_states" ]; then
+                  log_message "ISSUE: Found devices not in ONLINE state:"
+                  echo "$bad_states" | while read -r line; do
+                      log_message "  -> $line"
+                  done
+                  issues_found=1
+              else
+                  log_message "OK: All devices are ONLINE"
+              fi
+
+              # Check 3: Check for resilvering in progress
+              local scan_function scan_state
+              scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
+              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
+
+              if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then
+                  local resilver_progress
+                  resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"')
+                  log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed"
+                  issues_found=1
+              fi
+
+              # Check 4: Check for read/write/checksum errors on all devices
+              # Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison
+              local devices_with_errors
+              if ! devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '
+                  .pools[$pool].vdevs[] |
+                  .. | objects |
+                  select(.name? and ((.read_errors // "0" | tonumber) > 0 or (.write_errors // "0" | tonumber) > 0 or (.checksum_errors // "0" | tonumber) > 0)) |
+                  "\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)"
+              ' 2>&1); then
+                  log_message "ISSUE: Could not evaluate device error counters (jq failed): $devices_with_errors"
+                  issues_found=1
+              elif [ -n "$devices_with_errors" ]; then
+                  log_message "ISSUE: Found devices with I/O errors:"
+                  echo "$devices_with_errors" | while read -r line; do
+                      log_message "  -> $line"
+                  done
+                  issues_found=1
+              else
+                  log_message "OK: No read/write/checksum errors detected"
+              fi
+
+              # Check 5: Check for scan errors (from last scrub/resilver)
+              local scan_errors
+              scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"')
+
+              if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then
+                  log_message "ISSUE: Last scan reported $scan_errors errors"
+                  issues_found=1
+              else
+                  log_message "OK: No scan errors"
+              fi
+
+              return $issues_found
+          }
+          
+          # Print a short description of the pool's scrub state for the heartbeat message
+          # Argument: $1 = pool name.
+          get_scrub_info() {
+              local target_pool="$1"
+              local status_json
+              status_json=$(zpool status -j "$target_pool" 2>/dev/null)
+
+              local fn state started
+              fn=$(jq -r --arg pool "$target_pool" '.pools[$pool].scan_stats.function // "NONE"' <<< "$status_json")
+              state=$(jq -r --arg pool "$target_pool" '.pools[$pool].scan_stats.state // "NONE"' <<< "$status_json")
+              started=$(jq -r --arg pool "$target_pool" '.pools[$pool].scan_stats.start_time // ""' <<< "$status_json")
+
+              # Anything other than a scrub as the last scan counts as "no scrub history"
+              if [ "$fn" != "SCRUB" ]; then
+                  echo "no scrub history"
+              elif [ "$state" = "SCANNING" ]; then
+                  echo "scrub in progress (started $started)"
+              elif [ -n "$started" ]; then
+                  echo "last scrub: $started"
+              else
+                  echo "no scrub history"
+              fi
+          }
+          
+          # Function to send heartbeat to Uptime Kuma
+          # Argument: $1 = status message shown in Uptime Kuma.
+          # Returns 0 on HTTP 200/201, 1 on any other outcome (including timeouts).
+          send_heartbeat() {
+              local message="$1"
+
+              log_message "Sending heartbeat to Uptime Kuma: $message"
+
+              # -G sends the data as a query string on a GET request and
+              # --data-urlencode lets curl percent-encode the whole message.
+              # The timeouts keep a stalled connection from blocking this
+              # script (and the oneshot unit that runs it) indefinitely.
+              local response http_code
+              response=$(curl -s --connect-timeout 10 --max-time 30 -G \
+                  --data-urlencode "status=up" \
+                  --data-urlencode "msg=$message" \
+                  -w "\n%{http_code}" "$UPTIME_KUMA_URL" 2>&1)
+              http_code=$(echo "$response" | tail -n1)
+
+              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
+                  log_message "Heartbeat sent successfully (HTTP $http_code)"
+                  return 0
+              else
+                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
+                  return 1
+              fi
+          }
+          
+          # Main health check logic
+          # Exit status: 0 only when the pool is healthy AND the heartbeat was
+          # delivered; 1 when issues were found or the heartbeat could not be sent.
+          main() {
+              log_message "=========================================="
+              log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME"
+
+              # Run all health checks
+              if ! check_pool_health "$POOL_NAME"; then
+                  # Issues found - do NOT send heartbeat (will trigger timeout alert)
+                  log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat"
+                  log_message "Uptime Kuma will alert after timeout due to missing heartbeat"
+                  exit 1
+              fi
+
+              # All checks passed - send heartbeat
+              local scrub_info
+              scrub_info=$(get_scrub_info "$POOL_NAME")
+
+              local message="Pool $POOL_NAME healthy ($scrub_info)"
+              if ! send_heartbeat "$message"; then
+                  # Do not report success when Uptime Kuma never received the ping
+                  log_message "Health check completed: pool OK but heartbeat delivery FAILED"
+                  exit 1
+              fi
+
+              log_message "Health check completed: ALL OK"
+              exit 0
+          }
+
+          # Run main function
+          main
+        owner: root
+        group: root
+        mode: '0755'
+
+    # Oneshot unit that runs the health-check script as root, logging to the journal.
+    - name: Create systemd service for ZFS health monitoring
+      copy:
+        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service"
+        owner: root
+        group: root
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=ZFS Pool Health Monitor
+          After=zfs.target network.target
+
+          [Service]
+          Type=oneshot
+          ExecStart={{ zfs_monitoring_script_path }}
+          User=root
+          StandardOutput=journal
+          StandardError=journal
+
+          [Install]
+          WantedBy=multi-user.target
+
+    # Timer for the health-check service: first run 5 minutes after boot,
+    # then every zfs_check_interval_seconds after the service last started.
+    # There is deliberately no Requires= on the service: a timer activates the
+    # service of the same name by default, whereas Requires= would also start
+    # the service every time the timer unit itself is started (e.g. at boot or
+    # when this playbook starts the timer), bypassing OnBootSec.
+    - name: Create systemd timer for daily ZFS health monitoring
+      copy:
+        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer"
+        content: |
+          [Unit]
+          Description=Run ZFS Pool Health Monitor daily
+
+          [Timer]
+          OnBootSec=5min
+          OnUnitActiveSec={{ zfs_check_interval_seconds }}sec
+          Persistent=true
+
+          [Install]
+          WantedBy=timers.target
+        owner: root
+        group: root
+        mode: '0644'
+
+    # Oneshot unit that issues "zpool scrub" for the configured pool as root.
+    - name: Create systemd service for ZFS monthly scrub
+      copy:
+        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service"
+        owner: root
+        group: root
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=ZFS Monthly Scrub for {{ zfs_pool_name }}
+          After=zfs.target
+
+          [Service]
+          Type=oneshot
+          ExecStart=/sbin/zpool scrub {{ zfs_pool_name }}
+          User=root
+          StandardOutput=journal
+          StandardError=journal
+
+          [Install]
+          WantedBy=multi-user.target
+
+    # Calendar timer for the scrub service ("*-*~01" = last day of each month).
+    # There is deliberately no Requires= on the service: the timer activates
+    # the service of the same name by default, whereas Requires= would also
+    # start a scrub every time the timer unit itself is started (every boot
+    # and whenever this playbook starts the timer).
+    - name: Create systemd timer for monthly ZFS scrub
+      copy:
+        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer"
+        content: |
+          [Unit]
+          Description=Run ZFS Scrub on last day of every month at 4:00 AM
+
+          [Timer]
+          OnCalendar=*-*~01 04:00:00
+          Persistent=true
+
+          [Install]
+          WantedBy=timers.target
+        owner: root
+        group: root
+        mode: '0644'
+
+    # Make systemd pick up the unit files written above.
+    - name: Reload systemd daemon
+      systemd:
+        daemon_reload: true
+
+    - name: Enable and start ZFS health monitoring timer
+      systemd:
+        name: "{{ zfs_systemd_health_service_name }}.timer"
+        state: started
+        enabled: true
+
+    - name: Enable and start ZFS monthly scrub timer
+      systemd:
+        name: "{{ zfs_systemd_scrub_service_name }}.timer"
+        state: started
+        enabled: true
+
+    # Run the deployed script once. A non-zero exit must not fail this task,
+    # otherwise the play stops here and the assert below, which carries the
+    # explanatory failure message, is never reached.
+    - name: Test ZFS health monitoring script
+      command: "{{ zfs_monitoring_script_path }}"
+      register: script_test
+      changed_when: false
+      failed_when: false
+
+    - name: Verify script execution
+      assert:
+        that:
+          - script_test.rc == 0
+        fail_msg: "ZFS health monitoring script failed - check pool health"
+
+    # Prints a human-readable summary of what this play deployed. The
+    # "(24 hours)" and "(~25 hours)" annotations are fixed text and do not
+    # follow the zfs_check_* variables if those are changed.
+    - name: Display monitoring configuration
+      debug:
+        msg: |
+          ✓ ZFS Pool Health Monitoring deployed successfully!
+          
+          Monitor Name: {{ monitor_friendly_name }}
+          Monitor Group: {{ uptime_kuma_monitor_group }}
+          Pool Name: {{ zfs_pool_name }}
+          
+          Health Check:
+            - Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours)
+            - Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours)
+            - Script: {{ zfs_monitoring_script_path }}
+            - Log: {{ zfs_log_file }}
+            - Service: {{ zfs_systemd_health_service_name }}.service
+            - Timer: {{ zfs_systemd_health_service_name }}.timer
+          
+          Monthly Scrub:
+            - Schedule: Last day of month at 4:00 AM
+            - Service: {{ zfs_systemd_scrub_service_name }}.service
+            - Timer: {{ zfs_systemd_scrub_service_name }}.timer
+          
+          Conditions monitored:
+            - Pool state (must be ONLINE)
+            - Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL)
+            - Resilver status (alerts if resilvering)
+            - Read/Write/Checksum errors
+            - Scrub errors
+
+    # Remove the helper script from the control node again.
+    - name: Clean up temporary Uptime Kuma setup script
+      delegate_to: localhost
+      become: false
+      file:
+        state: absent
+        path: /tmp/setup_uptime_kuma_zfs_monitor.py
--- a/ansible/infra/nodito/34_nut_ups_setup_playbook.yml
+++ b/ansible/infra/nodito/34_nut_ups_setup_playbook.yml
@ -0,0 +1,569 @@
+# Installs and configures NUT in standalone mode and verifies that the UPS
+# answers. The ups_* variables used below are not defined in this play
+# (presumably they come from nodito_vars.yml / nodito_secrets.yml - confirm).
+# NOTE(review): this play targets "nodito_host" while the other plays in this
+# change target "nodito" - confirm both exist in the inventory.
+- name: Setup NUT (Network UPS Tools) for CyberPower UPS
+  hosts: nodito_host
+  become: true
+  vars_files:
+    - ../../infra_vars.yml
+    - nodito_vars.yml
+    - nodito_secrets.yml
+
+  tasks:
+    # ------------------------------------------------------------------
+    # Installation
+    # ------------------------------------------------------------------
+    - name: Install NUT packages
+      apt:
+        update_cache: true
+        state: present
+        name:
+          - nut
+          - nut-client
+          - nut-server
+
+    # ------------------------------------------------------------------
+    # Verify UPS is detected
+    # ------------------------------------------------------------------
+    # grep exits non-zero when no lsusb line matches "cyber"; failed_when
+    # keeps the task green so the result can be reported before failing.
+    - name: Check if UPS is detected via USB
+      shell: lsusb | grep -i cyber
+      register: lsusb_output
+      changed_when: false
+      failed_when: false
+
+    # stdout is always defined (empty when nothing matched), so default()
+    # needs its second argument set to true to also replace empty values.
+    - name: Display USB detection result
+      debug:
+        msg: "{{ lsusb_output.stdout | default('UPS not detected via USB - ensure it is plugged in', true) }}"
+
+    - name: Fail if UPS not detected
+      fail:
+        msg: "CyberPower UPS not detected via USB. Ensure the USB cable is connected."
+      when: lsusb_output.rc != 0
+
+    # Reloads the udev rules and replays "add" events for the USB subsystem
+    # (presumably so the rules installed with the NUT packages are applied to
+    # the already-connected UPS - confirm). Always reported as changed.
+    - name: Reload udev rules for USB permissions
+      shell: |
+        udevadm control --reload-rules
+        udevadm trigger --subsystem-match=usb --action=add
+      changed_when: true
+
+    # Extracts the bus and device numbers from the lsusb line matching "cyber",
+    # joins them as "<bus>/<device>" and lists that node under /dev/bus/usb.
+    # Exits 1 (failing the task) when no matching line is found.
+    - name: Verify USB device has nut group permissions
+      shell: |
+        BUS_DEV=$(lsusb | grep -i cyber | grep -oP 'Bus \K\d+|Device \K\d+' | tr '\n' '/' | sed 's/\/$//')
+        if [ -n "$BUS_DEV" ]; then
+          BUS=$(echo $BUS_DEV | cut -d'/' -f1)
+          DEV=$(echo $BUS_DEV | cut -d'/' -f2)
+          ls -la /dev/bus/usb/$BUS/$DEV
+        else
+          echo "UPS device not found"
+          exit 1
+        fi
+      register: usb_permissions
+      changed_when: false
+
+    # Informational only: the ownership is displayed but not asserted.
+    - name: Display USB permissions
+      debug:
+        msg: "{{ usb_permissions.stdout }} (should show 'root nut', not 'root root')"
+
+    # Informational USB scan; its exit status is ignored.
+    - name: Scan for UPS with nut-scanner
+      command:
+        cmd: nut-scanner -U
+      failed_when: false
+      changed_when: false
+      register: nut_scanner_output
+
+    - name: Display nut-scanner result
+      debug:
+        msg: "{{ nut_scanner_output.stdout_lines }}"
+
+    # ------------------------------------------------------------------
+    # Configuration files
+    # ------------------------------------------------------------------
+    # Each file below is owned by root:nut with mode 0640 and notifies the
+    # "Restart NUT services" handler when its content changes.
+    - name: Configure NUT mode (standalone)
+      copy:
+        dest: /etc/nut/nut.conf
+        owner: root
+        group: nut
+        mode: "0640"
+        content: |
+          # Managed by Ansible
+          MODE=standalone
+      notify: Restart NUT services
+
+    - name: Configure UPS device
+      copy:
+        dest: /etc/nut/ups.conf
+        owner: root
+        group: nut
+        mode: "0640"
+        content: |
+          # Managed by Ansible
+          [{{ ups_name }}]
+              driver = {{ ups_driver }}
+              port = {{ ups_port }}
+              desc = "{{ ups_desc }}"
+              offdelay = {{ ups_offdelay }}
+              ondelay = {{ ups_ondelay }}
+      notify: Restart NUT services
+
+    - name: Configure upsd to listen on localhost
+      copy:
+        dest: /etc/nut/upsd.conf
+        owner: root
+        group: nut
+        mode: "0640"
+        content: |
+          # Managed by Ansible
+          LISTEN 127.0.0.1 3493
+      notify: Restart NUT services
+
+    # Both files embed the UPS monitoring password; they are written
+    # root:nut with mode 0640.
+    - name: Configure upsd users
+      copy:
+        dest: /etc/nut/upsd.users
+        owner: root
+        group: nut
+        mode: "0640"
+        content: |
+          # Managed by Ansible
+          [{{ ups_user }}]
+              password = {{ ups_password }}
+              upsmon master
+      notify: Restart NUT services
+
+    - name: Configure upsmon
+      copy:
+        dest: /etc/nut/upsmon.conf
+        owner: root
+        group: nut
+        mode: "0640"
+        content: |
+          # Managed by Ansible
+          MONITOR {{ ups_name }}@localhost 1 {{ ups_user }} {{ ups_password }} master
+
+          MINSUPPLIES 1
+          SHUTDOWNCMD "/sbin/shutdown -h +0"
+          POLLFREQ 5
+          POLLFREQALERT 5
+          HOSTSYNC 15
+          DEADTIME 15
+          POWERDOWNFLAG /etc/killpower
+
+          # Notifications
+          NOTIFYMSG ONLINE    "UPS %s on line power"
+          NOTIFYMSG ONBATT    "UPS %s on battery"
+          NOTIFYMSG LOWBATT   "UPS %s battery is low"
+          NOTIFYMSG FSD       "UPS %s: forced shutdown in progress"
+          NOTIFYMSG COMMOK    "Communications with UPS %s established"
+          NOTIFYMSG COMMBAD   "Communications with UPS %s lost"
+          NOTIFYMSG SHUTDOWN  "Auto logout and shutdown proceeding"
+          NOTIFYMSG REPLBATT  "UPS %s battery needs replacing"
+
+          # Log all events to syslog
+          NOTIFYFLAG ONLINE   SYSLOG
+          NOTIFYFLAG ONBATT   SYSLOG
+          NOTIFYFLAG LOWBATT  SYSLOG
+          NOTIFYFLAG FSD      SYSLOG
+          NOTIFYFLAG COMMOK   SYSLOG
+          NOTIFYFLAG COMMBAD  SYSLOG
+          NOTIFYFLAG SHUTDOWN SYSLOG
+          NOTIFYFLAG REPLBATT SYSLOG
+      notify: Restart NUT services
+
+    # ------------------------------------------------------------------
+    # Verify late-stage shutdown script
+    # ------------------------------------------------------------------
+    # Only warns (does not fail) when the late-shutdown hook is absent.
+    - name: Verify nutshutdown script exists
+      stat:
+        path: /lib/systemd/system-shutdown/nutshutdown
+      register: nutshutdown_script
+
+    - name: Warn if nutshutdown script is missing
+      when: not nutshutdown_script.stat.exists
+      debug:
+        msg: "WARNING: /lib/systemd/system-shutdown/nutshutdown not found. UPS may not cut power after shutdown."
+
+    # ------------------------------------------------------------------
+    # Services
+    # ------------------------------------------------------------------
+    - name: Enable and start NUT driver enumerator
+      systemd:
+        name: nut-driver-enumerator
+        state: started
+        enabled: true
+
+    - name: Enable and start NUT server
+      systemd:
+        name: nut-server
+        state: started
+        enabled: true
+
+    - name: Enable and start NUT monitor
+      systemd:
+        name: nut-monitor
+        state: started
+        enabled: true
+
+    # ------------------------------------------------------------------
+    # Verification
+    # ------------------------------------------------------------------
+    # Handlers normally run at the end of the play. Flush them here so that
+    # services already running with an older configuration are restarted
+    # before the checks below, which would otherwise test the old config.
+    - name: Apply pending NUT service restarts before verification
+      meta: flush_handlers
+
+    - name: Wait for NUT services to stabilize
+      pause:
+        seconds: 3
+
+    # Fails the play when upsc cannot query the configured UPS.
+    - name: Verify NUT can communicate with UPS
+      command: upsc {{ ups_name }}@localhost
+      register: upsc_output
+      changed_when: false
+      failed_when: upsc_output.rc != 0
+
+    - name: Display UPS status
+      debug:
+        msg: "{{ upsc_output.stdout_lines }}"
+
+    # Read-only queries; upsc errors are discarded, leaving the value blank.
+    - name: Get UPS status summary
+      changed_when: false
+      register: ups_summary
+      shell: |
+        echo "Status: $(upsc {{ ups_name }}@localhost ups.status 2>/dev/null)"
+        echo "Battery: $(upsc {{ ups_name }}@localhost battery.charge 2>/dev/null)%"
+        echo "Runtime: $(upsc {{ ups_name }}@localhost battery.runtime 2>/dev/null)s"
+        echo "Load: $(upsc {{ ups_name }}@localhost ups.load 2>/dev/null)%"
+
+    - name: Display UPS summary
+      debug:
+        msg: "{{ ups_summary.stdout_lines }}"
+
+    - name: Verify low battery thresholds
+      changed_when: false
+      register: thresholds
+      shell: |
+        echo "Runtime threshold: $(upsc {{ ups_name }}@localhost battery.runtime.low 2>/dev/null)s"
+        echo "Charge threshold: $(upsc {{ ups_name }}@localhost battery.charge.low 2>/dev/null)%"
+
+    - name: Display low battery thresholds
+      debug:
+        msg: "{{ thresholds.stdout_lines }}"
+
+  handlers:
+    # Restarts the three NUT units, in the listed order, when a config task notifies.
+    - name: Restart NUT services
+      loop:
+        - nut-driver-enumerator
+        - nut-server
+        - nut-monitor
+      systemd:
+        state: restarted
+        name: "{{ item }}"
+
+
+- name: Setup UPS Heartbeat Monitoring with Uptime Kuma
+  hosts: nodito
+  become: true
+  vars_files:
+    - ../../infra_vars.yml
+    - ../../services_config.yml
+    - ../../infra_secrets.yml
+    - nodito_vars.yml
+    - nodito_secrets.yml
+
+  vars:
+    # Uptime Kuma endpoint and ntfy topic, assembled from variables that are
+    # not defined in this play (presumably supplied by the vars_files above).
+    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
+    ntfy_topic: "{{ service_settings.ntfy.topic }}"
+
+    # Heartbeat timing, in seconds, and the retry count for the monitor.
+    ups_heartbeat_interval_seconds: 60
+    ups_heartbeat_timeout_seconds: 120
+    ups_heartbeat_retries: 1
+
+    # Script and log locations, plus the systemd unit base name.
+    ups_monitoring_script_dir: /opt/ups-monitoring
+    ups_monitoring_script_path: "{{ ups_monitoring_script_dir }}/ups_heartbeat.sh"
+    ups_log_file: "{{ ups_monitoring_script_dir }}/ups_heartbeat.log"
+    ups_systemd_service_name: ups-heartbeat
+
+  tasks:
+    # Abort early unless the API URL and both credentials are present and non-empty.
+    - name: Validate Uptime Kuma configuration
+      assert:
+        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"
+        that:
+          - uptime_kuma_api_url is defined and uptime_kuma_api_url != ""
+          - uptime_kuma_username is defined and uptime_kuma_username != ""
+          - uptime_kuma_password is defined and uptime_kuma_password != ""
+
+    # Read-only lookup of the target's hostname; never reported as a change.
+    - name: Get hostname for monitor identification
+      command:
+        cmd: hostname
+      changed_when: false
+      register: host_name
+
+    # Derive the monitor's identifiers and its group name from that hostname.
+    - name: Set monitor name and group based on hostname
+      set_fact:
+        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"
+        monitor_friendly_name: "UPS Status: {{ host_name.stdout }}"
+        monitor_name: "ups-{{ host_name.stdout }}"
+
+    # Writes a helper script on the control node (delegate_to: localhost).
+    # The script logs in to Uptime Kuma, makes sure the group and the push
+    # monitor exist (creating or updating them), and prints one JSON object
+    # with the monitor id, push token and group details on stdout.
+    - name: Create Uptime Kuma UPS monitor setup script
+      copy:
+        dest: /tmp/setup_uptime_kuma_ups_monitor.py
+        content: |
+          #!/usr/bin/env python3
+          """Create or update an Uptime Kuma push monitor and print its details as JSON.
+
+          Positional arguments: api_url username password group_name monitor_name
+          monitor_description interval retries [ntfy_topic]
+          """
+          import sys
+          import json
+          from uptime_kuma_api import UptimeKumaApi
+
+
+          def find_monitor(monitors, name, monitor_type=None):
+              """Return the first monitor called `name` (optionally of `monitor_type`), else None."""
+              for candidate in monitors:
+                  if candidate.get('name') != name:
+                      continue
+                  if monitor_type is not None and candidate.get('type') != monitor_type:
+                      continue
+                  return candidate
+              return None
+
+
+          def main():
+              api_url = sys.argv[1]
+              username = sys.argv[2]
+              password = sys.argv[3]
+              group_name = sys.argv[4]
+              monitor_name = sys.argv[5]
+              monitor_description = sys.argv[6]
+              interval = int(sys.argv[7])
+              retries = int(sys.argv[8])
+              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
+
+              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
+              try:
+                  api.login(username, password)
+
+                  monitors = api.get_monitors()
+                  notifications = api.get_notifications()
+
+                  # Attach the ntfy notification when one with the expected name exists
+                  ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
+                  notification_id_list = {}
+                  if ntfy_notification:
+                      notification_id_list[ntfy_notification['id']] = True
+
+                  # Find or create group
+                  group = find_monitor(monitors, group_name, 'group')
+                  if not group:
+                      api.add_monitor(type='group', name=group_name)
+                      monitors = api.get_monitors()
+                      group = find_monitor(monitors, group_name, 'group')
+                  if not group:
+                      sys.exit(f"Uptime Kuma group '{group_name}' could not be found or created")
+
+                  existing_monitor = find_monitor(monitors, monitor_name)
+
+                  monitor_data = {
+                      'type': 'push',
+                      'name': monitor_name,
+                      'parent': group['id'],
+                      'interval': interval,
+                      'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
+                      'maxretries': retries,
+                      'description': monitor_description,
+                      'notificationIDList': notification_id_list
+                  }
+
+                  if existing_monitor:
+                      api.edit_monitor(existing_monitor['id'], **monitor_data)
+                  else:
+                      api.add_monitor(**monitor_data)
+
+                  # Refresh to get the full monitor object with pushToken
+                  monitor = find_monitor(api.get_monitors(), monitor_name)
+                  if not monitor:
+                      sys.exit(f"Uptime Kuma monitor '{monitor_name}' could not be found after create/update")
+
+                  result = {
+                      'monitor_id': monitor['id'],
+                      'push_token': monitor['pushToken'],
+                      'group_name': group_name,
+                      'group_id': group['id'],
+                      'monitor_name': monitor_name
+                  }
+                  print(json.dumps(result))
+              finally:
+                  # Always close the API session, including on errors and sys.exit()
+                  api.disconnect()
+
+          if __name__ == '__main__':
+              main()
+        mode: '0755'
+      delegate_to: localhost
+      become: no
+
+    # argv hands every value to the script as exactly one argument, so
+    # credentials containing quotes or spaces are not re-split. no_log keeps
+    # the password (and the push token printed by the script) out of the
+    # Ansible output; remove it temporarily when debugging a failure.
+    - name: Run Uptime Kuma UPS monitor setup script
+      command:
+        argv:
+          - "{{ ansible_playbook_python }}"
+          - /tmp/setup_uptime_kuma_ups_monitor.py
+          - "{{ uptime_kuma_api_url }}"
+          - "{{ uptime_kuma_username }}"
+          - "{{ uptime_kuma_password }}"
+          - "{{ uptime_kuma_monitor_group }}"
+          - "{{ monitor_name }}"
+          - "{{ monitor_friendly_name }} - Alerts when UPS goes on battery or loses communication"
+          - "{{ ups_heartbeat_timeout_seconds | string }}"
+          - "{{ ups_heartbeat_retries | string }}"
+          - "{{ ntfy_topic }}"
+      register: monitor_setup_result
+      delegate_to: localhost
+      become: no
+      changed_when: false
+      no_log: true
+
+    # Decode the JSON document the setup script printed on stdout.
+    - name: Parse monitor setup result
+      set_fact:
+        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
+
+    # Push endpoint = API base URL + /api/push/ + the monitor's push token.
+    - name: Set push URL as fact
+      set_fact:
+        uptime_kuma_ups_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
+
+    # curl is the tool the heartbeat script uses to call the push URL.
+    - name: Install required packages for UPS monitoring
+      package:
+        state: present
+        name:
+          - curl
+
+    # Root-owned directory that holds the heartbeat script.
+    - name: Create monitoring script directory
+      file:
+        state: directory
+        path: "{{ ups_monitoring_script_dir }}"
+        mode: "0755"
+        owner: root
+        group: root
+
+    - name: Create UPS heartbeat monitoring script
+      copy:
+        dest: "{{ ups_monitoring_script_path }}"
+        content: |
+          #!/bin/bash
+
+          # UPS Heartbeat Monitoring Script
+          # Sends heartbeat to Uptime Kuma only when UPS is on mains power
+          # When on battery or communication lost, no heartbeat is sent (triggers timeout alert)
+
+          LOG_FILE="{{ ups_log_file }}"
+          UPTIME_KUMA_URL="{{ uptime_kuma_ups_push_url }}"
+          UPS_NAME="{{ ups_name }}"
+
+          # Append a timestamped line to the log file.
+          log_message() {
+              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
+          }
+
+          # Push an "up" heartbeat carrying $1 as the status message.
+          # Returns 0 on HTTP 200/201, 1 otherwise.
+          send_heartbeat() {
+              local message="$1"
+
+              # Let curl do the percent-encoding: --get turns every
+              # --data-urlencode pair into a query parameter. A hand-rolled sed
+              # encoder that escapes '%' after the other characters re-encodes
+              # its own escapes (%20 becomes %2520) and garbles the message.
+              # --max-time keeps a stalled request from hanging the run.
+              local response http_code
+              response=$(curl -s --max-time 30 -w "\n%{http_code}" --get \
+                  --data-urlencode "status=up" \
+                  --data-urlencode "msg=$message" \
+                  "$UPTIME_KUMA_URL" 2>&1)
+              http_code=$(echo "$response" | tail -n1)
+
+              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
+                  log_message "Heartbeat sent: $message (HTTP $http_code)"
+                  return 0
+              else
+                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
+                  return 1
+              fi
+          }
+
+          # Query the UPS and send a heartbeat only while it reports "OL".
+          # Exits 1 (no heartbeat) when the UPS is unreachable or not on mains.
+          main() {
+              local status charge runtime load
+
+              status=$(upsc "${UPS_NAME}@localhost" ups.status 2>/dev/null)
+
+              if [ -z "$status" ]; then
+                  log_message "ERROR: Cannot communicate with UPS - NOT sending heartbeat"
+                  exit 1
+              fi
+
+              charge=$(upsc "${UPS_NAME}@localhost" battery.charge 2>/dev/null)
+              runtime=$(upsc "${UPS_NAME}@localhost" battery.runtime 2>/dev/null)
+              load=$(upsc "${UPS_NAME}@localhost" ups.load 2>/dev/null)
+
+              if [[ "$status" == *"OL"* ]]; then
+                  local message="UPS on mains (charge=${charge}% runtime=${runtime}s load=${load}%)"
+                  send_heartbeat "$message"
+                  exit 0
+              else
+                  log_message "UPS not on mains power (status=$status) - NOT sending heartbeat"
+                  exit 1
+              fi
+          }
+
+          main
+        owner: root
+        group: root
+        mode: '0755'
+
+    # Oneshot unit that runs the heartbeat script once per activation.
+    - name: Create systemd service for UPS heartbeat
+      copy:
+        dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.service"
+        owner: root
+        group: root
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=UPS Heartbeat Monitor
+          After=network.target nut-monitor.service
+
+          [Service]
+          Type=oneshot
+          ExecStart={{ ups_monitoring_script_path }}
+          User=root
+          StandardOutput=journal
+          StandardError=journal
+
+          [Install]
+          WantedBy=multi-user.target
+
+    # Timer that re-runs the heartbeat service every
+    # ups_heartbeat_interval_seconds, starting one minute after boot.
+    # NOTE(review): systemd documents Persistent= as only affecting
+    # OnCalendar= timers, so it is presumably a no-op here - confirm.
+    - name: Create systemd timer for UPS heartbeat
+      copy:
+        dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.timer"
+        owner: root
+        group: root
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=Run UPS Heartbeat Monitor every {{ ups_heartbeat_interval_seconds }} seconds
+          Requires={{ ups_systemd_service_name }}.service
+
+          [Timer]
+          OnBootSec=1min
+          OnUnitActiveSec={{ ups_heartbeat_interval_seconds }}sec
+          Persistent=true
+
+          [Install]
+          WantedBy=timers.target
+
+    # Make systemd pick up the unit files written above.
+    - name: Reload systemd daemon
+      systemd:
+        daemon_reload: true
+
+    - name: Enable and start UPS heartbeat timer
+      systemd:
+        name: "{{ ups_systemd_service_name }}.timer"
+        state: started
+        enabled: true
+
+    # Run the script once by hand. The command task must not fail on a
+    # non-zero exit code itself, otherwise the assert below (and its
+    # explanatory message) is never reached.
+    - name: Test UPS heartbeat script
+      command: "{{ ups_monitoring_script_path }}"
+      register: script_test
+      changed_when: false
+      failed_when: false
+
+    - name: Verify script execution
+      assert:
+        that:
+          - (script_test.rc | default(1)) == 0
+        fail_msg: "UPS heartbeat script failed - check UPS status and communication"
+
+    # Human-readable summary of what this play configured.
+    # NOTE(review): the push URL embeds the monitor's push token, so printing
+    # it exposes the token in the Ansible output - confirm that is acceptable.
+    - name: Display monitoring configuration
+      debug:
+        msg:
+          - "UPS Monitoring configured successfully"
+          - ""
+          - "NUT Configuration:"
+          - "  UPS Name: {{ ups_name }}"
+          - "  UPS Description: {{ ups_desc }}"
+          - "  Off Delay: {{ ups_offdelay }}s (time after shutdown before UPS cuts power)"
+          - "  On Delay: {{ ups_ondelay }}s (time after mains returns before UPS restores power)"
+          - ""
+          - "Uptime Kuma Monitoring:"
+          - "  Monitor Name: {{ monitor_friendly_name }}"
+          - "  Monitor Group: {{ uptime_kuma_monitor_group }}"
+          - "  Push URL: {{ uptime_kuma_ups_push_url }}"
+          - "  Heartbeat Interval: {{ ups_heartbeat_interval_seconds }}s"
+          - "  Timeout: {{ ups_heartbeat_timeout_seconds }}s"
+          - ""
+          - "Scripts and Services:"
+          - "  Script: {{ ups_monitoring_script_path }}"
+          - "  Log: {{ ups_log_file }}"
+          - "  Service: {{ ups_systemd_service_name }}.service"
+          - "  Timer: {{ ups_systemd_service_name }}.timer"
+
+    # Remove the helper script from the control node again.
+    - name: Clean up temporary Uptime Kuma setup script
+      delegate_to: localhost
+      become: false
+      file:
+        state: absent
+        path: /tmp/setup_uptime_kuma_ups_monitor.py
--- a/ansible/infra/nodito/nodito_vars.yml
+++ b/ansible/infra/nodito/nodito_vars.yml
@ -17,3 +17,12 @@ zfs_pool_name: "proxmox-tank-1"
 zfs_disk_1: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN0Z"  # First disk for RAID 1 mirror
 zfs_disk_2: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN2P"  # Second disk for RAID 1 mirror
 zfs_pool_mountpoint: "/var/lib/vz"
+
+# UPS Configuration (CyberPower CP900EPFCLCD via USB)
+# ups_name is the identifier the heartbeat script queries via `upsc <ups_name>@localhost`.
+ups_name: cyberpower
+ups_desc: "CyberPower CP900EPFCLCD"
+# NOTE(review): presumably the NUT driver/port pair for this USB UPS - confirm against the NUT setup playbook.
+ups_driver: usbhid-ups
+ups_port: auto
+# NOTE(review): presumably the NUT user account - confirm where it is consumed.
+ups_user: counterweight
+ups_offdelay: 120  # Seconds after shutdown before UPS cuts outlet power
+ups_ondelay: 30    # Seconds after mains returns before UPS restores outlet power
--- a/ansible/infra_secrets.yml.example
+++ b/ansible/infra_secrets.yml.example
@ -26,3 +26,8 @@ bitcoin_rpc_password: "CHANGE_ME_TO_SECURE_PASSWORD"
 # Mempool MariaDB credentials
 # Used by: services/mempool/deploy_mempool_playbook.yml
 mariadb_mempool_password: "CHANGE_ME_TO_SECURE_PASSWORD"
+
+# Forgejo Runner registration token
+# Used by: services/forgejo-runner/deploy_forgejo_runner_playbook.yml
+# See: services/forgejo-runner/SETUP.md for how to obtain this token
+# The value below is a placeholder; set the real token in your infra_secrets.yml.
+forgejo_runner_registration_token: "YOUR_RUNNER_TOKEN_HERE"
--- a/ansible/services/forgejo-runner/SETUP.md
+++ b/ansible/services/forgejo-runner/SETUP.md
@ -0,0 +1,28 @@
+# Forgejo Runner Setup
+
+## Obtaining the Registration Token
+
+1. Log in to the Forgejo instance at `https://forgejo.contrapeso.xyz`
+2. Go to **Site Administration** > **Actions** > **Runners**
+3. Click **Create new runner**
+4. Copy the registration token
+
+## Configuring the Token
+
+Set the token in `ansible/infra_secrets.yml` (the key is listed in `ansible/infra_secrets.yml.example`):
+
+```yaml
+forgejo_runner_registration_token: "YOUR_TOKEN_HERE"
+```
+
+## Running the Playbook
+
+```bash
+ansible-playbook ansible/services/forgejo-runner/deploy_forgejo_runner_playbook.yml
+```
+
+## Verifying
+
+1. On the VM: `systemctl status forgejo-runner` should show active
+2. In Forgejo: **Site Administration** > **Actions** > **Runners** should show the runner as online
+3. In Uptime Kuma: the `forgejo-runner-healthcheck` push monitor should be receiving pings
--- a/ansible/services/forgejo-runner/deploy_forgejo_runner_playbook.yml
+++ b/ansible/services/forgejo-runner/deploy_forgejo_runner_playbook.yml
@ -0,0 +1,392 @@
+# Installs the forgejo-runner binary, registers it with the Forgejo instance,
+# runs it under systemd and sets up a push-based healthcheck in Uptime Kuma.
+- name: Install Forgejo Runner on Debian 13
+  hosts: forgejo_runner_local
+  become: true
+  vars_files:
+    - ../../infra_vars.yml
+    - ../../services_config.yml
+    - ../../infra_secrets.yml
+    - ./forgejo_runner_vars.yml
+  vars:
+    # Uptime Kuma / ntfy endpoints
+    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
+    ntfy_topic: "{{ service_settings.ntfy.topic }}"
+    # Healthcheck timing: the timer pings every interval; the timeout value is
+    # what gets passed to the push monitor as its interval.
+    healthcheck_interval_seconds: 60
+    healthcheck_timeout_seconds: 90
+    healthcheck_retries: 1
+    # Healthcheck script, log and unit naming
+    healthcheck_script_dir: /opt/forgejo-runner-healthcheck
+    healthcheck_script_path: "{{ healthcheck_script_dir }}/forgejo_runner_healthcheck.sh"
+    healthcheck_log_file: "{{ healthcheck_script_dir }}/forgejo_runner_healthcheck.log"
+    healthcheck_service_name: forgejo-runner-healthcheck
+
+  tasks:
+    # ── 1. Assert Docker is available ──────────────────────────────────
+    # Probe for Docker without failing: a failing command task would abort the
+    # play before the assert below can print its actionable message.
+    - name: Check if Docker is installed
+      command: docker --version
+      register: docker_check
+      changed_when: false
+      failed_when: false
+
+    - name: Fail if Docker is not available
+      assert:
+        that:
+          - (docker_check.rc | default(1)) == 0
+        fail_msg: >
+          Docker is not installed or not in PATH.
+          Please install Docker before running this playbook.
+
+    # ── 2. Download forgejo-runner binary ──────────────────────────────
+    # Fetch the release binary straight to its install path, executable.
+    # NOTE(review): no checksum is given, so the download is not verified.
+    - name: Download forgejo-runner binary
+      get_url:
+        dest: "{{ forgejo_runner_bin_path }}"
+        url: "{{ forgejo_runner_url }}"
+        mode: "0755"
+
+    # ── 3. Create runner system user ───────────────────────────────────
+    # System account without a login shell; added to the docker group on top
+    # of whatever groups it already has (append).
+    - name: Create runner system user
+      user:
+        name: "{{ forgejo_runner_user }}"
+        comment: "Forgejo Runner"
+        system: true
+        shell: /usr/sbin/nologin
+        home: "{{ forgejo_runner_dir }}"
+        create_home: false
+        groups: docker
+        append: true
+
+    # ── 4. Create working directory ────────────────────────────────────
+    # Runner-owned working directory, not accessible to other users.
+    - name: Create forgejo-runner working directory
+      file:
+        state: directory
+        path: "{{ forgejo_runner_dir }}"
+        mode: "0750"
+        owner: "{{ forgejo_runner_user }}"
+        group: "{{ forgejo_runner_user }}"
+
+    # ── 5. Generate default config ─────────────────────────────────────
+    # The config is generated only once; an existing file is left untouched.
+    - name: Check if config already exists
+      stat:
+        path: "{{ forgejo_runner_config_path }}"
+      register: config_stat
+
+    # NOTE(review): the shell redirect creates the file even when
+    # generate-config fails, after which later runs skip generation.
+    - name: Generate default config
+      when: not config_stat.stat.exists
+      shell:
+        cmd: "{{ forgejo_runner_bin_path }} generate-config > {{ forgejo_runner_config_path }}"
+        chdir: "{{ forgejo_runner_dir }}"
+
+    # The file was written by root above; hand it to the runner user.
+    - name: Set config file ownership
+      when: not config_stat.stat.exists
+      file:
+        path: "{{ forgejo_runner_config_path }}"
+        group: "{{ forgejo_runner_user }}"
+        owner: "{{ forgejo_runner_user }}"
+
+    # ── 6. Register runner ─────────────────────────────────────────────
+    # A .runner file in the working directory marks a completed registration.
+    - name: Check if runner is already registered
+      stat:
+        path: "{{ forgejo_runner_dir }}/.runner"
+      register: runner_stat
+
+    # argv passes the token and the other values as single arguments without
+    # re-splitting them; no_log keeps the registration token out of the
+    # Ansible output (remove it temporarily when debugging a failure).
+    - name: Register runner with Forgejo instance
+      command:
+        argv:
+          - "{{ forgejo_runner_bin_path }}"
+          - register
+          - "--no-interactive"
+          - "--instance"
+          - "{{ forgejo_instance_url }}"
+          - "--token"
+          - "{{ forgejo_runner_registration_token }}"
+          - "--name"
+          - forgejo-runner-box
+          - "--labels"
+          - "{{ forgejo_runner_labels }}"
+        chdir: "{{ forgejo_runner_dir }}"
+      when: not runner_stat.stat.exists
+      no_log: true
+
+    # Registration ran as root; hand the resulting file to the runner user.
+    - name: Set runner registration file ownership
+      file:
+        path: "{{ forgejo_runner_dir }}/.runner"
+        owner: "{{ forgejo_runner_user }}"
+        group: "{{ forgejo_runner_user }}"
+      when: not runner_stat.stat.exists
+
+    # ── 7. Create systemd service ──────────────────────────────────────
+    # Long-running unit for the runner daemon; bound to docker.service and
+    # restarted on failure.
+    - name: Create forgejo-runner systemd service
+      copy:
+        dest: /etc/systemd/system/forgejo-runner.service
+        owner: root
+        group: root
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=Forgejo Runner
+          Documentation=https://forgejo.org/docs/latest/admin/actions/
+          After=docker.service
+          Requires=docker.service
+
+          [Service]
+          Type=simple
+          User={{ forgejo_runner_user }}
+          Group={{ forgejo_runner_user }}
+          WorkingDirectory={{ forgejo_runner_dir }}
+          ExecStart={{ forgejo_runner_bin_path }} daemon --config {{ forgejo_runner_config_path }}
+          Restart=on-failure
+          RestartSec=10
+
+          [Install]
+          WantedBy=multi-user.target
+
+    # ── 8. Reload systemd, enable and start ────────────────────────────
+    # Make systemd pick up the unit file written above.
+    - name: Reload systemd
+      systemd:
+        daemon_reload: true
+
+    - name: Enable and start forgejo-runner service
+      systemd:
+        name: forgejo-runner
+        state: started
+        enabled: true
+
+    # ── 9. Verify runner is active ─────────────────────────────────────
+    # `systemctl is-active` exits non-zero for every state except "active".
+    # Do not fail on that here, so the assert below can report the state.
+    - name: Verify forgejo-runner is active
+      command: systemctl is-active forgejo-runner
+      register: runner_active
+      changed_when: false
+      failed_when: false
+
+    - name: Assert runner is running
+      assert:
+        that:
+          - runner_active.stdout == "active"
+        fail_msg: "forgejo-runner service is not active: {{ runner_active.stdout }}"
+
+    # ── 10. Set up Uptime Kuma push monitor ────────────────────────────
+    # The script is written on the control node (delegate_to: localhost) and
+    # talks to the Uptime Kuma API from there.
+    - name: Create Uptime Kuma push monitor setup script
+      copy:
+        dest: /tmp/setup_forgejo_runner_monitor.py
+        content: |
+          #!/usr/bin/env python3
+          """Create or update a push monitor inside a monitor group in Uptime Kuma.
+
+          Arguments: api_url username password group_name monitor_name
+                     monitor_description interval retries [ntfy_topic]
+
+          On success prints one JSON object with monitor_id, push_token,
+          group_name, group_id and monitor_name on stdout. Exits non-zero with a
+          message on stderr when the group or the monitor cannot be found after
+          it was created or updated.
+          """
+          import sys
+          import json
+          from uptime_kuma_api import UptimeKumaApi
+
+
+          def find_by_name(monitors, name, monitor_type=None):
+              """Return the first monitor called `name`, optionally of one type."""
+              return next(
+                  (m for m in monitors
+                   if m.get('name') == name
+                   and (monitor_type is None or m.get('type') == monitor_type)),
+                  None,
+              )
+
+
+          def main():
+              api_url = sys.argv[1]
+              username = sys.argv[2]
+              password = sys.argv[3]
+              group_name = sys.argv[4]
+              monitor_name = sys.argv[5]
+              monitor_description = sys.argv[6]
+              interval = int(sys.argv[7])
+              retries = int(sys.argv[8])
+              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
+
+              api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)
+              try:
+                  api.login(username, password)
+
+                  # Get all monitors
+                  monitors = api.get_monitors()
+
+                  # Get all notifications and find ntfy notification
+                  notifications = api.get_notifications()
+                  ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
+                  notification_id_list = {}
+                  if ntfy_notification:
+                      notification_id_list[ntfy_notification['id']] = True
+
+                  # Find or create group
+                  group = find_by_name(monitors, group_name, 'group')
+                  if not group:
+                      api.add_monitor(type='group', name=group_name)
+                      # Refresh to get the full group object with id
+                      monitors = api.get_monitors()
+                      group = find_by_name(monitors, group_name, 'group')
+                  if not group:
+                      sys.exit(f"Monitor group '{group_name}' not found after creating it")
+
+                  # Find or create/update push monitor
+                  existing_monitor = find_by_name(monitors, monitor_name)
+
+                  monitor_data = {
+                      'type': 'push',
+                      'name': monitor_name,
+                      'parent': group['id'],
+                      'interval': interval,
+                      'upsideDown': False,
+                      'maxretries': retries,
+                      'description': monitor_description,
+                      'notificationIDList': notification_id_list
+                  }
+
+                  if existing_monitor:
+                      api.edit_monitor(existing_monitor['id'], **monitor_data)
+                  else:
+                      api.add_monitor(**monitor_data)
+
+                  # Refresh to get the full monitor object with pushToken
+                  monitor = find_by_name(api.get_monitors(), monitor_name)
+                  if not monitor:
+                      sys.exit(f"Monitor '{monitor_name}' not found after creating/updating it")
+
+                  result = {
+                      'monitor_id': monitor['id'],
+                      'push_token': monitor['pushToken'],
+                      'group_name': group_name,
+                      'group_id': group['id'],
+                      'monitor_name': monitor_name
+                  }
+                  print(json.dumps(result))
+              finally:
+                  # Close the connection even when a step above failed.
+                  api.disconnect()
+
+          if __name__ == '__main__':
+              main()
+        mode: '0755'
+      delegate_to: localhost
+      become: no
+
+    # argv hands every value to the script as exactly one argument, so
+    # credentials containing quotes or spaces are not re-split. no_log keeps
+    # the password (and the push token printed by the script) out of the
+    # Ansible output; remove it temporarily when debugging a failure.
+    - name: Run Uptime Kuma push monitor setup
+      command:
+        argv:
+          - "{{ ansible_playbook_python }}"
+          - /tmp/setup_forgejo_runner_monitor.py
+          - "{{ uptime_kuma_api_url }}"
+          - "{{ uptime_kuma_username }}"
+          - "{{ uptime_kuma_password }}"
+          - services
+          - forgejo-runner-healthcheck
+          - "Forgejo Runner healthcheck - ping every {{ healthcheck_interval_seconds }}s"
+          - "{{ healthcheck_timeout_seconds | string }}"
+          - "{{ healthcheck_retries | string }}"
+          - "{{ ntfy_topic }}"
+      register: monitor_setup_result
+      delegate_to: localhost
+      become: no
+      changed_when: false
+      no_log: true
+
+    # Decode the JSON document the setup script printed on stdout.
+    - name: Parse monitor setup result
+      set_fact:
+        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
+
+    # Push endpoint = API base URL + /api/push/ + the monitor's push token.
+    - name: Set push URL
+      set_fact:
+        uptime_kuma_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
+
+    # Root-owned directory that holds the healthcheck script.
+    - name: Create healthcheck script directory
+      file:
+        state: directory
+        path: "{{ healthcheck_script_dir }}"
+        mode: "0755"
+        owner: root
+        group: root
+
+    - name: Create forgejo-runner healthcheck script
+      copy:
+        dest: "{{ healthcheck_script_path }}"
+        content: |
+          #!/bin/bash
+
+          # Forgejo Runner Healthcheck Script
+          # Checks if forgejo-runner is active and pings Uptime Kuma on success
+
+          LOG_FILE="{{ healthcheck_log_file }}"
+          UPTIME_KUMA_URL="{{ uptime_kuma_push_url }}"
+
+          # Append a timestamped line to the log file.
+          log_message() {
+              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
+          }
+
+          # Ping Uptime Kuma when forgejo-runner is active.
+          # Exits 1 when the service is not active or the ping is not accepted.
+          main() {
+              local response http_code
+
+              if systemctl is-active --quiet forgejo-runner; then
+                  log_message "forgejo-runner is active, sending ping"
+                  # --max-time bounds the request so a stalled connection
+                  # cannot keep this oneshot run waiting indefinitely.
+                  response=$(curl -s --max-time 30 -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=forgejo-runner%20is%20active" 2>&1)
+                  http_code=$(echo "$response" | tail -n1)
+                  if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
+                      log_message "Ping sent successfully (HTTP $http_code)"
+                  else
+                      log_message "ERROR: Failed to send ping (HTTP $http_code)"
+                      exit 1
+                  fi
+              else
+                  log_message "ERROR: forgejo-runner is not active"
+                  exit 1
+              fi
+          }
+
+          main
+        owner: root
+        group: root
+        mode: '0755'
+
+    # Oneshot unit that runs the healthcheck script once per activation.
+    - name: Create healthcheck systemd service
+      copy:
+        dest: "/etc/systemd/system/{{ healthcheck_service_name }}.service"
+        owner: root
+        group: root
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=Forgejo Runner Healthcheck
+          After=network.target
+
+          [Service]
+          Type=oneshot
+          ExecStart={{ healthcheck_script_path }}
+          User=root
+          StandardOutput=journal
+          StandardError=journal
+
+          [Install]
+          WantedBy=multi-user.target
+
+    # Timer that re-runs the healthcheck service every
+    # healthcheck_interval_seconds, starting 30 seconds after boot. The
+    # description is derived from the same variable so it cannot drift from
+    # the configured interval.
+    - name: Create healthcheck systemd timer
+      copy:
+        dest: "/etc/systemd/system/{{ healthcheck_service_name }}.timer"
+        content: |
+          [Unit]
+          Description=Run Forgejo Runner Healthcheck every {{ healthcheck_interval_seconds }} seconds
+          Requires={{ healthcheck_service_name }}.service
+
+          [Timer]
+          OnBootSec=30sec
+          OnUnitActiveSec={{ healthcheck_interval_seconds }}sec
+          Persistent=true
+
+          [Install]
+          WantedBy=timers.target
+        owner: root
+        group: root
+        mode: '0644'
+
+    # Make systemd pick up the healthcheck unit files written above.
+    - name: Reload systemd for healthcheck units
+      systemd:
+        daemon_reload: true
+
+    - name: Enable and start healthcheck timer
+      systemd:
+        name: "{{ healthcheck_service_name }}.timer"
+        state: started
+        enabled: true
+
+    # Run the script once by hand. The command task must not fail on a
+    # non-zero exit code itself, otherwise the assert below (and its
+    # message) is never reached.
+    - name: Test healthcheck script
+      command: "{{ healthcheck_script_path }}"
+      register: healthcheck_test
+      changed_when: false
+      failed_when: false
+
+    - name: Verify healthcheck script works
+      assert:
+        that:
+          - (healthcheck_test.rc | default(1)) == 0
+        fail_msg: "Healthcheck script failed to execute properly"
+
+    # Human-readable recap; the service state shown is the one captured by the
+    # earlier `systemctl is-active` check.
+    - name: Display deployment summary
+      debug:
+        msg: |
+          Forgejo Runner deployed successfully!
+
+          Runner Name: forgejo-runner-box
+          Instance: {{ forgejo_instance_url }}
+          Working Directory: {{ forgejo_runner_dir }}
+          Service: forgejo-runner.service ({{ runner_active.stdout }})
+
+          Healthcheck Monitor: forgejo-runner-healthcheck
+          Healthcheck Interval: Every {{ healthcheck_interval_seconds }}s
+          Timeout: {{ healthcheck_timeout_seconds }}s
+
+    # Remove the helper script from the control node again.
+    - name: Clean up temporary monitor setup script
+      delegate_to: localhost
+      become: false
+      file:
+        state: absent
+        path: /tmp/setup_forgejo_runner_monitor.py
--- a/ansible/services/forgejo-runner/forgejo_runner_vars.yml
+++ b/ansible/services/forgejo-runner/forgejo_runner_vars.yml
@ -0,0 +1,9 @@
+# Runner release to install; version and arch are interpolated into the download URL below.
+forgejo_runner_version: "6.3.1"
+forgejo_runner_arch: "linux-amd64"
+forgejo_runner_url: "https://code.forgejo.org/forgejo/runner/releases/download/v{{ forgejo_runner_version }}/forgejo-runner-{{ forgejo_runner_version }}-{{ forgejo_runner_arch }}"
+# Install path of the binary, the system user it runs as, and its working directory.
+forgejo_runner_bin_path: "/usr/local/bin/forgejo-runner"
+forgejo_runner_user: "runner"
+forgejo_runner_dir: "/opt/forgejo-runner"
+forgejo_runner_config_path: "{{ forgejo_runner_dir }}/config.yml"
+# Comma-separated labels handed to `forgejo-runner register --labels`.
+forgejo_runner_labels: "docker:docker://node:20-bookworm,ubuntu-latest:docker://node:20-bookworm,ubuntu-22.04:docker://node:20-bookworm,ubuntu-24.04:docker://node:20-bookworm"
+# Forgejo instance the runner registers against.
+forgejo_instance_url: "https://forgejo.contrapeso.xyz"
Author	SHA1	Message	Date
counterweight	c6e1a01167	thingies	2026-02-08 18:22:31 +01:00
counterweight	08281ce349	ups playbook	2026-01-11 22:43:27 +01:00
counterweight	fe321050c1	monitor zfs	2026-01-04 23:19:19 +01:00