Compare commits

3 commits: c6795dc581 ... c6e1a01167

| SHA1 |
|---|
| c6e1a01167 |
| 08281ce349 |
| fe321050c1 |

8 changed files with 1509 additions and 0 deletions
@@ -25,6 +25,7 @@
name:
- ca-certificates
- curl
- gnupg
state: present

- name: Create directory for Docker GPG key

@@ -170,3 +170,499 @@
fail:
|
||||
msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state"
|
||||
when: "'ONLINE' not in final_zfs_status.stdout"
|
||||
|
||||
- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs
|
||||
hosts: nodito
|
||||
become: true
|
||||
vars_files:
|
||||
- ../../infra_vars.yml
|
||||
- ../../services_config.yml
|
||||
- ../../infra_secrets.yml
|
||||
- nodito_vars.yml
|
||||
|
||||
vars:
|
||||
zfs_check_interval_seconds: 86400 # 24 hours
|
||||
zfs_check_timeout_seconds: 90000 # ~25 hours (interval + buffer)
|
||||
zfs_check_retries: 1
|
||||
zfs_monitoring_script_dir: /opt/zfs-monitoring
|
||||
zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh"
|
||||
zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log"
|
||||
zfs_systemd_health_service_name: zfs-health-monitor
|
||||
zfs_systemd_scrub_service_name: zfs-monthly-scrub
|
||||
uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
|
||||
ntfy_topic: "{{ service_settings.ntfy.topic }}"
|
||||
|
||||
tasks:
|
||||
- name: Validate Uptime Kuma configuration
|
||||
assert:
|
||||
that:
|
||||
- uptime_kuma_api_url is defined
|
||||
- uptime_kuma_api_url != ""
|
||||
- uptime_kuma_username is defined
|
||||
- uptime_kuma_username != ""
|
||||
- uptime_kuma_password is defined
|
||||
- uptime_kuma_password != ""
|
||||
fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"
|
||||
|
||||
- name: Get hostname for monitor identification
|
||||
command: hostname
|
||||
register: host_name
|
||||
changed_when: false
|
||||
|
||||
- name: Set monitor name and group based on hostname
|
||||
set_fact:
|
||||
monitor_name: "zfs-health-{{ host_name.stdout }}"
|
||||
monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}"
|
||||
uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"
|
||||
|
||||
- name: Create Uptime Kuma ZFS health monitor setup script
|
||||
copy:
|
||||
dest: /tmp/setup_uptime_kuma_zfs_monitor.py
|
||||
content: |
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
from uptime_kuma_api import UptimeKumaApi
|
||||
|
||||
def main():
|
||||
api_url = sys.argv[1]
|
||||
username = sys.argv[2]
|
||||
password = sys.argv[3]
|
||||
group_name = sys.argv[4]
|
||||
monitor_name = sys.argv[5]
|
||||
monitor_description = sys.argv[6]
|
||||
interval = int(sys.argv[7])
|
||||
retries = int(sys.argv[8])
|
||||
ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
|
||||
|
||||
api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
|
||||
api.login(username, password)
|
||||
|
||||
# Get all monitors
|
||||
monitors = api.get_monitors()
|
||||
|
||||
# Get all notifications and find ntfy notification
|
||||
notifications = api.get_notifications()
|
||||
ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
|
||||
notification_id_list = {}
|
||||
if ntfy_notification:
|
||||
notification_id_list[ntfy_notification['id']] = True
|
||||
|
||||
# Find or create group
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
if not group:
|
||||
group_result = api.add_monitor(type='group', name=group_name)
|
||||
# Refresh to get the full group object with id
|
||||
monitors = api.get_monitors()
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
|
||||
# Find or create/update push monitor
|
||||
existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
monitor_data = {
|
||||
'type': 'push',
|
||||
'name': monitor_name,
|
||||
'parent': group['id'],
|
||||
'interval': interval,
|
||||
'upsideDown': False, # Normal heartbeat mode: receiving pings = healthy
|
||||
'maxretries': retries,
|
||||
'description': monitor_description,
|
||||
'notificationIDList': notification_id_list
|
||||
}
|
||||
|
||||
if existing_monitor:
|
||||
monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
else:
|
||||
monitor_result = api.add_monitor(**monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
# Output result as JSON
|
||||
result = {
|
||||
'monitor_id': monitor['id'],
|
||||
'push_token': monitor['pushToken'],
|
||||
'group_name': group_name,
|
||||
'group_id': group['id'],
|
||||
'monitor_name': monitor_name
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
api.disconnect()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
mode: '0755'
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
||||
- name: Run Uptime Kuma ZFS monitor setup script
|
||||
command: >
|
||||
{{ ansible_playbook_python }}
|
||||
/tmp/setup_uptime_kuma_zfs_monitor.py
|
||||
"{{ uptime_kuma_api_url }}"
|
||||
"{{ uptime_kuma_username }}"
|
||||
"{{ uptime_kuma_password }}"
|
||||
"{{ uptime_kuma_monitor_group }}"
|
||||
"{{ monitor_name }}"
|
||||
"{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}"
|
||||
"{{ zfs_check_timeout_seconds }}"
|
||||
"{{ zfs_check_retries }}"
|
||||
"{{ ntfy_topic }}"
|
||||
register: monitor_setup_result
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
changed_when: false
|
||||
|
||||
- name: Parse monitor setup result
|
||||
set_fact:
|
||||
monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
|
||||
|
||||
- name: Set push URL and monitor ID as facts
|
||||
set_fact:
|
||||
uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
|
||||
uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
|
||||
|
||||
- name: Install required packages for ZFS monitoring
|
||||
package:
|
||||
name:
|
||||
- curl
|
||||
- jq
|
||||
state: present
|
||||
|
||||
- name: Create monitoring script directory
|
||||
file:
|
||||
path: "{{ zfs_monitoring_script_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create ZFS health monitoring script
|
||||
copy:
|
||||
dest: "{{ zfs_monitoring_script_path }}"
|
||||
content: |
|
||||
#!/bin/bash
|
||||
|
||||
# ZFS Pool Health Monitoring Script
|
||||
# Checks ZFS pool health using JSON output and sends heartbeat to Uptime Kuma if healthy
|
||||
# If any issues detected, does NOT send heartbeat (triggers timeout alert)
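# (A heartbeat is simply an HTTP GET to this monitor's Uptime Kuma push URL;
#  if no ping arrives within the monitor's timeout window, Uptime Kuma marks it
#  down and fires the attached notification.)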
|
||||
|
||||
LOG_FILE="{{ zfs_log_file }}"
|
||||
UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}"
|
||||
POOL_NAME="{{ zfs_pool_name }}"
|
||||
HOSTNAME=$(hostname)
|
||||
|
||||
# Function to log messages
|
||||
log_message() {
|
||||
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
|
||||
}
|
||||
|
||||
# Function to check pool health using JSON output
|
||||
check_pool_health() {
|
||||
local pool="$1"
|
||||
local issues_found=0
|
||||
|
||||
# Get pool status as JSON
|
||||
local pool_json
|
||||
pool_json=$(zpool status -j "$pool" 2>&1)
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
log_message "ERROR: Failed to get pool status for $pool"
|
||||
log_message " -> $pool_json"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Check 1: Pool state must be ONLINE
|
||||
local pool_state
|
||||
pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state')
|
||||
|
||||
if [ "$pool_state" != "ONLINE" ]; then
|
||||
log_message "ISSUE: Pool state is $pool_state (expected ONLINE)"
|
||||
issues_found=1
|
||||
else
|
||||
log_message "OK: Pool state is ONLINE"
|
||||
fi
|
||||
|
||||
# Check 2: Check all vdevs and devices for non-ONLINE states
|
||||
local bad_states
|
||||
bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" '
|
||||
.pools[$pool].vdevs[] |
|
||||
.. | objects |
|
||||
select(.state? and .state != "ONLINE") |
|
||||
"\(.name // "unknown"): \(.state)"
|
||||
' 2>/dev/null)
|
||||
|
||||
if [ -n "$bad_states" ]; then
|
||||
log_message "ISSUE: Found devices not in ONLINE state:"
|
||||
echo "$bad_states" | while read -r line; do
|
||||
log_message " -> $line"
|
||||
done
|
||||
issues_found=1
|
||||
else
|
||||
log_message "OK: All devices are ONLINE"
|
||||
fi
|
||||
|
||||
# Check 3: Check for resilvering in progress
|
||||
local scan_function scan_state
|
||||
scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
|
||||
scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
|
||||
|
||||
if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then
|
||||
local resilver_progress
|
||||
resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"')
|
||||
log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed"
|
||||
issues_found=1
|
||||
fi
|
||||
|
||||
# Check 4: Check for read/write/checksum errors on all devices
|
||||
# Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison
|
||||
local devices_with_errors
|
||||
devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '
|
||||
.pools[$pool].vdevs[] |
|
||||
.. | objects |
|
||||
select(.name? and ((.read_errors // "0" | tonumber) > 0 or (.write_errors // "0" | tonumber) > 0 or (.checksum_errors // "0" | tonumber) > 0)) |
|
||||
"\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)"
|
||||
' 2>/dev/null)
|
||||
|
||||
if [ -n "$devices_with_errors" ]; then
|
||||
log_message "ISSUE: Found devices with I/O errors:"
|
||||
echo "$devices_with_errors" | while read -r line; do
|
||||
log_message " -> $line"
|
||||
done
|
||||
issues_found=1
|
||||
else
|
||||
log_message "OK: No read/write/checksum errors detected"
|
||||
fi
|
||||
|
||||
# Check 5: Check for scan errors (from last scrub/resilver)
|
||||
local scan_errors
|
||||
scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"')
|
||||
|
||||
if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then
|
||||
log_message "ISSUE: Last scan reported $scan_errors errors"
|
||||
issues_found=1
|
||||
else
|
||||
log_message "OK: No scan errors"
|
||||
fi
|
||||
|
||||
return $issues_found
|
||||
}
|
||||
|
||||
# Function to get last scrub info for status message
|
||||
get_scrub_info() {
|
||||
local pool="$1"
|
||||
local pool_json
|
||||
pool_json=$(zpool status -j "$pool" 2>/dev/null)
|
||||
|
||||
local scan_func scan_state scan_start
|
||||
scan_func=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
|
||||
scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
|
||||
scan_start=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.start_time // ""')
|
||||
|
||||
if [ "$scan_func" = "SCRUB" ] && [ "$scan_state" = "SCANNING" ]; then
|
||||
echo "scrub in progress (started $scan_start)"
|
||||
elif [ "$scan_func" = "SCRUB" ] && [ -n "$scan_start" ]; then
|
||||
echo "last scrub: $scan_start"
|
||||
else
|
||||
echo "no scrub history"
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to send heartbeat to Uptime Kuma
|
||||
send_heartbeat() {
|
||||
local message="$1"
|
||||
|
||||
log_message "Sending heartbeat to Uptime Kuma: $message"
|
||||
|
||||
# URL encode the message
|
||||
local encoded_message
|
||||
encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')
|
||||
|
||||
local response http_code
|
||||
response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
|
||||
http_code=$(echo "$response" | tail -n1)
|
||||
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
|
||||
log_message "Heartbeat sent successfully (HTTP $http_code)"
|
||||
return 0
|
||||
else
|
||||
log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Main health check logic
|
||||
main() {
|
||||
log_message "=========================================="
|
||||
log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME"
|
||||
|
||||
# Run all health checks
|
||||
if check_pool_health "$POOL_NAME"; then
|
||||
# All checks passed - send heartbeat
|
||||
local scrub_info
|
||||
scrub_info=$(get_scrub_info "$POOL_NAME")
|
||||
|
||||
local message="Pool $POOL_NAME healthy ($scrub_info)"
|
||||
send_heartbeat "$message"
|
||||
|
||||
log_message "Health check completed: ALL OK"
|
||||
exit 0
|
||||
else
|
||||
# Issues found - do NOT send heartbeat (will trigger timeout alert)
|
||||
log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat"
|
||||
log_message "Uptime Kuma will alert after timeout due to missing heartbeat"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create systemd service for ZFS health monitoring
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=ZFS Pool Health Monitor
|
||||
After=zfs.target network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart={{ zfs_monitoring_script_path }}
|
||||
User=root
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Create systemd timer for daily ZFS health monitoring
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run ZFS Pool Health Monitor daily
|
||||
Requires={{ zfs_systemd_health_service_name }}.service
|
||||
|
||||
[Timer]
|
||||
OnBootSec=5min
|
||||
OnUnitActiveSec={{ zfs_check_interval_seconds }}sec
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Create systemd service for ZFS monthly scrub
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=ZFS Monthly Scrub for {{ zfs_pool_name }}
|
||||
After=zfs.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/sbin/zpool scrub {{ zfs_pool_name }}
|
||||
User=root
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Create systemd timer for monthly ZFS scrub
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run ZFS Scrub on last day of every month at 4:00 AM
|
||||
Requires={{ zfs_systemd_scrub_service_name }}.service
|
||||
|
||||
[Timer]
|
||||
OnCalendar=*-*~01 04:00:00
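# "*-*~01" is systemd's end-of-month syntax (day counted back from the last day
# of the month), i.e. the last day of every month; the schedule can be checked
# with: systemd-analyze calendar '*-*~01 04:00:00'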
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Reload systemd daemon
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable and start ZFS health monitoring timer
|
||||
systemd:
|
||||
name: "{{ zfs_systemd_health_service_name }}.timer"
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: Enable and start ZFS monthly scrub timer
|
||||
systemd:
|
||||
name: "{{ zfs_systemd_scrub_service_name }}.timer"
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: Test ZFS health monitoring script
|
||||
command: "{{ zfs_monitoring_script_path }}"
|
||||
register: script_test
|
||||
changed_when: false
|
||||
|
||||
- name: Verify script execution
|
||||
assert:
|
||||
that:
|
||||
- script_test.rc == 0
|
||||
fail_msg: "ZFS health monitoring script failed - check pool health"
|
||||
|
||||
- name: Display monitoring configuration
|
||||
debug:
|
||||
msg: |
|
||||
✓ ZFS Pool Health Monitoring deployed successfully!
|
||||
|
||||
Monitor Name: {{ monitor_friendly_name }}
|
||||
Monitor Group: {{ uptime_kuma_monitor_group }}
|
||||
Pool Name: {{ zfs_pool_name }}
|
||||
|
||||
Health Check:
|
||||
- Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours)
|
||||
- Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours)
|
||||
- Script: {{ zfs_monitoring_script_path }}
|
||||
- Log: {{ zfs_log_file }}
|
||||
- Service: {{ zfs_systemd_health_service_name }}.service
|
||||
- Timer: {{ zfs_systemd_health_service_name }}.timer
|
||||
|
||||
Monthly Scrub:
|
||||
- Schedule: Last day of month at 4:00 AM
|
||||
- Service: {{ zfs_systemd_scrub_service_name }}.service
|
||||
- Timer: {{ zfs_systemd_scrub_service_name }}.timer
|
||||
|
||||
Conditions monitored:
|
||||
- Pool state (must be ONLINE)
|
||||
- Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL)
|
||||
- Resilver status (alerts if resilvering)
|
||||
- Read/Write/Checksum errors
|
||||
- Scrub errors
|
||||
|
||||
- name: Clean up temporary Uptime Kuma setup script
|
||||
file:
|
||||
path: /tmp/setup_uptime_kuma_zfs_monitor.py
|
||||
state: absent
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
|
|
ansible/infra/nodito/34_nut_ups_setup_playbook.yml (new file, 569 lines)

@@ -0,0 +1,569 @@
- name: Setup NUT (Network UPS Tools) for CyberPower UPS
|
||||
hosts: nodito_host
|
||||
become: true
|
||||
vars_files:
|
||||
- ../../infra_vars.yml
|
||||
- nodito_vars.yml
|
||||
- nodito_secrets.yml
|
||||
|
||||
tasks:
|
||||
# ------------------------------------------------------------------
|
||||
# Installation
|
||||
# ------------------------------------------------------------------
|
||||
- name: Install NUT packages
|
||||
apt:
|
||||
name:
|
||||
- nut
|
||||
- nut-client
|
||||
- nut-server
|
||||
state: present
|
||||
update_cache: true
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Verify UPS is detected
|
||||
# ------------------------------------------------------------------
|
||||
- name: Check if UPS is detected via USB
|
||||
shell: lsusb | grep -i cyber
|
||||
register: lsusb_output
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Display USB detection result
|
||||
debug:
|
||||
msg: "{{ lsusb_output.stdout | default('UPS not detected via USB - ensure it is plugged in') }}"
|
||||
|
||||
- name: Fail if UPS not detected
|
||||
fail:
|
||||
msg: "CyberPower UPS not detected via USB. Ensure the USB cable is connected."
|
||||
when: lsusb_output.rc != 0
|
||||
|
||||
- name: Reload udev rules for USB permissions
|
||||
shell: |
|
||||
udevadm control --reload-rules
|
||||
udevadm trigger --subsystem-match=usb --action=add
|
||||
changed_when: true
|
||||
|
||||
- name: Verify USB device has nut group permissions
|
||||
shell: |
|
||||
BUS_DEV=$(lsusb | grep -i cyber | grep -oP 'Bus \K\d+|Device \K\d+' | tr '\n' '/' | sed 's/\/$//')
|
||||
if [ -n "$BUS_DEV" ]; then
|
||||
BUS=$(echo $BUS_DEV | cut -d'/' -f1)
|
||||
DEV=$(echo $BUS_DEV | cut -d'/' -f2)
|
||||
ls -la /dev/bus/usb/$BUS/$DEV
|
||||
else
|
||||
echo "UPS device not found"
|
||||
exit 1
|
||||
fi
|
||||
register: usb_permissions
|
||||
changed_when: false
|
||||
|
||||
- name: Display USB permissions
|
||||
debug:
|
||||
msg: "{{ usb_permissions.stdout }} (should show 'root nut', not 'root root')"
|
||||
|
||||
- name: Scan for UPS with nut-scanner
|
||||
command: nut-scanner -U
|
||||
register: nut_scanner_output
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Display nut-scanner result
|
||||
debug:
|
||||
msg: "{{ nut_scanner_output.stdout_lines }}"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Configuration files
|
||||
# ------------------------------------------------------------------
|
||||
- name: Configure NUT mode (standalone)
|
||||
copy:
|
||||
dest: /etc/nut/nut.conf
|
||||
content: |
|
||||
# Managed by Ansible
|
||||
MODE=standalone
|
||||
owner: root
|
||||
group: nut
|
||||
mode: "0640"
|
||||
notify: Restart NUT services
|
||||
|
||||
- name: Configure UPS device
|
||||
copy:
|
||||
dest: /etc/nut/ups.conf
|
||||
content: |
|
||||
# Managed by Ansible
|
||||
[{{ ups_name }}]
|
||||
driver = {{ ups_driver }}
|
||||
port = {{ ups_port }}
|
||||
desc = "{{ ups_desc }}"
|
||||
offdelay = {{ ups_offdelay }}
|
||||
ondelay = {{ ups_ondelay }}
|
||||
owner: root
|
||||
group: nut
|
||||
mode: "0640"
|
||||
notify: Restart NUT services
|
||||
|
||||
- name: Configure upsd to listen on localhost
|
||||
copy:
|
||||
dest: /etc/nut/upsd.conf
|
||||
content: |
|
||||
# Managed by Ansible
|
||||
LISTEN 127.0.0.1 3493
|
||||
owner: root
|
||||
group: nut
|
||||
mode: "0640"
|
||||
notify: Restart NUT services
|
||||
|
||||
- name: Configure upsd users
|
||||
copy:
|
||||
dest: /etc/nut/upsd.users
|
||||
content: |
|
||||
# Managed by Ansible
|
||||
[{{ ups_user }}]
|
||||
password = {{ ups_password }}
|
||||
upsmon master
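# ("master" is the classic upsmon role name; NUT 2.8+ also accepts "primary")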
|
||||
owner: root
|
||||
group: nut
|
||||
mode: "0640"
|
||||
notify: Restart NUT services
|
||||
|
||||
- name: Configure upsmon
|
||||
copy:
|
||||
dest: /etc/nut/upsmon.conf
|
||||
content: |
|
||||
# Managed by Ansible
|
||||
MONITOR {{ ups_name }}@localhost 1 {{ ups_user }} {{ ups_password }} master
|
||||
|
||||
MINSUPPLIES 1
|
||||
SHUTDOWNCMD "/sbin/shutdown -h +0"
|
||||
POLLFREQ 5
|
||||
POLLFREQALERT 5
|
||||
HOSTSYNC 15
|
||||
DEADTIME 15
|
||||
POWERDOWNFLAG /etc/killpower
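# upsmon creates this flag file during a low-battery shutdown so that the
# late-stage shutdown hook (nutshutdown) knows to tell the UPS to cut power.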
|
||||
|
||||
# Notifications
|
||||
NOTIFYMSG ONLINE "UPS %s on line power"
|
||||
NOTIFYMSG ONBATT "UPS %s on battery"
|
||||
NOTIFYMSG LOWBATT "UPS %s battery is low"
|
||||
NOTIFYMSG FSD "UPS %s: forced shutdown in progress"
|
||||
NOTIFYMSG COMMOK "Communications with UPS %s established"
|
||||
NOTIFYMSG COMMBAD "Communications with UPS %s lost"
|
||||
NOTIFYMSG SHUTDOWN "Auto logout and shutdown proceeding"
|
||||
NOTIFYMSG REPLBATT "UPS %s battery needs replacing"
|
||||
|
||||
# Log all events to syslog
|
||||
NOTIFYFLAG ONLINE SYSLOG
|
||||
NOTIFYFLAG ONBATT SYSLOG
|
||||
NOTIFYFLAG LOWBATT SYSLOG
|
||||
NOTIFYFLAG FSD SYSLOG
|
||||
NOTIFYFLAG COMMOK SYSLOG
|
||||
NOTIFYFLAG COMMBAD SYSLOG
|
||||
NOTIFYFLAG SHUTDOWN SYSLOG
|
||||
NOTIFYFLAG REPLBATT SYSLOG
|
||||
owner: root
|
||||
group: nut
|
||||
mode: "0640"
|
||||
notify: Restart NUT services
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Verify late-stage shutdown script
|
||||
# ------------------------------------------------------------------
|
||||
- name: Verify nutshutdown script exists
|
||||
stat:
|
||||
path: /lib/systemd/system-shutdown/nutshutdown
|
||||
register: nutshutdown_script
|
||||
|
||||
- name: Warn if nutshutdown script is missing
|
||||
debug:
|
||||
msg: "WARNING: /lib/systemd/system-shutdown/nutshutdown not found. UPS may not cut power after shutdown."
|
||||
when: not nutshutdown_script.stat.exists
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Services
|
||||
# ------------------------------------------------------------------
|
||||
- name: Enable and start NUT driver enumerator
|
||||
systemd:
|
||||
name: nut-driver-enumerator
|
||||
enabled: true
|
||||
state: started
|
||||
|
||||
- name: Enable and start NUT server
|
||||
systemd:
|
||||
name: nut-server
|
||||
enabled: true
|
||||
state: started
|
||||
|
||||
- name: Enable and start NUT monitor
|
||||
systemd:
|
||||
name: nut-monitor
|
||||
enabled: true
|
||||
state: started
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Verification
|
||||
# ------------------------------------------------------------------
|
||||
- name: Wait for NUT services to stabilize
|
||||
pause:
|
||||
seconds: 3
|
||||
|
||||
- name: Verify NUT can communicate with UPS
|
||||
command: upsc {{ ups_name }}@localhost
|
||||
register: upsc_output
|
||||
changed_when: false
|
||||
failed_when: upsc_output.rc != 0
|
||||
|
||||
- name: Display UPS status
|
||||
debug:
|
||||
msg: "{{ upsc_output.stdout_lines }}"
|
||||
|
||||
- name: Get UPS status summary
|
||||
shell: |
|
||||
echo "Status: $(upsc {{ ups_name }}@localhost ups.status 2>/dev/null)"
|
||||
echo "Battery: $(upsc {{ ups_name }}@localhost battery.charge 2>/dev/null)%"
|
||||
echo "Runtime: $(upsc {{ ups_name }}@localhost battery.runtime 2>/dev/null)s"
|
||||
echo "Load: $(upsc {{ ups_name }}@localhost ups.load 2>/dev/null)%"
|
||||
register: ups_summary
|
||||
changed_when: false
|
||||
|
||||
- name: Display UPS summary
|
||||
debug:
|
||||
msg: "{{ ups_summary.stdout_lines }}"
|
||||
|
||||
- name: Verify low battery thresholds
|
||||
shell: |
|
||||
echo "Runtime threshold: $(upsc {{ ups_name }}@localhost battery.runtime.low 2>/dev/null)s"
|
||||
echo "Charge threshold: $(upsc {{ ups_name }}@localhost battery.charge.low 2>/dev/null)%"
|
||||
register: thresholds
|
||||
changed_when: false
|
||||
|
||||
- name: Display low battery thresholds
|
||||
debug:
|
||||
msg: "{{ thresholds.stdout_lines }}"
|
||||
|
||||
handlers:
|
||||
- name: Restart NUT services
|
||||
systemd:
|
||||
name: "{{ item }}"
|
||||
state: restarted
|
||||
loop:
|
||||
- nut-driver-enumerator
|
||||
- nut-server
|
||||
- nut-monitor
|
||||
|
||||
|
||||
- name: Setup UPS Heartbeat Monitoring with Uptime Kuma
|
||||
hosts: nodito
|
||||
become: true
|
||||
vars_files:
|
||||
- ../../infra_vars.yml
|
||||
- ../../services_config.yml
|
||||
- ../../infra_secrets.yml
|
||||
- nodito_vars.yml
|
||||
- nodito_secrets.yml
|
||||
|
||||
vars:
|
||||
ups_heartbeat_interval_seconds: 60
|
||||
ups_heartbeat_timeout_seconds: 120
|
||||
ups_heartbeat_retries: 1
|
||||
ups_monitoring_script_dir: /opt/ups-monitoring
|
||||
ups_monitoring_script_path: "{{ ups_monitoring_script_dir }}/ups_heartbeat.sh"
|
||||
ups_log_file: "{{ ups_monitoring_script_dir }}/ups_heartbeat.log"
|
||||
ups_systemd_service_name: ups-heartbeat
|
||||
uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
|
||||
ntfy_topic: "{{ service_settings.ntfy.topic }}"
|
||||
|
||||
tasks:
|
||||
- name: Validate Uptime Kuma configuration
|
||||
assert:
|
||||
that:
|
||||
- uptime_kuma_api_url is defined
|
||||
- uptime_kuma_api_url != ""
|
||||
- uptime_kuma_username is defined
|
||||
- uptime_kuma_username != ""
|
||||
- uptime_kuma_password is defined
|
||||
- uptime_kuma_password != ""
|
||||
fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"
|
||||
|
||||
- name: Get hostname for monitor identification
|
||||
command: hostname
|
||||
register: host_name
|
||||
changed_when: false
|
||||
|
||||
- name: Set monitor name and group based on hostname
|
||||
set_fact:
|
||||
monitor_name: "ups-{{ host_name.stdout }}"
|
||||
monitor_friendly_name: "UPS Status: {{ host_name.stdout }}"
|
||||
uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"
|
||||
|
||||
- name: Create Uptime Kuma UPS monitor setup script
|
||||
copy:
|
||||
dest: /tmp/setup_uptime_kuma_ups_monitor.py
|
||||
content: |
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
from uptime_kuma_api import UptimeKumaApi
|
||||
|
||||
def main():
|
||||
api_url = sys.argv[1]
|
||||
username = sys.argv[2]
|
||||
password = sys.argv[3]
|
||||
group_name = sys.argv[4]
|
||||
monitor_name = sys.argv[5]
|
||||
monitor_description = sys.argv[6]
|
||||
interval = int(sys.argv[7])
|
||||
retries = int(sys.argv[8])
|
||||
ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
|
||||
|
||||
api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
|
||||
api.login(username, password)
|
||||
|
||||
monitors = api.get_monitors()
|
||||
notifications = api.get_notifications()
|
||||
|
||||
ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
|
||||
notification_id_list = {}
|
||||
if ntfy_notification:
|
||||
notification_id_list[ntfy_notification['id']] = True
|
||||
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
if not group:
|
||||
api.add_monitor(type='group', name=group_name)
|
||||
monitors = api.get_monitors()
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
|
||||
existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
monitor_data = {
|
||||
'type': 'push',
|
||||
'name': monitor_name,
|
||||
'parent': group['id'],
|
||||
'interval': interval,
|
||||
'upsideDown': False, # Normal heartbeat mode: receiving pings = healthy
|
||||
'maxretries': retries,
|
||||
'description': monitor_description,
|
||||
'notificationIDList': notification_id_list
|
||||
}
|
||||
|
||||
if existing_monitor:
|
||||
api.edit_monitor(existing_monitor['id'], **monitor_data)
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
else:
|
||||
api.add_monitor(**monitor_data)
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
result = {
|
||||
'monitor_id': monitor['id'],
|
||||
'push_token': monitor['pushToken'],
|
||||
'group_name': group_name,
|
||||
'group_id': group['id'],
|
||||
'monitor_name': monitor_name
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
api.disconnect()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
mode: '0755'
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
||||
- name: Run Uptime Kuma UPS monitor setup script
|
||||
command: >
|
||||
{{ ansible_playbook_python }}
|
||||
/tmp/setup_uptime_kuma_ups_monitor.py
|
||||
"{{ uptime_kuma_api_url }}"
|
||||
"{{ uptime_kuma_username }}"
|
||||
"{{ uptime_kuma_password }}"
|
||||
"{{ uptime_kuma_monitor_group }}"
|
||||
"{{ monitor_name }}"
|
||||
"{{ monitor_friendly_name }} - Alerts when UPS goes on battery or loses communication"
|
||||
"{{ ups_heartbeat_timeout_seconds }}"
|
||||
"{{ ups_heartbeat_retries }}"
|
||||
"{{ ntfy_topic }}"
|
||||
register: monitor_setup_result
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
changed_when: false
|
||||
|
||||
- name: Parse monitor setup result
|
||||
set_fact:
|
||||
monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
|
||||
|
||||
- name: Set push URL as fact
|
||||
set_fact:
|
||||
uptime_kuma_ups_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
|
||||
|
||||
- name: Install required packages for UPS monitoring
|
||||
package:
|
||||
name:
|
||||
- curl
|
||||
state: present
|
||||
|
||||
- name: Create monitoring script directory
|
||||
file:
|
||||
path: "{{ ups_monitoring_script_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create UPS heartbeat monitoring script
|
||||
copy:
|
||||
dest: "{{ ups_monitoring_script_path }}"
|
||||
content: |
|
||||
#!/bin/bash
|
||||
|
||||
# UPS Heartbeat Monitoring Script
|
||||
# Sends heartbeat to Uptime Kuma only when UPS is on mains power
|
||||
# When on battery or communication lost, no heartbeat is sent (triggers timeout alert)
|
||||
|
||||
LOG_FILE="{{ ups_log_file }}"
|
||||
UPTIME_KUMA_URL="{{ uptime_kuma_ups_push_url }}"
|
||||
UPS_NAME="{{ ups_name }}"
|
||||
|
||||
log_message() {
|
||||
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
|
||||
}
|
||||
|
||||
send_heartbeat() {
|
||||
local message="$1"
|
||||
|
||||
local encoded_message
|
||||
# Encode "%" first so the "%XX" sequences added by the later substitutions are not double-encoded
encoded_message=$(printf '%s\n' "$message" | sed 's/%/%25/g; s/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')
|
||||
|
||||
local response http_code
|
||||
response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
|
||||
http_code=$(echo "$response" | tail -n1)
|
||||
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
|
||||
log_message "Heartbeat sent: $message (HTTP $http_code)"
|
||||
return 0
|
||||
else
|
||||
log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
main() {
|
||||
local status charge runtime load
|
||||
|
||||
status=$(upsc ${UPS_NAME}@localhost ups.status 2>/dev/null)
|
||||
|
||||
if [ -z "$status" ]; then
|
||||
log_message "ERROR: Cannot communicate with UPS - NOT sending heartbeat"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
charge=$(upsc ${UPS_NAME}@localhost battery.charge 2>/dev/null)
|
||||
runtime=$(upsc ${UPS_NAME}@localhost battery.runtime 2>/dev/null)
|
||||
load=$(upsc ${UPS_NAME}@localhost ups.load 2>/dev/null)
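# NUT ups.status flags: "OL" = on line (mains), "OB" = on battery, "LB" = low battery.
# Only send the heartbeat while the UPS reports OL.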
|
||||
|
||||
if [[ "$status" == *"OL"* ]]; then
|
||||
local message="UPS on mains (charge=${charge}% runtime=${runtime}s load=${load}%)"
|
||||
send_heartbeat "$message"
|
||||
exit 0
|
||||
else
|
||||
log_message "UPS not on mains power (status=$status) - NOT sending heartbeat"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
main
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create systemd service for UPS heartbeat
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.service"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=UPS Heartbeat Monitor
|
||||
After=network.target nut-monitor.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart={{ ups_monitoring_script_path }}
|
||||
User=root
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Create systemd timer for UPS heartbeat
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.timer"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run UPS Heartbeat Monitor every {{ ups_heartbeat_interval_seconds }} seconds
|
||||
Requires={{ ups_systemd_service_name }}.service
|
||||
|
||||
[Timer]
|
||||
OnBootSec=1min
|
||||
OnUnitActiveSec={{ ups_heartbeat_interval_seconds }}sec
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Reload systemd daemon
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable and start UPS heartbeat timer
|
||||
systemd:
|
||||
name: "{{ ups_systemd_service_name }}.timer"
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: Test UPS heartbeat script
|
||||
command: "{{ ups_monitoring_script_path }}"
|
||||
register: script_test
|
||||
changed_when: false
|
||||
|
||||
- name: Verify script execution
|
||||
assert:
|
||||
that:
|
||||
- script_test.rc == 0
|
||||
fail_msg: "UPS heartbeat script failed - check UPS status and communication"
|
||||
|
||||
- name: Display monitoring configuration
|
||||
debug:
|
||||
msg:
|
||||
- "UPS Monitoring configured successfully"
|
||||
- ""
|
||||
- "NUT Configuration:"
|
||||
- " UPS Name: {{ ups_name }}"
|
||||
- " UPS Description: {{ ups_desc }}"
|
||||
- " Off Delay: {{ ups_offdelay }}s (time after shutdown before UPS cuts power)"
|
||||
- " On Delay: {{ ups_ondelay }}s (time after mains returns before UPS restores power)"
|
||||
- ""
|
||||
- "Uptime Kuma Monitoring:"
|
||||
- " Monitor Name: {{ monitor_friendly_name }}"
|
||||
- " Monitor Group: {{ uptime_kuma_monitor_group }}"
|
||||
- " Push URL: {{ uptime_kuma_ups_push_url }}"
|
||||
- " Heartbeat Interval: {{ ups_heartbeat_interval_seconds }}s"
|
||||
- " Timeout: {{ ups_heartbeat_timeout_seconds }}s"
|
||||
- ""
|
||||
- "Scripts and Services:"
|
||||
- " Script: {{ ups_monitoring_script_path }}"
|
||||
- " Log: {{ ups_log_file }}"
|
||||
- " Service: {{ ups_systemd_service_name }}.service"
|
||||
- " Timer: {{ ups_systemd_service_name }}.timer"
|
||||
|
||||
- name: Clean up temporary Uptime Kuma setup script
|
||||
file:
|
||||
path: /tmp/setup_uptime_kuma_ups_monitor.py
|
||||
state: absent
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
|
@@ -17,3 +17,12 @@ zfs_pool_name: "proxmox-tank-1"
|
|||
zfs_disk_1: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN0Z" # First disk for RAID 1 mirror
|
||||
zfs_disk_2: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN2P" # Second disk for RAID 1 mirror
|
||||
zfs_pool_mountpoint: "/var/lib/vz"
|
||||
|
||||
# UPS Configuration (CyberPower CP900EPFCLCD via USB)
|
||||
ups_name: cyberpower
|
||||
ups_desc: "CyberPower CP900EPFCLCD"
|
||||
ups_driver: usbhid-ups
|
||||
ups_port: auto
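# usbhid-ups is NUT's generic USB HID driver; "auto" lets it locate the UPS on the USB bus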
|
||||
ups_user: counterweight
|
||||
ups_offdelay: 120 # Seconds after shutdown before UPS cuts outlet power
|
||||
ups_ondelay: 30 # Seconds after mains returns before UPS restores outlet power
|
||||
|
|
|
|||
|
|
@@ -26,3 +26,8 @@ bitcoin_rpc_password: "CHANGE_ME_TO_SECURE_PASSWORD"
|
|||
# Mempool MariaDB credentials
|
||||
# Used by: services/mempool/deploy_mempool_playbook.yml
|
||||
mariadb_mempool_password: "CHANGE_ME_TO_SECURE_PASSWORD"
|
||||
|
||||
# Forgejo Runner registration token
|
||||
# Used by: services/forgejo-runner/deploy_forgejo_runner_playbook.yml
|
||||
# See: services/forgejo-runner/SETUP.md for how to obtain this token
|
||||
forgejo_runner_registration_token: "YOUR_RUNNER_TOKEN_HERE"
|
||||
|
|
|
ansible/services/forgejo-runner/SETUP.md (new file, 28 lines)

@@ -0,0 +1,28 @@
# Forgejo Runner Setup
|
||||
|
||||
## Obtaining the Registration Token
|
||||
|
||||
1. Log in to the Forgejo instance at `https://forgejo.contrapeso.xyz`
|
||||
2. Go to **Site Administration** > **Actions** > **Runners**
|
||||
3. Click **Create new runner**
|
||||
4. Copy the registration token
|
||||
|
||||
## Configuring the Token
|
||||
|
||||
Paste the token into `ansible/infra_secrets.yml`:
|
||||
|
||||
```yaml
|
||||
forgejo_runner_registration_token: "YOUR_TOKEN_HERE"
|
||||
```
|
||||
|
||||
## Running the Playbook
|
||||
|
||||
```bash
|
||||
ansible-playbook ansible/services/forgejo-runner/deploy_forgejo_runner_playbook.yml
|
||||
```
|
||||
|
||||
## Verifying
|
||||
|
||||
1. On the VM: `systemctl status forgejo-runner` should show active
|
||||
2. In Forgejo: **Site Administration** > **Actions** > **Runners** should show the runner as online
|
||||
3. In Uptime Kuma: the `forgejo-runner-healthcheck` push monitor should be receiving pings
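
As an extra manual check, you can run the healthcheck script once on the VM and inspect its log (paths below are the defaults set by the deploy playbook):

```bash
sudo /opt/forgejo-runner-healthcheck/forgejo_runner_healthcheck.sh
tail -n 5 /opt/forgejo-runner-healthcheck/forgejo_runner_healthcheck.log
```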
|
||||
|
|
@@ -0,0 +1,392 @@
|
|||
- name: Install Forgejo Runner on Debian 13
|
||||
hosts: forgejo_runner_local
|
||||
become: yes
|
||||
vars_files:
|
||||
- ../../infra_vars.yml
|
||||
- ../../services_config.yml
|
||||
- ../../infra_secrets.yml
|
||||
- ./forgejo_runner_vars.yml
|
||||
vars:
|
||||
uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
|
||||
ntfy_topic: "{{ service_settings.ntfy.topic }}"
|
||||
healthcheck_interval_seconds: 60
|
||||
healthcheck_timeout_seconds: 90
|
||||
healthcheck_retries: 1
|
||||
healthcheck_script_dir: /opt/forgejo-runner-healthcheck
|
||||
healthcheck_script_path: "{{ healthcheck_script_dir }}/forgejo_runner_healthcheck.sh"
|
||||
healthcheck_log_file: "{{ healthcheck_script_dir }}/forgejo_runner_healthcheck.log"
|
||||
healthcheck_service_name: forgejo-runner-healthcheck
|
||||
|
||||
tasks:
|
||||
# ── 1. Assert Docker is available ──────────────────────────────────
|
||||
- name: Check if Docker is installed
|
||||
command: docker --version
|
||||
register: docker_check
|
||||
changed_when: false
|
||||
failed_when: docker_check.rc != 0
|
||||
|
||||
- name: Fail if Docker is not available
|
||||
assert:
|
||||
that:
|
||||
- docker_check.rc == 0
|
||||
fail_msg: >
|
||||
Docker is not installed or not in PATH.
|
||||
Please install Docker before running this playbook.
|
||||
|
||||
# ── 2. Download forgejo-runner binary ──────────────────────────────
|
||||
- name: Download forgejo-runner binary
|
||||
get_url:
|
||||
url: "{{ forgejo_runner_url }}"
|
||||
dest: "{{ forgejo_runner_bin_path }}"
|
||||
mode: '0755'
|
||||
|
||||
# ── 3. Create runner system user ───────────────────────────────────
|
||||
- name: Create runner system user
|
||||
user:
|
||||
name: "{{ forgejo_runner_user }}"
|
||||
system: yes
|
||||
shell: /usr/sbin/nologin
|
||||
home: "{{ forgejo_runner_dir }}"
|
||||
create_home: no
|
||||
groups: docker
|
||||
append: yes
|
||||
comment: 'Forgejo Runner'
|
||||
|
||||
# ── 4. Create working directory ────────────────────────────────────
|
||||
- name: Create forgejo-runner working directory
|
||||
file:
|
||||
path: "{{ forgejo_runner_dir }}"
|
||||
state: directory
|
||||
owner: "{{ forgejo_runner_user }}"
|
||||
group: "{{ forgejo_runner_user }}"
|
||||
mode: '0750'
|
||||
|
||||
# ── 5. Generate default config ─────────────────────────────────────
|
||||
- name: Check if config already exists
|
||||
stat:
|
||||
path: "{{ forgejo_runner_config_path }}"
|
||||
register: config_stat
|
||||
|
||||
- name: Generate default config
|
||||
shell: "{{ forgejo_runner_bin_path }} generate-config > {{ forgejo_runner_config_path }}"
|
||||
args:
|
||||
chdir: "{{ forgejo_runner_dir }}"
|
||||
when: not config_stat.stat.exists
|
||||
|
||||
- name: Set config file ownership
|
||||
file:
|
||||
path: "{{ forgejo_runner_config_path }}"
|
||||
owner: "{{ forgejo_runner_user }}"
|
||||
group: "{{ forgejo_runner_user }}"
|
||||
when: not config_stat.stat.exists
|
||||
|
||||
# ── 6. Register runner ─────────────────────────────────────────────
|
||||
- name: Check if runner is already registered
|
||||
stat:
|
||||
path: "{{ forgejo_runner_dir }}/.runner"
|
||||
register: runner_stat
|
||||
|
||||
- name: Register runner with Forgejo instance
|
||||
command: >
|
||||
{{ forgejo_runner_bin_path }} register --no-interactive
|
||||
--instance {{ forgejo_instance_url }}
|
||||
--token {{ forgejo_runner_registration_token }}
|
||||
--name forgejo-runner-box
|
||||
--labels "{{ forgejo_runner_labels }}"
|
||||
args:
|
||||
chdir: "{{ forgejo_runner_dir }}"
|
||||
when: not runner_stat.stat.exists
|
||||
|
||||
- name: Set runner registration file ownership
|
||||
file:
|
||||
path: "{{ forgejo_runner_dir }}/.runner"
|
||||
owner: "{{ forgejo_runner_user }}"
|
||||
group: "{{ forgejo_runner_user }}"
|
||||
when: not runner_stat.stat.exists
|
||||
|
||||
# ── 7. Create systemd service ──────────────────────────────────────
|
||||
- name: Create forgejo-runner systemd service
|
||||
copy:
|
||||
dest: /etc/systemd/system/forgejo-runner.service
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Forgejo Runner
|
||||
Documentation=https://forgejo.org/docs/latest/admin/actions/
|
||||
After=docker.service
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User={{ forgejo_runner_user }}
|
||||
Group={{ forgejo_runner_user }}
|
||||
WorkingDirectory={{ forgejo_runner_dir }}
|
||||
ExecStart={{ forgejo_runner_bin_path }} daemon --config {{ forgejo_runner_config_path }}
|
||||
Restart=on-failure
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
# ── 8. Reload systemd, enable and start ────────────────────────────
|
||||
- name: Reload systemd
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable and start forgejo-runner service
|
||||
systemd:
|
||||
name: forgejo-runner
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
# ── 9. Verify runner is active ─────────────────────────────────────
|
||||
- name: Verify forgejo-runner is active
|
||||
command: systemctl is-active forgejo-runner
|
||||
register: runner_active
|
||||
changed_when: false
|
||||
|
||||
- name: Assert runner is running
|
||||
assert:
|
||||
that:
|
||||
- runner_active.stdout == "active"
|
||||
fail_msg: "forgejo-runner service is not active: {{ runner_active.stdout }}"
|
||||
|
||||
# ── 10. Set up Uptime Kuma push monitor ────────────────────────────
|
||||
- name: Create Uptime Kuma push monitor setup script
|
||||
copy:
|
||||
dest: /tmp/setup_forgejo_runner_monitor.py
|
||||
content: |
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
from uptime_kuma_api import UptimeKumaApi
|
||||
|
||||
def main():
|
||||
api_url = sys.argv[1]
|
||||
username = sys.argv[2]
|
||||
password = sys.argv[3]
|
||||
group_name = sys.argv[4]
|
||||
monitor_name = sys.argv[5]
|
||||
monitor_description = sys.argv[6]
|
||||
interval = int(sys.argv[7])
|
||||
retries = int(sys.argv[8])
|
||||
ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
|
||||
|
||||
api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)
|
||||
api.login(username, password)
|
||||
|
||||
# Get all monitors
|
||||
monitors = api.get_monitors()
|
||||
|
||||
# Get all notifications and find ntfy notification
|
||||
notifications = api.get_notifications()
|
||||
ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
|
||||
notification_id_list = {}
|
||||
if ntfy_notification:
|
||||
notification_id_list[ntfy_notification['id']] = True
|
||||
|
||||
# Find or create group
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
if not group:
|
||||
group_result = api.add_monitor(type='group', name=group_name)
|
||||
# Refresh to get the full group object with id
|
||||
monitors = api.get_monitors()
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
|
||||
# Find or create/update push monitor
|
||||
existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
monitor_data = {
|
||||
'type': 'push',
|
||||
'name': monitor_name,
|
||||
'parent': group['id'],
|
||||
'interval': interval,
|
||||
'upsideDown': False,
|
||||
'maxretries': retries,
|
||||
'description': monitor_description,
|
||||
'notificationIDList': notification_id_list
|
||||
}
|
||||
|
||||
if existing_monitor:
|
||||
monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
else:
|
||||
monitor_result = api.add_monitor(**monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
result = {
|
||||
'monitor_id': monitor['id'],
|
||||
'push_token': monitor['pushToken'],
|
||||
'group_name': group_name,
|
||||
'group_id': group['id'],
|
||||
'monitor_name': monitor_name
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
api.disconnect()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
mode: '0755'
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
||||
- name: Run Uptime Kuma push monitor setup
|
||||
command: >
|
||||
{{ ansible_playbook_python }}
|
||||
/tmp/setup_forgejo_runner_monitor.py
|
||||
"{{ uptime_kuma_api_url }}"
|
||||
"{{ uptime_kuma_username }}"
|
||||
"{{ uptime_kuma_password }}"
|
||||
"services"
|
||||
"forgejo-runner-healthcheck"
|
||||
"Forgejo Runner healthcheck - ping every {{ healthcheck_interval_seconds }}s"
|
||||
"{{ healthcheck_timeout_seconds }}"
|
||||
"{{ healthcheck_retries }}"
|
||||
"{{ ntfy_topic }}"
|
||||
register: monitor_setup_result
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
changed_when: false
|
||||
|
||||
- name: Parse monitor setup result
|
||||
set_fact:
|
||||
monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
|
||||
|
||||
- name: Set push URL
|
||||
set_fact:
|
||||
uptime_kuma_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
|
||||
|
||||
- name: Create healthcheck script directory
|
||||
file:
|
||||
path: "{{ healthcheck_script_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create forgejo-runner healthcheck script
|
||||
copy:
|
||||
dest: "{{ healthcheck_script_path }}"
|
||||
content: |
|
||||
#!/bin/bash
|
||||
|
||||
# Forgejo Runner Healthcheck Script
|
||||
# Checks if forgejo-runner is active and pings Uptime Kuma on success
|
||||
|
||||
LOG_FILE="{{ healthcheck_log_file }}"
|
||||
UPTIME_KUMA_URL="{{ uptime_kuma_push_url }}"
|
||||
|
||||
log_message() {
|
||||
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
|
||||
}
|
||||
|
||||
main() {
|
||||
if systemctl is-active --quiet forgejo-runner; then
|
||||
log_message "forgejo-runner is active, sending ping"
|
||||
response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=forgejo-runner%20is%20active" 2>&1)
|
||||
http_code=$(echo "$response" | tail -n1)
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
|
||||
log_message "Ping sent successfully (HTTP $http_code)"
|
||||
else
|
||||
log_message "ERROR: Failed to send ping (HTTP $http_code)"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
log_message "ERROR: forgejo-runner is not active"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
main
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create healthcheck systemd service
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ healthcheck_service_name }}.service"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Forgejo Runner Healthcheck
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart={{ healthcheck_script_path }}
|
||||
User=root
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Create healthcheck systemd timer
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ healthcheck_service_name }}.timer"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run Forgejo Runner Healthcheck every minute
|
||||
Requires={{ healthcheck_service_name }}.service
|
||||
|
||||
[Timer]
|
||||
OnBootSec=30sec
|
||||
OnUnitActiveSec={{ healthcheck_interval_seconds }}sec
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Reload systemd for healthcheck units
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable and start healthcheck timer
|
||||
systemd:
|
||||
name: "{{ healthcheck_service_name }}.timer"
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: Test healthcheck script
|
||||
command: "{{ healthcheck_script_path }}"
|
||||
register: healthcheck_test
|
||||
changed_when: false
|
||||
|
||||
- name: Verify healthcheck script works
|
||||
assert:
|
||||
that:
|
||||
- healthcheck_test.rc == 0
|
||||
fail_msg: "Healthcheck script failed to execute properly"
|
||||
|
||||
- name: Display deployment summary
|
||||
debug:
|
||||
msg: |
|
||||
Forgejo Runner deployed successfully!
|
||||
|
||||
Runner Name: forgejo-runner-box
|
||||
Instance: {{ forgejo_instance_url }}
|
||||
Working Directory: {{ forgejo_runner_dir }}
|
||||
Service: forgejo-runner.service ({{ runner_active.stdout }})
|
||||
|
||||
Healthcheck Monitor: forgejo-runner-healthcheck
|
||||
Healthcheck Interval: Every {{ healthcheck_interval_seconds }}s
|
||||
Timeout: {{ healthcheck_timeout_seconds }}s
|
||||
|
||||
- name: Clean up temporary monitor setup script
|
||||
file:
|
||||
path: /tmp/setup_forgejo_runner_monitor.py
|
||||
state: absent
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
ansible/services/forgejo-runner/forgejo_runner_vars.yml (new file, 9 lines)

@@ -0,0 +1,9 @@
forgejo_runner_version: "6.3.1"
|
||||
forgejo_runner_arch: "linux-amd64"
|
||||
forgejo_runner_url: "https://code.forgejo.org/forgejo/runner/releases/download/v{{ forgejo_runner_version }}/forgejo-runner-{{ forgejo_runner_version }}-{{ forgejo_runner_arch }}"
|
||||
forgejo_runner_bin_path: "/usr/local/bin/forgejo-runner"
|
||||
forgejo_runner_user: "runner"
|
||||
forgejo_runner_dir: "/opt/forgejo-runner"
|
||||
forgejo_runner_config_path: "{{ forgejo_runner_dir }}/config.yml"
|
||||
forgejo_runner_labels: "docker:docker://node:20-bookworm,ubuntu-latest:docker://node:20-bookworm,ubuntu-22.04:docker://node:20-bookworm,ubuntu-24.04:docker://node:20-bookworm"
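# Label format is "<label>:docker://<image>"; jobs requesting that label run inside the given container image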
|
||||
forgejo_instance_url: "https://forgejo.contrapeso.xyz"
|
||||