ups playbook

This commit is contained in:
counterweight 2026-01-11 22:43:27 +01:00
parent fe321050c1
commit 08281ce349
Signed by: counterweight
GPG key ID: 883EDBAA726BD96C
2 changed files with 578 additions and 0 deletions

View file

@ -0,0 +1,569 @@
---
# Play: install Network UPS Tools (NUT) on the UPS-attached host, write its
# configuration in standalone mode, start the services and verify that the
# UPS answers queries.
#
# Variables used (ups_name, ups_driver, ups_port, ups_desc, ups_offdelay,
# ups_ondelay, ups_user, ups_password) are loaded through vars_files.
# NOTE(review): ups_password is presumably defined in nodito_secrets.yml;
# confirm.
- name: Setup NUT (Network UPS Tools) for CyberPower UPS
  hosts: nodito_host
  become: true
  vars_files:
    - ../../infra_vars.yml
    - nodito_vars.yml
    - nodito_secrets.yml

  tasks:
    # ------------------------------------------------------------------
    # Installation
    # ------------------------------------------------------------------
    - name: Install NUT packages
      apt:
        name:
          - nut
          - nut-client
          - nut-server
        state: present
        update_cache: true

    # ------------------------------------------------------------------
    # Verify UPS is detected
    # ------------------------------------------------------------------
    # Never fails on its own: the result is reported first and the explicit
    # "fail" task below aborts with a readable message.
    - name: Check if UPS is detected via USB
      shell: lsusb | grep -i cyber
      register: lsusb_output
      changed_when: false
      failed_when: false

    - name: Display USB detection result
      debug:
        # The second argument (true) makes default() also replace an empty
        # string. When grep matches nothing, stdout is defined but empty, so
        # a plain default() would never show the fallback text.
        msg: "{{ lsusb_output.stdout | default('UPS not detected via USB - ensure it is plugged in', true) }}"

    - name: Fail if UPS not detected
      fail:
        msg: "CyberPower UPS not detected via USB. Ensure the USB cable is connected."
      when: lsusb_output.rc != 0

    # Re-applies udev rules to already-connected USB devices so the device
    # node permissions can be checked in the next task.
    - name: Reload udev rules for USB permissions
      shell: |
        udevadm control --reload-rules
        udevadm trigger --subsystem-match=usb --action=add
      changed_when: true

    # Extracts the bus and device numbers from the lsusb line and lists the
    # matching node under /dev/bus/usb. Exits 1 if no matching device exists.
    - name: Verify USB device has nut group permissions
      shell: |
        BUS_DEV=$(lsusb | grep -i cyber | grep -oP 'Bus \K\d+|Device \K\d+' | tr '\n' '/' | sed 's/\/$//')
        if [ -n "$BUS_DEV" ]; then
          BUS=$(echo $BUS_DEV | cut -d'/' -f1)
          DEV=$(echo $BUS_DEV | cut -d'/' -f2)
          ls -la /dev/bus/usb/$BUS/$DEV
        else
          echo "UPS device not found"
          exit 1
        fi
      register: usb_permissions
      changed_when: false

    - name: Display USB permissions
      debug:
        msg: "{{ usb_permissions.stdout }} (should show 'root nut', not 'root root')"

    # Informational only: a failing scan does not abort the play.
    - name: Scan for UPS with nut-scanner
      command: nut-scanner -U
      register: nut_scanner_output
      changed_when: false
      failed_when: false

    - name: Display nut-scanner result
      debug:
        msg: "{{ nut_scanner_output.stdout_lines }}"

    # ------------------------------------------------------------------
    # Configuration files
    # ------------------------------------------------------------------
    # Every file is root:nut 0640 and notifies the restart handler on change.
    - name: Configure NUT mode (standalone)
      copy:
        dest: /etc/nut/nut.conf
        content: |
          # Managed by Ansible
          MODE=standalone
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    - name: Configure UPS device
      copy:
        dest: /etc/nut/ups.conf
        content: |
          # Managed by Ansible
          [{{ ups_name }}]
          driver = {{ ups_driver }}
          port = {{ ups_port }}
          desc = "{{ ups_desc }}"
          offdelay = {{ ups_offdelay }}
          ondelay = {{ ups_ondelay }}
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    - name: Configure upsd to listen on localhost
      copy:
        dest: /etc/nut/upsd.conf
        content: |
          # Managed by Ansible
          LISTEN 127.0.0.1 3493
        owner: root
        group: nut
        mode: "0640"
      notify: Restart NUT services

    - name: Configure upsd users
      copy:
        dest: /etc/nut/upsd.users
        content: |
          # Managed by Ansible
          [{{ ups_user }}]
          password = {{ ups_password }}
          upsmon master
        owner: root
        group: nut
        mode: "0640"
      # The rendered content contains the password; keep it out of task
      # output and --diff listings.
      no_log: true
      notify: Restart NUT services

    - name: Configure upsmon
      copy:
        dest: /etc/nut/upsmon.conf
        content: |
          # Managed by Ansible
          MONITOR {{ ups_name }}@localhost 1 {{ ups_user }} {{ ups_password }} master
          MINSUPPLIES 1
          SHUTDOWNCMD "/sbin/shutdown -h +0"
          POLLFREQ 5
          POLLFREQALERT 5
          HOSTSYNC 15
          DEADTIME 15
          POWERDOWNFLAG /etc/killpower
          # Notifications
          NOTIFYMSG ONLINE "UPS %s on line power"
          NOTIFYMSG ONBATT "UPS %s on battery"
          NOTIFYMSG LOWBATT "UPS %s battery is low"
          NOTIFYMSG FSD "UPS %s: forced shutdown in progress"
          NOTIFYMSG COMMOK "Communications with UPS %s established"
          NOTIFYMSG COMMBAD "Communications with UPS %s lost"
          NOTIFYMSG SHUTDOWN "Auto logout and shutdown proceeding"
          NOTIFYMSG REPLBATT "UPS %s battery needs replacing"
          # Log all events to syslog
          NOTIFYFLAG ONLINE SYSLOG
          NOTIFYFLAG ONBATT SYSLOG
          NOTIFYFLAG LOWBATT SYSLOG
          NOTIFYFLAG FSD SYSLOG
          NOTIFYFLAG COMMOK SYSLOG
          NOTIFYFLAG COMMBAD SYSLOG
          NOTIFYFLAG SHUTDOWN SYSLOG
          NOTIFYFLAG REPLBATT SYSLOG
        owner: root
        group: nut
        mode: "0640"
      # The MONITOR line contains the password; keep it out of task output
      # and --diff listings.
      no_log: true
      notify: Restart NUT services

    # ------------------------------------------------------------------
    # Verify late-stage shutdown script
    # ------------------------------------------------------------------
    # Only warns; a missing script does not stop the play.
    - name: Verify nutshutdown script exists
      stat:
        path: /lib/systemd/system-shutdown/nutshutdown
      register: nutshutdown_script

    - name: Warn if nutshutdown script is missing
      debug:
        msg: "WARNING: /lib/systemd/system-shutdown/nutshutdown not found. UPS may not cut power after shutdown."
      when: not nutshutdown_script.stat.exists

    # ------------------------------------------------------------------
    # Services
    # ------------------------------------------------------------------
    - name: Enable and start NUT driver enumerator
      systemd:
        name: nut-driver-enumerator
        enabled: true
        state: started

    - name: Enable and start NUT server
      systemd:
        name: nut-server
        enabled: true
        state: started

    - name: Enable and start NUT monitor
      systemd:
        name: nut-monitor
        enabled: true
        state: started

    # Handlers normally run at the end of the play, i.e. after the
    # verification tasks below. If the services were already running (for
    # example on a re-run with changed configuration), "state: started" does
    # not restart them, so verification would query the old configuration.
    # Running the pending restart here makes verification test what was
    # just written.
    - name: Apply pending NUT configuration changes before verification
      meta: flush_handlers

    # ------------------------------------------------------------------
    # Verification
    # ------------------------------------------------------------------
    - name: Wait for NUT services to stabilize
      pause:
        seconds: 3

    # Hard check: the play fails here if the UPS cannot be queried.
    - name: Verify NUT can communicate with UPS
      command: upsc {{ ups_name }}@localhost
      register: upsc_output
      changed_when: false
      failed_when: upsc_output.rc != 0

    - name: Display UPS status
      debug:
        msg: "{{ upsc_output.stdout_lines }}"

    # Informational summaries; upsc errors are discarded (2>/dev/null), so a
    # missing variable simply prints an empty value.
    - name: Get UPS status summary
      shell: |
        echo "Status: $(upsc {{ ups_name }}@localhost ups.status 2>/dev/null)"
        echo "Battery: $(upsc {{ ups_name }}@localhost battery.charge 2>/dev/null)%"
        echo "Runtime: $(upsc {{ ups_name }}@localhost battery.runtime 2>/dev/null)s"
        echo "Load: $(upsc {{ ups_name }}@localhost ups.load 2>/dev/null)%"
      register: ups_summary
      changed_when: false

    - name: Display UPS summary
      debug:
        msg: "{{ ups_summary.stdout_lines }}"

    - name: Verify low battery thresholds
      shell: |
        echo "Runtime threshold: $(upsc {{ ups_name }}@localhost battery.runtime.low 2>/dev/null)s"
        echo "Charge threshold: $(upsc {{ ups_name }}@localhost battery.charge.low 2>/dev/null)%"
      register: thresholds
      changed_when: false

    - name: Display low battery thresholds
      debug:
        msg: "{{ thresholds.stdout_lines }}"

  handlers:
    # Restarts the three NUT units in order: driver enumerator, server,
    # monitor.
    - name: Restart NUT services
      systemd:
        name: "{{ item }}"
        state: restarted
      loop:
        - nut-driver-enumerator
        - nut-server
        - nut-monitor
# Play: create (or update) a push monitor in Uptime Kuma from the control
# node, then install a script plus systemd service/timer on the target that
# pushes a heartbeat only while the UPS reports being on mains power.
#
# NOTE(review): this play targets "nodito" while the NUT play in this file
# targets "nodito_host" - confirm both inventory names are intended.
- name: Setup UPS Heartbeat Monitoring with Uptime Kuma
  hosts: nodito
  become: true
  vars_files:
    - ../../infra_vars.yml
    - ../../services_config.yml
    - ../../infra_secrets.yml
    - nodito_vars.yml
    - nodito_secrets.yml
  vars:
    # How often the timer runs the heartbeat script.
    ups_heartbeat_interval_seconds: 60
    # Passed to Uptime Kuma as the push monitor's interval (see the
    # "Run Uptime Kuma UPS monitor setup script" task).
    ups_heartbeat_timeout_seconds: 120
    ups_heartbeat_retries: 1
    ups_monitoring_script_dir: /opt/ups-monitoring
    ups_monitoring_script_path: "{{ ups_monitoring_script_dir }}/ups_heartbeat.sh"
    ups_log_file: "{{ ups_monitoring_script_dir }}/ups_heartbeat.log"
    ups_systemd_service_name: ups-heartbeat
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"

  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "ups-{{ host_name.stdout }}"
        monitor_friendly_name: "UPS Status: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    # The helper is written to and executed on the control node
    # (delegate_to: localhost). It finds or creates the monitor group, then
    # creates or updates the push monitor, and prints a JSON object with the
    # monitor id and push token on stdout.
    - name: Create Uptime Kuma UPS monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_ups_monitor.py
        content: |
          #!/usr/bin/env python3
          import sys
          import json
          from uptime_kuma_api import UptimeKumaApi
          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              retries = int(sys.argv[8])
              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
              # Always disconnect, even when a call below raises.
              try:
                  api.login(username, password)
                  monitors = api.get_monitors()
                  notifications = api.get_notifications()
                  ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
                  notification_id_list = {}
                  if ntfy_notification:
                      notification_id_list[ntfy_notification['id']] = True
                  group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
                  if not group:
                      api.add_monitor(type='group', name=group_name)
                      monitors = api.get_monitors()
                      group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
                  existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
                  monitor_data = {
                      'type': 'push',
                      'name': monitor_name,
                      'parent': group['id'],
                      'interval': interval,
                      'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
                      'maxretries': retries,
                      'description': monitor_description,
                      'notificationIDList': notification_id_list
                  }
                  if existing_monitor:
                      api.edit_monitor(existing_monitor['id'], **monitor_data)
                  else:
                      api.add_monitor(**monitor_data)
                  # Re-read the monitor to obtain its id and push token.
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
                  result = {
                      'monitor_id': monitor['id'],
                      'push_token': monitor['pushToken'],
                      'group_name': group_name,
                      'group_id': group['id'],
                      'monitor_name': monitor_name
                  }
                  print(json.dumps(result))
              finally:
                  api.disconnect()
          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: false

    # Positional arguments map to sys.argv[1..9] of the helper above; the
    # seventh one (the monitor interval) is ups_heartbeat_timeout_seconds.
    # NOTE(review): the Uptime Kuma password is passed on the command line,
    # so it is visible in the control node's process list and in verbose
    # Ansible output. Consider passing it through the environment instead.
    - name: Run Uptime Kuma UPS monitor setup script
      command: >
        {{ ansible_playbook_python }}
        /tmp/setup_uptime_kuma_ups_monitor.py
        "{{ uptime_kuma_api_url }}"
        "{{ uptime_kuma_username }}"
        "{{ uptime_kuma_password }}"
        "{{ uptime_kuma_monitor_group }}"
        "{{ monitor_name }}"
        "{{ monitor_friendly_name }} - Alerts when UPS goes on battery or loses communication"
        "{{ ups_heartbeat_timeout_seconds }}"
        "{{ ups_heartbeat_retries }}"
        "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: false
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL as fact
      set_fact:
        uptime_kuma_ups_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"

    - name: Install required packages for UPS monitoring
      package:
        name:
          - curl
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ ups_monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    # Heartbeat script. Exit codes: 0 when the UPS status contains "OL"
    # (a push is attempted), 1 when the UPS cannot be queried or is not on
    # mains (no push is sent, so the monitor times out and alerts).
    #
    # Changes from the previous version of send_heartbeat:
    #   * "%" is now percent-encoded first. It used to be last, which
    #     re-encoded every escape already produced (e.g. "%20" -> "%2520").
    #   * curl has a --max-time, so a stalled request cannot keep the
    #     oneshot service running and block later timer runs.
    - name: Create UPS heartbeat monitoring script
      copy:
        dest: "{{ ups_monitoring_script_path }}"
        content: |
          #!/bin/bash
          # UPS Heartbeat Monitoring Script
          # Sends heartbeat to Uptime Kuma only when UPS is on mains power
          # When on battery or communication lost, no heartbeat is sent (triggers timeout alert)
          LOG_FILE="{{ ups_log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_ups_push_url }}"
          UPS_NAME="{{ ups_name }}"
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }
          send_heartbeat() {
              local message="$1"
              local encoded_message
              # Encode "%" before anything else so later escapes are not re-encoded.
              encoded_message=$(printf '%s\n' "$message" | sed 's/%/%25/g; s/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')
              local response http_code
              response=$(curl -s --max-time 30 -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)
              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Heartbeat sent: $message (HTTP $http_code)"
                  return 0
              else
                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
                  return 1
              fi
          }
          main() {
              local status charge runtime load
              status=$(upsc ${UPS_NAME}@localhost ups.status 2>/dev/null)
              if [ -z "$status" ]; then
                  log_message "ERROR: Cannot communicate with UPS - NOT sending heartbeat"
                  exit 1
              fi
              charge=$(upsc ${UPS_NAME}@localhost battery.charge 2>/dev/null)
              runtime=$(upsc ${UPS_NAME}@localhost battery.runtime 2>/dev/null)
              load=$(upsc ${UPS_NAME}@localhost ups.load 2>/dev/null)
              if [[ "$status" == *"OL"* ]]; then
                  local message="UPS on mains (charge=${charge}% runtime=${runtime}s load=${load}%)"
                  send_heartbeat "$message"
                  exit 0
              else
                  log_message "UPS not on mains power (status=$status) - NOT sending heartbeat"
                  exit 1
              fi
          }
          main
        owner: root
        group: root
        mode: '0755'

    - name: Create systemd service for UPS heartbeat
      copy:
        dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.service"
        content: |
          [Unit]
          Description=UPS Heartbeat Monitor
          After=network.target nut-monitor.service
          [Service]
          Type=oneshot
          ExecStart={{ ups_monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal
          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    # AccuracySec is set explicitly: systemd's default accuracy window is one
    # minute, which is as long as the heartbeat period itself and could
    # stretch the gap between two pushes towards the monitor's timeout.
    - name: Create systemd timer for UPS heartbeat
      copy:
        dest: "/etc/systemd/system/{{ ups_systemd_service_name }}.timer"
        content: |
          [Unit]
          Description=Run UPS Heartbeat Monitor every {{ ups_heartbeat_interval_seconds }} seconds
          Requires={{ ups_systemd_service_name }}.service
          [Timer]
          OnBootSec=1min
          OnUnitActiveSec={{ ups_heartbeat_interval_seconds }}sec
          AccuracySec=1s
          Persistent=true
          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Reload systemd daemon
      systemd:
        daemon_reload: true

    - name: Enable and start UPS heartbeat timer
      systemd:
        name: "{{ ups_systemd_service_name }}.timer"
        enabled: true
        state: started

    # failed_when: false hands the verdict to the assert below. Without it
    # the command task itself fails on a non-zero exit code and the assert's
    # fail_msg is never shown.
    - name: Test UPS heartbeat script
      command: "{{ ups_monitoring_script_path }}"
      register: script_test
      changed_when: false
      failed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "UPS heartbeat script failed - check UPS status and communication"

    # NOTE(review): the push URL printed here embeds the monitor's push token.
    - name: Display monitoring configuration
      debug:
        msg:
          - "UPS Monitoring configured successfully"
          - ""
          - "NUT Configuration:"
          - " UPS Name: {{ ups_name }}"
          - " UPS Description: {{ ups_desc }}"
          - " Off Delay: {{ ups_offdelay }}s (time after shutdown before UPS cuts power)"
          - " On Delay: {{ ups_ondelay }}s (time after mains returns before UPS restores power)"
          - ""
          - "Uptime Kuma Monitoring:"
          - " Monitor Name: {{ monitor_friendly_name }}"
          - " Monitor Group: {{ uptime_kuma_monitor_group }}"
          - " Push URL: {{ uptime_kuma_ups_push_url }}"
          - " Heartbeat Interval: {{ ups_heartbeat_interval_seconds }}s"
          - " Timeout: {{ ups_heartbeat_timeout_seconds }}s"
          - ""
          - "Scripts and Services:"
          - " Script: {{ ups_monitoring_script_path }}"
          - " Log: {{ ups_log_file }}"
          - " Service: {{ ups_systemd_service_name }}.service"
          - " Timer: {{ ups_systemd_service_name }}.timer"

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_ups_monitor.py
        state: absent
      delegate_to: localhost
      become: false

View file

@ -17,3 +17,12 @@ zfs_pool_name: "proxmox-tank-1"
zfs_disk_1: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN0Z"  # First disk for RAID 1 mirror
zfs_disk_2: "/dev/disk/by-id/ata-ST4000NT001-3M2101_WX11TN2P"  # Second disk for RAID 1 mirror
zfs_pool_mountpoint: "/var/lib/vz"
# UPS Configuration (CyberPower CP900EPFCLCD via USB)
# Consumed by the NUT playbook added in the same commit:
#   ups_name     - section name in /etc/nut/ups.conf; also the UPS queried
#                  as "<ups_name>@localhost" by upsc and upsmon
#   ups_desc     - "desc" value in ups.conf
#   ups_driver   - "driver" value in ups.conf
#   ups_port     - "port" value in ups.conf
#   ups_user     - section name in /etc/nut/upsd.users and the user on the
#                  upsmon MONITOR line
#   ups_offdelay - "offdelay" value in ups.conf
#   ups_ondelay  - "ondelay" value in ups.conf
# The matching ups_password is not defined here.
# NOTE(review): presumably it lives in nodito_secrets.yml - confirm.
ups_name: cyberpower
ups_desc: "CyberPower CP900EPFCLCD"
ups_driver: usbhid-ups
ups_port: auto
ups_user: counterweight
# NOTE(review): ups_ondelay (30) is lower than ups_offdelay (120). The
# usbhid-ups driver documentation says ondelay should usually be greater
# than offdelay - confirm these values against the driver man page.
ups_offdelay: 120 # Seconds after shutdown before UPS cuts outlet power
ups_ondelay: 30 # Seconds after mains returns before UPS restores outlet power