personal_infra/ansible/infra/430_cpu_temp_alerts.yml
2025-12-01 11:16:47 +01:00

316 lines
11 KiB
YAML

# Play: installs a CPU temperature check on the target host and registers a
# push monitor for it in Uptime Kuma.
- name: Deploy CPU Temperature Monitoring
  hosts: nodito_host
  become: true

  vars_files:
    - ../infra_vars.yml
    - ../services_config.yml
    - ../infra_secrets.yml

  vars:
    # Alerting parameters.
    temp_threshold_celsius: 80
    temp_check_interval_minutes: 1

    # Locations of the monitoring script and its log on the target host.
    monitoring_script_dir: /opt/nodito-monitoring
    monitoring_script_path: "{{ monitoring_script_dir }}/cpu_temp_monitor.sh"
    log_file: "{{ monitoring_script_dir }}/cpu_temp_monitor.log"

    # Base name shared by the systemd .service and .timer units.
    systemd_service_name: nodito-cpu-temp-monitor

    # Uptime Kuma endpoint and the ntfy topic used to pick a notification.
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"

  tasks:
# Abort early unless the Uptime Kuma URL and credentials are defined and
# non-empty.
- name: Validate Uptime Kuma configuration
  ansible.builtin.assert:
    fail_msg: >-
      uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set
    that:
      - uptime_kuma_api_url is defined
      - uptime_kuma_api_url != ""
      - uptime_kuma_username is defined
      - uptime_kuma_username != ""
      - uptime_kuma_password is defined
      - uptime_kuma_password != ""
# Read-only lookup of the target's hostname; never reported as a change.
- name: Get hostname for monitor identification
  ansible.builtin.command:
    cmd: hostname
  changed_when: false
  register: host_name
# Derive the monitor identifiers from the hostname captured in `host_name`.
- name: Set monitor name and group based on hostname
  ansible.builtin.set_fact:
    # Name of the monitor itself.
    monitor_name: "cpu-temp-{{ host_name.stdout }}"
    # Human-readable label, reused in the monitor description and the summary.
    monitor_friendly_name: "CPU Temperature: {{ host_name.stdout }}"
    # Name of the monitor group the monitor is placed under.
    uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"
# Writes a helper script on the control node that creates or updates the
# push monitor through the uptime_kuma_api Python package. The script prints
# one JSON object on stdout; diagnostics go to stderr so stdout stays parseable.
- name: Create Uptime Kuma CPU temperature monitor setup script
  ansible.builtin.copy:
    dest: /tmp/setup_uptime_kuma_cpu_temp_monitor.py
    mode: '0755'
    content: |
      #!/usr/bin/env python3
      """Create or update the Uptime Kuma push monitor for CPU temperature alerts.

      Arguments (positional):
        API_URL USERNAME PASSWORD GROUP MONITOR_NAME DESCRIPTION INTERVAL [NTFY_TOPIC]

      Output: a JSON object with monitor_id, push_token, group_name, group_id
      and monitor_name on stdout. Errors and warnings are written to stderr.
      """
      import sys
      import json
      from uptime_kuma_api import UptimeKumaApi

      USAGE = (
          "usage: setup_uptime_kuma_cpu_temp_monitor.py API_URL USERNAME PASSWORD "
          "GROUP MONITOR_NAME DESCRIPTION INTERVAL [NTFY_TOPIC]"
      )


      def find_monitor(monitors, name, monitor_type=None):
          """Return the first monitor named `name` (optionally of `monitor_type`), or None."""
          for candidate in monitors:
              if candidate.get('name') != name:
                  continue
              if monitor_type is not None and candidate.get('type') != monitor_type:
                  continue
              return candidate
          return None


      def main():
          # Seven positional arguments are mandatory; the ntfy topic is optional.
          if len(sys.argv) < 8:
              sys.exit(USAGE)

          api_url = sys.argv[1]
          username = sys.argv[2]
          password = sys.argv[3]
          group_name = sys.argv[4]
          monitor_name = sys.argv[5]
          monitor_description = sys.argv[6]
          interval = int(sys.argv[7])
          ntfy_topic = sys.argv[8] if len(sys.argv) > 8 else "alerts"

          api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)
          try:
              api.login(username, password)
              monitors = api.get_monitors()
              notifications = api.get_notifications()

              # Attach the ntfy notification when one with the expected name exists.
              wanted_notification = f'ntfy ({ntfy_topic})'
              ntfy_notification = next(
                  (n for n in notifications if n.get('name') == wanted_notification),
                  None,
              )
              notification_id_list = {}
              if ntfy_notification:
                  notification_id_list[ntfy_notification['id']] = True
              else:
                  print(
                      f"WARNING: notification '{wanted_notification}' not found; "
                      "monitor will have no notification attached",
                      file=sys.stderr,
                  )

              # Make sure the parent group exists, creating it on first run.
              group = find_monitor(monitors, group_name, 'group')
              if group is None:
                  api.add_monitor(type='group', name=group_name)
                  monitors = api.get_monitors()
                  group = find_monitor(monitors, group_name, 'group')
                  if group is None:
                      sys.exit(f"ERROR: monitor group '{group_name}' could not be created")

              existing_monitor = find_monitor(monitors, monitor_name)
              monitor_data = {
                  'type': 'push',
                  'name': monitor_name,
                  'parent': group['id'],
                  'interval': interval,
                  'upsideDown': True,
                  'description': monitor_description,
                  'notificationIDList': notification_id_list
              }
              if existing_monitor:
                  api.edit_monitor(existing_monitor['id'], **monitor_data)
              else:
                  api.add_monitor(**monitor_data)

              # Re-read the monitor list to obtain the id and push token.
              monitors = api.get_monitors()
              monitor = find_monitor(monitors, monitor_name)
              if monitor is None:
                  sys.exit(f"ERROR: monitor '{monitor_name}' not found after create/update")

              result = {
                  'monitor_id': monitor['id'],
                  'push_token': monitor['pushToken'],
                  'group_name': group_name,
                  'group_id': group['id'],
                  'monitor_name': monitor_name
              }
          finally:
              # Always close the connection, including on errors and sys.exit().
              api.disconnect()

          print(json.dumps(result))


      if __name__ == '__main__':
          main()
  delegate_to: localhost
  become: false
# Runs the setup script on the control node and captures its JSON output.
# The arguments are passed as an argv list so each value reaches the script
# verbatim, even when it contains quotes, backslashes or spaces; a free-form
# command string is re-split by the module and breaks on such characters.
- name: Run Uptime Kuma monitor setup script
  ansible.builtin.command:
    argv:
      - "{{ ansible_playbook_python }}"
      - /tmp/setup_uptime_kuma_cpu_temp_monitor.py
      - "{{ uptime_kuma_api_url }}"
      - "{{ uptime_kuma_username }}"
      - "{{ uptime_kuma_password }}"
      - "{{ uptime_kuma_monitor_group }}"
      - "{{ monitor_name }}"
      - "{{ monitor_friendly_name }} - Alerts when temperature exceeds {{ temp_threshold_celsius }}°C"
      # Monitor interval in seconds: the check interval plus 60.
      - "{{ ((temp_check_interval_minutes * 60) + 60) | string }}"
      - "{{ ntfy_topic }}"
  register: monitor_setup_result
  delegate_to: localhost
  become: false
  changed_when: false
  # The password is one of the arguments; keep it out of Ansible's output and
  # logs. Remove temporarily when debugging a failure of this task.
  no_log: true
# Decode the JSON printed by the setup script into a structured fact.
- name: Parse monitor setup result
  ansible.builtin.set_fact:
    monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
# Expose the values taken from the parsed setup result as individual facts.
- name: Set push URL and monitor ID as facts
  ansible.builtin.set_fact:
    # Numeric identifier of the monitor, as reported by the setup script.
    uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
    # Push endpoint: the API base URL plus the monitor's push token.
    uptime_kuma_cpu_temp_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
# Packages installed on the target host for the monitoring script.
- name: Install required packages for temperature monitoring
  ansible.builtin.package:
    state: present
    name:
      - lm-sensors
      - curl
      - jq
      - bc
# Root-owned directory that holds the monitoring script and its log file.
- name: Create monitoring script directory
  ansible.builtin.file:
    state: directory
    path: "{{ monitoring_script_dir }}"
    mode: '0755'
    owner: root
    group: root
# Installs the Bash script that reads the CPU temperature and, when it is
# above the threshold, pushes to the Uptime Kuma push URL. The content is
# templated by Ansible, so the log path, threshold and push URL are baked in.
- name: Create CPU temperature monitoring script
  ansible.builtin.copy:
    dest: "{{ monitoring_script_path }}"
    owner: root
    group: root
    mode: '0755'
    content: |
      #!/bin/bash
      # CPU Temperature Monitoring Script
      # Monitors CPU temperature and sends alerts to Uptime Kuma

      LOG_FILE="{{ log_file }}"
      TEMP_THRESHOLD="{{ temp_threshold_celsius }}"
      UPTIME_KUMA_URL="{{ uptime_kuma_cpu_temp_push_url }}"

      # Append a timestamped message to the log file.
      log_message() {
          echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
      }

      # Print the CPU temperature in degrees Celsius, or nothing when no
      # source yields a reading. Sources are tried in order: lm-sensors,
      # the first sysfs thermal zone, then acpi.
      get_cpu_temp() {
          local temp=""

          if command -v sensors >/dev/null 2>&1; then
              # A sensors line may also list high/crit limits, so keep only the
              # first temperature on the selected line (the current reading).
              temp=$(sensors 2>/dev/null \
                  | grep -E "Core 0|Package id 0|Tdie|Tctl" | head -1 \
                  | grep -oE '[0-9]+\.[0-9]+°C' | head -1 \
                  | grep -oE '[0-9]+\.[0-9]+')
          fi

          if [ -z "$temp" ] && [ -f /sys/class/thermal/thermal_zone0/temp ]; then
              # The script divides the sysfs value by 1000 to obtain degrees.
              local millidegrees
              millidegrees=$(cat /sys/class/thermal/thermal_zone0/temp)
              if [[ "$millidegrees" =~ ^-?[0-9]+$ ]]; then
                  temp=$(echo "scale=1; $millidegrees/1000" | bc -l 2>/dev/null) || temp=""
                  # Without bc, fall back to integer division rather than
                  # reporting the raw value, which would always look too hot.
                  if [ -z "$temp" ]; then
                      temp=$((millidegrees / 1000))
                  fi
              fi
          fi

          if [ -z "$temp" ] && command -v acpi >/dev/null 2>&1; then
              temp=$(acpi -t 2>/dev/null | grep -oE '[0-9]+\.[0-9]+' | head -1)
          fi

          echo "$temp"
      }

      # Log the alert and push it to Uptime Kuma.
      # NOTE(review): the push uses status=up; the setup script in this playbook
      # creates the monitor with upsideDown enabled, presumably so that a
      # received push is reported as a problem -- confirm.
      send_uptime_kuma_alert() {
          local temp="$1"
          local message="CPU Temperature Alert: ${temp}°C (Threshold: ${TEMP_THRESHOLD}°C)"
          log_message "ALERT: $message"

          # Minimal URL-encoding of the characters that occur in the message.
          encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/°/%C2%B0/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g')

          # Bound the request time so a hung connection cannot stall the unit.
          response=$(curl -s --max-time 15 -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
          http_code=$(echo "$response" | tail -n1)

          if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
              log_message "Alert sent successfully to Uptime Kuma (HTTP $http_code)"
          else
              log_message "ERROR: Failed to send alert to Uptime Kuma (HTTP $http_code)"
          fi
      }

      main() {
          log_message "Starting CPU temperature check"

          current_temp=$(get_cpu_temp)
          if [ -z "$current_temp" ]; then
              log_message "ERROR: Could not read CPU temperature"
              exit 1
          fi

          # Refuse anything that is not a single number before handing it to bc.
          if ! [[ "$current_temp" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
              log_message "ERROR: Unparseable CPU temperature reading: $current_temp"
              exit 1
          fi

          log_message "Current CPU temperature: ${current_temp}°C"

          if (( $(echo "$current_temp > $TEMP_THRESHOLD" | bc -l) )); then
              log_message "WARNING: CPU temperature ${current_temp}°C exceeds threshold ${TEMP_THRESHOLD}°C"
              send_uptime_kuma_alert "$current_temp"
          else
              log_message "CPU temperature is within normal range"
          fi
      }

      main
# One-shot service that runs the monitoring script once per activation.
# The script pushes over the network, so the unit is ordered after
# network-online.target; network.target alone does not imply connectivity.
- name: Create systemd service for CPU temperature monitoring
  ansible.builtin.copy:
    dest: "/etc/systemd/system/{{ systemd_service_name }}.service"
    owner: root
    group: root
    mode: '0644'
    content: |
      [Unit]
      Description=CPU Temperature Monitor
      Wants=network-online.target
      After=network-online.target

      [Service]
      Type=oneshot
      ExecStart={{ monitoring_script_path }}
      User=root
      StandardOutput=journal
      StandardError=journal

      [Install]
      WantedBy=multi-user.target
# Timer that triggers the monitoring service on a fixed interval.
# - Unit= names the service explicitly. A Requires= dependency on the service
#   is deliberately not used: it would start the service whenever the timer
#   unit itself is started, outside the schedule.
# - Persistent= is omitted because it only affects OnCalendar= timers.
# - AccuracySec= is lowered from the 1 minute default so that a short check
#   interval is not stretched by timer coalescing.
- name: Create systemd timer for CPU temperature monitoring
  ansible.builtin.copy:
    dest: "/etc/systemd/system/{{ systemd_service_name }}.timer"
    owner: root
    group: root
    mode: '0644'
    content: |
      [Unit]
      Description=Run CPU Temperature Monitor every {{ temp_check_interval_minutes }} minute(s)

      [Timer]
      Unit={{ systemd_service_name }}.service
      OnBootSec={{ temp_check_interval_minutes }}min
      OnUnitActiveSec={{ temp_check_interval_minutes }}min
      AccuracySec=10s

      [Install]
      WantedBy=timers.target
# Ask systemd to re-read its unit files.
- name: Reload systemd daemon
  ansible.builtin.systemd:
    daemon_reload: true
# Enable the timer unit and make sure it is running.
- name: Enable and start CPU temperature monitoring timer
  ansible.builtin.systemd:
    name: "{{ systemd_service_name }}.timer"
    state: started
    enabled: true
# Run the installed script once and keep its result for the next task;
# never reported as a change.
- name: Test CPU temperature monitoring script
  ansible.builtin.command:
    cmd: "{{ monitoring_script_path }}"
  changed_when: false
  register: script_test
# Require a zero exit status from the test run registered as `script_test`.
- name: Verify script execution
  ansible.builtin.assert:
    fail_msg: "CPU temperature monitoring script failed to execute properly"
    that:
      - script_test.rc == 0
# Print a summary of the deployed configuration.
- name: Display monitoring configuration
  ansible.builtin.debug:
    msg:
      - "CPU Temperature Monitoring configured successfully"
      # Alerting parameters.
      - "Temperature threshold: {{ temp_threshold_celsius }}°C"
      - "Check interval: {{ temp_check_interval_minutes }} minute(s)"
      # Uptime Kuma side.
      - "Monitor Name: {{ monitor_friendly_name }}"
      - "Monitor Group: {{ uptime_kuma_monitor_group }}"
      - "Uptime Kuma Push URL: {{ uptime_kuma_cpu_temp_push_url }}"
      # Files and units on the target host.
      - "Monitoring script: {{ monitoring_script_path }}"
      - "Systemd Service: {{ systemd_service_name }}.service"
      - "Systemd Timer: {{ systemd_service_name }}.timer"
# Delete the helper script from the control node.
- name: Clean up temporary Uptime Kuma setup script
  ansible.builtin.file:
    state: absent
    path: /tmp/setup_uptime_kuma_cpu_temp_monitor.py
  become: false
  delegate_to: localhost