lots of stuff man
This commit is contained in:
parent
3b88e6c5e8
commit
c8754e1bdc
43 changed files with 7310 additions and 121 deletions
331
ansible/infra/410_disk_usage_alerts.yml
Normal file
331
ansible/infra/410_disk_usage_alerts.yml
Normal file
|
|
@ -0,0 +1,331 @@
|
|||
- name: Deploy Disk Usage Monitoring
|
||||
hosts: all
|
||||
become: yes
|
||||
vars_files:
|
||||
- ../infra_vars.yml
|
||||
- ../services_config.yml
|
||||
- ../infra_secrets.yml
|
||||
- ../services/uptime_kuma/uptime_kuma_vars.yml
|
||||
- ../services/ntfy/ntfy_vars.yml
|
||||
|
||||
vars:
|
||||
disk_usage_threshold_percent: 80
|
||||
disk_check_interval_minutes: 15
|
||||
monitored_mount_point: "/"
|
||||
monitoring_script_dir: /opt/disk-monitoring
|
||||
monitoring_script_path: "{{ monitoring_script_dir }}/disk_usage_monitor.sh"
|
||||
log_file: "{{ monitoring_script_dir }}/disk_usage_monitor.log"
|
||||
systemd_service_name: disk-usage-monitor
|
||||
# Uptime Kuma configuration (auto-configured from services_config.yml and infra_secrets.yml)
|
||||
uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
|
||||
|
||||
tasks:
|
||||
- name: Validate Uptime Kuma configuration
|
||||
assert:
|
||||
that:
|
||||
- uptime_kuma_api_url is defined
|
||||
- uptime_kuma_api_url != ""
|
||||
- uptime_kuma_username is defined
|
||||
- uptime_kuma_username != ""
|
||||
- uptime_kuma_password is defined
|
||||
- uptime_kuma_password != ""
|
||||
fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"
|
||||
|
||||
- name: Get hostname for monitor identification
|
||||
command: hostname
|
||||
register: host_name
|
||||
changed_when: false
|
||||
|
||||
- name: Set monitor name and group based on hostname and mount point
|
||||
set_fact:
|
||||
monitor_name: "disk-usage-{{ host_name.stdout }}-{{ monitored_mount_point | replace('/', 'root') }}"
|
||||
monitor_friendly_name: "Disk Usage: {{ host_name.stdout }} ({{ monitored_mount_point }})"
|
||||
uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"
|
||||
|
||||
- name: Create Uptime Kuma monitor setup script
|
||||
copy:
|
||||
dest: /tmp/setup_uptime_kuma_monitor.py
|
||||
content: |
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
from uptime_kuma_api import UptimeKumaApi
|
||||
|
||||
def main():
|
||||
api_url = sys.argv[1]
|
||||
username = sys.argv[2]
|
||||
password = sys.argv[3]
|
||||
group_name = sys.argv[4]
|
||||
monitor_name = sys.argv[5]
|
||||
monitor_description = sys.argv[6]
|
||||
interval = int(sys.argv[7])
|
||||
ntfy_topic = sys.argv[8] if len(sys.argv) > 8 else "alerts"
|
||||
|
||||
api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)
|
||||
api.login(username, password)
|
||||
|
||||
# Get all monitors
|
||||
monitors = api.get_monitors()
|
||||
|
||||
# Get all notifications and find ntfy notification
|
||||
notifications = api.get_notifications()
|
||||
ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
|
||||
notification_id_list = {}
|
||||
if ntfy_notification:
|
||||
notification_id_list[ntfy_notification['id']] = True
|
||||
|
||||
# Find or create group
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
if not group:
|
||||
group_result = api.add_monitor(type='group', name=group_name)
|
||||
# Refresh to get the full group object with id
|
||||
monitors = api.get_monitors()
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
|
||||
# Find or create/update push monitor
|
||||
existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
monitor_data = {
|
||||
'type': 'push',
|
||||
'name': monitor_name,
|
||||
'parent': group['id'],
|
||||
'interval': interval,
|
||||
'upsideDown': True,
|
||||
'description': monitor_description,
|
||||
'notificationIDList': notification_id_list
|
||||
}
|
||||
|
||||
if existing_monitor:
|
||||
monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
else:
|
||||
monitor_result = api.add_monitor(**monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
# Output result as JSON
|
||||
result = {
|
||||
'monitor_id': monitor['id'],
|
||||
'push_token': monitor['pushToken'],
|
||||
'group_name': group_name,
|
||||
'group_id': group['id'],
|
||||
'monitor_name': monitor_name
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
api.disconnect()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
mode: '0755'
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
||||
- name: Run Uptime Kuma monitor setup script
|
||||
command: >
|
||||
{{ ansible_playbook_python }}
|
||||
/tmp/setup_uptime_kuma_monitor.py
|
||||
"{{ uptime_kuma_api_url }}"
|
||||
"{{ uptime_kuma_username }}"
|
||||
"{{ uptime_kuma_password }}"
|
||||
"{{ uptime_kuma_monitor_group }}"
|
||||
"{{ monitor_name }}"
|
||||
"{{ monitor_friendly_name }} - Alerts when usage exceeds {{ disk_usage_threshold_percent }}%"
|
||||
"{{ (disk_check_interval_minutes * 60) + 60 }}"
|
||||
"{{ ntfy_topic }}"
|
||||
register: monitor_setup_result
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
changed_when: false
|
||||
|
||||
- name: Parse monitor setup result
|
||||
set_fact:
|
||||
monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
|
||||
|
||||
- name: Set push URL and monitor ID as facts
|
||||
set_fact:
|
||||
uptime_kuma_disk_usage_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
|
||||
uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
|
||||
|
||||
- name: Install required packages for disk monitoring
|
||||
package:
|
||||
name:
|
||||
- curl
|
||||
state: present
|
||||
|
||||
- name: Create monitoring script directory
|
||||
file:
|
||||
path: "{{ monitoring_script_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create disk usage monitoring script
|
||||
copy:
|
||||
dest: "{{ monitoring_script_path }}"
|
||||
content: |
|
||||
#!/bin/bash
|
||||
|
||||
# Disk Usage Monitoring Script
|
||||
# Monitors disk usage and sends alerts to Uptime Kuma
|
||||
# Mode: "No news is good news" - only sends alerts when disk usage is HIGH
|
||||
|
||||
LOG_FILE="{{ log_file }}"
|
||||
USAGE_THRESHOLD="{{ disk_usage_threshold_percent }}"
|
||||
UPTIME_KUMA_URL="{{ uptime_kuma_disk_usage_push_url }}"
|
||||
MOUNT_POINT="{{ monitored_mount_point }}"
|
||||
|
||||
# Function to log messages
|
||||
log_message() {
|
||||
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
|
||||
}
|
||||
|
||||
# Function to get disk usage percentage
|
||||
get_disk_usage() {
|
||||
local mount_point="$1"
|
||||
local usage=""
|
||||
|
||||
# Get disk usage percentage (without % sign)
|
||||
usage=$(df -h "$mount_point" 2>/dev/null | awk 'NR==2 {gsub(/%/, "", $5); print $5}')
|
||||
|
||||
if [ -z "$usage" ]; then
|
||||
log_message "ERROR: Could not read disk usage for $mount_point"
|
||||
return 1
|
||||
fi
|
||||
|
||||
echo "$usage"
|
||||
}
|
||||
|
||||
# Function to get disk usage details
|
||||
get_disk_details() {
|
||||
local mount_point="$1"
|
||||
df -h "$mount_point" 2>/dev/null | awk 'NR==2 {print "Used: "$3" / Total: "$2" ("$5" full)"}'
|
||||
}
|
||||
|
||||
# Function to send alert to Uptime Kuma when disk usage exceeds threshold
|
||||
# With upside-down mode enabled, sending status=up will trigger an alert
|
||||
send_uptime_kuma_alert() {
|
||||
local usage="$1"
|
||||
local details="$2"
|
||||
local message="DISK FULL WARNING: ${MOUNT_POINT} is ${usage}% full (Threshold: ${USAGE_THRESHOLD}%) - ${details}"
|
||||
|
||||
log_message "ALERT: $message"
|
||||
|
||||
# Send push notification to Uptime Kuma with status=up
|
||||
# In upside-down mode, status=up is treated as down/alert
|
||||
response=$(curl -s -w "\n%{http_code}" -G \
|
||||
--data-urlencode "status=up" \
|
||||
--data-urlencode "msg=$message" \
|
||||
"$UPTIME_KUMA_URL" 2>&1)
|
||||
http_code=$(echo "$response" | tail -n1)
|
||||
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
|
||||
log_message "Alert sent successfully to Uptime Kuma (HTTP $http_code)"
|
||||
else
|
||||
log_message "ERROR: Failed to send alert to Uptime Kuma (HTTP $http_code)"
|
||||
fi
|
||||
}
|
||||
|
||||
# Main monitoring logic
|
||||
main() {
|
||||
log_message "Starting disk usage check for $MOUNT_POINT"
|
||||
|
||||
# Get current disk usage
|
||||
current_usage=$(get_disk_usage "$MOUNT_POINT")
|
||||
|
||||
if [ $? -ne 0 ] || [ -z "$current_usage" ]; then
|
||||
log_message "ERROR: Could not read disk usage"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get disk details
|
||||
disk_details=$(get_disk_details "$MOUNT_POINT")
|
||||
|
||||
log_message "Current disk usage: ${current_usage}% - $disk_details"
|
||||
|
||||
# Check if usage exceeds threshold
|
||||
if [ "$current_usage" -gt "$USAGE_THRESHOLD" ]; then
|
||||
log_message "WARNING: Disk usage ${current_usage}% exceeds threshold ${USAGE_THRESHOLD}%"
|
||||
send_uptime_kuma_alert "$current_usage" "$disk_details"
|
||||
else
|
||||
log_message "Disk usage is within normal range - no alert needed (no news is good news)"
|
||||
fi
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create systemd service for disk usage monitoring
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ systemd_service_name }}.service"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Disk Usage Monitor
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart={{ monitoring_script_path }}
|
||||
User=root
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Create systemd timer for disk usage monitoring
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ systemd_service_name }}.timer"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run Disk Usage Monitor every {{ disk_check_interval_minutes }} minute(s)
|
||||
Requires={{ systemd_service_name }}.service
|
||||
|
||||
[Timer]
|
||||
OnBootSec={{ disk_check_interval_minutes }}min
|
||||
OnUnitActiveSec={{ disk_check_interval_minutes }}min
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Reload systemd daemon
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable and start disk usage monitoring timer
|
||||
systemd:
|
||||
name: "{{ systemd_service_name }}.timer"
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: Test disk usage monitoring script
|
||||
command: "{{ monitoring_script_path }}"
|
||||
register: script_test
|
||||
changed_when: false
|
||||
|
||||
- name: Verify script execution
|
||||
assert:
|
||||
that:
|
||||
- script_test.rc == 0
|
||||
fail_msg: "Disk usage monitoring script failed to execute properly"
|
||||
|
||||
- name: Clean up temporary Uptime Kuma setup script
|
||||
file:
|
||||
path: /tmp/setup_uptime_kuma_monitor.py
|
||||
state: absent
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
313
ansible/infra/420_system_healthcheck.yml
Normal file
313
ansible/infra/420_system_healthcheck.yml
Normal file
|
|
@ -0,0 +1,313 @@
|
|||
- name: Deploy System Healthcheck Monitoring
|
||||
hosts: all
|
||||
become: yes
|
||||
vars_files:
|
||||
- ../infra_vars.yml
|
||||
- ../services_config.yml
|
||||
- ../infra_secrets.yml
|
||||
- ../services/uptime_kuma/uptime_kuma_vars.yml
|
||||
- ../services/ntfy/ntfy_vars.yml
|
||||
|
||||
vars:
|
||||
healthcheck_interval_seconds: 60 # Send healthcheck every 60 seconds (1 minute)
|
||||
healthcheck_timeout_seconds: 90 # Uptime Kuma should alert if no ping received within 90s
|
||||
healthcheck_retries: 1 # Number of retries before alerting
|
||||
monitoring_script_dir: /opt/system-healthcheck
|
||||
monitoring_script_path: "{{ monitoring_script_dir }}/system_healthcheck.sh"
|
||||
log_file: "{{ monitoring_script_dir }}/system_healthcheck.log"
|
||||
systemd_service_name: system-healthcheck
|
||||
# Uptime Kuma configuration (auto-configured from services_config.yml and infra_secrets.yml)
|
||||
uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
|
||||
|
||||
tasks:
|
||||
- name: Validate Uptime Kuma configuration
|
||||
assert:
|
||||
that:
|
||||
- uptime_kuma_api_url is defined
|
||||
- uptime_kuma_api_url != ""
|
||||
- uptime_kuma_username is defined
|
||||
- uptime_kuma_username != ""
|
||||
- uptime_kuma_password is defined
|
||||
- uptime_kuma_password != ""
|
||||
fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"
|
||||
|
||||
- name: Get hostname for monitor identification
|
||||
command: hostname
|
||||
register: host_name
|
||||
changed_when: false
|
||||
|
||||
- name: Set monitor name and group based on hostname
|
||||
set_fact:
|
||||
monitor_name: "system-healthcheck-{{ host_name.stdout }}"
|
||||
monitor_friendly_name: "System Healthcheck: {{ host_name.stdout }}"
|
||||
uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"
|
||||
|
||||
- name: Create Uptime Kuma monitor setup script
|
||||
copy:
|
||||
dest: /tmp/setup_uptime_kuma_healthcheck_monitor.py
|
||||
content: |
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
from uptime_kuma_api import UptimeKumaApi
|
||||
|
||||
def main():
|
||||
api_url = sys.argv[1]
|
||||
username = sys.argv[2]
|
||||
password = sys.argv[3]
|
||||
group_name = sys.argv[4]
|
||||
monitor_name = sys.argv[5]
|
||||
monitor_description = sys.argv[6]
|
||||
interval = int(sys.argv[7])
|
||||
retries = int(sys.argv[8])
|
||||
ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"
|
||||
|
||||
api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)
|
||||
api.login(username, password)
|
||||
|
||||
# Get all monitors
|
||||
monitors = api.get_monitors()
|
||||
|
||||
# Get all notifications and find ntfy notification
|
||||
notifications = api.get_notifications()
|
||||
ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
|
||||
notification_id_list = {}
|
||||
if ntfy_notification:
|
||||
notification_id_list[ntfy_notification['id']] = True
|
||||
|
||||
# Find or create group
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
if not group:
|
||||
group_result = api.add_monitor(type='group', name=group_name)
|
||||
# Refresh to get the full group object with id
|
||||
monitors = api.get_monitors()
|
||||
group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
|
||||
|
||||
# Find or create/update push monitor
|
||||
existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
monitor_data = {
|
||||
'type': 'push',
|
||||
'name': monitor_name,
|
||||
'parent': group['id'],
|
||||
'interval': interval,
|
||||
'upsideDown': False, # Normal mode: receiving pings = healthy
|
||||
'maxretries': retries,
|
||||
'description': monitor_description,
|
||||
'notificationIDList': notification_id_list
|
||||
}
|
||||
|
||||
if existing_monitor:
|
||||
monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
else:
|
||||
monitor_result = api.add_monitor(**monitor_data)
|
||||
# Refresh to get the full monitor object with pushToken
|
||||
monitors = api.get_monitors()
|
||||
monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
|
||||
|
||||
# Output result as JSON
|
||||
result = {
|
||||
'monitor_id': monitor['id'],
|
||||
'push_token': monitor['pushToken'],
|
||||
'group_name': group_name,
|
||||
'group_id': group['id'],
|
||||
'monitor_name': monitor_name
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
api.disconnect()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
mode: '0755'
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
||||
- name: Run Uptime Kuma monitor setup script
|
||||
command: >
|
||||
{{ ansible_playbook_python }}
|
||||
/tmp/setup_uptime_kuma_healthcheck_monitor.py
|
||||
"{{ uptime_kuma_api_url }}"
|
||||
"{{ uptime_kuma_username }}"
|
||||
"{{ uptime_kuma_password }}"
|
||||
"{{ uptime_kuma_monitor_group }}"
|
||||
"{{ monitor_name }}"
|
||||
"{{ monitor_friendly_name }} - Regular healthcheck ping every {{ healthcheck_interval_seconds }}s"
|
||||
"{{ healthcheck_timeout_seconds }}"
|
||||
"{{ healthcheck_retries }}"
|
||||
"{{ ntfy_topic }}"
|
||||
register: monitor_setup_result
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
changed_when: false
|
||||
|
||||
- name: Parse monitor setup result
|
||||
set_fact:
|
||||
monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"
|
||||
|
||||
- name: Set push URL and monitor ID as facts
|
||||
set_fact:
|
||||
uptime_kuma_healthcheck_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
|
||||
uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
|
||||
|
||||
- name: Install required packages for healthcheck monitoring
|
||||
package:
|
||||
name:
|
||||
- curl
|
||||
state: present
|
||||
|
||||
- name: Create monitoring script directory
|
||||
file:
|
||||
path: "{{ monitoring_script_dir }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create system healthcheck script
|
||||
copy:
|
||||
dest: "{{ monitoring_script_path }}"
|
||||
content: |
|
||||
#!/bin/bash
|
||||
|
||||
# System Healthcheck Script
|
||||
# Sends regular heartbeat pings to Uptime Kuma
|
||||
# This ensures the system is running and able to communicate
|
||||
|
||||
LOG_FILE="{{ log_file }}"
|
||||
UPTIME_KUMA_URL="{{ uptime_kuma_healthcheck_push_url }}"
|
||||
HOSTNAME=$(hostname)
|
||||
|
||||
# Function to log messages
|
||||
log_message() {
|
||||
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
|
||||
}
|
||||
|
||||
# Function to send healthcheck ping to Uptime Kuma
|
||||
send_healthcheck() {
|
||||
local uptime_seconds=$(awk '{print int($1)}' /proc/uptime)
|
||||
local uptime_days=$((uptime_seconds / 86400))
|
||||
local uptime_hours=$(((uptime_seconds % 86400) / 3600))
|
||||
local uptime_minutes=$(((uptime_seconds % 3600) / 60))
|
||||
|
||||
local message="System healthy - Uptime: ${uptime_days}d ${uptime_hours}h ${uptime_minutes}m"
|
||||
|
||||
log_message "Sending healthcheck ping: $message"
|
||||
|
||||
# Send push notification to Uptime Kuma with status=up
|
||||
encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')
|
||||
response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
|
||||
http_code=$(echo "$response" | tail -n1)
|
||||
|
||||
if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
|
||||
log_message "Healthcheck ping sent successfully (HTTP $http_code)"
|
||||
else
|
||||
log_message "ERROR: Failed to send healthcheck ping (HTTP $http_code)"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Main healthcheck logic
|
||||
main() {
|
||||
log_message "Starting system healthcheck for $HOSTNAME"
|
||||
|
||||
# Send healthcheck ping
|
||||
if send_healthcheck; then
|
||||
log_message "Healthcheck completed successfully"
|
||||
else
|
||||
log_message "ERROR: Healthcheck failed"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Run main function
|
||||
main
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
|
||||
- name: Create systemd service for system healthcheck
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ systemd_service_name }}.service"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=System Healthcheck Monitor
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart={{ monitoring_script_path }}
|
||||
User=root
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Create systemd timer for system healthcheck
|
||||
copy:
|
||||
dest: "/etc/systemd/system/{{ systemd_service_name }}.timer"
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run System Healthcheck every minute
|
||||
Requires={{ systemd_service_name }}.service
|
||||
|
||||
[Timer]
|
||||
OnBootSec=30sec
|
||||
OnUnitActiveSec={{ healthcheck_interval_seconds }}sec
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0644'
|
||||
|
||||
- name: Reload systemd daemon
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable and start system healthcheck timer
|
||||
systemd:
|
||||
name: "{{ systemd_service_name }}.timer"
|
||||
enabled: yes
|
||||
state: started
|
||||
|
||||
- name: Test system healthcheck script
|
||||
command: "{{ monitoring_script_path }}"
|
||||
register: script_test
|
||||
changed_when: false
|
||||
|
||||
- name: Verify script execution
|
||||
assert:
|
||||
that:
|
||||
- script_test.rc == 0
|
||||
fail_msg: "System healthcheck script failed to execute properly"
|
||||
|
||||
- name: Display monitor information
|
||||
debug:
|
||||
msg: |
|
||||
✓ System healthcheck monitoring deployed successfully!
|
||||
|
||||
Monitor Name: {{ monitor_friendly_name }}
|
||||
Monitor Group: {{ uptime_kuma_monitor_group }}
|
||||
Healthcheck Interval: Every {{ healthcheck_interval_seconds }} seconds (1 minute)
|
||||
Timeout: {{ healthcheck_timeout_seconds }} seconds (90s)
|
||||
Retries: {{ healthcheck_retries }}
|
||||
|
||||
The system will send a heartbeat ping every minute.
|
||||
Uptime Kuma will alert if no ping is received within 90 seconds (with 1 retry).
|
||||
|
||||
- name: Clean up temporary Uptime Kuma setup script
|
||||
file:
|
||||
path: /tmp/setup_uptime_kuma_healthcheck_monitor.py
|
||||
state: absent
|
||||
delegate_to: localhost
|
||||
become: no
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue