personal_infra/ansible/infra/410_disk_usage_alerts.yml

# Play overview: for every host, register an Uptime Kuma push monitor and
# install a systemd timer that periodically runs a disk usage check script.
- name: Deploy Disk Usage Monitoring
  hosts: all
  become: true
  vars_files:
    - ../infra_vars.yml
    - ../services_config.yml
    - ../infra_secrets.yml
    - ../services/uptime_kuma/uptime_kuma_vars.yml
    - ../services/ntfy/ntfy_vars.yml

  vars:
    # What is checked, the alert threshold, and how often the timer fires
    monitored_mount_point: "/"
    disk_usage_threshold_percent: 80
    disk_check_interval_minutes: 15
    # Location of the check script and its log file on the target host
    monitoring_script_dir: /opt/disk-monitoring
    monitoring_script_path: "{{ monitoring_script_dir }}/disk_usage_monitor.sh"
    log_file: "{{ monitoring_script_dir }}/disk_usage_monitor.log"
    # Base name shared by the generated .service and .timer units
    systemd_service_name: disk-usage-monitor
    # Uptime Kuma base URL, assembled from subdomains.uptime_kuma and root_domain
    # (both expected to come from the vars_files listed above)
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"

  tasks:
    # Abort early unless the Uptime Kuma URL and credentials are defined and non-empty.
    - name: Validate Uptime Kuma configuration
      ansible.builtin.assert:
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"
        that:
          # Endpoint
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          # Credentials
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""

    # Read-only lookup: the output of `hostname` is used later to build the
    # monitor and group names, so the task never reports a change.
    - name: Get hostname for monitor identification
      ansible.builtin.command:
        cmd: hostname
      changed_when: false
      register: host_name

    # Derives the Uptime Kuma identifiers for this host/mount point pair.
    #
    # monitor_name suffix:
    #   "/"         -> "root"     (unchanged from the previous naming)
    #   "/var/log"  -> "var-log"  (previously every "/" became "root",
    #                              which produced "rootvarrootlog")
    # Leading/trailing slashes are stripped and inner slashes become dashes.
    - name: Set monitor name and group based on hostname and mount point
      set_fact:
        monitor_name: >-
          disk-usage-{{ host_name.stdout }}-{{
          'root' if monitored_mount_point == '/'
          else (monitored_mount_point | regex_replace('^/+|/+$', '') | replace('/', '-')) }}
        monitor_friendly_name: "Disk Usage: {{ host_name.stdout }} ({{ monitored_mount_point }})"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    # Writes a helper script to the control node (delegate_to: localhost).
    # The script logs in to Uptime Kuma, ensures the group and the upside-down
    # push monitor exist, and prints their details as a single JSON object.
    # NOTE: `content` is templated by Ansible, so the embedded Python must not
    # contain Jinja delimiters (double curly braces, curly-percent, curly-hash).
    - name: Create Uptime Kuma monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_monitor.py
        content: |
          #!/usr/bin/env python3
          """Create or update the Uptime Kuma push monitor for disk usage alerts.

          Usage:
              setup_uptime_kuma_monitor.py API_URL USERNAME PASSWORD GROUP_NAME
                  MONITOR_NAME DESCRIPTION INTERVAL_SECONDS [NTFY_TOPIC]

          On success exactly one JSON object is written to stdout with the keys
          monitor_id, push_token, group_name, group_id and monitor_name.
          Errors are reported on stderr with a non-zero exit status.
          """
          import sys
          import json
          from uptime_kuma_api import UptimeKumaApi


          def find_monitor(monitors, name, monitor_type=None):
              """Return the first monitor called `name` (optionally of `monitor_type`), or None."""
              for candidate in monitors:
                  if candidate.get('name') != name:
                      continue
                  if monitor_type is not None and candidate.get('type') != monitor_type:
                      continue
                  return candidate
              return None


          def main():
              if len(sys.argv) < 8:
                  sys.exit(
                      "usage: setup_uptime_kuma_monitor.py API_URL USERNAME PASSWORD "
                      "GROUP_NAME MONITOR_NAME DESCRIPTION INTERVAL_SECONDS [NTFY_TOPIC]"
                  )

              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              ntfy_topic = sys.argv[8] if len(sys.argv) > 8 else "alerts"

              api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)
              try:
                  api.login(username, password)

                  monitors = api.get_monitors()

                  # Attach the ntfy notification named "ntfy (<topic>)" when it exists;
                  # otherwise the monitor is created without notifications.
                  notifications = api.get_notifications()
                  ntfy_notification = next(
                      (n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'),
                      None,
                  )
                  notification_id_list = {}
                  if ntfy_notification:
                      notification_id_list[ntfy_notification['id']] = True

                  # Find or create the group, then re-read the monitor list to get its id.
                  group = find_monitor(monitors, group_name, 'group')
                  if not group:
                      api.add_monitor(type='group', name=group_name)
                      monitors = api.get_monitors()
                      group = find_monitor(monitors, group_name, 'group')
                  if not group:
                      sys.exit(f"Group '{group_name}' could not be found or created in Uptime Kuma")

                  monitor_data = {
                      'type': 'push',
                      'name': monitor_name,
                      'parent': group['id'],
                      'interval': interval,
                      'upsideDown': True,
                      'description': monitor_description,
                      'notificationIDList': notification_id_list
                  }

                  # Find or create/update the push monitor (matched by name only).
                  existing_monitor = find_monitor(monitors, monitor_name)
                  if existing_monitor:
                      api.edit_monitor(existing_monitor['id'], **monitor_data)
                  else:
                      api.add_monitor(**monitor_data)

                  # Re-read so the returned object carries the id and pushToken.
                  monitor = find_monitor(api.get_monitors(), monitor_name)
                  if not monitor:
                      sys.exit(f"Monitor '{monitor_name}' could not be found after create/update")

                  result = {
                      'monitor_id': monitor['id'],
                      'push_token': monitor['pushToken'],
                      'group_name': group_name,
                      'group_id': group['id'],
                      'monitor_name': monitor_name
                  }
                  print(json.dumps(result))
              finally:
                  # Always close the connection, including on errors and sys.exit().
                  api.disconnect()


          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: no

    # Runs the helper script on the control node and captures its JSON output.
    #
    # - argv passes every value as exactly one argument, so credentials or names
    #   containing quotes or spaces are not re-split by command-line parsing.
    # - no_log keeps the password out of Ansible output and logs. It is still
    #   visible in the controller's process list while the script runs.
    #   Temporarily remove no_log when debugging a failure of this task.
    # - `| int` keeps the interval arithmetic numeric even when the variable is
    #   supplied as a string (e.g. via -e key=value).
    # The interval is the check period in seconds plus 60 seconds of slack.
    - name: Run Uptime Kuma monitor setup script
      command:
        argv:
          - "{{ ansible_playbook_python }}"
          - /tmp/setup_uptime_kuma_monitor.py
          - "{{ uptime_kuma_api_url }}"
          - "{{ uptime_kuma_username }}"
          - "{{ uptime_kuma_password }}"
          - "{{ uptime_kuma_monitor_group }}"
          - "{{ monitor_name }}"
          - "{{ monitor_friendly_name }} - Alerts when usage exceeds {{ disk_usage_threshold_percent }}%"
          - "{{ ((disk_check_interval_minutes | int) * 60) + 60 }}"
          - "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: no
      changed_when: false
      no_log: true

    # The setup script prints a single JSON object on stdout; decode it into a dict.
    - name: Parse monitor setup result
      ansible.builtin.set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    # Expose the values later tasks need: the monitor id and the push endpoint
    # (base URL + /api/push/<push_token>) that the check script will call.
    - name: Set push URL and monitor ID as facts
      ansible.builtin.set_fact:
        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
        uptime_kuma_disk_usage_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"

    # curl is what the monitoring script uses to call the Uptime Kuma push URL.
    - name: Install required packages for disk monitoring
      ansible.builtin.package:
        state: present
        name:
          - curl

    # Root-owned directory that holds the check script and its log file.
    - name: Create monitoring script directory
      ansible.builtin.file:
        state: directory
        path: "{{ monitoring_script_dir }}"
        mode: '0755'
        owner: root
        group: root

    # Installs the check script on the target host.
    # The script reads the usage of the monitored mount point, logs it, and
    # pushes to Uptime Kuma only when usage is above the threshold.
    # NOTE: `content` is templated by Ansible; the shell code must not contain
    # Jinja delimiters (double curly braces, curly-percent, curly-hash).
    - name: Create disk usage monitoring script
      copy:
        dest: "{{ monitoring_script_path }}"
        content: |
          #!/bin/bash

          # Disk Usage Monitoring Script
          # Monitors disk usage and sends alerts to Uptime Kuma
          # Mode: "No news is good news" - only sends alerts when disk usage is HIGH

          LOG_FILE="{{ log_file }}"
          USAGE_THRESHOLD="{{ disk_usage_threshold_percent }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_disk_usage_push_url }}"
          MOUNT_POINT="{{ monitored_mount_point }}"

          # Append a timestamped message to the log file
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Print the usage percentage (digits only) of the given mount point.
          # Returns 1 when df yields nothing or a non-numeric value.
          get_disk_usage() {
              local mount_point="$1"
              local usage=""

              # -P forces the POSIX output format: one line per filesystem, so a
              # long device name cannot wrap and shift the columns on line 2.
              usage=$(df -P "$mount_point" 2>/dev/null | awk 'NR==2 {gsub(/%/, "", $5); print $5}')

              # Reject empty or non-numeric values before they reach the -gt test
              case "$usage" in
                  ''|*[!0-9]*)
                      log_message "ERROR: Could not read disk usage for $mount_point"
                      return 1
                      ;;
              esac

              echo "$usage"
          }

          # Print a human-readable summary (used / total / percent) of the mount point
          get_disk_details() {
              local mount_point="$1"
              df -Ph "$mount_point" 2>/dev/null | awk 'NR==2 {print "Used: "$3" / Total: "$2" ("$5" full)"}'
          }

          # Function to send alert to Uptime Kuma when disk usage exceeds threshold
          # With upside-down mode enabled, sending status=up will trigger an alert
          send_uptime_kuma_alert() {
              local usage="$1"
              local details="$2"
              local message="DISK FULL WARNING: ${MOUNT_POINT} is ${usage}% full (Threshold: ${USAGE_THRESHOLD}%) - ${details}"

              log_message "ALERT: $message"

              # Send push notification to Uptime Kuma with status=up
              # In upside-down mode, status=up is treated as down/alert
              # Timeouts keep a stalled connection from blocking the oneshot
              # service (and with it the next timer run) indefinitely.
              response=$(curl -s -w "\n%{http_code}" -G \
                  --connect-timeout 10 \
                  --max-time 30 \
                  --data-urlencode "status=up" \
                  --data-urlencode "msg=$message" \
                  "$UPTIME_KUMA_URL" 2>&1)
              http_code=$(echo "$response" | tail -n1)

              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Alert sent successfully to Uptime Kuma (HTTP $http_code)"
              else
                  log_message "ERROR: Failed to send alert to Uptime Kuma (HTTP $http_code)"
              fi
          }

          # Main monitoring logic
          main() {
              log_message "Starting disk usage check for $MOUNT_POINT"

              # Get current disk usage
              current_usage=$(get_disk_usage "$MOUNT_POINT")

              if [ $? -ne 0 ] || [ -z "$current_usage" ]; then
                  log_message "ERROR: Could not read disk usage"
                  exit 1
              fi

              # Get disk details
              disk_details=$(get_disk_details "$MOUNT_POINT")

              log_message "Current disk usage: ${current_usage}% - $disk_details"

              # Check if usage exceeds threshold
              if [ "$current_usage" -gt "$USAGE_THRESHOLD" ]; then
                  log_message "WARNING: Disk usage ${current_usage}% exceeds threshold ${USAGE_THRESHOLD}%"
                  send_uptime_kuma_alert "$current_usage" "$disk_details"
              else
                  log_message "Disk usage is within normal range - no alert needed (no news is good news)"
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: '0755'

    # Oneshot unit that runs the check script once per activation.
    # The script calls Uptime Kuma over the network, so the unit waits for
    # network-online.target. network.target alone only orders against network
    # management being started, not against the network being usable.
    - name: Create systemd service for disk usage monitoring
      copy:
        dest: "/etc/systemd/system/{{ systemd_service_name }}.service"
        content: |
          [Unit]
          Description=Disk Usage Monitor
          Wants=network-online.target
          After=network-online.target

          [Service]
          Type=oneshot
          ExecStart={{ monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    # Timer that triggers the service of the same name: first run
    # disk_check_interval_minutes after boot, then the same interval after each
    # activation of the service.
    # Persistent= is intentionally not set: it only affects OnCalendar= timers
    # and is ignored for the monotonic OnBootSec=/OnUnitActiveSec= triggers.
    - name: Create systemd timer for disk usage monitoring
      copy:
        dest: "/etc/systemd/system/{{ systemd_service_name }}.timer"
        content: |
          [Unit]
          Description=Run Disk Usage Monitor every {{ disk_check_interval_minutes }} minute(s)
          Requires={{ systemd_service_name }}.service

          [Timer]
          OnBootSec={{ disk_check_interval_minutes }}min
          OnUnitActiveSec={{ disk_check_interval_minutes }}min

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    # Make systemd re-read unit files so the service and timer written above are known.
    - name: Reload systemd daemon
      ansible.builtin.systemd:
        daemon_reload: true

    # Activate the timer now and enable it for subsequent boots.
    - name: Enable and start disk usage monitoring timer
      ansible.builtin.systemd:
        name: "{{ systemd_service_name }}.timer"
        state: started
        enabled: true

    # Smoke test: run the installed script once on the target and keep the
    # result for the verification task; never reported as a change.
    - name: Test disk usage monitoring script
      ansible.builtin.command:
        cmd: "{{ monitoring_script_path }}"
      changed_when: false
      register: script_test

    # Confirm the smoke test exited with status 0.
    - name: Verify script execution
      ansible.builtin.assert:
        fail_msg: "Disk usage monitoring script failed to execute properly"
        that:
          - script_test.rc == 0

    # Remove the helper script from the control node once it is no longer needed.
    - name: Clean up temporary Uptime Kuma setup script
      ansible.builtin.file:
        state: absent
        path: /tmp/setup_uptime_kuma_monitor.py
      become: false
      delegate_to: localhost
lots of stuff man 2025-11-06 23:09:44 +01:00			`- name: Deploy Disk Usage Monitoring`
			`hosts: all`
			`become: yes`
			`vars_files:`
			`- ../infra_vars.yml`
			`- ../services_config.yml`
			`- ../infra_secrets.yml`
			`- ../services/uptime_kuma/uptime_kuma_vars.yml`
			`- ../services/ntfy/ntfy_vars.yml`

			`vars:`
			`disk_usage_threshold_percent: 80`
			`disk_check_interval_minutes: 15`
			`monitored_mount_point: "/"`
			`monitoring_script_dir: /opt/disk-monitoring`
			`monitoring_script_path: "{{ monitoring_script_dir }}/disk_usage_monitor.sh"`
			`log_file: "{{ monitoring_script_dir }}/disk_usage_monitor.log"`
			`systemd_service_name: disk-usage-monitor`
			`# Uptime Kuma configuration (auto-configured from services_config.yml and infra_secrets.yml)`
			`uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"`

			`tasks:`
			`- name: Validate Uptime Kuma configuration`
			`assert:`
			`that:`
			`- uptime_kuma_api_url is defined`
			`- uptime_kuma_api_url != ""`
			`- uptime_kuma_username is defined`
			`- uptime_kuma_username != ""`
			`- uptime_kuma_password is defined`
			`- uptime_kuma_password != ""`
			`fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"`

			`- name: Get hostname for monitor identification`
			`command: hostname`
			`register: host_name`
			`changed_when: false`

			`- name: Set monitor name and group based on hostname and mount point`
			`set_fact:`
			`monitor_name: "disk-usage-{{ host_name.stdout }}-{{ monitored_mount_point \| replace('/', 'root') }}"`
			`monitor_friendly_name: "Disk Usage: {{ host_name.stdout }} ({{ monitored_mount_point }})"`
			`uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"`

			`- name: Create Uptime Kuma monitor setup script`
			`copy:`
			`dest: /tmp/setup_uptime_kuma_monitor.py`
			`content: \|`
			`#!/usr/bin/env python3`
			`import sys`
			`import json`
			`from uptime_kuma_api import UptimeKumaApi`

			`def main():`
			`api_url = sys.argv[1]`
			`username = sys.argv[2]`
			`password = sys.argv[3]`
			`group_name = sys.argv[4]`
			`monitor_name = sys.argv[5]`
			`monitor_description = sys.argv[6]`
			`interval = int(sys.argv[7])`
			`ntfy_topic = sys.argv[8] if len(sys.argv) > 8 else "alerts"`

			`api = UptimeKumaApi(api_url, timeout=60, wait_events=2.0)`
			`api.login(username, password)`

			`# Get all monitors`
			`monitors = api.get_monitors()`

			`# Get all notifications and find ntfy notification`
			`notifications = api.get_notifications()`
			`ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)`
			`notification_id_list = {}`
			`if ntfy_notification:`
			`notification_id_list[ntfy_notification['id']] = True`

			`# Find or create group`
			`group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)`
			`if not group:`
			`group_result = api.add_monitor(type='group', name=group_name)`
			`# Refresh to get the full group object with id`
			`monitors = api.get_monitors()`
			`group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)`

			`# Find or create/update push monitor`
			`existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)`

			`monitor_data = {`
			`'type': 'push',`
			`'name': monitor_name,`
			`'parent': group['id'],`
			`'interval': interval,`
			`'upsideDown': True,`
			`'description': monitor_description,`
			`'notificationIDList': notification_id_list`
			`}`

			`if existing_monitor:`
			`monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)`
			`# Refresh to get the full monitor object with pushToken`
			`monitors = api.get_monitors()`
			`monitor = next((m for m in monitors if m.get('name') == monitor_name), None)`
			`else:`
			`monitor_result = api.add_monitor(**monitor_data)`
			`# Refresh to get the full monitor object with pushToken`
			`monitors = api.get_monitors()`
			`monitor = next((m for m in monitors if m.get('name') == monitor_name), None)`

			`# Output result as JSON`
			`result = {`
			`'monitor_id': monitor['id'],`
			`'push_token': monitor['pushToken'],`
			`'group_name': group_name,`
			`'group_id': group['id'],`
			`'monitor_name': monitor_name`
			`}`
			`print(json.dumps(result))`

			`api.disconnect()`

			`if __name__ == '__main__':`
			`main()`
			`mode: '0755'`
			`delegate_to: localhost`
			`become: no`

			`- name: Run Uptime Kuma monitor setup script`
			`command: >`
			`{{ ansible_playbook_python }}`
			`/tmp/setup_uptime_kuma_monitor.py`
			`"{{ uptime_kuma_api_url }}"`
			`"{{ uptime_kuma_username }}"`
			`"{{ uptime_kuma_password }}"`
			`"{{ uptime_kuma_monitor_group }}"`
			`"{{ monitor_name }}"`
			`"{{ monitor_friendly_name }} - Alerts when usage exceeds {{ disk_usage_threshold_percent }}%"`
			`"{{ (disk_check_interval_minutes * 60) + 60 }}"`
			`"{{ ntfy_topic }}"`
			`register: monitor_setup_result`
			`delegate_to: localhost`
			`become: no`
			`changed_when: false`

			`- name: Parse monitor setup result`
			`set_fact:`
			`monitor_info_parsed: "{{ monitor_setup_result.stdout \| from_json }}"`

			`- name: Set push URL and monitor ID as facts`
			`set_fact:`
			`uptime_kuma_disk_usage_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"`
			`uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"`

			`- name: Install required packages for disk monitoring`
			`package:`
			`name:`
			`- curl`
			`state: present`

			`- name: Create monitoring script directory`
			`file:`
			`path: "{{ monitoring_script_dir }}"`
			`state: directory`
			`owner: root`
			`group: root`
			`mode: '0755'`

			`- name: Create disk usage monitoring script`
			`copy:`
			`dest: "{{ monitoring_script_path }}"`
			`content: \|`
			`#!/bin/bash`

			`# Disk Usage Monitoring Script`
			`# Monitors disk usage and sends alerts to Uptime Kuma`
			`# Mode: "No news is good news" - only sends alerts when disk usage is HIGH`

			`LOG_FILE="{{ log_file }}"`
			`USAGE_THRESHOLD="{{ disk_usage_threshold_percent }}"`
			`UPTIME_KUMA_URL="{{ uptime_kuma_disk_usage_push_url }}"`
			`MOUNT_POINT="{{ monitored_mount_point }}"`

			`# Function to log messages`
			`log_message() {`
			`echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"`
			`}`

			`# Function to get disk usage percentage`
			`get_disk_usage() {`
			`local mount_point="$1"`
			`local usage=""`

			`# Get disk usage percentage (without % sign)`
			`usage=$(df -h "$mount_point" 2>/dev/null \| awk 'NR==2 {gsub(/%/, "", $5); print $5}')`

			`if [ -z "$usage" ]; then`
			`log_message "ERROR: Could not read disk usage for $mount_point"`
			`return 1`
			`fi`

			`echo "$usage"`
			`}`

			`# Function to get disk usage details`
			`get_disk_details() {`
			`local mount_point="$1"`
			`df -h "$mount_point" 2>/dev/null \| awk 'NR==2 {print "Used: "$3" / Total: "$2" ("$5" full)"}'`
			`}`

			`# Function to send alert to Uptime Kuma when disk usage exceeds threshold`
			`# With upside-down mode enabled, sending status=up will trigger an alert`
			`send_uptime_kuma_alert() {`
			`local usage="$1"`
			`local details="$2"`
			`local message="DISK FULL WARNING: ${MOUNT_POINT} is ${usage}% full (Threshold: ${USAGE_THRESHOLD}%) - ${details}"`

			`log_message "ALERT: $message"`

			`# Send push notification to Uptime Kuma with status=up`
			`# In upside-down mode, status=up is treated as down/alert`
			`response=$(curl -s -w "\n%{http_code}" -G \`
			`--data-urlencode "status=up" \`
			`--data-urlencode "msg=$message" \`
			`"$UPTIME_KUMA_URL" 2>&1)`
			`http_code=$(echo "$response" \| tail -n1)`

			`if [ "$http_code" = "200" ] \|\| [ "$http_code" = "201" ]; then`
			`log_message "Alert sent successfully to Uptime Kuma (HTTP $http_code)"`
			`else`
			`log_message "ERROR: Failed to send alert to Uptime Kuma (HTTP $http_code)"`
			`fi`
			`}`

			`# Main monitoring logic`
			`main() {`
			`log_message "Starting disk usage check for $MOUNT_POINT"`

			`# Get current disk usage`
			`current_usage=$(get_disk_usage "$MOUNT_POINT")`

			`if [ $? -ne 0 ] \|\| [ -z "$current_usage" ]; then`
			`log_message "ERROR: Could not read disk usage"`
			`exit 1`
			`fi`

			`# Get disk details`
			`disk_details=$(get_disk_details "$MOUNT_POINT")`

			`log_message "Current disk usage: ${current_usage}% - $disk_details"`

			`# Check if usage exceeds threshold`
			`if [ "$current_usage" -gt "$USAGE_THRESHOLD" ]; then`
			`log_message "WARNING: Disk usage ${current_usage}% exceeds threshold ${USAGE_THRESHOLD}%"`
			`send_uptime_kuma_alert "$current_usage" "$disk_details"`
			`else`
			`log_message "Disk usage is within normal range - no alert needed (no news is good news)"`
			`fi`
			`}`

			`# Run main function`
			`main`
			`owner: root`
			`group: root`
			`mode: '0755'`

			`- name: Create systemd service for disk usage monitoring`
			`copy:`
			`dest: "/etc/systemd/system/{{ systemd_service_name }}.service"`
			`content: \|`
			`[Unit]`
			`Description=Disk Usage Monitor`
			`After=network.target`

			`[Service]`
			`Type=oneshot`
			`ExecStart={{ monitoring_script_path }}`
			`User=root`
			`StandardOutput=journal`
			`StandardError=journal`

			`[Install]`
			`WantedBy=multi-user.target`
			`owner: root`
			`group: root`
			`mode: '0644'`

			`- name: Create systemd timer for disk usage monitoring`
			`copy:`
			`dest: "/etc/systemd/system/{{ systemd_service_name }}.timer"`
			`content: \|`
			`[Unit]`
			`Description=Run Disk Usage Monitor every {{ disk_check_interval_minutes }} minute(s)`
			`Requires={{ systemd_service_name }}.service`

			`[Timer]`
			`OnBootSec={{ disk_check_interval_minutes }}min`
			`OnUnitActiveSec={{ disk_check_interval_minutes }}min`
			`Persistent=true`

			`[Install]`
			`WantedBy=timers.target`
			`owner: root`
			`group: root`
			`mode: '0644'`

			`- name: Reload systemd daemon`
			`systemd:`
			`daemon_reload: yes`

			`- name: Enable and start disk usage monitoring timer`
			`systemd:`
			`name: "{{ systemd_service_name }}.timer"`
			`enabled: yes`
			`state: started`

			`- name: Test disk usage monitoring script`
			`command: "{{ monitoring_script_path }}"`
			`register: script_test`
			`changed_when: false`

			`- name: Verify script execution`
			`assert:`
			`that:`
			`- script_test.rc == 0`
			`fail_msg: "Disk usage monitoring script failed to execute properly"`

			`- name: Clean up temporary Uptime Kuma setup script`
			`file:`
			`path: /tmp/setup_uptime_kuma_monitor.py`
			`state: absent`
			`delegate_to: localhost`
			`become: no`