- name: Setup ZFS RAID 1 Pool for Proxmox Storage
  hosts: nodito_host
  become: true
  vars_files:
    - ../infra_vars.yml
    - nodito_vars.yml

  tasks:
    - name: Verify Proxmox VE is running
      command: pveversion
      register: pve_version_check
      changed_when: false
      failed_when: pve_version_check.rc != 0

    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600

    - name: Install ZFS utilities
      package:
        name:
          - zfsutils-linux
          - zfs-initramfs
        state: present

    - name: Load ZFS kernel module
      modprobe:
        name: zfs

    - name: Ensure ZFS module loads at boot
      lineinfile:
        path: /etc/modules
        line: zfs
        state: present

    - name: Check if ZFS pool already exists
      command: zpool list {{ zfs_pool_name }}
      register: zfs_pool_exists
      failed_when: false
      changed_when: false

    - name: Check if disks are in use
      shell: |
        for disk in {{ zfs_disk_1 }} {{ zfs_disk_2 }}; do
          if mount | grep -q "^$disk"; then
            echo "ERROR: $disk is mounted"
            exit 1
          fi
          if lsblk -n -o MOUNTPOINT "$disk" | grep -v "^$" | grep -q .; then
            echo "ERROR: $disk has mounted partitions"
            exit 1
          fi
        done
      register: disk_usage_check
      failed_when: disk_usage_check.rc != 0
      changed_when: false

    - name: Create ZFS RAID 1 pool with optimized settings
      # Pool (-o) and filesystem (-O) options must precede the pool name
      command: >
        zpool create
        -o ashift=12
        -O mountpoint=none
        {{ zfs_pool_name }}
        mirror {{ zfs_disk_1 }} {{ zfs_disk_2 }}
      when: zfs_pool_exists.rc != 0
      register: zfs_pool_create_result

    - name: Check if ZFS dataset already exists
      command: zfs list {{ zfs_pool_name }}/vm-storage
      register: zfs_dataset_exists
      failed_when: false
      changed_when: false

    - name: Create ZFS dataset for Proxmox storage
      command: zfs create {{ zfs_pool_name }}/vm-storage
      when: zfs_dataset_exists.rc != 0
      register: zfs_dataset_create_result

    - name: Set ZFS dataset properties for Proxmox
      command: zfs set {{ item.property }}={{ item.value }} {{ zfs_pool_name }}/vm-storage
      loop:
        - { property: "mountpoint", value: "{{ zfs_pool_mountpoint }}" }
        - { property: "compression", value: "lz4" }
        - { property: "atime", value: "off" }
        - { property: "xattr", value: "sa" }
        - { property: "acltype", value: "posixacl" }
        - { property: "dnodesize", value: "auto" }
      when: zfs_dataset_exists.rc != 0

    - name: Set ZFS pool properties for Proxmox
      command: zpool set autotrim=off {{ zfs_pool_name }}
      when: zfs_pool_exists.rc != 0

    - name: Set ZFS pool mountpoint for Proxmox
      command: zfs set mountpoint={{ zfs_pool_mountpoint }} {{ zfs_pool_name }}
      when: zfs_pool_exists.rc == 0

    - name: Export and re-import ZFS pool for Proxmox compatibility
      shell: |
        zpool export {{ zfs_pool_name }}
        zpool import {{ zfs_pool_name }}
      when: zfs_pool_exists.rc != 0
      register: zfs_pool_import_result

    - name: Ensure ZFS services are enabled
      systemd:
        name: "{{ item }}"
        enabled: yes
        state: started
      loop:
        - zfs-import-cache
        - zfs-import-scan
        - zfs-mount
        - zfs-share
        - zfs-zed

    - name: Check if ZFS pool storage already exists in Proxmox config
      stat:
        path: /etc/pve/storage.cfg
      register: storage_cfg_file

    - name: Check if storage name exists in Proxmox config
      shell: "grep -q '^zfspool: {{ zfs_pool_name }}' /etc/pve/storage.cfg"
      register: storage_exists_check
      failed_when: false
      changed_when: false
      when: storage_cfg_file.stat.exists

    - name: Set storage not configured when config file doesn't exist
      set_fact:
        storage_exists_check:
          rc: 1
      when: not storage_cfg_file.stat.exists

    - name: Debug storage configuration status
      debug:
        msg: |
          Config file exists: {{ storage_cfg_file.stat.exists }}
          Storage check result: {{ storage_exists_check.rc }}
          Pool exists: {{ zfs_pool_exists.rc == 0 }}
          Will remove storage: {{ zfs_pool_exists.rc == 0 and storage_exists_check.rc == 0 }}
          Will add storage: {{ zfs_pool_exists.rc == 0 and storage_exists_check.rc != 0 }}
    - name: Remove existing storage if it exists
      command: pvesm remove {{ zfs_pool_name }}
      register: pvesm_remove_result
      failed_when: false
      when:
        - zfs_pool_exists.rc == 0
        - storage_exists_check.rc == 0

    - name: Add ZFS pool storage to Proxmox using pvesm
      command: >
        pvesm add zfspool {{ zfs_pool_name }}
        --pool {{ zfs_pool_name }}
        --content rootdir,images
        --sparse 1
      when:
        - zfs_pool_exists.rc == 0
        - storage_exists_check.rc != 0
      register: pvesm_add_result

    - name: Verify ZFS pool is healthy
      command: zpool status {{ zfs_pool_name }}
      register: final_zfs_status
      changed_when: false

    - name: Fail if ZFS pool is not healthy
      fail:
        msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state"
      when: "'ONLINE' not in final_zfs_status.stdout"

- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs
  hosts: nodito
  become: true
  vars_files:
    - ../../infra_vars.yml
    - ../../services_config.yml
    - ../../infra_secrets.yml
    - nodito_vars.yml
  vars:
    zfs_check_interval_seconds: 86400   # 24 hours
    zfs_check_timeout_seconds: 90000    # ~25 hours (interval + buffer)
    zfs_check_retries: 1
    zfs_monitoring_script_dir: /opt/zfs-monitoring
    zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh"
    zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log"
    zfs_systemd_health_service_name: zfs-health-monitor
    zfs_systemd_scrub_service_name: zfs-monthly-scrub
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"

  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "zfs-health-{{ host_name.stdout }}"
        monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    - name: Create Uptime Kuma ZFS health monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_zfs_monitor.py
        content: |
          #!/usr/bin/env python3
          import sys
          import json
          from uptime_kuma_api import UptimeKumaApi

          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              retries = int(sys.argv[8])
              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"

              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
              api.login(username, password)

              # Get all monitors
              monitors = api.get_monitors()

              # Get all notifications and find ntfy notification
              notifications = api.get_notifications()
              ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
              notification_id_list = {}
              if ntfy_notification:
                  notification_id_list[ntfy_notification['id']] = True

              # Find or create group
              group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
              if not group:
                  group_result = api.add_monitor(type='group', name=group_name)
                  # Refresh to get the full group object with id
                  monitors = api.get_monitors()
                  group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
              # Find or create/update push monitor
              existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              monitor_data = {
                  'type': 'push',
                  'name': monitor_name,
                  'parent': group['id'],
                  'interval': interval,
                  'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
                  'maxretries': retries,
                  'description': monitor_description,
                  'notificationIDList': notification_id_list
              }
              if existing_monitor:
                  monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              else:
                  monitor_result = api.add_monitor(**monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              # Output result as JSON
              result = {
                  'monitor_id': monitor['id'],
                  'push_token': monitor['pushToken'],
                  'group_name': group_name,
                  'group_id': group['id'],
                  'monitor_name': monitor_name
              }
              print(json.dumps(result))

              api.disconnect()

          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: no

    - name: Run Uptime Kuma ZFS monitor setup script
      command: >
        {{ ansible_playbook_python }} /tmp/setup_uptime_kuma_zfs_monitor.py
        "{{ uptime_kuma_api_url }}"
        "{{ uptime_kuma_username }}"
        "{{ uptime_kuma_password }}"
        "{{ uptime_kuma_monitor_group }}"
        "{{ monitor_name }}"
        "{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}"
        "{{ zfs_check_timeout_seconds }}"
        "{{ zfs_check_retries }}"
        "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: no
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL and monitor ID as facts
      set_fact:
        uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"

    - name: Install required packages for ZFS monitoring
      package:
        name:
          - curl
          - jq
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ zfs_monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create ZFS health monitoring script
      copy:
        dest: "{{ zfs_monitoring_script_path }}"
        content: |
          #!/bin/bash

          # ZFS Pool Health Monitoring Script
          # Checks ZFS pool health using JSON output and sends heartbeat to Uptime Kuma if healthy
          # If any issues detected, does NOT send heartbeat (triggers timeout alert)

          LOG_FILE="{{ zfs_log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}"
          POOL_NAME="{{ zfs_pool_name }}"
          HOSTNAME=$(hostname)

          # Function to log messages
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Function to check pool health using JSON output
          check_pool_health() {
              local pool="$1"
              local issues_found=0
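              # Rough sketch of the JSON shape the checks below consume (an assumption for
              # orientation, not verbatim `zpool status -j` output; key names mirror the jq
              # paths used in this script, and error counters arrive as strings):
              #   { "pools": { "<pool>": {
              #       "state": "ONLINE",
              #       "vdevs": ...nested vdev/device objects with "name", "state",
              #                "read_errors", "write_errors", "checksum_errors"...,
              #       "scan_stats": { "function": "SCRUB", "state": "FINISHED",
              #                       "start_time": "...", "issued": "...", "errors": "0" }
              #   } } }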
              # Get pool status as JSON
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>&1)
              if [ $? -ne 0 ]; then
                  log_message "ERROR: Failed to get pool status for $pool"
                  log_message " -> $pool_json"
                  return 1
              fi

              # Check 1: Pool state must be ONLINE
              local pool_state
              pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state')
              if [ "$pool_state" != "ONLINE" ]; then
                  log_message "ISSUE: Pool state is $pool_state (expected ONLINE)"
                  issues_found=1
              else
                  log_message "OK: Pool state is ONLINE"
              fi

              # Check 2: Check all vdevs and devices for non-ONLINE states
              local bad_states
              bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] | .. | objects
                  | select(.state? and .state != "ONLINE")
                  | "\(.name // "unknown"): \(.state)"
              ' 2>/dev/null)
              if [ -n "$bad_states" ]; then
                  log_message "ISSUE: Found devices not in ONLINE state:"
                  echo "$bad_states" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: All devices are ONLINE"
              fi

              # Check 3: Check for resilvering in progress
              local scan_function scan_state
              scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then
                  local resilver_progress
                  resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"')
                  log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed"
                  issues_found=1
              fi

              # Check 4: Check for read/write/checksum errors on all devices
              # Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison
              local devices_with_errors
              devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] | .. | objects
                  | select(.name? and ((.read_errors // "0" | tonumber) > 0
                      or (.write_errors // "0" | tonumber) > 0
                      or (.checksum_errors // "0" | tonumber) > 0))
                  | "\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)"
              ' 2>/dev/null)
              if [ -n "$devices_with_errors" ]; then
                  log_message "ISSUE: Found devices with I/O errors:"
                  echo "$devices_with_errors" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: No read/write/checksum errors detected"
              fi

              # Check 5: Check for scan errors (from last scrub/resilver)
              local scan_errors
              scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"')
              if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then
                  log_message "ISSUE: Last scan reported $scan_errors errors"
                  issues_found=1
              else
                  log_message "OK: No scan errors"
              fi

              return $issues_found
          }

          # Function to get last scrub info for status message
          get_scrub_info() {
              local pool="$1"
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>/dev/null)

              local scan_func scan_state scan_start
              scan_func=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              scan_start=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.start_time // ""')

              if [ "$scan_func" = "SCRUB" ] && [ "$scan_state" = "SCANNING" ]; then
                  echo "scrub in progress (started $scan_start)"
              elif [ "$scan_func" = "SCRUB" ] && [ -n "$scan_start" ]; then
                  echo "last scrub: $scan_start"
              else
                  echo "no scrub history"
              fi
          }

          # Function to send heartbeat to Uptime Kuma
          send_heartbeat() {
              local message="$1"

              log_message "Sending heartbeat to Uptime Kuma: $message"

              # URL encode the message
              local encoded_message
              encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')

              local response http_code
              response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)

              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Heartbeat sent successfully (HTTP $http_code)"
                  return 0
              else
                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
                  return 1
              fi
          }

          # Main health check logic
          main() {
              log_message "=========================================="
              log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME"

              # Run all health checks
              if check_pool_health "$POOL_NAME"; then
                  # All checks passed - send heartbeat
                  local scrub_info
                  scrub_info=$(get_scrub_info "$POOL_NAME")
                  local message="Pool $POOL_NAME healthy ($scrub_info)"
                  send_heartbeat "$message"
                  log_message "Health check completed: ALL OK"
                  exit 0
              else
                  # Issues found - do NOT send heartbeat (will trigger timeout alert)
                  log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat"
                  log_message "Uptime Kuma will alert after timeout due to missing heartbeat"
                  exit 1
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: '0755'

    - name: Create systemd service for ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Pool Health Monitor
          After=zfs.target network.target

          [Service]
          Type=oneshot
          ExecStart={{ zfs_monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'
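    # Note on cadence: the timer below re-runs the health check every
    # zfs_check_interval_seconds (86400 s = 24 h), while the Uptime Kuma push monitor is
    # configured with zfs_check_timeout_seconds (90000 s = 25 h) as its timeout, i.e. the
    # interval plus a one-hour buffer, so a single delayed run does not raise an alert.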
    - name: Create systemd timer for daily ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Pool Health Monitor daily
          Requires={{ zfs_systemd_health_service_name }}.service

          [Timer]
          OnBootSec=5min
          OnUnitActiveSec={{ zfs_check_interval_seconds }}sec
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd service for ZFS monthly scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Monthly Scrub for {{ zfs_pool_name }}
          After=zfs.target

          [Service]
          Type=oneshot
          ExecStart=/sbin/zpool scrub {{ zfs_pool_name }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for monthly ZFS scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Scrub on last day of every month at 4:00 AM
          Requires={{ zfs_systemd_scrub_service_name }}.service

          [Timer]
          OnCalendar=*-*~01 04:00:00
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Reload systemd daemon
      systemd:
        daemon_reload: yes

    - name: Enable and start ZFS health monitoring timer
      systemd:
        name: "{{ zfs_systemd_health_service_name }}.timer"
        enabled: yes
        state: started

    - name: Enable and start ZFS monthly scrub timer
      systemd:
        name: "{{ zfs_systemd_scrub_service_name }}.timer"
        enabled: yes
        state: started

    - name: Test ZFS health monitoring script
      command: "{{ zfs_monitoring_script_path }}"
      register: script_test
      changed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "ZFS health monitoring script failed - check pool health"

    - name: Display monitoring configuration
      debug:
        msg: |
          ✓ ZFS Pool Health Monitoring deployed successfully!

          Monitor Name: {{ monitor_friendly_name }}
          Monitor Group: {{ uptime_kuma_monitor_group }}
          Pool Name: {{ zfs_pool_name }}

          Health Check:
          - Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours)
          - Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours)
          - Script: {{ zfs_monitoring_script_path }}
          - Log: {{ zfs_log_file }}
          - Service: {{ zfs_systemd_health_service_name }}.service
          - Timer: {{ zfs_systemd_health_service_name }}.timer

          Monthly Scrub:
          - Schedule: Last day of month at 4:00 AM
          - Service: {{ zfs_systemd_scrub_service_name }}.service
          - Timer: {{ zfs_systemd_scrub_service_name }}.timer

          Conditions monitored:
          - Pool state (must be ONLINE)
          - Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL)
          - Resilver status (alerts if resilvering)
          - Read/Write/Checksum errors
          - Scrub errors

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_zfs_monitor.py
        state: absent
      delegate_to: localhost
      become: no
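# ---------------------------------------------------------------------------
# Example values for the variables this playbook expects from nodito_vars.yml
# (hypothetical values for illustration only -- substitute your own pool name,
# mountpoint, and disk-by-id paths):
#
#   zfs_pool_name: tank
#   zfs_pool_mountpoint: /tank
#   zfs_disk_1: /dev/disk/by-id/ata-EXAMPLE_DISK_1
#   zfs_disk_2: /dev/disk/by-id/ata-EXAMPLE_DISK_2
#
# After a run, the deployed timers can be inspected on the host with:
#   systemctl list-timers 'zfs-*'
#   systemctl status zfs-health-monitor.timer zfs-monthly-scrub.timer
# ---------------------------------------------------------------------------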