# personal_infra/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml
- name: Setup ZFS RAID 1 Pool for Proxmox Storage
  hosts: nodito_host
  become: true
  vars_files:
    - ../infra_vars.yml
    - nodito_vars.yml
  tasks:
    - name: Verify Proxmox VE is running
      command: pveversion
      register: pve_version_check
      changed_when: false
      failed_when: pve_version_check.rc != 0

    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600

    - name: Install ZFS utilities
      package:
        name:
          - zfsutils-linux
          - zfs-initramfs
        state: present

    - name: Load ZFS kernel module
      modprobe:
        name: zfs

    - name: Ensure ZFS module loads at boot
      lineinfile:
        path: /etc/modules
        line: zfs
        state: present

    - name: Check if ZFS pool already exists
      command: zpool list {{ zfs_pool_name }}
      register: zfs_pool_exists
      failed_when: false
      changed_when: false

    - name: Check if disks are in use
      shell: |
        for disk in {{ zfs_disk_1 }} {{ zfs_disk_2 }}; do
          if mount | grep -q "^$disk"; then
            echo "ERROR: $disk is mounted"
            exit 1
          fi
          if lsblk -n -o MOUNTPOINT "$disk" | grep -v "^$" | grep -q .; then
            echo "ERROR: $disk has mounted partitions"
            exit 1
          fi
        done
      register: disk_usage_check
      failed_when: disk_usage_check.rc != 0
      changed_when: false

    - name: Create ZFS RAID 1 pool with optimized settings
      command: >
        zpool create {{ zfs_pool_name }}
        -o ashift=12
        -O mountpoint=none
        mirror {{ zfs_disk_1 }} {{ zfs_disk_2 }}
      when: zfs_pool_exists.rc != 0
      register: zfs_pool_create_result
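    # For reference, with illustrative values (the real pool name and disk paths come
    # from nodito_vars.yml, the ones below are hypothetical), the rendered command
    # would look roughly like:
    #   zpool create tank -o ashift=12 -O mountpoint=none \
    #     mirror /dev/disk/by-id/ata-DISK_A /dev/disk/by-id/ata-DISK_B
    # ashift=12 aligns the pool to 4 KiB physical sectors; mountpoint=none keeps the
    # pool root unmounted until the vm-storage dataset below gets its own mountpoint.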
    - name: Check if ZFS dataset already exists
      command: zfs list {{ zfs_pool_name }}/vm-storage
      register: zfs_dataset_exists
      failed_when: false
      changed_when: false

    - name: Create ZFS dataset for Proxmox storage
      command: zfs create {{ zfs_pool_name }}/vm-storage
      when: zfs_dataset_exists.rc != 0
      register: zfs_dataset_create_result

    - name: Set ZFS dataset properties for Proxmox
      command: zfs set {{ item.property }}={{ item.value }} {{ zfs_pool_name }}/vm-storage
      loop:
        - { property: "mountpoint", value: "{{ zfs_pool_mountpoint }}" }
        - { property: "compression", value: "lz4" }
        - { property: "atime", value: "off" }
        - { property: "xattr", value: "sa" }
        - { property: "acltype", value: "posixacl" }
        - { property: "dnodesize", value: "auto" }
      when: zfs_dataset_exists.rc != 0
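    # A quick way to spot-check the result afterwards (pool name illustrative):
    #   zfs get mountpoint,compression,atime,xattr,acltype,dnodesize tank/vm-storage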
    - name: Set ZFS pool properties for Proxmox
      command: zpool set autotrim=off {{ zfs_pool_name }}
      when: zfs_pool_exists.rc != 0

    - name: Set ZFS pool mountpoint for Proxmox
      command: zfs set mountpoint={{ zfs_pool_mountpoint }} {{ zfs_pool_name }}
      when: zfs_pool_exists.rc == 0

    - name: Export and re-import ZFS pool for Proxmox compatibility
      shell: |
        zpool export {{ zfs_pool_name }}
        zpool import {{ zfs_pool_name }}
      when: zfs_pool_exists.rc != 0
      register: zfs_pool_import_result
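    # Presumably the export/import cycle is here so the freshly created pool gets
    # recorded in /etc/zfs/zpool.cache and is re-imported cleanly at boot by
    # zfs-import-cache (enabled below). Manual equivalent (pool name illustrative):
    #   zpool export tank && zpool import tank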
    - name: Ensure ZFS services are enabled
      systemd:
        name: "{{ item }}"
        enabled: yes
        state: started
      loop:
        - zfs-import-cache
        - zfs-import-scan
        - zfs-mount
        - zfs-share
        - zfs-zed

    - name: Check if ZFS pool storage already exists in Proxmox config
      stat:
        path: /etc/pve/storage.cfg
      register: storage_cfg_file

    - name: Check if storage name exists in Proxmox config
      shell: "grep -q '^zfspool: {{ zfs_pool_name }}' /etc/pve/storage.cfg"
      register: storage_exists_check
      failed_when: false
      changed_when: false
      when: storage_cfg_file.stat.exists

    - name: Set storage not configured when config file doesn't exist
      set_fact:
        storage_exists_check:
          rc: 1
      when: not storage_cfg_file.stat.exists

    - name: Debug storage configuration status
      debug:
        msg: |
          Config file exists: {{ storage_cfg_file.stat.exists }}
          Storage check result: {{ storage_exists_check.rc }}
          Pool exists: {{ zfs_pool_exists.rc == 0 }}
          Will remove storage: {{ zfs_pool_exists.rc == 0 and storage_exists_check.rc == 0 }}
          Will add storage: {{ zfs_pool_exists.rc == 0 and storage_exists_check.rc != 0 }}

    - name: Remove existing storage if it exists
      command: pvesm remove {{ zfs_pool_name }}
      register: pvesm_remove_result
      failed_when: false
      when:
        - zfs_pool_exists.rc == 0
        - storage_exists_check.rc == 0

    - name: Add ZFS pool storage to Proxmox using pvesm
      command: >
        pvesm add zfspool {{ zfs_pool_name }}
        --pool {{ zfs_pool_name }}
        --content rootdir,images
        --sparse 1
      when:
        - zfs_pool_exists.rc == 0
        - storage_exists_check.rc != 0
      register: pvesm_add_result
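    # pvesm writes a stanza along these lines into /etc/pve/storage.cfg
    # (storage/pool name illustrative):
    #   zfspool: tank
    #           pool tank
    #           content rootdir,images
    #           sparse 1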
    - name: Verify ZFS pool is healthy
      command: zpool status {{ zfs_pool_name }}
      register: final_zfs_status
      changed_when: false

    - name: Fail if ZFS pool is not healthy
      fail:
        msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state"
      when: "'ONLINE' not in final_zfs_status.stdout"
- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs
  hosts: nodito
  become: true
  vars_files:
    - ../../infra_vars.yml
    - ../../services_config.yml
    - ../../infra_secrets.yml
    - nodito_vars.yml
  vars:
    zfs_check_interval_seconds: 86400 # 24 hours
    zfs_check_timeout_seconds: 90000 # ~25 hours (interval + buffer)
    zfs_check_retries: 1
    zfs_monitoring_script_dir: /opt/zfs-monitoring
    zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh"
    zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log"
    zfs_systemd_health_service_name: zfs-health-monitor
    zfs_systemd_scrub_service_name: zfs-monthly-scrub
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"
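  # Timing sanity check: 86400 s between runs is exactly 24 h; the 90000 s push
  # timeout (25 h) leaves a 1 h grace window, so a genuinely missed heartbeat is
  # what trips the Uptime Kuma alert rather than ordinary scheduling jitter.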
  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "zfs-health-{{ host_name.stdout }}"
        monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    - name: Create Uptime Kuma ZFS health monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_zfs_monitor.py
        content: |
          #!/usr/bin/env python3
          import sys
          import json

          from uptime_kuma_api import UptimeKumaApi


          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              retries = int(sys.argv[8])
              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"

              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
              api.login(username, password)

              # Get all monitors
              monitors = api.get_monitors()

              # Get all notifications and find ntfy notification
              notifications = api.get_notifications()
              ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
              notification_id_list = {}
              if ntfy_notification:
                  notification_id_list[ntfy_notification['id']] = True

              # Find or create group
              group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
              if not group:
                  group_result = api.add_monitor(type='group', name=group_name)
                  # Refresh to get the full group object with id
                  monitors = api.get_monitors()
                  group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)

              # Find or create/update push monitor
              existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              monitor_data = {
                  'type': 'push',
                  'name': monitor_name,
                  'parent': group['id'],
                  'interval': interval,
                  'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
                  'maxretries': retries,
                  'description': monitor_description,
                  'notificationIDList': notification_id_list
              }
              if existing_monitor:
                  monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              else:
                  monitor_result = api.add_monitor(**monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              # Output result as JSON
              result = {
                  'monitor_id': monitor['id'],
                  'push_token': monitor['pushToken'],
                  'group_name': group_name,
                  'group_id': group['id'],
                  'monitor_name': monitor_name
              }
              print(json.dumps(result))
              api.disconnect()


          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: no
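    # The helper script can also be exercised by hand from the control node; all
    # values below are placeholders, not real hosts or credentials:
    #   python3 /tmp/setup_uptime_kuma_zfs_monitor.py \
    #     https://uptime.example.com admin 's3cret' "nodito - infra" \
    #     zfs-health-nodito "ZFS Pool Health: nodito - Daily health check" 90000 1 alerts
    # It prints a JSON blob with monitor_id and push_token, which the next task parses.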
    - name: Run Uptime Kuma ZFS monitor setup script
      command: >
        {{ ansible_playbook_python }}
        /tmp/setup_uptime_kuma_zfs_monitor.py
        "{{ uptime_kuma_api_url }}"
        "{{ uptime_kuma_username }}"
        "{{ uptime_kuma_password }}"
        "{{ uptime_kuma_monitor_group }}"
        "{{ monitor_name }}"
        "{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}"
        "{{ zfs_check_timeout_seconds }}"
        "{{ zfs_check_retries }}"
        "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: no
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL and monitor ID as facts
      set_fact:
        uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
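    # The resulting push URL has the shape https://<uptime-kuma-host>/api/push/<push_token>;
    # the health script below reports in by GET-ing it with ?status=up&msg=... whenever the
    # pool looks healthy, and stays silent otherwise so the monitor times out and alerts.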
    - name: Install required packages for ZFS monitoring
      package:
        name:
          - curl
          - jq
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ zfs_monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create ZFS health monitoring script
      copy:
        dest: "{{ zfs_monitoring_script_path }}"
        content: |
          #!/bin/bash
          # ZFS Pool Health Monitoring Script
          # Checks ZFS pool health using JSON output and sends heartbeat to Uptime Kuma if healthy
          # If any issues detected, does NOT send heartbeat (triggers timeout alert)

          LOG_FILE="{{ zfs_log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}"
          POOL_NAME="{{ zfs_pool_name }}"
          HOSTNAME=$(hostname)

          # Function to log messages
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Function to check pool health using JSON output
          check_pool_health() {
              local pool="$1"
              local issues_found=0

              # Get pool status as JSON
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>&1)
              if [ $? -ne 0 ]; then
                  log_message "ERROR: Failed to get pool status for $pool"
                  log_message " -> $pool_json"
                  return 1
              fi

              # Check 1: Pool state must be ONLINE
              local pool_state
              pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state')
              if [ "$pool_state" != "ONLINE" ]; then
                  log_message "ISSUE: Pool state is $pool_state (expected ONLINE)"
                  issues_found=1
              else
                  log_message "OK: Pool state is ONLINE"
              fi

              # Check 2: Check all vdevs and devices for non-ONLINE states
              local bad_states
              bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.state? and .state != "ONLINE") |
                  "\(.name // "unknown"): \(.state)"
              ' 2>/dev/null)
              if [ -n "$bad_states" ]; then
                  log_message "ISSUE: Found devices not in ONLINE state:"
                  echo "$bad_states" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: All devices are ONLINE"
              fi

              # Check 3: Check for resilvering in progress
              local scan_function scan_state
              scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then
                  local resilver_progress
                  resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"')
                  log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed"
                  issues_found=1
              fi

              # Check 4: Check for read/write/checksum errors on all devices
              # Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison
              local devices_with_errors
              devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.name? and ((.read_errors // "0" | tonumber) > 0 or (.write_errors // "0" | tonumber) > 0 or (.checksum_errors // "0" | tonumber) > 0)) |
                  "\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)"
              ' 2>/dev/null)
              if [ -n "$devices_with_errors" ]; then
                  log_message "ISSUE: Found devices with I/O errors:"
                  echo "$devices_with_errors" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: No read/write/checksum errors detected"
              fi

              # Check 5: Check for scan errors (from last scrub/resilver)
              local scan_errors
              scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"')
              if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then
                  log_message "ISSUE: Last scan reported $scan_errors errors"
                  issues_found=1
              else
                  log_message "OK: No scan errors"
              fi

              return $issues_found
          }

          # Function to get last scrub info for status message
          get_scrub_info() {
              local pool="$1"
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>/dev/null)
              local scan_func scan_state scan_start
              scan_func=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              scan_start=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.start_time // ""')
              if [ "$scan_func" = "SCRUB" ] && [ "$scan_state" = "SCANNING" ]; then
                  echo "scrub in progress (started $scan_start)"
              elif [ "$scan_func" = "SCRUB" ] && [ -n "$scan_start" ]; then
                  echo "last scrub: $scan_start"
              else
                  echo "no scrub history"
              fi
          }

          # Function to send heartbeat to Uptime Kuma
          send_heartbeat() {
              local message="$1"
              log_message "Sending heartbeat to Uptime Kuma: $message"

              # URL encode the message
              local encoded_message
              encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')

              local response http_code
              response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)
              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Heartbeat sent successfully (HTTP $http_code)"
                  return 0
              else
                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
                  return 1
              fi
          }
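          # (Illustrative only) the heartbeat is a plain GET against the push endpoint,
          # so it can be tested by hand with a made-up domain/token standing in for
          # the real ones:
          #   curl "https://uptime.example.com/api/push/abc123?status=up&msg=test"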
          # Main health check logic
          main() {
              log_message "=========================================="
              log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME"

              # Run all health checks
              if check_pool_health "$POOL_NAME"; then
                  # All checks passed - send heartbeat
                  local scrub_info
                  scrub_info=$(get_scrub_info "$POOL_NAME")
                  local message="Pool $POOL_NAME healthy ($scrub_info)"
                  send_heartbeat "$message"
                  log_message "Health check completed: ALL OK"
                  exit 0
              else
                  # Issues found - do NOT send heartbeat (will trigger timeout alert)
                  log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat"
                  log_message "Uptime Kuma will alert after timeout due to missing heartbeat"
                  exit 1
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: '0755'
    - name: Create systemd service for ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Pool Health Monitor
          After=zfs.target network.target

          [Service]
          Type=oneshot
          ExecStart={{ zfs_monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for daily ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Pool Health Monitor daily
          Requires={{ zfs_systemd_health_service_name }}.service

          [Timer]
          OnBootSec=5min
          OnUnitActiveSec={{ zfs_check_interval_seconds }}sec
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd service for ZFS monthly scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Monthly Scrub for {{ zfs_pool_name }}
          After=zfs.target

          [Service]
          Type=oneshot
          ExecStart=/sbin/zpool scrub {{ zfs_pool_name }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for monthly ZFS scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Scrub on last day of every month at 4:00 AM
          Requires={{ zfs_systemd_scrub_service_name }}.service

          [Timer]
          OnCalendar=*-*~01 04:00:00
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'
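    # The "last day of month" calendar expression can be sanity-checked on the host:
    #   systemd-analyze calendar "*-*~01 04:00:00"
    # which prints the normalized form and the next trigger time.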
    - name: Reload systemd daemon
      systemd:
        daemon_reload: yes

    - name: Enable and start ZFS health monitoring timer
      systemd:
        name: "{{ zfs_systemd_health_service_name }}.timer"
        enabled: yes
        state: started

    - name: Enable and start ZFS monthly scrub timer
      systemd:
        name: "{{ zfs_systemd_scrub_service_name }}.timer"
        enabled: yes
        state: started
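    # Quick manual check that both timers are scheduled (names follow the vars above):
    #   systemctl list-timers 'zfs-health-monitor.timer' 'zfs-monthly-scrub.timer'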
    - name: Test ZFS health monitoring script
      command: "{{ zfs_monitoring_script_path }}"
      register: script_test
      changed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "ZFS health monitoring script failed - check pool health"

    - name: Display monitoring configuration
      debug:
        msg: |
          ✓ ZFS Pool Health Monitoring deployed successfully!

          Monitor Name: {{ monitor_friendly_name }}
          Monitor Group: {{ uptime_kuma_monitor_group }}
          Pool Name: {{ zfs_pool_name }}

          Health Check:
            - Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours)
            - Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours)
            - Script: {{ zfs_monitoring_script_path }}
            - Log: {{ zfs_log_file }}
            - Service: {{ zfs_systemd_health_service_name }}.service
            - Timer: {{ zfs_systemd_health_service_name }}.timer

          Monthly Scrub:
            - Schedule: Last day of month at 4:00 AM
            - Service: {{ zfs_systemd_scrub_service_name }}.service
            - Timer: {{ zfs_systemd_scrub_service_name }}.timer

          Conditions monitored:
            - Pool state (must be ONLINE)
            - Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL)
            - Resilver status (alerts if resilvering)
            - Read/Write/Checksum errors
            - Scrub errors

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_zfs_monitor.py
        state: absent
      delegate_to: localhost
      become: no