# personal_infra/ansible/infra/nodito/32_zfs_pool_setup_playbook.yml
- name: Setup ZFS RAID 1 Pool for Proxmox Storage
  hosts: nodito_host
  become: true
  vars_files:
    - ../infra_vars.yml
    - nodito_vars.yml
  tasks:
    - name: Verify Proxmox VE is running
      command: pveversion
      register: pve_version_check
      changed_when: false
      failed_when: pve_version_check.rc != 0

    - name: Update package cache
      apt:
        update_cache: yes
        cache_valid_time: 3600

    - name: Install ZFS utilities
      package:
        name:
          - zfsutils-linux
          - zfs-initramfs
        state: present

    - name: Load ZFS kernel module
      modprobe:
        name: zfs

    - name: Ensure ZFS module loads at boot
      lineinfile:
        path: /etc/modules
        line: zfs
        state: present

    - name: Check if ZFS pool already exists
      command: zpool list {{ zfs_pool_name }}
      register: zfs_pool_exists
      failed_when: false
      changed_when: false

    - name: Check if disks are in use
      shell: |
        for disk in {{ zfs_disk_1 }} {{ zfs_disk_2 }}; do
          if mount | grep -q "^$disk"; then
            echo "ERROR: $disk is mounted"
            exit 1
          fi
          if lsblk -n -o MOUNTPOINT "$disk" | grep -v "^$" | grep -q .; then
            echo "ERROR: $disk has mounted partitions"
            exit 1
          fi
        done
      register: disk_usage_check
      failed_when: disk_usage_check.rc != 0
      changed_when: false

    - name: Create ZFS RAID 1 pool with optimized settings
      command: >
        zpool create {{ zfs_pool_name }}
        -o ashift=12
        -O mountpoint=none
        mirror {{ zfs_disk_1 }} {{ zfs_disk_2 }}
      when: zfs_pool_exists.rc != 0
      register: zfs_pool_create_result
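    # For reference, with illustrative values (the real pool name and disk paths come
    # from nodito_vars.yml, the ones below are hypothetical), the rendered command
    # would look roughly like:
    #   zpool create tank -o ashift=12 -O mountpoint=none \
    #     mirror /dev/disk/by-id/ata-DISK_A /dev/disk/by-id/ata-DISK_B
    # ashift=12 aligns the pool to 4 KiB physical sectors; mountpoint=none keeps the
    # pool root unmounted until the vm-storage dataset below gets its own mountpoint.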
    - name: Check if ZFS dataset already exists
      command: zfs list {{ zfs_pool_name }}/vm-storage
      register: zfs_dataset_exists
      failed_when: false
      changed_when: false

    - name: Create ZFS dataset for Proxmox storage
      command: zfs create {{ zfs_pool_name }}/vm-storage
      when: zfs_dataset_exists.rc != 0
      register: zfs_dataset_create_result

    - name: Set ZFS dataset properties for Proxmox
      command: zfs set {{ item.property }}={{ item.value }} {{ zfs_pool_name }}/vm-storage
      loop:
        - { property: "mountpoint", value: "{{ zfs_pool_mountpoint }}" }
        - { property: "compression", value: "lz4" }
        - { property: "atime", value: "off" }
        - { property: "xattr", value: "sa" }
        - { property: "acltype", value: "posixacl" }
        - { property: "dnodesize", value: "auto" }
      when: zfs_dataset_exists.rc != 0
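    # A quick way to spot-check the result afterwards (pool name illustrative):
    #   zfs get mountpoint,compression,atime,xattr,acltype,dnodesize tank/vm-storage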
    - name: Set ZFS pool properties for Proxmox
      command: zpool set autotrim=off {{ zfs_pool_name }}
      when: zfs_pool_exists.rc != 0

    - name: Set ZFS pool mountpoint for Proxmox
      command: zfs set mountpoint={{ zfs_pool_mountpoint }} {{ zfs_pool_name }}
      when: zfs_pool_exists.rc == 0

    - name: Export and re-import ZFS pool for Proxmox compatibility
      shell: |
        zpool export {{ zfs_pool_name }}
        zpool import {{ zfs_pool_name }}
      when: zfs_pool_exists.rc != 0
      register: zfs_pool_import_result
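    # Presumably the export/import cycle is here so the freshly created pool gets
    # recorded in /etc/zfs/zpool.cache and is re-imported cleanly at boot by
    # zfs-import-cache (enabled below). Manual equivalent (pool name illustrative):
    #   zpool export tank && zpool import tank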
    - name: Ensure ZFS services are enabled
      systemd:
        name: "{{ item }}"
        enabled: yes
        state: started
      loop:
        - zfs-import-cache
        - zfs-import-scan
        - zfs-mount
        - zfs-share
        - zfs-zed

    - name: Check if ZFS pool storage already exists in Proxmox config
      stat:
        path: /etc/pve/storage.cfg
      register: storage_cfg_file

    - name: Check if storage name exists in Proxmox config
      shell: "grep -q '^zfspool: {{ zfs_pool_name }}' /etc/pve/storage.cfg"
      register: storage_exists_check
      failed_when: false
      changed_when: false
      when: storage_cfg_file.stat.exists

    - name: Set storage not configured when config file doesn't exist
      set_fact:
        storage_exists_check:
          rc: 1
      when: not storage_cfg_file.stat.exists

    - name: Debug storage configuration status
      debug:
        msg: |
          Config file exists: {{ storage_cfg_file.stat.exists }}
          Storage check result: {{ storage_exists_check.rc }}
          Pool exists: {{ zfs_pool_exists.rc == 0 }}
          Will remove storage: {{ zfs_pool_exists.rc == 0 and storage_exists_check.rc == 0 }}
          Will add storage: {{ zfs_pool_exists.rc == 0 and storage_exists_check.rc != 0 }}

    - name: Remove existing storage if it exists
      command: pvesm remove {{ zfs_pool_name }}
      register: pvesm_remove_result
      failed_when: false
      when:
        - zfs_pool_exists.rc == 0
        - storage_exists_check.rc == 0

    - name: Add ZFS pool storage to Proxmox using pvesm
      command: >
        pvesm add zfspool {{ zfs_pool_name }}
        --pool {{ zfs_pool_name }}
        --content rootdir,images
        --sparse 1
      when:
        - zfs_pool_exists.rc == 0
        - storage_exists_check.rc != 0
      register: pvesm_add_result
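    # pvesm writes a stanza along these lines into /etc/pve/storage.cfg
    # (storage/pool name illustrative):
    #   zfspool: tank
    #           pool tank
    #           content rootdir,images
    #           sparse 1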
    - name: Verify ZFS pool is healthy
      command: zpool status {{ zfs_pool_name }}
      register: final_zfs_status
      changed_when: false

    - name: Fail if ZFS pool is not healthy
      fail:
        msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state"
      when: "'ONLINE' not in final_zfs_status.stdout"
- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs
  hosts: nodito
  become: true
  vars_files:
    - ../../infra_vars.yml
    - ../../services_config.yml
    - ../../infra_secrets.yml
    - nodito_vars.yml
  vars:
    zfs_check_interval_seconds: 86400 # 24 hours
    zfs_check_timeout_seconds: 90000 # ~25 hours (interval + buffer)
    zfs_check_retries: 1
    zfs_monitoring_script_dir: /opt/zfs-monitoring
    zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh"
    zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log"
    zfs_systemd_health_service_name: zfs-health-monitor
    zfs_systemd_scrub_service_name: zfs-monthly-scrub
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"
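  # Timing sanity check: 86400 s between runs is exactly 24 h; the 90000 s push
  # timeout (25 h) leaves a 1 h grace window, so a genuinely missed heartbeat is
  # what trips the Uptime Kuma alert rather than ordinary scheduling jitter.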
  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "zfs-health-{{ host_name.stdout }}"
        monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    - name: Create Uptime Kuma ZFS health monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_zfs_monitor.py
        content: |
          #!/usr/bin/env python3
          import sys
          import json

          from uptime_kuma_api import UptimeKumaApi


          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              retries = int(sys.argv[8])
              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"

              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
              api.login(username, password)

              # Get all monitors
              monitors = api.get_monitors()

              # Get all notifications and find ntfy notification
              notifications = api.get_notifications()
              ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
              notification_id_list = {}
              if ntfy_notification:
                  notification_id_list[ntfy_notification['id']] = True

              # Find or create group
              group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
              if not group:
                  group_result = api.add_monitor(type='group', name=group_name)
                  # Refresh to get the full group object with id
                  monitors = api.get_monitors()
                  group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)

              # Find or create/update push monitor
              existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              monitor_data = {
                  'type': 'push',
                  'name': monitor_name,
                  'parent': group['id'],
                  'interval': interval,
                  'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
                  'maxretries': retries,
                  'description': monitor_description,
                  'notificationIDList': notification_id_list
              }
              if existing_monitor:
                  monitor = api.edit_monitor(existing_monitor['id'], **monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)
              else:
                  monitor_result = api.add_monitor(**monitor_data)
                  # Refresh to get the full monitor object with pushToken
                  monitors = api.get_monitors()
                  monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              # Output result as JSON
              result = {
                  'monitor_id': monitor['id'],
                  'push_token': monitor['pushToken'],
                  'group_name': group_name,
                  'group_id': group['id'],
                  'monitor_name': monitor_name
              }
              print(json.dumps(result))
              api.disconnect()


          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: no
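    # The helper script can also be exercised by hand from the control node; all
    # values below are placeholders, not real hosts or credentials:
    #   python3 /tmp/setup_uptime_kuma_zfs_monitor.py \
    #     https://uptime.example.com admin 's3cret' "nodito - infra" \
    #     zfs-health-nodito "ZFS Pool Health: nodito - Daily health check" 90000 1 alerts
    # It prints a JSON blob with monitor_id and push_token, which the next task parses.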
    - name: Run Uptime Kuma ZFS monitor setup script
      command: >
        {{ ansible_playbook_python }}
        /tmp/setup_uptime_kuma_zfs_monitor.py
        "{{ uptime_kuma_api_url }}"
        "{{ uptime_kuma_username }}"
        "{{ uptime_kuma_password }}"
        "{{ uptime_kuma_monitor_group }}"
        "{{ monitor_name }}"
        "{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}"
        "{{ zfs_check_timeout_seconds }}"
        "{{ zfs_check_retries }}"
        "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: no
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL and monitor ID as facts
      set_fact:
        uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
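    # The resulting push URL has the shape https://<uptime-kuma-host>/api/push/<push_token>;
    # the health script below reports in by GET-ing it with ?status=up&msg=... whenever the
    # pool looks healthy, and stays silent otherwise so the monitor times out and alerts.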
    - name: Install required packages for ZFS monitoring
      package:
        name:
          - curl
          - jq
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ zfs_monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create ZFS health monitoring script
      copy:
        dest: "{{ zfs_monitoring_script_path }}"
        content: |
          #!/bin/bash
          # ZFS Pool Health Monitoring Script
          # Checks ZFS pool health using JSON output and sends heartbeat to Uptime Kuma if healthy
          # If any issues detected, does NOT send heartbeat (triggers timeout alert)

          LOG_FILE="{{ zfs_log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}"
          POOL_NAME="{{ zfs_pool_name }}"
          HOSTNAME=$(hostname)

          # Function to log messages
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Function to check pool health using JSON output
          check_pool_health() {
              local pool="$1"
              local issues_found=0

              # Get pool status as JSON
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>&1)
              if [ $? -ne 0 ]; then
                  log_message "ERROR: Failed to get pool status for $pool"
                  log_message " -> $pool_json"
                  return 1
              fi

              # Check 1: Pool state must be ONLINE
              local pool_state
              pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state')
              if [ "$pool_state" != "ONLINE" ]; then
                  log_message "ISSUE: Pool state is $pool_state (expected ONLINE)"
                  issues_found=1
              else
                  log_message "OK: Pool state is ONLINE"
              fi

              # Check 2: Check all vdevs and devices for non-ONLINE states
              local bad_states
              bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.state? and .state != "ONLINE") |
                  "\(.name // "unknown"): \(.state)"
              ' 2>/dev/null)
              if [ -n "$bad_states" ]; then
                  log_message "ISSUE: Found devices not in ONLINE state:"
                  echo "$bad_states" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: All devices are ONLINE"
              fi

              # Check 3: Check for resilvering in progress
              local scan_function scan_state
              scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then
                  local resilver_progress
                  resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"')
                  log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed"
                  issues_found=1
              fi

              # Check 4: Check for read/write/checksum errors on all devices
              # Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison
              local devices_with_errors
              devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.name? and ((.read_errors // "0" | tonumber) > 0 or (.write_errors // "0" | tonumber) > 0 or (.checksum_errors // "0" | tonumber) > 0)) |
                  "\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)"
              ' 2>/dev/null)
              if [ -n "$devices_with_errors" ]; then
                  log_message "ISSUE: Found devices with I/O errors:"
                  echo "$devices_with_errors" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: No read/write/checksum errors detected"
              fi

              # Check 5: Check for scan errors (from last scrub/resilver)
              local scan_errors
              scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"')
              if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then
                  log_message "ISSUE: Last scan reported $scan_errors errors"
                  issues_found=1
              else
                  log_message "OK: No scan errors"
              fi

              return $issues_found
          }

          # Function to get last scrub info for status message
          get_scrub_info() {
              local pool="$1"
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>/dev/null)
              local scan_func scan_state scan_start
              scan_func=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              scan_start=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.start_time // ""')
              if [ "$scan_func" = "SCRUB" ] && [ "$scan_state" = "SCANNING" ]; then
                  echo "scrub in progress (started $scan_start)"
              elif [ "$scan_func" = "SCRUB" ] && [ -n "$scan_start" ]; then
                  echo "last scrub: $scan_start"
              else
                  echo "no scrub history"
              fi
          }

          # Function to send heartbeat to Uptime Kuma
          send_heartbeat() {
              local message="$1"
              log_message "Sending heartbeat to Uptime Kuma: $message"

              # URL encode the message
              local encoded_message
              encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')

              local response http_code
              response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)
              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Heartbeat sent successfully (HTTP $http_code)"
                  return 0
              else
                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
                  return 1
              fi
          }
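          # (Illustrative only) the heartbeat is a plain GET against the push endpoint,
          # so it can be tested by hand with a made-up domain/token standing in for
          # the real ones:
          #   curl "https://uptime.example.com/api/push/abc123?status=up&msg=test"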
          # Main health check logic
          main() {
              log_message "=========================================="
              log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME"

              # Run all health checks
              if check_pool_health "$POOL_NAME"; then
                  # All checks passed - send heartbeat
                  local scrub_info
                  scrub_info=$(get_scrub_info "$POOL_NAME")
                  local message="Pool $POOL_NAME healthy ($scrub_info)"
                  send_heartbeat "$message"
                  log_message "Health check completed: ALL OK"
                  exit 0
              else
                  # Issues found - do NOT send heartbeat (will trigger timeout alert)
                  log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat"
                  log_message "Uptime Kuma will alert after timeout due to missing heartbeat"
                  exit 1
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: '0755'
    - name: Create systemd service for ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Pool Health Monitor
          After=zfs.target network.target

          [Service]
          Type=oneshot
          ExecStart={{ zfs_monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for daily ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Pool Health Monitor daily
          Requires={{ zfs_systemd_health_service_name }}.service

          [Timer]
          OnBootSec=5min
          OnUnitActiveSec={{ zfs_check_interval_seconds }}sec
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd service for ZFS monthly scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Monthly Scrub for {{ zfs_pool_name }}
          After=zfs.target

          [Service]
          Type=oneshot
          ExecStart=/sbin/zpool scrub {{ zfs_pool_name }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for monthly ZFS scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Scrub on last day of every month at 4:00 AM
          Requires={{ zfs_systemd_scrub_service_name }}.service

          [Timer]
          OnCalendar=*-*~01 04:00:00
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'
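    # The "last day of month" calendar expression can be sanity-checked on the host:
    #   systemd-analyze calendar "*-*~01 04:00:00"
    # which prints the normalized form and the next trigger time.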
    - name: Reload systemd daemon
      systemd:
        daemon_reload: yes

    - name: Enable and start ZFS health monitoring timer
      systemd:
        name: "{{ zfs_systemd_health_service_name }}.timer"
        enabled: yes
        state: started

    - name: Enable and start ZFS monthly scrub timer
      systemd:
        name: "{{ zfs_systemd_scrub_service_name }}.timer"
        enabled: yes
        state: started
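    # Quick manual check that both timers are scheduled (names follow the vars above):
    #   systemctl list-timers 'zfs-health-monitor.timer' 'zfs-monthly-scrub.timer'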
    - name: Test ZFS health monitoring script
      command: "{{ zfs_monitoring_script_path }}"
      register: script_test
      changed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "ZFS health monitoring script failed - check pool health"

    - name: Display monitoring configuration
      debug:
        msg: |
          ✓ ZFS Pool Health Monitoring deployed successfully!

          Monitor Name: {{ monitor_friendly_name }}
          Monitor Group: {{ uptime_kuma_monitor_group }}
          Pool Name: {{ zfs_pool_name }}

          Health Check:
            - Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours)
            - Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours)
            - Script: {{ zfs_monitoring_script_path }}
            - Log: {{ zfs_log_file }}
            - Service: {{ zfs_systemd_health_service_name }}.service
            - Timer: {{ zfs_systemd_health_service_name }}.timer

          Monthly Scrub:
            - Schedule: Last day of month at 4:00 AM
            - Service: {{ zfs_systemd_scrub_service_name }}.service
            - Timer: {{ zfs_systemd_scrub_service_name }}.timer

          Conditions monitored:
            - Pool state (must be ONLINE)
            - Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL)
            - Resilver status (alerts if resilvering)
            - Read/Write/Checksum errors
            - Scrub errors

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_zfs_monitor.py
        state: absent
      delegate_to: localhost
      become: no