monitor zfs
This commit is contained in:
parent c6795dc581
commit fe321050c1
1 changed file with 496 additions and 0 deletions
@@ -170,3 +170,499 @@
      fail:
        msg: "ZFS pool {{ zfs_pool_name }} is not in a healthy state"
      when: "'ONLINE' not in final_zfs_status.stdout"

- name: Setup ZFS Pool Health Monitoring and Monthly Scrubs
  hosts: nodito
  become: true

  vars_files:
    - ../../infra_vars.yml
    - ../../services_config.yml
    - ../../infra_secrets.yml
    - nodito_vars.yml

  vars:
    zfs_check_interval_seconds: 86400   # 24 hours
    zfs_check_timeout_seconds: 90000    # ~25 hours (interval + buffer)
    zfs_check_retries: 1
    zfs_monitoring_script_dir: /opt/zfs-monitoring
    zfs_monitoring_script_path: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.sh"
    zfs_log_file: "{{ zfs_monitoring_script_dir }}/zfs_health_monitor.log"
    zfs_systemd_health_service_name: zfs-health-monitor
    zfs_systemd_scrub_service_name: zfs-monthly-scrub
    uptime_kuma_api_url: "https://{{ subdomains.uptime_kuma }}.{{ root_domain }}"
    ntfy_topic: "{{ service_settings.ntfy.topic }}"
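
  # How the push monitor works (illustrative, not part of the play): Uptime Kuma
  # exposes a per-monitor push URL, and any HTTP GET against it counts as a
  # heartbeat. Because the monitor timeout (90000 s) is slightly longer than the
  # check interval (86400 s), a single missed heartbeat is enough to raise an
  # alert. Sketch of the call, with a placeholder host and token:
  #
  #   curl "https://uptime.example.com/api/push/<pushToken>?status=up&msg=OK"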
  tasks:
    - name: Validate Uptime Kuma configuration
      assert:
        that:
          - uptime_kuma_api_url is defined
          - uptime_kuma_api_url != ""
          - uptime_kuma_username is defined
          - uptime_kuma_username != ""
          - uptime_kuma_password is defined
          - uptime_kuma_password != ""
        fail_msg: "uptime_kuma_api_url, uptime_kuma_username and uptime_kuma_password must be set"

    - name: Get hostname for monitor identification
      command: hostname
      register: host_name
      changed_when: false

    - name: Set monitor name and group based on hostname
      set_fact:
        monitor_name: "zfs-health-{{ host_name.stdout }}"
        monitor_friendly_name: "ZFS Pool Health: {{ host_name.stdout }}"
        uptime_kuma_monitor_group: "{{ host_name.stdout }} - infra"

    - name: Create Uptime Kuma ZFS health monitor setup script
      copy:
        dest: /tmp/setup_uptime_kuma_zfs_monitor.py
        content: |
          #!/usr/bin/env python3
          import sys
          import json
          from uptime_kuma_api import UptimeKumaApi

          def main():
              api_url = sys.argv[1]
              username = sys.argv[2]
              password = sys.argv[3]
              group_name = sys.argv[4]
              monitor_name = sys.argv[5]
              monitor_description = sys.argv[6]
              interval = int(sys.argv[7])
              retries = int(sys.argv[8])
              ntfy_topic = sys.argv[9] if len(sys.argv) > 9 else "alerts"

              api = UptimeKumaApi(api_url, timeout=120, wait_events=2.0)
              api.login(username, password)

              # Get all monitors
              monitors = api.get_monitors()

              # Get all notifications and find the ntfy notification
              notifications = api.get_notifications()
              ntfy_notification = next((n for n in notifications if n.get('name') == f'ntfy ({ntfy_topic})'), None)
              notification_id_list = {}
              if ntfy_notification:
                  notification_id_list[ntfy_notification['id']] = True

              # Find or create group
              group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)
              if not group:
                  api.add_monitor(type='group', name=group_name)
                  # Refresh to get the full group object with id
                  monitors = api.get_monitors()
                  group = next((m for m in monitors if m.get('name') == group_name and m.get('type') == 'group'), None)

              # Find or create/update push monitor
              existing_monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              monitor_data = {
                  'type': 'push',
                  'name': monitor_name,
                  'parent': group['id'],
                  'interval': interval,
                  'upsideDown': False,  # Normal heartbeat mode: receiving pings = healthy
                  'maxretries': retries,
                  'description': monitor_description,
                  'notificationIDList': notification_id_list
              }

              if existing_monitor:
                  api.edit_monitor(existing_monitor['id'], **monitor_data)
              else:
                  api.add_monitor(**monitor_data)
              # Refresh to get the full monitor object with pushToken
              monitors = api.get_monitors()
              monitor = next((m for m in monitors if m.get('name') == monitor_name), None)

              # Output result as JSON
              result = {
                  'monitor_id': monitor['id'],
                  'push_token': monitor['pushToken'],
                  'group_name': group_name,
                  'group_id': group['id'],
                  'monitor_name': monitor_name
              }
              print(json.dumps(result))

              api.disconnect()

          if __name__ == '__main__':
              main()
        mode: '0755'
      delegate_to: localhost
      become: no
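
    # The setup script runs on the controller under {{ ansible_playbook_python }},
    # so the uptime-kuma-api package must be importable there (assumption: it was
    # installed beforehand, e.g. with `pip install uptime-kuma-api`). On success
    # the script prints a single JSON object along these lines (illustrative values):
    #
    #   {"monitor_id": 42, "push_token": "abc123XYZ",
    #    "group_name": "nodito - infra", "group_id": 7,
    #    "monitor_name": "zfs-health-nodito"}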

    - name: Run Uptime Kuma ZFS monitor setup script
      command: >
        {{ ansible_playbook_python }}
        /tmp/setup_uptime_kuma_zfs_monitor.py
        "{{ uptime_kuma_api_url }}"
        "{{ uptime_kuma_username }}"
        "{{ uptime_kuma_password }}"
        "{{ uptime_kuma_monitor_group }}"
        "{{ monitor_name }}"
        "{{ monitor_friendly_name }} - Daily health check for pool {{ zfs_pool_name }}"
        "{{ zfs_check_timeout_seconds }}"
        "{{ zfs_check_retries }}"
        "{{ ntfy_topic }}"
      register: monitor_setup_result
      delegate_to: localhost
      become: no
      changed_when: false

    - name: Parse monitor setup result
      set_fact:
        monitor_info_parsed: "{{ monitor_setup_result.stdout | from_json }}"

    - name: Set push URL and monitor ID as facts
      set_fact:
        uptime_kuma_zfs_push_url: "{{ uptime_kuma_api_url }}/api/push/{{ monitor_info_parsed.push_token }}"
        uptime_kuma_monitor_id: "{{ monitor_info_parsed.monitor_id }}"
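
    # With the illustrative output above, uptime_kuma_zfs_push_url would resolve
    # to something like https://uptime.example.com/api/push/abc123XYZ (hypothetical
    # host and token); this is the URL the health script below pings on success.
    # Note that the monitor's expected-heartbeat interval is deliberately set to
    # zfs_check_timeout_seconds rather than the check interval, giving each daily
    # check roughly an hour of grace before the monitor goes down.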

    - name: Install required packages for ZFS monitoring
      package:
        name:
          - curl
          - jq
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ zfs_monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create ZFS health monitoring script
      copy:
        dest: "{{ zfs_monitoring_script_path }}"
        content: |
          #!/bin/bash

          # ZFS Pool Health Monitoring Script
          # Checks ZFS pool health using JSON output and sends heartbeat to Uptime Kuma if healthy
          # If any issues detected, does NOT send heartbeat (triggers timeout alert)

          LOG_FILE="{{ zfs_log_file }}"
          UPTIME_KUMA_URL="{{ uptime_kuma_zfs_push_url }}"
          POOL_NAME="{{ zfs_pool_name }}"
          HOSTNAME=$(hostname)

          # Function to log messages
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Function to check pool health using JSON output
          check_pool_health() {
              local pool="$1"
              local issues_found=0

              # Get pool status as JSON
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>&1)

              if [ $? -ne 0 ]; then
                  log_message "ERROR: Failed to get pool status for $pool"
                  log_message " -> $pool_json"
                  return 1
              fi

              # Check 1: Pool state must be ONLINE
              local pool_state
              pool_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].state')

              if [ "$pool_state" != "ONLINE" ]; then
                  log_message "ISSUE: Pool state is $pool_state (expected ONLINE)"
                  issues_found=1
              else
                  log_message "OK: Pool state is ONLINE"
              fi

              # Check 2: Check all vdevs and devices for non-ONLINE states
              local bad_states
              bad_states=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.state? and .state != "ONLINE") |
                  "\(.name // "unknown"): \(.state)"
              ' 2>/dev/null)

              if [ -n "$bad_states" ]; then
                  log_message "ISSUE: Found devices not in ONLINE state:"
                  echo "$bad_states" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: All devices are ONLINE"
              fi

              # Check 3: Check for resilvering in progress
              local scan_function scan_state
              scan_function=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')

              if [ "$scan_function" = "RESILVER" ] && [ "$scan_state" = "SCANNING" ]; then
                  local resilver_progress
                  resilver_progress=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.issued // "unknown"')
                  log_message "ISSUE: Pool is currently resilvering (disk reconstruction in progress) - ${resilver_progress} processed"
                  issues_found=1
              fi

              # Check 4: Check for read/write/checksum errors on all devices
              # Note: ZFS JSON output has error counts as strings, so convert to numbers for comparison
              local devices_with_errors
              devices_with_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '
                  .pools[$pool].vdevs[] |
                  .. | objects |
                  select(.name? and ((.read_errors // "0" | tonumber) > 0 or (.write_errors // "0" | tonumber) > 0 or (.checksum_errors // "0" | tonumber) > 0)) |
                  "\(.name): read=\(.read_errors // 0) write=\(.write_errors // 0) cksum=\(.checksum_errors // 0)"
              ' 2>/dev/null)

              if [ -n "$devices_with_errors" ]; then
                  log_message "ISSUE: Found devices with I/O errors:"
                  echo "$devices_with_errors" | while read -r line; do
                      log_message " -> $line"
                  done
                  issues_found=1
              else
                  log_message "OK: No read/write/checksum errors detected"
              fi

              # Check 5: Check for scan errors (from last scrub/resilver)
              local scan_errors
              scan_errors=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.errors // "0"')

              if [ "$scan_errors" != "0" ] && [ "$scan_errors" != "null" ] && [ -n "$scan_errors" ]; then
                  log_message "ISSUE: Last scan reported $scan_errors errors"
                  issues_found=1
              else
                  log_message "OK: No scan errors"
              fi

              return $issues_found
          }

          # Function to get last scrub info for status message
          get_scrub_info() {
              local pool="$1"
              local pool_json
              pool_json=$(zpool status -j "$pool" 2>/dev/null)

              local scan_func scan_state scan_start
              scan_func=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.function // "NONE"')
              scan_state=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.state // "NONE"')
              scan_start=$(echo "$pool_json" | jq -r --arg pool "$pool" '.pools[$pool].scan_stats.start_time // ""')

              if [ "$scan_func" = "SCRUB" ] && [ "$scan_state" = "SCANNING" ]; then
                  echo "scrub in progress (started $scan_start)"
              elif [ "$scan_func" = "SCRUB" ] && [ -n "$scan_start" ]; then
                  echo "last scrub: $scan_start"
              else
                  echo "no scrub history"
              fi
          }

          # Function to send heartbeat to Uptime Kuma
          send_heartbeat() {
              local message="$1"

              log_message "Sending heartbeat to Uptime Kuma: $message"

              # URL encode the message
              local encoded_message
              encoded_message=$(printf '%s\n' "$message" | sed 's/ /%20/g; s/(/%28/g; s/)/%29/g; s/:/%3A/g; s/\//%2F/g')

              local response http_code
              response=$(curl -s -w "\n%{http_code}" "$UPTIME_KUMA_URL?status=up&msg=$encoded_message" 2>&1)
              http_code=$(echo "$response" | tail -n1)

              if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
                  log_message "Heartbeat sent successfully (HTTP $http_code)"
                  return 0
              else
                  log_message "ERROR: Failed to send heartbeat (HTTP $http_code)"
                  return 1
              fi
          }

          # Main health check logic
          main() {
              log_message "=========================================="
              log_message "Starting ZFS health check for pool: $POOL_NAME on $HOSTNAME"

              # Run all health checks
              if check_pool_health "$POOL_NAME"; then
                  # All checks passed - send heartbeat
                  local scrub_info
                  scrub_info=$(get_scrub_info "$POOL_NAME")

                  local message="Pool $POOL_NAME healthy ($scrub_info)"
                  send_heartbeat "$message"

                  log_message "Health check completed: ALL OK"
                  exit 0
              else
                  # Issues found - do NOT send heartbeat (will trigger timeout alert)
                  log_message "Health check completed: ISSUES DETECTED - NOT sending heartbeat"
                  log_message "Uptime Kuma will alert after timeout due to missing heartbeat"
                  exit 1
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: '0755'
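
    # Sketch of the `zpool status -j` JSON shape the jq queries above rely on
    # (abridged, with illustrative values; JSON status output requires a recent
    # OpenZFS, and the exact nesting under "vdevs" varies with pool layout,
    # which is why the queries recurse with `.. | objects`):
    #
    #   {"pools": {"tank": {
    #       "state": "ONLINE",
    #       "scan_stats": {"function": "SCRUB", "state": "FINISHED",
    #                      "start_time": "...", "errors": "0"},
    #       "vdevs": {"tank": {"name": "tank", "state": "ONLINE",
    #                 "read_errors": "0", "write_errors": "0", "checksum_errors": "0",
    #                 "vdevs": { ...child devices with the same fields... }}}}}}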

    - name: Create systemd service for ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Pool Health Monitor
          After=zfs.target network.target

          [Service]
          Type=oneshot
          ExecStart={{ zfs_monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for daily ZFS health monitoring
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_health_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Pool Health Monitor daily
          Requires={{ zfs_systemd_health_service_name }}.service

          [Timer]
          OnBootSec=5min
          OnUnitActiveSec={{ zfs_check_interval_seconds }}sec
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd service for ZFS monthly scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.service"
        content: |
          [Unit]
          Description=ZFS Monthly Scrub for {{ zfs_pool_name }}
          After=zfs.target

          [Service]
          Type=oneshot
          ExecStart=/sbin/zpool scrub {{ zfs_pool_name }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: '0644'

    - name: Create systemd timer for monthly ZFS scrub
      copy:
        dest: "/etc/systemd/system/{{ zfs_systemd_scrub_service_name }}.timer"
        content: |
          [Unit]
          Description=Run ZFS Scrub on last day of every month at 4:00 AM
          Requires={{ zfs_systemd_scrub_service_name }}.service

          [Timer]
          OnCalendar=*-*~01 04:00:00
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: '0644'
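
    # In the OnCalendar expression, "~" counts days from the end of the month,
    # so "*-*~01 04:00:00" fires at 04:00 on the last day of every month. A
    # quick way to sanity-check the schedule on the host (illustrative):
    #
    #   systemd-analyze calendar "*-*~01 04:00:00"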

    - name: Reload systemd daemon
      systemd:
        daemon_reload: yes

    - name: Enable and start ZFS health monitoring timer
      systemd:
        name: "{{ zfs_systemd_health_service_name }}.timer"
        enabled: yes
        state: started

    - name: Enable and start ZFS monthly scrub timer
      systemd:
        name: "{{ zfs_systemd_scrub_service_name }}.timer"
        enabled: yes
        state: started

    - name: Test ZFS health monitoring script
      command: "{{ zfs_monitoring_script_path }}"
      register: script_test
      changed_when: false
      failed_when: false  # defer failure handling to the assert below so it can report a clear message

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "ZFS health monitoring script failed - check pool health"
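
    # After a run, the deployment can be inspected on the host with commands
    # along these lines (illustrative): `systemctl list-timers 'zfs-*'` to
    # confirm both timers are scheduled, and
    # `journalctl -u {{ zfs_systemd_health_service_name }}.service` or the log
    # file under {{ zfs_monitoring_script_dir }} for individual check results.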

    - name: Display monitoring configuration
      debug:
        msg: |
          ✓ ZFS Pool Health Monitoring deployed successfully!

          Monitor Name: {{ monitor_friendly_name }}
          Monitor Group: {{ uptime_kuma_monitor_group }}
          Pool Name: {{ zfs_pool_name }}

          Health Check:
          - Frequency: Every {{ zfs_check_interval_seconds }} seconds (24 hours)
          - Timeout: {{ zfs_check_timeout_seconds }} seconds (~25 hours)
          - Script: {{ zfs_monitoring_script_path }}
          - Log: {{ zfs_log_file }}
          - Service: {{ zfs_systemd_health_service_name }}.service
          - Timer: {{ zfs_systemd_health_service_name }}.timer

          Monthly Scrub:
          - Schedule: Last day of month at 4:00 AM
          - Service: {{ zfs_systemd_scrub_service_name }}.service
          - Timer: {{ zfs_systemd_scrub_service_name }}.timer

          Conditions monitored:
          - Pool state (must be ONLINE)
          - Device states (no DEGRADED/FAULTED/OFFLINE/UNAVAIL)
          - Resilver status (alerts if resilvering)
          - Read/Write/Checksum errors
          - Scrub errors

    - name: Clean up temporary Uptime Kuma setup script
      file:
        path: /tmp/setup_uptime_kuma_zfs_monitor.py
        state: absent
      delegate_to: localhost
      become: no