personal_infra/scripts/setup_layer_6_infra_monitoring.sh
2025-11-06 23:09:44 +01:00

491 lines
15 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
###############################################################################
# Layer 6: Infrastructure Monitoring
#
# This script deploys disk usage, healthcheck, and CPU temp monitoring.
# Must be run after Layer 4 (Uptime Kuma) is complete with credentials set.
###############################################################################
# Abort the script as soon as any unguarded command exits non-zero.
set -o errexit

# ANSI colour escape sequences. They are stored as literal backslash text and
# only become real escapes when printed through `echo -e` / `printf %b`.
NC='\033[0m' # reset / no colour
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Resolve absolute paths relative to this script's own location:
#   SCRIPT_DIR   - directory containing this script
#   PROJECT_ROOT - parent directory of SCRIPT_DIR
#   ANSIBLE_DIR  - the "ansible" directory under PROJECT_ROOT
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$SCRIPT_DIR" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ANSIBLE_DIR="${PROJECT_ROOT}/ansible"
###############################################################################
# Helper Functions
###############################################################################
# Print a section banner to stdout: a blank line, the title framed by two
# rule lines (each wrapped in the BLUE/NC colour codes), then a blank line.
# Globals:   BLUE, NC (read)
# Arguments: $1 - banner title (backslash escapes in it are interpreted)
print_header() {
  local rule='========================================'
  # printf re-applies the format to each argument, giving one line per item.
  printf '%b\n' \
    "\n${BLUE}${rule}${NC}" \
    "${BLUE}$1${NC}" \
    "${BLUE}${rule}${NC}\n"
}
# Print a success message to stdout, prefixed by the GREEN/NC colour pair and
# a single space.
# NOTE(review): the colour pair encloses no text, so nothing is actually
# coloured - a status glyph may have been lost from this line; confirm.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message (backslash escapes in it are interpreted)
print_success() {
  printf '%b\n' "${GREEN}${NC} $1"
}
# Print an error message to stdout, prefixed by the RED/NC colour pair and a
# single space.
# NOTE(review): the colour pair encloses no text, so nothing is actually
# coloured - a status glyph may have been lost from this line; confirm.
# Globals:   RED, NC (read)
# Arguments: $1 - message (backslash escapes in it are interpreted)
print_error() {
  printf '%b\n' "${RED}${NC} $1"
}
# Print a warning message to stdout, prefixed by the YELLOW/NC colour pair and
# a single space.
# NOTE(review): the colour pair encloses no text, so nothing is actually
# coloured - a status glyph may have been lost from this line; confirm.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message (backslash escapes in it are interpreted)
print_warning() {
  printf '%b\n' "${YELLOW}${NC} $1"
}
# Print an informational message to stdout, prefixed by the BLUE/NC colour
# pair and a single space.
# NOTE(review): the colour pair encloses no text, so nothing is actually
# coloured - a status glyph may have been lost from this line; confirm.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message (backslash escapes in it are interpreted)
print_info() {
  printf '%b\n' "${BLUE}${NC} $1"
}
# Ask a yes/no question and report the answer through the exit status.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - question text shown before " [y/N]: "
# Returns:   0 only when the reply is exactly "y" or "Y"; non-zero for any
#            other reply, an empty reply, or end of input.
confirm_action() {
  local prompt="$1"
  local prompt_text
  local response=""

  # Build the prompt in a variable instead of an unquoted command
  # substitution: unquoted, "[y/N]:" and a trailing "?" in the question are
  # glob patterns that can expand to matching file names in the current
  # directory, and runs of whitespace in the question get collapsed.
  printf -v prompt_text '%b' "${YELLOW}${prompt}${NC} [y/N]: "

  # -r keeps backslashes literal. A failed read (EOF) is treated as "no"
  # rather than tripping errexit when called outside a conditional.
  read -r -p "$prompt_text" response || true

  [[ "$response" =~ ^[Yy]$ ]]
}
###############################################################################
# Verification Functions
###############################################################################
# Verify that the local tooling needed by this layer is available:
#   - a Python virtual environment is active (VIRTUAL_ENV is non-empty)
#   - the `ansible` command is on PATH
#   - inventory.ini exists in ANSIBLE_DIR
#   - the uptime_kuma_api Python module can be imported
# Every check is evaluated and reported before the verdict is given.
# Globals:   VIRTUAL_ENV, ANSIBLE_DIR (read)
# Outputs:   one status line per check on stdout
# Returns:   normally on success; exits the script with status 1 if any
#            check failed.
check_prerequisites() {
  print_header "Verifying Prerequisites"

  local errors=0

  # NOTE: the counter is bumped with errors=$((errors + 1)) rather than
  # ((errors++)). The latter evaluates to the old value, so it returns a
  # non-zero status when errors is 0 and `set -e` would kill the script on
  # the very first failed check, hiding all remaining diagnostics.
  if [[ -z "${VIRTUAL_ENV:-}" ]]; then
    print_error "Virtual environment not activated"
    echo "Run: source venv/bin/activate"
    errors=$((errors + 1))
  else
    print_success "Virtual environment activated"
  fi

  if ! command -v ansible &> /dev/null; then
    print_error "Ansible not found"
    errors=$((errors + 1))
  else
    print_success "Ansible found"
  fi

  if [[ ! -f "$ANSIBLE_DIR/inventory.ini" ]]; then
    print_error "inventory.ini not found"
    errors=$((errors + 1))
  else
    print_success "inventory.ini exists"
  fi

  # Check Python uptime-kuma-api
  if ! python3 -c "import uptime_kuma_api" 2> /dev/null; then
    print_error "uptime-kuma-api Python package not found"
    print_info "Install with: pip install -r requirements.txt"
    errors=$((errors + 1))
  else
    print_success "uptime-kuma-api package found"
  fi

  if ((errors > 0)); then
    print_error "Prerequisites not met"
    exit 1
  fi

  print_success "Prerequisites verified"
}
# Verify that Uptime Kuma credentials are configured and usable.
#
# Steps:
#   1. Require non-empty `uptime_kuma_username:` / `uptime_kuma_password:`
#      top-level keys in $ANSIBLE_DIR/infra_secrets.yml.
#   2. Run a short Python script (from ANSIBLE_DIR) that loads
#      infra_vars.yml, services_config.yml and infra_secrets.yml, builds
#      https://<subdomain>.<root_domain>, logs in through uptime_kuma_api
#      and prints "SUCCESS:<monitor count>".
#
# Globals:   ANSIBLE_DIR (read); the working directory is changed to it
# Outputs:   progress and diagnostics on stdout
# Returns:   normally when the API login succeeds; exits the script with
#            status 1 otherwise.
check_uptime_kuma_credentials() {
  print_header "Verifying Uptime Kuma Configuration"
  cd "$ANSIBLE_DIR"

  local secrets_file="$ANSIBLE_DIR/infra_secrets.yml"

  # Check if infra_secrets.yml has credentials
  if ! grep -q "^uptime_kuma_username:" "$secrets_file" 2> /dev/null \
    || ! grep -q "^uptime_kuma_password:" "$secrets_file" 2> /dev/null; then
    print_error "Uptime Kuma credentials not found in infra_secrets.yml"
    print_info "You must complete Layer 4 post-deployment steps first:"
    echo " 1. Create admin user in Uptime Kuma web UI"
    echo " 2. Add credentials to ansible/infra_secrets.yml"
    exit 1
  fi

  # Rough extraction (second whitespace-separated field, quotes removed).
  # It is only used to detect empty values; the real values are parsed by
  # the YAML loader in the Python script below.
  local uk_user uk_pass
  uk_user=$(grep "^uptime_kuma_username:" "$secrets_file" | awk '{print $2}' | tr -d "\"'")
  uk_pass=$(grep "^uptime_kuma_password:" "$secrets_file" | awk '{print $2}' | tr -d "\"'")
  if [[ -z "$uk_user" || -z "$uk_pass" ]]; then
    print_error "Uptime Kuma credentials are empty in infra_secrets.yml"
    exit 1
  fi
  print_success "Uptime Kuma credentials found"

  # Test API connection
  print_info "Testing Uptime Kuma API connection..."
  local test_script
  test_script=$(mktemp)
  # The here-document is quoted, so its contents are written verbatim. The
  # Python below must keep its own indentation and the terminator must stay
  # in column 0.
  cat > "$test_script" << 'EOFPYTHON'
import sys
import yaml
from uptime_kuma_api import UptimeKumaApi

try:
    with open('infra_vars.yml', 'r') as f:
        infra_vars = yaml.safe_load(f)
    with open('services_config.yml', 'r') as f:
        services_config = yaml.safe_load(f)
    with open('infra_secrets.yml', 'r') as f:
        secrets = yaml.safe_load(f)
    root_domain = infra_vars.get('root_domain')
    subdomain = services_config.get('subdomains', {}).get('uptime_kuma', 'uptime')
    url = f"https://{subdomain}.{root_domain}"
    username = secrets.get('uptime_kuma_username')
    password = secrets.get('uptime_kuma_password')
    api = UptimeKumaApi(url)
    api.login(username, password)
    monitors = api.get_monitors()
    print(f"SUCCESS:{len(monitors)}")
    api.disconnect()
except Exception as e:
    print(f"ERROR:{str(e)}", file=sys.stderr)
    sys.exit(1)
EOFPYTHON

  # `|| true`: a failing probe must not trip errexit here, otherwise the
  # explanatory failure branch below would never be reached.
  local result
  result=$(cd "$ANSIBLE_DIR" && python3 "$test_script" 2>&1) || true
  rm -f "$test_script"

  if grep -q "^SUCCESS:" <<< "$result"; then
    local monitor_count
    monitor_count=$(grep "^SUCCESS:" <<< "$result" | cut -d: -f2)
    print_success "Successfully connected to Uptime Kuma API"
    print_info "Current monitors: $monitor_count"
  else
    print_error "Cannot connect to Uptime Kuma API"
    print_info "Error: $result"
    echo ""
    print_info "Make sure:"
    echo " • Uptime Kuma is running (Layer 4)"
    echo " • Credentials are correct in infra_secrets.yml"
    echo " • Uptime Kuma is accessible"
    exit 1
  fi

  echo ""
  print_success "Uptime Kuma configuration verified"
}
# Print the hosts of one inventory group as a single space-separated line.
#
# Runs `ansible-inventory --list` from ANSIBLE_DIR and extracts
# <group>.hosts from its JSON output.
# Globals:   ANSIBLE_DIR (read); the working directory is changed to it
# Arguments: $1 - inventory group name
# Outputs:   the host names on stdout; an empty line when the group is
#            missing, has no hosts, or the JSON could not be processed
get_hosts_from_inventory() {
  local group="$1"
  cd "$ANSIBLE_DIR"

  # The group name is handed to Python as an argument (sys.argv[1]) instead
  # of being spliced into the program text, so a name containing quotes or
  # other special characters cannot break or alter the Python code.
  ansible-inventory -i inventory.ini --list \
    | python3 -c 'import sys, json; data = json.load(sys.stdin); print(" ".join(data.get(sys.argv[1], {}).get("hosts", [])))' "$group" 2> /dev/null \
    || echo ""
}
###############################################################################
# Disk Usage Monitoring
###############################################################################
# Interactively deploy disk usage monitoring.
#
# Lists the hosts of the known inventory groups, asks which ones to target,
# asks for confirmation, then runs infra/410_disk_usage_alerts.yml with the
# chosen `--limit`.
# Globals:   ANSIBLE_DIR, BLUE, NC (read); the working directory is changed
#            to ANSIBLE_DIR
# Inputs:    menu choice (1-4) and, for choice 3, a group list on stdin
# Returns:   0 on every handled path - skip, invalid choice, declined
#            confirmation, and also a failed playbook run - so the caller
#            continues with the next monitoring section.
deploy_disk_usage_monitoring() {
  print_header "Deploying Disk Usage Monitoring"
  cd "$ANSIBLE_DIR"

  print_info "This will deploy disk usage monitoring on selected hosts"
  print_info "Default settings:"
  echo " • Threshold: 80%"
  echo " • Check interval: 15 minutes"
  echo " • Mount point: /"
  echo ""

  # Show available hosts
  echo "Available hosts:"
  local group hosts
  for group in vipy watchtower spacey nodito lapy; do
    hosts=$(get_hosts_from_inventory "$group") || hosts=""
    if [[ -n "$hosts" ]]; then
      echo " [$group]: $hosts"
    fi
  done
  echo ""

  print_info "Deployment options:"
  echo " 1. Deploy on all remote hosts (vipy, watchtower, spacey, nodito)"
  echo " 2. Deploy on all hosts (including lapy)"
  echo " 3. Custom selection (specify groups)"
  echo " 4. Skip disk monitoring"
  echo ""
  echo -e -n "${BLUE}Choose option${NC} [1-4]: "
  # Keep the answer function-local and read it raw (-r).
  local option
  read -r option

  local limit_hosts=""
  case "$option" in
    1)
      limit_hosts="vipy,watchtower,spacey,nodito"
      print_info "Deploying to remote hosts"
      ;;
    2)
      limit_hosts="all"
      print_info "Deploying to all hosts"
      ;;
    3)
      echo -e -n "${BLUE}Enter groups (comma-separated)${NC}: "
      read -r limit_hosts
      # Never hand an empty --limit to ansible-playbook: the user asked for
      # a specific selection, so an empty answer must not run the playbook.
      if [[ -z "$limit_hosts" ]]; then
        print_error "No groups entered"
        return 0
      fi
      print_info "Deploying to: $limit_hosts"
      ;;
    4)
      print_warning "Skipping disk usage monitoring"
      return 0
      ;;
    *)
      print_error "Invalid option"
      return 0
      ;;
  esac

  echo ""
  if ! confirm_action "Proceed with disk usage monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/410_disk_usage_alerts.yml --limit $limit_hosts"
  echo ""
  if ansible-playbook -i inventory.ini infra/410_disk_usage_alerts.yml --limit "$limit_hosts"; then
    print_success "Disk usage monitoring deployed"
  else
    # Reported but deliberately not fatal.
    print_error "Deployment failed"
  fi
  return 0
}
###############################################################################
# System Healthcheck Monitoring
###############################################################################
# Interactively deploy system healthcheck monitoring.
#
# Lists the hosts of the known inventory groups, asks which ones to target,
# asks for confirmation, then runs infra/420_system_healthcheck.yml with the
# chosen `--limit`.
# Globals:   ANSIBLE_DIR, BLUE, NC (read); the working directory is changed
#            to ANSIBLE_DIR
# Inputs:    menu choice (1-4) and, for choice 3, a group list on stdin
# Returns:   0 on every handled path - skip, invalid choice, declined
#            confirmation, and also a failed playbook run - so the caller
#            continues with the next monitoring section.
deploy_system_healthcheck() {
  print_header "Deploying System Healthcheck Monitoring"
  cd "$ANSIBLE_DIR"

  print_info "This will deploy system healthcheck monitoring on selected hosts"
  print_info "Default settings:"
  echo " • Heartbeat interval: 60 seconds"
  echo " • Upside-down mode (no news is good news)"
  echo ""

  # Show available hosts
  echo "Available hosts:"
  local group hosts
  for group in vipy watchtower spacey nodito lapy; do
    hosts=$(get_hosts_from_inventory "$group") || hosts=""
    if [[ -n "$hosts" ]]; then
      echo " [$group]: $hosts"
    fi
  done
  echo ""

  print_info "Deployment options:"
  echo " 1. Deploy on all remote hosts (vipy, watchtower, spacey, nodito)"
  echo " 2. Deploy on all hosts (including lapy)"
  echo " 3. Custom selection (specify groups)"
  echo " 4. Skip healthcheck monitoring"
  echo ""
  echo -e -n "${BLUE}Choose option${NC} [1-4]: "
  # Keep the answer function-local and read it raw (-r).
  local option
  read -r option

  local limit_hosts=""
  case "$option" in
    1)
      limit_hosts="vipy,watchtower,spacey,nodito"
      print_info "Deploying to remote hosts"
      ;;
    2)
      limit_hosts="all"
      print_info "Deploying to all hosts"
      ;;
    3)
      echo -e -n "${BLUE}Enter groups (comma-separated)${NC}: "
      read -r limit_hosts
      # Never hand an empty --limit to ansible-playbook: the user asked for
      # a specific selection, so an empty answer must not run the playbook.
      if [[ -z "$limit_hosts" ]]; then
        print_error "No groups entered"
        return 0
      fi
      print_info "Deploying to: $limit_hosts"
      ;;
    4)
      print_warning "Skipping healthcheck monitoring"
      return 0
      ;;
    *)
      print_error "Invalid option"
      return 0
      ;;
  esac

  echo ""
  if ! confirm_action "Proceed with healthcheck monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/420_system_healthcheck.yml --limit $limit_hosts"
  echo ""
  if ansible-playbook -i inventory.ini infra/420_system_healthcheck.yml --limit "$limit_hosts"; then
    print_success "System healthcheck monitoring deployed"
  else
    # Reported but deliberately not fatal.
    print_error "Deployment failed"
  fi
  return 0
}
###############################################################################
# CPU Temperature Monitoring (Nodito)
###############################################################################
# Interactively deploy CPU temperature monitoring on the "nodito" group.
#
# Skips silently-with-notice when the inventory has no nodito hosts. If
# infra/nodito/nodito_secrets.yml is missing, offers to create it from a
# push URL typed by the user. After confirmation, runs
# infra/nodito/40_cpu_temp_alerts.yml.
# Globals:   ANSIBLE_DIR, BLUE, NC (read); the working directory is changed
#            to ANSIBLE_DIR
# Inputs:    confirmations and, when creating the secrets file, the push
#            URL on stdin
# Returns:   0 on every handled path, including a failed playbook run.
deploy_cpu_temp_monitoring() {
  print_header "Deploying CPU Temperature Monitoring (Nodito)"
  cd "$ANSIBLE_DIR"

  # Check if nodito is configured
  local nodito_hosts
  nodito_hosts=$(get_hosts_from_inventory "nodito") || nodito_hosts=""
  if [[ -z "$nodito_hosts" ]]; then
    print_info "Nodito not configured in inventory, skipping CPU temp monitoring"
    return 0
  fi

  print_info "This will deploy CPU temperature monitoring on nodito (Proxmox)"
  print_info "Default settings:"
  echo " • Threshold: 80°C"
  echo " • Check interval: 60 seconds"
  echo ""

  # Check if nodito_secrets.yml exists
  local secrets_file="$ANSIBLE_DIR/infra/nodito/nodito_secrets.yml"
  if [[ ! -f "$secrets_file" ]]; then
    print_warning "nodito_secrets.yml not found"
    print_info "You need to create this file with Uptime Kuma push URL"
    if ! confirm_action "Create nodito_secrets.yml now?"; then
      print_warning "Skipping CPU temp monitoring"
      return 0
    fi

    # Values used only to build the example URL shown in the prompt.
    local root_domain uk_subdomain push_url
    root_domain=$(awk '/^root_domain:/ {print $2; exit}' \
      "$ANSIBLE_DIR/infra_vars.yml" 2> /dev/null) || root_domain=""
    uk_subdomain=$(awk '/^uptime_kuma_subdomain:/ {print $2; exit}' \
      "$ANSIBLE_DIR/services/uptime_kuma/uptime_kuma_vars.yml" 2> /dev/null) || uk_subdomain=""
    # Apply the default on the resulting value. A `pipeline || echo uptime`
    # fallback never triggers, because the pipeline's status is that of its
    # last command (awk), which succeeds even when nothing matched.
    uk_subdomain=${uk_subdomain:-uptime}

    echo -e -n "${BLUE}Enter Uptime Kuma push URL${NC} (e.g., https://${uk_subdomain}.${root_domain}/api/push/xxxxx): "
    read -r push_url
    if [[ -z "$push_url" ]]; then
      # Do not write a secrets file with an empty URL: it would suppress
      # this prompt on the next run while leaving monitoring unconfigured.
      print_error "No push URL entered; nodito_secrets.yml was not created"
      return 0
    fi

    mkdir -p "$ANSIBLE_DIR/infra/nodito"
    # Create the file with owner-only permissions from the start; the umask
    # change is confined to the subshell.
    (
      umask 077
      printf '%s\n' \
        '# Nodito Secrets' \
        '# DO NOT commit to git' \
        '# Uptime Kuma Push URL for CPU temperature monitoring' \
        "nodito_uptime_kuma_cpu_temp_push_url: \"${push_url}\"" \
        > "$secrets_file"
    )
    print_success "Created nodito_secrets.yml"
  fi

  echo ""
  if ! confirm_action "Proceed with CPU temp monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml"
  echo ""
  if ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml; then
    print_success "CPU temperature monitoring deployed"
  else
    # Reported but deliberately not fatal.
    print_error "Deployment failed"
  fi
  return 0
}
###############################################################################
# Summary
###############################################################################
# Print the closing report for this layer: a banner, a fixed list of what the
# layer sets up, how to verify it in Uptime Kuma, and the suggested next
# steps. The text is static; it does not reflect which deployments actually
# ran or succeeded.
# Outputs: the report on stdout
print_summary() {
  print_header "Layer 6 Setup Complete! 🎉"

  printf '%s\n' "Summary of what was deployed:" ""
  local done_item
  for done_item in \
    "Infrastructure monitoring configured" \
    "Monitors created in Uptime Kuma" \
    "Systemd services and timers running"; do
    print_success "$done_item"
  done
  printf '\n'

  print_info "What you have now:"
  printf '%s\n' \
    " • Disk usage monitoring on selected hosts" \
    " • System healthcheck monitoring" \
    " • CPU temperature monitoring (if nodito configured)" \
    " • All organized in host-specific groups" \
    ""

  print_info "Verify your monitoring:"
  printf '%s\n' \
    " 1. Open Uptime Kuma web UI" \
    " 2. Check monitors organized by host groups" \
    " 3. Verify monitors are receiving data" \
    " 4. Configure notification rules" \
    " 5. Watch for alerts via ntfy" \
    ""

  print_info "Next steps:"
  printf '%s\n' \
    " 1. Customize thresholds if needed" \
    " 2. Proceed to Layer 7: Core Services deployment" \
    ""
}
###############################################################################
# Main Execution
###############################################################################
# Script entry point: confirm with the user, verify prerequisites and
# Uptime Kuma access, run the three monitoring deployments in order (disk
# usage, system healthcheck, CPU temperature), then print the summary.
# Arguments: none are read
# Returns:   exits 0 if the user declines the initial confirmation; the
#            verification steps exit the script themselves on failure.
main() {
  # Clearing the screen is cosmetic. `clear` returns non-zero when it cannot
  # drive the terminal (for example when TERM is unset), and under `set -e`
  # that would abort the whole setup before it starts.
  clear || true

  print_header "📊 Layer 6: Infrastructure Monitoring"
  echo "This script will deploy automated monitoring for your infrastructure."
  echo ""

  if ! confirm_action "Continue with Layer 6 setup?"; then
    echo "Setup cancelled."
    exit 0
  fi

  check_prerequisites
  check_uptime_kuma_credentials

  # Deploy monitoring
  deploy_disk_usage_monitoring
  echo ""
  deploy_system_healthcheck
  echo ""
  deploy_cpu_temp_monitoring
  echo ""

  print_summary
}
# Start the interactive setup. The script's command-line arguments are
# forwarded to main, which does not currently read any of them.
main "$@"