#!/bin/bash

###############################################################################
# Layer 6: Infrastructure Monitoring
#
# This script deploys disk usage, healthcheck, and CPU temp monitoring.
# Must be run after Layer 4 (Uptime Kuma) is complete with credentials set.
#
# The script is interactive: every deployment step asks for confirmation
# before ansible-playbook is invoked. A failed playbook run does not abort
# the remaining steps; failures are listed before the final summary.
###############################################################################

set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Project root directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ANSIBLE_DIR="$PROJECT_ROOT/ansible"

# Labels of deployment steps whose playbook run failed (reported by main).
FAILED_DEPLOYMENTS=()

###############################################################################
# Helper Functions
###############################################################################

# Print a blue banner around the title given in $1.
print_header() {
    echo -e "\n${BLUE}========================================${NC}"
    echo -e "${BLUE}$1${NC}"
    echo -e "${BLUE}========================================${NC}\n"
}

# Print $1 prefixed with a green check mark.
print_success() {
    echo -e "${GREEN}✓${NC} $1"
}

# Print $1 prefixed with a red cross.
print_error() {
    echo -e "${RED}✗${NC} $1"
}

# Print $1 prefixed with a yellow warning sign.
print_warning() {
    echo -e "${YELLOW}⚠${NC} $1"
}

# Print $1 prefixed with a blue info sign.
print_info() {
    echo -e "${BLUE}ℹ${NC} $1"
}

# Ask a yes/no question.
# Arguments: $1 - question text
# Returns:   0 only when the answer is exactly "y" or "Y"; anything else
#            (including an empty answer or EOF) counts as "no".
confirm_action() {
    local prompt="$1"
    local response=""
    local prompt_text

    # Build the prompt with printf so the question text is never word-split
    # or glob-expanded, and the trailing space after the colon is kept.
    printf -v prompt_text '%b%s%b [y/N]: ' "$YELLOW" "$prompt" "$NC"

    # read fails on EOF; whatever was read so far is still evaluated below.
    read -r -p "$prompt_text" response || true
    [[ "$response" =~ ^[Yy]$ ]]
}

# Print the second whitespace-separated field of every "key: value" line of
# a YAML file, with single and double quotes removed. Prints nothing when
# the file is missing or the key is absent.
# Arguments: $1 - file path, $2 - top-level key name
# NOTE: this is a line-based lookup, not a YAML parser; values containing
# spaces are truncated at the first space.
read_yaml_value() {
    local file="$1"
    local key="$2"

    grep "^${key}:" "$file" 2>/dev/null | awk '{print $2}' | tr -d "\"'" || true
}

###############################################################################
# Verification Functions
###############################################################################

# Verify the local tooling needed by the deployment steps: an activated
# virtualenv, ansible, the inventory file and the uptime-kuma-api package.
# Reports every failed check, then exits 1 if any check failed.
check_prerequisites() {
    print_header "Verifying Prerequisites"

    local errors=0

    # The counter uses errors=$((errors + 1)) rather than ((errors++)):
    # the latter returns status 1 when the old value is 0, which aborts the
    # script under `set -e` on the very first failed check.
    if [ -z "$VIRTUAL_ENV" ]; then
        print_error "Virtual environment not activated"
        echo "Run: source venv/bin/activate"
        errors=$((errors + 1))
    else
        print_success "Virtual environment activated"
    fi

    if ! command -v ansible &> /dev/null; then
        print_error "Ansible not found"
        errors=$((errors + 1))
    else
        print_success "Ansible found"
    fi

    if [ ! -f "$ANSIBLE_DIR/inventory.ini" ]; then
        print_error "inventory.ini not found"
        errors=$((errors + 1))
    else
        print_success "inventory.ini exists"
    fi

    # Check Python uptime-kuma-api
    if ! python3 -c "import uptime_kuma_api" 2>/dev/null; then
        print_error "uptime-kuma-api Python package not found"
        print_info "Install with: pip install -r requirements.txt"
        errors=$((errors + 1))
    else
        print_success "uptime-kuma-api package found"
    fi

    if [ "$errors" -gt 0 ]; then
        print_error "Prerequisites not met"
        exit 1
    fi

    print_success "Prerequisites verified"
}

# Verify that Uptime Kuma credentials exist in infra_secrets.yml and that a
# login through the Uptime Kuma API succeeds. Exits 1 when credentials are
# missing/empty or when the API test does not report success.
check_uptime_kuma_credentials() {
    print_header "Verifying Uptime Kuma Configuration"

    cd "$ANSIBLE_DIR"

    local secrets_file="$ANSIBLE_DIR/infra_secrets.yml"

    # Check if infra_secrets.yml has credentials
    if ! grep -q "^uptime_kuma_username:" "$secrets_file" 2>/dev/null || \
       ! grep -q "^uptime_kuma_password:" "$secrets_file" 2>/dev/null; then
        print_error "Uptime Kuma credentials not found in infra_secrets.yml"
        print_info "You must complete Layer 4 post-deployment steps first:"
        echo " 1. Create admin user in Uptime Kuma web UI"
        echo " 2. Add credentials to ansible/infra_secrets.yml"
        exit 1
    fi

    local uk_user uk_pass
    uk_user=$(read_yaml_value "$secrets_file" "uptime_kuma_username")
    uk_pass=$(read_yaml_value "$secrets_file" "uptime_kuma_password")

    if [ -z "$uk_user" ] || [ -z "$uk_pass" ]; then
        print_error "Uptime Kuma credentials are empty in infra_secrets.yml"
        exit 1
    fi

    print_success "Uptime Kuma credentials found"

    # Test API connection
    print_info "Testing Uptime Kuma API connection..."

    local test_script
    test_script=$(mktemp)

    # The Python helper reads its YAML inputs relative to the current
    # directory, so it is run from ANSIBLE_DIR below.
    cat > "$test_script" << 'EOFPYTHON'
import sys
import yaml
from uptime_kuma_api import UptimeKumaApi

try:
    with open('infra_vars.yml', 'r') as f:
        infra_vars = yaml.safe_load(f)

    with open('services_config.yml', 'r') as f:
        services_config = yaml.safe_load(f)

    with open('infra_secrets.yml', 'r') as f:
        secrets = yaml.safe_load(f)

    root_domain = infra_vars.get('root_domain')
    subdomain = services_config.get('subdomains', {}).get('uptime_kuma', 'uptime')
    url = f"https://{subdomain}.{root_domain}"

    username = secrets.get('uptime_kuma_username')
    password = secrets.get('uptime_kuma_password')

    api = UptimeKumaApi(url)
    api.login(username, password)

    monitors = api.get_monitors()
    print(f"SUCCESS:{len(monitors)}")

    api.disconnect()
except Exception as e:
    print(f"ERROR:{str(e)}", file=sys.stderr)
    sys.exit(1)
EOFPYTHON

    # A failing helper must not trip `set -e`: its output is inspected below
    # and the temp file has to be removed either way.
    local result
    result=$(cd "$ANSIBLE_DIR" && python3 "$test_script" 2>&1) || true
    rm -f "$test_script"

    if grep -q "^SUCCESS:" <<< "$result"; then
        local monitor_count
        monitor_count=$(grep "^SUCCESS:" <<< "$result" | cut -d: -f2)
        print_success "Successfully connected to Uptime Kuma API"
        print_info "Current monitors: $monitor_count"
    else
        print_error "Cannot connect to Uptime Kuma API"
        print_info "Error: $result"
        echo ""
        print_info "Make sure:"
        echo " • Uptime Kuma is running (Layer 4)"
        echo " • Credentials are correct in infra_secrets.yml"
        echo " • Uptime Kuma is accessible"
        exit 1
    fi

    echo ""
    print_success "Uptime Kuma configuration verified"
}

# Print the hosts of one inventory group as a single space-separated line.
# Prints an empty line when the group is missing or the inventory cannot be
# read. Intended to be called inside $(...) because it changes directory.
# Arguments: $1 - inventory group name
get_hosts_from_inventory() {
    local group="$1"

    cd "$ANSIBLE_DIR" || { echo ""; return 0; }

    # The group name is handed to Python as an argument (sys.argv[1]) instead
    # of being pasted into the Python source.
    ansible-inventory -i inventory.ini --list | \
        python3 -c 'import sys, json; data = json.load(sys.stdin); print(" ".join(data.get(sys.argv[1], {}).get("hosts", [])))' "$group" 2>/dev/null || echo ""
}

# List the hosts of every known inventory group that has at least one host.
show_available_hosts() {
    local group hosts

    echo "Available hosts:"
    for group in vipy watchtower spacey nodito lapy; do
        hosts=$(get_hosts_from_inventory "$group")
        if [ -n "$hosts" ]; then
            echo " [$group]: $hosts"
        fi
    done
    echo ""
}

# Shared interactive flow of the per-host monitoring deployments: show the
# hosts, let the user pick the targets, confirm, then run the playbook with
# --limit. Must be called with ANSIBLE_DIR as the current directory.
# Arguments:
#   $1 - playbook path relative to ANSIBLE_DIR
#   $2 - label used in the "Skip ..." menu entry
#   $3 - label used in the skip warning and the confirmation question
#   $4 - message printed when the playbook succeeds
# Returns: always 0; a failed playbook run is recorded in FAILED_DEPLOYMENTS
#          so that the remaining steps still run under `set -e`.
run_host_monitoring_deployment() {
    local playbook="$1"
    local menu_label="$2"
    local long_label="$3"
    local success_msg="$4"
    local option=""
    local limit_hosts=""

    show_available_hosts

    print_info "Deployment options:"
    echo " 1. Deploy on all remote hosts (vipy, watchtower, spacey, nodito)"
    echo " 2. Deploy on all hosts (including lapy)"
    echo " 3. Custom selection (specify groups)"
    echo " 4. Skip ${menu_label}"
    echo ""

    echo -e -n "${BLUE}Choose option${NC} [1-4]: "
    read -r option

    case "$option" in
        1)
            limit_hosts="vipy,watchtower,spacey,nodito"
            print_info "Deploying to remote hosts"
            ;;
        2)
            limit_hosts="all"
            print_info "Deploying to all hosts"
            ;;
        3)
            echo -e -n "${BLUE}Enter groups (comma-separated)${NC}: "
            read -r limit_hosts
            # Never hand an empty --limit to ansible-playbook: the user asked
            # for a custom selection, so an empty one means "nothing".
            if [ -z "$limit_hosts" ]; then
                print_error "No groups specified"
                print_warning "Skipping ${long_label}"
                return 0
            fi
            print_info "Deploying to: $limit_hosts"
            ;;
        4)
            print_warning "Skipping ${long_label}"
            return 0
            ;;
        *)
            print_error "Invalid option"
            return 0
            ;;
    esac

    echo ""
    if ! confirm_action "Proceed with ${long_label} deployment?"; then
        print_warning "Skipped"
        return 0
    fi

    print_info "Running: ansible-playbook -i inventory.ini ${playbook} --limit $limit_hosts"
    echo ""

    if ansible-playbook -i inventory.ini "$playbook" --limit "$limit_hosts"; then
        print_success "$success_msg"
    else
        print_error "Deployment failed"
        FAILED_DEPLOYMENTS+=("$long_label")
    fi
    return 0
}

###############################################################################
# Disk Usage Monitoring
###############################################################################

# Interactively deploy infra/410_disk_usage_alerts.yml to the chosen hosts.
# Always returns 0 (skips and failures do not abort the script).
deploy_disk_usage_monitoring() {
    print_header "Deploying Disk Usage Monitoring"

    cd "$ANSIBLE_DIR"

    print_info "This will deploy disk usage monitoring on selected hosts"
    print_info "Default settings:"
    echo " • Threshold: 80%"
    echo " • Check interval: 15 minutes"
    echo " • Mount point: /"
    echo ""

    run_host_monitoring_deployment \
        "infra/410_disk_usage_alerts.yml" \
        "disk monitoring" \
        "disk usage monitoring" \
        "Disk usage monitoring deployed"
}

###############################################################################
# System Healthcheck Monitoring
###############################################################################

# Interactively deploy infra/420_system_healthcheck.yml to the chosen hosts.
# Always returns 0 (skips and failures do not abort the script).
deploy_system_healthcheck() {
    print_header "Deploying System Healthcheck Monitoring"

    cd "$ANSIBLE_DIR"

    print_info "This will deploy system healthcheck monitoring on selected hosts"
    print_info "Default settings:"
    echo " • Heartbeat interval: 60 seconds"
    echo " • Upside-down mode (no news is good news)"
    echo ""

    run_host_monitoring_deployment \
        "infra/420_system_healthcheck.yml" \
        "healthcheck monitoring" \
        "healthcheck monitoring" \
        "System healthcheck monitoring deployed"
}

###############################################################################
# CPU Temperature Monitoring (Nodito)
###############################################################################

# Interactively deploy infra/nodito/40_cpu_temp_alerts.yml. Skipped when the
# "nodito" group has no hosts. Offers to create nodito_secrets.yml (holding
# the Uptime Kuma push URL) when that file does not exist yet.
# Always returns 0 (skips and failures do not abort the script).
deploy_cpu_temp_monitoring() {
    print_header "Deploying CPU Temperature Monitoring (Nodito)"

    cd "$ANSIBLE_DIR"

    # Check if nodito is configured
    local nodito_hosts
    nodito_hosts=$(get_hosts_from_inventory "nodito")
    if [ -z "$nodito_hosts" ]; then
        print_info "Nodito not configured in inventory, skipping CPU temp monitoring"
        return 0
    fi

    print_info "This will deploy CPU temperature monitoring on nodito (Proxmox)"
    print_info "Default settings:"
    echo " • Threshold: 80°C"
    echo " • Check interval: 60 seconds"
    echo ""

    # Check if nodito_secrets.yml exists
    local secrets_file="$ANSIBLE_DIR/infra/nodito/nodito_secrets.yml"
    if [ ! -f "$secrets_file" ]; then
        print_warning "nodito_secrets.yml not found"
        print_info "You need to create this file with Uptime Kuma push URL"

        if ! confirm_action "Create nodito_secrets.yml now?"; then
            print_warning "Skipping CPU temp monitoring"
            return 0
        fi

        # Values below are only used to show an example URL in the prompt.
        local root_domain uk_subdomain push_url=""
        root_domain=$(read_yaml_value "$ANSIBLE_DIR/infra_vars.yml" "root_domain")
        uk_subdomain=$(read_yaml_value \
            "$ANSIBLE_DIR/services/uptime_kuma/uptime_kuma_vars.yml" \
            "uptime_kuma_subdomain")
        # Fall back to "uptime" when the vars file or the key is missing.
        uk_subdomain="${uk_subdomain:-uptime}"

        echo -e -n "${BLUE}Enter Uptime Kuma push URL${NC} (e.g., https://${uk_subdomain}.${root_domain}/api/push/xxxxx): "
        read -r push_url

        # Do not write a secrets file that contains no URL.
        if [ -z "$push_url" ]; then
            print_error "No push URL entered; nodito_secrets.yml was not created"
            print_warning "Skipping CPU temp monitoring"
            return 0
        fi

        mkdir -p "$ANSIBLE_DIR/infra/nodito"
        cat > "$secrets_file" << EOF
# Nodito Secrets
# DO NOT commit to git

# Uptime Kuma Push URL for CPU temperature monitoring
nodito_uptime_kuma_cpu_temp_push_url: "${push_url}"
EOF
        # The push URL acts as a credential; keep the file owner-only.
        chmod 600 "$secrets_file"

        print_success "Created nodito_secrets.yml"
    fi

    echo ""
    if ! confirm_action "Proceed with CPU temp monitoring deployment?"; then
        print_warning "Skipped"
        return 0
    fi

    print_info "Running: ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml"
    echo ""

    if ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml; then
        print_success "CPU temperature monitoring deployed"
    else
        print_error "Deployment failed"
        FAILED_DEPLOYMENTS+=("CPU temperature monitoring")
    fi
    return 0
}

###############################################################################
# Summary
###############################################################################

# Print the closing summary and the suggested follow-up steps.
# NOTE: the text is static; it does not reflect which steps were skipped.
print_summary() {
    print_header "Layer 6 Setup Complete! 🎉"

    echo "Summary of what was deployed:"
    echo ""
    print_success "Infrastructure monitoring configured"
    print_success "Monitors created in Uptime Kuma"
    print_success "Systemd services and timers running"
    echo ""

    print_info "What you have now:"
    echo " • Disk usage monitoring on selected hosts"
    echo " • System healthcheck monitoring"
    echo " • CPU temperature monitoring (if nodito configured)"
    echo " • All organized in host-specific groups"
    echo ""

    print_info "Verify your monitoring:"
    echo " 1. Open Uptime Kuma web UI"
    echo " 2. Check monitors organized by host groups"
    echo " 3. Verify monitors are receiving data"
    echo " 4. Configure notification rules"
    echo " 5. Watch for alerts via ntfy"
    echo ""

    print_info "Next steps:"
    echo " 1. Customize thresholds if needed"
    echo " 2. Proceed to Layer 7: Core Services deployment"
    echo ""
}

###############################################################################
# Main Execution
###############################################################################

# Entry point: confirm, verify prerequisites and Uptime Kuma access, run the
# three deployment steps in order, report failed steps, print the summary.
main() {
    # clear can fail (e.g. TERM unset); that must not abort under `set -e`.
    clear || true

    print_header "📊 Layer 6: Infrastructure Monitoring"

    echo "This script will deploy automated monitoring for your infrastructure."
    echo ""

    if ! confirm_action "Continue with Layer 6 setup?"; then
        echo "Setup cancelled."
        exit 0
    fi

    check_prerequisites
    check_uptime_kuma_credentials

    # Deploy monitoring
    deploy_disk_usage_monitoring
    echo ""
    deploy_system_healthcheck
    echo ""
    deploy_cpu_temp_monitoring
    echo ""

    # Surface playbook failures so the summary below is not read as a
    # confirmation that every step succeeded.
    if [ "${#FAILED_DEPLOYMENTS[@]}" -gt 0 ]; then
        local failed
        print_warning "The following deployments failed and need attention:"
        for failed in "${FAILED_DEPLOYMENTS[@]}"; do
            echo " • $failed"
        done
        echo ""
    fi

    print_summary
}

# Run main function
main "$@"