#!/bin/bash

###############################################################################
# Layer 6: Infrastructure Monitoring
#
# This script deploys disk usage, healthcheck, and CPU temp monitoring.
# Must be run after Layer 4 (Uptime Kuma) is complete with credentials set.
###############################################################################

set -e # Exit on error

# Colors for output. These are stored as raw backslash sequences and are only
# turned into real escape characters by the helpers that print them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Project root directory: the parent of the directory holding this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
ANSIBLE_DIR="$PROJECT_ROOT/ansible"

###############################################################################
# Helper Functions
###############################################################################

# Print a section title framed by two blue rules, with one blank line before
# and one after the frame.
# Globals:   BLUE, NC (read)
# Arguments: $1 - title text, printed verbatim
# Outputs:   the framed title on stdout
print_header() {
  local rule='========================================'

  # %b expands the colour escapes; %s keeps the title literal so backslashes
  # in it are not reinterpreted.
  printf '\n%b%s%b\n' "$BLUE" "$rule" "$NC"
  printf '%b%s%b\n' "$BLUE" "$1" "$NC"
  printf '%b%s%b\n\n' "$BLUE" "$rule" "$NC"
}
# Print a message prefixed with a green check mark.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message, printed verbatim
# Outputs:   one line on stdout
print_success() {
  # %s keeps backslashes in the message literal; only the colours use %b.
  printf '%b✓%b %s\n' "$GREEN" "$NC" "$1"
}
# Print a message prefixed with a red cross.
# Globals:   RED, NC (read)
# Arguments: $1 - message, printed verbatim
# Outputs:   one line on stdout
print_error() {
  # %s keeps backslashes in the message literal; only the colours use %b.
  printf '%b✗%b %s\n' "$RED" "$NC" "$1"
}
# Print a message prefixed with a yellow warning sign.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message, printed verbatim
# Outputs:   one line on stdout
print_warning() {
  # %s keeps backslashes in the message literal; only the colours use %b.
  printf '%b⚠%b %s\n' "$YELLOW" "$NC" "$1"
}
# Print a message prefixed with a blue information sign.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message, printed verbatim
# Outputs:   one line on stdout
print_info() {
  # %s keeps backslashes in the message literal (callers pass captured
  # command output here); only the colours use %b.
  printf '%bℹ%b %s\n' "$BLUE" "$NC" "$1"
}
# Ask a yes/no question and report the answer through the exit status.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - question to display
# Returns:   0 only when the reply is exactly "y" or "Y"; non-zero for any
#            other reply, an empty reply, or end of input
confirm_action() {
  local prompt="$1"
  local response=""
  local styled_prompt

  # Build the prompt without an unquoted expansion, so characters such as
  # "?" or "[y/N]" can never be glob-expanded against the working directory.
  printf -v styled_prompt '%b%s%b [y/N]: ' "$YELLOW" "$prompt" "$NC"

  # -r keeps backslashes literal. A failed read (EOF) leaves the reply as
  # whatever was read so far, which the match below then judges.
  read -r -p "$styled_prompt" response || true

  [[ "$response" =~ ^[Yy]$ ]]
}

###############################################################################
# Verification Functions
###############################################################################

# Verify the local prerequisites for this layer and report each one.
#
# Checks, in order: an activated Python virtual environment (VIRTUAL_ENV),
# the `ansible` command, the inventory file, and the `uptime_kuma_api` Python
# module. All checks are always run so every problem is listed at once.
#
# Globals:   VIRTUAL_ENV, ANSIBLE_DIR (read)
# Outputs:   one status line per check on stdout
# Returns:   0 when every check passes; otherwise exits the script with 1
check_prerequisites() {
  print_header "Verifying Prerequisites"

  local errors=0

  if [[ -z "${VIRTUAL_ENV:-}" ]]; then
    print_error "Virtual environment not activated"
    echo "Run: source venv/bin/activate"
    # Plain assignment on purpose: ((errors++)) yields status 1 while errors
    # is 0, which would end the script under `set -e` before the remaining
    # checks and the final summary run.
    errors=$((errors + 1))
  else
    print_success "Virtual environment activated"
  fi

  if ! command -v ansible &> /dev/null; then
    print_error "Ansible not found"
    errors=$((errors + 1))
  else
    print_success "Ansible found"
  fi

  if [[ ! -f "$ANSIBLE_DIR/inventory.ini" ]]; then
    print_error "inventory.ini not found"
    errors=$((errors + 1))
  else
    print_success "inventory.ini exists"
  fi

  # Check Python uptime-kuma-api
  if ! python3 -c "import uptime_kuma_api" 2> /dev/null; then
    print_error "uptime-kuma-api Python package not found"
    print_info "Install with: pip install -r requirements.txt"
    errors=$((errors + 1))
  else
    print_success "uptime-kuma-api package found"
  fi

  if ((errors > 0)); then
    print_error "Prerequisites not met"
    exit 1
  fi

  print_success "Prerequisites verified"
}
# Verify that Uptime Kuma credentials exist and that the API accepts them.
#
# Steps:
#   1. infra_secrets.yml must contain top-level `uptime_kuma_username:` and
#      `uptime_kuma_password:` keys with non-empty values.
#   2. A temporary Python script logs in through uptime_kuma_api, using the
#      URL built from infra_vars.yml and services_config.yml, and reports the
#      number of monitors.
#
# Globals:   ANSIBLE_DIR (read)
# Side effects: changes the working directory to ANSIBLE_DIR.
# Returns:   0 on success; exits the script with 1 on any failure
check_uptime_kuma_credentials() {
  print_header "Verifying Uptime Kuma Configuration"

  cd "$ANSIBLE_DIR"

  local secrets_file="$ANSIBLE_DIR/infra_secrets.yml"

  # Check if infra_secrets.yml has credentials
  if ! grep -q "^uptime_kuma_username:" "$secrets_file" 2> /dev/null \
    || ! grep -q "^uptime_kuma_password:" "$secrets_file" 2> /dev/null; then
    print_error "Uptime Kuma credentials not found in infra_secrets.yml"
    print_info "You must complete Layer 4 post-deployment steps first:"
    echo " 1. Create admin user in Uptime Kuma web UI"
    echo " 2. Add credentials to ansible/infra_secrets.yml"
    exit 1
  fi

  # Only used to detect empty values: second whitespace-separated field with
  # any quote characters removed.
  local uk_user uk_pass
  uk_user=$(awk '/^uptime_kuma_username:/ {print $2}' "$secrets_file" | tr -d "\"'")
  uk_pass=$(awk '/^uptime_kuma_password:/ {print $2}' "$secrets_file" | tr -d "\"'")

  if [[ -z "$uk_user" || -z "$uk_pass" ]]; then
    print_error "Uptime Kuma credentials are empty in infra_secrets.yml"
    exit 1
  fi

  print_success "Uptime Kuma credentials found"

  # Test API connection
  print_info "Testing Uptime Kuma API connection..."

  # Declared separately from the assignment so a mktemp failure is noticed
  # instead of being hidden by the exit status of `local`.
  local test_script
  if ! test_script=$(mktemp); then
    print_error "Cannot create a temporary file for the API test"
    exit 1
  fi

  # Quoted delimiter: the Python source is written out literally.
  cat > "$test_script" << 'EOFPYTHON'
import sys
import yaml
from uptime_kuma_api import UptimeKumaApi

try:
    with open('infra_vars.yml', 'r') as f:
        infra_vars = yaml.safe_load(f)

    with open('services_config.yml', 'r') as f:
        services_config = yaml.safe_load(f)

    with open('infra_secrets.yml', 'r') as f:
        secrets = yaml.safe_load(f)

    root_domain = infra_vars.get('root_domain')
    subdomain = services_config.get('subdomains', {}).get('uptime_kuma', 'uptime')
    url = f"https://{subdomain}.{root_domain}"

    username = secrets.get('uptime_kuma_username')
    password = secrets.get('uptime_kuma_password')

    api = UptimeKumaApi(url)
    api.login(username, password)

    monitors = api.get_monitors()
    print(f"SUCCESS:{len(monitors)}")
    api.disconnect()

except Exception as e:
    print(f"ERROR:{str(e)}", file=sys.stderr)
    sys.exit(1)
EOFPYTHON

  # Success is recognised by the SUCCESS: marker in the output, so the exit
  # status of the Python run is deliberately ignored here.
  local result
  result=$(cd "$ANSIBLE_DIR" && python3 "$test_script" 2>&1) || true
  rm -f "$test_script"

  if grep -q "^SUCCESS:" <<< "$result"; then
    local monitor_count
    monitor_count=$(grep "^SUCCESS:" <<< "$result" | cut -d: -f2)
    print_success "Successfully connected to Uptime Kuma API"
    print_info "Current monitors: $monitor_count"
  else
    print_error "Cannot connect to Uptime Kuma API"
    print_info "Error: $result"
    echo ""
    print_info "Make sure:"
    echo " • Uptime Kuma is running (Layer 4)"
    echo " • Credentials are correct in infra_secrets.yml"
    echo " • Uptime Kuma is accessible"
    exit 1
  fi

  echo ""
  print_success "Uptime Kuma configuration verified"
}
# List the hosts of one inventory group.
# Globals:   ANSIBLE_DIR (read)
# Arguments: $1 - inventory group name
# Outputs:   the group's hosts, space-separated, on one stdout line; an empty
#            line when the group is absent or the inventory JSON cannot be
#            read
# Side effects: changes the working directory to ANSIBLE_DIR (callers in this
#            file invoke it inside a command substitution).
get_hosts_from_inventory() {
  local group="$1"

  cd "$ANSIBLE_DIR"

  # The group name is handed to Python as an argument rather than pasted into
  # the program text, so quotes or other special characters in it cannot
  # break or alter the Python code.
  ansible-inventory -i inventory.ini --list \
    | python3 -c 'import sys, json; data = json.load(sys.stdin); print(" ".join(data.get(sys.argv[1], {}).get("hosts", [])))' "$group" 2> /dev/null \
    || echo ""
}

###############################################################################
# Disk Usage Monitoring
###############################################################################

# Interactively deploy disk usage monitoring via Ansible.
#
# Lists the hosts of the known inventory groups, asks which hosts to target,
# asks for confirmation, then runs infra/410_disk_usage_alerts.yml limited to
# the chosen hosts.
#
# Globals:   ANSIBLE_DIR, BLUE, NC (read)
# Inputs:    menu choice (and, for option 3, a group list) read from stdin
# Side effects: changes the working directory to ANSIBLE_DIR.
# Returns:   always 0 - skipping, an invalid choice or a failed playbook run
#            is reported but does not stop the remaining layer steps
deploy_disk_usage_monitoring() {
  print_header "Deploying Disk Usage Monitoring"

  cd "$ANSIBLE_DIR"

  print_info "This will deploy disk usage monitoring on selected hosts"
  print_info "Default settings:"
  echo " • Threshold: 80%"
  echo " • Check interval: 15 minutes"
  echo " • Mount point: /"
  echo ""

  # Show available hosts
  local group hosts
  echo "Available hosts:"
  for group in vipy watchtower spacey nodito lapy; do
    hosts=$(get_hosts_from_inventory "$group") || true
    if [[ -n "$hosts" ]]; then
      echo " [$group]: $hosts"
    fi
  done
  echo ""

  print_info "Deployment options:"
  echo " 1. Deploy on all remote hosts (vipy, watchtower, spacey, nodito)"
  echo " 2. Deploy on all hosts (including lapy)"
  echo " 3. Custom selection (specify groups)"
  echo " 4. Skip disk monitoring"
  echo ""

  # Kept local so the choice does not leak into other functions.
  local option
  printf '%bChoose option%b [1-4]: ' "$BLUE" "$NC"
  read -r option

  local limit_hosts=""
  case "$option" in
    1)
      limit_hosts="vipy,watchtower,spacey,nodito"
      print_info "Deploying to remote hosts"
      ;;
    2)
      limit_hosts="all"
      print_info "Deploying to all hosts"
      ;;
    3)
      printf '%bEnter groups (comma-separated)%b: ' "$BLUE" "$NC"
      read -r limit_hosts
      # An empty selection must never reach ansible-playbook as --limit "".
      if [[ -z "${limit_hosts//[[:space:]]/}" ]]; then
        print_error "No groups specified"
        return 0
      fi
      print_info "Deploying to: $limit_hosts"
      ;;
    4)
      print_warning "Skipping disk usage monitoring"
      return 0
      ;;
    *)
      print_error "Invalid option"
      return 0
      ;;
  esac

  echo ""
  if ! confirm_action "Proceed with disk usage monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/410_disk_usage_alerts.yml --limit $limit_hosts"
  echo ""

  if ansible-playbook -i inventory.ini infra/410_disk_usage_alerts.yml --limit "$limit_hosts"; then
    print_success "Disk usage monitoring deployed"
  else
    print_error "Deployment failed"
  fi
  return 0
}

###############################################################################
# System Healthcheck Monitoring
###############################################################################

# Interactively deploy system healthcheck monitoring via Ansible.
#
# Lists the hosts of the known inventory groups, asks which hosts to target,
# asks for confirmation, then runs infra/420_system_healthcheck.yml limited to
# the chosen hosts.
#
# Globals:   ANSIBLE_DIR, BLUE, NC (read)
# Inputs:    menu choice (and, for option 3, a group list) read from stdin
# Side effects: changes the working directory to ANSIBLE_DIR.
# Returns:   always 0 - skipping, an invalid choice or a failed playbook run
#            is reported but does not stop the remaining layer steps
deploy_system_healthcheck() {
  print_header "Deploying System Healthcheck Monitoring"

  cd "$ANSIBLE_DIR"

  print_info "This will deploy system healthcheck monitoring on selected hosts"
  print_info "Default settings:"
  echo " • Heartbeat interval: 60 seconds"
  echo " • Upside-down mode (no news is good news)"
  echo ""

  # Show available hosts
  local group hosts
  echo "Available hosts:"
  for group in vipy watchtower spacey nodito lapy; do
    hosts=$(get_hosts_from_inventory "$group") || true
    if [[ -n "$hosts" ]]; then
      echo " [$group]: $hosts"
    fi
  done
  echo ""

  print_info "Deployment options:"
  echo " 1. Deploy on all remote hosts (vipy, watchtower, spacey, nodito)"
  echo " 2. Deploy on all hosts (including lapy)"
  echo " 3. Custom selection (specify groups)"
  echo " 4. Skip healthcheck monitoring"
  echo ""

  # Kept local so the choice does not leak into other functions.
  local option
  printf '%bChoose option%b [1-4]: ' "$BLUE" "$NC"
  read -r option

  local limit_hosts=""
  case "$option" in
    1)
      limit_hosts="vipy,watchtower,spacey,nodito"
      print_info "Deploying to remote hosts"
      ;;
    2)
      limit_hosts="all"
      print_info "Deploying to all hosts"
      ;;
    3)
      printf '%bEnter groups (comma-separated)%b: ' "$BLUE" "$NC"
      read -r limit_hosts
      # An empty selection must never reach ansible-playbook as --limit "".
      if [[ -z "${limit_hosts//[[:space:]]/}" ]]; then
        print_error "No groups specified"
        return 0
      fi
      print_info "Deploying to: $limit_hosts"
      ;;
    4)
      print_warning "Skipping healthcheck monitoring"
      return 0
      ;;
    *)
      print_error "Invalid option"
      return 0
      ;;
  esac

  echo ""
  if ! confirm_action "Proceed with healthcheck monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/420_system_healthcheck.yml --limit $limit_hosts"
  echo ""

  if ansible-playbook -i inventory.ini infra/420_system_healthcheck.yml --limit "$limit_hosts"; then
    print_success "System healthcheck monitoring deployed"
  else
    print_error "Deployment failed"
  fi
  return 0
}

###############################################################################
# CPU Temperature Monitoring (Nodito)
###############################################################################

# Interactively deploy CPU temperature monitoring on the nodito group.
#
# Does nothing when the inventory has no nodito hosts. When
# infra/nodito/nodito_secrets.yml is missing, offers to create it from a push
# URL typed by the user. After confirmation, runs
# infra/nodito/40_cpu_temp_alerts.yml.
#
# Globals:   ANSIBLE_DIR, BLUE, NC (read)
# Inputs:    push URL read from stdin when the secrets file has to be created
# Side effects: changes the working directory to ANSIBLE_DIR; may create
#            infra/nodito/nodito_secrets.yml (owner-only permissions).
# Returns:   always 0 - skipping or a failed playbook run is reported but does
#            not stop the remaining layer steps
deploy_cpu_temp_monitoring() {
  print_header "Deploying CPU Temperature Monitoring (Nodito)"

  cd "$ANSIBLE_DIR"

  # Check if nodito is configured
  local nodito_hosts
  nodito_hosts=$(get_hosts_from_inventory "nodito") || true
  if [[ -z "$nodito_hosts" ]]; then
    print_info "Nodito not configured in inventory, skipping CPU temp monitoring"
    return 0
  fi

  print_info "This will deploy CPU temperature monitoring on nodito (Proxmox)"
  print_info "Default settings:"
  echo " • Threshold: 80°C"
  echo " • Check interval: 60 seconds"
  echo ""

  # Check if nodito_secrets.yml exists
  local secrets_file="$ANSIBLE_DIR/infra/nodito/nodito_secrets.yml"
  if [[ ! -f "$secrets_file" ]]; then
    print_warning "nodito_secrets.yml not found"
    print_info "You need to create this file with Uptime Kuma push URL"

    if ! confirm_action "Create nodito_secrets.yml now?"; then
      print_warning "Skipping CPU temp monitoring"
      return 0
    fi

    # Values for the example URL shown in the prompt. A missing file or key
    # simply yields an empty value; the subdomain then falls back to
    # "uptime".
    local root_domain uk_subdomain push_url
    root_domain=$(awk '/^root_domain:/ {print $2; exit}' \
      "$ANSIBLE_DIR/infra_vars.yml" 2> /dev/null) || true
    uk_subdomain=$(awk '/^uptime_kuma_subdomain:/ {print $2; exit}' \
      "$ANSIBLE_DIR/services/uptime_kuma/uptime_kuma_vars.yml" 2> /dev/null) || true
    uk_subdomain=${uk_subdomain:-uptime}

    printf '%bEnter Uptime Kuma push URL%b (e.g., https://%s.%s/api/push/xxxxx): ' \
      "$BLUE" "$NC" "$uk_subdomain" "$root_domain"
    read -r push_url

    # Do not write a secrets file that would configure an empty push URL.
    if [[ -z "${push_url//[[:space:]]/}" ]]; then
      print_error "Push URL cannot be empty"
      print_warning "Skipping CPU temp monitoring"
      return 0
    fi

    mkdir -p "$ANSIBLE_DIR/infra/nodito"
    # Subshell with a tight umask: the file holds a secret URL, so it is
    # created readable and writable by the owner only.
    (
      umask 077
      cat > "$secrets_file" << EOF
# Nodito Secrets
# DO NOT commit to git

# Uptime Kuma Push URL for CPU temperature monitoring
nodito_uptime_kuma_cpu_temp_push_url: "${push_url}"
EOF
    )
    print_success "Created nodito_secrets.yml"
  fi

  echo ""
  if ! confirm_action "Proceed with CPU temp monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml"
  echo ""

  if ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml; then
    print_success "CPU temperature monitoring deployed"
  else
    print_error "Deployment failed"
  fi
  return 0
}

###############################################################################
# Summary
###############################################################################

# Print the closing summary for this layer: what was set up, how to verify it
# and what to do next. The text is fixed; it does not inspect which of the
# deployment steps actually ran.
# Outputs: the summary on stdout
print_summary() {
  print_header "Layer 6 Setup Complete! 🎉"

  printf '%s\n' "Summary of what was deployed:" ""
  print_success "Infrastructure monitoring configured"
  print_success "Monitors created in Uptime Kuma"
  print_success "Systemd services and timers running"
  echo ""

  print_info "What you have now:"
  printf '%s\n' \
    " • Disk usage monitoring on selected hosts" \
    " • System healthcheck monitoring" \
    " • CPU temperature monitoring (if nodito configured)" \
    " • All organized in host-specific groups" \
    ""

  print_info "Verify your monitoring:"
  printf '%s\n' \
    " 1. Open Uptime Kuma web UI" \
    " 2. Check monitors organized by host groups" \
    " 3. Verify monitors are receiving data" \
    " 4. Configure notification rules" \
    " 5. Watch for alerts via ntfy" \
    ""

  print_info "Next steps:"
  printf '%s\n' \
    " 1. Customize thresholds if needed" \
    " 2. Proceed to Layer 7: Core Services deployment" \
    ""
}

###############################################################################
# Main Execution
###############################################################################

# Script entry point: confirm with the user, verify prerequisites and Uptime
# Kuma access, then run the three monitoring deployments in order and print
# the summary.
# Returns: 0 after the summary; exits 0 when the user declines to continue.
#          The verification steps exit the script themselves on failure.
main() {
  # Only clear a real terminal, and never let a failing `clear` (for example
  # when TERM is unset) abort the whole script under `set -e`.
  if [[ -t 1 ]]; then
    clear || true
  fi

  print_header "📊 Layer 6: Infrastructure Monitoring"

  echo "This script will deploy automated monitoring for your infrastructure."
  echo ""

  if ! confirm_action "Continue with Layer 6 setup?"; then
    echo "Setup cancelled."
    exit 0
  fi

  check_prerequisites
  check_uptime_kuma_credentials

  # Deploy monitoring
  deploy_disk_usage_monitoring
  echo ""
  deploy_system_healthcheck
  echo ""
  deploy_cpu_temp_monitoring

  echo ""
  print_summary
}
# Run main function, forwarding all command-line arguments unchanged.
main "$@"