lots of stuff man

This commit is contained in:
counterweight 2025-11-06 23:09:44 +01:00
parent 3b88e6c5e8
commit c8754e1bdc
Signed by: counterweight
GPG key ID: 883EDBAA726BD96C
43 changed files with 7310 additions and 121 deletions

View file

@@ -0,0 +1,491 @@
#!/bin/bash
###############################################################################
# Layer 6: Infrastructure Monitoring
#
# This script deploys disk usage, healthcheck, and CPU temp monitoring.
# Must be run after Layer 4 (Uptime Kuma) is complete with credentials set.
###############################################################################
set -o errexit # Exit on error

# ANSI color escape sequences for terminal output (expanded by `echo -e`)
NC='\033[0m' # No Color
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Directory layout: this script's directory, its parent (the project root),
# and the Ansible tree underneath the project root
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ANSIBLE_DIR="${PROJECT_ROOT}/ansible"
###############################################################################
# Helper Functions
###############################################################################
# Print a section banner: a blank line, the title framed by two rules of
# '=' characters (all in blue), and a trailing blank line.
# Arguments: $1 - title text
print_header() {
  local title="$1"
  local rule="========================================"
  echo -e "\n${BLUE}${rule}${NC}"
  echo -e "${BLUE}${title}${NC}"
  echo -e "${BLUE}${rule}${NC}\n"
}
# Print a success message on stdout, prefixed by the green color marker.
# Arguments: $1 - message text
print_success() {
  local message="$1"
  echo -e "${GREEN}${NC} ${message}"
}
# Print an error message on stdout, prefixed by the red color marker.
# Arguments: $1 - message text
print_error() {
  local message="$1"
  echo -e "${RED}${NC} ${message}"
}
# Print a warning message on stdout, prefixed by the yellow color marker.
# Arguments: $1 - message text
print_warning() {
  local message="$1"
  echo -e "${YELLOW}${NC} ${message}"
}
# Print an informational message on stdout, prefixed by the blue color marker.
# Arguments: $1 - message text
print_info() {
  local message="$1"
  echo -e "${BLUE}${NC} ${message}"
}
# Ask the user a yes/no question and report the answer through the exit status.
# Arguments: $1 - question text (shown in yellow, followed by " [y/N]: ")
# Returns:   0 only when the reply is exactly "y" or "Y"; non-zero for anything
#            else, including an empty reply or end of input.
confirm_action() {
  local prompt="$1"
  local response
  local prompt_text
  # Build the prompt from a quoted string so the question is neither word-split
  # nor glob-expanded ("?" and "[y/N]" are glob syntax) and the trailing space
  # after the colon survives.
  prompt_text="$(echo -e "${YELLOW}${prompt}${NC} [y/N]: ")"
  # -r: take the reply literally instead of interpreting backslashes.
  read -r -p "$prompt_text" response
  [[ "$response" =~ ^[Yy]$ ]]
}
###############################################################################
# Verification Functions
###############################################################################
# Verify that the local environment can run the Layer 6 playbooks.
# Checks, in order: an activated Python virtual environment (VIRTUAL_ENV),
# the `ansible` command, $ANSIBLE_DIR/inventory.ini, and the importability of
# the `uptime_kuma_api` Python package.
# Globals:  VIRTUAL_ENV (read), ANSIBLE_DIR (read)
# Outputs:  one success/error line per check on stdout
# Returns:  0 when every check passes; otherwise exits the script with status 1
#           after ALL checks have been reported.
check_prerequisites() {
  print_header "Verifying Prerequisites"
  local errors=0

  # NOTE: the counter is incremented with errors=$((errors + 1)) on purpose.
  # `((errors++))` evaluates to 0 on the first increment and therefore returns
  # status 1, which aborts the script under `set -e` before the remaining
  # checks and the final summary are reached.
  if [ -z "${VIRTUAL_ENV:-}" ]; then
    print_error "Virtual environment not activated"
    echo "Run: source venv/bin/activate"
    errors=$((errors + 1))
  else
    print_success "Virtual environment activated"
  fi

  if ! command -v ansible &> /dev/null; then
    print_error "Ansible not found"
    errors=$((errors + 1))
  else
    print_success "Ansible found"
  fi

  if [ ! -f "$ANSIBLE_DIR/inventory.ini" ]; then
    print_error "inventory.ini not found"
    errors=$((errors + 1))
  else
    print_success "inventory.ini exists"
  fi

  # Check Python uptime-kuma-api
  if ! python3 -c "import uptime_kuma_api" 2>/dev/null; then
    print_error "uptime-kuma-api Python package not found"
    print_info "Install with: pip install -r requirements.txt"
    errors=$((errors + 1))
  else
    print_success "uptime-kuma-api package found"
  fi

  if [ "$errors" -gt 0 ]; then
    print_error "Prerequisites not met"
    exit 1
  fi
  print_success "Prerequisites verified"
}
# Verify that Uptime Kuma credentials exist and that the API accepts them.
# Steps:
#   1. infra_secrets.yml must contain non-empty top-level
#      uptime_kuma_username / uptime_kuma_password entries.
#   2. A short Python script (run from $ANSIBLE_DIR) reads infra_vars.yml,
#      services_config.yml and infra_secrets.yml, logs in to
#      https://<subdomain>.<root_domain> and lists the monitors.
# Globals:  ANSIBLE_DIR (read); changes the working directory to $ANSIBLE_DIR
# Outputs:  progress and error messages on stdout
# Returns:  0 on success; exits the script with status 1 on any failure.
check_uptime_kuma_credentials() {
  print_header "Verifying Uptime Kuma Configuration"
  cd "$ANSIBLE_DIR"

  local secrets_file="$ANSIBLE_DIR/infra_secrets.yml"

  # Check if infra_secrets.yml has credentials
  if ! grep -q "^uptime_kuma_username:" "$secrets_file" 2>/dev/null ||
    ! grep -q "^uptime_kuma_password:" "$secrets_file" 2>/dev/null; then
    print_error "Uptime Kuma credentials not found in infra_secrets.yml"
    print_info "You must complete Layer 4 post-deployment steps first:"
    echo " 1. Create admin user in Uptime Kuma web UI"
    echo " 2. Add credentials to ansible/infra_secrets.yml"
    exit 1
  fi

  # Declaration and assignment are separate so the exit status of the
  # command substitution is not hidden behind `local`.
  # The extracted values are only used for the emptiness check below; the
  # real values are read with a YAML parser by the Python script.
  local uk_user uk_pass
  uk_user=$(grep "^uptime_kuma_username:" "$secrets_file" | awk '{print $2}' | tr -d "\"'")
  uk_pass=$(grep "^uptime_kuma_password:" "$secrets_file" | awk '{print $2}' | tr -d "\"'")
  if [ -z "$uk_user" ] || [ -z "$uk_pass" ]; then
    print_error "Uptime Kuma credentials are empty in infra_secrets.yml"
    exit 1
  fi
  print_success "Uptime Kuma credentials found"

  # Test API connection
  print_info "Testing Uptime Kuma API connection..."
  local test_script
  test_script=$(mktemp) || {
    print_error "Could not create a temporary file for the API test"
    exit 1
  }
  # The try/finally guarantees api.disconnect() also runs when login or the
  # monitor query fails, so the client connection is never left open.
  cat > "$test_script" << 'EOFPYTHON'
import sys
import yaml
from uptime_kuma_api import UptimeKumaApi

try:
    with open('infra_vars.yml', 'r') as f:
        infra_vars = yaml.safe_load(f)
    with open('services_config.yml', 'r') as f:
        services_config = yaml.safe_load(f)
    with open('infra_secrets.yml', 'r') as f:
        secrets = yaml.safe_load(f)

    root_domain = infra_vars.get('root_domain')
    subdomain = services_config.get('subdomains', {}).get('uptime_kuma', 'uptime')
    url = f"https://{subdomain}.{root_domain}"
    username = secrets.get('uptime_kuma_username')
    password = secrets.get('uptime_kuma_password')

    api = UptimeKumaApi(url)
    try:
        api.login(username, password)
        monitors = api.get_monitors()
        print(f"SUCCESS:{len(monitors)}")
    finally:
        api.disconnect()
except Exception as e:
    print(f"ERROR:{str(e)}", file=sys.stderr)
    sys.exit(1)
EOFPYTHON

  # Success is detected from the SUCCESS: marker in the output, so a non-zero
  # exit must not abort the script (set -e) before the error can be reported.
  local result
  result=$(cd "$ANSIBLE_DIR" && python3 "$test_script" 2>&1) || true
  rm -f "$test_script"

  if grep -q "^SUCCESS:" <<< "$result"; then
    local monitor_count
    monitor_count=$(grep "^SUCCESS:" <<< "$result" | cut -d: -f2)
    print_success "Successfully connected to Uptime Kuma API"
    print_info "Current monitors: $monitor_count"
  else
    print_error "Cannot connect to Uptime Kuma API"
    print_info "Error: $result"
    echo ""
    print_info "Make sure:"
    echo " • Uptime Kuma is running (Layer 4)"
    echo " • Credentials are correct in infra_secrets.yml"
    echo " • Uptime Kuma is accessible"
    exit 1
  fi
  echo ""
  print_success "Uptime Kuma configuration verified"
}
# Print the hosts of one inventory group as a single space-separated line.
# Arguments: $1 - inventory group name
# Globals:   ANSIBLE_DIR (read); changes the working directory to $ANSIBLE_DIR
# Outputs:   host names on stdout; an empty line when the group is missing,
#            has no hosts, or the inventory could not be parsed.
get_hosts_from_inventory() {
  local group="$1"
  cd "$ANSIBLE_DIR"
  # The group name is handed to Python as an argument (sys.argv[1]) rather than
  # being pasted into the program text, so a quote or other special character
  # in the name can neither break the snippet nor inject code into it.
  ansible-inventory -i inventory.ini --list |
    python3 -c 'import sys, json; data = json.load(sys.stdin); print(" ".join(data.get(sys.argv[1], {}).get("hosts", [])))' "$group" 2>/dev/null || echo ""
}
###############################################################################
# Disk Usage Monitoring
###############################################################################
# Interactively deploy disk usage monitoring via
# infra/410_disk_usage_alerts.yml.
# Lists the hosts of the known inventory groups, asks which hosts to target,
# asks for confirmation and then runs the playbook with `--limit`.
# Globals:  ANSIBLE_DIR (read); changes the working directory to $ANSIBLE_DIR
# Inputs:   menu choice (and, for option 3, a group list) read from stdin
# Returns:  0 in every case (skip, invalid choice, playbook success or
#           failure); a failed playbook is reported but does not stop the
#           rest of the setup.
deploy_disk_usage_monitoring() {
  print_header "Deploying Disk Usage Monitoring"
  cd "$ANSIBLE_DIR"

  print_info "This will deploy disk usage monitoring on selected hosts"
  print_info "Default settings:"
  echo " • Threshold: 80%"
  echo " • Check interval: 15 minutes"
  echo " • Mount point: /"
  echo ""

  # Show available hosts
  echo "Available hosts:"
  local group hosts
  for group in vipy watchtower spacey nodito lapy; do
    hosts=$(get_hosts_from_inventory "$group") || hosts=""
    if [ -n "$hosts" ]; then
      echo " [$group]: $hosts"
    fi
  done
  echo ""

  print_info "Deployment options:"
  echo " 1. Deploy on all remote hosts (vipy, watchtower, spacey, nodito)"
  echo " 2. Deploy on all hosts (including lapy)"
  echo " 3. Custom selection (specify groups)"
  echo " 4. Skip disk monitoring"
  echo ""
  echo -e -n "${BLUE}Choose option${NC} [1-4]: "
  # Keep the answer local so it does not leak into (or pick up a stale value
  # from) the global scope; -r reads it literally.
  local option
  read -r option

  local limit_hosts=""
  case "$option" in
    1)
      limit_hosts="vipy,watchtower,spacey,nodito"
      print_info "Deploying to remote hosts"
      ;;
    2)
      limit_hosts="all"
      print_info "Deploying to all hosts"
      ;;
    3)
      echo -e -n "${BLUE}Enter groups (comma-separated)${NC}: "
      read -r limit_hosts
      # An empty answer would be handed to ansible-playbook as `--limit ""`,
      # which no longer expresses the selection the user was asked for.
      if [ -z "$limit_hosts" ]; then
        print_error "No groups specified"
        return 0
      fi
      print_info "Deploying to: $limit_hosts"
      ;;
    4)
      print_warning "Skipping disk usage monitoring"
      return 0
      ;;
    *)
      print_error "Invalid option"
      return 0
      ;;
  esac
  echo ""

  if ! confirm_action "Proceed with disk usage monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/410_disk_usage_alerts.yml --limit $limit_hosts"
  echo ""
  if ansible-playbook -i inventory.ini infra/410_disk_usage_alerts.yml --limit "$limit_hosts"; then
    print_success "Disk usage monitoring deployed"
  else
    print_error "Deployment failed"
  fi
  return 0
}
###############################################################################
# System Healthcheck Monitoring
###############################################################################
# Interactively deploy system healthcheck monitoring via
# infra/420_system_healthcheck.yml.
# Lists the hosts of the known inventory groups, asks which hosts to target,
# asks for confirmation and then runs the playbook with `--limit`.
# Globals:  ANSIBLE_DIR (read); changes the working directory to $ANSIBLE_DIR
# Inputs:   menu choice (and, for option 3, a group list) read from stdin
# Returns:  0 in every case (skip, invalid choice, playbook success or
#           failure); a failed playbook is reported but does not stop the
#           rest of the setup.
deploy_system_healthcheck() {
  print_header "Deploying System Healthcheck Monitoring"
  cd "$ANSIBLE_DIR"

  print_info "This will deploy system healthcheck monitoring on selected hosts"
  print_info "Default settings:"
  echo " • Heartbeat interval: 60 seconds"
  echo " • Upside-down mode (no news is good news)"
  echo ""

  # Show available hosts
  echo "Available hosts:"
  local group hosts
  for group in vipy watchtower spacey nodito lapy; do
    hosts=$(get_hosts_from_inventory "$group") || hosts=""
    if [ -n "$hosts" ]; then
      echo " [$group]: $hosts"
    fi
  done
  echo ""

  print_info "Deployment options:"
  echo " 1. Deploy on all remote hosts (vipy, watchtower, spacey, nodito)"
  echo " 2. Deploy on all hosts (including lapy)"
  echo " 3. Custom selection (specify groups)"
  echo " 4. Skip healthcheck monitoring"
  echo ""
  echo -e -n "${BLUE}Choose option${NC} [1-4]: "
  # Keep the answer local so it does not leak into (or pick up a stale value
  # from) the global scope; -r reads it literally.
  local option
  read -r option

  local limit_hosts=""
  case "$option" in
    1)
      limit_hosts="vipy,watchtower,spacey,nodito"
      print_info "Deploying to remote hosts"
      ;;
    2)
      limit_hosts="all"
      print_info "Deploying to all hosts"
      ;;
    3)
      echo -e -n "${BLUE}Enter groups (comma-separated)${NC}: "
      read -r limit_hosts
      # An empty answer would be handed to ansible-playbook as `--limit ""`,
      # which no longer expresses the selection the user was asked for.
      if [ -z "$limit_hosts" ]; then
        print_error "No groups specified"
        return 0
      fi
      print_info "Deploying to: $limit_hosts"
      ;;
    4)
      print_warning "Skipping healthcheck monitoring"
      return 0
      ;;
    *)
      print_error "Invalid option"
      return 0
      ;;
  esac
  echo ""

  if ! confirm_action "Proceed with healthcheck monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/420_system_healthcheck.yml --limit $limit_hosts"
  echo ""
  if ansible-playbook -i inventory.ini infra/420_system_healthcheck.yml --limit "$limit_hosts"; then
    print_success "System healthcheck monitoring deployed"
  else
    print_error "Deployment failed"
  fi
  return 0
}
###############################################################################
# CPU Temperature Monitoring (Nodito)
###############################################################################
# Interactively deploy CPU temperature monitoring on nodito via
# infra/nodito/40_cpu_temp_alerts.yml.
# Skips silently when the inventory has no hosts in the `nodito` group. When
# infra/nodito/nodito_secrets.yml is missing, offers to create it from a push
# URL typed by the user.
# Globals:  ANSIBLE_DIR (read); changes the working directory to $ANSIBLE_DIR
# Inputs:   Uptime Kuma push URL read from stdin (only when creating secrets)
# Returns:  0 in every case (skip, playbook success or failure); a failed
#           playbook is reported but does not stop the rest of the setup.
deploy_cpu_temp_monitoring() {
  print_header "Deploying CPU Temperature Monitoring (Nodito)"
  cd "$ANSIBLE_DIR"

  # Check if nodito is configured
  local nodito_hosts
  nodito_hosts=$(get_hosts_from_inventory "nodito") || nodito_hosts=""
  if [ -z "$nodito_hosts" ]; then
    print_info "Nodito not configured in inventory, skipping CPU temp monitoring"
    return 0
  fi

  print_info "This will deploy CPU temperature monitoring on nodito (Proxmox)"
  print_info "Default settings:"
  echo " • Threshold: 80°C"
  echo " • Check interval: 60 seconds"
  echo ""

  # Check if nodito_secrets.yml exists
  local secrets_file="$ANSIBLE_DIR/infra/nodito/nodito_secrets.yml"
  if [ ! -f "$secrets_file" ]; then
    print_warning "nodito_secrets.yml not found"
    print_info "You need to create this file with Uptime Kuma push URL"
    if confirm_action "Create nodito_secrets.yml now?"; then
      # Values used only to build the example URL shown in the prompt.
      # 2>/dev/null sits on grep (the command that complains about a missing
      # file), and the default is applied to the captured value: a trailing
      # `|| echo` after the pipeline never fires because the pipeline's
      # status is awk's, which succeeds even on empty input.
      local root_domain uk_subdomain
      root_domain=$(grep "^root_domain:" "$ANSIBLE_DIR/infra_vars.yml" 2>/dev/null |
        awk '{print $2}' | tr -d "\"'") || true
      uk_subdomain=$(grep "^uptime_kuma_subdomain:" "$ANSIBLE_DIR/services/uptime_kuma/uptime_kuma_vars.yml" 2>/dev/null |
        awk '{print $2}' | tr -d "\"'") || true
      uk_subdomain="${uk_subdomain:-uptime}"

      echo -e -n "${BLUE}Enter Uptime Kuma push URL${NC} (e.g., https://${uk_subdomain}.${root_domain}/api/push/xxxxx): "
      # Local so the URL does not leak into the global scope; -r keeps it literal.
      local push_url
      read -r push_url
      if [ -z "$push_url" ]; then
        # Do not write a secrets file with an empty push URL.
        print_error "Push URL cannot be empty"
        print_warning "Skipping CPU temp monitoring"
        return 0
      fi

      mkdir -p "$ANSIBLE_DIR/infra/nodito"
      # The push URL is a secret: create the file readable by the owner only.
      (
        umask 077
        cat > "$secrets_file" << EOF
# Nodito Secrets
# DO NOT commit to git
# Uptime Kuma Push URL for CPU temperature monitoring
nodito_uptime_kuma_cpu_temp_push_url: "${push_url}"
EOF
      )
      print_success "Created nodito_secrets.yml"
    else
      print_warning "Skipping CPU temp monitoring"
      return 0
    fi
  fi
  echo ""

  if ! confirm_action "Proceed with CPU temp monitoring deployment?"; then
    print_warning "Skipped"
    return 0
  fi

  print_info "Running: ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml"
  echo ""
  if ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml; then
    print_success "CPU temperature monitoring deployed"
  else
    print_error "Deployment failed"
  fi
  return 0
}
###############################################################################
# Summary
###############################################################################
# Print the closing summary for Layer 6: what was set up, how to verify it in
# Uptime Kuma, and the suggested next steps. Produces output only.
print_summary() {
  print_header "Layer 6 Setup Complete! 🎉"

  printf '%s\n' "Summary of what was deployed:" ""
  local item
  for item in \
    "Infrastructure monitoring configured" \
    "Monitors created in Uptime Kuma" \
    "Systemd services and timers running"; do
    print_success "$item"
  done
  printf '\n'

  print_info "What you have now:"
  printf '%s\n' \
    " • Disk usage monitoring on selected hosts" \
    " • System healthcheck monitoring" \
    " • CPU temperature monitoring (if nodito configured)" \
    " • All organized in host-specific groups" \
    ""

  print_info "Verify your monitoring:"
  printf '%s\n' \
    " 1. Open Uptime Kuma web UI" \
    " 2. Check monitors organized by host groups" \
    " 3. Verify monitors are receiving data" \
    " 4. Configure notification rules" \
    " 5. Watch for alerts via ntfy" \
    ""

  print_info "Next steps:"
  printf '%s\n' \
    " 1. Customize thresholds if needed" \
    " 2. Proceed to Layer 7: Core Services deployment" \
    ""
}
###############################################################################
# Main Execution
###############################################################################
# Entry point for Layer 6: confirm with the user, verify prerequisites and
# Uptime Kuma access, run the three interactive deployment steps in order and
# print the closing summary.
# Returns: 0 on completion; exits 0 when the user declines to continue. The
#          verification steps exit the script themselves on failure.
main() {
  # `clear` exits non-zero when TERM is unset or unknown (e.g. non-interactive
  # runs); that cosmetic failure must not abort the whole script under set -e.
  clear 2>/dev/null || true
  print_header "📊 Layer 6: Infrastructure Monitoring"
  echo "This script will deploy automated monitoring for your infrastructure."
  echo ""

  if ! confirm_action "Continue with Layer 6 setup?"; then
    echo "Setup cancelled."
    exit 0
  fi

  check_prerequisites
  check_uptime_kuma_credentials

  # Deploy monitoring
  deploy_disk_usage_monitoring
  echo ""
  deploy_system_healthcheck
  echo ""
  deploy_cpu_temp_monitoring
  echo ""

  print_summary
}
# Run main function, forwarding the script's command-line arguments
# (main does not currently read them).
main "$@"