temp monitor
This commit is contained in:
parent
85012f8ba5
commit
4a4c61308a
5 changed files with 366 additions and 1 deletions
6
.gitignore
vendored
6
.gitignore
vendored
|
|
@ -1,3 +1,9 @@
|
||||||
inventory.ini
|
inventory.ini
|
||||||
venv/*
|
venv/*
|
||||||
.env
|
.env
|
||||||
|
|
||||||
|
# Secrets and sensitive files
|
||||||
|
*_secrets.yml
|
||||||
|
*_secrets.yaml
|
||||||
|
secrets/
|
||||||
|
.secrets/
|
||||||
|
|
|
||||||
|
|
@ -74,6 +74,21 @@ Note that, by applying these playbooks, both the root user and the `counterweigh
|
||||||
|
|
||||||
Note that, by applying these playbooks, both the root user and the `counterweight` user will use the same SSH pubkey for auth, but root login will be disabled.
|
Note that, by applying these playbooks, both the root user and the `counterweight` user will use the same SSH pubkey for auth, but root login will be disabled.
|
||||||
|
|
||||||
|
### Deploy CPU Temperature Monitoring
|
||||||
|
|
||||||
|
* The nodito server can be configured with CPU temperature monitoring that sends alerts to Uptime Kuma when temperatures exceed a threshold.
|
||||||
|
* Before running the CPU temperature monitoring playbook, you need to create a secrets file with your Uptime Kuma push URL:
|
||||||
|
* Create `ansible/infra/nodito/nodito_secrets.yml` with:
|
||||||
|
```yaml
|
||||||
|
nodito_uptime_kuma_cpu_temp_push_url: "https://your-uptime-kuma.com/api/push/your-push-key"
|
||||||
|
```
|
||||||
|
* Run the CPU temperature monitoring setup with: `ansible-playbook -i inventory.ini infra/nodito/40_cpu_temp_alerts.yml`
|
||||||
|
* This will:
|
||||||
|
* Install required packages (lm-sensors, curl, jq, bc)
|
||||||
|
* Create a monitoring script that checks CPU temperature every minute
|
||||||
|
* Set up a systemd service and timer for automated monitoring
|
||||||
|
* Send alerts to Uptime Kuma when temperature exceeds the threshold (default: 80°C)
|
||||||
|
|
||||||
## GPG Keys
|
## GPG Keys
|
||||||
|
|
||||||
Some of the backups are stored encrypted for security. To allow this, fill in the gpg variables listed in `example.inventory.ini` under the `lapy` block.
|
Some of the backups are stored encrypted for security. To allow this, fill in the gpg variables listed in `example.inventory.ini` under the `lapy` block.
|
||||||
|
|
|
||||||
128
ansible/infra/nodito/30_proxmox_bootstrap_playbook.yml
Normal file
128
ansible/infra/nodito/30_proxmox_bootstrap_playbook.yml
Normal file
|
|
@ -0,0 +1,128 @@
|
||||||
|
---
# Bootstrap playbook for the nodito host.
#
# Steps, in order:
#   1. install sudo and the operator's SSH public key for root,
#   2. make sure sshd accepts public-key authentication,
#   3. create the unprivileged admin user ({{ new_user }}) with the same key
#      and passwordless sudo,
#   4. disable root login and password authentication in sshd.
#
# Variables used but not defined in this file: `new_user`, `ssh_port`,
# `ansible_host`, `ansible_ssh_private_key_file` (presumably supplied by
# infra_vars.yml and the inventory — confirm).
- name: Bootstrap Nodito SSH Key Access
  hosts: nodito
  become: true
  vars_files:
    # NOTE(review): the sibling playbook 40_cpu_temp_alerts.yml in this same
    # directory loads ../../infra_vars.yml. One of the two relative paths is
    # probably wrong — confirm where infra_vars.yml actually lives.
    - ../infra_vars.yml

  tasks:
    - name: Install sudo package
      package:
        name: sudo
        state: present

    - name: Ensure SSH directory exists for root
      file:
        path: /root/.ssh
        state: directory
        mode: "0700"
        owner: root
        group: root

    # The public key is read on the control node from the path of the private
    # key with ".pub" appended.
    - name: Install SSH public key for root
      authorized_key:
        user: root
        key: "{{ lookup('file', ansible_ssh_private_key_file + '.pub') }}"
        state: present

    # No `backrefs` on the sshd_config edits below: with backrefs enabled,
    # lineinfile leaves the file untouched when the regexp matches nothing,
    # so the setting would silently not be applied. Without it, the line is
    # replaced when present and inserted otherwise. `insertbefore` keeps a
    # newly inserted global option out of any trailing `Match` block, and
    # `validate` refuses to install a config that sshd cannot parse.
    - name: Ensure SSH key-based authentication is enabled
      lineinfile:
        path: /etc/ssh/sshd_config
        regexp: "^#?PubkeyAuthentication"
        line: "PubkeyAuthentication yes"
        insertbefore: "^Match "
        firstmatch: true
        state: present
        validate: "/usr/sbin/sshd -t -f %s"

    - name: Ensure AuthorizedKeysFile is properly configured
      lineinfile:
        path: /etc/ssh/sshd_config
        regexp: "^#?AuthorizedKeysFile"
        line: "AuthorizedKeysFile .ssh/authorized_keys"
        insertbefore: "^Match "
        firstmatch: true
        state: present
        validate: "/usr/sbin/sshd -t -f %s"

    - name: Restart SSH service
      service:
        name: ssh
        state: restarted

    - name: Wait for SSH to be ready
      wait_for:
        port: "{{ ssh_port }}"
        host: "{{ ansible_host }}"
        delay: 2
        timeout: 30

    # Read-only probe; never reported as a change.
    - name: Test SSH key authentication
      command: whoami
      register: ssh_key_test
      changed_when: false

    - name: Verify SSH key authentication works
      assert:
        that:
          - ssh_key_test.stdout == "root"
        fail_msg: "SSH key authentication failed - expected 'root', got '{{ ssh_key_test.stdout }}'"

    - name: Create new user
      user:
        name: "{{ new_user }}"
        groups: sudo
        shell: /bin/bash
        state: present
        create_home: true

    - name: Set up SSH directory for new user
      file:
        path: "/home/{{ new_user }}/.ssh"
        state: directory
        mode: "0700"
        owner: "{{ new_user }}"
        group: "{{ new_user }}"

    - name: Install SSH public key for new user
      authorized_key:
        user: "{{ new_user }}"
        key: "{{ lookup('file', ansible_ssh_private_key_file + '.pub') }}"
        state: present

    # `validate` runs visudo against the candidate file before it is moved
    # into place, so a malformed drop-in can never break sudo on the host.
    # The trailing newline keeps the file well-formed for line-based parsers.
    - name: Allow new user to run sudo without password
      copy:
        dest: "/etc/sudoers.d/{{ new_user }}"
        content: "{{ new_user }} ALL=(ALL) NOPASSWD:ALL\n"
        owner: root
        group: root
        mode: "0440"
        validate: "/usr/sbin/visudo -cf %s"

    - name: Disable root login
      lineinfile:
        path: /etc/ssh/sshd_config
        regexp: "^#?PermitRootLogin .*"
        line: "PermitRootLogin no"
        insertbefore: "^Match "
        firstmatch: true
        state: present
        validate: "/usr/sbin/sshd -t -f %s"

    - name: Disable password authentication
      lineinfile:
        path: /etc/ssh/sshd_config
        regexp: "^#?PasswordAuthentication .*"
        line: "PasswordAuthentication no"
        insertbefore: "^Match "
        firstmatch: true
        state: present
        validate: "/usr/sbin/sshd -t -f %s"

    - name: Restart SSH service
      service:
        name: ssh
        state: restarted

    - name: Wait for SSH to be ready
      wait_for:
        port: "{{ ssh_port }}"
        host: "{{ ansible_host }}"
        delay: 2
        timeout: 30

    # NOTE(review): this runs `whoami` via become_user on the existing
    # connection; it does not open a new SSH session as the new user.
    - name: Test connection with new user
      command: whoami
      become_user: "{{ new_user }}"
      register: new_user_test
      changed_when: false
||||||
203
ansible/infra/nodito/40_cpu_temp_alerts.yml
Normal file
203
ansible/infra/nodito/40_cpu_temp_alerts.yml
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
---
# Installs a CPU temperature check on the nodito host.
#
# The play renders a bash script that reads the CPU temperature, appends the
# result to a log file and, when the temperature is above
# `temp_threshold_celsius`, calls the Uptime Kuma push URL. A systemd
# service + timer pair runs the script every `temp_check_interval_minutes`.
#
# Required variable (from nodito_secrets.yml):
#   nodito_uptime_kuma_cpu_temp_push_url
- name: Deploy Nodito CPU Temperature Monitoring
  hosts: nodito
  become: true
  vars_files:
    - ../../infra_vars.yml
    - ./nodito_vars.yml
    - ./nodito_secrets.yml

  tasks:
    # default('', true) maps an undefined or empty/null value to '' so the
    # assert fails with the message below instead of a templating error.
    - name: Validate Uptime Kuma URL is provided
      assert:
        that:
          - "nodito_uptime_kuma_cpu_temp_push_url | default('', true) | string | length > 0"
        fail_msg: "nodito_uptime_kuma_cpu_temp_push_url must be set in nodito_secrets.yml"

    - name: Install required packages for temperature monitoring
      package:
        name:
          - lm-sensors
          - curl
          - jq
          - bc
        state: present

    - name: Create monitoring script directory
      file:
        path: "{{ monitoring_script_dir }}"
        state: directory
        owner: root
        group: root
        mode: "0755"

    # The rendered script embeds the push URL (which contains the push key),
    # so it is installed readable/executable by root only. The systemd
    # service below runs it as root.
    - name: Create CPU temperature monitoring script
      copy:
        dest: "{{ monitoring_script_path }}"
        content: |
          #!/bin/bash

          # CPU Temperature Monitoring Script for Nodito
          # Monitors CPU temperature and sends alerts to Uptime Kuma

          LOG_FILE="{{ log_file }}"
          TEMP_THRESHOLD="{{ temp_threshold_celsius }}"
          UPTIME_KUMA_URL="{{ nodito_uptime_kuma_cpu_temp_push_url }}"

          # Append a timestamped message to the log file.
          log_message() {
              echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
          }

          # Print the CPU temperature in degrees Celsius, or nothing if no
          # source could be read. Sources are tried in order; the first one
          # that yields a value wins.
          get_cpu_temp() {
              local temp=""

              # Method 1: sensors command (most common)
              if command -v sensors >/dev/null 2>&1; then
                  temp=$(sensors 2>/dev/null | grep -E "Core 0|Package id 0|Tdie|Tctl" | head -1 | grep -oE '[0-9]+\.[0-9]+°C' | grep -oE '[0-9]+\.[0-9]+')
              fi

              # Method 2: thermal zone (fallback); the raw value is divided
              # by 1000 to obtain degrees.
              if [ -z "$temp" ] && [ -f /sys/class/thermal/thermal_zone0/temp ]; then
                  temp=$(cat /sys/class/thermal/thermal_zone0/temp)
                  temp=$(echo "scale=1; $temp/1000" | bc -l 2>/dev/null || echo "$temp")
              fi

              # Method 3: acpi (fallback)
              if [ -z "$temp" ] && command -v acpi >/dev/null 2>&1; then
                  temp=$(acpi -t 2>/dev/null | grep -oE '[0-9]+\.[0-9]+' | head -1)
              fi

              echo "$temp"
          }

          # Log an alert and push it to Uptime Kuma.
          #   $1 - the temperature that triggered the alert
          send_uptime_kuma_alert() {
              local temp="$1"
              local message="CPU Temperature Alert: ${temp}°C (Threshold: ${TEMP_THRESHOLD}°C)"

              log_message "ALERT: $message"

              # -G with --data-urlencode lets curl build and percent-encode
              # the query string. -f makes HTTP errors (4xx/5xx) a non-zero
              # exit so they are not logged as success; --max-time keeps a
              # hung connection from blocking the oneshot service.
              # NOTE(review): the alert is pushed with status=up; confirm this
              # matches how the Uptime Kuma monitor is configured.
              if curl -fsS --max-time 10 -o /dev/null -G \
                  --data-urlencode "status=up" \
                  --data-urlencode "msg=$message" \
                  "$UPTIME_KUMA_URL"; then
                  log_message "Alert sent successfully to Uptime Kuma"
              else
                  log_message "ERROR: Failed to send alert to Uptime Kuma"
              fi
          }

          # Read the temperature, log it, and alert when above the threshold.
          main() {
              log_message "Starting CPU temperature check"

              current_temp=$(get_cpu_temp)

              if [ -z "$current_temp" ]; then
                  log_message "ERROR: Could not read CPU temperature"
                  exit 1
              fi

              log_message "Current CPU temperature: ${current_temp}°C"

              # bc handles the decimal comparison; it prints 1 when true.
              if (( $(echo "$current_temp > $TEMP_THRESHOLD" | bc -l) )); then
                  log_message "WARNING: CPU temperature ${current_temp}°C exceeds threshold ${TEMP_THRESHOLD}°C"
                  send_uptime_kuma_alert "$current_temp"
              else
                  log_message "CPU temperature is within normal range"
              fi
          }

          # Run main function
          main
        owner: root
        group: root
        mode: "0700"

    - name: Create systemd service for CPU temperature monitoring
      copy:
        dest: "/etc/systemd/system/{{ systemd_service_name }}.service"
        content: |
          [Unit]
          Description=Nodito CPU Temperature Monitor
          After=network.target

          [Service]
          Type=oneshot
          ExecStart={{ monitoring_script_path }}
          User=root
          StandardOutput=journal
          StandardError=journal

          [Install]
          WantedBy=multi-user.target
        owner: root
        group: root
        mode: "0644"

    - name: Create systemd timer for CPU temperature monitoring
      copy:
        dest: "/etc/systemd/system/{{ systemd_service_name }}.timer"
        content: |
          [Unit]
          Description=Run Nodito CPU Temperature Monitor every {{ temp_check_interval_minutes }} minute(s)
          Requires={{ systemd_service_name }}.service

          [Timer]
          OnBootSec={{ temp_check_interval_minutes }}min
          OnUnitActiveSec={{ temp_check_interval_minutes }}min
          Persistent=true

          [Install]
          WantedBy=timers.target
        owner: root
        group: root
        mode: "0644"

    - name: Reload systemd daemon
      systemd:
        daemon_reload: true

    - name: Enable and start CPU temperature monitoring timer
      systemd:
        name: "{{ systemd_service_name }}.timer"
        enabled: true
        state: started

    # failed_when is disabled so that a non-zero exit reaches the assert
    # below and is reported with its fail_msg, rather than aborting here.
    - name: Test CPU temperature monitoring script
      command: "{{ monitoring_script_path }}"
      register: script_test
      changed_when: false
      failed_when: false

    - name: Verify script execution
      assert:
        that:
          - script_test.rc == 0
        fail_msg: "CPU temperature monitoring script failed to execute properly"

    # Informational only: a missing or failing `sensors` must not fail the play.
    - name: Check if sensors are available
      command: sensors
      register: sensors_check
      changed_when: false
      failed_when: false

    - name: Display sensor information
      debug:
        msg: "Sensor information: {{ sensors_check.stdout_lines if sensors_check.rc == 0 else 'Sensors not available - using fallback methods' }}"

    # The push URL is deliberately not printed: it contains the push key and
    # would otherwise end up in terminal scrollback and CI logs.
    - name: Show monitoring configuration
      debug:
        msg:
          - "CPU Temperature Monitoring configured successfully"
          - "Temperature threshold: {{ temp_threshold_celsius }}°C"
          - "Check interval: {{ temp_check_interval_minutes }} minute(s)"
          - "Uptime Kuma URL: (configured, hidden)"
          - "Monitoring script: {{ monitoring_script_path }}"
          - "Log file: {{ log_file }}"
          - "Service: {{ systemd_service_name }}.service"
          - "Timer: {{ systemd_service_name }}.timer"
||||||
13
ansible/infra/nodito/nodito_vars.yml
Normal file
13
ansible/infra/nodito/nodito_vars.yml
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
---
# Settings for the nodito CPU temperature alert playbook
# (40_cpu_temp_alerts.yml).

# Alerting: threshold in degrees Celsius, and the timer interval in minutes.
temp_threshold_celsius: 80
temp_check_interval_minutes: 1

# Locations on the host for the monitoring script and its log file.
monitoring_script_dir: "/opt/nodito-monitoring"
monitoring_script_path: "{{ monitoring_script_dir }}/cpu_temp_monitor.sh"
log_file: "{{ monitoring_script_dir }}/cpu_temp_monitor.log"

# Base name shared by the systemd .service and .timer units.
systemd_service_name: "nodito-cpu-temp-monitor"
|
||||||
Loading…
Add table
Add a link
Reference in a new issue