Files
2025-10-30 13:33:08 +01:00

389 lines
11 KiB
Bash
Executable File

#!/bin/bash
# Turmli Calendar - Raspberry Pi Monitoring Script
# Monitors the health and performance of the deployed application
#
# Every configuration value below is a default: a variable of the same name
# that is already set (and non-empty) in the environment takes precedence,
# e.g.  APP_PORT=8080 <this-script> status

# Abort on the first unhandled command failure.
set -e

# Configuration
SERVICE_NAME="${SERVICE_NAME:-turmli-calendar}"                 # systemd unit to inspect
APP_PORT="${APP_PORT:-8000}"                                    # local HTTP port probed with curl
LOG_FILE="${LOG_FILE:-/var/log/turmli-calendar/monitor.log}"    # target of the `log` command
ALERT_THRESHOLD_MEM="${ALERT_THRESHOLD_MEM:-200}" # MB
ALERT_THRESHOLD_CPU="${ALERT_THRESHOLD_CPU:-80}" # Percentage

# Colors (ANSI escape sequences, expanded by the print_* helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
# Helper functions

# Print the report banner in cyan: a horizontal rule, the title, the current
# local time (YYYY-MM-DD HH:MM:SS), a closing rule, then one blank line.
print_header() {
    local rule="${CYAN}════════════════════════════════════════════════${NC}"
    # %b expands the backslash escapes stored in the color variables.
    printf '%b\n' \
        "$rule" \
        "${CYAN} Turmli Calendar - System Monitor${NC}" \
        "${CYAN} $(date '+%Y-%m-%d %H:%M:%S')${NC}" \
        "$rule" \
        ''
}
# Print a blue section heading of the form "━━━ <title> ━━━".
# Arguments: $1 - section title
print_section() {
    local title="$1"
    printf '%b\n' "${BLUE}━━━ ${title} ━━━${NC}"
}
# Print a success line: a green check mark followed by the message.
# The color codes previously wrapped an empty string, so nothing marked the
# line as a success. The message is printed with %s so backslashes in it are
# emitted literally.
# Arguments: $1 - message text
print_ok() {
    printf '%b %s\n' "${GREEN}✓${NC}" "$1"
}
# Print a warning line: a yellow warning sign followed by the message.
# The color codes previously wrapped an empty string, so nothing marked the
# line as a warning. The message is printed with %s so backslashes in it are
# emitted literally.
# Arguments: $1 - message text
print_warning() {
    printf '%b %s\n' "${YELLOW}⚠${NC}" "$1"
}
# Print an error line: a red cross followed by the message.
# The color codes previously wrapped an empty string, so nothing marked the
# line as an error. The message is printed with %s so backslashes in it are
# emitted literally.
# Arguments: $1 - message text
print_error() {
    printf '%b %s\n' "${RED}✗${NC}" "$1"
}
# System information
# Print hostname, OS name, kernel release, uptime, CPU model and core count.
# When `vcgencmd` is available, also print the temperature it reports and warn
# if `vcgencmd get_throttled` returns a value other than 0x0.
show_system_info() {
    print_section "System Information"

    # Hostname and OS. /etc/os-release is shell-compatible, so PRETTY_NAME is
    # read by sourcing it in a subshell; this copes with quoted and unquoted
    # values alike.
    local os_name=""
    if [ -r /etc/os-release ]; then
        os_name=$( . /etc/os-release 2>/dev/null && printf '%s' "${PRETTY_NAME:-}" ) || os_name=""
    fi
    echo "Hostname: $(hostname)"
    echo "OS: ${os_name:-unknown}"
    echo "Kernel: $(uname -r)"
    echo "Uptime: $(uptime -p)"

    # CPU info: prefer the first "model name" entry; when /proc/cpuinfo has
    # none, fall back to its "Model" entry.
    local cpu_model=""
    if [ -r /proc/cpuinfo ]; then
        cpu_model=$(awk -F':' '
            /^model name/ { name = $2; exit }
            /^Model/ && model == "" { model = $2 }
            END {
                v = (name != "" ? name : model)
                gsub(/^[ \t]+|[ \t]+$/, "", v)
                print v
            }' /proc/cpuinfo 2>/dev/null) || cpu_model=""
    fi
    echo "CPU: ${cpu_model:-unknown}"
    echo "Cores: $(nproc)"

    # Temperature
    if command -v vcgencmd &> /dev/null; then
        local temp throttled
        temp=$(vcgencmd measure_temp | cut -d'=' -f2)
        echo "Temperature: ${temp:-unavailable}"
        # Check throttling. An empty value means the query itself produced
        # nothing, which must not be reported as throttling.
        throttled=$(vcgencmd get_throttled | cut -d'=' -f2)
        if [ -n "$throttled" ] && [ "$throttled" != "0x0" ]; then
            print_warning "CPU throttling detected: $throttled"
        fi
    fi
    echo
}
# Service status
# Report whether the systemd unit ${SERVICE_NAME} is active. When active,
# print its main PID and an approximate process uptime; otherwise print the
# unit's ExecMainStatus. In both cases warn if systemd reports restarts.
check_service_status() {
    print_section "Service Status"
    if systemctl is-active --quiet "${SERVICE_NAME}"; then
        print_ok "Service is running"
        # Get PID and uptime
        local pid
        pid=$(systemctl show "${SERVICE_NAME}" -p MainPID --value) || pid=""
        if [ -n "$pid" ] && [ "$pid" != "0" ]; then
            echo "PID: $pid"
            # Process uptime: the modification time of /proc/<pid> is taken
            # as the start time of the process.
            if [ -f "/proc/$pid/stat" ]; then
                local start_time current_time uptime
                # The process may vanish between the checks; skip quietly then.
                if start_time=$(stat -c %Y "/proc/$pid" 2>/dev/null); then
                    current_time=$(date +%s)
                    uptime=$((current_time - start_time))
                    echo "Process uptime: $((uptime / 3600))h $((uptime % 3600 / 60))m"
                fi
            fi
        fi
    else
        print_error "Service is not running"
        echo "Last exit status: $(systemctl show "${SERVICE_NAME}" -p ExecMainStatus --value)"
    fi
    # Show recent restarts. The property can come back empty, so the value is
    # validated before the numeric comparison to avoid a
    # "integer expression expected" error from `[`.
    local restarts
    restarts=$(systemctl show "${SERVICE_NAME}" -p NRestarts --value) || restarts=""
    if [[ "$restarts" =~ ^[0-9]+$ ]] && [ "$restarts" -gt 0 ]; then
        print_warning "Service has restarted $restarts times"
    fi
    echo
}
# Application health check
# Probe the local HTTP API (/api/events) and the web root on ${APP_PORT}.
# On a successful API response, print the number of entries under the JSON
# key "events" and the value of "last_updated" ("unknown" if the body cannot
# be parsed by the embedded python3 snippets).
check_application_health() {
    print_section "Application Health"
    local base_url="http://localhost:${APP_PORT}"
    local response events last_updated

    # Test API endpoint. A single request, bounded by the 5 s timeout, serves
    # both as the liveness probe and as the payload source, so the endpoint
    # is not fetched a second time without a timeout.
    if response=$(curl -s -f -m 5 "${base_url}/api/events" 2>/dev/null); then
        print_ok "API endpoint is responding"
        if [ -n "$response" ]; then
            # Get event count
            events=$(printf '%s' "$response" \
                | python3 -c "import sys, json; data=json.load(sys.stdin); print(len(data.get('events', [])))" 2>/dev/null) \
                || events="unknown"
            echo "Calendar events: $events"
            # Check last update time
            last_updated=$(printf '%s' "$response" \
                | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('last_updated', 'unknown'))" 2>/dev/null) \
                || last_updated="unknown"
            echo "Last updated: $last_updated"
        fi
    else
        print_error "API endpoint not responding"
    fi

    # Test web interface
    if curl -s -f -m 5 "${base_url}/" > /dev/null 2>&1; then
        print_ok "Web interface is accessible"
    else
        print_error "Web interface not accessible"
    fi
    echo
}
# Resource usage
# Print system memory and swap usage (from `free -m`), the resident memory
# and CPU share of the service's main process, and the load average.
# Warnings are raised above 90% memory, above 50% swap, and when the process
# exceeds ALERT_THRESHOLD_MEM (MB) or ALERT_THRESHOLD_CPU (percent).
show_resource_usage() {
    print_section "Resource Usage"

    # Memory and swap come from one `free` snapshot. LC_ALL=C keeps the
    # "Mem:"/"Swap:" row labels untranslated.
    local mem_total=0 mem_used=0 swap_total=0 swap_used=0
    read -r mem_total mem_used swap_total swap_used < <(
        LC_ALL=C free -m | awk '
            /^Mem:/  { mt = $2; mu = $3 }
            /^Swap:/ { st = $2; su = $3 }
            END { print mt + 0, mu + 0, st + 0, su + 0 }'
    ) || true
    mem_total=${mem_total:-0}; mem_used=${mem_used:-0}
    swap_total=${swap_total:-0}; swap_used=${swap_used:-0}

    # Memory usage (guarded so unparsable output cannot divide by zero)
    if [ "$mem_total" -gt 0 ]; then
        local mem_percent=$((mem_used * 100 / mem_total))
        echo "System Memory: ${mem_used}/${mem_total} MB (${mem_percent}%)"
        if [ "$mem_percent" -gt 90 ]; then
            print_warning "High memory usage detected"
        fi
    else
        print_warning "Unable to read system memory statistics"
    fi

    # Swap usage
    if [ "$swap_total" -gt 0 ]; then
        local swap_percent=$((swap_used * 100 / swap_total))
        echo "Swap: ${swap_used}/${swap_total} MB (${swap_percent}%)"
        if [ "$swap_percent" -gt 50 ]; then
            print_warning "High swap usage - performance may be degraded"
        fi
    fi

    # Process-specific memory: VmRSS is listed in kB, converted here to MB.
    local pid
    pid=$(systemctl show "${SERVICE_NAME}" -p MainPID --value) || pid=""
    if [ -n "$pid" ] && [ "$pid" != "0" ] && [ -f "/proc/$pid/status" ]; then
        local proc_mem
        proc_mem=$(LC_ALL=C awk '/^VmRSS:/ { printf "%.1f", $2 / 1024 }' \
            "/proc/$pid/status" 2>/dev/null) || proc_mem=""
        if [ -n "$proc_mem" ]; then
            echo "Process Memory: ${proc_mem} MB"
            # Fractional comparison done in awk (already required above)
            # instead of bc.
            if awk -v v="$proc_mem" -v t="$ALERT_THRESHOLD_MEM" \
                'BEGIN { exit !(v + 0 > t + 0) }'; then
                print_warning "Process memory exceeds threshold (${ALERT_THRESHOLD_MEM} MB)"
            fi
        fi
    fi

    # CPU usage
    echo
    echo "CPU Load:"
    local load
    load=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}')
    echo " Load average: $load"

    # Process CPU usage (rough estimate: the %cpu figure reported by ps)
    if [ -n "$pid" ] && [ "$pid" != "0" ] && [ -f "/proc/$pid/stat" ]; then
        local cpu_usage
        cpu_usage=$(LC_ALL=C ps -p "$pid" -o %cpu= 2>/dev/null | tr -d ' ') || cpu_usage=""
        if [ -n "$cpu_usage" ]; then
            echo " Process CPU: ${cpu_usage}%"
            if awk -v v="$cpu_usage" -v t="$ALERT_THRESHOLD_CPU" \
                'BEGIN { exit !(v + 0 > t + 0) }'; then
                print_warning "High CPU usage detected"
            fi
        fi
    fi
    echo
}
# Disk usage
# Print the `df` line for the root filesystem, warn when its usage exceeds
# 80%, and print the on-disk size of the application and log directories when
# they exist.
show_disk_usage() {
    print_section "Disk Usage"

    # Root filesystem. -P selects the POSIX output format, which keeps each
    # filesystem on one line even when the device name is long; without it
    # the percentage column can end up on a different line than expected.
    local disk_usage
    disk_usage=$(df -hP / | tail -1)
    echo "Root filesystem:"
    echo " $disk_usage"

    local disk_percent
    disk_percent=$(df -P / | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }')
    # Validate before the numeric test so unexpected df output cannot raise
    # an "integer expression expected" error.
    if [[ "$disk_percent" =~ ^[0-9]+$ ]] && [ "$disk_percent" -gt 80 ]; then
        print_warning "Disk usage above 80%"
    fi

    # Application directory
    if [ -d "/opt/turmli-calendar" ]; then
        local app_size
        app_size=$(du -sh /opt/turmli-calendar 2>/dev/null | cut -f1)
        echo "Application directory: $app_size"
    fi

    # Log directory
    if [ -d "/var/log/turmli-calendar" ]; then
        local log_size
        log_size=$(du -sh /var/log/turmli-calendar 2>/dev/null | cut -f1)
        echo "Log directory: $log_size"
    fi
    echo
}
# Network statistics
# For every non-loopback interface that is up, print its name, its first
# listed address and the number of established TCP connections whose local
# port is ${APP_PORT}.
show_network_stats() {
    print_section "Network Statistics"

    # Connection count: it does not depend on the interface, so it is
    # computed once. Only the local address column ($4 of `ss -tan`) is
    # matched, anchored at the end, so peer ports and addresses that merely
    # contain ":<port>" are not counted.
    local connections
    connections=$(ss -tan | awk -v port="${APP_PORT}" '
        $1 == "ESTAB" && $4 ~ (":" port "$") { n++ }
        END { print n + 0 }')

    # Network interfaces: `show up` lets ip itself select interfaces that
    # are up, instead of text-matching "UP" anywhere in the line.
    local iface ip_addr
    while read -r iface _; do
        # Names such as "eth0@if5" carry a "@parent" suffix that is not part
        # of the device name.
        iface=${iface%%@*}
        if [ -z "$iface" ] || [ "$iface" = "lo" ]; then
            continue
        fi
        echo "Interface: $iface"
        ip_addr=$(ip -brief addr show "$iface" | awk 'NR == 1 {print $3}')
        echo " IP: $ip_addr"
        echo " Active connections on port ${APP_PORT}: $connections"
    done < <(ip -brief link show up)
    echo
}
# Recent errors
# Print up to ten journal entries of priority "err" for the service unit.
# If journalctl exits non-zero, print "No recent errors" instead; its stderr
# is discarded either way.
show_recent_errors() {
    print_section "Recent Errors (last 10)"
    if ! journalctl -u "${SERVICE_NAME}" -p err -n 10 --no-pager 2>/dev/null; then
        echo "No recent errors"
    fi
    echo
}
# Performance summary
# Run four checks (service active, API reachable, memory <= 90%, root disk
# <= 90%), report each failure, and print one overall status line:
# HEALTHY with no issues, otherwise DEGRADED, or CRITICAL when the service
# itself is not running.
show_performance_summary() {
    print_section "Performance Summary"
    local status="HEALTHY"
    local issues=0

    # Check service
    if ! systemctl is-active --quiet "${SERVICE_NAME}"; then
        status="CRITICAL"
        # Plain assignment rather than ((issues++)): the arithmetic command
        # returns status 1 when the expression evaluates to 0, which under
        # `set -e` terminated the script on the first detected issue.
        issues=$((issues + 1))
        print_error "Service not running"
    fi

    # Check API
    if ! curl -s -f -m 5 "http://localhost:${APP_PORT}/api/events" > /dev/null 2>&1; then
        # Never downgrade an already CRITICAL status.
        [ "$status" = "CRITICAL" ] || status="DEGRADED"
        issues=$((issues + 1))
        print_warning "API not responding"
    fi

    # Check memory (integer percentage; empty when `free` cannot be parsed)
    local mem_percent
    mem_percent=$(LC_ALL=C free -m | awk '/^Mem:/ && $2 > 0 { printf "%d", $3 * 100 / $2 }')
    if [[ "$mem_percent" =~ ^[0-9]+$ ]] && [ "$mem_percent" -gt 90 ]; then
        [ "$status" = "CRITICAL" ] || status="DEGRADED"
        issues=$((issues + 1))
        print_warning "High memory usage"
    fi

    # Check disk (-P keeps the root filesystem record on a single line)
    local disk_percent
    disk_percent=$(df -P / | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }')
    if [[ "$disk_percent" =~ ^[0-9]+$ ]] && [ "$disk_percent" -gt 90 ]; then
        [ "$status" = "CRITICAL" ] || status="DEGRADED"
        issues=$((issues + 1))
        print_warning "High disk usage"
    fi

    echo
    if [ "$issues" -eq 0 ]; then
        print_ok "System Status: $status"
    elif [ "$issues" -lt 2 ]; then
        print_warning "System Status: $status ($issues issue)"
    else
        print_error "System Status: $status ($issues issues)"
    fi
    echo
}
# Continuous monitoring mode
# Loop forever: clear the screen, redraw the header and the service, health,
# resource and summary sections, then wait 30 seconds. Ends only when the
# process is interrupted or a step fails under `set -e`.
monitor_continuous() {
    local step
    while :; do
        clear
        for step in \
            print_header \
            check_service_status \
            check_application_health \
            show_resource_usage \
            show_performance_summary; do
            "$step"
        done
        printf '%s\n' "Press Ctrl+C to exit" "Refreshing in 30 seconds..."
        sleep 30
    done
}
# Log monitoring data
# Append one line to ${LOG_FILE} in the form
#   <timestamp>,mem=<used MB>,cpu=<1-minute load>,api=<UP|DOWN>
# The log directory is created if it does not exist yet.
log_metrics() {
    local timestamp mem_used cpu_load api_status="DOWN"
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    # LC_ALL=C keeps the "Mem:" label and the "load average:" marker
    # untranslated and the load values dot-separated.
    mem_used=$(LC_ALL=C free -m | awk '/^Mem:/ {print $3}')
    # First load-average figure, with the surrounding blanks removed so the
    # field reads "cpu=0.42" rather than "cpu= 0.42".
    cpu_load=$(LC_ALL=C uptime | awk -F'load average:' '{
        split($2, parts, ",")
        gsub(/[ \t]/, "", parts[1])
        print parts[1]
    }')
    if curl -s -f -m 5 "http://localhost:${APP_PORT}/api/events" > /dev/null 2>&1; then
        api_status="UP"
    fi
    # Without the directory the append below fails and, under `set -e`,
    # aborts the script.
    mkdir -p -- "$(dirname -- "$LOG_FILE")"
    echo "$timestamp,mem=$mem_used,cpu=$cpu_load,api=$api_status" >> "$LOG_FILE"
}
# Command line interface
# Dispatch on the first argument. Each command runs its report sections in
# order; any other argument (or none) prints the usage text and exits with
# status 1. Successful commands fall through to `exit 0`.
case "$1" in
    errors)
        print_header
        show_recent_errors
        ;;
    health)
        print_header
        check_application_health
        ;;
    resources)
        for _step in print_header show_resource_usage show_disk_usage; do
            "$_step"
        done
        ;;
    status)
        for _step in \
            print_header \
            check_service_status \
            check_application_health \
            show_performance_summary; do
            "$_step"
        done
        ;;
    full)
        for _step in \
            print_header \
            show_system_info \
            check_service_status \
            check_application_health \
            show_resource_usage \
            show_disk_usage \
            show_network_stats \
            show_recent_errors \
            show_performance_summary; do
            "$_step"
        done
        ;;
    monitor)
        monitor_continuous
        ;;
    log)
        log_metrics
        echo "Metrics logged to $LOG_FILE"
        ;;
    *)
        # Unquoted delimiter: $0 is expanded inside the here-document.
        cat <<EOF
Usage: $0 {status|full|monitor|resources|health|errors|log}

Commands:
 status - Quick status check
 full - Complete system analysis
 monitor - Continuous monitoring (30s refresh)
 resources - Resource usage details
 health - Application health check
 errors - Show recent errors
 log - Log metrics to file

Examples:
 $0 status # Quick status
 $0 monitor # Live monitoring
 $0 full # Full report
EOF
        exit 1
        ;;
esac
exit 0