#!/bin/bash
#
# Turmli Calendar - Raspberry Pi Monitoring Script
# Monitors the health and performance of the deployed application

# Abort the script as soon as an unhandled command fails.
set -e

# Configuration
SERVICE_NAME="turmli-calendar"                   # unit name passed to systemctl/journalctl
APP_PORT="8000"                                  # local HTTP port probed with curl
LOG_FILE="/var/log/turmli-calendar/monitor.log"  # file the `log` command appends to
ALERT_THRESHOLD_MEM=200 # MB
ALERT_THRESHOLD_CPU=80  # Percentage

# Colors (ANSI escape sequences, expanded at print time by the helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # reset / "no color"

# Helper functions

# Print the report banner: a framed title and the current local timestamp,
# followed by an empty line.
# Globals:   CYAN, NC (read)
# Arguments: none
# Outputs:   five lines on stdout
print_header() {
    local rule="════════════════════════════════════════════════"

    # %b expands the \033 escapes stored in the color variables.
    printf '%b\n' "${CYAN}${rule}${NC}"
    printf '%b\n' "${CYAN} Turmli Calendar - System Monitor${NC}"
    printf '%b\n' "${CYAN} $(date '+%Y-%m-%d %H:%M:%S')${NC}"
    printf '%b\n' "${CYAN}${rule}${NC}"
    echo
}

# Print a section heading in blue.
# Globals:   BLUE, NC (read)
# Arguments: $1 - section title (printed literally, escapes are not expanded)
# Outputs:   one line on stdout
print_section() {
    # Only the color variables go through %b; the title is passed via %s so
    # backslashes in it are not interpreted.
    printf '%b━━━ %s ━━━%b\n' "${BLUE}" "$1" "${NC}"
}

# Print a success line prefixed with a green check mark.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message (printed literally, escapes are not expanded)
# Outputs:   one line on stdout
print_ok() {
    printf '%b✓%b %s\n' "${GREEN}" "${NC}" "$1"
}

# Print a warning line prefixed with a yellow warning sign.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message (printed literally, escapes are not expanded)
# Outputs:   one line on stdout
print_warning() {
    printf '%b⚠%b %s\n' "${YELLOW}" "${NC}" "$1"
}

# Print an error line prefixed with a red cross.
# Globals:   RED, NC (read)
# Arguments: $1 - message (printed literally, escapes are not expanded)
# Outputs:   one line on stdout
print_error() {
    printf '%b✗%b %s\n' "${RED}" "${NC}" "$1"
}

# System information

# Print host name, OS, kernel, uptime, CPU model and core count; when the
# `vcgencmd` tool is available also print its temperature reading and warn
# if it reports a throttling value other than 0x0.
# Arguments: none
# Outputs:   report lines on stdout
show_system_info() {
    print_section "System Information"

    # Hostname and OS
    echo "Hostname: $(hostname)"
    echo "OS: $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
    echo "Kernel: $(uname -r)"
    echo "Uptime: $(uptime -p)"

    # CPU info: prefer the "model name" field; if /proc/cpuinfo has none,
    # fall back to the "Model" field (NOTE(review): seen on 64-bit Raspberry
    # Pi kernels - confirm on the target image).
    local cpu_model="" field
    for field in "model name" "Model"; do
        cpu_model=$(awk -v key="$field" '
            index($0, key) == 1 { sub(/^[^:]*:[ \t]*/, ""); print; exit }
        ' /proc/cpuinfo 2>/dev/null) || cpu_model=""
        if [ -n "$cpu_model" ]; then
            break
        fi
    done
    echo "CPU: $cpu_model"
    echo "Cores: $(nproc)"

    # Temperature
    if command -v vcgencmd &> /dev/null; then
        local temp throttled
        temp=$(vcgencmd measure_temp | cut -d'=' -f2)
        echo "Temperature: $temp"

        # Check throttling; an empty reading means the query failed and is
        # not reported as throttling.
        throttled=$(vcgencmd get_throttled | cut -d'=' -f2)
        if [ -n "$throttled" ] && [ "$throttled" != "0x0" ]; then
            print_warning "CPU throttling detected: $throttled"
        fi
    fi
    echo
}

# Service status

# Report whether the systemd unit is active, its main PID and approximate
# process uptime, the last exit status when it is not running, and the
# restart counter when systemd exposes one.
# Globals:   SERVICE_NAME (read)
# Arguments: none
# Outputs:   report lines on stdout
check_service_status() {
    print_section "Service Status"

    if systemctl is-active --quiet "${SERVICE_NAME}"; then
        print_ok "Service is running"

        # Get PID and uptime
        local pid
        pid=$(systemctl show "${SERVICE_NAME}" -p MainPID --value) || pid=""
        if [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
            echo "PID: $pid"

            # Process uptime: uses the modification time of /proc/<pid> as
            # the start timestamp (an approximation).
            if [ -f "/proc/$pid/stat" ]; then
                local start_time current_time elapsed
                if start_time=$(stat -c %Y "/proc/$pid" 2>/dev/null) \
                    && [[ "$start_time" =~ ^[0-9]+$ ]]; then
                    current_time=$(date +%s)
                    elapsed=$((current_time - start_time))
                    echo "Process uptime: $((elapsed / 3600))h $((elapsed % 3600 / 60))m"
                fi
            fi
        fi
    else
        print_error "Service is not running"
        echo "Last exit status: $(systemctl show "${SERVICE_NAME}" -p ExecMainStatus --value)"
    fi

    # Show recent restarts. The property may come back empty (for example
    # on a systemd that does not provide NRestarts), so only compare when
    # the value is a number.
    local restarts
    restarts=$(systemctl show "${SERVICE_NAME}" -p NRestarts --value 2>/dev/null) || restarts=""
    if [[ "$restarts" =~ ^[0-9]+$ ]] && [ "$restarts" -gt 0 ]; then
        print_warning "Service has restarted $restarts times"
    fi
    echo
}

# Application health check

# Probe the application's HTTP endpoints on localhost and report the result.
# The events API is fetched once with a 5 second limit; the same response is
# reused to print the number of events and the `last_updated` value.
# Globals:   APP_PORT (read)
# Arguments: none
# Outputs:   report lines on stdout
check_application_health() {
    print_section "Application Health"

    local base_url="http://localhost:${APP_PORT}"
    local response

    # Test API endpoint (single bounded request; -f makes HTTP errors fail)
    if response=$(curl -s -f -m 5 "${base_url}/api/events" 2>/dev/null); then
        print_ok "API endpoint is responding"

        if [ -n "$response" ]; then
            local events last_updated

            # Get event count; any parse failure is reported as "unknown".
            events=$(printf '%s\n' "$response" | python3 -c "import sys, json; data=json.load(sys.stdin); print(len(data.get('events', [])))" 2>/dev/null) || events="unknown"
            echo "Calendar events: $events"

            # Check last update time
            last_updated=$(printf '%s\n' "$response" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('last_updated', 'unknown'))" 2>/dev/null) || last_updated="unknown"
            echo "Last updated: $last_updated"
        fi
    else
        print_error "API endpoint not responding"
    fi

    # Test web interface
    if curl -s -f -m 5 "${base_url}/" > /dev/null 2>&1; then
        print_ok "Web interface is accessible"
    else
        print_error "Web interface not accessible"
    fi
    echo
}

# Resource usage

# Return 0 when $1 is numerically greater than $2 (decimals allowed).
# Uses awk so the comparison does not depend on `bc` being installed.
_monitor_float_gt() {
    LC_ALL=C awk -v v="$1" -v t="$2" 'BEGIN { exit !(v + 0 > t + 0) }'
}

# Print system memory and swap usage, the service process's resident memory
# and CPU share, and the load average; warn when a value crosses its limit.
# Globals:   SERVICE_NAME, ALERT_THRESHOLD_MEM, ALERT_THRESHOLD_CPU (read)
# Arguments: none
# Outputs:   report lines on stdout
show_resource_usage() {
    print_section "Resource Usage"

    # One `free` call feeds both the memory and the swap section. LC_ALL=C
    # keeps the row labels ("Mem:", "Swap:") independent of the user locale.
    local mem_report
    mem_report=$(LC_ALL=C free -m 2>/dev/null) || mem_report=""

    # Memory usage
    local mem_total mem_used
    mem_total=$(awk '/^Mem:/ { print $2 }' <<< "$mem_report")
    mem_used=$(awk '/^Mem:/ { print $3 }' <<< "$mem_report")
    if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_used" =~ ^[0-9]+$ ]] && ((mem_total > 0)); then
        local mem_percent=$((mem_used * 100 / mem_total))
        echo "System Memory: ${mem_used}/${mem_total} MB (${mem_percent}%)"

        if ((mem_percent > 90)); then
            print_warning "High memory usage detected"
        fi
    fi

    # Swap usage (skipped when no swap is configured or the row is missing)
    local swap_total swap_used
    swap_total=$(awk '/^Swap:/ { print $2 }' <<< "$mem_report")
    swap_used=$(awk '/^Swap:/ { print $3 }' <<< "$mem_report")
    if [[ "$swap_total" =~ ^[0-9]+$ && "$swap_used" =~ ^[0-9]+$ ]] && ((swap_total > 0)); then
        local swap_percent=$((swap_used * 100 / swap_total))
        echo "Swap: ${swap_used}/${swap_total} MB (${swap_percent}%)"

        if ((swap_percent > 50)); then
            print_warning "High swap usage - performance may be degraded"
        fi
    fi

    # Process-specific memory
    local pid
    pid=$(systemctl show "${SERVICE_NAME}" -p MainPID --value 2>/dev/null) || pid=""
    if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
        pid="" # "0" or empty: no main process to inspect
    fi
    if [ -n "$pid" ] && [ -f "/proc/$pid/status" ]; then
        local rss_kb
        rss_kb=$(awk '/^VmRSS:/ { print $2; exit }' "/proc/$pid/status" 2>/dev/null) || rss_kb=""
        if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
            local proc_mem
            proc_mem=$(LC_ALL=C awk -v kb="$rss_kb" 'BEGIN { printf "%.1f", kb / 1024 }')
            echo "Process Memory: ${proc_mem} MB"

            # Compare in kB so rounding for display does not affect the alert.
            if _monitor_float_gt "$rss_kb" "$((ALERT_THRESHOLD_MEM * 1024))"; then
                print_warning "Process memory exceeds threshold (${ALERT_THRESHOLD_MEM} MB)"
            fi
        fi
    fi

    # CPU usage
    echo
    echo "CPU Load:"
    local load
    load=$(LC_ALL=C uptime | awk -F'load average:' '{ print $2 }')
    echo " Load average: $load"

    # Process CPU usage (rough estimate as reported by ps)
    if [ -n "$pid" ] && [ -f "/proc/$pid/stat" ]; then
        local cpu_usage
        cpu_usage=$(LC_ALL=C ps -p "$pid" -o %cpu= 2>/dev/null | tr -d ' ') || cpu_usage=""
        if [[ "$cpu_usage" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
            LC_ALL=C awk -v v="$cpu_usage" 'BEGIN { printf " Process CPU: %.1f%%\n", v }'

            if _monitor_float_gt "$cpu_usage" "$ALERT_THRESHOLD_CPU"; then
                print_warning "High CPU usage detected"
            fi
        fi
    fi
    echo
}

# Disk usage

# Print the root filesystem usage line, warn above 80% usage, and print the
# sizes of the application and log directories when they exist.
# Arguments: none
# Outputs:   report lines on stdout
show_disk_usage() {
    print_section "Disk Usage"

    # Root filesystem
    local disk_usage
    disk_usage=$(df -h / | tail -1)
    echo "Root filesystem:"
    echo " $disk_usage"

    # -P forces the one-line-per-filesystem POSIX layout, so a long device
    # name cannot wrap and shift the percentage out of column 5.
    local disk_percent
    disk_percent=$(df -P / 2>/dev/null | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }')
    if [[ "$disk_percent" =~ ^[0-9]+$ ]] && ((disk_percent > 80)); then
        print_warning "Disk usage above 80%"
    fi

    # Application directory
    if [ -d "/opt/turmli-calendar" ]; then
        local app_size
        app_size=$(du -sh /opt/turmli-calendar 2>/dev/null | cut -f1)
        echo "Application directory: $app_size"
    fi

    # Log directory
    if [ -d "/var/log/turmli-calendar" ]; then
        local log_size
        log_size=$(du -sh /var/log/turmli-calendar 2>/dev/null | cut -f1)
        echo "Log directory: $log_size"
    fi
    echo
}

# Network statistics

# For every non-loopback interface whose `ip -brief link` line contains
# "UP", print its first address and the number of established TCP
# connections whose local port is the application port.
# Globals:   APP_PORT (read)
# Arguments: none
# Outputs:   report lines on stdout
show_network_stats() {
    print_section "Network Statistics"

    # Connection count: the figure is not per interface, so compute it once.
    # Matching the exact port at the end of the local-address column avoids
    # counting longer ports (e.g. :80001) or connections where only the
    # peer side uses the port.
    local connections
    connections=$(ss -tan 2>/dev/null | awk -v port="$APP_PORT" '
        $1 == "ESTAB" && $4 ~ (":" port "$") { n++ }
        END { print n + 0 }
    ')

    # Network interfaces
    local iface addr
    while read -r iface; do
        # Links may be listed as "name@parent"; `ip addr show` needs the
        # bare name.
        iface=${iface%%@*}
        if [[ -z "$iface" || "$iface" == "lo" ]]; then
            continue
        fi

        echo "Interface: $iface"
        addr=$(ip -brief addr show "$iface" | awk '{ print $3 }')
        echo " IP: $addr"
        echo " Active connections on port ${APP_PORT}: $connections"
    done < <(ip -brief link show | awk '/UP/ { print $1 }')
    echo
}

# Recent errors

# Print up to ten journal entries of priority "err" or higher for the
# service, or "No recent errors" when the journal returns nothing.
# Globals:   SERVICE_NAME (read)
# Arguments: none
# Outputs:   journal lines or a placeholder on stdout
show_recent_errors() {
    print_section "Recent Errors (last 10)"

    if ! command -v journalctl > /dev/null 2>&1; then
        # Without journalctl the absence of output says nothing about errors.
        print_warning "journalctl not available - cannot read service journal"
        echo
        return 0
    fi

    local entries
    entries=$(journalctl -u "${SERVICE_NAME}" -p err -n 10 --no-pager 2>/dev/null) || true

    # journalctl prints "-- No entries --" when nothing matches; report that
    # case the same way as an empty result.
    if [ -z "$entries" ] || [ "$entries" = "-- No entries --" ]; then
        echo "No recent errors"
    else
        printf '%s\n' "$entries"
    fi
    echo
}

# Performance summary

# Run four checks (service active, API reachable, memory <= 90%, root disk
# <= 90%), print a line per failed check and a final overall status.
# A stopped service sets the status to CRITICAL; the other failures set
# DEGRADED but never downgrade an existing CRITICAL.
# Globals:   SERVICE_NAME, APP_PORT (read)
# Arguments: none
# Outputs:   report lines on stdout
show_performance_summary() {
    print_section "Performance Summary"

    local status="HEALTHY"
    local issues=0

    # Check service
    if ! systemctl is-active --quiet "${SERVICE_NAME}"; then
        status="CRITICAL"
        # Plain assignment instead of ((issues++)): the latter returns a
        # non-zero status when the old value is 0 and would trip `set -e`.
        issues=$((issues + 1))
        print_error "Service not running"
    fi

    # Check API
    if ! curl -s -f -m 5 "http://localhost:${APP_PORT}/api/events" > /dev/null 2>&1; then
        if [ "$status" != "CRITICAL" ]; then
            status="DEGRADED"
        fi
        issues=$((issues + 1))
        print_warning "API not responding"
    fi

    # Check memory (integer percentage; skipped when the total is missing
    # or zero so awk never divides by zero)
    local mem_percent
    mem_percent=$(LC_ALL=C free -m 2>/dev/null \
        | awk '/^Mem:/ && $2 > 0 { printf "%d", $3 * 100 / $2 }') || mem_percent=""
    if [[ "$mem_percent" =~ ^[0-9]+$ ]] && ((mem_percent > 90)); then
        if [ "$status" != "CRITICAL" ]; then
            status="DEGRADED"
        fi
        issues=$((issues + 1))
        print_warning "High memory usage"
    fi

    # Check disk (-P keeps each filesystem on a single line)
    local disk_percent
    disk_percent=$(df -P / 2>/dev/null \
        | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }') || disk_percent=""
    if [[ "$disk_percent" =~ ^[0-9]+$ ]] && ((disk_percent > 90)); then
        if [ "$status" != "CRITICAL" ]; then
            status="DEGRADED"
        fi
        issues=$((issues + 1))
        print_warning "High disk usage"
    fi

    echo
    if [ "$issues" -eq 0 ]; then
        print_ok "System Status: $status"
    elif [ "$issues" -lt 2 ]; then
        print_warning "System Status: $status ($issues issue)"
    else
        print_error "System Status: $status ($issues issues)"
    fi
    echo
}

# Continuous monitoring mode

# Redraw the status, health, resource and summary sections in an endless
# loop until the process is interrupted.
# Arguments: $1 - optional pause between refreshes in seconds (default 30)
# Outputs:   the report sections followed by two hint lines, per iteration
monitor_continuous() {
    local interval="${1:-30}"

    while true; do
        # `clear` can fail (for example with no usable TERM); that must not
        # end the monitor under `set -e`.
        clear 2> /dev/null || true
        print_header
        check_service_status
        check_application_health
        show_resource_usage
        show_performance_summary

        echo "Press Ctrl+C to exit"
        echo "Refreshing in ${interval} seconds..."
        sleep "$interval"
    done
}

# Log monitoring data

# Append one line of the form
#   <timestamp>,mem=<used MB>,cpu=<1-minute load>,api=<UP|DOWN>
# to the metrics log, creating the log directory first if necessary.
# Globals:   LOG_FILE, APP_PORT (read)
# Arguments: none
# Returns:   non-zero when the log directory cannot be created or the file
#            cannot be written
log_metrics() {
    local timestamp mem_used cpu_load log_dir
    local api_status="DOWN"

    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    mem_used=$(LC_ALL=C free -m | awk '/^Mem:/ { print $3 }')

    # First load-average figure with surrounding blanks removed. LC_ALL=C
    # keeps "." as the decimal separator so splitting on "," is safe.
    cpu_load=$(LC_ALL=C uptime | awk -F'load average:' '{
        split($2, avg, ",")
        gsub(/[ \t]/, "", avg[1])
        print avg[1]
    }')

    if curl -s -f -m 5 "http://localhost:${APP_PORT}/api/events" > /dev/null 2>&1; then
        api_status="UP"
    fi

    log_dir=$(dirname -- "$LOG_FILE")
    mkdir -p -- "$log_dir" || return 1
    echo "$timestamp,mem=$mem_used,cpu=$cpu_load,api=$api_status" >> "$LOG_FILE"
}

# Command line interface
#
# Dispatch on the first argument. Each command prints the banner (except
# `monitor`, which prints it on every refresh, and `log`) and then runs the
# listed report sections in order. Any other value, including no argument,
# prints the usage text and exits with status 1.
case "${1:-}" in
    status)
        print_header
        check_service_status
        check_application_health
        show_performance_summary
        ;;
    full)
        print_header
        show_system_info
        check_service_status
        check_application_health
        show_resource_usage
        show_disk_usage
        show_network_stats
        show_recent_errors
        show_performance_summary
        ;;
    monitor)
        monitor_continuous
        ;;
    resources)
        print_header
        show_resource_usage
        show_disk_usage
        ;;
    health)
        print_header
        check_application_health
        ;;
    errors)
        print_header
        show_recent_errors
        ;;
    log)
        log_metrics
        echo "Metrics logged to $LOG_FILE"
        ;;
    *)
        echo "Usage: $0 {status|full|monitor|resources|health|errors|log}"
        echo
        echo "Commands:"
        echo " status - Quick status check"
        echo " full - Complete system analysis"
        echo " monitor - Continuous monitoring (30s refresh)"
        echo " resources - Resource usage details"
        echo " health - Application health check"
        echo " errors - Show recent errors"
        echo " log - Log metrics to file"
        echo
        echo "Examples:"
        echo " $0 status # Quick status"
        echo " $0 monitor # Live monitoring"
        echo " $0 full # Full report"
        exit 1
        ;;
esac

exit 0