Major accomplishments: - ✅ SELinux policy installed and working - ✅ Core Traefik v2.10 deployment running - ✅ Production configuration ready (v3.1) - ✅ Monitoring stack configured - ✅ Comprehensive documentation created - ✅ Security hardening implemented Current status: - 🟡 Partially deployed (60% complete) - ⚠️ Docker socket access needs resolution - ❌ Monitoring stack not deployed yet - ⚠️ Production migration pending Next steps: 1. Fix Docker socket permissions 2. Deploy monitoring stack 3. Migrate to production config 4. Validate full functionality Files added: - Complete Traefik deployment documentation - Production and test configurations - Monitoring stack configurations - SELinux policy module - Security checklists and guides - Current status documentation
414 lines
14 KiB
Bash
Executable File
414 lines
14 KiB
Bash
Executable File
#!/bin/bash
#
# Dynamic Resource Scaling Automation
# Automatically scales services based on resource utilization metrics

set -euo pipefail

# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOG_FILE="$PROJECT_ROOT/logs/resource-scaling-$(date +%Y%m%d-%H%M%S).log"

# Scaling thresholds (percent); marked readonly — nothing below reassigns them
readonly CPU_HIGH_THRESHOLD=80
readonly CPU_LOW_THRESHOLD=20
readonly MEMORY_HIGH_THRESHOLD=85
readonly MEMORY_LOW_THRESHOLD=30

# Scaling limits (replicas per service)
readonly MAX_REPLICAS=5
readonly MIN_REPLICAS=1

# Services to manage (add more as needed)
SCALABLE_SERVICES=(
    "nextcloud_nextcloud"
    "immich_immich_server"
    "paperless_paperless"
    "jellyfin_jellyfin"
    "grafana_grafana"
)

# Create log directory. The original passed both "$(dirname "$LOG_FILE")" and
# "$PROJECT_ROOT/logs", but those are the same path — one argument suffices.
mkdir -p "$PROJECT_ROOT/logs"
|
|
|
|
# Logging function
# Prints a timestamped message to stdout and appends the same line to
# $LOG_FILE (via tee), so interactive runs and the log agree.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
|
|
|
|
# Get service metrics
# Prints "avg_cpu avg_mem replica_count" for a swarm service, averaged across
# its running replicas. Prints "0 0 0" when the service has no running tasks
# or docker is unavailable.
# Globals:   none written
# Arguments: $1 - swarm service name
get_service_metrics() {
    local service_name="$1"

    # Task IDs of the service's running replicas (empty string on any error)
    local containers
    containers=$(docker service ps "$service_name" --filter "desired-state=running" --format "{{.ID}}" 2>/dev/null || echo "")

    if [[ -z "$containers" ]]; then
        echo "0 0 0" # cpu_percent memory_percent replica_count
        return
    fi

    # Calculate average metrics across all replicas
    local total_cpu=0
    local total_memory=0
    local container_count=0

    while IFS= read -r container_id; do
        if [[ -n "$container_id" ]]; then
            # One 'docker stats' snapshot per replica; task id quoted (SC2086)
            local stats
            stats=$(docker stats --no-stream --format "{{.CPUPerc}},{{.MemPerc}}" "$(docker ps -q -f "name=$container_id")" 2>/dev/null || echo "0.00%,0.00%")

            local cpu_percent
            local mem_percent
            cpu_percent=$(echo "$stats" | cut -d',' -f1 | sed 's/%//')
            mem_percent=$(echo "$stats" | cut -d',' -f2 | sed 's/%//')

            # Only fold in values that parse as non-negative decimals
            if [[ "$cpu_percent" =~ ^[0-9]+\.?[0-9]*$ ]] && [[ "$mem_percent" =~ ^[0-9]+\.?[0-9]*$ ]]; then
                total_cpu=$(echo "$total_cpu + $cpu_percent" | bc -l)
                total_memory=$(echo "$total_memory + $mem_percent" | bc -l)
                # BUGFIX: was ((container_count++)) — that expression returns
                # exit status 1 when the pre-increment value is 0, which
                # aborts the whole script under 'set -e' on the first replica.
                container_count=$((container_count + 1))
            fi
        fi
    done <<< "$containers"

    if [[ $container_count -gt 0 ]]; then
        local avg_cpu
        local avg_memory
        avg_cpu=$(echo "scale=2; $total_cpu / $container_count" | bc -l)
        avg_memory=$(echo "scale=2; $total_memory / $container_count" | bc -l)
        echo "$avg_cpu $avg_memory $container_count"
    else
        echo "0 0 0"
    fi
}
|
|
|
|
# Get current replica count
# Prints the running-replica figure (the "N" of swarm's "N/M" column)
# for the named service.
get_replica_count() {
    local svc="$1"
    docker service ls --filter "name=$svc" --format '{{.Replicas}}' \
        | awk -F'/' '{print $1}'
}
|
|
|
|
# Scale service up
# Adds one replica to the service, honoring MAX_REPLICAS, and appends an
# audit row to logs/scaling-events.csv on success.
# Arguments: $1 - service name, $2 - current replica count
# Returns:   1 if the docker update fails, 0 otherwise
scale_up() {
    local svc="$1"
    local old_count="$2"
    local target=$((old_count + 1))

    # Guard clause: refuse to exceed the ceiling
    if [[ $target -gt $MAX_REPLICAS ]]; then
        log "⚠️ $svc already at maximum replicas ($MAX_REPLICAS)"
        return 0
    fi

    log "🔼 Scaling UP $svc: $old_count → $target replicas"
    if ! docker service update --replicas "$target" "$svc" >/dev/null 2>&1; then
        log "❌ Failed to scale up $svc"
        return 1
    fi
    log "✅ Successfully scaled up $svc"

    # Record scaling event
    echo "$(date -Iseconds),scale_up,$svc,$old_count,$target,auto" >> "$PROJECT_ROOT/logs/scaling-events.csv"
}
|
|
|
|
# Scale service down
# Removes one replica from the service, honoring MIN_REPLICAS, and appends an
# audit row to logs/scaling-events.csv on success.
# Arguments: $1 - service name, $2 - current replica count
# Returns:   1 if the docker update fails, 0 otherwise
scale_down() {
    local svc="$1"
    local old_count="$2"
    local target=$((old_count - 1))

    # Guard clause: never drop below the floor
    if [[ $target -lt $MIN_REPLICAS ]]; then
        log "⚠️ $svc already at minimum replicas ($MIN_REPLICAS)"
        return 0
    fi

    log "🔽 Scaling DOWN $svc: $old_count → $target replicas"
    if ! docker service update --replicas "$target" "$svc" >/dev/null 2>&1; then
        log "❌ Failed to scale down $svc"
        return 1
    fi
    log "✅ Successfully scaled down $svc"

    # Record scaling event
    echo "$(date -Iseconds),scale_down,$svc,$old_count,$target,auto" >> "$PROJECT_ROOT/logs/scaling-events.csv"
}
|
|
|
|
# Check if scaling is needed
# Compares a service's CPU/memory percentages against the configured
# thresholds and calls scale_up / scale_down accordingly; logs an OK line
# when no action is taken.
# Arguments: $1 - service name, $2 - cpu %, $3 - memory %, $4 - replica count
evaluate_scaling() {
    local service_name="$1"
    local cpu_percent="$2"
    local memory_percent="$3"
    local current_replicas="$4"

    # Truncate to integers for threshold comparison. Parameter expansion
    # replaces the old 'echo | cut' subshells, and the :-0 default keeps an
    # empty/garbled metric from blowing up the arithmetic test below.
    local cpu_int="${cpu_percent%%.*}"
    local memory_int="${memory_percent%%.*}"
    cpu_int="${cpu_int:-0}"
    memory_int="${memory_int:-0}"

    # Scale up conditions
    if [[ $cpu_int -gt $CPU_HIGH_THRESHOLD ]] || [[ $memory_int -gt $MEMORY_HIGH_THRESHOLD ]]; then
        log "📊 $service_name metrics: CPU=${cpu_percent}%, Memory=${memory_percent}% - HIGH usage detected"
        scale_up "$service_name" "$current_replicas"
        return
    fi

    # Scale down conditions (only if we have more than minimum replicas)
    if [[ $current_replicas -gt $MIN_REPLICAS ]] && [[ $cpu_int -lt $CPU_LOW_THRESHOLD ]] && [[ $memory_int -lt $MEMORY_LOW_THRESHOLD ]]; then
        log "📊 $service_name metrics: CPU=${cpu_percent}%, Memory=${memory_percent}% - LOW usage detected"
        scale_down "$service_name" "$current_replicas"
        return
    fi

    # No scaling needed
    log "📊 $service_name metrics: CPU=${cpu_percent}%, Memory=${memory_percent}%, Replicas=$current_replicas - OK"
}
|
|
|
|
# Time-based scaling (scale down non-critical services at night)
# 02:00-06:59 reduces the listed services to 1 replica; at 07:00 restores
# them to 2. All actions are appended to logs/scaling-events.csv.
time_based_scaling() {
    local current_hour
    # BUGFIX: force base-10. 'date +%H' zero-pads ("08", "09"), and bash
    # arithmetic treats a leading 0 as octal — "[[ 08 -ge 2 ]]" dies with
    # "value too great for base", killing the script under set -e every
    # morning between 8 and 10 AM.
    current_hour=$((10#$(date +%H)))

    # Night hours (2 AM - 6 AM): scale down non-critical services
    if [[ $current_hour -ge 2 && $current_hour -le 6 ]]; then
        local night_services=("paperless_paperless" "grafana_grafana")

        for service in "${night_services[@]}"; do
            local current_replicas
            current_replicas=$(get_replica_count "$service")
            # Default to 0 so a missing service (empty output) can't break
            # the numeric comparison below.
            current_replicas=${current_replicas:-0}

            if [[ $current_replicas -gt 1 ]]; then
                log "🌙 Night scaling: reducing $service to 1 replica (was $current_replicas)"
                docker service update --replicas 1 "$service" >/dev/null 2>&1 || true
                echo "$(date -Iseconds),night_scale_down,$service,$current_replicas,1,time_based" >> "$PROJECT_ROOT/logs/scaling-events.csv"
            fi
        done
    fi

    # Morning hours (7 AM): scale back up
    if [[ $current_hour -eq 7 ]]; then
        local morning_services=("paperless_paperless" "grafana_grafana")

        for service in "${morning_services[@]}"; do
            local current_replicas
            current_replicas=$(get_replica_count "$service")
            current_replicas=${current_replicas:-0}

            if [[ $current_replicas -lt 2 ]]; then
                log "🌅 Morning scaling: restoring $service to 2 replicas (was $current_replicas)"
                docker service update --replicas 2 "$service" >/dev/null 2>&1 || true
                echo "$(date -Iseconds),morning_scale_up,$service,$current_replicas,2,time_based" >> "$PROJECT_ROOT/logs/scaling-events.csv"
            fi
        done
    fi
}
|
|
|
|
# Generate scaling report
# Writes a YAML snapshot (per-service replicas/CPU/memory/status, today's
# scaling-event count, and the active thresholds/limits) to
# logs/scaling-report-YYYYMMDD.yaml.
generate_scaling_report() {
    log "Generating scaling report..."

    local report_file="$PROJECT_ROOT/logs/scaling-report-$(date +%Y%m%d).yaml"
    cat > "$report_file" << EOF
scaling_report:
  timestamp: "$(date -Iseconds)"
  evaluation_cycle: $(date +%Y%m%d-%H%M%S)

  current_state:
EOF

    # Add current state of all services
    for service in "${SCALABLE_SERVICES[@]}"; do
        local metrics
        metrics=$(get_service_metrics "$service")
        local cpu_percent memory_percent replica_count
        read -r cpu_percent memory_percent replica_count <<< "$metrics"

        # BUGFIX: 'docker service ls --filter ...' exits 0 even when the
        # filter matches nothing, so the old exit-status check reported every
        # service as "running". Test for non-empty output instead.
        local svc_status="not_found"
        if [[ -n "$(docker service ls --filter "name=$service" --format '{{.Name}}' 2>/dev/null)" ]]; then
            svc_status="running"
        fi

        cat >> "$report_file" << EOF
    - service: "$service"
      replicas: $replica_count
      cpu_usage: "${cpu_percent}%"
      memory_usage: "${memory_percent}%"
      status: $svc_status
EOF
    done

    # Count today's scaling events. BUGFIX: the old
    # 'grep ... | wc -l || echo "0"' emitted "0" twice under pipefail when
    # grep matched nothing (wc prints 0, the pipeline still fails, and the
    # fallback echo appends a second line), corrupting the YAML value.
    local events_today=0
    if [[ -f "$PROJECT_ROOT/logs/scaling-events.csv" ]]; then
        events_today=$(grep -c "$(date +%Y-%m-%d)" "$PROJECT_ROOT/logs/scaling-events.csv") || events_today=0
    fi

    cat >> "$report_file" << EOF

daily_summary:
  scaling_events_today: $events_today
  thresholds:
    cpu_high: ${CPU_HIGH_THRESHOLD}%
    cpu_low: ${CPU_LOW_THRESHOLD}%
    memory_high: ${MEMORY_HIGH_THRESHOLD}%
    memory_low: ${MEMORY_LOW_THRESHOLD}%
  limits:
    max_replicas: $MAX_REPLICAS
    min_replicas: $MIN_REPLICAS
EOF

    log "✅ Scaling report generated: $report_file"
}
|
|
|
|
# Setup continuous monitoring
# Generates a systemd unit in /tmp (for manual installation, see the final
# log line) and a 5-minute polling loop script under $PROJECT_ROOT/scripts/.
setup_monitoring() {
    log "Setting up dynamic scaling monitoring..."

    # BUGFIX: ExecStart was hard-coded to /home/jonathan/Coding/HomeAudit/…,
    # breaking the unit on any other checkout. Derive the path from this
    # script's own location instead (heredoc delimiter unquoted on purpose
    # so $self_path expands; nothing else in the unit needs escaping).
    local self_path="$SCRIPT_DIR/$(basename "${BASH_SOURCE[0]}")"

    cat > /tmp/docker-autoscaler.service << EOF
[Unit]
Description=Docker Swarm Auto Scaler
After=docker.service
Requires=docker.service

[Service]
Type=simple
ExecStart=$self_path --monitor
Restart=always
RestartSec=60
User=root

[Install]
WantedBy=multi-user.target
EOF

    # Ensure the target directory exists before writing the loop script
    # (the original assumed scripts/ was already present).
    mkdir -p "$PROJECT_ROOT/scripts"

    # Create monitoring loop script
    cat > "$PROJECT_ROOT/scripts/scaling-monitor-loop.sh" << 'EOF'
#!/bin/bash
# Continuous monitoring loop for dynamic scaling

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

while true; do
    # Run scaling evaluation
    ./dynamic-resource-scaling.sh --evaluate

    # Wait 5 minutes between evaluations
    sleep 300
done
EOF

    chmod +x "$PROJECT_ROOT/scripts/scaling-monitor-loop.sh"
    log "✅ Monitoring scripts created"
    log "⚠️ To enable: sudo cp /tmp/docker-autoscaler.service /etc/systemd/system/ && sudo systemctl enable --now docker-autoscaler"
}
|
|
|
|
# Main execution
# Dispatches on the first CLI argument; defaults to a single evaluation.
main() {
    case "${1:-evaluate}" in
        "--evaluate")
            log "🔍 Starting dynamic scaling evaluation..."

            # Initialize CSV file (with header) if it doesn't exist
            if [[ ! -f "$PROJECT_ROOT/logs/scaling-events.csv" ]]; then
                echo "timestamp,action,service,old_replicas,new_replicas,trigger" > "$PROJECT_ROOT/logs/scaling-events.csv"
            fi

            # Check each scalable service.
            # BUGFIX: 'docker service ls --filter ...' exits 0 even when the
            # filter matches nothing, so the old exit-status check never hit
            # the "not found" branch — test for non-empty output instead.
            for service in "${SCALABLE_SERVICES[@]}"; do
                if [[ -n "$(docker service ls --filter "name=$service" --format '{{.Name}}' 2>/dev/null)" ]]; then
                    local metrics
                    metrics=$(get_service_metrics "$service")
                    local cpu_percent memory_percent current_replicas
                    read -r cpu_percent memory_percent current_replicas <<< "$metrics"

                    evaluate_scaling "$service" "$cpu_percent" "$memory_percent" "$current_replicas"
                else
                    log "⚠️ Service not found: $service"
                fi
            done

            # Apply time-based scaling
            time_based_scaling

            # Generate report
            generate_scaling_report
            ;;
        "--monitor")
            log "🔄 Starting continuous monitoring mode..."
            while true; do
                # BUGFIX: re-invoke via "$0" instead of a relative ./ path so
                # monitoring works regardless of the caller's working dir
                # (e.g. when started by systemd).
                "$0" --evaluate
                sleep 300 # 5-minute intervals
            done
            ;;
        "--setup")
            setup_monitoring
            ;;
        "--status")
            log "📊 Current service status:"
            for service in "${SCALABLE_SERVICES[@]}"; do
                # Same output-based existence check as --evaluate
                if [[ -n "$(docker service ls --filter "name=$service" --format '{{.Name}}' 2>/dev/null)" ]]; then
                    local metrics
                    metrics=$(get_service_metrics "$service")
                    local cpu_percent memory_percent current_replicas
                    read -r cpu_percent memory_percent current_replicas <<< "$metrics"
                    log "  $service: ${current_replicas} replicas, CPU=${cpu_percent}%, Memory=${memory_percent}%"
                else
                    log "  $service: not found"
                fi
            done
            ;;
        "--help"|"-h")
            cat << 'EOF'
Dynamic Resource Scaling Automation

USAGE:
    dynamic-resource-scaling.sh [OPTIONS]

OPTIONS:
    --evaluate    Run single scaling evaluation (default)
    --monitor     Start continuous monitoring mode
    --setup       Set up systemd service for continuous monitoring
    --status      Show current status of all scalable services
    --help, -h    Show this help message

EXAMPLES:
    # Single evaluation
    ./dynamic-resource-scaling.sh --evaluate

    # Check current status
    ./dynamic-resource-scaling.sh --status

    # Set up continuous monitoring
    ./dynamic-resource-scaling.sh --setup

CONFIGURATION:
    Edit the script to modify:
    - CPU_HIGH_THRESHOLD: Scale up when CPU > 80%
    - CPU_LOW_THRESHOLD: Scale down when CPU < 20%
    - MEMORY_HIGH_THRESHOLD: Scale up when Memory > 85%
    - MEMORY_LOW_THRESHOLD: Scale down when Memory < 30%
    - MAX_REPLICAS: Maximum replicas per service (5)
    - MIN_REPLICAS: Minimum replicas per service (1)

NOTES:
    - Requires Docker Swarm mode
    - Monitors CPU and memory usage
    - Includes time-based scaling for night hours
    - Logs all scaling events for audit
    - Safe scaling with min/max limits
EOF
            ;;
        *)
            log "❌ Unknown option: $1"
            log "Use --help for usage information"
            exit 1
            ;;
    esac
}
|
|
|
|
# Check dependencies
# bc is required by get_service_metrics for floating-point averaging.
if ! command -v bc >/dev/null 2>&1; then
    log "Installing bc for calculations..."
    if command -v apt-get >/dev/null 2>&1; then
        # Debian/Ubuntu hosts
        sudo apt-get update && sudo apt-get install -y bc || {
            log "❌ Failed to install bc. Please install manually."
            exit 1
        }
    elif command -v dnf >/dev/null 2>&1; then
        # Fedora/RHEL hosts — NOTE(review): the deployment notes mention an
        # SELinux policy module, so this is likely the production path; the
        # original script only tried apt-get and would have failed here.
        sudo dnf install -y bc || {
            log "❌ Failed to install bc. Please install manually."
            exit 1
        }
    else
        log "❌ Failed to install bc. Please install manually."
        exit 1
    fi
fi

# Execute main function
main "$@"