Major accomplishments: - ✅ SELinux policy installed and working - ✅ Core Traefik v2.10 deployment running - ✅ Production configuration ready (v3.1) - ✅ Monitoring stack configured - ✅ Comprehensive documentation created - ✅ Security hardening implemented Current status: - 🟡 Partially deployed (60% complete) - ⚠️ Docker socket access needs resolution - ❌ Monitoring stack not deployed yet - ⚠️ Production migration pending Next steps: 1. Fix Docker socket permissions 2. Deploy monitoring stack 3. Migrate to production config 4. Validate full functionality Files added: - Complete Traefik deployment documentation - Production and test configurations - Monitoring stack configurations - SELinux policy module - Security checklists and guides - Current status documentation
414 lines
14 KiB
Bash
Executable File
414 lines
14 KiB
Bash
Executable File
#!/bin/bash
#
# Dynamic Resource Scaling Automation
# Automatically scales services based on resource utilization metrics

set -euo pipefail

# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOG_FILE="$PROJECT_ROOT/logs/resource-scaling-$(date +%Y%m%d-%H%M%S).log"

# Scaling thresholds (percent); marked readonly — nothing below reassigns them
readonly CPU_HIGH_THRESHOLD=80
readonly CPU_LOW_THRESHOLD=20
readonly MEMORY_HIGH_THRESHOLD=85
readonly MEMORY_LOW_THRESHOLD=30

# Scaling limits (replicas per service)
readonly MAX_REPLICAS=5
readonly MIN_REPLICAS=1

# Services to manage (add more as needed)
SCALABLE_SERVICES=(
    "nextcloud_nextcloud"
    "immich_immich_server"
    "paperless_paperless"
    "jellyfin_jellyfin"
    "grafana_grafana"
)

# Create log directory. The original passed both "$(dirname "$LOG_FILE")" and
# "$PROJECT_ROOT/logs", but those are the same path — one argument suffices.
mkdir -p "$PROJECT_ROOT/logs"
|
|
|
|
# Logging function
# Prints a timestamped message to stdout and appends the same line to
# $LOG_FILE (via tee), so interactive runs and the log agree.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
|
|
|
|
# Get service metrics
# Prints "avg_cpu avg_mem replica_count" for a swarm service, averaged across
# its running replicas. Prints "0 0 0" when the service has no running tasks
# or docker is unavailable.
# Globals:   none written
# Arguments: $1 - swarm service name
get_service_metrics() {
    local service_name="$1"

    # Task IDs of the service's running replicas (empty string on any error)
    local containers
    containers=$(docker service ps "$service_name" --filter "desired-state=running" --format "{{.ID}}" 2>/dev/null || echo "")

    if [[ -z "$containers" ]]; then
        echo "0 0 0" # cpu_percent memory_percent replica_count
        return
    fi

    # Calculate average metrics across all replicas
    local total_cpu=0
    local total_memory=0
    local container_count=0

    while IFS= read -r container_id; do
        if [[ -n "$container_id" ]]; then
            # One 'docker stats' snapshot per replica; task id quoted (SC2086)
            local stats
            stats=$(docker stats --no-stream --format "{{.CPUPerc}},{{.MemPerc}}" "$(docker ps -q -f "name=$container_id")" 2>/dev/null || echo "0.00%,0.00%")

            local cpu_percent
            local mem_percent
            cpu_percent=$(echo "$stats" | cut -d',' -f1 | sed 's/%//')
            mem_percent=$(echo "$stats" | cut -d',' -f2 | sed 's/%//')

            # Only fold in values that parse as non-negative decimals
            if [[ "$cpu_percent" =~ ^[0-9]+\.?[0-9]*$ ]] && [[ "$mem_percent" =~ ^[0-9]+\.?[0-9]*$ ]]; then
                total_cpu=$(echo "$total_cpu + $cpu_percent" | bc -l)
                total_memory=$(echo "$total_memory + $mem_percent" | bc -l)
                # BUGFIX: was ((container_count++)) — that expression returns
                # exit status 1 when the pre-increment value is 0, which
                # aborts the whole script under 'set -e' on the first replica.
                container_count=$((container_count + 1))
            fi
        fi
    done <<< "$containers"

    if [[ $container_count -gt 0 ]]; then
        local avg_cpu
        local avg_memory
        avg_cpu=$(echo "scale=2; $total_cpu / $container_count" | bc -l)
        avg_memory=$(echo "scale=2; $total_memory / $container_count" | bc -l)
        echo "$avg_cpu $avg_memory $container_count"
    else
        echo "0 0 0"
    fi
}
|
|
|
|
# Get current replica count
# Prints the running-replica figure (the "N" of swarm's "N/M" column)
# for the named service.
get_replica_count() {
    local svc="$1"
    docker service ls --filter "name=$svc" --format '{{.Replicas}}' \
        | awk -F'/' '{print $1}'
}
|
|
|
|
# Scale service up
# Adds one replica to the service, honoring MAX_REPLICAS, and appends an
# audit row to logs/scaling-events.csv on success.
# Arguments: $1 - service name, $2 - current replica count
# Returns:   1 if the docker update fails, 0 otherwise
scale_up() {
    local svc="$1"
    local old_count="$2"
    local target=$((old_count + 1))

    # Guard clause: refuse to exceed the ceiling
    if [[ $target -gt $MAX_REPLICAS ]]; then
        log "⚠️ $svc already at maximum replicas ($MAX_REPLICAS)"
        return 0
    fi

    log "🔼 Scaling UP $svc: $old_count → $target replicas"
    if ! docker service update --replicas "$target" "$svc" >/dev/null 2>&1; then
        log "❌ Failed to scale up $svc"
        return 1
    fi
    log "✅ Successfully scaled up $svc"

    # Record scaling event
    echo "$(date -Iseconds),scale_up,$svc,$old_count,$target,auto" >> "$PROJECT_ROOT/logs/scaling-events.csv"
}
|
|
|
|
# Scale service down
# Removes one replica from the service, honoring MIN_REPLICAS, and appends an
# audit row to logs/scaling-events.csv on success.
# Arguments: $1 - service name, $2 - current replica count
# Returns:   1 if the docker update fails, 0 otherwise
scale_down() {
    local svc="$1"
    local old_count="$2"
    local target=$((old_count - 1))

    # Guard clause: never drop below the floor
    if [[ $target -lt $MIN_REPLICAS ]]; then
        log "⚠️ $svc already at minimum replicas ($MIN_REPLICAS)"
        return 0
    fi

    log "🔽 Scaling DOWN $svc: $old_count → $target replicas"
    if ! docker service update --replicas "$target" "$svc" >/dev/null 2>&1; then
        log "❌ Failed to scale down $svc"
        return 1
    fi
    log "✅ Successfully scaled down $svc"

    # Record scaling event
    echo "$(date -Iseconds),scale_down,$svc,$old_count,$target,auto" >> "$PROJECT_ROOT/logs/scaling-events.csv"
}
|
|
|
|
# Check if scaling is needed
# Compares a service's CPU/memory percentages against the configured
# thresholds and calls scale_up / scale_down accordingly; logs an OK line
# when no action is taken.
# Arguments: $1 - service name, $2 - cpu %, $3 - memory %, $4 - replica count
evaluate_scaling() {
    local service_name="$1"
    local cpu_percent="$2"
    local memory_percent="$3"
    local current_replicas="$4"

    # Truncate to integers for threshold comparison. Parameter expansion
    # replaces the old 'echo | cut' subshells, and the :-0 default keeps an
    # empty/garbled metric from blowing up the arithmetic test below.
    local cpu_int="${cpu_percent%%.*}"
    local memory_int="${memory_percent%%.*}"
    cpu_int="${cpu_int:-0}"
    memory_int="${memory_int:-0}"

    # Scale up conditions
    if [[ $cpu_int -gt $CPU_HIGH_THRESHOLD ]] || [[ $memory_int -gt $MEMORY_HIGH_THRESHOLD ]]; then
        log "📊 $service_name metrics: CPU=${cpu_percent}%, Memory=${memory_percent}% - HIGH usage detected"
        scale_up "$service_name" "$current_replicas"
        return
    fi

    # Scale down conditions (only if we have more than minimum replicas)
    if [[ $current_replicas -gt $MIN_REPLICAS ]] && [[ $cpu_int -lt $CPU_LOW_THRESHOLD ]] && [[ $memory_int -lt $MEMORY_LOW_THRESHOLD ]]; then
        log "📊 $service_name metrics: CPU=${cpu_percent}%, Memory=${memory_percent}% - LOW usage detected"
        scale_down "$service_name" "$current_replicas"
        return
    fi

    # No scaling needed
    log "📊 $service_name metrics: CPU=${cpu_percent}%, Memory=${memory_percent}%, Replicas=$current_replicas - OK"
}
|
|
|
|
# Time-based scaling (scale down non-critical services at night)
# 02:00-06:59 reduces the listed services to 1 replica; at 07:00 restores
# them to 2. All actions are appended to logs/scaling-events.csv.
time_based_scaling() {
    local current_hour
    # BUGFIX: force base-10. 'date +%H' zero-pads ("08", "09"), and bash
    # arithmetic treats a leading 0 as octal — "[[ 08 -ge 2 ]]" dies with
    # "value too great for base", killing the script under set -e every
    # morning between 8 and 10 AM.
    current_hour=$((10#$(date +%H)))

    # Night hours (2 AM - 6 AM): scale down non-critical services
    if [[ $current_hour -ge 2 && $current_hour -le 6 ]]; then
        local night_services=("paperless_paperless" "grafana_grafana")

        for service in "${night_services[@]}"; do
            local current_replicas
            current_replicas=$(get_replica_count "$service")
            # Default to 0 so a missing service (empty output) can't break
            # the numeric comparison below.
            current_replicas=${current_replicas:-0}

            if [[ $current_replicas -gt 1 ]]; then
                log "🌙 Night scaling: reducing $service to 1 replica (was $current_replicas)"
                docker service update --replicas 1 "$service" >/dev/null 2>&1 || true
                echo "$(date -Iseconds),night_scale_down,$service,$current_replicas,1,time_based" >> "$PROJECT_ROOT/logs/scaling-events.csv"
            fi
        done
    fi

    # Morning hours (7 AM): scale back up
    if [[ $current_hour -eq 7 ]]; then
        local morning_services=("paperless_paperless" "grafana_grafana")

        for service in "${morning_services[@]}"; do
            local current_replicas
            current_replicas=$(get_replica_count "$service")
            current_replicas=${current_replicas:-0}

            if [[ $current_replicas -lt 2 ]]; then
                log "🌅 Morning scaling: restoring $service to 2 replicas (was $current_replicas)"
                docker service update --replicas 2 "$service" >/dev/null 2>&1 || true
                echo "$(date -Iseconds),morning_scale_up,$service,$current_replicas,2,time_based" >> "$PROJECT_ROOT/logs/scaling-events.csv"
            fi
        done
    fi
}
|
|
|
|
# Generate scaling report
# Writes a YAML snapshot (per-service replicas/CPU/memory/status, today's
# scaling-event count, and the active thresholds/limits) to
# logs/scaling-report-YYYYMMDD.yaml.
generate_scaling_report() {
    log "Generating scaling report..."

    local report_file="$PROJECT_ROOT/logs/scaling-report-$(date +%Y%m%d).yaml"
    cat > "$report_file" << EOF
scaling_report:
  timestamp: "$(date -Iseconds)"
  evaluation_cycle: $(date +%Y%m%d-%H%M%S)

  current_state:
EOF

    # Add current state of all services
    for service in "${SCALABLE_SERVICES[@]}"; do
        local metrics
        metrics=$(get_service_metrics "$service")
        local cpu_percent memory_percent replica_count
        read -r cpu_percent memory_percent replica_count <<< "$metrics"

        # BUGFIX: 'docker service ls --filter ...' exits 0 even when the
        # filter matches nothing, so the old exit-status check reported every
        # service as "running". Test for non-empty output instead.
        local svc_status="not_found"
        if [[ -n "$(docker service ls --filter "name=$service" --format '{{.Name}}' 2>/dev/null)" ]]; then
            svc_status="running"
        fi

        cat >> "$report_file" << EOF
    - service: "$service"
      replicas: $replica_count
      cpu_usage: "${cpu_percent}%"
      memory_usage: "${memory_percent}%"
      status: $svc_status
EOF
    done

    # Count today's scaling events. BUGFIX: the old
    # 'grep ... | wc -l || echo "0"' emitted "0" twice under pipefail when
    # grep matched nothing (wc prints 0, the pipeline still fails, and the
    # fallback echo appends a second line), corrupting the YAML value.
    local events_today=0
    if [[ -f "$PROJECT_ROOT/logs/scaling-events.csv" ]]; then
        events_today=$(grep -c "$(date +%Y-%m-%d)" "$PROJECT_ROOT/logs/scaling-events.csv") || events_today=0
    fi

    cat >> "$report_file" << EOF

daily_summary:
  scaling_events_today: $events_today
  thresholds:
    cpu_high: ${CPU_HIGH_THRESHOLD}%
    cpu_low: ${CPU_LOW_THRESHOLD}%
    memory_high: ${MEMORY_HIGH_THRESHOLD}%
    memory_low: ${MEMORY_LOW_THRESHOLD}%
  limits:
    max_replicas: $MAX_REPLICAS
    min_replicas: $MIN_REPLICAS
EOF

    log "✅ Scaling report generated: $report_file"
}
|
|
|
|
# Setup continuous monitoring
# Generates a systemd unit in /tmp (for manual installation, see the final
# log line) and a 5-minute polling loop script under $PROJECT_ROOT/scripts/.
setup_monitoring() {
    log "Setting up dynamic scaling monitoring..."

    # BUGFIX: ExecStart was hard-coded to /home/jonathan/Coding/HomeAudit/…,
    # breaking the unit on any other checkout. Derive the path from this
    # script's own location instead (heredoc delimiter unquoted on purpose
    # so $self_path expands; nothing else in the unit needs escaping).
    local self_path="$SCRIPT_DIR/$(basename "${BASH_SOURCE[0]}")"

    cat > /tmp/docker-autoscaler.service << EOF
[Unit]
Description=Docker Swarm Auto Scaler
After=docker.service
Requires=docker.service

[Service]
Type=simple
ExecStart=$self_path --monitor
Restart=always
RestartSec=60
User=root

[Install]
WantedBy=multi-user.target
EOF

    # Ensure the target directory exists before writing the loop script
    # (the original assumed scripts/ was already present).
    mkdir -p "$PROJECT_ROOT/scripts"

    # Create monitoring loop script
    cat > "$PROJECT_ROOT/scripts/scaling-monitor-loop.sh" << 'EOF'
#!/bin/bash
# Continuous monitoring loop for dynamic scaling

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

while true; do
    # Run scaling evaluation
    ./dynamic-resource-scaling.sh --evaluate

    # Wait 5 minutes between evaluations
    sleep 300
done
EOF

    chmod +x "$PROJECT_ROOT/scripts/scaling-monitor-loop.sh"
    log "✅ Monitoring scripts created"
    log "⚠️ To enable: sudo cp /tmp/docker-autoscaler.service /etc/systemd/system/ && sudo systemctl enable --now docker-autoscaler"
}
|
|
|
|
# Main execution
# Dispatches on the first CLI argument; defaults to a single evaluation.
main() {
    case "${1:-evaluate}" in
        "--evaluate")
            log "🔍 Starting dynamic scaling evaluation..."

            # Initialize CSV file (with header) if it doesn't exist
            if [[ ! -f "$PROJECT_ROOT/logs/scaling-events.csv" ]]; then
                echo "timestamp,action,service,old_replicas,new_replicas,trigger" > "$PROJECT_ROOT/logs/scaling-events.csv"
            fi

            # Check each scalable service.
            # BUGFIX: 'docker service ls --filter ...' exits 0 even when the
            # filter matches nothing, so the old exit-status check never hit
            # the "not found" branch — test for non-empty output instead.
            for service in "${SCALABLE_SERVICES[@]}"; do
                if [[ -n "$(docker service ls --filter "name=$service" --format '{{.Name}}' 2>/dev/null)" ]]; then
                    local metrics
                    metrics=$(get_service_metrics "$service")
                    local cpu_percent memory_percent current_replicas
                    read -r cpu_percent memory_percent current_replicas <<< "$metrics"

                    evaluate_scaling "$service" "$cpu_percent" "$memory_percent" "$current_replicas"
                else
                    log "⚠️ Service not found: $service"
                fi
            done

            # Apply time-based scaling
            time_based_scaling

            # Generate report
            generate_scaling_report
            ;;
        "--monitor")
            log "🔄 Starting continuous monitoring mode..."
            while true; do
                # BUGFIX: re-invoke via "$0" instead of a relative ./ path so
                # monitoring works regardless of the caller's working dir
                # (e.g. when started by systemd).
                "$0" --evaluate
                sleep 300 # 5-minute intervals
            done
            ;;
        "--setup")
            setup_monitoring
            ;;
        "--status")
            log "📊 Current service status:"
            for service in "${SCALABLE_SERVICES[@]}"; do
                # Same output-based existence check as --evaluate
                if [[ -n "$(docker service ls --filter "name=$service" --format '{{.Name}}' 2>/dev/null)" ]]; then
                    local metrics
                    metrics=$(get_service_metrics "$service")
                    local cpu_percent memory_percent current_replicas
                    read -r cpu_percent memory_percent current_replicas <<< "$metrics"
                    log "  $service: ${current_replicas} replicas, CPU=${cpu_percent}%, Memory=${memory_percent}%"
                else
                    log "  $service: not found"
                fi
            done
            ;;
        "--help"|"-h")
            cat << 'EOF'
Dynamic Resource Scaling Automation

USAGE:
    dynamic-resource-scaling.sh [OPTIONS]

OPTIONS:
    --evaluate    Run single scaling evaluation (default)
    --monitor     Start continuous monitoring mode
    --setup       Set up systemd service for continuous monitoring
    --status      Show current status of all scalable services
    --help, -h    Show this help message

EXAMPLES:
    # Single evaluation
    ./dynamic-resource-scaling.sh --evaluate

    # Check current status
    ./dynamic-resource-scaling.sh --status

    # Set up continuous monitoring
    ./dynamic-resource-scaling.sh --setup

CONFIGURATION:
    Edit the script to modify:
    - CPU_HIGH_THRESHOLD: Scale up when CPU > 80%
    - CPU_LOW_THRESHOLD: Scale down when CPU < 20%
    - MEMORY_HIGH_THRESHOLD: Scale up when Memory > 85%
    - MEMORY_LOW_THRESHOLD: Scale down when Memory < 30%
    - MAX_REPLICAS: Maximum replicas per service (5)
    - MIN_REPLICAS: Minimum replicas per service (1)

NOTES:
    - Requires Docker Swarm mode
    - Monitors CPU and memory usage
    - Includes time-based scaling for night hours
    - Logs all scaling events for audit
    - Safe scaling with min/max limits
EOF
            ;;
        *)
            log "❌ Unknown option: $1"
            log "Use --help for usage information"
            exit 1
            ;;
    esac
}
|
|
|
|
# Check dependencies
# bc is required by get_service_metrics for floating-point averaging.
if ! command -v bc >/dev/null 2>&1; then
    log "Installing bc for calculations..."
    if command -v apt-get >/dev/null 2>&1; then
        # Debian/Ubuntu hosts
        sudo apt-get update && sudo apt-get install -y bc || {
            log "❌ Failed to install bc. Please install manually."
            exit 1
        }
    elif command -v dnf >/dev/null 2>&1; then
        # Fedora/RHEL hosts — NOTE(review): the deployment notes mention an
        # SELinux policy module, so this is likely the production path; the
        # original script only tried apt-get and would have failed here.
        sudo dnf install -y bc || {
            log "❌ Failed to install bc. Please install manually."
            exit 1
        }
    else
        log "❌ Failed to install bc. Please install manually."
        exit 1
    fi
fi

# Execute main function
main "$@"