Files
HomeAudit/migration_scripts/scripts/docker_swarm_optimizer.sh
2025-08-24 11:13:39 -04:00

973 lines
29 KiB
Bash
Executable File

#!/bin/bash
# Docker Swarm Optimizer
# Configures Docker Swarm with proper resource constraints, high availability, and anti-affinity rules
# Import error handling library
# Provides log_step/log_info/log_warn/log_error/log_success/log_debug,
# register_cleanup, register_rollback, create_checkpoint,
# validate_prerequisites, validate_network_connectivity and wait_for_service
# used throughout this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/lib/error_handling.sh"
# Configuration
# HOSTS and HOST_IPS are parallel arrays: HOST_IPS[i] is the address of HOSTS[i].
readonly HOSTS=("omv800" "fedora" "surface" "jonathan-2518f5u" "audrey" "raspberrypi")
readonly HOST_IPS=("192.168.50.229" "192.168.50.225" "192.168.50.254" "192.168.50.181" "192.168.50.145" "192.168.50.107")
# Primary swarm manager; all docker CLI work is funneled through it via ssh.
readonly MANAGER_HOST="omv800"
# Second node kept promoted to manager for quorum/failover.
readonly BACKUP_MANAGER="surface"
# NOTE(review): SWARM_CONFIG_DIR is not referenced anywhere else in this file --
# presumably consumed by sibling migration scripts; confirm before removing.
readonly SWARM_CONFIG_DIR="/opt/migration/configs/swarm"
readonly DOCKER_COMPOSE_DIR="/opt/migration/configs/services"
# Host capabilities and roles
# Comma-separated tags; configure_node_labels turns each tag into a node label
# "role.<tag>=true" which the compose placement constraints match on.
declare -A HOST_ROLES=(
["omv800"]="primary-manager,storage,database"
["surface"]="backup-manager,compute,development"
["fedora"]="compute,automation"
["jonathan-2518f5u"]="iot,edge"
["audrey"]="monitoring,logging"
["raspberrypi"]="backup,storage"
)
# Resource specifications per host (in GB for memory, cores for CPU)
# Each "key:value" pair becomes a plain node label (e.g. storage=high).
declare -A HOST_RESOURCES=(
["omv800"]="memory:31,cpu:4,storage:high"
["surface"]="memory:8,cpu:4,storage:medium"
["fedora"]="memory:15,cpu:4,storage:medium"
["jonathan-2518f5u"]="memory:8,cpu:4,storage:low"
["audrey"]="memory:4,cpu:2,storage:low"
["raspberrypi"]="memory:8,cpu:4,storage:high"
)
# Service resource requirements and constraints
# NOTE(review): SERVICE_CONFIGS is declared but never read in this file -- the
# generated compose files below hard-code the same limits. Verify it is used
# by other tooling, otherwise the two must be kept in sync by hand.
declare -A SERVICE_CONFIGS=(
["traefik"]="memory:512m,cpu:0.5,replicas:2,placement:manager"
["immich-web"]="memory:2g,cpu:1.0,replicas:2,placement:storage"
["immich-ml"]="memory:4g,cpu:2.0,replicas:1,placement:compute"
["jellyfin"]="memory:4g,cpu:2.0,replicas:1,placement:storage"
["homeassistant"]="memory:1g,cpu:0.5,replicas:2,placement:iot"
["appflowy"]="memory:1g,cpu:0.5,replicas:2,placement:development"
["paperless"]="memory:2g,cpu:1.0,replicas:2,placement:any"
["postgres"]="memory:4g,cpu:2.0,replicas:1,placement:database"
["redis"]="memory:512m,cpu:0.25,replicas:3,placement:database"
["prometheus"]="memory:2g,cpu:1.0,replicas:1,placement:monitoring"
["grafana"]="memory:1g,cpu:0.5,replicas:2,placement:monitoring"
["portainer"]="memory:512m,cpu:0.25,replicas:1,placement:manager"
)
# Cleanup function
# Best-effort removal of scratch files this script may have left in /tmp.
# Always succeeds (failures to delete are ignored).
cleanup_swarm_config() {
    log_info "Cleaning up Docker Swarm configuration..."
    # Expand both temp-file patterns; an unmatched glob passes through as a
    # literal name, which `rm -f` silently ignores.
    local candidate
    for candidate in /tmp/swarm_*.tmp /tmp/docker_*.tmp; do
        rm -f "$candidate" 2>/dev/null || true
    done
    log_info "Swarm configuration cleanup completed"
}
# Rollback function
# Roll back swarm changes: remove every deployed service on the manager
# (best effort), then run the normal temp-file cleanup.
rollback_swarm_config() {
    log_info "Rolling back Docker Swarm configuration..."
    # Split declaration from assignment so the ssh exit status is not masked.
    local deployed
    deployed=$(ssh "$MANAGER_HOST" "docker service ls -q" 2>/dev/null || echo "")
    if [[ -n "$deployed" ]]; then
        log_info "Stopping services for rollback..."
        # Re-list remotely and pipe straight into removal; -r skips the call
        # when the list is empty.
        ssh "$MANAGER_HOST" "docker service ls -q | xargs -r docker service rm" 2>/dev/null || true
    fi
    cleanup_swarm_config
    log_info "Swarm rollback completed"
}
# Function to validate Docker versions across hosts
# Validate that every host runs a compatible Docker engine.
# The first reachable host sets the reference; all others must share its
# major version (minor/patch drift is tolerated). Unreachable hosts count
# as failures.
# Returns: 0 when all hosts are compatible, 1 otherwise.
validate_docker_versions() {
    log_step "Validating Docker versions across hosts..."
    local version_issues=0
    local reference_version=""
    local host docker_version
    for host in "${HOSTS[@]}"; do
        log_info "Checking Docker version on $host..."
        # Declared above, assigned here: `local v=$(cmd)` would mask cmd's
        # exit status.
        docker_version=$(ssh -o ConnectTimeout=10 "$host" "docker version --format '{{.Server.Version}}'" 2>/dev/null || echo "ERROR")
        if [[ "$docker_version" == "ERROR" ]]; then
            log_error "Cannot get Docker version from $host"
            # Avoid ((var++)): it returns non-zero when var is 0, which would
            # abort the script under `set -e`.
            version_issues=$((version_issues + 1))
            continue
        fi
        log_info "Docker version on $host: $docker_version"
        if [[ -z "$reference_version" ]]; then
            # First reachable host defines the reference version.
            reference_version="$docker_version"
        elif [[ "${reference_version%%.*}" != "${docker_version%%.*}" ]]; then
            # ${var%%.*} extracts the major version without spawning echo|cut.
            log_warn "Docker major version mismatch: $host has $docker_version, reference is $reference_version"
            version_issues=$((version_issues + 1))
        fi
    done
    if [[ $version_issues -eq 0 ]]; then
        log_success "All Docker versions are compatible"
        return 0
    fi
    log_error "$version_issues hosts have Docker version issues"
    return 1
}
# Function to configure node labels for proper service placement
# Apply placement labels to every swarm node through the manager:
#   role.<tag>=true  for each tag in HOST_ROLES[host]
#   <key>=<value>    for each "key:value" pair in HOST_RESOURCES[host]
#   zone=zone{1..3}  round-robin by host index, used by spread preferences
# Role-label failures are fatal (returns 1); resource/zone label failures
# only produce warnings.
configure_node_labels() {
    log_step "Configuring Docker Swarm node labels..."
    local i host roles resources role resource key value zone
    for i in "${!HOSTS[@]}"; do
        host="${HOSTS[$i]}"
        roles="${HOST_ROLES[$host]}"
        resources="${HOST_RESOURCES[$host]}"
        log_info "Configuring labels for $host: $roles"
        # Role labels drive service placement constraints and must succeed.
        IFS=',' read -ra ROLE_ARRAY <<< "$roles"
        for role in "${ROLE_ARRAY[@]}"; do
            if ssh "$MANAGER_HOST" "docker node update --label-add role.$role=true $host"; then
                log_debug "Applied label role.$role=true to $host"
            else
                log_error "Failed to apply label role.$role=true to $host"
                return 1
            fi
        done
        # Resource labels ("memory:31" -> memory=31) are advisory; warn only.
        IFS=',' read -ra RESOURCE_ARRAY <<< "$resources"
        for resource in "${RESOURCE_ARRAY[@]}"; do
            # Parameter expansion instead of echo|cut: no subshells, and the
            # assignment can no longer mask a pipeline failure behind `local`.
            key="${resource%%:*}"
            value="${resource#*:}"
            if ssh "$MANAGER_HOST" "docker node update --label-add $key=$value $host"; then
                log_debug "Applied resource label $key=$value to $host"
            else
                log_warn "Failed to apply resource label $key=$value to $host"
            fi
        done
        # Distribute hosts across 3 synthetic availability zones so spread
        # preferences give basic anti-affinity.
        zone="zone$(((i % 3) + 1))"
        if ssh "$MANAGER_HOST" "docker node update --label-add zone=$zone $host"; then
            log_debug "Applied zone label $zone to $host"
        else
            log_warn "Failed to apply zone label to $host"
        fi
    done
    log_success "Node labels configured successfully"
}
# Function to configure Docker daemon settings
# Push an optimized /etc/docker/daemon.json to every host and restart Docker.
# Fatal (returns 1) if any host's daemon fails to restart.
# NOTE(review): "overlay2.override_kernel_check" was removed in recent Docker
# releases, and an unknown storage-opt can prevent the daemon from starting --
# confirm against the Docker version actually deployed on these hosts.
configure_docker_daemon() {
log_step "Configuring Docker daemon settings..."
# Create optimized Docker daemon configuration
# Quoted heredoc: the JSON below is taken literally, no shell expansion.
local daemon_config=$(cat << 'EOF'
{
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "3"
},
"storage-driver": "overlay2",
"live-restore": true,
"userland-proxy": false,
"experimental": false,
"metrics-addr": "127.0.0.1:9323",
"default-ulimits": {
"nofile": {
"Name": "nofile",
"Hard": 64000,
"Soft": 64000
}
},
"max-concurrent-downloads": 3,
"max-concurrent-uploads": 5,
"default-shm-size": "64M",
"storage-opts": [
"overlay2.override_kernel_check=true"
]
}
EOF
)
# Apply configuration to all hosts
for host in "${HOSTS[@]}"; do
log_info "Configuring Docker daemon on $host..."
# Backup existing configuration
# Only one backup generation is kept; a re-run overwrites the previous one.
ssh "$host" "sudo cp /etc/docker/daemon.json /etc/docker/daemon.json.backup 2>/dev/null || true"
# Apply new configuration
echo "$daemon_config" | ssh "$host" "sudo tee /etc/docker/daemon.json > /dev/null"
# Restart Docker daemon
# "live-restore": true in the config above keeps containers running across
# this restart.
if ssh "$host" "sudo systemctl restart docker"; then
log_success "Docker daemon configured on $host"
else
log_error "Failed to restart Docker daemon on $host"
return 1
fi
# Wait for Docker to be ready
# wait_for_service comes from lib/error_handling.sh; 30 and 5 are presumably
# the retry count / interval seconds -- TODO confirm its signature.
wait_for_service "Docker-$host" "ssh $host docker info >/dev/null 2>&1" 30 5
done
log_success "Docker daemon configuration completed"
}
# Function to configure swarm settings for high availability
# Harden the existing swarm for high availability: apply tuned swarm-wide
# settings and make sure the designated backup manager actually holds a
# manager role (so the raft quorum survives the primary going down).
# Returns 1 if promoting the backup manager fails.
configure_swarm_settings() {
    log_step "Configuring Docker Swarm for high availability..."
    # NOTE: --autolock=true means a restarted manager daemon needs a manual
    # `docker swarm unlock` (with the stored unlock key) before it rejoins.
    local swarm_config_updates=(
        "--autolock=true"
        "--cert-expiry=2160h0m0s" # 90 days
        "--dispatcher-heartbeat=5s"
        "--task-history-limit=5"
    )
    local config
    for config in "${swarm_config_updates[@]}"; do
        if ssh "$MANAGER_HOST" "docker swarm update $config"; then
            log_success "Applied swarm config: $config"
        else
            log_warn "Failed to apply swarm config: $config"
        fi
    done
    # Ensure backup manager is promoted: in `docker node ls` a manager shows
    # MANAGER STATUS "Leader" or "Reachable".
    if ssh "$MANAGER_HOST" "docker node ls" | grep -q "$BACKUP_MANAGER.*Leader\|$BACKUP_MANAGER.*Reachable"; then
        log_success "Backup manager $BACKUP_MANAGER is already promoted"
    else
        log_info "Promoting $BACKUP_MANAGER to manager role..."
        # Promote the node in place. The previous leave-and-rejoin approach
        # killed the node's running tasks and hard-coded the manager IP;
        # `docker node promote` is the supported way to elevate a worker that
        # is already part of the swarm.
        if ssh "$MANAGER_HOST" "docker node promote $BACKUP_MANAGER"; then
            log_success "Successfully promoted $BACKUP_MANAGER to manager"
        else
            log_error "Failed to promote $BACKUP_MANAGER to manager"
            return 1
        fi
    fi
    log_success "Swarm high availability configuration completed"
}
# Function to create optimized service configurations
# Write optimized stack files (Traefik, PostgreSQL, Redis) into
# $DOCKER_COMPOSE_DIR. This function only generates the files; deployment
# happens elsewhere. All heredocs use a quoted delimiter ('EOF'), so the
# ${VAR} placeholders survive into the compose files and are resolved from
# the environment at `docker stack deploy` time.
# NOTE(review): the YAML bodies below appear to have lost their indentation
# in this copy of the file; compose files are indentation-sensitive, so the
# originals must be restored/re-indented before these are deployable.
create_optimized_service_configs() {
log_step "Creating optimized service configurations..."
mkdir -p "$DOCKER_COMPOSE_DIR"
# Create Traefik configuration with proper resource constraints
# NOTE(review): the image is traefik:v3.0 but the flags use the v2 naming
# (--providers.docker.swarmMode). Traefik v3 moved swarm support into a
# separate provider (--providers.swarm.*) -- verify against the v3 migration
# guide before deploying.
cat > "$DOCKER_COMPOSE_DIR/traefik-optimized.yml" << 'EOF'
version: '3.8'
services:
traefik:
image: traefik:v3.0
command:
# API and dashboard
- --api.dashboard=true
- --api.insecure=false
# Docker provider
- --providers.docker.swarmMode=true
- --providers.docker.exposedbydefault=false
- --providers.docker.network=public-zone
# Entry points
- --entrypoints.web.address=:80
- --entrypoints.websecure.address=:443
- --entrypoints.web.http.redirections.entrypoint.to=websecure
- --entrypoints.web.http.redirections.entrypoint.scheme=https
# SSL/TLS configuration
- --certificatesresolvers.letsencrypt.acme.email=${EMAIL}
- --certificatesresolvers.letsencrypt.acme.storage=/certificates/acme.json
- --certificatesresolvers.letsencrypt.acme.httpchallenge.entrypoint=web
# Logging and monitoring
- --log.level=INFO
- --log.format=json
- --accesslog=true
- --accesslog.format=json
- --metrics.prometheus=true
- --ping=true
ports:
- target: 80
published: 80
protocol: tcp
mode: ingress
- target: 443
published: 443
protocol: tcp
mode: ingress
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
- traefik-certificates:/certificates
- traefik-logs:/var/log/traefik
secrets:
- traefik_users
networks:
- public-zone
- management-zone
environment:
- DOMAIN=${DOMAIN}
- EMAIL=${EMAIL}
deploy:
mode: replicated
replicas: 2
placement:
constraints:
- node.role == manager
preferences:
- spread: node.labels.zone
resources:
limits:
memory: 512M
cpus: '0.5'
reservations:
memory: 256M
cpus: '0.25'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
update_config:
parallelism: 1
delay: 10s
order: start-first
failure_action: rollback
monitor: 60s
rollback_config:
parallelism: 1
delay: 5s
order: stop-first
monitor: 60s
labels:
- "traefik.enable=true"
- "traefik.http.routers.traefik-dashboard.rule=Host(`traefik.${DOMAIN}`)"
- "traefik.http.routers.traefik-dashboard.entrypoints=websecure"
- "traefik.http.routers.traefik-dashboard.tls.certresolver=letsencrypt"
- "traefik.http.routers.traefik-dashboard.service=api@internal"
- "traefik.http.routers.traefik-dashboard.middlewares=auth-secure@file"
secrets:
traefik_users:
external: true
volumes:
traefik-certificates:
driver: local
driver_opts:
type: none
o: bind
device: /opt/traefik/certificates
traefik-logs:
driver: local
driver_opts:
type: none
o: bind
device: /opt/traefik/logs
networks:
public-zone:
external: true
management-zone:
external: true
EOF
# Create PostgreSQL cluster configuration
# Bind-mounted volume devices (/opt/postgresql/...) must already exist on the
# scheduled node -- the labels role.database/storage pin placement to hosts
# prepared with those directories.
# NOTE(review): the replica runs the stock postgres:15-alpine image; the
# POSTGRES_PRIMARY_HOST / replication variables are not interpreted by the
# official image, so streaming replication presumably needs additional
# setup -- confirm.
cat > "$DOCKER_COMPOSE_DIR/postgres-cluster.yml" << 'EOF'
version: '3.8'
services:
postgres-primary:
image: postgres:15-alpine
environment:
POSTGRES_DB: ${POSTGRES_DB}
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
POSTGRES_REPLICATION_USER: replicator
POSTGRES_REPLICATION_PASSWORD_FILE: /run/secrets/postgres_replication_password
secrets:
- postgres_password
- postgres_replication_password
volumes:
- postgres-primary-data:/var/lib/postgresql/data
- postgres-config:/etc/postgresql
networks:
- data-zone
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.labels.role.database == true
- node.labels.storage == high
resources:
limits:
memory: 4G
cpus: '2.0'
reservations:
memory: 2G
cpus: '1.0'
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
update_config:
parallelism: 1
delay: 30s
order: stop-first
failure_action: rollback
monitor: 120s
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER}"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
postgres-replica:
image: postgres:15-alpine
environment:
POSTGRES_DB: ${POSTGRES_DB}
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
PGUSER: ${POSTGRES_USER}
POSTGRES_PRIMARY_HOST: postgres-primary
secrets:
- postgres_password
volumes:
- postgres-replica-data:/var/lib/postgresql/data
networks:
- data-zone
depends_on:
- postgres-primary
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.labels.role.database == true
- node.labels.storage != low
preferences:
- spread: node.labels.zone
resources:
limits:
memory: 2G
cpus: '1.0'
reservations:
memory: 1G
cpus: '0.5'
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER}"]
interval: 30s
timeout: 10s
retries: 3
secrets:
postgres_password:
external: true
postgres_replication_password:
external: true
volumes:
postgres-primary-data:
driver: local
driver_opts:
type: none
o: bind
device: /opt/postgresql/primary/data
postgres-replica-data:
driver: local
driver_opts:
type: none
o: bind
device: /opt/postgresql/replica/data
postgres-config:
driver: local
networks:
data-zone:
external: true
EOF
# Create Redis cluster configuration
# NOTE(review): redis-server has no --requirepass-file option (requirepass
# takes the password value itself, or is set via a config file) -- confirm
# this flag before relying on these stacks.
cat > "$DOCKER_COMPOSE_DIR/redis-cluster.yml" << 'EOF'
version: '3.8'
services:
redis-primary:
image: redis:7-alpine
command: redis-server --appendonly yes --requirepass-file /run/secrets/redis_password
secrets:
- redis_password
volumes:
- redis-primary-data:/data
networks:
- data-zone
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.labels.role.database == true
preferences:
- spread: node.labels.zone
resources:
limits:
memory: 512M
cpus: '0.5'
reservations:
memory: 256M
cpus: '0.25'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
healthcheck:
test: ["CMD", "redis-cli", "--raw", "incr", "ping"]
interval: 30s
timeout: 10s
retries: 3
redis-replica:
image: redis:7-alpine
command: redis-server --appendonly yes --requirepass-file /run/secrets/redis_password --replicaof redis-primary 6379
secrets:
- redis_password
volumes:
- redis-replica-data:/data
networks:
- data-zone
depends_on:
- redis-primary
deploy:
mode: replicated
replicas: 2
placement:
constraints:
- node.labels.role.database == true
preferences:
- spread: node.labels.zone
resources:
limits:
memory: 256M
cpus: '0.25'
reservations:
memory: 128M
cpus: '0.1'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
secrets:
redis_password:
external: true
volumes:
redis-primary-data:
driver: local
redis-replica-data:
driver: local
networks:
data-zone:
external: true
EOF
log_success "Optimized service configurations created"
}
# Function to deploy resource monitoring
# Deploy per-node resource monitoring (cAdvisor + node-exporter) as a
# global-mode stack named "monitoring", then wait for cAdvisor to come up.
# Returns 1 if the stack deploy fails.
deploy_resource_monitoring() {
log_step "Deploying resource monitoring..."
# Create resource monitoring configuration
# Quoted heredoc: written literally. The '$$' in the node-exporter regex is
# compose-file escaping for a single literal '$'.
# NOTE(review): --disable_metrics turns off the disk/network collectors that
# the health monitor's resource checks presumably rely on -- confirm the
# remaining metrics are sufficient.
cat > "$DOCKER_COMPOSE_DIR/resource-monitoring.yml" << 'EOF'
version: '3.8'
services:
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
ports:
- target: 8080
published: 8080
protocol: tcp
mode: host
networks:
- monitoring-zone
deploy:
mode: global
resources:
limits:
memory: 256M
cpus: '0.2'
reservations:
memory: 128M
cpus: '0.1'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
command:
- '--housekeeping_interval=10s'
- '--docker_only=true'
- '--disable_metrics=disk,network,tcp,udp,percpu,sched,process'
node-exporter:
image: prom/node-exporter:latest
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
ports:
- target: 9100
published: 9100
protocol: tcp
mode: host
networks:
- monitoring-zone
deploy:
mode: global
resources:
limits:
memory: 128M
cpus: '0.1'
reservations:
memory: 64M
cpus: '0.05'
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points'
- '^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
networks:
monitoring-zone:
external: true
EOF
# Deploy resource monitoring
# NOTE(review): the compose file above is written on the local machine but
# deployed by ssh-ing to $MANAGER_HOST and reading the same path -- this only
# works if the script runs on the manager or the directory is shared/synced.
# Confirm.
if ssh "$MANAGER_HOST" "cd $DOCKER_COMPOSE_DIR && docker stack deploy -c resource-monitoring.yml monitoring"; then
log_success "Resource monitoring deployed successfully"
else
log_error "Failed to deploy resource monitoring"
return 1
fi
# Wait for services to be ready
# Probes until the cadvisor service reports at least one running replica
# ("1/" prefix in `docker service ls` REPLICAS column).
wait_for_service "Resource monitoring" "ssh $MANAGER_HOST 'docker service ls | grep monitoring_cadvisor | grep -q \"1/\"'" 60 10
log_success "Resource monitoring deployment completed"
}
# Function to test swarm functionality
# Smoke-test the swarm: deploy a throwaway service, check replica counts,
# exercise scaling and a rolling update, remove the service, then verify
# basic network reachability to the other hosts.
# Returns 1 when the deploy fails or any connectivity check fails; scaling
# and rolling-update problems only warn.
test_swarm_functionality() {
    log_step "Testing Docker Swarm functionality..."
    # Test service deployment
    log_info "Testing service deployment..."
    local test_service="test-swarm-function"
    if ssh "$MANAGER_HOST" "docker service create --name $test_service --replicas 3 --constraint 'node.role!=manager' alpine sleep 300"; then
        log_success "Test service deployed successfully"
    else
        log_error "Failed to deploy test service"
        return 1
    fi
    # Give the scheduler time to place the tasks.
    sleep 15
    # Declared separately so the ssh exit status is not masked by `local`.
    local running_replicas
    running_replicas=$(ssh "$MANAGER_HOST" "docker service ps $test_service | grep -c Running")
    if [[ $running_replicas -ge 2 ]]; then
        log_success "Test service has $running_replicas running replicas"
    else
        log_error "Test service only has $running_replicas running replicas"
    fi
    # Test service scaling
    log_info "Testing service scaling..."
    if ssh "$MANAGER_HOST" "docker service scale ${test_service}=5"; then
        sleep 10
        local scaled_replicas
        scaled_replicas=$(ssh "$MANAGER_HOST" "docker service ps $test_service | grep -c Running")
        log_success "Service scaled to $scaled_replicas replicas"
    else
        log_warn "Service scaling test failed"
    fi
    # Test rolling update
    log_info "Testing rolling update..."
    if ssh "$MANAGER_HOST" "docker service update --image alpine:latest $test_service"; then
        log_success "Rolling update test completed"
    else
        log_warn "Rolling update test failed"
    fi
    # Cleanup test service (best effort).
    ssh "$MANAGER_HOST" "docker service rm $test_service" >/dev/null 2>&1 || true
    # Test network connectivity between nodes
    log_info "Testing network connectivity..."
    local connectivity_issues=0
    local host
    for host in "${HOSTS[@]}"; do
        # Skip the manager itself; raspberrypi is also excluded
        # (NOTE(review): presumably because it is intermittently reachable --
        # confirm the reason).
        if [[ "$host" != "$MANAGER_HOST" ]] && [[ "$host" != "raspberrypi" ]]; then
            if ping -c 1 -W 5 "$host" >/dev/null 2>&1; then
                log_debug "Network connectivity to $host: OK"
            else
                log_error "Network connectivity to $host: FAILED"
                # Avoid ((var++)): it returns non-zero when var is 0 and would
                # abort the script under `set -e`.
                connectivity_issues=$((connectivity_issues + 1))
            fi
        fi
    done
    if [[ $connectivity_issues -eq 0 ]]; then
        log_success "All network connectivity tests passed"
    else
        log_error "$connectivity_issues network connectivity issues detected"
        return 1
    fi
    log_success "Docker Swarm functionality tests completed successfully"
}
# Function to create swarm health monitoring script
# Install a continuous swarm health monitor: write the monitor script
# locally, copy it to the manager, and run it there as a systemd service.
create_swarm_health_monitor() {
log_step "Creating swarm health monitoring script..."
# Quoted heredoc: the monitor script below is written verbatim (no local
# expansion). It polls node/service/resource/secret health every 5 minutes.
# NOTE(review): the monitor ssh-es to omv800 even though it runs there, so
# the manager needs working ssh-to-self keys -- confirm.
# NOTE(review): inside the monitor, `grep -c X || echo "0"` double-prints:
# grep -c already emits "0" on no match while exiting 1, so variables like
# nodes_down can become two lines and break the numeric [[ -gt ]] tests --
# fix in the monitor before relying on its alerts.
cat > "/opt/migration/scripts/swarm_health_monitor.sh" << 'EOF'
#!/bin/bash
# Docker Swarm Health Monitor
# Monitors swarm health and sends alerts for issues
MANAGER_HOST="omv800"
ALERT_LOG="/var/log/swarm_health.log"
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEMORY=85
log_alert() {
echo "$(date): SWARM_ALERT - $1" | tee -a "$ALERT_LOG"
logger "SWARM_HEALTH_ALERT: $1"
}
check_node_health() {
local nodes_down=$(ssh "$MANAGER_HOST" "docker node ls --format '{{.Status}}'" | grep -c Down || echo "0")
if [[ $nodes_down -gt 0 ]]; then
log_alert "Docker nodes down: $nodes_down"
fi
local nodes_unavailable=$(ssh "$MANAGER_HOST" "docker node ls --format '{{.Availability}}'" | grep -c Drain || echo "0")
if [[ $nodes_unavailable -gt 1 ]]; then # Allow one for maintenance
log_alert "Multiple nodes unavailable: $nodes_unavailable"
fi
}
check_service_health() {
local services_with_issues=$(ssh "$MANAGER_HOST" "docker service ls --format '{{.Name}} {{.Replicas}}'" | grep -c "0/\|1/[2-9]" || echo "0")
if [[ $services_with_issues -gt 0 ]]; then
log_alert "Services with replica issues: $services_with_issues"
fi
}
check_resource_usage() {
# Check if resource monitoring is available
for host in omv800 fedora surface jonathan-2518f5u audrey; do
local cpu_usage=$(curl -s "http://${host}:8080/api/v1.3/machine" 2>/dev/null | jq -r '.cpu_usage_rate // 0' 2>/dev/null || echo "0")
local memory_usage=$(curl -s "http://${host}:8080/api/v1.3/machine" 2>/dev/null | jq -r '.memory.usage // 0' 2>/dev/null || echo "0")
# Convert to percentage if needed
if (( $(echo "$cpu_usage > $ALERT_THRESHOLD_CPU" | bc -l 2>/dev/null || echo "0") )); then
log_alert "High CPU usage on $host: ${cpu_usage}%"
fi
# Memory usage calculation would need more complex logic
# This is simplified for demonstration
done
}
check_swarm_secrets() {
local secrets_count=$(ssh "$MANAGER_HOST" "docker secret ls -q | wc -l")
if [[ $secrets_count -lt 5 ]]; then # Expecting at least 5 secrets
log_alert "Unexpected low secret count: $secrets_count"
fi
}
# Main monitoring loop
while true; do
check_node_health
check_service_health
check_resource_usage
check_swarm_secrets
sleep 300 # Check every 5 minutes
done
EOF
chmod +x "/opt/migration/scripts/swarm_health_monitor.sh"
# Deploy health monitor as a systemd service on manager
# The SERVICE_EOF heredoc runs remotely and writes the unit file to /tmp.
ssh "$MANAGER_HOST" "cat > /tmp/swarm-health-monitor.service << 'SERVICE_EOF'
[Unit]
Description=Docker Swarm Health Monitor
After=docker.service
Requires=docker.service
[Service]
ExecStart=/opt/migration/scripts/swarm_health_monitor.sh
Restart=always
RestartSec=10
User=root
[Install]
WantedBy=multi-user.target
SERVICE_EOF"
# NOTE(review): scp without -p may not preserve the exec bit set by the
# chmod above, depending on the remote umask -- confirm the copied script
# stays executable on $MANAGER_HOST.
scp "/opt/migration/scripts/swarm_health_monitor.sh" "$MANAGER_HOST:/opt/migration/scripts/"
ssh "$MANAGER_HOST" "sudo mv /tmp/swarm-health-monitor.service /etc/systemd/system/"
ssh "$MANAGER_HOST" "sudo systemctl daemon-reload && sudo systemctl enable swarm-health-monitor.service"
if ssh "$MANAGER_HOST" "sudo systemctl start swarm-health-monitor.service"; then
log_success "Swarm health monitor started on $MANAGER_HOST"
else
log_warn "Swarm health monitor may have issues"
fi
log_success "Swarm health monitoring setup completed"
}
# Main execution function
# Entry point: dispatches on the first CLI argument (defaults to "full").
# Unknown actions fall through to the help text and exit 0.
main() {
local action=${1:-"full"}
# Register cleanup and rollback functions
# register_cleanup / register_rollback come from lib/error_handling.sh;
# presumably they install EXIT/ERR traps -- confirm their semantics there.
register_cleanup cleanup_swarm_config
register_rollback rollback_swarm_config
case $action in
"full")
log_step "Starting Docker Swarm optimization..."
# Validate prerequisites
# Required tools: ssh for remote control, jq/bc/curl for the health monitor.
validate_prerequisites ssh docker jq bc curl
# Validate network connectivity
validate_network_connectivity "${HOST_IPS[@]}"
# Create checkpoint
# Checkpoints mark resumable milestones (handled by error_handling.sh).
create_checkpoint "swarm_optimization_start"
# Validate Docker versions
validate_docker_versions
create_checkpoint "docker_versions_validated"
# Configure Docker daemon
configure_docker_daemon
create_checkpoint "docker_daemon_configured"
# Configure node labels
configure_node_labels
create_checkpoint "node_labels_configured"
# Configure swarm settings
configure_swarm_settings
create_checkpoint "swarm_settings_configured"
# Create optimized service configurations
create_optimized_service_configs
create_checkpoint "service_configs_created"
# Deploy resource monitoring
deploy_resource_monitoring
create_checkpoint "resource_monitoring_deployed"
# Test swarm functionality
test_swarm_functionality
create_checkpoint "swarm_functionality_tested"
# Create health monitoring
create_swarm_health_monitor
create_checkpoint "health_monitoring_setup"
log_success "✅ Docker Swarm optimization completed successfully!"
log_info "📊 Check swarm status: ssh $MANAGER_HOST docker node ls"
log_info "🔍 Monitor resources: http://any-host:8080 (cAdvisor)"
;;
"labels-only")
configure_node_labels
;;
"test-only")
test_swarm_functionality
;;
"monitor-only")
deploy_resource_monitoring
create_swarm_health_monitor
;;
"help"|*)
# Unquoted EOF: $0 below expands to the script name. The "help" pattern is
# redundant with the catch-all '*' but kept for readability.
cat << EOF
Docker Swarm Optimizer
Usage: $0 <action>
Actions:
full - Complete swarm optimization (default)
labels-only - Only configure node labels
test-only - Only test swarm functionality
monitor-only - Only deploy monitoring
help - Show this help
Examples:
$0 full
$0 test-only
$0 monitor-only
EOF
;;
esac
}
# Execute main function
main "$@"