#!/bin/bash
# Docker Swarm Optimizer
# Configures Docker Swarm with proper resource constraints, high availability, and anti-affinity rules

# Import error handling library
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/lib/error_handling.sh"

# Configuration
readonly HOSTS=("omv800" "fedora" "surface" "jonathan-2518f5u" "audrey" "raspberrypi")
readonly HOST_IPS=("192.168.50.229" "192.168.50.225" "192.168.50.254" "192.168.50.181" "192.168.50.145" "192.168.50.107")
readonly MANAGER_HOST="omv800"
readonly BACKUP_MANAGER="surface"
readonly SWARM_CONFIG_DIR="/opt/migration/configs/swarm"
readonly DOCKER_COMPOSE_DIR="/opt/migration/configs/services"

# Host capabilities and roles
declare -A HOST_ROLES=(
    ["omv800"]="primary-manager,storage,database"
    ["surface"]="backup-manager,compute,development"
    ["fedora"]="compute,automation"
    ["jonathan-2518f5u"]="iot,edge"
    ["audrey"]="monitoring,logging"
    ["raspberrypi"]="backup,storage"
)

# Resource specifications per host (in GB for memory, cores for CPU)
declare -A HOST_RESOURCES=(
    ["omv800"]="memory:31,cpu:4,storage:high"
    ["surface"]="memory:8,cpu:4,storage:medium"
    ["fedora"]="memory:15,cpu:4,storage:medium"
    ["jonathan-2518f5u"]="memory:8,cpu:4,storage:low"
    ["audrey"]="memory:4,cpu:2,storage:low"
    ["raspberrypi"]="memory:8,cpu:4,storage:high"
)

# Service resource requirements and constraints
declare -A SERVICE_CONFIGS=(
    ["traefik"]="memory:512m,cpu:0.5,replicas:2,placement:manager"
    ["immich-web"]="memory:2g,cpu:1.0,replicas:2,placement:storage"
    ["immich-ml"]="memory:4g,cpu:2.0,replicas:1,placement:compute"
    ["jellyfin"]="memory:4g,cpu:2.0,replicas:1,placement:storage"
    ["homeassistant"]="memory:1g,cpu:0.5,replicas:2,placement:iot"
    ["appflowy"]="memory:1g,cpu:0.5,replicas:2,placement:development"
    ["paperless"]="memory:2g,cpu:1.0,replicas:2,placement:any"
    ["postgres"]="memory:4g,cpu:2.0,replicas:1,placement:database"
    ["redis"]="memory:512m,cpu:0.25,replicas:3,placement:database"
    ["prometheus"]="memory:2g,cpu:1.0,replicas:1,placement:monitoring"
    ["grafana"]="memory:1g,cpu:0.5,replicas:2,placement:monitoring"
    ["portainer"]="memory:512m,cpu:0.25,replicas:1,placement:manager"
)
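
# Each SERVICE_CONFIGS entry is a flat "key:value" CSV string. A minimal sketch
# of how an entry could be expanded into `docker service create` flags (helper
# is illustrative and not invoked by main; the role-label mapping assumes the
# labels applied by configure_node_labels below):
service_config_to_flags() {
    local kv fields
    IFS=',' read -ra fields <<< "${SERVICE_CONFIGS[$1]}"
    for kv in "${fields[@]}"; do
        case "${kv%%:*}" in
            memory)   echo "--limit-memory=${kv#*:}" ;;
            cpu)      echo "--limit-cpu=${kv#*:}" ;;
            replicas) echo "--replicas=${kv#*:}" ;;
            placement)
                if [[ "${kv#*:}" != "any" ]]; then
                    echo "--constraint=node.labels.role.${kv#*:}==true"
                fi
                ;;
        esac
    done
}
# Example: service_config_to_flags redis prints, one per line:
#   --limit-memory=512m --limit-cpu=0.25 --replicas=3 --constraint=node.labels.role.database==true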

# Cleanup function
cleanup_swarm_config() {
    log_info "Cleaning up Docker Swarm configuration..."

    # Clean up temporary files
    rm -f /tmp/swarm_*.tmp 2>/dev/null || true
    rm -f /tmp/docker_*.tmp 2>/dev/null || true

    log_info "Swarm configuration cleanup completed"
}

# Rollback function
rollback_swarm_config() {
    log_info "Rolling back Docker Swarm configuration..."

    # Stop any services that were deployed during configuration
    local services
    services=$(ssh "$MANAGER_HOST" "docker service ls -q" 2>/dev/null || echo "")
    if [[ -n "$services" ]]; then
        log_info "Stopping services for rollback..."
        ssh "$MANAGER_HOST" "docker service ls -q | xargs -r docker service rm" 2>/dev/null || true
    fi

    cleanup_swarm_config
    log_info "Swarm rollback completed"
}

# Function to validate Docker versions across hosts
validate_docker_versions() {
    log_step "Validating Docker versions across hosts..."

    local version_issues=0
    local reference_version=""

    for i in "${!HOSTS[@]}"; do
        local host="${HOSTS[$i]}"
        log_info "Checking Docker version on $host..."

        local docker_version
        docker_version=$(ssh -o ConnectTimeout=10 "$host" "docker version --format '{{.Server.Version}}'" 2>/dev/null || echo "ERROR")

        if [[ "$docker_version" == "ERROR" ]]; then
            log_error "Cannot get Docker version from $host"
            version_issues=$((version_issues + 1))  # avoid ((var++)), which returns 1 when var is 0 and trips set -e
            continue
        fi

        log_info "Docker version on $host: $docker_version"

        # Set reference version from the first reachable host
        if [[ -z "$reference_version" ]]; then
            reference_version="$docker_version"
        else
            # Check version compatibility (allow minor version differences)
            local ref_major=$(echo "$reference_version" | cut -d. -f1)
            local current_major=$(echo "$docker_version" | cut -d. -f1)

            if [[ "$ref_major" != "$current_major" ]]; then
                log_warn "Docker major version mismatch: $host has $docker_version, reference is $reference_version"
                version_issues=$((version_issues + 1))
            fi
        fi
    done

    if [[ $version_issues -eq 0 ]]; then
        log_success "All Docker versions are compatible"
        return 0
    else
        log_error "$version_issues hosts have Docker version issues"
        return 1
    fi
}

# Function to configure node labels for proper service placement
configure_node_labels() {
    log_step "Configuring Docker Swarm node labels..."

    for i in "${!HOSTS[@]}"; do
        local host="${HOSTS[$i]}"
        local roles="${HOST_ROLES[$host]}"
        local resources="${HOST_RESOURCES[$host]}"

        log_info "Configuring labels for $host: $roles"

        # Parse roles and apply labels
        IFS=',' read -ra ROLE_ARRAY <<< "$roles"
        for role in "${ROLE_ARRAY[@]}"; do
            if ssh "$MANAGER_HOST" "docker node update --label-add role.$role=true $host"; then
                log_debug "Applied label role.$role=true to $host"
            else
                log_error "Failed to apply label role.$role=true to $host"
                return 1
            fi
        done

        # Parse and apply resource labels
        IFS=',' read -ra RESOURCE_ARRAY <<< "$resources"
        for resource in "${RESOURCE_ARRAY[@]}"; do
            local key=$(echo "$resource" | cut -d: -f1)
            local value=$(echo "$resource" | cut -d: -f2)

            if ssh "$MANAGER_HOST" "docker node update --label-add $key=$value $host"; then
                log_debug "Applied resource label $key=$value to $host"
            else
                log_warn "Failed to apply resource label $key=$value to $host"
            fi
        done

        # Apply availability zone labels for anti-affinity
        local zone="zone$(((i % 3) + 1))"  # Distribute across 3 zones
        if ssh "$MANAGER_HOST" "docker node update --label-add zone=$zone $host"; then
            log_debug "Applied zone label $zone to $host"
        else
            log_warn "Failed to apply zone label to $host"
        fi
    done

    log_success "Node labels configured successfully"
}
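
# Optional helper for eyeballing the result: dumps each node's label set as
# JSON. A minimal sketch (jq is validated as a prerequisite in main; the helper
# is not invoked automatically):
show_node_labels() {
    local node
    for node in "${HOSTS[@]}"; do
        echo "--- $node ---"
        ssh "$MANAGER_HOST" "docker node inspect $node --format '{{json .Spec.Labels}}'" | jq .
    done
}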

# Function to configure Docker daemon settings
configure_docker_daemon() {
    log_step "Configuring Docker daemon settings..."

    # Create optimized Docker daemon configuration
    # (overlay2.override_kernel_check is omitted: it was removed in Docker
    # Engine 23.0 and an unknown storage-opt prevents the daemon from starting)
    local daemon_config=$(cat << 'EOF'
{
    "log-driver": "json-file",
    "log-opts": {
        "max-size": "10m",
        "max-file": "3"
    },
    "storage-driver": "overlay2",
    "live-restore": true,
    "userland-proxy": false,
    "experimental": false,
    "metrics-addr": "127.0.0.1:9323",
    "default-ulimits": {
        "nofile": {
            "Name": "nofile",
            "Hard": 64000,
            "Soft": 64000
        }
    },
    "max-concurrent-downloads": 3,
    "max-concurrent-uploads": 5,
    "default-shm-size": "64M"
}
EOF
)

    # Apply configuration to all hosts
    for host in "${HOSTS[@]}"; do
        log_info "Configuring Docker daemon on $host..."

        # Backup existing configuration
        ssh "$host" "sudo cp /etc/docker/daemon.json /etc/docker/daemon.json.backup 2>/dev/null || true"

        # Apply new configuration
        echo "$daemon_config" | ssh "$host" "sudo tee /etc/docker/daemon.json > /dev/null"

        # Restart Docker daemon
        if ssh "$host" "sudo systemctl restart docker"; then
            log_success "Docker daemon configured on $host"
        else
            log_error "Failed to restart Docker daemon on $host"
            return 1
        fi

        # Wait for Docker to be ready
        wait_for_service "Docker-$host" "ssh $host docker info >/dev/null 2>&1" 30 5
    done

    log_success "Docker daemon configuration completed"
}
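
# Optional pre-flight guard: a malformed daemon.json prevents dockerd from
# starting at all, so the generated blob is worth validating before shipping
# it fleet-wide. A minimal sketch using jq (not wired into
# configure_docker_daemon; call it on "$daemon_config" before the push loop):
validate_daemon_json() {
    if ! jq empty <<< "$1" 2>/dev/null; then
        log_error "Generated daemon.json is not valid JSON"
        return 1
    fi
}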

# Function to configure swarm settings for high availability
configure_swarm_settings() {
    log_step "Configuring Docker Swarm for high availability..."

    # Configure swarm with optimized settings
    # NOTE: --autolock=true makes `docker swarm update` print an unlock key;
    # store it safely or a restarted manager cannot rejoin the swarm.
    local swarm_config_updates=(
        "--autolock=true"
        "--cert-expiry=2160h0m0s"  # 90 days
        "--dispatcher-heartbeat=5s"
        "--task-history-limit=5"
    )

    for config in "${swarm_config_updates[@]}"; do
        if ssh "$MANAGER_HOST" "docker swarm update $config"; then
            log_success "Applied swarm config: $config"
        else
            log_warn "Failed to apply swarm config: $config"
        fi
    done

    # Ensure the backup manager is promoted. (Two managers cannot out-vote a
    # failed peer, so a third manager would be needed for true quorum.)
    if ssh "$MANAGER_HOST" "docker node ls" | grep -q "$BACKUP_MANAGER.*Leader\|$BACKUP_MANAGER.*Reachable"; then
        log_success "Backup manager $BACKUP_MANAGER is already promoted"
    else
        # Promote in place rather than leave-and-rejoin, which would discard
        # the node labels applied above.
        log_info "Promoting $BACKUP_MANAGER to manager role..."
        if ssh "$MANAGER_HOST" "docker node promote $BACKUP_MANAGER"; then
            log_success "Successfully promoted $BACKUP_MANAGER to manager"
        else
            log_error "Failed to promote $BACKUP_MANAGER to manager"
            return 1
        fi
    fi

    log_success "Swarm high availability configuration completed"
}
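
# With autolock enabled, the unlock key has to live somewhere outside the
# swarm itself. A minimal sketch for stashing it (the target path reuses
# SWARM_CONFIG_DIR and is illustrative; not invoked by main):
save_swarm_unlock_key() {
    mkdir -p "$SWARM_CONFIG_DIR"
    ssh "$MANAGER_HOST" "docker swarm unlock-key -q" > "$SWARM_CONFIG_DIR/unlock.key"
    chmod 600 "$SWARM_CONFIG_DIR/unlock.key"
    log_info "Swarm unlock key saved to $SWARM_CONFIG_DIR/unlock.key"
}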

# Function to create optimized service configurations
create_optimized_service_configs() {
    log_step "Creating optimized service configurations..."

    mkdir -p "$DOCKER_COMPOSE_DIR"

    # Create Traefik configuration with proper resource constraints
    cat > "$DOCKER_COMPOSE_DIR/traefik-optimized.yml" << 'EOF'
version: '3.8'

services:
  traefik:
    image: traefik:v3.0
    command:
      # API and dashboard
      - --api.dashboard=true
      - --api.insecure=false

      # Swarm provider (Traefik v3 replaced providers.docker.swarmMode
      # with a dedicated swarm provider)
      - --providers.swarm.endpoint=unix:///var/run/docker.sock
      - --providers.swarm.exposedbydefault=false
      - --providers.swarm.network=public-zone

      # Entry points
      - --entrypoints.web.address=:80
      - --entrypoints.websecure.address=:443
      - --entrypoints.web.http.redirections.entrypoint.to=websecure
      - --entrypoints.web.http.redirections.entrypoint.scheme=https

      # SSL/TLS configuration
      - --certificatesresolvers.letsencrypt.acme.email=${EMAIL}
      - --certificatesresolvers.letsencrypt.acme.storage=/certificates/acme.json
      - --certificatesresolvers.letsencrypt.acme.httpchallenge.entrypoint=web

      # Logging and monitoring
      - --log.level=INFO
      - --log.format=json
      - --accesslog=true
      - --accesslog.format=json
      - --metrics.prometheus=true
      - --ping=true

    ports:
      - target: 80
        published: 80
        protocol: tcp
        mode: ingress
      - target: 443
        published: 443
        protocol: tcp
        mode: ingress

    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - traefik-certificates:/certificates
      - traefik-logs:/var/log/traefik

    secrets:
      - traefik_users

    networks:
      - public-zone
      - management-zone

    environment:
      - DOMAIN=${DOMAIN}
      - EMAIL=${EMAIL}

    deploy:
      mode: replicated
      replicas: 2
      placement:
        constraints:
          - node.role == manager
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 120s
      update_config:
        parallelism: 1
        delay: 10s
        order: start-first
        failure_action: rollback
        monitor: 60s
      rollback_config:
        parallelism: 1
        delay: 5s
        order: stop-first
        monitor: 60s
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.traefik-dashboard.rule=Host(`traefik.${DOMAIN}`)"
        - "traefik.http.routers.traefik-dashboard.entrypoints=websecure"
        - "traefik.http.routers.traefik-dashboard.tls.certresolver=letsencrypt"
        - "traefik.http.routers.traefik-dashboard.service=api@internal"
        - "traefik.http.routers.traefik-dashboard.middlewares=auth-secure@file"

secrets:
  traefik_users:
    external: true

volumes:
  traefik-certificates:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/traefik/certificates
  traefik-logs:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/traefik/logs

networks:
  public-zone:
    external: true
  management-zone:
    external: true
EOF
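
    # Deploying the stack above later would look like the following
    # (sketch only; the stack name "edge" is illustrative):
    #   ssh "$MANAGER_HOST" "docker stack deploy -c $DOCKER_COMPOSE_DIR/traefik-optimized.yml edge"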

    # Create PostgreSQL cluster configuration
    cat > "$DOCKER_COMPOSE_DIR/postgres-cluster.yml" << 'EOF'
version: '3.8'

services:
  postgres-primary:
    image: postgres:15-alpine
    environment:
      POSTGRES_DB: ${POSTGRES_DB}
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
      POSTGRES_REPLICATION_USER: replicator
      POSTGRES_REPLICATION_PASSWORD_FILE: /run/secrets/postgres_replication_password
    secrets:
      - postgres_password
      - postgres_replication_password
    volumes:
      - postgres-primary-data:/var/lib/postgresql/data
      - postgres-config:/etc/postgresql
    networks:
      - data-zone
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.labels.role.database == true
          - node.labels.storage == high
      resources:
        limits:
          memory: 4G
          cpus: '2.0'
        reservations:
          memory: 2G
          cpus: '1.0'
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
      update_config:
        parallelism: 1
        delay: 30s
        order: stop-first
        failure_action: rollback
        monitor: 120s
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER}"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  postgres-replica:
    image: postgres:15-alpine
    environment:
      POSTGRES_DB: ${POSTGRES_DB}
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD_FILE: /run/secrets/postgres_password
      PGUSER: ${POSTGRES_USER}
      POSTGRES_PRIMARY_HOST: postgres-primary
    secrets:
      - postgres_password
    volumes:
      - postgres-replica-data:/var/lib/postgresql/data
    networks:
      - data-zone
    depends_on:  # NOTE: ignored by "docker stack deploy"; ordering relies on restart_policy retries
      - postgres-primary
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.labels.role.database == true
          - node.labels.storage != low
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER}"]
      interval: 30s
      timeout: 10s
      retries: 3

secrets:
  postgres_password:
    external: true
  postgres_replication_password:
    external: true

volumes:
  postgres-primary-data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/postgresql/primary/data
  postgres-replica-data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/postgresql/replica/data
  postgres-config:
    driver: local

networks:
  data-zone:
    external: true
EOF

    # Create Redis cluster configuration
    cat > "$DOCKER_COMPOSE_DIR/redis-cluster.yml" << 'EOF'
version: '3.8'

services:
  redis-primary:
    image: redis:7-alpine
    # redis-server has no --requirepass-file flag, so the password is read from
    # the mounted secret at startup ("$$" is unescaped to "$" at deploy time)
    command: sh -c 'redis-server --appendonly yes --requirepass "$$(cat /run/secrets/redis_password)"'
    secrets:
      - redis_password
    volumes:
      - redis-primary-data:/data
    networks:
      - data-zone
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.labels.role.database == true
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
    healthcheck:
      # authenticated ping; a bare redis-cli would fail with NOAUTH once requirepass is set
      test: ['CMD-SHELL', 'redis-cli -a "$$(cat /run/secrets/redis_password)" ping | grep -q PONG']
      interval: 30s
      timeout: 10s
      retries: 3

  redis-replica:
    image: redis:7-alpine
    # --masterauth is needed so replicas can sync against the password-protected primary
    command: sh -c 'redis-server --appendonly yes --requirepass "$$(cat /run/secrets/redis_password)" --masterauth "$$(cat /run/secrets/redis_password)" --replicaof redis-primary 6379'
    secrets:
      - redis_password
    volumes:
      - redis-replica-data:/data
    networks:
      - data-zone
    depends_on:  # NOTE: ignored by "docker stack deploy"
      - redis-primary
    deploy:
      mode: replicated
      replicas: 2
      placement:
        constraints:
          - node.labels.role.database == true
        preferences:
          - spread: node.labels.zone
      resources:
        limits:
          memory: 256M
          cpus: '0.25'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

secrets:
  redis_password:
    external: true

volumes:
  redis-primary-data:
    driver: local
  redis-replica-data:
    driver: local

networks:
  data-zone:
    external: true
EOF

    log_success "Optimized service configurations created"
}
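
# The stacks above declare external secrets, which must exist before any
# "docker stack deploy" that references them. A minimal sketch for seeding the
# password secrets with random values (helper is illustrative and not invoked
# by main; traefik_users is an htpasswd file and needs real credentials instead):
create_external_secrets() {
    local name
    for name in postgres_password postgres_replication_password redis_password; do
        if ! ssh "$MANAGER_HOST" "docker secret inspect $name" >/dev/null 2>&1; then
            openssl rand -base64 32 | ssh "$MANAGER_HOST" "docker secret create $name -"
            log_info "Created secret $name"
        fi
    done
}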

# Function to deploy resource monitoring
deploy_resource_monitoring() {
    log_step "Deploying resource monitoring..."

    # Create resource monitoring configuration
    cat > "$DOCKER_COMPOSE_DIR/resource-monitoring.yml" << 'EOF'
version: '3.8'

services:
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    ports:
      - target: 8080
        published: 8080
        protocol: tcp
        mode: host
    networks:
      - monitoring-zone
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
    command:
      - '--housekeeping_interval=10s'
      - '--docker_only=true'
      - '--disable_metrics=disk,network,tcp,udp,percpu,sched,process'

  node-exporter:
    image: prom/node-exporter:latest
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - target: 9100
        published: 9100
        protocol: tcp
        mode: host
    networks:
      - monitoring-zone
    deploy:
      mode: global
      resources:
        limits:
          memory: 128M
          cpus: '0.1'
        reservations:
          memory: 64M
          cpus: '0.05'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'

networks:
  monitoring-zone:
    external: true
EOF

    # Deploy resource monitoring
    if ssh "$MANAGER_HOST" "cd $DOCKER_COMPOSE_DIR && docker stack deploy -c resource-monitoring.yml monitoring"; then
        log_success "Resource monitoring deployed successfully"
    else
        log_error "Failed to deploy resource monitoring"
        return 1
    fi

    # Wait for services to be ready
    wait_for_service "Resource monitoring" "ssh $MANAGER_HOST 'docker service ls | grep monitoring_cadvisor | grep -q \"1/\"'" 60 10

    log_success "Resource monitoring deployment completed"
}
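
# Quick spot-checks once the monitoring stack settles (illustrative commands;
# host-mode ports mean every node answers on 8080/9100 directly):
#   curl -s http://omv800:9100/metrics | grep '^node_load1'
#   curl -s http://omv800:8080/metrics | head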

# Function to test swarm functionality
test_swarm_functionality() {
    log_step "Testing Docker Swarm functionality..."

    # Test service deployment
    log_info "Testing service deployment..."
    local test_service="test-swarm-function"

    if ssh "$MANAGER_HOST" "docker service create --name $test_service --replicas 3 --constraint 'node.role!=manager' alpine sleep 300"; then
        log_success "Test service deployed successfully"
    else
        log_error "Failed to deploy test service"
        return 1
    fi

    # Wait for service to be ready
    sleep 15

    # Check service status
    local running_replicas=$(ssh "$MANAGER_HOST" "docker service ps $test_service | grep -c Running")
    if [[ $running_replicas -ge 2 ]]; then
        log_success "Test service has $running_replicas running replicas"
    else
        log_error "Test service only has $running_replicas running replicas"
    fi

    # Test service scaling
    log_info "Testing service scaling..."
    if ssh "$MANAGER_HOST" "docker service scale ${test_service}=5"; then
        sleep 10
        local scaled_replicas=$(ssh "$MANAGER_HOST" "docker service ps $test_service | grep -c Running")
        log_success "Service scaled to $scaled_replicas replicas"
    else
        log_warn "Service scaling test failed"
    fi

    # Test rolling update
    log_info "Testing rolling update..."
    if ssh "$MANAGER_HOST" "docker service update --image alpine:latest $test_service"; then
        log_success "Rolling update test completed"
    else
        log_warn "Rolling update test failed"
    fi

    # Cleanup test service
    ssh "$MANAGER_HOST" "docker service rm $test_service" >/dev/null 2>&1 || true

    # Test network connectivity between nodes
    log_info "Testing network connectivity..."
    local connectivity_issues=0

    for host in "${HOSTS[@]}"; do
        if [[ "$host" != "$MANAGER_HOST" ]] && [[ "$host" != "raspberrypi" ]]; then
            if ping -c 1 -W 5 "$host" >/dev/null 2>&1; then
                log_debug "Network connectivity to $host: OK"
            else
                log_error "Network connectivity to $host: FAILED"
                connectivity_issues=$((connectivity_issues + 1))
            fi
        fi
    done

    if [[ $connectivity_issues -eq 0 ]]; then
        log_success "All network connectivity tests passed"
    else
        log_error "$connectivity_issues network connectivity issues detected"
        return 1
    fi

    log_success "Docker Swarm functionality tests completed successfully"
}

# Function to create swarm health monitoring script
create_swarm_health_monitor() {
    log_step "Creating swarm health monitoring script..."

    cat > "/opt/migration/scripts/swarm_health_monitor.sh" << 'EOF'
#!/bin/bash
# Docker Swarm Health Monitor
# Monitors swarm health and sends alerts for issues

MANAGER_HOST="omv800"
ALERT_LOG="/var/log/swarm_health.log"
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEMORY=85

log_alert() {
    echo "$(date): SWARM_ALERT - $1" | tee -a "$ALERT_LOG"
    logger "SWARM_HEALTH_ALERT: $1"
}

check_node_health() {
    # grep -c already prints 0 when nothing matches, so no fallback echo is
    # needed (a trailing "|| echo 0" would capture "0\n0" on no match)
    local nodes_down=$(ssh "$MANAGER_HOST" "docker node ls --format '{{.Status}}'" | grep -c Down)
    if [[ $nodes_down -gt 0 ]]; then
        log_alert "Docker nodes down: $nodes_down"
    fi

    local nodes_unavailable=$(ssh "$MANAGER_HOST" "docker node ls --format '{{.Availability}}'" | grep -c Drain)
    if [[ $nodes_unavailable -gt 1 ]]; then  # Allow one for maintenance
        log_alert "Multiple nodes unavailable: $nodes_unavailable"
    fi
}

check_service_health() {
    local services_with_issues=$(ssh "$MANAGER_HOST" "docker service ls --format '{{.Name}} {{.Replicas}}'" | grep -c "0/\|1/[2-9]")
    if [[ $services_with_issues -gt 0 ]]; then
        log_alert "Services with replica issues: $services_with_issues"
    fi
}

check_resource_usage() {
    # Check if resource monitoring is available
    for host in omv800 fedora surface jonathan-2518f5u audrey; do
        local cpu_usage=$(curl -s "http://${host}:8080/api/v1.3/machine" 2>/dev/null | jq -r '.cpu_usage_rate // 0' 2>/dev/null || echo "0")
        local memory_usage=$(curl -s "http://${host}:8080/api/v1.3/machine" 2>/dev/null | jq -r '.memory.usage // 0' 2>/dev/null || echo "0")

        # Convert to percentage if needed
        if (( $(echo "$cpu_usage > $ALERT_THRESHOLD_CPU" | bc -l 2>/dev/null || echo "0") )); then
            log_alert "High CPU usage on $host: ${cpu_usage}%"
        fi

        # Memory usage calculation would need more complex logic;
        # this is simplified for demonstration
    done
}

check_swarm_secrets() {
    local secrets_count=$(ssh "$MANAGER_HOST" "docker secret ls -q | wc -l")
    if [[ $secrets_count -lt 5 ]]; then  # Expecting at least 5 secrets
        log_alert "Unexpectedly low secret count: $secrets_count"
    fi
}

# Main monitoring loop
while true; do
    check_node_health
    check_service_health
    check_resource_usage
    check_swarm_secrets

    sleep 300  # Check every 5 minutes
done
EOF

    chmod +x "/opt/migration/scripts/swarm_health_monitor.sh"

    # Deploy health monitor as a systemd service on manager
    ssh "$MANAGER_HOST" "cat > /tmp/swarm-health-monitor.service << 'SERVICE_EOF'
[Unit]
Description=Docker Swarm Health Monitor
After=docker.service
Requires=docker.service

[Service]
ExecStart=/opt/migration/scripts/swarm_health_monitor.sh
Restart=always
RestartSec=10
User=root

[Install]
WantedBy=multi-user.target
SERVICE_EOF"

    ssh "$MANAGER_HOST" "mkdir -p /opt/migration/scripts"
    scp "/opt/migration/scripts/swarm_health_monitor.sh" "$MANAGER_HOST:/opt/migration/scripts/"
    ssh "$MANAGER_HOST" "sudo mv /tmp/swarm-health-monitor.service /etc/systemd/system/"
    ssh "$MANAGER_HOST" "sudo systemctl daemon-reload && sudo systemctl enable swarm-health-monitor.service"

    if ssh "$MANAGER_HOST" "sudo systemctl start swarm-health-monitor.service"; then
        log_success "Swarm health monitor started on $MANAGER_HOST"
    else
        log_warn "Swarm health monitor may have issues"
    fi

    log_success "Swarm health monitoring setup completed"
}
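
# After setup, the monitor's output can be spot-checked in the unit journal
# and in the alert log it appends to (illustrative commands):
#   ssh omv800 "journalctl -u swarm-health-monitor.service -n 20 --no-pager"
#   ssh omv800 "tail -n 20 /var/log/swarm_health.log"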

# Main execution function
main() {
    local action=${1:-"full"}

    # Register cleanup and rollback functions
    register_cleanup cleanup_swarm_config
    register_rollback rollback_swarm_config

    case $action in
        "full")
            log_step "Starting Docker Swarm optimization..."

            # Validate prerequisites
            validate_prerequisites ssh docker jq bc curl

            # Validate network connectivity
            validate_network_connectivity "${HOST_IPS[@]}"

            # Create checkpoint
            create_checkpoint "swarm_optimization_start"

            # Validate Docker versions
            validate_docker_versions
            create_checkpoint "docker_versions_validated"

            # Configure Docker daemon
            configure_docker_daemon
            create_checkpoint "docker_daemon_configured"

            # Configure node labels
            configure_node_labels
            create_checkpoint "node_labels_configured"

            # Configure swarm settings
            configure_swarm_settings
            create_checkpoint "swarm_settings_configured"

            # Create optimized service configurations
            create_optimized_service_configs
            create_checkpoint "service_configs_created"

            # Deploy resource monitoring
            deploy_resource_monitoring
            create_checkpoint "resource_monitoring_deployed"

            # Test swarm functionality
            test_swarm_functionality
            create_checkpoint "swarm_functionality_tested"

            # Create health monitoring
            create_swarm_health_monitor
            create_checkpoint "health_monitoring_setup"

            log_success "✅ Docker Swarm optimization completed successfully!"
            log_info "📊 Check swarm status: ssh $MANAGER_HOST docker node ls"
            log_info "🔍 Monitor resources: http://any-host:8080 (cAdvisor)"
            ;;

        "labels-only")
            configure_node_labels
            ;;

        "test-only")
            test_swarm_functionality
            ;;

        "monitor-only")
            deploy_resource_monitoring
            create_swarm_health_monitor
            ;;

        "help"|*)
            cat << EOF
Docker Swarm Optimizer

Usage: $0 <action>

Actions:
  full         - Complete swarm optimization (default)
  labels-only  - Only configure node labels
  test-only    - Only test swarm functionality
  monitor-only - Only deploy monitoring
  help         - Show this help

Examples:
  $0 full
  $0 test-only
  $0 monitor-only
EOF
            ;;
    esac
}

# Execute main function
main "$@"