COMPREHENSIVE CHANGES: INFRASTRUCTURE MIGRATION: - Migrated services to Docker Swarm on OMV800 (192.168.50.229) - Deployed PostgreSQL database for Vaultwarden migration - Updated all stack configurations for Docker Swarm compatibility - Added comprehensive monitoring stack (Prometheus, Grafana, Blackbox) - Implemented proper secret management for all services VAULTWARDEN POSTGRESQL MIGRATION: - Attempted migration from SQLite to PostgreSQL for NFS compatibility - Created PostgreSQL stack with proper user/password configuration - Built custom Vaultwarden image with PostgreSQL support - Troubleshot persistent SQLite fallback issue despite PostgreSQL config - Identified known issue where Vaultwarden silently falls back to SQLite - Added ENABLE_DB_WAL=false to prevent filesystem compatibility issues - Current status: Old Vaultwarden on lenovo410 still working, new one has config issues PAPERLESS SERVICES: - Successfully deployed Paperless-NGX and Paperless-AI on OMV800 - Both services running on ports 8000 and 3000 respectively - Caddy configuration updated for external access - Services accessible via paperless.pressmess.duckdns.org and paperless-ai.pressmess.duckdns.org CADDY CONFIGURATION: - Updated Caddyfile on Surface (192.168.50.254) for new service locations - Fixed Vaultwarden reverse proxy to point to new Docker Swarm service - Removed old notification hub reference that was causing conflicts - All services properly configured for external access via DuckDNS BACKUP AND DISCOVERY: - Created comprehensive backup system for all hosts - Generated detailed discovery reports for infrastructure analysis - Implemented automated backup validation scripts - Created migration progress tracking and verification reports MONITORING STACK: - Deployed Prometheus, Grafana, and Blackbox monitoring - Created infrastructure and system overview dashboards - Added proper service discovery and alerting configuration - Implemented performance monitoring for all critical services 
DOCUMENTATION: - Reorganized documentation into logical structure - Created comprehensive migration playbook and troubleshooting guides - Added hardware specifications and optimization recommendations - Documented all configuration changes and service dependencies CURRENT STATUS: - Paperless services: ✅ Working and accessible externally - Vaultwarden: ❌ PostgreSQL configuration issues, old instance still working - Monitoring: ✅ Deployed and operational - Caddy: ✅ Updated and working for external access - PostgreSQL: ✅ Database running, connection issues with Vaultwarden NEXT STEPS: - Continue troubleshooting Vaultwarden PostgreSQL configuration - Consider alternative approaches for Vaultwarden migration - Validate all external service access - Complete final migration validation TECHNICAL NOTES: - Used Docker Swarm for orchestration on OMV800 - Implemented proper secret management for sensitive data - Added comprehensive logging and monitoring - Created automated backup and validation scripts
294 lines
7.9 KiB
Bash
Executable File
294 lines
7.9 KiB
Bash
Executable File
#!/bin/bash
#
# Setup Docker Swarm Cluster
#
# This script initializes Docker Swarm across all hosts: omv800 becomes the
# primary manager, the remaining hosts join as workers, and the shared
# overlay networks used by the migrated stacks are created.

set -euo pipefail

echo "🐳 Setting up Docker Swarm cluster..."

# Define hosts and their roles.
# WORKER_HOSTS and WORKER_IPS are parallel arrays: index i of one matches
# index i of the other. readonly guards against accidental mutation below.
readonly MANAGER_HOST="omv800"
readonly MANAGER_IP="192.168.50.229"
readonly WORKER_HOSTS=("fedora" "surface" "jonathan-2518f5u" "audrey")
readonly WORKER_IPS=("192.168.50.225" "192.168.50.254" "192.168.50.181" "192.168.50.145")

# Fail fast if the parallel arrays ever drift out of sync — a mismatch would
# silently join a host with the wrong --advertise-addr later on.
if (( ${#WORKER_HOSTS[@]} != ${#WORKER_IPS[@]} )); then
  echo "ERROR: WORKER_HOSTS and WORKER_IPS must have the same length" >&2
  exit 1
fi

# ANSI colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m' # No Color
||
# Helpers for colored, labeled output.
# print_status writes informational messages to stdout; print_warning and
# print_error write diagnostics to stderr so they survive stdout redirection.
# %b expands the escape sequences stored in the color variables; the message
# itself goes through %s, so (unlike `echo -e`) backslashes in the message
# are printed verbatim.
print_status() {
  printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"
}

print_warning() {
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1" >&2
}

print_error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}
|
||
|
||
# Verify that the docker CLI is usable over SSH on a host.
# Arguments: $1 - ssh host alias
# Returns:   0 if `docker --version` succeeds remotely, 1 otherwise
check_docker() {
  local host=$1
  print_status "Checking Docker installation on $host..."

  # Guard clause: bail out early when the remote docker CLI is absent.
  if ! ssh -o ConnectTimeout=10 "$host" "docker --version" > /dev/null 2>&1; then
    print_error "Docker is not installed on $host"
    return 1
  fi

  print_status "Docker is installed on $host"
  return 0
}
|
||
|
||
# Check whether a host is already an active member of a swarm.
# Arguments: $1 - ssh host alias
# Returns:   0 if the node is in swarm mode, 1 otherwise
#
# LocalNodeState reports "inactive" for a standalone node; the original
# `grep -q "active"` matched the substring inside "inactive" too, so every
# host was reported as already in a swarm. Compare the exact state instead.
check_swarm_status() {
  local host=$1
  local state
  state=$(ssh -o ConnectTimeout=10 "$host" "docker info --format '{{.Swarm.LocalNodeState}}'" 2>/dev/null) || state=""

  if [[ "$state" == "active" ]]; then
    print_warning "$host is already part of a swarm"
    return 0
  else
    print_status "$host is not in swarm mode"
    return 1
  fi
}
|
||
|
||
# Force a host out of whatever swarm it currently belongs to.
# Arguments: $1 - ssh host alias
# Failures are deliberately ignored (the host may not be in a swarm at all);
# the short sleep gives the daemon time to tear down its swarm state.
leave_swarm() {
  local node=$1
  print_status "Leaving existing swarm on $node..."
  ssh -o ConnectTimeout=10 "$node" "docker swarm leave --force" 2>/dev/null || true
  sleep 5
}
|
||
|
||
# 1. Make sure Docker is present on every node before touching swarm state.
print_status "Step 1: Checking Docker installation..."
for node in "$MANAGER_HOST" "${WORKER_HOSTS[@]}"; do
  if ! check_docker "$node"; then
    print_error "Please install Docker on $node before proceeding"
    exit 1
  fi
done
|
||
|
||
# 2. Initialize the swarm on the manager node, leaving any stale swarm first.
print_status "Step 2: Initializing swarm on manager ($MANAGER_HOST)..."
if check_swarm_status "$MANAGER_HOST"; then
  leave_swarm "$MANAGER_HOST"
fi

ssh "$MANAGER_HOST" "docker swarm init --advertise-addr $MANAGER_IP --listen-addr $MANAGER_IP"

# Fetch the join tokens for workers and managers; fail fast if either comes
# back empty (e.g. the init above did not actually produce a usable swarm),
# instead of attempting joins with a blank --token later on.
print_status "Getting join token for workers..."
JOIN_TOKEN=$(ssh "$MANAGER_HOST" "docker swarm join-token -q worker")
MANAGER_TOKEN=$(ssh "$MANAGER_HOST" "docker swarm join-token -q manager")
if [[ -z "$JOIN_TOKEN" || -z "$MANAGER_TOKEN" ]]; then
  print_error "Failed to obtain swarm join tokens from $MANAGER_HOST"
  exit 1
fi

# NOTE(review): join tokens are secrets; printing them leaves them in shell
# history / CI logs. Kept for operator convenience — consider removing.
print_status "Worker token: $JOIN_TOKEN"
print_status "Manager token: $MANAGER_TOKEN"
|
||
|
||
# 3. Join every worker to the swarm, clearing any stale membership first.
print_status "Step 3: Joining workers to swarm..."
for idx in "${!WORKER_HOSTS[@]}"; do
  host="${WORKER_HOSTS[$idx]}"
  ip="${WORKER_IPS[$idx]}"

  print_status "Joining $host ($ip) to swarm..."

  if check_swarm_status "$host"; then
    leave_swarm "$host"
  fi

  # Abort the whole run on the first join failure — a partial cluster is
  # worse than no cluster.
  if ! ssh -o ConnectTimeout=10 "$host" "docker swarm join --token $JOIN_TOKEN $MANAGER_IP:2377 --advertise-addr $ip --listen-addr $ip"; then
    print_error "Failed to join $host to swarm"
    exit 1
  fi
  print_status "Successfully joined $host to swarm"
done

# 4. Give the cluster a moment to settle, then list the nodes.
print_status "Step 4: Verifying swarm status..."
sleep 10

print_status "Swarm nodes:"
ssh "$MANAGER_HOST" "docker node ls"
|
||
|
||
# 5. Create the shared overlay networks.
print_status "Step 5: Creating overlay networks..."

NETWORKS=(
  "traefik-public"
  "monitoring"
  "databases"
  "applications"
  "iot-network"
  "backup-network"
)

# The original blanket `2>/dev/null || warn` hid every create failure behind
# a "may already exist" warning. Probe for the network first so a genuine
# create error (bad driver, connectivity, quota) aborts the run via set -e.
for network in "${NETWORKS[@]}"; do
  if ssh "$MANAGER_HOST" "docker network inspect $network" > /dev/null 2>&1; then
    print_warning "Network $network may already exist"
  else
    print_status "Creating network: $network"
    ssh "$MANAGER_HOST" "docker network create --driver overlay --attachable $network"
    print_status "Created network: $network"
  fi
done
|
||
|
||
# 6. Give the swarm a second manager for failover.
# surface already joined as a *worker* in Step 3, so running `docker swarm
# join` again with the manager token would always fail with "node is already
# part of a swarm". Promote the existing node instead.
# NOTE(review): `docker node promote` takes the node's hostname as seen in
# `docker node ls` — assumed here to match the ssh alias "surface"; verify.
print_status "Step 6: Setting up manager backup..."
print_status "Promoting surface as backup manager..."

if ssh "$MANAGER_HOST" "docker node promote surface"; then
  print_status "Successfully promoted surface as backup manager"
else
  print_warning "Failed to promote surface as backup manager"
fi
|
||
|
||
# 7. Configure swarm-wide settings.
print_status "Step 7: Configuring swarm settings..."

# Enable auto-lock AND persist the unlock key. The original discarded the
# key that `--autolock=true` prints; without it the swarm cannot be unlocked
# after a manager daemon restart.
ssh "$MANAGER_HOST" "docker swarm update --autolock=true"
UNLOCK_KEY=$(ssh "$MANAGER_HOST" "docker swarm unlock-key -q")
printf '%s\n' "$UNLOCK_KEY" > /opt/migration/swarm_unlock_key
chmod 600 /opt/migration/swarm_unlock_key
print_status "Swarm unlock key saved to /opt/migration/swarm_unlock_key"

# NOTE(review): the original ran
#   docker swarm update --log-driver=json-file --log-opt max-size=10m --log-opt max-file=3
# but `docker swarm update` has no --log-driver/--log-opt flags, so the
# command always failed and aborted the script under `set -e`. Default
# container logging is configured per node in /etc/docker/daemon.json, e.g.:
#   { "log-driver": "json-file",
#     "log-opts": { "max-size": "10m", "max-file": "3" } }
|
||
|
||
# 8. Persist the cluster layout (including join tokens) to a config file.
print_status "Step 8: Creating swarm configuration..."
mkdir -p /opt/migration/configs  # `cat >` would fail if the directory is missing
cat > "/opt/migration/configs/swarm-config.yml" << EOF
# Docker Swarm Configuration
# Generated: $(date)

swarm:
  manager:
    primary: $MANAGER_HOST
    backup: surface
    ip: $MANAGER_IP

  workers:
$(for i in "${!WORKER_HOSTS[@]}"; do echo "    - host: ${WORKER_HOSTS[$i]}"; echo "      ip: ${WORKER_IPS[$i]}"; done)

  networks:
$(for network in "${NETWORKS[@]}"; do echo "    - $network"; done)

  tokens:
    worker: $JOIN_TOKEN
    manager: $MANAGER_TOKEN

  settings:
    autolock: true
    log_driver: json-file
    log_opts:
      max_size: 10m
      max_file: 3
EOF
# The file embeds the swarm join tokens (secrets) — restrict access to root.
chmod 600 /opt/migration/configs/swarm-config.yml
|
||
|
||
# 9. Smoke-test the cluster by deploying and removing a throwaway service.
print_status "Step 9: Testing swarm connectivity..."

print_status "Testing service deployment..."
ssh "$MANAGER_HOST" "docker service create --name test-service --replicas 2 --network traefik-public nginx:alpine"
# Guarantee cleanup even if a later status command fails under set -e —
# otherwise the throwaway service would be left running in the swarm.
trap 'ssh "$MANAGER_HOST" "docker service rm test-service" > /dev/null 2>&1 || true' EXIT

sleep 10

# Check service status
print_status "Service status:"
ssh "$MANAGER_HOST" "docker service ls"
ssh "$MANAGER_HOST" "docker service ps test-service"

# Clean up test service and disarm the safety-net trap.
print_status "Cleaning up test service..."
ssh "$MANAGER_HOST" "docker service rm test-service"
trap - EXIT
|
||
|
||
# 10. Install a reusable health-check script on this host.
print_status "Step 10: Creating health check script..."
mkdir -p /opt/migration/scripts  # ensure the target directory exists
# Quoted heredoc delimiter: the embedded script is written verbatim, with
# its own $MANAGER_HOST expanded at *run* time, not now.
cat > "/opt/migration/scripts/check_swarm_health.sh" << 'EOF'
#!/bin/bash
# Check Docker Swarm Health

set -euo pipefail

MANAGER_HOST="omv800"

echo "🏥 Checking Docker Swarm health..."

# Check node status
echo "📋 Node status:"
ssh "$MANAGER_HOST" "docker node ls"

# Check network status
echo "🌐 Network status:"
ssh "$MANAGER_HOST" "docker network ls --filter driver=overlay"

# Check service status
echo "🔧 Service status:"
ssh "$MANAGER_HOST" "docker service ls"

# Check swarm info
echo "ℹ️ Swarm info:"
ssh "$MANAGER_HOST" "docker info --format '{{.Swarm.LocalNodeState}}'"

echo "✅ Swarm health check completed"
EOF

chmod +x "/opt/migration/scripts/check_swarm_health.sh"
|
||
|
||
# 11. Final verification
|
||
print_status "Step 11: Final verification..."
|
||
|
||
print_status "Swarm nodes:"
|
||
ssh "$MANAGER_HOST" "docker node ls"
|
||
|
||
print_status "Overlay networks:"
|
||
ssh "$MANAGER_HOST" "docker network ls --filter driver=overlay"
|
||
|
||
print_status "Swarm info:"
|
||
ssh "$MANAGER_HOST" "docker info --format '{{.Swarm.LocalNodeState}}'"
|
||
|
||
# 12. Write a human-readable summary of the setup.
# Grouped redirection: the heredoc is expanded once and the whole group's
# stdout goes to the summary file.
print_status "Step 12: Creating setup summary..."
{
  cat << EOF
Docker Swarm Setup Summary
Generated: $(date)

Manager Node:
  Host: $MANAGER_HOST
  IP: $MANAGER_IP
  Status: Active

Backup Manager:
  Host: surface
  IP: 192.168.50.254
  Status: Active

Worker Nodes:
$(for i in "${!WORKER_HOSTS[@]}"; do echo "  - ${WORKER_HOSTS[$i]}: ${WORKER_IPS[$i]}"; done)

Networks Created:
$(for network in "${NETWORKS[@]}"; do echo "  - $network"; done)

Configuration Files:
  - /opt/migration/configs/swarm-config.yml
  - /opt/migration/scripts/check_swarm_health.sh

Next Steps:
  1. Deploy Traefik reverse proxy
  2. Setup monitoring stack
  3. Begin service migration
EOF
} > "/opt/migration/setup_summary.txt"
|
||
|
||
# Wrap-up: report success and point the operator at the follow-up scripts.
print_status "✅ Docker Swarm setup completed successfully!"
print_status "📋 Setup summary saved to: /opt/migration/setup_summary.txt"
print_status "🔧 Health check script: /opt/migration/scripts/check_swarm_health.sh"

echo
print_status "Next steps:"
printf '%s\n' \
  " 1. Deploy Traefik: ./scripts/deploy_traefik.sh" \
  " 2. Setup monitoring: ./scripts/setup_monitoring.sh" \
  " 3. Begin migration: ./scripts/start_migration.sh"