#!/bin/bash
# Enhanced Document Current Infrastructure State
# This script creates a complete snapshot with robust error handling and validation
#
# For every host in HOSTS it collects Docker listings, compose files, database
# dumps, configuration tarballs and basic system/network information over SSH,
# stores everything under SNAPSHOT_DIR and writes a Markdown summary.
#
# Helpers such as log_info/log_warn/log_error/log_success/log_step/log_debug,
# execute_with_retry, register_cleanup, register_rollback,
# validate_prerequisites, check_disk_space and create_checkpoint, as well as
# the variables ERROR_COUNT, WARNING_COUNT, SCRIPT_NAME, LOG_FILE and ERROR_LOG,
# are not defined in this file; they are expected to come from
# lib/error_handling.sh.

# Import error handling library
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Fail fast: without the library none of the logging/retry helpers exist.
# shellcheck source=/dev/null
source "$SCRIPT_DIR/lib/error_handling.sh" || {
    echo "ERROR: unable to load $SCRIPT_DIR/lib/error_handling.sh" >&2
    exit 1
}

# Configuration
# HOSTS and HOST_IPS are parallel arrays: HOST_IPS[i] is the address of HOSTS[i].
readonly HOSTS=("omv800" "fedora" "surface" "jonathan-2518f5u" "audrey" "raspberrypi")
readonly HOST_IPS=("192.168.50.229" "192.168.50.225" "192.168.50.254" "192.168.50.181" "192.168.50.145" "192.168.50.107")
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly TIMESTAMP
readonly SNAPSHOT_DIR="/opt/migration/backups/snapshot_${TIMESTAMP}"
readonly REQUIRED_SPACE_GB=5 # Require 5GB free space
# NOTE(review): the two timeouts below are not referenced anywhere in this
# file; they may be consumed by the sourced library - confirm before removing.
readonly CONNECTION_TIMEOUT=30
readonly SSH_TIMEOUT=60

# Cleanup function
# Removes the temporary dump/backup files this script creates in /tmp on every
# host and, when ERROR_COUNT is greater than zero, the local snapshot directory.
# NOTE(review): main() reports the snapshot location and links "latest" to it
# even when ERROR_COUNT > 0; if the library runs this cleanup at exit, that
# snapshot is deleted afterwards - confirm the intended behaviour.
cleanup_snapshot() {
    log_info "Cleaning up temporary files..."

    # Clean up temporary files on remote hosts (best effort: unreachable hosts
    # are deliberately ignored).
    local host
    for host in "${HOSTS[@]}"; do
        ssh -o ConnectTimeout=10 "$host" "rm -f /tmp/*_backup_${host}.tar.gz /tmp/*_dump_${host}.sql" 2>/dev/null || true
    done

    # Clean up incomplete snapshot if error occurred
    if [[ -d "$SNAPSHOT_DIR" ]] && [[ $ERROR_COUNT -gt 0 ]]; then
        log_warn "Removing incomplete snapshot directory: $SNAPSHOT_DIR"
        # ":?" aborts the expansion instead of running rm on an empty path.
        rm -rf -- "${SNAPSHOT_DIR:?}" 2>/dev/null || true
    fi
}

# Rollback function
# Unconditionally removes the snapshot directory, then runs cleanup_snapshot to
# remove the remote temporary files.
rollback_snapshot() {
    log_info "Rolling back snapshot creation..."

    # Remove any partially created directories
    if [[ -d "$SNAPSHOT_DIR" ]]; then
        rm -rf -- "${SNAPSHOT_DIR:?}"
        log_info "Removed partial snapshot directory"
    fi

    # Remove any temporary files
    cleanup_snapshot
}

# Function to validate host accessibility
# Arguments: $1 - SSH host name, $2 - IP address used for the ping test
# Returns:   0 when the host answers ping and a non-interactive SSH login
#            works, 1 otherwise. Low free space in /tmp only logs a warning.
validate_host_access() {
    local host=$1
    local ip=$2

    log_info "Validating access to $host ($ip)..."

    # Test ping connectivity
    if ! ping -c 1 -W 5 "$ip" >/dev/null 2>&1; then
        log_error "Cannot ping $host ($ip)"
        return 1
    fi

    # Test SSH connectivity (BatchMode: fail instead of prompting for a password)
    if ! ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" "echo 'SSH OK'" >/dev/null 2>&1; then
        log_error "Cannot SSH to $host"
        return 1
    fi

    # Check if host has sufficient disk space for temporary files
    local available_gb
    available_gb=$(ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" "df -BG /tmp | awk 'NR==2 {print \$4}' | sed 's/G//'" 2>/dev/null) || available_gb=0
    # Anything that is not a plain integer (empty output, error text) counts as 0
    # so the numeric comparison below cannot fail.
    [[ "$available_gb" =~ ^[0-9]+$ ]] || available_gb=0
    if [[ $available_gb -lt 1 ]]; then
        log_warn "$host has limited disk space: ${available_gb}GB"
    fi

    log_success "Host $host is accessible and ready"
    return 0
}

# Private helper: run one remote command through execute_with_retry and store
# its stdout in a file; on failure log it and write a placeholder line instead.
# Arguments: $1 host, $2 output file, $3 label used in the messages,
#            $4 logger to call on failure (log_error or log_warn),
#            $5 and $6 first two arguments handed to execute_with_retry
#               (presumably attempts and delay - defined in the library),
#            $7 command line to run on the remote host
_collect_docker_section() {
    local host=$1 out_file=$2 label=$3 fail_logger=$4 retries=$5 delay=$6 remote_cmd=$7

    if execute_with_retry "$retries" "$delay" ssh -o ConnectTimeout=10 "$host" "$remote_cmd" > "$out_file"; then
        log_success "$label collected from $host"
    else
        "$fail_logger" "Failed to collect $label from $host"
        echo "Failed to collect $label" > "$out_file"
    fi
}

# Function to collect Docker information with error handling
# Arguments: $1 - SSH host name, $2 - local directory receiving the output
# Writes docker_containers.txt, docker_images.txt, docker_networks.txt,
# docker_volumes.txt, docker_system_df.txt and compose_files.txt, plus a copy
# of every discovered compose file under compose_files/.
collect_docker_info() {
    local host=$1
    local host_dir=$2

    log_info "Collecting Docker information from $host..."

    # Create host directory
    mkdir -p "$host_dir"

    # Docker containers with timeout and error handling (failure is an error,
    # the remaining listings only warn)
    _collect_docker_section "$host" "$host_dir/docker_containers.txt" "Docker containers" log_error 3 5 \
        "timeout 30 docker ps -a --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}\t{{.CreatedAt}}\t{{.Size}}'"

    # Docker images
    _collect_docker_section "$host" "$host_dir/docker_images.txt" "Docker images" log_warn 3 5 \
        "timeout 30 docker images --format 'table {{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}\t{{.Size}}'"

    # Docker networks
    _collect_docker_section "$host" "$host_dir/docker_networks.txt" "Docker networks" log_warn 3 5 \
        "timeout 30 docker network ls --format 'table {{.ID}}\t{{.Name}}\t{{.Driver}}\t{{.Scope}}'"

    # Docker volumes
    _collect_docker_section "$host" "$host_dir/docker_volumes.txt" "Docker volumes" log_warn 3 5 \
        "timeout 30 docker volume ls --format 'table {{.Driver}}\t{{.Name}}'"

    # Docker system information
    _collect_docker_section "$host" "$host_dir/docker_system_df.txt" "Docker system info" log_warn 2 10 \
        "timeout 60 docker system df -v"

    # Docker compose files discovery
    if execute_with_retry 2 10 ssh -o ConnectTimeout=10 "$host" "find /opt /home -name 'docker-compose*.yml' -o -name 'compose*.yml' 2>/dev/null | head -20" > "$host_dir/compose_files.txt"; then
        log_success "Docker compose files discovered on $host"

        # Collect compose file contents
        local compose_dir="$host_dir/compose_files"
        mkdir -p "$compose_dir"

        local compose_file local_name
        while IFS= read -r compose_file; do
            [[ -n "$compose_file" ]] || continue
            # <basename>_<full path with "/" replaced by "_"> keeps files with the
            # same name from different directories apart.
            local_name="${compose_file##*/}_${compose_file//\//_}"
            # -n is required: without it ssh reads the loop's stdin and swallows
            # the rest of the file list, so only the first file would be copied.
            if ssh -n -o ConnectTimeout=10 "$host" "cat '$compose_file'" > "$compose_dir/$local_name" 2>/dev/null; then
                log_debug "Collected compose file: $compose_file"
            fi
        done < "$host_dir/compose_files.txt"
    else
        log_warn "Failed to discover Docker compose files on $host"
        echo "Failed to discover compose files" > "$host_dir/compose_files.txt"
    fi
}

# Function to create database dumps with validation
# Dumps PostgreSQL (pg_dumpall) from a fixed list of hosts and MySQL/MariaDB
# (mysqldump --all-databases) from every host where a matching running
# container is found. Dumps are written to /tmp on the host and copied into
# $SNAPSHOT_DIR/database_dumps; .error/.info marker files record why a
# PostgreSQL dump is missing.
create_database_dumps() {
    log_step "Creating database dumps..."

    local dump_dir="$SNAPSHOT_DIR/database_dumps"
    mkdir -p "$dump_dir"

    # PostgreSQL dumps from hosts with PostgreSQL containers
    local postgres_hosts=("omv800" "surface" "jonathan-2518f5u")
    local host postgres_container dump_size mysql_container

    for host in "${postgres_hosts[@]}"; do
        log_info "Processing PostgreSQL dumps from $host..."

        # Check if PostgreSQL container exists
        if ssh -o ConnectTimeout=10 "$host" "docker ps | grep -i postgres" >/dev/null 2>&1; then
            log_info "PostgreSQL container found on $host, creating dump..."

            # Get PostgreSQL container ID. Match on image/name rather than
            # "--filter ancestor=postgres": an untagged ancestor means
            # postgres:latest, so tagged or derived images would be detected
            # above but never selected here.
            postgres_container=$(ssh -o ConnectTimeout=10 "$host" "docker ps --format '{{.ID}} {{.Image}} {{.Names}}' | grep -i postgres | head -1 | cut -d' ' -f1" 2>/dev/null) || postgres_container=""

            if [[ -n "$postgres_container" ]]; then
                # Create database dump with timeout
                if execute_with_retry 2 30 ssh "$host" "timeout 300 docker exec $postgres_container pg_dumpall -U postgres > /tmp/postgres_dump_${host}.sql"; then
                    # Verify dump was created and has content (BSD stat first,
                    # then GNU stat)
                    dump_size=$(ssh "$host" "stat -f%z /tmp/postgres_dump_${host}.sql 2>/dev/null || stat -c%s /tmp/postgres_dump_${host}.sql 2>/dev/null || echo 0") || dump_size=0
                    [[ "$dump_size" =~ ^[0-9]+$ ]] || dump_size=0
                    if [[ $dump_size -gt 100 ]]; then # At least 100 bytes
                        if scp -o ConnectTimeout=30 "$host:/tmp/postgres_dump_${host}.sql" "$dump_dir/"; then
                            log_success "PostgreSQL dump created for $host (${dump_size} bytes)"
                        else
                            log_error "Failed to copy PostgreSQL dump from $host"
                        fi
                    else
                        log_warn "PostgreSQL dump from $host is too small or empty"
                        echo "PostgreSQL dump failed or empty" > "$dump_dir/postgres_dump_${host}.error"
                    fi
                else
                    log_error "Failed to create PostgreSQL dump on $host"
                    echo "Failed to create PostgreSQL dump" > "$dump_dir/postgres_dump_${host}.error"
                fi
            else
                log_warn "No PostgreSQL container ID found on $host"
                echo "No PostgreSQL container found" > "$dump_dir/postgres_dump_${host}.info"
            fi
        else
            log_info "No PostgreSQL container found on $host"
            echo "No PostgreSQL container found" > "$dump_dir/postgres_dump_${host}.info"
        fi
    done

    # MySQL/MariaDB dumps if present
    for host in "${HOSTS[@]}"; do
        if ssh -o ConnectTimeout=10 "$host" "docker ps | grep -i -E 'mysql|mariadb'" >/dev/null 2>&1; then
            log_info "MySQL/MariaDB container found on $host, creating dump..."

            # Same image/name based selection as for PostgreSQL above.
            mysql_container=$(ssh -o ConnectTimeout=10 "$host" "docker ps --format '{{.ID}} {{.Image}} {{.Names}}' | grep -i -E 'mysql|mariadb' | head -1 | cut -d' ' -f1" 2>/dev/null) || mysql_container=""

            if [[ -z "$mysql_container" ]]; then
                log_warn "No MySQL/MariaDB container ID found on $host"
                continue
            fi

            if execute_with_retry 2 30 ssh "$host" "timeout 300 docker exec $mysql_container mysqldump --all-databases > /tmp/mysql_dump_${host}.sql"; then
                if scp -o ConnectTimeout=30 "$host:/tmp/mysql_dump_${host}.sql" "$dump_dir/"; then
                    log_success "MySQL dump created for $host"
                else
                    log_warn "Failed to copy MySQL dump from $host"
                fi
            else
                log_warn "Failed to create MySQL dump on $host"
            fi
        fi
    done
}

# Function to backup configurations safely
# Creates /tmp/config_backup_<host>.tar.gz on each host from the directories in
# config_dirs and copies it into SNAPSHOT_DIR. A config_backup_<host>.error
# marker is written when the archive could not be created or is 1000 bytes or
# smaller.
backup_configurations() {
    log_step "Backing up configurations..."

    # The "/home/*/.config" glob is left unexpanded here on purpose: it is
    # expanded by the remote shell.
    local config_dirs=("/etc/docker" "/opt" "/home/*/.config")
    local host backup_size

    for host in "${HOSTS[@]}"; do
        log_info "Backing up configurations from $host..."

        # Create configuration backup with error handling. The remote
        # "|| echo" keeps a partially successful tar (missing directories) from
        # being treated as a failure.
        if execute_with_retry 2 60 ssh -o ConnectTimeout=10 "$host" "timeout 600 tar czf /tmp/config_backup_${host}.tar.gz ${config_dirs[*]} 2>/dev/null || echo 'Some configs may be missing'"; then
            # Check if backup file was created
            backup_size=$(ssh "$host" "stat -f%z /tmp/config_backup_${host}.tar.gz 2>/dev/null || stat -c%s /tmp/config_backup_${host}.tar.gz 2>/dev/null || echo 0") || backup_size=0
            [[ "$backup_size" =~ ^[0-9]+$ ]] || backup_size=0

            if [[ $backup_size -gt 1000 ]]; then # At least 1KB
                if scp -o ConnectTimeout=60 "$host:/tmp/config_backup_${host}.tar.gz" "$SNAPSHOT_DIR/"; then
                    log_success "Configuration backup created for $host (${backup_size} bytes)"
                else
                    log_error "Failed to copy configuration backup from $host"
                fi
            else
                log_warn "Configuration backup from $host is too small"
                echo "Configuration backup failed or too small" > "$SNAPSHOT_DIR/config_backup_${host}.error"
            fi
        else
            log_error "Failed to create configuration backup on $host"
            echo "Failed to create configuration backup" > "$SNAPSHOT_DIR/config_backup_${host}.error"
        fi
    done
}

# Function to create comprehensive summary with validation
# Writes $SNAPSHOT_DIR/comprehensive_summary.md from the files collected so
# far: overall statistics, a per-host table, detected critical services, dump
# and backup counts, next steps, a file listing and the log locations.
create_comprehensive_summary() {
    log_step "Creating comprehensive summary report..."

    local summary_file="$SNAPSHOT_DIR/comprehensive_summary.md"

    cat > "$summary_file" << EOF
# Infrastructure Snapshot Summary

**Generated:** $(date)
**Snapshot ID:** $TIMESTAMP
**Script:** $SCRIPT_NAME
**Directory:** $SNAPSHOT_DIR

## Snapshot Statistics
- **Total Hosts:** ${#HOSTS[@]}
- **Total Files:** $(find "$SNAPSHOT_DIR" -type f | wc -l)
- **Total Size:** $(du -sh "$SNAPSHOT_DIR" | cut -f1)
- **Errors During Collection:** $ERROR_COUNT
- **Warnings During Collection:** $WARNING_COUNT

## Host Overview
| Host | IP | Docker Containers | Database | Config Backup |
|------|----|--------------------|----------|---------------|
EOF

    # Generate host table
    local i host ip host_dir container_count db_status config_status
    for i in "${!HOSTS[@]}"; do
        host="${HOSTS[$i]}"
        ip="${HOST_IPS[$i]}"
        host_dir="$SNAPSHOT_DIR/$host"

        # Count Docker containers: non-empty lines after the first one. The first
        # line is the header row of docker's "table" format; a one-line
        # "Failed to collect ..." placeholder therefore also counts as 0.
        container_count=0
        if [[ -f "$host_dir/docker_containers.txt" ]]; then
            container_count=$(awk 'NR > 1 && NF { n++ } END { print n + 0 }' "$host_dir/docker_containers.txt" 2>/dev/null) || container_count=0
        fi

        # Check database status
        db_status="None"
        if [[ -f "$SNAPSHOT_DIR/database_dumps/postgres_dump_${host}.sql" ]]; then
            db_status="PostgreSQL"
        elif [[ -f "$SNAPSHOT_DIR/database_dumps/mysql_dump_${host}.sql" ]]; then
            db_status="MySQL"
        elif [[ -f "$SNAPSHOT_DIR/database_dumps/postgres_dump_${host}.info" ]]; then
            db_status="No DB"
        fi

        # Check config backup status
        config_status="❌ Failed"
        if [[ -f "$SNAPSHOT_DIR/config_backup_${host}.tar.gz" ]]; then
            config_status="✅ Success"
        elif [[ -f "$SNAPSHOT_DIR/config_backup_${host}.error" ]]; then
            config_status="⚠️ Error"
        fi

        echo "| **$host** | $ip | $container_count | $db_status | $config_status |" >> "$summary_file"
    done

    # Add critical services section
    cat >> "$summary_file" << EOF

## Critical Services Detected
EOF

    # Search for critical services across all hosts (case-insensitive match
    # anywhere in the container listing)
    local critical_services=("immich" "jellyfin" "homeassistant" "appflowy" "paperless" "portainer" "traefik" "nginx" "apache")
    local service found_hosts
    for service in "${critical_services[@]}"; do
        found_hosts=()
        for host in "${HOSTS[@]}"; do
            if [[ -f "$SNAPSHOT_DIR/$host/docker_containers.txt" ]] && grep -qi "$service" "$SNAPSHOT_DIR/$host/docker_containers.txt" 2>/dev/null; then
                found_hosts+=("$host")
            fi
        done

        if [[ ${#found_hosts[@]} -gt 0 ]]; then
            echo "- **$service**: ${found_hosts[*]}" >> "$summary_file"
        fi
    done

    # Add validation results
    cat >> "$summary_file" << EOF

## Data Validation Results
EOF

    # Validate database dumps
    local postgres_dumps mysql_dumps
    postgres_dumps=$(find "$SNAPSHOT_DIR/database_dumps" -name "postgres_dump_*.sql" 2>/dev/null | wc -l)
    mysql_dumps=$(find "$SNAPSHOT_DIR/database_dumps" -name "mysql_dump_*.sql" 2>/dev/null | wc -l)

    echo "- **PostgreSQL Dumps:** $postgres_dumps" >> "$summary_file"
    echo "- **MySQL Dumps:** $mysql_dumps" >> "$summary_file"

    # Validate config backups
    local successful_backups failed_backups
    successful_backups=$(find "$SNAPSHOT_DIR" -name "config_backup_*.tar.gz" 2>/dev/null | wc -l)
    failed_backups=$(find "$SNAPSHOT_DIR" -name "config_backup_*.error" 2>/dev/null | wc -l)

    echo "- **Successful Config Backups:** $successful_backups" >> "$summary_file"
    echo "- **Failed Config Backups:** $failed_backups" >> "$summary_file"

    # Add next steps (tree is used for the listing when available, otherwise the
    # first 50 files reported by find)
    cat >> "$summary_file" << EOF

## Next Steps
1. **Verify Data Integrity:** Run validation scripts on dumps and backups
2. **Test Restoration:** Test restore procedures in staging environment
3. **Security Review:** Ensure no sensitive data in backups
4. **Storage:** Move snapshot to secure long-term storage

## Files and Directories
\`\`\`
$(tree "$SNAPSHOT_DIR" 2>/dev/null || find "$SNAPSHOT_DIR" -type f | head -50)
\`\`\`

## Logs and Errors
- **Log File:** $LOG_FILE
- **Error Log:** $ERROR_LOG
- **Error Count:** $ERROR_COUNT
- **Warning Count:** $WARNING_COUNT
EOF

    log_success "Comprehensive summary created: $summary_file"
}

# Main execution function
# Runs the whole snapshot: prerequisite and disk checks, host validation,
# Docker collection, database dumps, configuration backups, system/network
# information, summary, "latest" symlink and final validation.
# Returns 1 when the snapshot directory cannot be created or when 10 or fewer
# files were collected without any recorded error.
main() {
    log_step "Starting enhanced infrastructure documentation..."

    # Register cleanup and rollback functions
    register_cleanup cleanup_snapshot
    register_rollback rollback_snapshot

    # Validate prerequisites
    validate_prerequisites ssh scp ping docker tar gzip

    # Check available disk space
    check_disk_space "$REQUIRED_SPACE_GB" "/opt/migration/backups"

    # Create snapshot directory
    log_step "Creating snapshot directory: $SNAPSHOT_DIR"
    if ! mkdir -p "$SNAPSHOT_DIR"; then
        log_error "Failed to create snapshot directory: $SNAPSHOT_DIR"
        return 1
    fi
    chmod 755 "$SNAPSHOT_DIR"

    # Create checkpoint. The output is captured (and otherwise unused) so it is
    # not printed; a failure here is tolerated, as before.
    local checkpoint
    checkpoint=$(create_checkpoint "snapshot_start") || true

    local i host host_dir

    # Validate all host connectivity first
    # NOTE(review): the result is not acted upon - hosts that fail validation
    # are still processed by the steps below.
    log_step "Validating host connectivity..."
    for i in "${!HOSTS[@]}"; do
        validate_host_access "${HOSTS[$i]}" "${HOST_IPS[$i]}"
    done

    # Collect Docker information from all hosts
    log_step "Collecting Docker information from all hosts..."
    for host in "${HOSTS[@]}"; do
        host_dir="$SNAPSHOT_DIR/$host"

        collect_docker_info "$host" "$host_dir"

        # Create individual checkpoint for each host
        create_checkpoint "docker_collected_$host"
    done

    # Create database dumps
    create_database_dumps
    create_checkpoint "database_dumps_complete"

    # Backup configurations
    backup_configurations
    create_checkpoint "config_backups_complete"

    # Collect additional system information
    log_step "Collecting system information..."
    for host in "${HOSTS[@]}"; do
        host_dir="$SNAPSHOT_DIR/$host"

        log_info "Collecting system info from $host..."

        # System information
        if ssh -o ConnectTimeout=10 "$host" "uname -a && echo '---' && df -h && echo '---' && free -h && echo '---' && uptime && echo '---' && ps aux --sort=-%cpu | head -20" > "$host_dir/system_info.txt" 2>/dev/null; then
            log_success "System info collected from $host"
        else
            log_warn "Failed to collect system info from $host"
        fi

        # Network information
        if ssh -o ConnectTimeout=10 "$host" "ip addr show && echo '---' && ip route show && echo '---' && ss -tulpn" > "$host_dir/network_info.txt" 2>/dev/null; then
            log_success "Network info collected from $host"
        else
            log_warn "Failed to collect network info from $host"
        fi
    done

    # Create comprehensive summary
    create_comprehensive_summary

    # Create symbolic link to latest snapshot (-n: replace an existing link
    # instead of creating a new one inside the directory it points to)
    local latest_link="/opt/migration/backups/latest"
    ln -sfn "$SNAPSHOT_DIR" "$latest_link"
    log_info "Latest snapshot linked to: $latest_link"

    # Final validation
    log_step "Performing final validation..."
    local total_files total_size
    total_files=$(find "$SNAPSHOT_DIR" -type f | wc -l)
    total_size=$(du -sh "$SNAPSHOT_DIR" | cut -f1)

    if [[ $total_files -gt 10 ]] && [[ $ERROR_COUNT -eq 0 ]]; then
        log_success "✅ Infrastructure documentation completed successfully!"
        log_success "📊 Snapshot statistics: $total_files files, $total_size total"
        log_success "📁 Snapshot location: $SNAPSHOT_DIR"
    elif [[ $ERROR_COUNT -gt 0 ]]; then
        log_warn "⚠️ Infrastructure documentation completed with $ERROR_COUNT errors"
        log_info "📊 Partial snapshot: $total_files files, $total_size total"
        log_info "📁 Location: $SNAPSHOT_DIR"
    else
        log_error "❌ Infrastructure documentation may have failed - too few files collected"
        return 1
    fi

    # Display summary
    if [[ -f "$SNAPSHOT_DIR/comprehensive_summary.md" ]]; then
        echo ""
        echo "=== SNAPSHOT SUMMARY ==="
        head -30 "$SNAPSHOT_DIR/comprehensive_summary.md"
        echo ""
        echo "Full summary available at: $SNAPSHOT_DIR/comprehensive_summary.md"
    fi
}

# Execute main function
main "$@"