Files
HomeAudit/migration_scripts/scripts/document_current_state_enhanced.sh
2025-08-24 11:13:39 -04:00

481 lines
19 KiB
Bash
Executable File

#!/bin/bash
# Enhanced Document Current Infrastructure State
#
# Creates a timestamped snapshot of the current infrastructure under
# /opt/migration/backups. Helper functions used by this script that are not
# defined in it (log_*, execute_with_retry, create_checkpoint, ...) are
# expected to be provided by lib/error_handling.sh, located next to the script.

# Import error handling library
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Fail fast with a clear message: without the library every helper call below
# would only produce "command not found" errors.
if [[ ! -r "$SCRIPT_DIR/lib/error_handling.sh" ]]; then
  printf 'ERROR: required library not found or unreadable: %s\n' \
    "$SCRIPT_DIR/lib/error_handling.sh" >&2
  exit 1
fi
# shellcheck source=lib/error_handling.sh
source "$SCRIPT_DIR/lib/error_handling.sh"

# Configuration
# HOSTS and HOST_IPS are parallel arrays: HOST_IPS[i] is the address of HOSTS[i].
readonly HOSTS=("omv800" "fedora" "surface" "jonathan-2518f5u" "audrey" "raspberrypi")
readonly HOST_IPS=("192.168.50.229" "192.168.50.225" "192.168.50.254" "192.168.50.181" "192.168.50.145" "192.168.50.107")
# Assign first, then mark readonly, so a failing `date` is not masked.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly TIMESTAMP
readonly SNAPSHOT_DIR="/opt/migration/backups/snapshot_${TIMESTAMP}"
readonly REQUIRED_SPACE_GB=5 # Require 5GB free space
# NOTE(review): the two timeouts below are not referenced by the functions
# visible in this script; they may be consumed by the sourced library - confirm.
readonly CONNECTION_TIMEOUT=30
readonly SSH_TIMEOUT=60
# Cleanup function
#
# Removes temporary archives/dumps from /tmp on every host in HOSTS and, when
# ERROR_COUNT is greater than zero, deletes the local snapshot directory.
#
# Globals:  HOSTS (read), SNAPSHOT_DIR (read), ERROR_COUNT (read)
# Returns:  status of the last command; remote/local removal failures are
#           deliberately ignored (best-effort cleanup).
cleanup_snapshot() {
  # Keep the loop variable local so callers' own "host" is not clobbered.
  local host

  log_info "Cleaning up temporary files..."

  # Clean up temporary files on remote hosts.
  # BatchMode prevents the cleanup path from hanging on a password prompt.
  for host in "${HOSTS[@]}"; do
    ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" \
      "rm -f /tmp/*_backup_${host}.tar.gz /tmp/*_dump_${host}.sql" 2>/dev/null || true
  done

  # Clean up incomplete snapshot if error occurred
  if [[ -d "$SNAPSHOT_DIR" ]] && [[ ${ERROR_COUNT:-0} -gt 0 ]]; then
    log_warn "Removing incomplete snapshot directory: $SNAPSHOT_DIR"
    # ":?" is a last-resort guard against ever expanding to an empty path.
    rm -rf -- "${SNAPSHOT_DIR:?}" 2>/dev/null || true
  fi
}
# Rollback function
#
# Undoes a snapshot run: deletes the snapshot directory if it exists, then
# delegates the removal of temporary files to cleanup_snapshot.
#
# Globals:  SNAPSHOT_DIR (read)
# Returns:  exit status of cleanup_snapshot
rollback_snapshot() {
  log_info "Rolling back snapshot creation..."

  # Drop whatever part of the snapshot tree has been created so far.
  [[ ! -d "$SNAPSHOT_DIR" ]] || {
    rm -rf "$SNAPSHOT_DIR"
    log_info "Removed partial snapshot directory"
  }

  # Temporary files are handled by the regular cleanup routine.
  cleanup_snapshot
}
# Function to validate host accessibility
#
# Checks that a host answers a ping, accepts a non-interactive SSH login, and
# reports how much space is free in its /tmp.
#
# Arguments: $1 - host name used for ssh
#            $2 - IP address used for ping
# Returns:   0 when ping and SSH both succeed (low or unknown disk space only
#            produces a warning), 1 otherwise.
validate_host_access() {
  local host=$1
  local ip=$2
  local available_gb

  log_info "Validating access to $host ($ip)..."

  # Test ping connectivity
  if ! ping -c 1 -W 5 "$ip" >/dev/null 2>&1; then
    log_error "Cannot ping $host ($ip)"
    return 1
  fi

  # Test SSH connectivity
  if ! ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" "echo 'SSH OK'" >/dev/null 2>&1; then
    log_error "Cannot SSH to $host"
    return 1
  fi

  # Check if host has sufficient disk space for temporary files.
  # Same non-interactive options as the connectivity test so this probe cannot
  # hang; the result is validated before it is used in arithmetic.
  available_gb=$(ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" \
    "df -BG /tmp | awk 'NR==2 {print \$4}' | sed 's/G//'" 2>/dev/null) || available_gb=""
  if [[ ! $available_gb =~ ^[0-9]+$ ]]; then
    log_warn "Could not determine free disk space in /tmp on $host"
  elif ((10#$available_gb < 1)); then # 10# avoids octal parsing of e.g. "08"
    log_warn "$host has limited disk space: ${available_gb}GB"
  fi

  log_success "Host $host is accessible and ready"
  return 0
}
# Helper: run one remote Docker listing command and store its output.
#
# Arguments: $1 - host
#            $2 - local output file
#            $3 - label used in log messages and in the failure marker
#            $4 - logging function to call on failure (log_error or log_warn)
#            $5, $6 - passed through as the first two arguments of
#                     execute_with_retry (presumably attempts and delay -
#                     confirm against lib/error_handling.sh)
#            $7 - command line to run on the remote host
# On failure the output file is overwritten with a one-line failure marker.
_collect_docker_listing() {
  local host=$1 out_file=$2 label=$3 fail_logger=$4
  local attempts=$5 delay=$6 remote_cmd=$7

  if execute_with_retry "$attempts" "$delay" ssh -o ConnectTimeout=10 "$host" "$remote_cmd" > "$out_file"; then
    log_success "Docker $label collected from $host"
  else
    "$fail_logger" "Failed to collect Docker $label from $host"
    echo "Failed to collect Docker $label" > "$out_file"
  fi
}

# Function to collect Docker information with error handling
#
# Writes container, image, network, volume and disk-usage listings for one
# host into its snapshot directory, then discovers compose files under /opt
# and /home (at most 20) and copies their contents.
#
# Arguments: $1 - host name used for ssh
#            $2 - local directory receiving the collected files (created)
# Outputs:   docker_*.txt, compose_files.txt and compose_files/* in $2
collect_docker_info() {
  local host=$1
  local host_dir=$2
  local list_file="$host_dir/compose_files.txt"
  local compose_dir="$host_dir/compose_files"
  local compose_file base_name target remote_path
  local sq="'" esc_sq="'\\''"

  log_info "Collecting Docker information from $host..."

  # Create host directory
  mkdir -p "$host_dir"

  # Only a failed container listing is treated as an error; the remaining
  # listings downgrade to warnings.
  _collect_docker_listing "$host" "$host_dir/docker_containers.txt" "containers" log_error 3 5 \
    "timeout 30 docker ps -a --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}\t{{.CreatedAt}}\t{{.Size}}'"
  _collect_docker_listing "$host" "$host_dir/docker_images.txt" "images" log_warn 3 5 \
    "timeout 30 docker images --format 'table {{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}\t{{.Size}}'"
  _collect_docker_listing "$host" "$host_dir/docker_networks.txt" "networks" log_warn 3 5 \
    "timeout 30 docker network ls --format 'table {{.ID}}\t{{.Name}}\t{{.Driver}}\t{{.Scope}}'"
  _collect_docker_listing "$host" "$host_dir/docker_volumes.txt" "volumes" log_warn 3 5 \
    "timeout 30 docker volume ls --format 'table {{.Driver}}\t{{.Name}}'"
  _collect_docker_listing "$host" "$host_dir/docker_system_df.txt" "system info" log_warn 2 10 \
    "timeout 60 docker system df -v"

  # Docker compose files discovery
  if execute_with_retry 2 10 ssh -o ConnectTimeout=10 "$host" "find /opt /home -name 'docker-compose*.yml' -o -name 'compose*.yml' 2>/dev/null | head -20" > "$list_file"; then
    log_success "Docker compose files discovered on $host"

    # Collect compose file contents
    mkdir -p "$compose_dir"
    while IFS= read -r compose_file; do
      [[ -n "$compose_file" ]] || continue

      base_name=${compose_file##*/}
      # Local name: <basename>_<full path with "/" replaced by "_">
      target="$compose_dir/${base_name}_${compose_file//\//_}"

      # Single-quote the path for the remote shell, escaping embedded quotes.
      remote_path=${compose_file//"$sq"/"$esc_sq"}
      remote_path="'${remote_path}'"

      # stdin must come from /dev/null: otherwise ssh swallows the rest of
      # the file list feeding this loop and only the first file is fetched.
      if ssh -o ConnectTimeout=10 "$host" "cat $remote_path" < /dev/null > "$target" 2>/dev/null; then
        log_debug "Collected compose file: $compose_file"
      else
        # Do not leave an empty placeholder behind for a failed copy.
        rm -f -- "$target"
        log_warn "Failed to collect compose file $compose_file from $host"
      fi
    done < "$list_file"
  else
    log_warn "Failed to discover Docker compose files on $host"
    echo "Failed to discover compose files" > "$list_file"
  fi
}
# Helper: print the size in bytes of a file on a remote host.
# Prints 0 when the size cannot be determined or the reply is not a number.
# Arguments: $1 - host, $2 - remote path (must not need shell quoting)
_db_remote_file_size() {
  local host=$1 remote_path=$2 size

  # GNU stat syntax first, BSD syntax as fallback.
  size=$(ssh -o ConnectTimeout=10 "$host" \
    "stat -c%s $remote_path 2>/dev/null || stat -f%z $remote_path 2>/dev/null || echo 0" 2>/dev/null) || size=0
  [[ $size =~ ^[0-9]+$ ]] || size=0
  printf '%s\n' "$size"
}

# Function to create database dumps with validation
#
# Dumps PostgreSQL (fixed host list) and MySQL/MariaDB (all HOSTS) databases
# running in Docker containers and copies the dumps into
# $SNAPSHOT_DIR/database_dumps. The dump is first written to
# /tmp/<engine>_dump_<host>.sql on the remote host and then fetched with scp.
#
# Marker files written next to the dumps:
#   *_dump_<host>.info   - no matching container on that host
#   *_dump_<host>.error  - dump command failed or dump is 100 bytes or less
#
# Globals: SNAPSHOT_DIR (read), HOSTS (read)
create_database_dumps() {
  local dump_dir="$SNAPSHOT_DIR/database_dumps"
  local postgres_hosts=("omv800" "surface" "jonathan-2518f5u")
  local host postgres_container mysql_container dump_size remote_dump

  log_step "Creating database dumps..."
  mkdir -p "$dump_dir"

  # PostgreSQL dumps from hosts with PostgreSQL containers
  for host in "${postgres_hosts[@]}"; do
    log_info "Processing PostgreSQL dumps from $host..."

    # Check if PostgreSQL container exists
    if ! ssh -o ConnectTimeout=10 "$host" "docker ps | grep -i postgres" >/dev/null 2>&1; then
      log_info "No PostgreSQL container found on $host"
      echo "No PostgreSQL container found" > "$dump_dir/postgres_dump_${host}.info"
      continue
    fi
    log_info "PostgreSQL container found on $host, creating dump..."

    # Get PostgreSQL container ID
    postgres_container=$(ssh -o ConnectTimeout=10 "$host" \
      "docker ps --filter 'ancestor=postgres' --format '{{.ID}}' | head -1" 2>/dev/null) || postgres_container=""
    if [[ -z "$postgres_container" ]]; then
      log_warn "No PostgreSQL container ID found on $host"
      echo "No PostgreSQL container found" > "$dump_dir/postgres_dump_${host}.info"
      continue
    fi

    # Create database dump with timeout
    remote_dump="/tmp/postgres_dump_${host}.sql"
    if ! execute_with_retry 2 30 ssh "$host" "timeout 300 docker exec $postgres_container pg_dumpall -U postgres > $remote_dump"; then
      log_error "Failed to create PostgreSQL dump on $host"
      echo "Failed to create PostgreSQL dump" > "$dump_dir/postgres_dump_${host}.error"
      continue
    fi

    # Verify dump was created and has content (at least 100 bytes)
    dump_size=$(_db_remote_file_size "$host" "$remote_dump")
    if [[ $dump_size -le 100 ]]; then
      log_warn "PostgreSQL dump from $host is too small or empty"
      echo "PostgreSQL dump failed or empty" > "$dump_dir/postgres_dump_${host}.error"
      continue
    fi

    if scp -o ConnectTimeout=30 "$host:$remote_dump" "$dump_dir/"; then
      log_success "PostgreSQL dump created for $host (${dump_size} bytes)"
    else
      log_error "Failed to copy PostgreSQL dump from $host"
    fi
  done

  # MySQL/MariaDB dumps if present. Failures are reported and recorded the
  # same way as for PostgreSQL instead of being silently skipped.
  for host in "${HOSTS[@]}"; do
    ssh -o ConnectTimeout=10 "$host" "docker ps | grep -i -E 'mysql|mariadb'" >/dev/null 2>&1 || continue
    log_info "MySQL/MariaDB container found on $host, creating dump..."

    mysql_container=$(ssh -o ConnectTimeout=10 "$host" \
      "docker ps --filter 'ancestor=mysql' --filter 'ancestor=mariadb' --format '{{.ID}}' | head -1" 2>/dev/null) || mysql_container=""
    if [[ -z "$mysql_container" ]]; then
      log_warn "No MySQL/MariaDB container ID found on $host"
      echo "No MySQL/MariaDB container found" > "$dump_dir/mysql_dump_${host}.info"
      continue
    fi

    # NOTE(review): mysqldump is invoked without credentials; this only works
    # if the container provides them (e.g. via its client config) - confirm.
    remote_dump="/tmp/mysql_dump_${host}.sql"
    if ! execute_with_retry 2 30 ssh "$host" "timeout 300 docker exec $mysql_container mysqldump --all-databases > $remote_dump"; then
      log_error "Failed to create MySQL dump on $host"
      echo "Failed to create MySQL dump" > "$dump_dir/mysql_dump_${host}.error"
      continue
    fi

    dump_size=$(_db_remote_file_size "$host" "$remote_dump")
    if [[ $dump_size -le 100 ]]; then
      log_warn "MySQL dump from $host is too small or empty"
      echo "MySQL dump failed or empty" > "$dump_dir/mysql_dump_${host}.error"
      continue
    fi

    if scp -o ConnectTimeout=30 "$host:$remote_dump" "$dump_dir/"; then
      log_success "MySQL dump created for $host"
    else
      log_error "Failed to copy MySQL dump from $host"
    fi
  done
}
# Function to backup configurations safely
#
# For every host in HOSTS: archives /etc/docker, /opt and /home/*/.config into
# /tmp/config_backup_<host>.tar.gz on the host, then copies the archive into
# SNAPSHOT_DIR. Archives of 1000 bytes or less are treated as failed and a
# config_backup_<host>.error marker is written instead.
#
# Globals: HOSTS (read), SNAPSHOT_DIR (read)
backup_configurations() {
  local config_dirs=("/etc/docker" "/opt" "/home/*/.config")
  local host backup_size remote_archive remote_dirs

  log_step "Backing up configurations..."

  # Join with single spaces explicitly; "${config_dirs[*]}" would depend on
  # the current IFS. The glob is expanded by the remote shell.
  printf -v remote_dirs '%s ' "${config_dirs[@]}"
  remote_dirs=${remote_dirs% }

  for host in "${HOSTS[@]}"; do
    remote_archive="/tmp/config_backup_${host}.tar.gz"
    log_info "Backing up configurations from $host..."

    # tar errors (e.g. missing directories) are tolerated by the remote
    # "|| echo", so this branch only fails when the ssh call itself fails.
    if ! execute_with_retry 2 60 ssh -o ConnectTimeout=10 "$host" "timeout 600 tar czf $remote_archive $remote_dirs 2>/dev/null || echo 'Some configs may be missing'"; then
      log_error "Failed to create configuration backup on $host"
      echo "Failed to create configuration backup" > "$SNAPSHOT_DIR/config_backup_${host}.error"
      continue
    fi

    # Check if backup file was created (GNU stat first, BSD stat fallback).
    backup_size=$(ssh -o ConnectTimeout=10 "$host" \
      "stat -c%s $remote_archive 2>/dev/null || stat -f%z $remote_archive 2>/dev/null || echo 0" 2>/dev/null) || backup_size=0
    # Anything that is not a plain number (banner text, empty reply) counts
    # as "no archive" instead of breaking the numeric comparison below.
    [[ $backup_size =~ ^[0-9]+$ ]] || backup_size=0

    if [[ $backup_size -gt 1000 ]]; then # At least 1KB
      if scp -o ConnectTimeout=60 "$host:$remote_archive" "$SNAPSHOT_DIR/"; then
        log_success "Configuration backup created for $host (${backup_size} bytes)"
      else
        log_error "Failed to copy configuration backup from $host"
      fi
    else
      log_warn "Configuration backup from $host is too small"
      echo "Configuration backup failed or too small" > "$SNAPSHOT_DIR/config_backup_${host}.error"
    fi
  done
}
# Function to create comprehensive summary with validation
#
# Writes $SNAPSHOT_DIR/comprehensive_summary.md from the files already present
# in the snapshot: overall statistics, one table row per host, the critical
# services found in the container listings, dump/backup counts and a file
# listing.
#
# Globals: SNAPSHOT_DIR, HOSTS, HOST_IPS, TIMESTAMP, SCRIPT_NAME, ERROR_COUNT,
#          WARNING_COUNT, LOG_FILE, ERROR_LOG (all read)
create_comprehensive_summary() {
  local summary_file="$SNAPSHOT_DIR/comprehensive_summary.md"
  local dumps_dir="$SNAPSHOT_DIR/database_dumps"
  local critical_services=("immich" "jellyfin" "homeassistant" "appflowy" "paperless" "portainer" "traefik" "nginx" "apache")
  local i host ip host_dir service
  local container_count db_status config_status
  local postgres_dumps mysql_dumps successful_backups failed_backups
  local -a found_hosts

  log_step "Creating comprehensive summary report..."

  cat > "$summary_file" << EOF
# Infrastructure Snapshot Summary

**Generated:** $(date)
**Snapshot ID:** $TIMESTAMP
**Script:** $SCRIPT_NAME
**Directory:** $SNAPSHOT_DIR

## Snapshot Statistics

- **Total Hosts:** ${#HOSTS[@]}
- **Total Files:** $(find "$SNAPSHOT_DIR" -type f | wc -l)
- **Total Size:** $(du -sh "$SNAPSHOT_DIR" | cut -f1)
- **Errors During Collection:** $ERROR_COUNT
- **Warnings During Collection:** $WARNING_COUNT

## Host Overview

| Host | IP | Docker Containers | Database | Config Backup |
|------|----|--------------------|----------|---------------|
EOF

  # Generate host table
  for i in "${!HOSTS[@]}"; do
    host="${HOSTS[$i]}"
    ip="${HOST_IPS[$i]}"
    host_dir="$SNAPSHOT_DIR/$host"

    # Count Docker containers: non-empty lines of the listing, excluding the
    # "NAMES ..." table header and the one-line failure marker written when
    # collection failed. awk always prints exactly one number.
    container_count=0
    if [[ -f "$host_dir/docker_containers.txt" ]]; then
      container_count=$(awk '
        NF && !/^NAMES([[:space:]]|$)/ && !/^Failed to collect / { n++ }
        END { print n + 0 }
      ' "$host_dir/docker_containers.txt" 2>/dev/null) || container_count=0
    fi

    # Check database status
    db_status="None"
    if [[ -f "$dumps_dir/postgres_dump_${host}.sql" ]]; then
      db_status="PostgreSQL"
    elif [[ -f "$dumps_dir/mysql_dump_${host}.sql" ]]; then
      db_status="MySQL"
    elif [[ -f "$dumps_dir/postgres_dump_${host}.info" ]]; then
      db_status="No DB"
    fi

    # Check config backup status
    config_status="❌ Failed"
    if [[ -f "$SNAPSHOT_DIR/config_backup_${host}.tar.gz" ]]; then
      config_status="✅ Success"
    elif [[ -f "$SNAPSHOT_DIR/config_backup_${host}.error" ]]; then
      config_status="⚠️ Error"
    fi

    echo "| **$host** | $ip | $container_count | $db_status | $config_status |" >> "$summary_file"
  done

  # Add critical services section
  cat >> "$summary_file" << EOF

## Critical Services Detected

EOF

  # Search for critical services across all hosts
  for service in "${critical_services[@]}"; do
    found_hosts=()
    for host in "${HOSTS[@]}"; do
      if [[ -f "$SNAPSHOT_DIR/$host/docker_containers.txt" ]] \
        && grep -qi "$service" "$SNAPSHOT_DIR/$host/docker_containers.txt" 2>/dev/null; then
        found_hosts+=("$host")
      fi
    done
    if [[ ${#found_hosts[@]} -gt 0 ]]; then
      echo "- **$service**: ${found_hosts[*]}" >> "$summary_file"
    fi
  done

  # Add validation results
  cat >> "$summary_file" << EOF

## Data Validation Results

EOF

  # Validate database dumps
  postgres_dumps=$(find "$dumps_dir" -name "postgres_dump_*.sql" 2>/dev/null | wc -l) || true
  mysql_dumps=$(find "$dumps_dir" -name "mysql_dump_*.sql" 2>/dev/null | wc -l) || true
  echo "- **PostgreSQL Dumps:** $postgres_dumps" >> "$summary_file"
  echo "- **MySQL Dumps:** $mysql_dumps" >> "$summary_file"

  # Validate config backups
  successful_backups=$(find "$SNAPSHOT_DIR" -name "config_backup_*.tar.gz" 2>/dev/null | wc -l) || true
  failed_backups=$(find "$SNAPSHOT_DIR" -name "config_backup_*.error" 2>/dev/null | wc -l) || true
  echo "- **Successful Config Backups:** $successful_backups" >> "$summary_file"
  echo "- **Failed Config Backups:** $failed_backups" >> "$summary_file"

  # Add next steps
  cat >> "$summary_file" << EOF

## Next Steps

1. **Verify Data Integrity:** Run validation scripts on dumps and backups
2. **Test Restoration:** Test restore procedures in staging environment
3. **Security Review:** Ensure no sensitive data in backups
4. **Storage:** Move snapshot to secure long-term storage

## Files and Directories

\`\`\`
$(tree "$SNAPSHOT_DIR" 2>/dev/null || find "$SNAPSHOT_DIR" -type f | head -50)
\`\`\`

## Logs and Errors

- **Log File:** $LOG_FILE
- **Error Log:** $ERROR_LOG
- **Error Count:** $ERROR_COUNT
- **Warning Count:** $WARNING_COUNT
EOF

  log_success "Comprehensive summary created: $summary_file"
}
# Main execution function
#
# Runs the whole snapshot workflow: registers cleanup/rollback handlers,
# checks prerequisites and disk space, creates the snapshot directory,
# validates the hosts, collects Docker data, database dumps, configuration
# archives and system/network information, writes the summary and updates the
# "latest" symlink.
#
# Globals: HOSTS, HOST_IPS, SNAPSHOT_DIR, REQUIRED_SPACE_GB, ERROR_COUNT (read)
# Returns: 1 when the snapshot directory cannot be created or when 10 or fewer
#          files were collected without any recorded error; otherwise the
#          status of the last command run.
main() {
  local i host host_dir checkpoint
  local latest_link="/opt/migration/backups/latest"
  local total_files total_size

  log_step "Starting enhanced infrastructure documentation..."

  # Register cleanup and rollback functions
  register_cleanup cleanup_snapshot
  register_rollback rollback_snapshot

  # Validate prerequisites
  validate_prerequisites ssh scp ping docker tar gzip

  # Check available disk space
  check_disk_space "$REQUIRED_SPACE_GB" "/opt/migration/backups"

  # Create snapshot directory. Every later step writes below it, so stop
  # right away if it cannot be created.
  log_step "Creating snapshot directory: $SNAPSHOT_DIR"
  if ! mkdir -p "$SNAPSHOT_DIR"; then
    log_error "Failed to create snapshot directory: $SNAPSHOT_DIR"
    return 1
  fi
  # NOTE(review): 755 leaves database dumps and config archives readable by
  # every local user - confirm this is intended.
  chmod 755 "$SNAPSHOT_DIR"

  # Create checkpoint. The output is captured (and not used further here),
  # which keeps it off stdout; a failure is ignored as before.
  checkpoint=$(create_checkpoint "snapshot_start") || true

  # Validate all host connectivity first
  log_step "Validating host connectivity..."
  for i in "${!HOSTS[@]}"; do
    validate_host_access "${HOSTS[$i]}" "${HOST_IPS[$i]}"
  done

  # Collect Docker information from all hosts
  log_step "Collecting Docker information from all hosts..."
  for i in "${!HOSTS[@]}"; do
    host="${HOSTS[$i]}"
    host_dir="$SNAPSHOT_DIR/$host"
    collect_docker_info "$host" "$host_dir"
    # Create individual checkpoint for each host
    create_checkpoint "docker_collected_$host"
  done

  # Create database dumps
  create_database_dumps
  create_checkpoint "database_dumps_complete"

  # Backup configurations
  backup_configurations
  create_checkpoint "config_backups_complete"

  # Collect additional system information
  log_step "Collecting system information..."
  for i in "${!HOSTS[@]}"; do
    host="${HOSTS[$i]}"
    host_dir="$SNAPSHOT_DIR/$host"
    log_info "Collecting system info from $host..."

    # System information
    if ssh -o ConnectTimeout=10 "$host" "uname -a && echo '---' && df -h && echo '---' && free -h && echo '---' && uptime && echo '---' && ps aux --sort=-%cpu | head -20" > "$host_dir/system_info.txt" 2>/dev/null; then
      log_success "System info collected from $host"
    else
      log_warn "Failed to collect system info from $host"
    fi

    # Network information
    if ssh -o ConnectTimeout=10 "$host" "ip addr show && echo '---' && ip route show && echo '---' && ss -tulpn" > "$host_dir/network_info.txt" 2>/dev/null; then
      log_success "Network info collected from $host"
    else
      log_warn "Failed to collect network info from $host"
    fi
  done

  # Create comprehensive summary
  create_comprehensive_summary

  # Create symbolic link to latest snapshot; only report success if the link
  # was really updated.
  if ln -sfn "$SNAPSHOT_DIR" "$latest_link"; then
    log_info "Latest snapshot linked to: $latest_link"
  else
    log_warn "Could not update latest snapshot link: $latest_link"
  fi

  # Final validation
  log_step "Performing final validation..."
  total_files=$(find "$SNAPSHOT_DIR" -type f | wc -l) || true
  total_size=$(du -sh "$SNAPSHOT_DIR" | cut -f1) || true

  if [[ $total_files -gt 10 ]] && [[ $ERROR_COUNT -eq 0 ]]; then
    log_success "✅ Infrastructure documentation completed successfully!"
    log_success "📊 Snapshot statistics: $total_files files, $total_size total"
    log_success "📁 Snapshot location: $SNAPSHOT_DIR"
  elif [[ $ERROR_COUNT -gt 0 ]]; then
    log_warn "⚠️ Infrastructure documentation completed with $ERROR_COUNT errors"
    log_info "📊 Partial snapshot: $total_files files, $total_size total"
    log_info "📁 Location: $SNAPSHOT_DIR"
  else
    log_error "❌ Infrastructure documentation may have failed - too few files collected"
    return 1
  fi

  # Display summary
  if [[ -f "$SNAPSHOT_DIR/comprehensive_summary.md" ]]; then
    echo ""
    echo "=== SNAPSHOT SUMMARY ==="
    head -30 "$SNAPSHOT_DIR/comprehensive_summary.md"
    echo ""
    echo "Full summary available at: $SNAPSHOT_DIR/comprehensive_summary.md"
  fi
}
# Execute main function, forwarding the script's command-line arguments
# unchanged.
main "$@"