#!/bin/bash
#
# Enhanced Document Current Infrastructure State
#
# This script creates a complete snapshot with robust error handling and
# validation: Docker inventory, database dumps, configuration archives and
# system/network details for every host listed in HOSTS.

# Import error handling library.
# The log_*, execute_with_retry, register_*, validate_prerequisites,
# check_disk_space and create_checkpoint helpers called further down are not
# defined in this file; presumably they come from this library - TODO confirm.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ ! -r "$SCRIPT_DIR/lib/error_handling.sh" ]]; then
    # Without the library none of the helpers exist, so stop before any
    # remote command or file removal is attempted.
    printf 'ERROR: cannot read %s\n' "$SCRIPT_DIR/lib/error_handling.sh" >&2
    exit 1
fi
# shellcheck source=lib/error_handling.sh
source "$SCRIPT_DIR/lib/error_handling.sh"

# Configuration
# HOSTS and HOST_IPS are parallel arrays: HOST_IPS[i] is the address of HOSTS[i].
readonly HOSTS=("omv800" "fedora" "surface" "jonathan-2518f5u" "audrey" "raspberrypi")
readonly HOST_IPS=("192.168.50.229" "192.168.50.225" "192.168.50.254" "192.168.50.181" "192.168.50.145" "192.168.50.107")
# Assigned separately from `readonly` so a failing `date` is not masked.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly TIMESTAMP
readonly SNAPSHOT_DIR="/opt/migration/backups/snapshot_${TIMESTAMP}"
readonly REQUIRED_SPACE_GB=5 # Require 5GB free space
# NOTE(review): the two timeouts below are not referenced anywhere in this
# file; possibly they are consumed by the sourced library - confirm.
readonly CONNECTION_TIMEOUT=30
readonly SSH_TIMEOUT=60

# Cleanup function
#
# Removes the temporary archives/dumps this script leaves in /tmp on every
# host and, when errors were counted, deletes the incomplete snapshot
# directory.
# Globals: HOSTS (read), SNAPSHOT_DIR (read), ERROR_COUNT (read; unset = 0)
# Returns: status of the last command; remote failures are ignored on purpose
#          because cleanup is best-effort.
cleanup_snapshot() {
    # Keep the loop variable local so callers' own "host" is not clobbered.
    local host

    log_info "Cleaning up temporary files..."

    # Clean up temporary files on remote hosts.
    # BatchMode stops ssh from waiting on a password prompt during cleanup and
    # -n keeps it from reading the caller's stdin.
    for host in "${HOSTS[@]}"; do
        ssh -n -o ConnectTimeout=10 -o BatchMode=yes "$host" \
            "rm -f /tmp/*_backup_${host}.tar.gz /tmp/*_dump_${host}.sql" 2>/dev/null || true
    done

    # Clean up incomplete snapshot if error occurred
    if [[ -d "$SNAPSHOT_DIR" && ${ERROR_COUNT:-0} -gt 0 ]]; then
        log_warn "Removing incomplete snapshot directory: $SNAPSHOT_DIR"
        # ":?" aborts the expansion instead of running rm -rf on an empty path.
        rm -rf -- "${SNAPSHOT_DIR:?}" 2>/dev/null || true
    fi
}

# Rollback function
#
# Deletes the snapshot directory if it exists and then runs cleanup_snapshot
# to remove the temporary files.
# Globals: SNAPSHOT_DIR (read)
rollback_snapshot() {
    log_info "Rolling back snapshot creation..."

    # Remove any partially created directories
    if [[ -d "$SNAPSHOT_DIR" ]]; then
        # Report the removal only when it actually happened; ":?" refuses to
        # run rm -rf on an empty path.
        if rm -rf -- "${SNAPSHOT_DIR:?}"; then
            log_info "Removed partial snapshot directory"
        else
            log_warn "Could not remove partial snapshot directory: $SNAPSHOT_DIR"
        fi
    fi

    # Remove any temporary files
    cleanup_snapshot
}

# Function to validate host accessibility
#
# Arguments:
#   $1 - host name used for SSH
#   $2 - IP address used for the ping test
# Returns:
#   0 when the host answers one ping and accepts a non-interactive SSH login,
#   1 otherwise. Low or unknown free space in the remote /tmp only produces a
#   warning.
validate_host_access() {
    local host=$1
    local ip=$2
    local available_gb

    log_info "Validating access to $host ($ip)..."

    # Test ping connectivity
    if ! ping -c 1 -W 5 "$ip" >/dev/null 2>&1; then
        log_error "Cannot ping $host ($ip)"
        return 1
    fi

    # Test SSH connectivity
    if ! ssh -n -o ConnectTimeout=10 -o BatchMode=yes "$host" "echo 'SSH OK'" >/dev/null 2>&1; then
        log_error "Cannot SSH to $host"
        return 1
    fi

    # Check if host has sufficient disk space for temporary files.
    # Same timeout/BatchMode as the login test so this probe cannot hang.
    available_gb=$(ssh -n -o ConnectTimeout=10 -o BatchMode=yes "$host" \
        "df -BG /tmp | awk 'NR==2 {print \$4}' | sed 's/G//'" 2>/dev/null) || available_gb=""

    # The remote pipeline reports sed's status, so a failing df shows up as
    # empty output rather than an error; treat anything non-numeric as unknown.
    if [[ ! $available_gb =~ ^[0-9]+$ ]]; then
        log_warn "Could not determine free disk space in /tmp on $host"
    elif (( 10#$available_gb < 1 )); then
        log_warn "$host has limited disk space: ${available_gb}GB"
    fi

    log_success "Host $host is accessible and ready"
    return 0
}

# Function to collect Docker information with error handling
#
# Arguments:
#   $1 - host name to query over SSH
#   $2 - local directory that receives the collected files (created if missing)
# Files written inside $2:
#   docker_containers.txt, docker_images.txt, docker_networks.txt,
#   docker_volumes.txt, docker_system_df.txt, compose_files.txt and
#   compose_files/<basename>_<remote path with '/' replaced by '_'>
# A listing that cannot be collected is replaced by a one-line
# "Failed to collect ..." marker.

# Runs one remote command through execute_with_retry and stores its output.
#   $1 host  $2 output file  $3 label used in messages
#   $4 logger called on failure (log_error or log_warn)
#   $5 $6 first two arguments handed to execute_with_retry  $7 remote command
_collect_docker_listing() {
    local host=$1 out_file=$2 label=$3 fail_logger=$4
    local retries=$5 delay=$6 remote_cmd=$7

    if execute_with_retry "$retries" "$delay" ssh -n -o ConnectTimeout=10 "$host" "$remote_cmd" > "$out_file"; then
        log_success "$label collected from $host"
    else
        "$fail_logger" "Failed to collect $label from $host"
        echo "Failed to collect $label" > "$out_file"
    fi
}

collect_docker_info() {
    local host=$1
    local host_dir=$2

    log_info "Collecting Docker information from $host..."

    # Create host directory
    mkdir -p "$host_dir"

    # Docker containers (a failure here is an error, the others are warnings)
    _collect_docker_listing "$host" "$host_dir/docker_containers.txt" "Docker containers" log_error 3 5 \
        "timeout 30 docker ps -a --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}\t{{.Ports}}\t{{.CreatedAt}}\t{{.Size}}'"

    # Docker images
    _collect_docker_listing "$host" "$host_dir/docker_images.txt" "Docker images" log_warn 3 5 \
        "timeout 30 docker images --format 'table {{.Repository}}\t{{.Tag}}\t{{.ID}}\t{{.CreatedAt}}\t{{.Size}}'"

    # Docker networks
    _collect_docker_listing "$host" "$host_dir/docker_networks.txt" "Docker networks" log_warn 3 5 \
        "timeout 30 docker network ls --format 'table {{.ID}}\t{{.Name}}\t{{.Driver}}\t{{.Scope}}'"

    # Docker volumes
    _collect_docker_listing "$host" "$host_dir/docker_volumes.txt" "Docker volumes" log_warn 3 5 \
        "timeout 30 docker volume ls --format 'table {{.Driver}}\t{{.Name}}'"

    # Docker system information
    _collect_docker_listing "$host" "$host_dir/docker_system_df.txt" "Docker system info" log_warn 2 10 \
        "timeout 60 docker system df -v"

    # Docker compose files discovery (both .yml and .yaml spellings)
    if execute_with_retry 2 10 ssh -n -o ConnectTimeout=10 "$host" \
        "find /opt /home -name 'docker-compose*.yml' -o -name 'docker-compose*.yaml' -o -name 'compose*.yml' -o -name 'compose*.yaml' 2>/dev/null | head -20" \
        > "$host_dir/compose_files.txt"; then
        log_success "Docker compose files discovered on $host"

        # Collect compose file contents
        local compose_dir="$host_dir/compose_files"
        mkdir -p "$compose_dir"

        local compose_file target escaped
        local sq="'"
        local sq_escaped="'\\''"
        while IFS= read -r compose_file; do
            [[ -n "$compose_file" ]] || continue

            target="$compose_dir/${compose_file##*/}_${compose_file//\//_}"
            # Single-quote the path for the remote shell, escaping embedded quotes.
            escaped=${compose_file//$sq/$sq_escaped}

            # -n is essential: without it ssh drains the loop's stdin
            # (compose_files.txt) and every file after the first is skipped.
            if ssh -n -o ConnectTimeout=10 "$host" "cat -- ${sq}${escaped}${sq}" > "$target" 2>/dev/null; then
                log_debug "Collected compose file: $compose_file"
            else
                # Do not leave an empty/partial copy that looks like real content.
                rm -f -- "$target"
                log_warn "Failed to collect compose file from $host: $compose_file"
            fi
        done < "$host_dir/compose_files.txt"
    else
        log_warn "Failed to discover Docker compose files on $host"
        echo "Failed to discover compose files" > "$host_dir/compose_files.txt"
    fi
}

# Function to create database dumps with validation
#
# PostgreSQL: for each host in a fixed list, runs pg_dumpall inside the first
# container reported by `docker ps --filter ancestor=postgres`.
# MySQL/MariaDB: for every host in HOSTS, runs mysqldump inside the first
# matching container.
# A dump is copied to $SNAPSHOT_DIR/database_dumps only when the remote file is
# larger than 100 bytes. Every other outcome leaves a postgres_dump_<host>.info
# / .error or mysql_dump_<host>.error marker in that directory.
# Globals: SNAPSHOT_DIR (read), HOSTS (read)
create_database_dumps() {
    log_step "Creating database dumps..."

    local dump_dir="$SNAPSHOT_DIR/database_dumps"
    mkdir -p "$dump_dir"

    # PostgreSQL dumps from hosts with PostgreSQL containers
    local postgres_hosts=("omv800" "surface" "jonathan-2518f5u")
    # Loop variables are local so the caller's variables are left alone.
    local host postgres_container mysql_container dump_size remote_dump

    for host in "${postgres_hosts[@]}"; do
        log_info "Processing PostgreSQL dumps from $host..."

        # Check if PostgreSQL container exists
        if ! ssh -n -o ConnectTimeout=10 "$host" "docker ps | grep -i postgres" >/dev/null 2>&1; then
            log_info "No PostgreSQL container found on $host"
            echo "No PostgreSQL container found" > "$dump_dir/postgres_dump_${host}.info"
            continue
        fi
        log_info "PostgreSQL container found on $host, creating dump..."

        # Get PostgreSQL container ID
        postgres_container=$(ssh -n -o ConnectTimeout=10 "$host" \
            "docker ps --filter 'ancestor=postgres' --format '{{.ID}}' | head -1" 2>/dev/null) || postgres_container=""
        if [[ -z "$postgres_container" ]]; then
            log_warn "No PostgreSQL container ID found on $host"
            echo "No PostgreSQL container found" > "$dump_dir/postgres_dump_${host}.info"
            continue
        fi

        # Create database dump with timeout
        remote_dump="/tmp/postgres_dump_${host}.sql"
        if ! execute_with_retry 2 30 ssh -n -o ConnectTimeout=10 "$host" \
            "timeout 300 docker exec $postgres_container pg_dumpall -U postgres > $remote_dump"; then
            log_error "Failed to create PostgreSQL dump on $host"
            echo "Failed to create PostgreSQL dump" > "$dump_dir/postgres_dump_${host}.error"
            continue
        fi

        # Verify dump was created and has content (BSD stat first, then GNU stat)
        dump_size=$(ssh -n -o ConnectTimeout=10 "$host" \
            "stat -f%z $remote_dump 2>/dev/null || stat -c%s $remote_dump 2>/dev/null || echo 0") || dump_size=0
        [[ $dump_size =~ ^[0-9]+$ ]] || dump_size=0

        if (( 10#$dump_size <= 100 )); then # At least 100 bytes
            log_warn "PostgreSQL dump from $host is too small or empty"
            echo "PostgreSQL dump failed or empty" > "$dump_dir/postgres_dump_${host}.error"
            continue
        fi

        if scp -o ConnectTimeout=30 "$host:$remote_dump" "$dump_dir/"; then
            log_success "PostgreSQL dump created for $host (${dump_size} bytes)"
        else
            log_error "Failed to copy PostgreSQL dump from $host"
            # A truncated copy must not be mistaken for a valid dump later on.
            rm -f -- "$dump_dir/postgres_dump_${host}.sql"
            echo "Failed to copy PostgreSQL dump" > "$dump_dir/postgres_dump_${host}.error"
        fi
    done

    # MySQL/MariaDB dumps if present. Failures are logged and marked the same
    # way as for PostgreSQL instead of being skipped silently.
    for host in "${HOSTS[@]}"; do
        if ! ssh -n -o ConnectTimeout=10 "$host" "docker ps | grep -i -E 'mysql|mariadb'" >/dev/null 2>&1; then
            continue
        fi
        log_info "MySQL/MariaDB container found on $host, creating dump..."

        mysql_container=$(ssh -n -o ConnectTimeout=10 "$host" \
            "docker ps --filter 'ancestor=mysql' --filter 'ancestor=mariadb' --format '{{.ID}}' | head -1" 2>/dev/null) || mysql_container=""
        if [[ -z "$mysql_container" ]]; then
            log_warn "No MySQL/MariaDB container ID found on $host"
            continue
        fi

        # NOTE(review): mysqldump is invoked without credentials; this only
        # works if the container is configured for password-less access - confirm.
        remote_dump="/tmp/mysql_dump_${host}.sql"
        if ! execute_with_retry 2 30 ssh -n -o ConnectTimeout=10 "$host" \
            "timeout 300 docker exec $mysql_container mysqldump --all-databases > $remote_dump"; then
            log_error "Failed to create MySQL dump on $host"
            echo "Failed to create MySQL dump" > "$dump_dir/mysql_dump_${host}.error"
            continue
        fi

        dump_size=$(ssh -n -o ConnectTimeout=10 "$host" \
            "stat -f%z $remote_dump 2>/dev/null || stat -c%s $remote_dump 2>/dev/null || echo 0") || dump_size=0
        [[ $dump_size =~ ^[0-9]+$ ]] || dump_size=0

        if (( 10#$dump_size <= 100 )); then
            log_warn "MySQL dump from $host is too small or empty"
            echo "MySQL dump failed or empty" > "$dump_dir/mysql_dump_${host}.error"
            continue
        fi

        if scp -o ConnectTimeout=30 "$host:$remote_dump" "$dump_dir/"; then
            log_success "MySQL dump created for $host"
        else
            log_error "Failed to copy MySQL dump from $host"
            rm -f -- "$dump_dir/mysql_dump_${host}.sql"
            echo "Failed to copy MySQL dump" > "$dump_dir/mysql_dump_${host}.error"
        fi
    done
}

# Function to backup configurations safely
#
# For every host in HOSTS: archives /etc/docker, /opt and /home/*/.config into
# /tmp/config_backup_<host>.tar.gz on the host, then copies the archive into
# $SNAPSHOT_DIR when it is larger than 1000 bytes. Any failure leaves a
# config_backup_<host>.error marker in $SNAPSHOT_DIR instead.
# Globals: HOSTS (read), SNAPSHOT_DIR (read)
backup_configurations() {
    log_step "Backing up configurations..."

    # NOTE(review): /opt also contains /opt/migration/backups; if the machine
    # running this script is itself listed in HOSTS, earlier snapshots end up
    # inside the archive - confirm whether an exclude is wanted.
    local config_dirs=("/etc/docker" "/opt" "/home/*/.config")
    local i host backup_size remote_archive local_archive

    for i in "${!HOSTS[@]}"; do
        host="${HOSTS[$i]}"
        remote_archive="/tmp/config_backup_${host}.tar.gz"
        local_archive="$SNAPSHOT_DIR/config_backup_${host}.tar.gz"
        log_info "Backing up configurations from $host..."

        # Create configuration backup with error handling.
        # The trailing "|| echo" makes the remote command succeed even when tar
        # complains, so this branch only fails when ssh itself fails.
        if ! execute_with_retry 2 60 ssh -n -o ConnectTimeout=10 "$host" \
            "timeout 600 tar czf $remote_archive ${config_dirs[*]} 2>/dev/null || echo 'Some configs may be missing'"; then
            log_error "Failed to create configuration backup on $host"
            echo "Failed to create configuration backup" > "$SNAPSHOT_DIR/config_backup_${host}.error"
            continue
        fi

        # Check if backup file was created (BSD stat first, then GNU stat)
        backup_size=$(ssh -n -o ConnectTimeout=10 "$host" \
            "stat -f%z $remote_archive 2>/dev/null || stat -c%s $remote_archive 2>/dev/null || echo 0") || backup_size=0
        [[ $backup_size =~ ^[0-9]+$ ]] || backup_size=0

        if (( 10#$backup_size <= 1000 )); then # At least 1KB
            log_warn "Configuration backup from $host is too small"
            echo "Configuration backup failed or too small" > "$SNAPSHOT_DIR/config_backup_${host}.error"
            continue
        fi

        if scp -o ConnectTimeout=60 "$host:$remote_archive" "$SNAPSHOT_DIR/"; then
            log_success "Configuration backup created for $host (${backup_size} bytes)"
        else
            log_error "Failed to copy configuration backup from $host"
            # A partial archive would otherwise be reported as a success by the
            # summary, which only checks that the .tar.gz exists.
            rm -f -- "$local_archive"
            echo "Failed to copy configuration backup" > "$SNAPSHOT_DIR/config_backup_${host}.error"
        fi
    done
}

# Function to create comprehensive summary with validation
#
# Writes $SNAPSHOT_DIR/comprehensive_summary.md: snapshot statistics, one
# table row per host, the critical services found in the container listings,
# dump/backup counts, next steps, a file listing and the log locations.
# Globals (read): SNAPSHOT_DIR, TIMESTAMP, SCRIPT_NAME, HOSTS, HOST_IPS,
#                 ERROR_COUNT, WARNING_COUNT, LOG_FILE, ERROR_LOG
create_comprehensive_summary() {
    log_step "Creating comprehensive summary report..."

    local summary_file="$SNAPSHOT_DIR/comprehensive_summary.md"
    # Loop variables are local so the caller's variables are left alone.
    local i host ip host_dir service
    local container_count db_status config_status
    local postgres_dumps mysql_dumps successful_backups failed_backups

    cat > "$summary_file" << EOF
# Infrastructure Snapshot Summary
**Generated:** $(date)
**Snapshot ID:** $TIMESTAMP
**Script:** $SCRIPT_NAME
**Directory:** $SNAPSHOT_DIR

## Snapshot Statistics
- **Total Hosts:** ${#HOSTS[@]}
- **Total Files:** $(find "$SNAPSHOT_DIR" -type f | wc -l)
- **Total Size:** $(du -sh "$SNAPSHOT_DIR" | cut -f1)
- **Errors During Collection:** $ERROR_COUNT
- **Warnings During Collection:** $WARNING_COUNT

## Host Overview
| Host | IP | Docker Containers | Database | Config Backup |
|------|----|--------------------|----------|---------------|
EOF

    # Generate host table
    for i in "${!HOSTS[@]}"; do
        host="${HOSTS[$i]}"
        ip="${HOST_IPS[$i]}"
        host_dir="$SNAPSHOT_DIR/$host"

        # Count Docker containers: non-empty lines, skipping the "NAMES ..."
        # header of the table format and the "Failed to collect ..." marker.
        # awk always prints exactly one number, unlike `grep -c ... || echo 0`,
        # which emits "0" twice when nothing matches.
        container_count=0
        if [[ -f "$host_dir/docker_containers.txt" ]]; then
            container_count=$(awk '
                NR == 1 && /^NAMES/ { next }
                /^Failed to collect/ { next }
                NF { n++ }
                END { print n + 0 }
            ' "$host_dir/docker_containers.txt" 2>/dev/null) || container_count=0
            [[ -n "$container_count" ]] || container_count=0
        fi

        # Check database status
        db_status="None"
        if [[ -f "$SNAPSHOT_DIR/database_dumps/postgres_dump_${host}.sql" ]]; then
            db_status="PostgreSQL"
        elif [[ -f "$SNAPSHOT_DIR/database_dumps/mysql_dump_${host}.sql" ]]; then
            db_status="MySQL"
        elif [[ -f "$SNAPSHOT_DIR/database_dumps/postgres_dump_${host}.info" ]]; then
            db_status="No DB"
        fi

        # Check config backup status
        config_status="❌ Failed"
        if [[ -f "$SNAPSHOT_DIR/config_backup_${host}.tar.gz" ]]; then
            config_status="✅ Success"
        elif [[ -f "$SNAPSHOT_DIR/config_backup_${host}.error" ]]; then
            config_status="⚠️ Error"
        fi

        echo "| **$host** | $ip | $container_count | $db_status | $config_status |" >> "$summary_file"
    done

    # Add critical services section
    cat >> "$summary_file" << EOF

## Critical Services Detected
EOF

    # Search for critical services across all hosts
    local critical_services=("immich" "jellyfin" "homeassistant" "appflowy" "paperless" "portainer" "traefik" "nginx" "apache")
    local found_hosts

    for service in "${critical_services[@]}"; do
        found_hosts=()
        for host in "${HOSTS[@]}"; do
            if [[ -f "$SNAPSHOT_DIR/$host/docker_containers.txt" ]] \
                && grep -qi "$service" "$SNAPSHOT_DIR/$host/docker_containers.txt" 2>/dev/null; then
                found_hosts+=("$host")
            fi
        done

        if [[ ${#found_hosts[@]} -gt 0 ]]; then
            echo "- **$service**: ${found_hosts[*]}" >> "$summary_file"
        fi
    done

    # Add validation results
    cat >> "$summary_file" << EOF

## Data Validation Results
EOF

    # Validate database dumps
    postgres_dumps=$(find "$SNAPSHOT_DIR/database_dumps" -name "postgres_dump_*.sql" 2>/dev/null | wc -l)
    mysql_dumps=$(find "$SNAPSHOT_DIR/database_dumps" -name "mysql_dump_*.sql" 2>/dev/null | wc -l)

    echo "- **PostgreSQL Dumps:** $postgres_dumps" >> "$summary_file"
    echo "- **MySQL Dumps:** $mysql_dumps" >> "$summary_file"

    # Validate config backups
    successful_backups=$(find "$SNAPSHOT_DIR" -name "config_backup_*.tar.gz" 2>/dev/null | wc -l)
    failed_backups=$(find "$SNAPSHOT_DIR" -name "config_backup_*.error" 2>/dev/null | wc -l)

    echo "- **Successful Config Backups:** $successful_backups" >> "$summary_file"
    echo "- **Failed Config Backups:** $failed_backups" >> "$summary_file"

    # Add next steps
    cat >> "$summary_file" << EOF

## Next Steps
1. **Verify Data Integrity:** Run validation scripts on dumps and backups
2. **Test Restoration:** Test restore procedures in staging environment
3. **Security Review:** Ensure no sensitive data in backups
4. **Storage:** Move snapshot to secure long-term storage

## Files and Directories
\`\`\`
$(tree "$SNAPSHOT_DIR" 2>/dev/null || find "$SNAPSHOT_DIR" -type f | head -50)
\`\`\`

## Logs and Errors
- **Log File:** $LOG_FILE
- **Error Log:** $ERROR_LOG
- **Error Count:** $ERROR_COUNT
- **Warning Count:** $WARNING_COUNT
EOF

    log_success "Comprehensive summary created: $summary_file"
}

# Main execution function
#
# Orchestrates the snapshot: registers cleanup/rollback, checks prerequisites
# and disk space, creates SNAPSHOT_DIR, validates host connectivity, collects
# Docker, database, configuration, system and network data, writes the summary
# and points /opt/migration/backups/latest at the new snapshot.
# Returns: 1 when the snapshot directory cannot be created, when no host is
#          reachable, or when 10 or fewer files were collected without any
#          counted error; 0 otherwise.
main() {
    local i host host_dir checkpoint
    local unreachable=0
    local latest_link total_files total_size

    log_step "Starting enhanced infrastructure documentation..."

    # Register cleanup and rollback functions
    register_cleanup cleanup_snapshot
    register_rollback rollback_snapshot

    # Validate prerequisites
    validate_prerequisites ssh scp ping docker tar gzip

    # Check available disk space
    check_disk_space "$REQUIRED_SPACE_GB" "/opt/migration/backups"

    # Create snapshot directory; nothing below can work without it.
    log_step "Creating snapshot directory: $SNAPSHOT_DIR"
    if ! mkdir -p "$SNAPSHOT_DIR"; then
        log_error "Cannot create snapshot directory: $SNAPSHOT_DIR"
        return 1
    fi
    chmod 755 "$SNAPSHOT_DIR"

    # Create checkpoint. The output is captured (currently unused) exactly as
    # before so that whatever create_checkpoint prints stays off stdout.
    checkpoint=$(create_checkpoint "snapshot_start")

    # Validate all host connectivity first
    log_step "Validating host connectivity..."
    for i in "${!HOSTS[@]}"; do
        if ! validate_host_access "${HOSTS[$i]}" "${HOST_IPS[$i]}"; then
            unreachable=$((unreachable + 1))
        fi
    done

    # With no reachable host every later step would only wait for timeouts.
    if (( ${#HOSTS[@]} > 0 && unreachable == ${#HOSTS[@]} )); then
        log_error "None of the ${#HOSTS[@]} hosts is reachable - aborting snapshot"
        return 1
    elif (( unreachable > 0 )); then
        log_warn "$unreachable of ${#HOSTS[@]} hosts are unreachable; their data will be missing from the snapshot"
    fi

    # Collect Docker information from all hosts
    log_step "Collecting Docker information from all hosts..."
    for i in "${!HOSTS[@]}"; do
        host="${HOSTS[$i]}"
        host_dir="$SNAPSHOT_DIR/$host"

        collect_docker_info "$host" "$host_dir"

        # Create individual checkpoint for each host
        create_checkpoint "docker_collected_$host"
    done

    # Create database dumps
    create_database_dumps
    create_checkpoint "database_dumps_complete"

    # Backup configurations
    backup_configurations
    create_checkpoint "config_backups_complete"

    # Collect additional system information
    log_step "Collecting system information..."
    for i in "${!HOSTS[@]}"; do
        host="${HOSTS[$i]}"
        host_dir="$SNAPSHOT_DIR/$host"

        log_info "Collecting system info from $host..."

        # System information
        if ssh -n -o ConnectTimeout=10 "$host" \
            "uname -a && echo '---' && df -h && echo '---' && free -h && echo '---' && uptime && echo '---' && ps aux --sort=-%cpu | head -20" \
            > "$host_dir/system_info.txt" 2>/dev/null; then
            log_success "System info collected from $host"
        else
            log_warn "Failed to collect system info from $host"
        fi

        # Network information
        if ssh -n -o ConnectTimeout=10 "$host" \
            "ip addr show && echo '---' && ip route show && echo '---' && ss -tulpn" \
            > "$host_dir/network_info.txt" 2>/dev/null; then
            log_success "Network info collected from $host"
        else
            log_warn "Failed to collect network info from $host"
        fi
    done

    # Create comprehensive summary
    create_comprehensive_summary

    # Create symbolic link to latest snapshot
    latest_link="/opt/migration/backups/latest"
    ln -sfn "$SNAPSHOT_DIR" "$latest_link"
    log_info "Latest snapshot linked to: $latest_link"

    # Final validation
    log_step "Performing final validation..."
    total_files=$(find "$SNAPSHOT_DIR" -type f | wc -l)
    total_size=$(du -sh "$SNAPSHOT_DIR" | cut -f1)

    if [[ $total_files -gt 10 ]] && [[ $ERROR_COUNT -eq 0 ]]; then
        log_success "✅ Infrastructure documentation completed successfully!"
        log_success "📊 Snapshot statistics: $total_files files, $total_size total"
        log_success "📁 Snapshot location: $SNAPSHOT_DIR"
    elif [[ $ERROR_COUNT -gt 0 ]]; then
        log_warn "⚠️ Infrastructure documentation completed with $ERROR_COUNT errors"
        log_info "📊 Partial snapshot: $total_files files, $total_size total"
        log_info "📁 Location: $SNAPSHOT_DIR"
    else
        log_error "❌ Infrastructure documentation may have failed - too few files collected"
        return 1
    fi

    # Display summary
    if [[ -f "$SNAPSHOT_DIR/comprehensive_summary.md" ]]; then
        echo ""
        echo "=== SNAPSHOT SUMMARY ==="
        head -30 "$SNAPSHOT_DIR/comprehensive_summary.md"
        echo ""
        echo "Full summary available at: $SNAPSHOT_DIR/comprehensive_summary.md"
    fi
}

# Execute main function, forwarding the script's arguments; the script's exit
# status is main's return status.
main "$@"