#!/bin/bash
#
# Enhanced Error Handling Library
#
# Provides logging helpers, an ERR/EXIT trap pair with rollback and cleanup
# hooks, checkpoints, retry/wait helpers and basic resource checks.
#
# Usage: source this file near the top of a migration script.
#
# Environment:
#   MIGRATION_LOG_DIR  Optional override for the log directory
#                      (default: /opt/migration/logs).
#   DEBUG              Set to "true" to emit DEBUG-level messages.
#
# NOTE(review): stdout/stderr are redirected through `tee -a` into the log
# files below AND print_message appends to the same files directly, so a log
# line written while stdout/stderr are not otherwise redirected appears to be
# recorded twice. Left unchanged because the direct append is what keeps
# messages in the log when a caller captures or redirects stdout.
#
# NOTE(review): the ERR trap is installed without `set -E` (errtrace), so it
# is not inherited by shell functions or command substitutions; failures
# inside functions reach cleanup_on_exit via `set -e` but not error_handler.
# Confirm whether that is intended before enabling errtrace.

# Global error handling configuration
set -euo pipefail
# IFS has no space: "${array[*]}" would join with a newline, so messages that
# need a space-joined list use _join_words instead.
IFS=$'\n\t'

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly PURPLE='\033[0;35m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # No Color

# Logging configuration. If the requested directory cannot be created or is
# not writable, fall back to a private temporary directory instead of
# aborting the sourcing script with a bare mkdir error.
_log_dir="${MIGRATION_LOG_DIR:-/opt/migration/logs}"
if ! mkdir -p "$_log_dir" 2>/dev/null || [[ ! -w "$_log_dir" ]]; then
  _fallback_log_dir=$(mktemp -d "${TMPDIR:-/tmp}/migration-logs.XXXXXX")
  printf 'WARNING: cannot use log directory %s; logging to %s instead\n' \
    "$_log_dir" "$_fallback_log_dir" >&2
  _log_dir=$_fallback_log_dir
fi
# One timestamp for both files so their names always match.
_run_stamp=$(date +%Y%m%d_%H%M%S)
readonly LOG_DIR="$_log_dir"
readonly LOG_FILE="$LOG_DIR/migration_${_run_stamp}.log"
readonly ERROR_LOG="$LOG_DIR/errors_${_run_stamp}.log"
unset _log_dir _fallback_log_dir _run_stamp

# Ensure log directory permissions
chmod 755 "$LOG_DIR"

# Initialize logging: keep the original stdout/stderr on fds 3/4, then route
# everything written to fds 1/2 through tee into the log files.
exec 3>&1 4>&2
exec 1> >(tee -a "$LOG_FILE")
exec 2> >(tee -a "$ERROR_LOG" >&2)

# Global variables
declare -g SCRIPT_NAME="${0##*/}"
declare -g SCRIPT_PID=$$
declare -g START_TIME
START_TIME=$(date +%s)
declare -g CLEANUP_FUNCTIONS=()
declare -g ROLLBACK_FUNCTIONS=()
declare -g ERROR_COUNT=0
declare -g WARNING_COUNT=0
declare -g STEP_COUNT=0
declare -g CURRENT_STEP=""
# Set once error_handler has run the cleanup functions so that the EXIT trap
# does not run them a second time.
declare -g CLEANUP_ALREADY_RUN=false

#######################################
# Join all arguments into one string separated by single spaces,
# independent of the global IFS.
# Arguments: words to join
# Outputs:   the joined string on stdout (no trailing newline)
#######################################
_join_words() {
  local IFS=' '
  printf '%s' "$*"
}

#######################################
# Numeric "greater than" test that also works for decimal values.
# Arguments: $1 - value (empty is treated as 0), $2 - threshold
# Returns:   0 if value > threshold, non-zero otherwise
#######################################
_is_above() {
  awk -v v="${1:-0}" -v t="$2" 'BEGIN { exit !((v + 0) > (t + 0)) }'
}

#######################################
# Print a formatted, timestamped message and append it to the log files.
# Globals:   LOG_FILE, ERROR_LOG (read); WARNING_COUNT, ERROR_COUNT,
#            STEP_COUNT, CURRENT_STEP (written); DEBUG (read)
# Arguments: $1 - level: INFO, WARN, ERROR, DEBUG, STEP, SUCCESS or CRITICAL
#            $2 - message text
# Outputs:   INFO/DEBUG/STEP/SUCCESS to stdout; WARN/ERROR/CRITICAL to stderr.
#            Unknown levels produce no output.
#######################################
print_message() {
  local level=${1:-}
  local message=${2:-}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  case "$level" in
    "INFO")
      echo -e "${GREEN}[INFO]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      ;;
    "WARN")
      echo -e "${YELLOW}[WARN]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE" >&2
      # Plain assignment: ((var++)) returns status 1 when var is 0, which
      # would abort the script under `set -e` on the very first warning.
      WARNING_COUNT=$((WARNING_COUNT + 1))
      ;;
    "ERROR")
      echo -e "${RED}[ERROR]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
      ERROR_COUNT=$((ERROR_COUNT + 1))
      ;;
    "DEBUG")
      if [[ "${DEBUG:-false}" == "true" ]]; then
        echo -e "${PURPLE}[DEBUG]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      fi
      ;;
    "STEP")
      # Increment before the pipeline: each pipeline stage runs in a
      # subshell, so an increment inside the echo would be lost.
      STEP_COUNT=$((STEP_COUNT + 1))
      CURRENT_STEP="$message"
      echo -e "${BLUE}[STEP ${STEP_COUNT}]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      ;;
    "SUCCESS")
      echo -e "${GREEN}[SUCCESS]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      ;;
    "CRITICAL")
      echo -e "${RED}[CRITICAL]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
      ERROR_COUNT=$((ERROR_COUNT + 1))
      ;;
  esac
}

# Convenience functions (a missing argument logs an empty message instead of
# tripping `set -u`).
log_info() { print_message "INFO" "${1:-}"; }
log_warn() { print_message "WARN" "${1:-}"; }
log_error() { print_message "ERROR" "${1:-}"; }
log_debug() { print_message "DEBUG" "${1:-}"; }
log_step() { print_message "STEP" "${1:-}"; }
log_success() { print_message "SUCCESS" "${1:-}"; }
log_critical() { print_message "CRITICAL" "${1:-}"; }

#######################################
# ERR trap handler: logs context, captures system state, runs rollback and
# cleanup functions, writes an error report and exits with the failing
# command's status.
# Arguments: $1 - line number of the failing command
#            $2 - caller line number (accepted for compatibility; not used)
#            $3 - text of the failing command
#######################################
error_handler() {
  # Must be the first statement so $? is still the failing command's status.
  local exit_code=$?
  local line_number=${1:-unknown}
  local last_command=${3:-unknown}
  local funcstack=("${FUNCNAME[@]:1}")

  # From here on every step is best-effort: a failure while reporting must
  # not abort the handler before rollback/cleanup, nor re-enter it.
  set +e
  trap - ERR

  log_critical "Script failed in $SCRIPT_NAME"
  log_critical "Exit code: $exit_code"
  log_critical "Line number: $line_number"
  log_critical "Command: $last_command"
  log_critical "Current step: $CURRENT_STEP"
  log_critical "Function stack: $(_join_words "${funcstack[@]}")"

  # Capture system state for debugging
  capture_system_state_on_error

  # Execute rollback functions in reverse order
  execute_rollback_functions

  # Show recovery options
  show_recovery_options

  # Execute cleanup functions (and tell the EXIT trap not to repeat them)
  execute_cleanup_functions
  CLEANUP_ALREADY_RUN=true

  # Generate error report
  generate_error_report "$exit_code"

  exit "$exit_code"
}

#######################################
# Capture a best-effort snapshot of system state into a timestamped
# directory under LOG_DIR. Every individual capture may fail silently.
#######################################
capture_system_state_on_error() {
  local error_state_dir
  error_state_dir="$LOG_DIR/error_state_$(date +%Y%m%d_%H%M%S)"
  mkdir -p "$error_state_dir"

  log_info "Capturing system state for debugging..."

  # Capture process information
  ps aux > "$error_state_dir/processes.txt" 2>/dev/null || true

  # Capture network state
  ss -tulpn > "$error_state_dir/network.txt" 2>/dev/null || true

  # Capture Docker state if available
  if command -v docker >/dev/null 2>&1; then
    docker ps -a > "$error_state_dir/docker_containers.txt" 2>/dev/null || true
    docker images > "$error_state_dir/docker_images.txt" 2>/dev/null || true
    docker system df > "$error_state_dir/docker_disk.txt" 2>/dev/null || true
    docker system events --since 1h --until now > "$error_state_dir/docker_events.txt" 2>/dev/null || true
  fi

  # Capture disk space
  df -h > "$error_state_dir/disk_space.txt" 2>/dev/null || true

  # Capture memory usage
  free -h > "$error_state_dir/memory.txt" 2>/dev/null || true

  # Capture recent logs
  tail -n 100 /var/log/syslog > "$error_state_dir/system_logs.txt" 2>/dev/null || true

  log_info "System state captured in: $error_state_dir"
}

#######################################
# Run every registered rollback function, last registered first.
# A failing or missing function is logged and does not stop the others.
# Globals: ROLLBACK_FUNCTIONS (read)
#######################################
execute_rollback_functions() {
  local i rollback_func

  if [[ ${#ROLLBACK_FUNCTIONS[@]} -gt 0 ]]; then
    log_info "Executing rollback functions..."
    for ((i = ${#ROLLBACK_FUNCTIONS[@]} - 1; i >= 0; i--)); do
      rollback_func="${ROLLBACK_FUNCTIONS[i]}"
      log_info "Executing rollback: $rollback_func"
      if declare -F "$rollback_func" >/dev/null; then
        "$rollback_func" || log_error "Rollback function $rollback_func failed"
      else
        log_error "Rollback function $rollback_func not found"
      fi
    done
  fi
}

#######################################
# Print a boxed list of manual recovery hints to stdout.
#######################################
show_recovery_options() {
  echo ""
  echo -e "${CYAN}╔══════════════════════════════════════════════════════════════╗${NC}"
  echo -e "${CYAN}║                       RECOVERY OPTIONS                       ║${NC}"
  echo -e "${CYAN}╠══════════════════════════════════════════════════════════════╣${NC}"
  echo -e "${CYAN}║${NC} 1. Check logs: tail -f $LOG_FILE${CYAN}║${NC}"
  echo -e "${CYAN}║${NC} 2. Review errors: tail -f $ERROR_LOG${CYAN}║${NC}"
  echo -e "${CYAN}║${NC} 3. System state: ls -la $LOG_DIR/error_state_*${CYAN}║${NC}"
  echo -e "${CYAN}║${NC} 4. Resume from checkpoint (if available)${CYAN}║${NC}"
  echo -e "${CYAN}║${NC} 5. Run cleanup manually: execute_cleanup_functions${CYAN}║${NC}"
  echo -e "${CYAN}╚══════════════════════════════════════════════════════════════╝${NC}"
  echo ""
}

#######################################
# Run every registered cleanup function in registration order.
# A failing or missing function is logged and does not stop the others.
# Globals: CLEANUP_FUNCTIONS (read)
#######################################
execute_cleanup_functions() {
  local cleanup_func

  if [[ ${#CLEANUP_FUNCTIONS[@]} -gt 0 ]]; then
    log_info "Executing cleanup functions..."
    for cleanup_func in "${CLEANUP_FUNCTIONS[@]}"; do
      log_info "Executing cleanup: $cleanup_func"
      if declare -F "$cleanup_func" >/dev/null; then
        "$cleanup_func" || log_error "Cleanup function $cleanup_func failed"
      else
        log_error "Cleanup function $cleanup_func not found"
      fi
    done
  fi
}

#######################################
# Write a Markdown error report into LOG_DIR.
# Arguments: $1 - exit code to record (optional; defaults to the status of
#                 the last command run before this function was called)
#######################################
generate_error_report() {
  # Must be first: inside the heredoc `$?` would always be 0.
  local exit_code=${1:-$?}
  local report_file
  report_file="$LOG_DIR/error_report_$(date +%Y%m%d_%H%M%S).md"
  local duration=$(($(date +%s) - START_TIME))

  cat > "$report_file" << EOF
# Migration Script Error Report

**Script:** $SCRIPT_NAME
**PID:** $SCRIPT_PID
**Date:** $(date)
**Duration:** ${duration}s
**Exit Code:** ${exit_code}

## Summary
- **Errors:** $ERROR_COUNT
- **Warnings:** $WARNING_COUNT
- **Steps Completed:** $STEP_COUNT
- **Failed Step:** $CURRENT_STEP

## Error Details
\`\`\`
$(tail -n 20 "$ERROR_LOG" 2>/dev/null)
\`\`\`

## System State
- **Log File:** $LOG_FILE
- **Error Log:** $ERROR_LOG
- **System State:** $LOG_DIR/error_state_*

## Recovery Actions
1. Review error logs for specific failure cause
2. Check system state capture for debugging
3. Run cleanup functions if needed
4. Consider manual rollback if automatic rollback failed

## Next Steps
- [ ] Identify root cause
- [ ] Apply fix
- [ ] Test fix in staging environment
- [ ] Re-run migration with fix applied
EOF

  log_info "Error report generated: $report_file"
}

# Register cleanup function ($1 - name of a shell function)
register_cleanup() {
  local cleanup_func=$1
  CLEANUP_FUNCTIONS+=("$cleanup_func")
  log_debug "Registered cleanup function: $cleanup_func"
}

# Register rollback function ($1 - name of a shell function)
register_rollback() {
  local rollback_func=$1
  ROLLBACK_FUNCTIONS+=("$rollback_func")
  log_debug "Registered rollback function: $rollback_func"
}

#######################################
# Verify that every named command is available on PATH.
# Arguments: command names
# Returns:   0 when all are found; otherwise EXITS the script with status 1.
#######################################
validate_prerequisites() {
  local required_commands=("$@")
  local missing_commands=()
  local cmd

  log_step "Validating prerequisites..."

  for cmd in "${required_commands[@]}"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      missing_commands+=("$cmd")
      log_error "Required command not found: $cmd"
    else
      log_debug "Found required command: $cmd"
    fi
  done

  if [[ ${#missing_commands[@]} -gt 0 ]]; then
    log_critical "Missing required commands: $(_join_words "${missing_commands[@]}")"
    log_info "Install missing commands and retry"
    exit 1
  fi

  log_success "All prerequisites validated"
}

#######################################
# Check that a mount point has at least the required free space.
# Arguments: $1 - required space in GB (default 1)
#            $2 - mount point (default /)
# Returns:   0 if enough space, 1 if not or if it cannot be determined
#######################################
check_disk_space() {
  local required_space_gb=${1:-1}
  local mount_point=${2:-"/"}
  local available_gb

  log_step "Checking disk space for $mount_point..."

  # -P keeps each filesystem on one line so the awk column index is stable.
  # `|| true` keeps a df failure from aborting under `set -e`; the numeric
  # check below reports it instead.
  available_gb=$(df -P -BG "$mount_point" 2>/dev/null \
    | awk 'NR==2 {gsub(/G/, "", $4); print $4}') || true

  if [[ ! "$available_gb" =~ ^[0-9]+$ ]]; then
    log_critical "Unable to determine available disk space for $mount_point"
    return 1
  fi

  if [[ $available_gb -lt $required_space_gb ]]; then
    log_critical "Insufficient disk space. Required: ${required_space_gb}GB, Available: ${available_gb}GB"
    return 1
  else
    log_success "Sufficient disk space available: ${available_gb}GB"
    return 0
  fi
}

#######################################
# Ping each host and, unless it is localhost/127.0.0.1, verify
# non-interactive SSH access.
# Arguments: host names or addresses
# Returns:   0 if all hosts pass, 1 on the first failure
#######################################
validate_network_connectivity() {
  local hosts=("$@")
  local host

  log_step "Validating network connectivity..."

  for host in "${hosts[@]}"; do
    log_info "Testing connectivity to $host..."

    if ping -c 1 -W 5 "$host" >/dev/null 2>&1; then
      log_success "Successfully connected to $host"
    else
      log_error "Cannot reach $host"
      return 1
    fi

    # Test SSH connectivity if not localhost
    if [[ "$host" != "localhost" ]] && [[ "$host" != "127.0.0.1" ]]; then
      if ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" "echo 'SSH OK'" >/dev/null 2>&1; then
        log_success "SSH connectivity to $host verified"
      else
        log_error "SSH connectivity to $host failed"
        return 1
      fi
    fi
  done

  log_success "Network connectivity validated"
}

#######################################
# Write the current progress counters to a checkpoint file.
# Arguments: $1 - checkpoint name (used in the file name)
# Outputs:   the checkpoint file path on stdout, and nothing else, so that
#            `file=$(create_checkpoint name)` works; the log line is sent
#            to stderr for that reason.
#######################################
create_checkpoint() {
  local checkpoint_name=$1
  local checkpoint_dir="$LOG_DIR/checkpoints"
  local checkpoint_file
  checkpoint_file="$checkpoint_dir/${checkpoint_name}_$(date +%Y%m%d_%H%M%S).checkpoint"

  mkdir -p "$checkpoint_dir"

  # %q shell-quotes each value: the file is later read with `source`, and
  # unquoted values containing spaces (the date, step names) would otherwise
  # be executed as commands.
  {
    printf 'CHECKPOINT_NAME=%q\n' "$checkpoint_name"
    printf 'CHECKPOINT_TIME=%q\n' "$(date)"
    printf 'SCRIPT_NAME=%q\n' "$SCRIPT_NAME"
    printf 'CURRENT_STEP=%q\n' "$CURRENT_STEP"
    printf 'STEP_COUNT=%q\n' "$STEP_COUNT"
    printf 'ERROR_COUNT=%q\n' "$ERROR_COUNT"
    printf 'WARNING_COUNT=%q\n' "$WARNING_COUNT"
  } > "$checkpoint_file"

  log_info "Checkpoint created: $checkpoint_file" >&2
  echo "$checkpoint_file"
}

#######################################
# Load variables from a checkpoint file into the current shell.
# The file is executed with `source`, so only restore trusted files.
# Arguments: $1 - checkpoint file path
# Returns:   0 on success, 1 if the file does not exist
#######################################
restore_from_checkpoint() {
  local checkpoint_file=$1

  if [[ -f "$checkpoint_file" ]]; then
    # shellcheck disable=SC1090 — path is only known at runtime
    source "$checkpoint_file"
    log_info "Restored from checkpoint: ${CHECKPOINT_NAME:-unknown} at ${CHECKPOINT_TIME:-unknown}"
    return 0
  else
    log_error "Checkpoint file not found: $checkpoint_file"
    return 1
  fi
}

#######################################
# Poll a health-check command until it succeeds or a timeout elapses.
# Arguments: $1 - service name (for messages)
#            $2 - health-check command string, run with `eval`
#            $3 - maximum wait in seconds (default 300)
#            $4 - polling interval in seconds (default 10)
# Returns:   0 once the check succeeds, 1 on timeout
#######################################
wait_for_service() {
  local service_name=$1
  local health_check_command=$2
  local max_wait=${3:-300} # 5 minutes default
  local interval=${4:-10}  # 10 seconds default
  local elapsed=0

  log_step "Waiting for service $service_name to be ready..."

  while [[ $elapsed -lt $max_wait ]]; do
    # NOTE(review): eval executes the string as shell code; callers must
    # never pass untrusted input here.
    if eval "$health_check_command" >/dev/null 2>&1; then
      log_success "Service $service_name is ready (${elapsed}s)"
      return 0
    fi

    log_info "Service $service_name not ready yet, waiting ${interval}s... (${elapsed}/${max_wait}s)"
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done

  log_error "Service $service_name failed to become ready within ${max_wait}s"
  return 1
}

#######################################
# Run a command, retrying on failure.
# Arguments: $1 - maximum number of attempts
#            $2 - delay between attempts in seconds
#            $3... - command and its arguments
# Returns:   0 as soon as the command succeeds, 1 after the last failure
#######################################
execute_with_retry() {
  local max_attempts=$1
  local delay=$2
  shift 2
  local command=("$@")
  local attempt=1
  local exit_code
  local command_text
  command_text=$(_join_words "${command[@]}")

  while [[ $attempt -le $max_attempts ]]; do
    log_info "Executing (attempt $attempt/$max_attempts): $command_text"

    if "${command[@]}"; then
      log_success "Command succeeded on attempt $attempt"
      return 0
    else
      exit_code=$?
      log_warn "Command failed on attempt $attempt with exit code $exit_code"

      if [[ $attempt -lt $max_attempts ]]; then
        log_info "Retrying in ${delay}s..."
        sleep "$delay"
      fi
    fi

    attempt=$((attempt + 1))
  done

  log_error "Command failed after $max_attempts attempts"
  return 1
}

#######################################
# Periodically sample CPU, memory and root-disk usage and warn above 90%.
# Arguments: $1 - total duration in seconds (default 60)
#            $2 - sampling interval in seconds (default 5)
#######################################
monitor_resources() {
  local duration=${1:-60} # Monitor for 60 seconds by default
  local interval=${2:-5}  # Check every 5 seconds
  local end_time cpu_usage mem_usage disk_usage

  log_info "Monitoring system resources for ${duration}s..."

  end_time=$(($(date +%s) + duration))

  while [[ $(date +%s) -lt $end_time ]]; do
    # `|| true`: a sampling pipeline that fails (e.g. grep finds nothing)
    # must not abort the caller under `set -e -o pipefail`.
    cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//') || true
    mem_usage=$(free | grep Mem | awk '{printf "%.1f", $3/$2 * 100.0}') || true
    disk_usage=$(df / | tail -1 | awk '{print $5}' | sed 's/%//') || true

    log_debug "Resource usage - CPU: ${cpu_usage}%, Memory: ${mem_usage}%, Disk: ${disk_usage}%"

    # Alert on high resource usage
    if _is_above "$cpu_usage" 90; then
      log_warn "High CPU usage detected: ${cpu_usage}%"
    fi

    if _is_above "$mem_usage" 90; then
      log_warn "High memory usage detected: ${mem_usage}%"
    fi

    if _is_above "${disk_usage%.*}" 90; then
      log_warn "High disk usage detected: ${disk_usage}%"
    fi

    sleep "$interval"
  done
}

#######################################
# EXIT trap handler: logs a summary, runs cleanup functions (unless
# error_handler already did), restores stdout/stderr and exits with the
# original status.
#######################################
cleanup_on_exit() {
  # Must be first so $? is still the script's exit status.
  local exit_code=$?
  local duration

  # Best-effort from here on: a failing summary step must not change the
  # exit status or skip restoring the file descriptors.
  set +e
  duration=$(($(date +%s) - START_TIME))

  log_info "Script execution completed"
  log_info "Duration: ${duration}s"
  log_info "Errors: $ERROR_COUNT, Warnings: $WARNING_COUNT"

  if [[ "$CLEANUP_ALREADY_RUN" != "true" ]]; then
    execute_cleanup_functions
    CLEANUP_ALREADY_RUN=true
  fi

  # Restore stdout/stderr
  exec 1>&3 2>&4
  exec 3>&- 4>&-

  exit "$exit_code"
}

# Trap signals and errors
trap 'error_handler "${LINENO}" "${BASH_LINENO[0]:-0}" "$BASH_COMMAND"' ERR
trap 'cleanup_on_exit' EXIT
trap 'log_warn "Received SIGINT, initiating graceful shutdown..."; exit 130' INT
trap 'log_warn "Received SIGTERM, initiating graceful shutdown..."; exit 143' TERM

# Initialize logging
log_info "Started script: $SCRIPT_NAME (PID: $SCRIPT_PID)"
log_info "Log file: $LOG_FILE"
log_info "Error log: $ERROR_LOG"

# Export functions for use in other scripts. print_message and the private
# helpers are exported too because the exported functions call them.
export -f print_message _join_words _is_above
export -f log_info log_warn log_error log_debug log_step log_success log_critical
export -f register_cleanup register_rollback validate_prerequisites
export -f check_disk_space validate_network_connectivity
export -f create_checkpoint restore_from_checkpoint
export -f wait_for_service execute_with_retry monitor_resources

# Export the variables those functions read, so they work in child shells.
export LOG_DIR LOG_FILE ERROR_LOG
export RED GREEN YELLOW BLUE PURPLE CYAN NC
export SCRIPT_NAME CURRENT_STEP STEP_COUNT ERROR_COUNT WARNING_COUNT