HomeAudit/migration_scripts/scripts/lib/error_handling.sh

#!/bin/bash
# Enhanced Error Handling Library
# Provides robust error handling, logging, and recovery mechanisms

# Strict mode for every script that sources this library: stop on unhandled
# command failures (errexit), on expansion of unset variables (nounset), and
# when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only, never on spaces.
IFS=$'\n\t'

# ANSI colour escape sequences for console output. They are stored as literal
# backslash text and become control codes only when printed with `echo -e`.
# NC ("No Color") is the reset sequence.
readonly \
    RED='\033[0;31m' \
    GREEN='\033[0;32m' \
    YELLOW='\033[1;33m' \
    BLUE='\033[0;34m' \
    PURPLE='\033[0;35m' \
    CYAN='\033[0;36m' \
    NC='\033[0m'

# Logging configuration.
# Every log artefact of a run lives under LOG_DIR. Each of the two file names
# embeds the time of its own `date` call.
readonly LOG_DIR="/opt/migration/logs"
readonly LOG_FILE="$LOG_DIR/migration_$(date +%Y%m%d_%H%M%S).log"
readonly ERROR_LOG="$LOG_DIR/errors_$(date +%Y%m%d_%H%M%S).log"

# Ensure the log directory exists and carries mode 755.
mkdir -p "$LOG_DIR"
chmod 755 "$LOG_DIR"

# Initialize logging.
# fds 3 and 4 keep copies of the current stdout/stderr; cleanup_on_exit
# restores fds 1 and 2 from them and then closes them.
exec 3>&1 4>&2
# From here on, everything written to stdout is also appended to LOG_FILE.
exec 1> >(tee -a "$LOG_FILE")
# Everything written to stderr is also appended to ERROR_LOG; the trailing
# >&2 makes tee pass its copy on to stderr instead of stdout.
exec 2> >(tee -a "$ERROR_LOG" >&2)
# NOTE(review): print_message pipes each line through its own `tee -a` onto
# these same files and then onto these redirected descriptors, so its lines
# look like they are appended twice - confirm whether that is intended.

# Run-wide state shared by every function in this library:
#   SCRIPT_NAME / SCRIPT_PID  - basename of $0 and the PID of this shell
#   START_TIME                - epoch seconds when the library was loaded
#   CLEANUP_FUNCTIONS         - function names added by register_cleanup
#   ROLLBACK_FUNCTIONS        - function names added by register_rollback
#   ERROR_COUNT / WARNING_COUNT / STEP_COUNT - counters kept by print_message
#   CURRENT_STEP              - text of the most recent STEP message
declare -g \
    SCRIPT_NAME="${0##*/}" \
    SCRIPT_PID=$$ \
    START_TIME="$(date +%s)" \
    CLEANUP_FUNCTIONS=() \
    ROLLBACK_FUNCTIONS=() \
    ERROR_COUNT=0 \
    WARNING_COUNT=0 \
    STEP_COUNT=0 \
    CURRENT_STEP=""
# Print one timestamped, colour-tagged log line and append it to a log file.
#
# Globals:
#   LOG_FILE, ERROR_LOG              (read)    files the line is appended to
#   RED GREEN YELLOW BLUE PURPLE NC  (read)    colour escape sequences
#   DEBUG                            (read)    DEBUG lines appear only when "true"
#   WARNING_COUNT, ERROR_COUNT       (written) bumped by WARN and ERROR/CRITICAL
#   STEP_COUNT, CURRENT_STEP         (written) bumped / set by STEP
# Arguments:
#   $1 - level: INFO, WARN, ERROR, DEBUG, STEP, SUCCESS or CRITICAL
#        (any other value prints nothing)
#   $2 - message text
# Outputs:
#   INFO / DEBUG / STEP / SUCCESS -> stdout and LOG_FILE
#   WARN                          -> stderr and LOG_FILE
#   ERROR / CRITICAL              -> stderr and ERROR_LOG
print_message() {
    local level=$1
    local message=$2
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    case $level in
        "INFO")
            echo -e "${GREEN}[INFO]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
            ;;
        "WARN")
            echo -e "${YELLOW}[WARN]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE" >&2
            # Plain assignment instead of ((VAR++)): the arithmetic command
            # returns status 1 when the old value is 0, which under errexit
            # would abort the script on the very first warning.
            WARNING_COUNT=$(( ${WARNING_COUNT:-0} + 1 ))
            ;;
        "ERROR")
            echo -e "${RED}[ERROR]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
            ERROR_COUNT=$(( ${ERROR_COUNT:-0} + 1 ))
            ;;
        "DEBUG")
            if [[ "${DEBUG:-false}" == "true" ]]; then
                echo -e "${PURPLE}[DEBUG]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
            fi
            ;;
        "STEP")
            # Increment before the pipeline: every pipeline stage runs in a
            # subshell, so an increment done while expanding the echo
            # arguments would be lost to this shell.
            STEP_COUNT=$(( ${STEP_COUNT:-0} + 1 ))
            echo -e "${BLUE}[STEP ${STEP_COUNT}]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
            CURRENT_STEP="$message"
            ;;
        "SUCCESS")
            echo -e "${GREEN}[SUCCESS]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
            ;;
        "CRITICAL")
            echo -e "${RED}[CRITICAL]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
            ERROR_COUNT=$(( ${ERROR_COUNT:-0} + 1 ))
            ;;
    esac
}

# Level-specific wrappers. Each one forwards its first argument to
# print_message under a fixed level tag.
log_info() {
    print_message INFO "$1"
}

log_warn() {
    print_message WARN "$1"
}

log_error() {
    print_message ERROR "$1"
}

log_debug() {
    print_message DEBUG "$1"
}

log_step() {
    print_message STEP "$1"
}

log_success() {
    print_message SUCCESS "$1"
}

log_critical() {
    print_message CRITICAL "$1"
}

# ERR-trap handler: log the failure context, then run state capture,
# rollbacks, the recovery banner, cleanups and the error report, and finally
# exit with the status of the command that failed.
#
# Globals:
#   SCRIPT_NAME, CURRENT_STEP (read)
# Arguments:
#   $1 - line number of the failing command
#   $2 - BASH_LINENO value supplied by the trap (accepted, not used)
#   $3 - text of the failing command (default: "unknown")
# Returns:
#   Does not return; exits with the status that was in $? on entry.
error_handler() {
    # Must stay the first statement so $? is still the failing status.
    local exit_code=$?

    # From here on a failing diagnostic must not cut the handler short:
    # with errexit still active, one failing step would end the script before
    # the rollback and cleanup functions had a chance to run.
    trap - ERR
    set +e

    local line_number=$1
    local last_command="${3:-unknown}"
    local funcstack=("${FUNCNAME[@]:1}")

    # Join with spaces explicitly; "${funcstack[*]}" would use the first
    # character of IFS, which this library sets to a newline.
    local stack_display
    printf -v stack_display '%s ' "${funcstack[@]}"
    stack_display=${stack_display% }

    log_critical "Script failed in $SCRIPT_NAME"
    log_critical "Exit code: $exit_code"
    log_critical "Line number: $line_number"
    log_critical "Command: $last_command"
    log_critical "Current step: $CURRENT_STEP"
    log_critical "Function stack: $stack_display"

    # Capture system state for debugging
    capture_system_state_on_error

    # Execute rollback functions in reverse order
    execute_rollback_functions

    # Show recovery options
    show_recovery_options

    # Execute cleanup functions
    execute_cleanup_functions

    # Generate error report
    generate_error_report

    exit "$exit_code"
}

# Snapshot diagnostic information into a fresh, timestamped
# "$LOG_DIR/error_state_*" directory.
#
# Globals:
#   LOG_DIR (read)
# Every probe is best effort: its stderr is discarded and a failing or
# missing tool is ignored, so one broken probe does not stop the others.
capture_system_state_on_error() {
    local snapshot_dir="$LOG_DIR/error_state_$(date +%Y%m%d_%H%M%S)"
    mkdir -p "$snapshot_dir"

    log_info "Capturing system state for debugging..."

    # Running processes and listening/open sockets.
    ps aux >"$snapshot_dir/processes.txt" 2>/dev/null || :
    ss -tulpn >"$snapshot_dir/network.txt" 2>/dev/null || :

    # Docker details, only when a docker command can be found.
    if command -v docker &>/dev/null; then
        docker ps -a >"$snapshot_dir/docker_containers.txt" 2>/dev/null || :
        docker images >"$snapshot_dir/docker_images.txt" 2>/dev/null || :
        docker system df >"$snapshot_dir/docker_disk.txt" 2>/dev/null || :
        docker system events --since 1h --until now >"$snapshot_dir/docker_events.txt" 2>/dev/null || :
    fi

    # Disk and memory usage.
    df -h >"$snapshot_dir/disk_space.txt" 2>/dev/null || :
    free -h >"$snapshot_dir/memory.txt" 2>/dev/null || :

    # Last 100 lines of the system log.
    tail -n 100 /var/log/syslog >"$snapshot_dir/system_logs.txt" 2>/dev/null || :

    log_info "System state captured in: $snapshot_dir"
}

# Run every registered rollback function, newest registration first.
#
# Globals:
#   ROLLBACK_FUNCTIONS (read) - function names added by register_rollback
# A rollback that fails, or a registered name that is not a defined
# function, is logged with log_error and the remaining rollbacks still run.
execute_rollback_functions() {
    # Both loop variables are local so a caller's own `i` (or
    # `rollback_func`) is not overwritten.
    local i rollback_func

    if [[ ${#ROLLBACK_FUNCTIONS[@]} -gt 0 ]]; then
        log_info "Executing rollback functions..."

        for ((i = ${#ROLLBACK_FUNCTIONS[@]} - 1; i >= 0; i--)); do
            rollback_func="${ROLLBACK_FUNCTIONS[i]}"
            log_info "Executing rollback: $rollback_func"

            if declare -F "$rollback_func" >/dev/null; then
                "$rollback_func" || log_error "Rollback function $rollback_func failed"
            else
                log_error "Rollback function $rollback_func not found"
            fi
        done
    fi
}

# Print the boxed "RECOVERY OPTIONS" banner to stdout, framed by one blank
# line above and below.
#
# Globals:
#   CYAN, NC (read)                 - colour sequences for the box frame
#   LOG_FILE, ERROR_LOG, LOG_DIR (read) - paths shown in the hints
show_recovery_options() {
    local edge="${CYAN}║${NC}"
    local -a banner=(
        "${CYAN}╔══════════════════════════════════════════════════════════════╗${NC}"
        "${CYAN}║                        RECOVERY OPTIONS                     ║${NC}"
        "${CYAN}╠══════════════════════════════════════════════════════════════╣${NC}"
        "${edge} 1. Check logs: tail -f $LOG_FILE${edge}"
        "${edge} 2. Review errors: tail -f $ERROR_LOG${edge}"
        "${edge} 3. System state: ls -la $LOG_DIR/error_state_*${edge}"
        "${edge} 4. Resume from checkpoint (if available)${edge}"
        "${edge} 5. Run cleanup manually: execute_cleanup_functions${edge}"
        "${CYAN}╚══════════════════════════════════════════════════════════════╝${NC}"
    )
    local banner_line

    echo ""
    for banner_line in "${banner[@]}"; do
        echo -e "$banner_line"
    done
    echo ""
}

# Run every registered cleanup function once, in registration order.
#
# Globals:
#   CLEANUP_FUNCTIONS (read, then emptied)
# The registry is drained before the functions run. On the error path this
# function is reached twice (from error_handler and again from the EXIT trap
# via cleanup_on_exit); draining makes the second call a no-op instead of
# running every cleanup a second time.
# A cleanup that fails, or a registered name that is not a defined function,
# is logged with log_error and the remaining cleanups still run.
execute_cleanup_functions() {
    if [[ ${#CLEANUP_FUNCTIONS[@]} -eq 0 ]]; then
        return 0
    fi

    local pending=("${CLEANUP_FUNCTIONS[@]}")
    local cleanup_func
    CLEANUP_FUNCTIONS=()

    log_info "Executing cleanup functions..."

    for cleanup_func in "${pending[@]}"; do
        log_info "Executing cleanup: $cleanup_func"

        if declare -F "$cleanup_func" >/dev/null; then
            "$cleanup_func" || log_error "Cleanup function $cleanup_func failed"
        else
            log_error "Cleanup function $cleanup_func not found"
        fi
    done
}

# Write a Markdown error report to "$LOG_DIR/error_report_<timestamp>.md".
#
# Globals:
#   LOG_DIR, LOG_FILE, ERROR_LOG, SCRIPT_NAME, SCRIPT_PID, START_TIME,
#   ERROR_COUNT, WARNING_COUNT, STEP_COUNT, CURRENT_STEP (read)
# Arguments:
#   $1 - (optional) exit code to record. When omitted, a variable named
#        exit_code that is visible here is used - error_handler's local of
#        that name is visible to the functions it calls - and otherwise the
#        status that was in $? when this function was entered.
# Outputs:
#   The report file; its path is announced with log_info.
generate_error_report() {
    # Must stay the first statement: inside the here-document below, $? would
    # only be the status of the preceding `local` assignments (always 0), not
    # the failure being reported.
    local failure_code=${1:-${exit_code:-$?}}
    local report_file="$LOG_DIR/error_report_$(date +%Y%m%d_%H%M%S).md"
    local duration=$(($(date +%s) - START_TIME))

    cat > "$report_file" << EOF
# Migration Script Error Report

**Script:** $SCRIPT_NAME
**PID:** $SCRIPT_PID
**Date:** $(date)
**Duration:** ${duration}s
**Exit Code:** $failure_code

## Summary
- **Errors:** $ERROR_COUNT
- **Warnings:** $WARNING_COUNT
- **Steps Completed:** $STEP_COUNT
- **Failed Step:** $CURRENT_STEP

## Error Details
\`\`\`
$(tail -n 20 "$ERROR_LOG")
\`\`\`

## System State
- **Log File:** $LOG_FILE
- **Error Log:** $ERROR_LOG
- **System State:** $LOG_DIR/error_state_*

## Recovery Actions
1. Review error logs for specific failure cause
2. Check system state capture for debugging
3. Run cleanup functions if needed
4. Consider manual rollback if automatic rollback failed

## Next Steps
- [ ] Identify root cause
- [ ] Apply fix
- [ ] Test fix in staging environment
- [ ] Re-run migration with fix applied
EOF

    log_info "Error report generated: $report_file"
}

# Append a function name to CLEANUP_FUNCTIONS, the list that
# execute_cleanup_functions later walks.
# Arguments:
#   $1 - name of the cleanup function
register_cleanup() {
    CLEANUP_FUNCTIONS+=("$1")
    log_debug "Registered cleanup function: $1"
}

# Append a function name to ROLLBACK_FUNCTIONS, the list that
# execute_rollback_functions later walks in reverse.
# Arguments:
#   $1 - name of the rollback function
register_rollback() {
    ROLLBACK_FUNCTIONS+=("$1")
    log_debug "Registered rollback function: $1"
}

# Check that every named command can be found with `command -v`.
#
# Arguments:
#   $@ - command names that must be available
# Returns:
#   0 when all commands are found (or none were given).
#   Exits the shell with status 1 when at least one is missing, after
#   logging each missing command and a combined summary.
validate_prerequisites() {
    # Local loop variable so a caller's own `cmd` is left alone.
    local cmd
    local missing_commands=()

    log_step "Validating prerequisites..."

    # Iterating "$@" directly also behaves when no arguments were given.
    for cmd in "$@"; do
        if command -v "$cmd" >/dev/null 2>&1; then
            log_debug "Found required command: $cmd"
        else
            missing_commands+=("$cmd")
            log_error "Required command not found: $cmd"
        fi
    done

    if [[ ${#missing_commands[@]} -gt 0 ]]; then
        # Join with spaces explicitly; "${missing_commands[*]}" would use the
        # first character of IFS, which this library sets to a newline.
        local missing_display
        printf -v missing_display '%s ' "${missing_commands[@]}"
        log_critical "Missing required commands: ${missing_display% }"
        log_info "Install missing commands and retry"
        exit 1
    fi

    log_success "All prerequisites validated"
}

# Verify that a filesystem has at least the requested free space.
#
# Arguments:
#   $1 - required free space in whole GB (default: 1)
#   $2 - path whose filesystem is checked (default: /)
# Returns:
#   0 when enough space is available.
#   1 when space is insufficient or the available space cannot be determined.
check_disk_space() {
    local required_space_gb=${1:-1}
    local mount_point=${2:-"/"}
    local available_gb

    log_step "Checking disk space for $mount_point..."

    # -P keeps every filesystem on a single output line so that line 2,
    # column 4 is always the "Available" figure. A failing df must not abort
    # the script here; it is reported explicitly below.
    available_gb=$(df -P -BG "$mount_point" | awk 'NR==2 { sub(/G$/, "", $4); print $4 }') || available_gb=""

    # Without this check an empty or non-numeric value would be compared as 0
    # and reported as "Insufficient disk space ... Available: GB".
    if [[ ! $available_gb =~ ^[0-9]+$ ]]; then
        log_critical "Unable to determine available disk space for $mount_point"
        return 1
    fi

    if [[ $available_gb -lt $required_space_gb ]]; then
        log_critical "Insufficient disk space. Required: ${required_space_gb}GB, Available: ${available_gb}GB"
        return 1
    else
        log_success "Sufficient disk space available: ${available_gb}GB"
        return 0
    fi
}

# Check that every given host answers a ping and, unless it is localhost or
# 127.0.0.1, accepts a non-interactive SSH login.
#
# Arguments:
#   $@ - host names or addresses to test
# Returns:
#   0 when every host passes (or none were given).
#   1 at the first host that fails the ping or the SSH check.
validate_network_connectivity() {
    # Local loop variable so a caller's own `host` is left alone.
    local host

    log_step "Validating network connectivity..."

    # Iterating "$@" directly also behaves when no arguments were given.
    for host in "$@"; do
        log_info "Testing connectivity to $host..."

        # One ping with a 5 second reply timeout.
        if ping -c 1 -W 5 "$host" >/dev/null 2>&1; then
            log_success "Successfully connected to $host"
        else
            log_error "Cannot reach $host"
            return 1
        fi

        # Test SSH connectivity if not localhost. BatchMode makes ssh fail
        # rather than prompt for a password.
        if [[ "$host" != "localhost" ]] && [[ "$host" != "127.0.0.1" ]]; then
            if ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" "echo 'SSH OK'" >/dev/null 2>&1; then
                log_success "SSH connectivity to $host verified"
            else
                log_error "SSH connectivity to $host failed"
                return 1
            fi
        fi
    done

    log_success "Network connectivity validated"
}

# Write a checkpoint file holding the current progress counters.
#
# Globals:
#   LOG_DIR, SCRIPT_NAME, CURRENT_STEP, STEP_COUNT, ERROR_COUNT,
#   WARNING_COUNT (read)
# Arguments:
#   $1 - checkpoint name, used as the file name prefix
# Outputs:
#   stdout - only the path of the checkpoint file, so callers can capture it
#            with $(create_checkpoint NAME)
#   stderr - the "Checkpoint created" log line
create_checkpoint() {
    local checkpoint_name=$1
    local checkpoint_dir="$LOG_DIR/checkpoints"
    local checkpoint_file="$checkpoint_dir/${checkpoint_name}_$(date +%Y%m%d_%H%M%S).checkpoint"

    mkdir -p "$checkpoint_dir"

    # restore_from_checkpoint loads this file with `source`, so every value is
    # shell-quoted with %q. An unquoted value containing spaces (the output of
    # `date`, most step descriptions) would otherwise be read as an
    # assignment followed by a command to execute.
    {
        printf 'CHECKPOINT_NAME=%q\n' "$checkpoint_name"
        printf 'CHECKPOINT_TIME=%q\n' "$(date)"
        printf 'SCRIPT_NAME=%q\n' "$SCRIPT_NAME"
        printf 'CURRENT_STEP=%q\n' "$CURRENT_STEP"
        printf 'STEP_COUNT=%q\n' "$STEP_COUNT"
        printf 'ERROR_COUNT=%q\n' "$ERROR_COUNT"
        printf 'WARNING_COUNT=%q\n' "$WARNING_COUNT"
    } > "$checkpoint_file"

    # The log line goes to stderr so that stdout carries nothing but the path.
    log_info "Checkpoint created: $checkpoint_file" >&2
    echo "$checkpoint_file"
}

# Load the variables stored in a checkpoint file into the current shell.
#
# Arguments:
#   $1 - path of the checkpoint file
# Returns:
#   0 after the file has been loaded, 1 when it does not exist.
# The file is loaded with `source`, i.e. it is executed as shell code.
restore_from_checkpoint() {
    local checkpoint_file=$1

    if [[ ! -f "$checkpoint_file" ]]; then
        log_error "Checkpoint file not found: $checkpoint_file"
        return 1
    fi

    source "$checkpoint_file"
    log_info "Restored from checkpoint: $CHECKPOINT_NAME at $CHECKPOINT_TIME"
    return 0
}

# Poll a health-check command until it succeeds or the time budget is spent.
#
# Arguments:
#   $1 - service name, used only in log messages
#   $2 - health-check command line; it is run through `eval`, so it is
#        interpreted as shell code
#   $3 - maximum wait in seconds (default: 300)
#   $4 - seconds between attempts (default: 10)
# Returns:
#   0 as soon as the health check succeeds, 1 once the budget is used up.
wait_for_service() {
    local service_name=$1
    local health_check_command=$2
    local max_wait=${3:-300}
    local interval=${4:-10}
    local elapsed=0

    log_step "Waiting for service $service_name to be ready..."

    # Elapsed time is counted in whole intervals slept, not wall-clock time.
    until [[ $elapsed -ge $max_wait ]]; do
        if ! eval "$health_check_command" >/dev/null 2>&1; then
            log_info "Service $service_name not ready yet, waiting ${interval}s... (${elapsed}/${max_wait}s)"
            sleep "$interval"
            elapsed=$((elapsed + interval))
            continue
        fi

        log_success "Service $service_name is ready (${elapsed}s)"
        return 0
    done

    log_error "Service $service_name failed to become ready within ${max_wait}s"
    return 1
}

# Run a command until it succeeds or the attempt budget is used up.
#
# Arguments:
#   $1  - maximum number of attempts
#   $2  - delay passed to `sleep` between attempts
#   $3+ - the command and its arguments
# Returns:
#   0 as soon as one attempt succeeds, 1 after the last attempt has failed.
execute_with_retry() {
    local max_attempts=$1
    local delay=$2
    shift 2
    local command=("$@")
    local attempt=1
    local exit_code

    # Space-joined copy for log output; "${command[*]}" would use the first
    # character of IFS, which this library sets to a newline.
    local command_display
    printf -v command_display '%s ' "${command[@]}"
    command_display=${command_display% }

    while [[ $attempt -le $max_attempts ]]; do
        log_info "Executing (attempt $attempt/$max_attempts): $command_display"

        if "${command[@]}"; then
            log_success "Command succeeded on attempt $attempt"
            return 0
        else
            # $? still holds the status of the command tested by `if`.
            exit_code=$?
            log_warn "Command failed on attempt $attempt with exit code $exit_code"

            # No sleep after the final attempt.
            if [[ $attempt -lt $max_attempts ]]; then
                log_info "Retrying in ${delay}s..."
                sleep "$delay"
            fi
        fi

        attempt=$((attempt + 1))
    done

    log_error "Command failed after $max_attempts attempts"
    return 1
}

# Sample CPU, memory and root-filesystem usage at a fixed interval and warn
# whenever a reading is above 90%.
#
# Arguments:
#   $1 - how long to keep sampling, in seconds (default: 60)
#   $2 - seconds to sleep between samples (default: 5)
# Outputs:
#   One log_debug line per sample; a log_warn line per reading above 90%.
monitor_resources() {
    local duration=${1:-60}  # Monitor for 60 seconds by default
    local interval=${2:-5}   # Check every 5 seconds
    local end_time cpu_usage mem_usage disk_usage

    log_info "Monitoring system resources for ${duration}s..."

    end_time=$(($(date +%s) + duration))
    while [[ $(date +%s) -lt $end_time ]]; do
        # A probe that fails leaves its reading empty instead of stopping the
        # monitor; empty readings simply raise no alert below.
        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//') || cpu_usage=""
        mem_usage=$(free | grep Mem | awk '{printf "%.1f", $3/$2 * 100.0}') || mem_usage=""
        disk_usage=$(df / | tail -1 | awk '{print $5}' | sed 's/%//') || disk_usage=""

        log_debug "Resource usage - CPU: ${cpu_usage}%, Memory: ${mem_usage}%, Disk: ${disk_usage}%"

        # Alert on high resource usage. The fractional comparisons use awk,
        # which this function already needs, rather than bc: when bc is not
        # installed the comparison yields nothing and the alert never fires.
        # "+ 0" forces a numeric comparison and turns an empty reading into 0.
        if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 90) }'; then
            log_warn "High CPU usage detected: ${cpu_usage}%"
        fi

        if awk -v usage="$mem_usage" 'BEGIN { exit !(usage + 0 > 90) }'; then
            log_warn "High memory usage detected: ${mem_usage}%"
        fi

        # Integer part only; skipped when the reading is missing or not a number.
        if [[ ${disk_usage%.*} =~ ^[0-9]+$ ]] && [[ ${disk_usage%.*} -gt 90 ]]; then
            log_warn "High disk usage detected: ${disk_usage}%"
        fi

        sleep "$interval"
    done
}

# EXIT-trap handler: log a run summary, run the registered cleanups, put the
# saved stdout/stderr back, and exit with the status the script was ending
# with.
#
# Globals:
#   START_TIME, ERROR_COUNT, WARNING_COUNT (read)
cleanup_on_exit() {
    # Must stay the first statement so $? is still the script's exit status.
    local exit_code=$?
    local elapsed=$(($(date +%s) - START_TIME))
    local summary_line

    for summary_line in \
        "Script execution completed" \
        "Duration: ${elapsed}s" \
        "Errors: $ERROR_COUNT, Warnings: $WARNING_COUNT"; do
        log_info "$summary_line"
    done

    execute_cleanup_functions

    # Point fds 1 and 2 back at the copies kept on fds 3 and 4, then close
    # the copies.
    exec 1>&3 2>&4
    exec 3>&- 4>&-

    exit "$exit_code"
}

# Trap signals and errors.
# ERR: pass the failing line number, BASH_LINENO and the text of the failing
# command to error_handler.
# NOTE(review): errtrace (`set -E`) is not enabled in the visible part of this
# file; per the Bash manual an ERR trap is then not inherited by shell
# functions, command substitutions or subshells - confirm that failures
# inside functions are meant to bypass error_handler.
trap 'error_handler ${LINENO} ${BASH_LINENO} "$BASH_COMMAND"' ERR
# EXIT: run cleanup_on_exit whenever the shell exits.
trap 'cleanup_on_exit' EXIT
# INT / TERM: log a warning, then exit with status 130 / 143.
trap 'log_warn "Received SIGINT, initiating graceful shutdown..."; exit 130' INT
trap 'log_warn "Received SIGTERM, initiating graceful shutdown..."; exit 143' TERM

# Announce the run and the files its output is being logged to.
log_info "Started script: $SCRIPT_NAME (PID: $SCRIPT_PID)"
log_info "Log file: $LOG_FILE"
log_info "Error log: $ERROR_LOG"

# Export functions for use in child bash processes.
# print_message has to be exported as well: every log_* wrapper is a call
# into it, so without it each exported logger fails in a child shell with
# "print_message: command not found".
export -f print_message
export -f log_info log_warn log_error log_debug log_step log_success log_critical
export -f register_cleanup register_rollback validate_prerequisites
export -f check_disk_space validate_network_connectivity
export -f create_checkpoint restore_from_checkpoint
export -f wait_for_service execute_with_retry monitor_resources

# The exported functions read these paths and colour codes when they are
# called, so a child process needs them in its environment too; otherwise
# `tee -a ""` fails and create_checkpoint would write below "/checkpoints".
export LOG_DIR LOG_FILE ERROR_LOG
export RED GREEN YELLOW BLUE PURPLE CYAN NC