Files
HomeAudit/migration_scripts/scripts/lib/error_handling.sh
2025-08-24 11:13:39 -04:00

496 lines
16 KiB
Bash
Executable File

#!/bin/bash
# Enhanced Error Handling Library
# Provides robust error handling, logging, and recovery mechanisms
#
# This file runs its set-up at load time: it enables strict mode, creates the
# log directory, redirects stdout/stderr through tee, and declares the global
# state used by the functions defined further down.
# Global error handling configuration
# -e: exit on an unhandled non-zero status; -u: error on unset variables;
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail
# Word-split only on newline and tab, never on spaces.
IFS=$'\n\t'
# Colors for output
# ANSI escape sequences; they are stored with a literal backslash and are
# interpreted by `echo -e` at print time.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly PURPLE='\033[0;35m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # No Color
# Logging configuration
# Both file names carry a timestamp taken when this file is loaded, so every
# run gets its own pair of log files.
readonly LOG_DIR="/opt/migration/logs"
readonly LOG_FILE="$LOG_DIR/migration_$(date +%Y%m%d_%H%M%S).log"
readonly ERROR_LOG="$LOG_DIR/errors_$(date +%Y%m%d_%H%M%S).log"
# Ensure log directory exists
# Under `set -e` a failure of either command aborts the script right here.
mkdir -p "$LOG_DIR"
chmod 755 "$LOG_DIR"
# Initialize logging
# Keep copies of the original stdout/stderr on fds 3 and 4; cleanup_on_exit
# restores them from there.
exec 3>&1 4>&2
# From here on everything written to stdout is also appended to LOG_FILE ...
exec 1> >(tee -a "$LOG_FILE")
# ... and everything written to stderr is also appended to ERROR_LOG; tee's
# own output is sent back to stderr.
# NOTE(review): print_message additionally pipes its output through
# `tee -a` into these same files, so its messages are probably written to
# the files twice - confirm whether that is intended.
exec 2> >(tee -a "$ERROR_LOG" >&2)
# Global variables
declare -g SCRIPT_NAME="${0##*/}"   # $0 with any leading directory stripped
declare -g SCRIPT_PID=$$
declare -g START_TIME=$(date +%s)   # epoch seconds, used for duration reporting
declare -g CLEANUP_FUNCTIONS=()     # function names added by register_cleanup
declare -g ROLLBACK_FUNCTIONS=()    # function names added by register_rollback
declare -g ERROR_COUNT=0            # bumped by ERROR and CRITICAL messages
declare -g WARNING_COUNT=0          # bumped by WARN messages
declare -g STEP_COUNT=0             # bumped by STEP messages
declare -g CURRENT_STEP=""          # text of the most recent STEP message
# Function to print formatted messages
#######################################
# Print a colourised, timestamped message and append it to a log file.
# Globals:
#   RED, GREEN, YELLOW, BLUE, PURPLE, NC (read)  - colour escape sequences
#   LOG_FILE, ERROR_LOG (read)                   - files appended to
#   DEBUG (read)                                 - "true" enables DEBUG output
#   WARNING_COUNT, ERROR_COUNT, STEP_COUNT, CURRENT_STEP (written)
# Arguments:
#   $1 - level: INFO, WARN, ERROR, DEBUG, STEP, SUCCESS or CRITICAL
#   $2 - message text
# Outputs:
#   INFO/DEBUG/STEP/SUCCESS -> stdout and LOG_FILE
#   WARN                    -> stderr and LOG_FILE
#   ERROR/CRITICAL          -> stderr and ERROR_LOG
#   Any other level prints nothing.
#######################################
print_message() {
  local level=$1
  local message=$2
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  case "$level" in
    "INFO")
      echo -e "${GREEN}[INFO]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      ;;
    "WARN")
      echo -e "${YELLOW}[WARN]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE" >&2
      # Plain assignment instead of ((WARNING_COUNT++)): the post-increment
      # form returns status 1 while the counter is 0, which aborts the caller
      # under `set -e` on the very first warning.
      WARNING_COUNT=$((WARNING_COUNT + 1))
      ;;
    "ERROR")
      echo -e "${RED}[ERROR]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
      ERROR_COUNT=$((ERROR_COUNT + 1))
      ;;
    "DEBUG")
      if [[ "${DEBUG:-false}" == "true" ]]; then
        echo -e "${PURPLE}[DEBUG]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      fi
      ;;
    "STEP")
      # Increment before the pipeline: every pipeline stage runs in a
      # subshell, so an increment inside the echo would be lost.
      STEP_COUNT=$((STEP_COUNT + 1))
      echo -e "${BLUE}[STEP ${STEP_COUNT}]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      CURRENT_STEP="$message"
      ;;
    "SUCCESS")
      echo -e "${GREEN}[SUCCESS]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      ;;
    "CRITICAL")
      echo -e "${RED}[CRITICAL]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
      ERROR_COUNT=$((ERROR_COUNT + 1))
      ;;
  esac
}
# Convenience functions
# One thin wrapper per log level; each forwards its first argument to
# print_message together with the matching level name.
log_info() {
  print_message "INFO" "$1"
}

log_warn() {
  print_message "WARN" "$1"
}

log_error() {
  print_message "ERROR" "$1"
}

log_debug() {
  print_message "DEBUG" "$1"
}

log_step() {
  print_message "STEP" "$1"
}

log_success() {
  print_message "SUCCESS" "$1"
}

log_critical() {
  print_message "CRITICAL" "$1"
}
# Enhanced error handler with context and recovery
#######################################
# ERR-trap handler: logs the failure context, captures system state, runs
# rollback and cleanup functions, writes an error report, then exits with
# the status of the command that failed.
# Globals:
#   SCRIPT_NAME, CURRENT_STEP (read)
# Arguments:
#   $1 - line number of the failing command (optional)
#   $2 - BASH_LINENO value; accepted for the trap's calling convention, unused
#   $3 - text of the failing command (optional)
# Returns:
#   Does not return; exits with the failing command's status.
#######################################
error_handler() {
  # Must be the first statement so $? still holds the failing status.
  local exit_code=$?
  local line_number=${1:-unknown}
  local last_command="${3:-unknown}"
  local funcstack=("${FUNCNAME[@]:1}")

  # The handler has to finish its recovery work even when one of the steps
  # below fails: stop errexit from aborting it half-way and make sure a
  # failure in here cannot re-enter the handler.
  trap - ERR
  set +e

  log_critical "Script failed in $SCRIPT_NAME"
  log_critical "Exit code: $exit_code"
  log_critical "Line number: $line_number"
  log_critical "Command: $last_command"
  log_critical "Current step: $CURRENT_STEP"
  log_critical "Function stack: ${funcstack[*]:-}"

  # Capture system state for debugging
  capture_system_state_on_error
  # Execute rollback functions in reverse order
  execute_rollback_functions
  # Show recovery options
  show_recovery_options
  # Execute cleanup functions
  execute_cleanup_functions
  # Generate error report (the status is passed along so the report can
  # show it; an implementation that ignores arguments is unaffected)
  generate_error_report "$exit_code"

  exit "$exit_code"
}
# Capture system state when error occurs
#######################################
# Best-effort snapshot of processes, sockets, Docker state, disk, memory and
# recent syslog lines into a fresh, timestamped directory under LOG_DIR.
# Every individual capture may fail without affecting the others.
# Globals:
#   LOG_DIR (read)
# Returns:
#   0 even when the snapshot directory cannot be created - this runs during
#   error handling and must not abort it.
#######################################
capture_system_state_on_error() {
  local error_state_dir
  error_state_dir="$LOG_DIR/error_state_$(date +%Y%m%d_%H%M%S)"

  # Without the directory none of the captures can be written; report it and
  # carry on instead of letting errexit kill the error handler.
  if ! mkdir -p "$error_state_dir" 2>/dev/null; then
    log_error "Could not create error state directory: $error_state_dir" || true
    return 0
  fi

  log_info "Capturing system state for debugging..."

  # Capture process information
  ps aux > "$error_state_dir/processes.txt" 2>/dev/null || true
  # Capture network state
  ss -tulpn > "$error_state_dir/network.txt" 2>/dev/null || true

  # Capture Docker state if available
  if command -v docker >/dev/null 2>&1; then
    docker ps -a > "$error_state_dir/docker_containers.txt" 2>/dev/null || true
    docker images > "$error_state_dir/docker_images.txt" 2>/dev/null || true
    docker system df > "$error_state_dir/docker_disk.txt" 2>/dev/null || true
    # NOTE(review): confirm that the docker CLI accepts "now" for --until;
    # a rejected value is hidden by the redirect and leaves this file empty.
    docker system events --since 1h --until now > "$error_state_dir/docker_events.txt" 2>/dev/null || true
  fi

  # Capture disk space
  df -h > "$error_state_dir/disk_space.txt" 2>/dev/null || true
  # Capture memory usage
  free -h > "$error_state_dir/memory.txt" 2>/dev/null || true
  # Capture recent logs
  tail -n 100 /var/log/syslog > "$error_state_dir/system_logs.txt" 2>/dev/null || true

  log_info "System state captured in: $error_state_dir"
}
# Execute rollback functions in reverse order
#######################################
# Run every registered rollback function, last registered first.
# A rollback that fails or is not a defined function is logged and the
# remaining rollbacks still run.
# Globals:
#   ROLLBACK_FUNCTIONS (read)
#######################################
execute_rollback_functions() {
  # Keep the index and the current name local so the loop cannot clobber a
  # caller's variables of the same name.
  local i rollback_func

  if [[ ${#ROLLBACK_FUNCTIONS[@]} -gt 0 ]]; then
    log_info "Executing rollback functions..."
    for ((i = ${#ROLLBACK_FUNCTIONS[@]} - 1; i >= 0; i--)); do
      rollback_func="${ROLLBACK_FUNCTIONS[i]}"
      log_info "Executing rollback: $rollback_func"
      if declare -F "$rollback_func" >/dev/null; then
        "$rollback_func" || log_error "Rollback function $rollback_func failed"
      else
        log_error "Rollback function $rollback_func not found"
      fi
    done
  fi
}
# Show recovery options to user
# Prints a framed list of follow-up actions (log locations, captured state,
# checkpoint and cleanup hints) to stdout, with a blank line before and after.
show_recovery_options() {
  local row
  local -a panel=(
    ""
    "${CYAN}╔══════════════════════════════════════════════════════════════╗${NC}"
    "${CYAN}║ RECOVERY OPTIONS ║${NC}"
    "${CYAN}╠══════════════════════════════════════════════════════════════╣${NC}"
    "${CYAN}${NC} 1. Check logs: tail -f $LOG_FILE${CYAN}${NC}"
    "${CYAN}${NC} 2. Review errors: tail -f $ERROR_LOG${CYAN}${NC}"
    "${CYAN}${NC} 3. System state: ls -la $LOG_DIR/error_state_*${CYAN}${NC}"
    "${CYAN}${NC} 4. Resume from checkpoint (if available)${CYAN}${NC}"
    "${CYAN}${NC} 5. Run cleanup manually: execute_cleanup_functions${CYAN}${NC}"
    "${CYAN}╚══════════════════════════════════════════════════════════════╝${NC}"
    ""
  )

  # One echo per row, escapes interpreted, exactly as when printed one by one.
  for row in "${panel[@]}"; do
    echo -e "$row"
  done
}
# Execute cleanup functions
#######################################
# Run every registered cleanup function once, in registration order.
# A cleanup that fails or is not a defined function is logged and the
# remaining cleanups still run.
# Globals:
#   CLEANUP_FUNCTIONS (read, then emptied)
#######################################
execute_cleanup_functions() {
  local cleanup_func
  local -a pending

  (( ${#CLEANUP_FUNCTIONS[@]} > 0 )) || return 0

  # On the error path this function is reached twice: once from the error
  # handler and again from the EXIT trap. Emptying the registry before
  # running makes the second call a no-op instead of repeating every cleanup.
  pending=("${CLEANUP_FUNCTIONS[@]}")
  CLEANUP_FUNCTIONS=()

  log_info "Executing cleanup functions..."
  for cleanup_func in "${pending[@]}"; do
    log_info "Executing cleanup: $cleanup_func"
    if declare -F "$cleanup_func" >/dev/null; then
      "$cleanup_func" || log_error "Cleanup function $cleanup_func failed"
    else
      log_error "Cleanup function $cleanup_func not found"
    fi
  done
}
# Generate comprehensive error report
#######################################
# Write a Markdown error report into LOG_DIR.
# Globals:
#   LOG_DIR, LOG_FILE, ERROR_LOG, START_TIME, SCRIPT_NAME, SCRIPT_PID,
#   ERROR_COUNT, WARNING_COUNT, STEP_COUNT, CURRENT_STEP (read)
# Arguments:
#   $1 - exit code to show in the report (optional; defaults to the status
#        of the command that ran immediately before this function)
# Outputs:
#   Creates $LOG_DIR/error_report_<timestamp>.md and logs its path.
#######################################
generate_error_report() {
  # Must be the first statement: inside the here-document $? would only be
  # the status of the preceding assignment, i.e. always 0.
  local exit_code=${1:-$?}
  local report_file duration
  report_file="$LOG_DIR/error_report_$(date +%Y%m%d_%H%M%S).md"
  duration=$(( $(date +%s) - START_TIME ))

  cat > "$report_file" << EOF
# Migration Script Error Report
**Script:** $SCRIPT_NAME
**PID:** $SCRIPT_PID
**Date:** $(date)
**Duration:** ${duration}s
**Exit Code:** $exit_code
## Summary
- **Errors:** $ERROR_COUNT
- **Warnings:** $WARNING_COUNT
- **Steps Completed:** $STEP_COUNT
- **Failed Step:** $CURRENT_STEP
## Error Details
\`\`\`
$(tail -n 20 "$ERROR_LOG" 2>/dev/null)
\`\`\`
## System State
- **Log File:** $LOG_FILE
- **Error Log:** $ERROR_LOG
- **System State:** $LOG_DIR/error_state_*
## Recovery Actions
1. Review error logs for specific failure cause
2. Check system state capture for debugging
3. Run cleanup functions if needed
4. Consider manual rollback if automatic rollback failed
## Next Steps
- [ ] Identify root cause
- [ ] Apply fix
- [ ] Test fix in staging environment
- [ ] Re-run migration with fix applied
EOF

  log_info "Error report generated: $report_file"
}
# Register cleanup function
# Appends the function name given as $1 to CLEANUP_FUNCTIONS and emits a
# debug message naming it.
register_cleanup() {
  local handler_name="$1"

  CLEANUP_FUNCTIONS+=("${handler_name}")
  log_debug "Registered cleanup function: ${handler_name}"
}
# Register rollback function
# Appends the function name given as $1 to ROLLBACK_FUNCTIONS and emits a
# debug message naming it.
register_rollback() {
  local handler_name="$1"

  ROLLBACK_FUNCTIONS+=("${handler_name}")
  log_debug "Registered rollback function: ${handler_name}"
}
# Function to validate prerequisites
#######################################
# Check that every named command is available on PATH.
# Arguments:
#   $@ - command names to look up (may be empty)
# Returns:
#   0 when all commands are found. Exits the shell with status 1 when at
#   least one is missing, after logging each missing name.
#######################################
validate_prerequisites() {
  # Loop variable kept local so it cannot overwrite a caller's $cmd.
  local cmd
  local missing_commands=()

  log_step "Validating prerequisites..."

  # Iterating "$@" directly is safe under `set -u` even with no arguments,
  # which an empty array copy is not on older bash releases.
  for cmd in "$@"; do
    if command -v "$cmd" >/dev/null 2>&1; then
      log_debug "Found required command: $cmd"
    else
      missing_commands+=("$cmd")
      log_error "Required command not found: $cmd"
    fi
  done

  if [[ ${#missing_commands[@]} -gt 0 ]]; then
    log_critical "Missing required commands: ${missing_commands[*]}"
    log_info "Install missing commands and retry"
    exit 1
  fi

  log_success "All prerequisites validated"
}
# Function to check disk space
#######################################
# Verify that a mount point has at least the required free space.
# Arguments:
#   $1 - required free space in GB (default: 1)
#   $2 - mount point or path to check (default: /)
# Returns:
#   0 when enough space is available; 1 when there is too little or the
#   available space cannot be determined.
#######################################
check_disk_space() {
  local required_space_gb=${1:-1}
  local mount_point=${2:-"/"}
  local available_gb

  log_step "Checking disk space for $mount_point..."

  # -P keeps each filesystem on a single output line, so the "available"
  # figure is always field 4 of line 2 even with long device names.
  available_gb=$(df -BG -P -- "$mount_point" 2>/dev/null \
    | awk 'NR==2 {sub(/G$/, "", $4); print $4}') || available_gb=""

  # An empty or non-numeric value means df failed (for example a missing
  # path); treat that as a failure rather than comparing it as 0.
  if [[ ! "$available_gb" =~ ^[0-9]+$ ]]; then
    log_critical "Unable to determine available disk space for $mount_point"
    return 1
  fi

  if [[ $available_gb -lt $required_space_gb ]]; then
    log_critical "Insufficient disk space. Required: ${required_space_gb}GB, Available: ${available_gb}GB"
    return 1
  fi

  log_success "Sufficient disk space available: ${available_gb}GB"
  return 0
}
# Function to validate network connectivity
#######################################
# Check that each host answers a ping and, unless it is localhost or
# 127.0.0.1, accepts a non-interactive SSH connection.
# Arguments:
#   $@ - host names or addresses to test
# Returns:
#   0 when every host passes; 1 at the first host that fails either check
#   (later hosts are not tested).
#######################################
validate_network_connectivity() {
  # Loop variable kept local so it cannot overwrite a caller's $host.
  local host

  log_step "Validating network connectivity..."

  for host in "$@"; do
    log_info "Testing connectivity to $host..."
    if ping -c 1 -W 5 "$host" >/dev/null 2>&1; then
      log_success "Successfully connected to $host"
    else
      log_error "Cannot reach $host"
      return 1
    fi

    # Test SSH connectivity if not localhost
    if [[ "$host" != "localhost" ]] && [[ "$host" != "127.0.0.1" ]]; then
      # BatchMode makes ssh fail instead of prompting for a password.
      if ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" "echo 'SSH OK'" >/dev/null 2>&1; then
        log_success "SSH connectivity to $host verified"
      else
        log_error "SSH connectivity to $host failed"
        return 1
      fi
    fi
  done

  log_success "Network connectivity validated"
}
# Function to create checkpoint
#######################################
# Write the current progress counters to a timestamped checkpoint file.
# Globals:
#   LOG_DIR, SCRIPT_NAME, CURRENT_STEP, STEP_COUNT, ERROR_COUNT,
#   WARNING_COUNT (read)
# Arguments:
#   $1 - checkpoint name (becomes part of the file name)
# Outputs:
#   stdout: the path of the checkpoint file and nothing else, so callers
#           can capture it with $(create_checkpoint name)
#   stderr: the informational log line
#######################################
create_checkpoint() {
  local checkpoint_name=$1
  local checkpoint_dir="$LOG_DIR/checkpoints"
  local checkpoint_file
  checkpoint_file="$checkpoint_dir/${checkpoint_name}_$(date +%Y%m%d_%H%M%S).checkpoint"

  mkdir -p "$checkpoint_dir"

  # The file is meant to be read back with `source`, so each value is
  # written shell-quoted (%q). Written raw, a value containing spaces or
  # shell metacharacters (a date, a step description) would be executed as
  # a command when the checkpoint is restored.
  {
    printf 'CHECKPOINT_NAME=%q\n' "$checkpoint_name"
    printf 'CHECKPOINT_TIME=%q\n' "$(date)"
    printf 'SCRIPT_NAME=%q\n' "$SCRIPT_NAME"
    printf 'CURRENT_STEP=%q\n' "$CURRENT_STEP"
    printf 'STEP_COUNT=%q\n' "$STEP_COUNT"
    printf 'ERROR_COUNT=%q\n' "$ERROR_COUNT"
    printf 'WARNING_COUNT=%q\n' "$WARNING_COUNT"
  } > "$checkpoint_file"

  # Keep the log line off stdout so it does not mix with the returned path.
  log_info "Checkpoint created: $checkpoint_file" >&2
  echo "$checkpoint_file"
}
# Function to restore from checkpoint
# Loads the variables stored in the checkpoint file given as $1 into the
# current shell and logs which checkpoint was restored.
# Returns 0 on success, 1 when the file does not exist.
# The file is executed as shell code, so only restore checkpoints that this
# library wrote itself.
restore_from_checkpoint() {
  local checkpoint_file=$1

  # Guard clause: nothing to load without a regular file.
  if [[ ! -f "$checkpoint_file" ]]; then
    log_error "Checkpoint file not found: $checkpoint_file"
    return 1
  fi

  . "$checkpoint_file"
  log_info "Restored from checkpoint: $CHECKPOINT_NAME at $CHECKPOINT_TIME"
  return 0
}
# Function to wait for service readiness
#######################################
# Poll a health-check command until it succeeds or a timeout elapses.
# Arguments:
#   $1 - service name (used in log messages only)
#   $2 - health-check command line; it is run with eval, so it must come
#        from a trusted source
#   $3 - maximum wait in whole seconds (default: 300)
#   $4 - polling interval in whole seconds, at least 1 (default: 10)
# Returns:
#   0 as soon as the health check succeeds; 1 on timeout or when the wait
#   parameters are not valid integers.
#######################################
wait_for_service() {
  local service_name=$1
  local health_check_command=$2
  local max_wait=${3:-300} # 5 minutes default
  local interval=${4:-10}  # 10 seconds default
  local elapsed=0

  log_step "Waiting for service $service_name to be ready..."

  # An interval of 0 would never advance the elapsed time (endless loop) and
  # non-integer values break the arithmetic below, so reject both up front.
  if [[ ! "$max_wait" =~ ^[0-9]+$ || ! "$interval" =~ ^[1-9][0-9]*$ ]]; then
    log_error "Invalid wait parameters for $service_name: max_wait=$max_wait, interval=$interval"
    return 1
  fi

  while [[ $elapsed -lt $max_wait ]]; do
    if eval "$health_check_command" >/dev/null 2>&1; then
      log_success "Service $service_name is ready (${elapsed}s)"
      return 0
    fi
    log_info "Service $service_name not ready yet, waiting ${interval}s... (${elapsed}/${max_wait}s)"
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done

  log_error "Service $service_name failed to become ready within ${max_wait}s"
  return 1
}
# Function to execute with retry
# Runs a command up to $1 times, pausing $2 seconds between attempts.
# Arguments: $1 - maximum attempts, $2 - delay in seconds, $3... - command
# and its arguments. Returns 0 on the first successful attempt, 1 once all
# attempts have failed.
execute_with_retry() {
  local max_attempts=$1
  local delay=$2
  shift 2
  local cmd_argv=("$@")
  local attempt=1
  local exit_code

  while [[ $attempt -le $max_attempts ]]; do
    log_info "Executing (attempt $attempt/$max_attempts): ${cmd_argv[*]}"

    # Record the status without letting a failure trip errexit.
    exit_code=0
    "${cmd_argv[@]}" || exit_code=$?

    if [[ $exit_code -eq 0 ]]; then
      log_success "Command succeeded on attempt $attempt"
      return 0
    fi

    log_warn "Command failed on attempt $attempt with exit code $exit_code"
    # No pause after the final attempt.
    if [[ $attempt -lt $max_attempts ]]; then
      log_info "Retrying in ${delay}s..."
      sleep "$delay"
    fi

    attempt=$((attempt + 1))
  done

  log_error "Command failed after $max_attempts attempts"
  return 1
}
# Function to monitor resource usage
#######################################
# Sample CPU, memory and root-filesystem usage at a fixed interval and warn
# whenever a reading is above 90%.
# Arguments:
#   $1 - how long to monitor, in seconds (default: 60)
#   $2 - pause between samples, in seconds (default: 5)
# Outputs:
#   One debug message per sample; a warning for each reading above 90%.
#######################################
monitor_resources() {
  local duration=${1:-60} # Monitor for 60 seconds by default
  local interval=${2:-5}  # Check every 5 seconds
  local end_time cpu_usage mem_usage disk_usage

  log_info "Monitoring system resources for ${duration}s..."
  end_time=$(( $(date +%s) + duration ))

  while [[ $(date +%s) -lt $end_time ]]; do
    # Each probe is best-effort: a tool that is missing or prints an
    # unexpected format yields an empty reading instead of an error.
    cpu_usage=$(top -bn1 2>/dev/null | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//') || cpu_usage=""
    mem_usage=$(free 2>/dev/null | grep Mem | awk '{printf "%.1f", $3/$2 * 100.0}') || mem_usage=""
    disk_usage=$(df / 2>/dev/null | tail -1 | awk '{print $5}' | sed 's/%//') || disk_usage=""

    log_debug "Resource usage - CPU: ${cpu_usage}%, Memory: ${mem_usage}%, Disk: ${disk_usage}%"

    # Alert on high resource usage. The comparison is done in awk, which the
    # probes above already rely on, rather than bc, which is not guaranteed
    # to be installed; "v + 0" turns an empty or malformed reading into 0.
    if awk -v v="$cpu_usage" 'BEGIN { exit !(v + 0 > 90) }'; then
      log_warn "High CPU usage detected: ${cpu_usage}%"
    fi
    if awk -v v="$mem_usage" 'BEGIN { exit !(v + 0 > 90) }'; then
      log_warn "High memory usage detected: ${mem_usage}%"
    fi
    if awk -v v="${disk_usage%.*}" 'BEGIN { exit !(v + 0 > 90) }'; then
      log_warn "High disk usage detected: ${disk_usage}%"
    fi

    sleep "$interval"
  done
}
# Set up signal handlers
# EXIT-trap handler: logs a run summary, runs the registered cleanup
# functions, puts stdout/stderr back onto the descriptors saved on fds 3 and
# 4, and exits with the status the script was ending with.
cleanup_on_exit() {
  # Read $? before anything else can overwrite it.
  local exit_code=$?
  local finished_at=$(date +%s)

  log_info "Script execution completed"
  log_info "Duration: $((finished_at - START_TIME))s"
  log_info "Errors: $ERROR_COUNT, Warnings: $WARNING_COUNT"

  execute_cleanup_functions

  # Restore stdout/stderr, then close the saved copies; redirections are
  # applied left to right.
  exec 1>&3 2>&4 3>&- 4>&-

  exit $exit_code
}
# Trap signals and errors
# NOTE(review): errtrace (set -E) is not enabled in this file, so per Bash's
# trap semantics the ERR trap is not inherited by shell functions, command
# substitutions or subshells; failures there reach only the EXIT trap.
# Enabling errtrace would also fire the handler for failing command
# substitutions inside this library, so it needs an audit first.
trap 'error_handler ${LINENO} ${BASH_LINENO} "$BASH_COMMAND"' ERR
trap 'cleanup_on_exit' EXIT
# 130 and 143 are the conventional 128+signal exit statuses.
trap 'log_warn "Received SIGINT, initiating graceful shutdown..."; exit 130' INT
trap 'log_warn "Received SIGTERM, initiating graceful shutdown..."; exit 143' TERM
# Initialize logging
log_info "Started script: $SCRIPT_NAME (PID: $SCRIPT_PID)"
log_info "Log file: $LOG_FILE"
log_info "Error log: $ERROR_LOG"
# Export functions for use in other scripts
# Every exported log_* wrapper calls print_message, so it has to be exported
# too, together with the colour and log-path variables it reads; otherwise a
# child bash fails with "print_message: command not found".
export RED GREEN YELLOW BLUE PURPLE NC LOG_FILE ERROR_LOG
export -f print_message
export -f log_info log_warn log_error log_debug log_step log_success log_critical
export -f register_cleanup register_rollback validate_prerequisites
export -f check_disk_space validate_network_connectivity
export -f create_checkpoint restore_from_checkpoint
export -f wait_for_service execute_with_retry monitor_resources