496 lines
16 KiB
Bash
Executable File
496 lines
16 KiB
Bash
Executable File
#!/bin/bash
# Enhanced Error Handling Library
# Provides robust error handling, logging, and recovery mechanisms

# Global error handling configuration:
#   -e          exit on an unhandled non-zero status
#   -u          treat expansion of unset variables as an error
#   -o pipefail a pipeline fails when any stage fails
set -euo pipefail
# Word-split on newline and tab only (never on spaces).
IFS=$'\n\t'

# Colors for output (escape sequences are interpreted by `echo -e`)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly PURPLE='\033[0;35m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # No Color

# Logging configuration
readonly LOG_DIR="/opt/migration/logs"
# Take the timestamp once so the log and error-log names of a run always
# match, even if the clock ticks over between the two assignments.
log_run_stamp="$(date +%Y%m%d_%H%M%S)"
readonly LOG_FILE="$LOG_DIR/migration_${log_run_stamp}.log"
readonly ERROR_LOG="$LOG_DIR/errors_${log_run_stamp}.log"
unset log_run_stamp

# Ensure log directory exists
mkdir -p "$LOG_DIR"
chmod 755 "$LOG_DIR"

# Initialize logging
# fds 3 and 4 keep copies of the original stdout/stderr; cleanup_on_exit
# restores from them.
exec 3>&1 4>&2
# From here on stdout is appended to LOG_FILE and stderr to ERROR_LOG,
# while still being passed through by tee.
exec 1> >(tee -a "$LOG_FILE")
exec 2> >(tee -a "$ERROR_LOG" >&2)

# Global variables
declare -g SCRIPT_NAME="${0##*/}"
declare -g SCRIPT_PID=$$
# Declared and assigned separately so a failing `date` is not masked.
declare -g START_TIME
START_TIME=$(date +%s)
declare -g CLEANUP_FUNCTIONS=()
declare -g ROLLBACK_FUNCTIONS=()
declare -g ERROR_COUNT=0
declare -g WARNING_COUNT=0
declare -g STEP_COUNT=0
declare -g CURRENT_STEP=""
|
|
|
|
# Function to print formatted messages
|
|
#######################################
# Print a timestamped, colorized log line and update the run counters.
# Globals:
#   LOG_FILE, ERROR_LOG (read)  files the line is appended to
#   RED, GREEN, YELLOW, BLUE, PURPLE, NC (read)  color escape sequences
#   DEBUG (read)  DEBUG lines are emitted only when it equals "true"
#   WARNING_COUNT, ERROR_COUNT, STEP_COUNT, CURRENT_STEP (written)
# Arguments:
#   $1 - level: INFO, WARN, ERROR, DEBUG, STEP, SUCCESS or CRITICAL
#   $2 - message text
# Outputs:
#   INFO/DEBUG/STEP/SUCCESS -> stdout and LOG_FILE
#   WARN                    -> stderr and LOG_FILE
#   ERROR/CRITICAL          -> stderr and ERROR_LOG
#   Any other level prints nothing.
# NOTE(review): the library's top-level `exec` already pipes stdout and
# stderr through `tee -a` into the same files, so lines may be recorded
# twice — confirm whether that is intended.
#######################################
print_message() {
  local level=$1
  local message=$2
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  case "$level" in
    "INFO")
      echo -e "${GREEN}[INFO]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      ;;
    "WARN")
      echo -e "${YELLOW}[WARN]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE" >&2
      # Plain assignment instead of ((var++)): the latter returns status 1
      # when the old value is 0, which aborts the script under `set -e`.
      WARNING_COUNT=$((WARNING_COUNT + 1))
      ;;
    "ERROR")
      echo -e "${RED}[ERROR]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
      ERROR_COUNT=$((ERROR_COUNT + 1))
      ;;
    "DEBUG")
      if [[ "${DEBUG:-false}" == "true" ]]; then
        echo -e "${PURPLE}[DEBUG]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      fi
      ;;
    "STEP")
      # Increment before the pipeline: every pipeline stage runs in a
      # subshell, so an increment inside the echo would be lost.
      STEP_COUNT=$((STEP_COUNT + 1))
      echo -e "${BLUE}[STEP ${STEP_COUNT}]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      CURRENT_STEP="$message"
      ;;
    "SUCCESS")
      echo -e "${GREEN}[SUCCESS]${NC} ${timestamp} - ${message}" | tee -a "$LOG_FILE"
      ;;
    "CRITICAL")
      echo -e "${RED}[CRITICAL]${NC} ${timestamp} - ${message}" | tee -a "$ERROR_LOG" >&2
      ERROR_COUNT=$((ERROR_COUNT + 1))
      ;;
  esac
}
|
|
|
|
# Convenience functions
|
|
# Thin per-level wrappers around print_message; each forwards its first
# argument as the message text.
log_info() {
  print_message "INFO" "$1"
}

log_warn() {
  print_message "WARN" "$1"
}

log_error() {
  print_message "ERROR" "$1"
}

log_debug() {
  print_message "DEBUG" "$1"
}

log_step() {
  print_message "STEP" "$1"
}

log_success() {
  print_message "SUCCESS" "$1"
}

log_critical() {
  print_message "CRITICAL" "$1"
}
|
|
|
|
# Enhanced error handler with context and recovery
|
|
#######################################
# ERR-trap handler: log the failure context, capture diagnostics, roll
# back, run cleanups, write a report, then exit with the failing status.
# Globals:
#   SCRIPT_NAME, CURRENT_STEP (read)
# Arguments:
#   $1 - line number of the failing command (optional)
#   $2 - accepted for compatibility with the trap definition; unused
#   $3 - text of the failing command (optional, default "unknown")
# Returns:
#   Never returns; exits with the status that was in $? on entry.
#######################################
error_handler() {
  # Must be the first statement so $? is still the failing command's status.
  local exit_code=$?

  # Recovery has to run to completion: with errexit still active, a single
  # failing diagnostic or log call would abort before rollback/cleanup ran.
  trap - ERR
  set +e

  local line_number=${1:-unknown}
  local last_command="${3:-unknown}"
  local funcstack=("${FUNCNAME[@]:1}")

  # Join with spaces explicitly; "${funcstack[*]}" would join with the first
  # character of IFS, which this library sets to a newline.
  local stack_trace
  printf -v stack_trace '%s ' "${funcstack[@]}"
  stack_trace=${stack_trace% }

  log_critical "Script failed in $SCRIPT_NAME"
  log_critical "Exit code: $exit_code"
  log_critical "Line number: $line_number"
  log_critical "Command: $last_command"
  log_critical "Current step: $CURRENT_STEP"
  log_critical "Function stack: $stack_trace"

  # Capture system state for debugging
  capture_system_state_on_error

  # Execute rollback functions in reverse order
  execute_rollback_functions

  # Show recovery options
  show_recovery_options

  # Execute cleanup functions
  execute_cleanup_functions

  # Generate error report; the status is passed along so the report can
  # record it.
  generate_error_report "$exit_code"

  exit "$exit_code"
}
|
|
|
|
# Capture system state when error occurs
|
|
#######################################
# Snapshot process, network, Docker, disk, memory and syslog state into a
# timestamped directory under LOG_DIR. Every capture is best-effort: a
# missing tool or unreadable file is ignored.
# Globals:
#   LOG_DIR (read)
# Returns:
#   0 even when the snapshot directory cannot be created, so the caller's
#   remaining recovery steps are not blocked.
#######################################
capture_system_state_on_error() {
  local error_state_dir
  error_state_dir="$LOG_DIR/error_state_$(date +%Y%m%d_%H%M%S)"

  # Without the directory nothing can be captured; report and carry on.
  if ! mkdir -p "$error_state_dir"; then
    log_error "Unable to create error state directory: $error_state_dir" || true
    return 0
  fi

  log_info "Capturing system state for debugging..."

  # Capture process information
  ps aux > "$error_state_dir/processes.txt" 2>/dev/null || true

  # Capture network state
  ss -tulpn > "$error_state_dir/network.txt" 2>/dev/null || true

  # Capture Docker state if available
  if command -v docker >/dev/null 2>&1; then
    docker ps -a > "$error_state_dir/docker_containers.txt" 2>/dev/null || true
    docker images > "$error_state_dir/docker_images.txt" 2>/dev/null || true
    docker system df > "$error_state_dir/docker_disk.txt" 2>/dev/null || true
    docker system events --since 1h --until now > "$error_state_dir/docker_events.txt" 2>/dev/null || true
  fi

  # Capture disk space
  df -h > "$error_state_dir/disk_space.txt" 2>/dev/null || true

  # Capture memory usage
  free -h > "$error_state_dir/memory.txt" 2>/dev/null || true

  # Capture recent logs
  tail -n 100 /var/log/syslog > "$error_state_dir/system_logs.txt" 2>/dev/null || true

  log_info "System state captured in: $error_state_dir"
}
|
|
|
|
# Execute rollback functions in reverse order
|
|
#######################################
# Run every registered rollback function, most recently registered first.
# A failing or undefined rollback is logged and the remaining ones still run.
# Globals:
#   ROLLBACK_FUNCTIONS (read)
#######################################
execute_rollback_functions() {
  local total=${#ROLLBACK_FUNCTIONS[@]}
  (( total > 0 )) || return 0

  log_info "Executing rollback functions..."

  # Both loop variables are local so a caller's own `i` is not clobbered.
  local i rollback_func
  for (( i = total - 1; i >= 0; i-- )); do
    rollback_func="${ROLLBACK_FUNCTIONS[i]}"
    log_info "Executing rollback: $rollback_func"

    if declare -F "$rollback_func" >/dev/null; then
      "$rollback_func" || log_error "Rollback function $rollback_func failed"
    else
      log_error "Rollback function $rollback_func not found"
    fi
  done
}
|
|
|
|
# Show recovery options to user
|
|
#######################################
# Print a boxed list of manual recovery hints to stdout.
# Globals:
#   CYAN, NC (read)  box color / color reset
#   LOG_FILE, ERROR_LOG, LOG_DIR (read)  paths shown in the hints
#######################################
show_recovery_options() {
  local edge="${CYAN}║${NC}"
  local -a hints=(
    " 1. Check logs: tail -f $LOG_FILE"
    " 2. Review errors: tail -f $ERROR_LOG"
    " 3. System state: ls -la $LOG_DIR/error_state_*"
    " 4. Resume from checkpoint (if available)"
    " 5. Run cleanup manually: execute_cleanup_functions"
  )
  local hint

  echo ""
  echo -e "${CYAN}╔══════════════════════════════════════════════════════════════╗${NC}"
  echo -e "${CYAN}║ RECOVERY OPTIONS ║${NC}"
  echo -e "${CYAN}╠══════════════════════════════════════════════════════════════╣${NC}"
  for hint in "${hints[@]}"; do
    echo -e "${edge}${hint}${edge}"
  done
  echo -e "${CYAN}╚══════════════════════════════════════════════════════════════╝${NC}"
  echo ""
}
|
|
|
|
# Execute cleanup functions
|
|
#######################################
# Run every registered cleanup function in registration order.
# The registry is emptied before the functions run, so a second call (for
# example from the EXIT trap after error_handler already ran cleanups) does
# not run the same cleanups again.
# Globals:
#   CLEANUP_FUNCTIONS (read, then reset to empty)
#######################################
execute_cleanup_functions() {
  (( ${#CLEANUP_FUNCTIONS[@]} > 0 )) || return 0

  # Take a snapshot and clear the registry first; this is what makes the
  # function safe to call more than once.
  local -a pending=("${CLEANUP_FUNCTIONS[@]}")
  CLEANUP_FUNCTIONS=()

  log_info "Executing cleanup functions..."

  local cleanup_func
  for cleanup_func in "${pending[@]}"; do
    log_info "Executing cleanup: $cleanup_func"

    if declare -F "$cleanup_func" >/dev/null; then
      "$cleanup_func" || log_error "Cleanup function $cleanup_func failed"
    else
      log_error "Cleanup function $cleanup_func not found"
    fi
  done
}
|
|
|
|
# Generate comprehensive error report
|
|
#######################################
# Write a Markdown error report into LOG_DIR.
# Globals:
#   LOG_DIR, LOG_FILE, ERROR_LOG, SCRIPT_NAME, SCRIPT_PID, START_TIME,
#   ERROR_COUNT, WARNING_COUNT, STEP_COUNT, CURRENT_STEP (read)
# Arguments:
#   $1 - exit code to record (optional). When omitted, an `exit_code`
#        variable visible from the caller's scope is used (error_handler
#        keeps the failing status in a local of that name); failing that,
#        the value of $? on entry.
# Outputs:
#   Creates $LOG_DIR/error_report_<timestamp>.md and logs its path.
#######################################
generate_error_report() {
  # Capture $? first: any later command would overwrite it. The original
  # expanded $? inside the here-document, where it was always 0.
  local last_status=$?
  local report_exit_code=${1:-${exit_code:-$last_status}}

  local report_file
  report_file="$LOG_DIR/error_report_$(date +%Y%m%d_%H%M%S).md"
  local duration=$(($(date +%s) - START_TIME))

  cat > "$report_file" << EOF
# Migration Script Error Report

**Script:** $SCRIPT_NAME
**PID:** $SCRIPT_PID
**Date:** $(date)
**Duration:** ${duration}s
**Exit Code:** $report_exit_code

## Summary
- **Errors:** $ERROR_COUNT
- **Warnings:** $WARNING_COUNT
- **Steps Completed:** $STEP_COUNT
- **Failed Step:** $CURRENT_STEP

## Error Details
\`\`\`
$(tail -n 20 "$ERROR_LOG" 2>/dev/null || true)
\`\`\`

## System State
- **Log File:** $LOG_FILE
- **Error Log:** $ERROR_LOG
- **System State:** $LOG_DIR/error_state_*

## Recovery Actions
1. Review error logs for specific failure cause
2. Check system state capture for debugging
3. Run cleanup functions if needed
4. Consider manual rollback if automatic rollback failed

## Next Steps
- [ ] Identify root cause
- [ ] Apply fix
- [ ] Test fix in staging environment
- [ ] Re-run migration with fix applied
EOF

  log_info "Error report generated: $report_file"
}
|
|
|
|
# Register cleanup function
|
|
#######################################
# Append a function name to the cleanup registry.
# Globals:
#   CLEANUP_FUNCTIONS (appended to)
# Arguments:
#   $1 - name of the function to run during cleanup
#######################################
register_cleanup() {
  local -r handler="$1"

  CLEANUP_FUNCTIONS+=("${handler}")
  log_debug "Registered cleanup function: ${handler}"
}
|
|
|
|
# Register rollback function
|
|
#######################################
# Append a function name to the rollback registry.
# Globals:
#   ROLLBACK_FUNCTIONS (appended to)
# Arguments:
#   $1 - name of the function to run during rollback
#######################################
register_rollback() {
  local -r handler="$1"

  ROLLBACK_FUNCTIONS+=("${handler}")
  log_debug "Registered rollback function: ${handler}"
}
|
|
|
|
# Function to validate prerequisites
|
|
#######################################
# Verify that every named command is available.
# Arguments:
#   $@ - command names to look up with `command -v`
# Returns:
#   0 when all commands are found. When any is missing the function logs
#   the list and terminates the shell with `exit 1`.
#######################################
validate_prerequisites() {
  local required_commands=("$@")
  local missing_commands=()
  local cmd # local so the caller's own `cmd` is left untouched

  log_step "Validating prerequisites..."

  for cmd in "${required_commands[@]}"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      missing_commands+=("$cmd")
      log_error "Required command not found: $cmd"
    else
      log_debug "Found required command: $cmd"
    fi
  done

  if [[ ${#missing_commands[@]} -gt 0 ]]; then
    # Join with spaces explicitly; "${missing_commands[*]}" would join with
    # the first character of IFS, which this library sets to a newline.
    local missing_list
    printf -v missing_list '%s ' "${missing_commands[@]}"
    missing_list=${missing_list% }

    log_critical "Missing required commands: $missing_list"
    log_info "Install missing commands and retry"
    exit 1
  fi

  log_success "All prerequisites validated"
}
|
|
|
|
# Function to check disk space
|
|
#######################################
# Check that a mount point has at least the required free space.
# Arguments:
#   $1 - required free space in GB (default 1)
#   $2 - mount point or path to check (default "/")
# Returns:
#   0 when enough space is available; 1 when there is not enough or the
#   available space cannot be determined.
#######################################
check_disk_space() {
  local required_space_gb=${1:-1}
  local mount_point=${2:-"/"}
  local available_gb

  log_step "Checking disk space for $mount_point..."

  # -P forces one output line per filesystem, so the "Available" column is
  # always field 4 of line 2 even when the device name is long.
  available_gb=$(df -P -BG "$mount_point" 2>/dev/null \
    | awk 'NR==2 {sub(/G$/, "", $4); print $4}') || available_gb=""

  # An empty or non-numeric value means df failed or printed something
  # unexpected; report that instead of a misleading "insufficient space".
  if [[ ! "$available_gb" =~ ^[0-9]+$ ]]; then
    log_error "Unable to determine available disk space for $mount_point"
    return 1
  fi

  if [[ $available_gb -lt $required_space_gb ]]; then
    log_critical "Insufficient disk space. Required: ${required_space_gb}GB, Available: ${available_gb}GB"
    return 1
  else
    log_success "Sufficient disk space available: ${available_gb}GB"
    return 0
  fi
}
|
|
|
|
# Function to validate network connectivity
|
|
#######################################
# Check that each host answers a ping and, unless it is localhost or
# 127.0.0.1, accepts a non-interactive SSH connection.
# Arguments:
#   $@ - host names or addresses to test
# Returns:
#   0 when every host passes; 1 at the first host that fails either check.
#######################################
validate_network_connectivity() {
  local hosts=("$@")
  local host # local so the caller's own `host` is left untouched

  log_step "Validating network connectivity..."

  for host in "${hosts[@]}"; do
    log_info "Testing connectivity to $host..."

    if ping -c 1 -W 5 "$host" >/dev/null 2>&1; then
      log_success "Successfully connected to $host"
    else
      log_error "Cannot reach $host"
      return 1
    fi

    # Test SSH connectivity if not localhost
    if [[ "$host" != "localhost" ]] && [[ "$host" != "127.0.0.1" ]]; then
      # BatchMode=yes makes ssh fail instead of prompting for a password.
      if ssh -o ConnectTimeout=10 -o BatchMode=yes "$host" "echo 'SSH OK'" >/dev/null 2>&1; then
        log_success "SSH connectivity to $host verified"
      else
        log_error "SSH connectivity to $host failed"
        return 1
      fi
    fi
  done

  log_success "Network connectivity validated"
}
|
|
|
|
# Function to create checkpoint
|
|
#######################################
# Write the current progress counters to a timestamped checkpoint file.
# Globals:
#   LOG_DIR, SCRIPT_NAME, CURRENT_STEP, STEP_COUNT, ERROR_COUNT,
#   WARNING_COUNT (read)
# Arguments:
#   $1 - checkpoint name (used in the file name and stored in the file)
# Outputs:
#   Logs the file path, then prints it on stdout as the last line.
#######################################
create_checkpoint() {
  local checkpoint_name=$1
  local checkpoint_dir="$LOG_DIR/checkpoints"
  local checkpoint_file
  checkpoint_file="$checkpoint_dir/${checkpoint_name}_$(date +%Y%m%d_%H%M%S).checkpoint"

  mkdir -p "$checkpoint_dir"

  # The file is later loaded with `source`, so every value is written
  # shell-quoted (%q). Unquoted values containing spaces (step names, the
  # date) would be parsed as commands, and embedded $(...) would execute.
  {
    printf 'CHECKPOINT_NAME=%q\n' "$checkpoint_name"
    printf 'CHECKPOINT_TIME=%q\n' "$(date)"
    printf 'SCRIPT_NAME=%q\n' "$SCRIPT_NAME"
    printf 'CURRENT_STEP=%q\n' "$CURRENT_STEP"
    printf 'STEP_COUNT=%q\n' "$STEP_COUNT"
    printf 'ERROR_COUNT=%q\n' "$ERROR_COUNT"
    printf 'WARNING_COUNT=%q\n' "$WARNING_COUNT"
  } > "$checkpoint_file"

  log_info "Checkpoint created: $checkpoint_file"
  echo "$checkpoint_file"
}
|
|
|
|
# Function to restore from checkpoint
|
|
#######################################
# Load variables from a checkpoint file into the current shell.
# Arguments:
#   $1 - path to the checkpoint file
# Returns:
#   0 when the file was loaded; 1 when it is missing or cannot be loaded.
# NOTE: the file is executed with `source`, i.e. as shell code. Only pass
# checkpoint files from a trusted location.
#######################################
restore_from_checkpoint() {
  local checkpoint_file=${1:-}

  if [[ ! -f "$checkpoint_file" ]]; then
    log_error "Checkpoint file not found: $checkpoint_file"
    return 1
  fi

  # shellcheck source=/dev/null
  if ! source "$checkpoint_file"; then
    log_error "Failed to load checkpoint file: $checkpoint_file"
    return 1
  fi

  # Defaults keep `set -u` from aborting on a file that lacks these keys.
  log_info "Restored from checkpoint: ${CHECKPOINT_NAME:-unknown} at ${CHECKPOINT_TIME:-unknown}"
  return 0
}
|
|
|
|
# Function to wait for service readiness
|
|
#######################################
# Poll a health-check command until it succeeds or the time budget is spent.
# Arguments:
#   $1 - service name (used in log messages only)
#   $2 - health-check command string; run through `eval`, so it must come
#        from a trusted source
#   $3 - maximum time to wait in seconds (default 300)
#   $4 - pause between checks in seconds (default 10)
# Returns:
#   0 as soon as the check succeeds; 1 when the budget runs out.
#######################################
wait_for_service() {
  local service_name="$1"
  local health_check_command="$2"
  local max_wait="${3:-300}"
  local interval="${4:-10}"
  local elapsed

  log_step "Waiting for service $service_name to be ready..."

  # `elapsed` counts the configured pauses, not wall-clock time.
  for (( elapsed = 0; elapsed < max_wait; elapsed += interval )); do
    if eval "$health_check_command" >/dev/null 2>&1; then
      log_success "Service $service_name is ready (${elapsed}s)"
      return 0
    fi

    log_info "Service $service_name not ready yet, waiting ${interval}s... (${elapsed}/${max_wait}s)"
    sleep "$interval"
  done

  log_error "Service $service_name failed to become ready within ${max_wait}s"
  return 1
}
|
|
|
|
# Function to execute with retry
|
|
#######################################
# Run a command, retrying after a fixed delay until it succeeds or the
# attempt limit is reached.
# Arguments:
#   $1 - maximum number of attempts
#   $2 - delay between attempts, in seconds
#   $3... - the command and its arguments
# Returns:
#   0 on the first successful attempt; 1 when every attempt failed or no
#   command was given.
#######################################
execute_with_retry() {
  local max_attempts=$1
  local delay=$2
  shift 2
  local command=("$@")

  # An empty command would expand to nothing and count as a success.
  if [[ ${#command[@]} -eq 0 ]]; then
    log_error "No command given to execute_with_retry"
    return 1
  fi

  # Render the command once for logging. %q keeps arguments unambiguous, and
  # joining here avoids "${command[*]}", which would use the first character
  # of IFS (a newline in this library) as the separator.
  local rendered
  printf -v rendered '%q ' "${command[@]}"
  rendered=${rendered% }

  local attempt=1
  local exit_code
  while [[ $attempt -le $max_attempts ]]; do
    log_info "Executing (attempt $attempt/$max_attempts): $rendered"

    if "${command[@]}"; then
      log_success "Command succeeded on attempt $attempt"
      return 0
    else
      # In the else branch $? is still the status of the tested command.
      exit_code=$?
      log_warn "Command failed on attempt $attempt with exit code $exit_code"

      if [[ $attempt -lt $max_attempts ]]; then
        log_info "Retrying in ${delay}s..."
        sleep "$delay"
      fi
    fi

    attempt=$((attempt + 1))
  done

  log_error "Command failed after $max_attempts attempts"
  return 1
}
|
|
|
|
# Function to monitor resource usage
|
|
#######################################
# Sample CPU, memory and root-filesystem usage at a fixed interval and log
# a warning whenever a value is above 90.
# Arguments:
#   $1 - how long to monitor, in seconds (default 60)
#   $2 - pause between samples, in seconds (default 5)
# Notes:
#   Sampling is best-effort: a metric that cannot be read is treated as
#   "no warning" rather than as an error.
#######################################
monitor_resources() {
  local duration=${1:-60} # Monitor for 60 seconds by default
  local interval=${2:-5}  # Check every 5 seconds
  local cpu_usage mem_usage disk_usage

  log_info "Monitoring system resources for ${duration}s..."

  local end_time=$(($(date +%s) + duration))
  while [[ $(date +%s) -lt $end_time ]]; do
    # Field 2 of top's "Cpu(s)" line; a "%us," suffix is stripped if present.
    cpu_usage=$(top -bn1 2>/dev/null \
      | awk '/Cpu\(s\)/ {sub(/%us,/, "", $2); print $2}') || cpu_usage=""
    # used / total from the "Mem" line of free, as a percentage.
    mem_usage=$(free 2>/dev/null \
      | awk '/Mem/ {printf "%.1f", $3/$2 * 100.0}') || mem_usage=""
    # Fifth column of the last df line for "/", without the percent sign.
    disk_usage=$(df / 2>/dev/null \
      | awk '{v = $5} END {sub(/%/, "", v); print v}') || disk_usage=""

    log_debug "Resource usage - CPU: ${cpu_usage}%, Memory: ${mem_usage}%, Disk: ${disk_usage}%"

    # Alert on high resource usage. awk does the floating-point comparison
    # (no bc needed); an empty value evaluates to 0 and raises no alert.
    if awk -v v="$cpu_usage" 'BEGIN {exit !(v + 0 > 90)}'; then
      log_warn "High CPU usage detected: ${cpu_usage}%"
    fi

    if awk -v v="$mem_usage" 'BEGIN {exit !(v + 0 > 90)}'; then
      log_warn "High memory usage detected: ${mem_usage}%"
    fi

    disk_usage=${disk_usage%.*}
    if [[ "$disk_usage" =~ ^[0-9]+$ ]] && (( disk_usage > 90 )); then
      log_warn "High disk usage detected: ${disk_usage}%"
    fi

    sleep "$interval"
  done
}
|
|
|
|
# Set up signal handlers
|
|
#######################################
# EXIT-trap handler: log a run summary, run cleanup functions, restore the
# original stdout/stderr and exit with the script's original status.
# Globals:
#   START_TIME, ERROR_COUNT, WARNING_COUNT (read)
# Returns:
#   Never returns; exits with the status that was in $? on entry.
#######################################
cleanup_on_exit() {
  # Must be the first statement so $? is still the script's exit status.
  local exit_code=$?

  # With errexit active, a failing log or cleanup call would end the shell
  # here and replace the real exit status; turn it off for the teardown.
  set +e

  local duration=$(($(date +%s) - START_TIME))

  log_info "Script execution completed"
  log_info "Duration: ${duration}s"
  log_info "Errors: $ERROR_COUNT, Warnings: $WARNING_COUNT"

  execute_cleanup_functions

  # Restore stdout/stderr from the saved copies on fds 3 and 4, but only
  # when both are actually open.
  if { true >&3; } 2>/dev/null && { true >&4; } 2>/dev/null; then
    exec 1>&3 2>&4
    exec 3>&- 4>&-
  fi

  exit "$exit_code"
}
|
|
|
|
# Trap signals and errors
|
|
# Trap signals and errors
# NOTE(review): ERR traps are not inherited by shell functions unless
# errtrace (`set -E`) is enabled, so failures inside function bodies may
# bypass error_handler — confirm whether errtrace should be turned on.
trap 'error_handler ${LINENO} ${BASH_LINENO} "$BASH_COMMAND"' ERR
trap 'cleanup_on_exit' EXIT
# 130 and 143 are 128 + the signal numbers of SIGINT (2) and SIGTERM (15).
trap 'log_warn "Received SIGINT, initiating graceful shutdown..."; exit 130' INT
trap 'log_warn "Received SIGTERM, initiating graceful shutdown..."; exit 143' TERM

# Initialize logging
log_info "Started script: $SCRIPT_NAME (PID: $SCRIPT_PID)"
log_info "Log file: $LOG_FILE"
log_info "Error log: $ERROR_LOG"

# Export functions for use in other scripts
# print_message is exported too: every log_* wrapper calls it, so without it
# the exported wrappers fail with "command not found" in a child shell.
export -f print_message
export -f log_info log_warn log_error log_debug log_step log_success log_critical
export -f register_cleanup register_rollback validate_prerequisites
export -f check_disk_space validate_network_connectivity
export -f create_checkpoint restore_from_checkpoint
export -f wait_for_service execute_with_retry monitor_resources
# The exported logging functions append to these paths, so child shells
# need the values as well.
export LOG_DIR LOG_FILE ERROR_LOG