#!/bin/bash
# Comprehensive Monitoring Setup
# Deploys real-time monitoring, alerting, and performance tracking for migration health

# Import error handling library (provides log_info/log_step/log_success/log_error,
# register_cleanup, register_rollback, validate_prerequisites, wait_for_service)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/lib/error_handling.sh"

# Configuration
readonly MANAGER_HOST="omv800"
readonly MONITORING_HOST="audrey"
readonly MONITORING_CONFIG_DIR="/opt/migration/configs/monitoring"
# shellcheck disable=SC2034 — not referenced below; kept as operator documentation
readonly MONITORING_DATA_DIR="/opt/monitoring/data"
readonly ALERTING_CONFIG_DIR="/opt/migration/configs/alerting"

# Service endpoints for monitoring
# shellcheck disable=SC2034 — reference data, not consumed by this script yet
declare -A MONITORING_ENDPOINTS=(
    ["traefik"]="http://omv800:8080/ping"
    ["immich"]="http://omv800:3001/api/server-info/ping"
    ["jellyfin"]="http://omv800:8096/health"
    ["homeassistant"]="http://jonathan-2518f5u:8123/"
    ["appflowy"]="http://surface:8000/health"
    ["grafana"]="http://audrey:3000/api/health"
    ["prometheus"]="http://audrey:9090/-/healthy"
)

# Alert thresholds
# shellcheck disable=SC2034 — mirrors the values hard-coded in alert_rules.yml below
declare -A ALERT_THRESHOLDS=(
    ["cpu_usage"]="85"
    ["memory_usage"]="90"
    ["disk_usage"]="85"
    ["response_time_ms"]="5000"
    ["error_rate_percent"]="5"
    ["service_downtime_seconds"]="60"
)

# Cleanup function — removes temporary files left behind by this script.
cleanup_monitoring() {
    log_info "Cleaning up monitoring setup..."

    # Clean up temporary files
    rm -f /tmp/monitoring_*.tmp 2>/dev/null || true
    rm -f /tmp/prometheus_*.tmp 2>/dev/null || true
    rm -f /tmp/grafana_*.tmp 2>/dev/null || true

    log_info "Monitoring cleanup completed"
}

# Rollback function — tears down any stacks deployed by this script, then cleans up.
rollback_monitoring() {
    log_info "Rolling back monitoring setup..."

    # Stop monitoring services
    ssh "$MANAGER_HOST" "docker stack rm monitoring 2>/dev/null || true"
    ssh "$MANAGER_HOST" "docker stack rm alerting 2>/dev/null || true"

    cleanup_monitoring

    log_info "Monitoring rollback completed"
}

# Function to create Prometheus configuration (main config, alert rules,
# recording rules) under $MONITORING_CONFIG_DIR/prometheus.
create_prometheus_config() {
    log_step "Creating Prometheus configuration..."

    mkdir -p "$MONITORING_CONFIG_DIR/prometheus"

    # Create main Prometheus configuration
    cat > "$MONITORING_CONFIG_DIR/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'homelab'
    environment: 'production'

rule_files:
  - "alert_rules.yml"
  - "recording_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporters on all hosts
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'omv800:9100'
          - 'fedora:9100'
          - 'surface:9100'
          - 'jonathan-2518f5u:9100'
          - 'audrey:9100'
          - 'raspberrypi:9100'
    scrape_interval: 10s
    metrics_path: /metrics

  # cAdvisor for container metrics
  # NOTE(review): omv800:8080 is also the Traefik dashboard/metrics port (see
  # 'traefik' job below) — confirm cAdvisor is really published on 8080 on omv800,
  # otherwise these two jobs scrape the same endpoint.
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'omv800:8080'
          - 'fedora:8080'
          - 'surface:8080'
          - 'jonathan-2518f5u:8080'
          - 'audrey:8080'
    scrape_interval: 10s
    metrics_path: /metrics

  # Docker daemon metrics
  - job_name: 'docker-daemon'
    static_configs:
      - targets:
          - 'omv800:9323'
          - 'fedora:9323'
          - 'surface:9323'
          - 'jonathan-2518f5u:9323'
          - 'audrey:9323'
    scrape_interval: 30s

  # Traefik metrics
  - job_name: 'traefik'
    static_configs:
      - targets: ['omv800:8080']
    metrics_path: /metrics
    scrape_interval: 5s

  # Application health checks
  - job_name: 'service-health'
    static_configs:
      - targets:
          - 'omv800:3001'            # Immich
          - 'omv800:8096'            # Jellyfin
          - 'jonathan-2518f5u:8123'  # Home Assistant
          - 'surface:8000'           # AppFlowy
    scrape_interval: 30s
    metrics_path: /metrics

  # PostgreSQL metrics (if pg_exporter is available)
  - job_name: 'postgres'
    static_configs:
      - targets: ['omv800:9187']
    scrape_interval: 30s

  # Redis metrics (if redis_exporter is available)
  - job_name: 'redis'
    static_configs:
      - targets: ['omv800:9121']
    scrape_interval: 30s

  # Migration-specific monitoring
  - job_name: 'migration-health'
    static_configs:
      - targets: ['localhost:9999']  # Custom migration health endpoint
    scrape_interval: 10s
EOF

    # Create alert rules
    cat > "$MONITORING_CONFIG_DIR/prometheus/alert_rules.yml" << 'EOF'
groups:
  - name: infrastructure_alerts
    rules:
      # Node health alerts
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 1m
        labels:
          severity: critical
          component: infrastructure
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 85% on {{ $labels.instance }} for more than 5 minutes."

      - alert: HighMemoryUsage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90% on {{ $labels.instance }} for more than 5 minutes."

      - alert: HighDiskUsage
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 15
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High disk usage on {{ $labels.instance }}"
          description: "Disk usage is above 85% on {{ $labels.instance }} for more than 10 minutes."

  - name: docker_alerts
    rules:
      # Docker Swarm alerts
      - alert: DockerNodeDown
        expr: docker_swarm_node_status != 1
        for: 2m
        labels:
          severity: critical
          component: docker
        annotations:
          summary: "Docker Swarm node {{ $labels.node_name }} is not ready"
          description: "Docker Swarm node {{ $labels.node_name }} has been not ready for more than 2 minutes."

      - alert: ServiceReplicasMismatch
        expr: docker_service_replicas != docker_service_replicas_desired
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "Service {{ $labels.service_name }} has replica mismatch"
          description: "Service {{ $labels.service_name }} has {{ $labels.docker_service_replicas }} replicas running, but {{ $labels.docker_service_replicas_desired }} desired."

      - alert: HighContainerRestarts
        expr: rate(docker_container_restart_count[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "High container restart rate for {{ $labels.name }}"
          description: "Container {{ $labels.name }} is restarting frequently."

  - name: application_alerts
    rules:
      # Application health alerts
      - alert: ServiceDown
        expr: up{job="service-health"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "Service {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 5
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High response time for {{ $labels.instance }}"
          description: "95th percentile response time is above 5 seconds for {{ $labels.instance }}."

      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High error rate for {{ $labels.instance }}"
          description: "Error rate is above 5% for {{ $labels.instance }}."

  - name: migration_alerts
    rules:
      # Migration-specific alerts
      - alert: MigrationStalled
        expr: migration_progress_percent == migration_progress_percent offset 10m
        for: 10m
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration appears to be stalled"
          description: "Migration progress has not changed in the last 10 minutes."

      - alert: MigrationError
        expr: migration_errors_total > 0
        for: 0s
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration errors detected"
          description: "{{ $value }} migration errors have been detected."
EOF

    # Create recording rules for performance metrics
    cat > "$MONITORING_CONFIG_DIR/prometheus/recording_rules.yml" << 'EOF'
groups:
  - name: performance_recording
    interval: 30s
    rules:
      # Node performance metrics
      - record: node:cpu_utilization_percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100)

      - record: node:memory_utilization_percent
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

      - record: node:disk_utilization_percent
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100

      # Container performance metrics
      - record: container:cpu_utilization_percent
        expr: rate(container_cpu_usage_seconds_total[5m]) * 100

      - record: container:memory_utilization_percent
        expr: (container_memory_usage_bytes / container_memory_limit_bytes) * 100

      # Application performance metrics
      - record: app:request_rate
        expr: rate(http_requests_total[5m])

      - record: app:error_rate_percent
        expr: (rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m])) * 100

      - record: app:response_time_95th_percentile
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
EOF

    log_success "Prometheus configuration created"
}

# Function to create Grafana provisioning (datasource + dashboard providers)
# and the two JSON dashboards.
create_grafana_dashboards() {
    log_step "Creating Grafana dashboards..."

    mkdir -p "$MONITORING_CONFIG_DIR/grafana/dashboards"
    mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards"
    mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources"

    # Create datasource provisioning
    cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources/prometheus.yml" << 'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
EOF

    # Create dashboard provisioning
    cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards/dashboards.yml" << 'EOF'
apiVersion: 1

providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
EOF

    # Create Infrastructure Overview dashboard
    cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/infrastructure-overview.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Infrastructure Overview",
    "description": "Home Lab Infrastructure Monitoring",
    "tags": ["infrastructure", "overview"],
    "timezone": "browser",
    "panels": [
      {
        "title": "System Load",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:cpu_utilization_percent)",
            "legendFormat": "Average CPU Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 85}
              ]
            }
          }
        }
      },
      {
        "title": "Memory Usage",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:memory_utilization_percent)",
            "legendFormat": "Average Memory Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 75},
                {"color": "red", "value": 90}
              ]
            }
          }
        }
      },
      {
        "title": "Service Health",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(up{job=\"service-health\"})",
            "legendFormat": "Services Up"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 3},
                {"color": "green", "value": 4}
              ]
            }
          }
        }
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "10s"
  }
}
EOF

    # Create Migration Health dashboard
    cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/migration-health.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Migration Health",
    "description": "Real-time migration monitoring and health checks",
    "tags": ["migration", "health"],
    "panels": [
      {
        "title": "Migration Progress",
        "type": "gauge",
        "targets": [
          {
            "expr": "migration_progress_percent",
            "legendFormat": "Progress %"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 25},
                {"color": "green", "value": 75}
              ]
            }
          }
        }
      },
      {
        "title": "Migration Errors",
        "type": "stat",
        "targets": [
          {
            "expr": "migration_errors_total",
            "legendFormat": "Total Errors"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "title": "Data Transfer Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(migration_data_transferred_bytes[5m])",
            "legendFormat": "Transfer Rate"
          }
        ],
        "yAxes": [
          {
            "unit": "bytes"
          }
        ]
      }
    ],
    "time": {
      "from": "now-2h",
      "to": "now"
    },
    "refresh": "5s"
  }
}
EOF

    log_success "Grafana dashboards created"
}

# Function to create Alertmanager configuration (routing tree, receivers,
# inhibition rules) under $ALERTING_CONFIG_DIR/alertmanager.
create_alertmanager_config() {
    log_step "Creating Alertmanager configuration..."

    mkdir -p "$ALERTING_CONFIG_DIR/alertmanager"

    # Create Alertmanager configuration
    cat > "$ALERTING_CONFIG_DIR/alertmanager/alertmanager.yml" << 'EOF'
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@homelab.local'
  smtp_auth_username: ''
  smtp_auth_password: ''

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 5s
      repeat_interval: 15m
    - match:
        component: migration
      receiver: 'migration-alerts'
      group_wait: 0s
      repeat_interval: 5m

receivers:
  # NOTE(review): these webhook URLs point at port 9093 (Alertmanager's own
  # port) on localhost — confirm a webhook consumer actually listens there.
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:9093/webhook'

  - name: 'critical-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/critical'
        send_resolved: true
    # Uncomment and configure email if SMTP is available
    # email_configs:
    #   - to: 'admin@homelab.local'
    #     subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    #     body: |
    #       {{ range .Alerts }}
    #       Alert: {{ .Annotations.summary }}
    #       Description: {{ .Annotations.description }}
    #       {{ end }}

  - name: 'migration-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/migration'
        send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

    log_success "Alertmanager configuration created"
}

# Function to create the custom migration health exporter (a small stdlib-only
# Python HTTP server exposing Prometheus metrics on port 9999).
create_migration_health_exporter() {
    log_step "Creating migration health exporter..."

    mkdir -p "$MONITORING_CONFIG_DIR/exporters"

    # Create migration health exporter script
    cat > "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py" << 'EOF'
#!/usr/bin/env python3
"""
Migration Health Exporter
Exports custom metrics for migration monitoring
"""

import time
import json
import os
import subprocess
import glob
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import urlparse, parse_qs


class MigrationHealthExporter:
    def __init__(self):
        self.migration_dir = "/opt/migration"
        self.log_dir = f"{self.migration_dir}/logs"
        self.backup_dir = f"{self.migration_dir}/backups"
        self.checkpoint_dir = f"{self.log_dir}/checkpoints"

    def get_migration_progress(self):
        """Calculate migration progress based on checkpoints"""
        try:
            if not os.path.exists(self.checkpoint_dir):
                return 0.0

            checkpoints = glob.glob(f"{self.checkpoint_dir}/*.checkpoint")
            if not checkpoints:
                return 0.0

            # Define expected checkpoints in order
            expected_checkpoints = [
                "snapshot_start",
                "docker_collected",
                "database_dumps_complete",
                "config_backups_complete",
                "security_hardening_complete",
                "swarm_optimization_complete",
                "migration_complete"
            ]

            completed_checkpoints = []
            for checkpoint_file in checkpoints:
                with open(checkpoint_file, 'r') as f:
                    content = f.read()
                    if "CHECKPOINT_NAME=" in content:
                        name = content.split("CHECKPOINT_NAME=")[1].split("\n")[0]
                        completed_checkpoints.append(name)

            # FIX: only count recognized checkpoint names (deduplicated), so
            # unknown or duplicated checkpoint files cannot inflate progress.
            completed = set(completed_checkpoints) & set(expected_checkpoints)
            progress = (len(completed) / len(expected_checkpoints)) * 100
            return min(progress, 100.0)
        except Exception:
            return 0.0

    def get_migration_errors(self):
        """Count migration errors from logs"""
        try:
            error_count = 0
            if os.path.exists(self.log_dir):
                for log_file in glob.glob(f"{self.log_dir}/errors_*.log"):
                    with open(log_file, 'r') as f:
                        error_count += len([line for line in f if '[ERROR]' in line or '[CRITICAL]' in line])
            return error_count
        except Exception:
            return 0

    def get_backup_status(self):
        """Check backup health"""
        try:
            if not os.path.exists(self.backup_dir):
                return {"healthy": 0, "total": 0}

            backups = glob.glob(f"{self.backup_dir}/**/*.tar.gz", recursive=True)
            healthy_backups = 0
            for backup in backups:
                # Simple health check - file size > 1MB
                if os.path.getsize(backup) > 1024 * 1024:
                    healthy_backups += 1

            return {"healthy": healthy_backups, "total": len(backups)}
        except Exception:
            return {"healthy": 0, "total": 0}

    def get_service_health(self):
        """Check critical service health"""
        services = {
            "traefik": "docker ps | grep traefik",
            "postgres": "docker ps | grep postgres",
            "redis": "docker ps | grep redis"
        }

        healthy_services = 0
        for service, check_cmd in services.items():
            try:
                result = subprocess.run(check_cmd, shell=True, capture_output=True)
                if result.returncode == 0 and result.stdout.strip():
                    healthy_services += 1
            except Exception:
                pass

        return {"healthy": healthy_services, "total": len(services)}

    def generate_metrics(self):
        """Generate Prometheus metrics"""
        progress = self.get_migration_progress()
        errors = self.get_migration_errors()
        backup_status = self.get_backup_status()
        service_health = self.get_service_health()

        metrics = f"""# HELP migration_progress_percent Current migration progress percentage
# TYPE migration_progress_percent gauge
migration_progress_percent {progress}

# HELP migration_errors_total Total number of migration errors
# TYPE migration_errors_total counter
migration_errors_total {errors}

# HELP migration_backup_healthy Number of healthy backups
# TYPE migration_backup_healthy gauge
migration_backup_healthy {backup_status['healthy']}

# HELP migration_backup_total Total number of backups
# TYPE migration_backup_total gauge
migration_backup_total {backup_status['total']}

# HELP migration_services_healthy Number of healthy critical services
# TYPE migration_services_healthy gauge
migration_services_healthy {service_health['healthy']}

# HELP migration_services_total Total number of critical services
# TYPE migration_services_total gauge
migration_services_total {service_health['total']}

# HELP migration_timestamp_seconds Timestamp of last metric update
# TYPE migration_timestamp_seconds gauge
migration_timestamp_seconds {time.time()}
"""
        return metrics


class MetricsHandler(BaseHTTPRequestHandler):
    def __init__(self, exporter, *args, **kwargs):
        self.exporter = exporter
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path == '/metrics':
            self.send_response(200)
            self.send_header('Content-type', 'text/plain; charset=utf-8')
            self.end_headers()
            metrics = self.exporter.generate_metrics()
            self.wfile.write(metrics.encode('utf-8'))
        else:
            self.send_response(404)
            self.end_headers()


def main():
    exporter = MigrationHealthExporter()
    handler = lambda *args, **kwargs: MetricsHandler(exporter, *args, **kwargs)
    server = HTTPServer(('0.0.0.0', 9999), handler)
    print("Migration health exporter starting on port 9999")
    server.serve_forever()


if __name__ == '__main__':
    main()
EOF

    chmod +x "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py"

    log_success "Migration health exporter created"
}

# Function to write the Swarm stack file, distribute configuration to the
# monitoring and manager hosts, and deploy the monitoring stack.
deploy_monitoring_stack() {
    log_step "Deploying comprehensive monitoring stack..."

    # Create monitoring stack configuration
    # NOTE(review): this heredoc is quoted, so ${DOMAIN} lands literally in the
    # compose file. `docker stack deploy` performs NO environment-variable
    # substitution — DOMAIN must be resolved before deploy (e.g. via
    # `docker-compose config`) or Grafana/Traefik labels will see the literal text.
    cat > "$MONITORING_CONFIG_DIR/monitoring-stack.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - prometheus-data:/prometheus
      - prometheus-config:/etc/prometheus
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  alertmanager:
    image: prom/alertmanager:latest
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://alertmanager:9093'
    volumes:
      - alertmanager-data:/alertmanager
      - alertmanager-config:/etc/alertmanager
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  grafana:
    image: grafana/grafana:latest
    environment:
      - GF_SECURITY_ADMIN_PASSWORD_FILE=/run/secrets/grafana_admin_password
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_DOMAIN=grafana.${DOMAIN}
      - GF_SERVER_ROOT_URL=https://grafana.${DOMAIN}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana-data:/var/lib/grafana
      - grafana-config:/etc/grafana/provisioning
    networks:
      - monitoring-zone
      - public-zone
    secrets:
      - grafana_admin_password
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN}`)"
        - "traefik.http.routers.grafana.entrypoints=websecure"
        - "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"

  migration-health-exporter:
    image: python:3.9-alpine
    # FIX: the exporter is stdlib-only, so run it directly instead of doing a
    # network-dependent `pip install requests` (a package it never imports).
    command:
      - /bin/sh
      - -c
      - python /app/migration_health_exporter.py
    volumes:
      - migration-exporter:/app
      - migration-logs:/opt/migration:ro
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.role == manager
      resources:
        limits:
          memory: 256M
          cpus: '0.25'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

secrets:
  grafana_admin_password:
    external: true

volumes:
  prometheus-data:
    driver: local
  prometheus-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/prometheus
  alertmanager-data:
    driver: local
  alertmanager-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/alerting/alertmanager
  grafana-data:
    driver: local
  grafana-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/grafana/provisioning
  migration-exporter:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/exporters
  migration-logs:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration

networks:
  monitoring-zone:
    external: true
  public-zone:
    external: true
EOF

    # Copy configurations to the monitoring host
    # NOTE(review): the target dir is created with sudo (root-owned) but scp runs
    # as the ssh user — confirm that user can write there.
    log_info "Deploying configuration files..."
    ssh "$MONITORING_HOST" "sudo mkdir -p /opt/migration/configs/monitoring"
    scp -r "$MONITORING_CONFIG_DIR" "$MONITORING_HOST:/opt/migration/configs/"
    scp -r "$ALERTING_CONFIG_DIR" "$MONITORING_HOST:/opt/migration/configs/"

    # FIX: the stack is deployed FROM the manager, so the stack file must also
    # exist there — previously it was only copied to the monitoring host and
    # `docker stack deploy` failed unless this script happened to run on omv800.
    ssh "$MANAGER_HOST" "sudo mkdir -p $MONITORING_CONFIG_DIR"
    scp "$MONITORING_CONFIG_DIR/monitoring-stack.yml" "$MANAGER_HOST:$MONITORING_CONFIG_DIR/"

    # Deploy the monitoring stack
    log_info "Deploying monitoring services..."
    if ssh "$MANAGER_HOST" "cd $MONITORING_CONFIG_DIR && docker stack deploy -c monitoring-stack.yml monitoring"; then
        log_success "Monitoring stack deployed successfully"
    else
        log_error "Failed to deploy monitoring stack"
        return 1
    fi

    # Wait for services to be ready
    wait_for_service "Prometheus" "curl -f http://$MONITORING_HOST:9090/-/healthy" 120 10
    wait_for_service "Grafana" "curl -f http://$MONITORING_HOST:3000/api/health" 120 10

    log_success "Monitoring stack deployment completed"
}

# Main execution function — dispatches on the requested action
# (full | config-only | deploy-only | help).
main() {
    local action=${1:-"full"}

    # Register cleanup and rollback functions
    register_cleanup cleanup_monitoring
    register_rollback rollback_monitoring

    case "$action" in
        "full")
            log_step "Setting up comprehensive monitoring system..."

            # Validate prerequisites
            validate_prerequisites ssh curl jq python3

            # Create configurations
            create_prometheus_config
            create_grafana_dashboards
            create_alertmanager_config
            create_migration_health_exporter

            # Deploy monitoring stack
            deploy_monitoring_stack

            log_success "✅ Comprehensive monitoring setup completed!"
            log_info "📊 Prometheus: http://$MONITORING_HOST:9090"
            log_info "📈 Grafana: http://$MONITORING_HOST:3000"
            log_info "🚨 Alertmanager: http://$MONITORING_HOST:9093"
            log_info "🔍 Migration Health: http://$MANAGER_HOST:9999/metrics"
            ;;
        "config-only")
            create_prometheus_config
            create_grafana_dashboards
            create_alertmanager_config
            create_migration_health_exporter
            ;;
        "deploy-only")
            deploy_monitoring_stack
            ;;
        "help"|*)
            cat << EOF
Comprehensive Monitoring Setup

Usage: $0 [action]

Actions:
  full        - Complete monitoring setup (default)
  config-only - Only create configurations
  deploy-only - Only deploy services
  help        - Show this help

Examples:
  $0 full
  $0 config-only
EOF
            ;;
    esac
}

# Execute main function
main "$@"