#!/bin/bash
# Comprehensive Monitoring Setup
# Deploys real-time monitoring, alerting, and performance tracking for migration health

# Import error handling library
# (log_*, register_cleanup, register_rollback, validate_prerequisites and
# wait_for_service used below are expected to come from this file —
# TODO confirm against lib/error_handling.sh)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/lib/error_handling.sh"

# Configuration
readonly MANAGER_HOST="omv800"       # Docker Swarm manager; stack deploys run here
readonly MONITORING_HOST="audrey"    # host that serves Prometheus/Grafana/Alertmanager
readonly MONITORING_CONFIG_DIR="/opt/migration/configs/monitoring"  # generated monitoring configs
readonly MONITORING_DATA_DIR="/opt/monitoring/data"  # NOTE(review): not referenced elsewhere in this script
readonly ALERTING_CONFIG_DIR="/opt/migration/configs/alerting"      # generated alerting configs

# Service endpoints for monitoring
# NOTE(review): this map is not referenced elsewhere in this script —
# presumably kept for operators / future health-check loops; verify.
declare -A MONITORING_ENDPOINTS=(
    ["traefik"]="http://omv800:8080/ping"
    ["immich"]="http://omv800:3001/api/server-info/ping"
    ["jellyfin"]="http://omv800:8096/health"
    ["homeassistant"]="http://jonathan-2518f5u:8123/"
    ["appflowy"]="http://surface:8000/health"
    ["grafana"]="http://audrey:3000/api/health"
    ["prometheus"]="http://audrey:9090/-/healthy"
)

# Alert thresholds
# NOTE(review): also not referenced in this script; the effective thresholds
# are hard-coded inside the Prometheus alert rules below — keep the two in sync.
declare -A ALERT_THRESHOLDS=(
    ["cpu_usage"]="85"
    ["memory_usage"]="90"
    ["disk_usage"]="85"
    ["response_time_ms"]="5000"
    ["error_rate_percent"]="5"
    ["service_downtime_seconds"]="60"
)
|
|
|
|
# Cleanup function
|
|
#######################################
# Remove temporary files produced during monitoring setup.
# Globals:   none
# Outputs:   progress messages via log_info
# Returns:   0 always (missing files are ignored)
#######################################
cleanup_monitoring() {
    log_info "Cleaning up monitoring setup..."

    # Each setup stage drops scratch files under /tmp with its own prefix.
    local prefix
    for prefix in monitoring prometheus grafana; do
        rm -f /tmp/${prefix}_*.tmp 2>/dev/null || true
    done

    log_info "Monitoring cleanup completed"
}
|
|
|
|
# Rollback function
|
|
#######################################
# Tear down any partially-deployed monitoring/alerting stacks, then clean up
# local temporary files.
# Globals:   MANAGER_HOST (read)
# Outputs:   progress messages via log_info
#######################################
rollback_monitoring() {
    log_info "Rolling back monitoring setup..."

    # Remove both Swarm stacks; failures (e.g. stack never deployed) are ignored.
    local stack
    for stack in monitoring alerting; do
        ssh "$MANAGER_HOST" "docker stack rm ${stack} 2>/dev/null || true"
    done

    cleanup_monitoring
    log_info "Monitoring rollback completed"
}
|
|
|
|
# Function to create Prometheus configuration
|
|
#######################################
# Write the Prometheus server configuration on the local machine:
# main scrape config, alerting rules, and recording rules, all under
# $MONITORING_CONFIG_DIR/prometheus.
# Globals:   MONITORING_CONFIG_DIR (read)
# Outputs:   three files; progress via log_step/log_success
#######################################
create_prometheus_config() {
    log_step "Creating Prometheus configuration..."

    mkdir -p "$MONITORING_CONFIG_DIR/prometheus"

    # Create main Prometheus configuration.
    # Quoted 'EOF' delimiter: content is written verbatim, no shell expansion.
    # NOTE(review): the 'traefik' job scrapes omv800:8080, which is also the
    # cAdvisor target on that host — confirm the intended port assignment.
    cat > "$MONITORING_CONFIG_DIR/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'homelab'
    environment: 'production'

rule_files:
  - "alert_rules.yml"
  - "recording_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporters on all hosts
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'omv800:9100'
          - 'fedora:9100'
          - 'surface:9100'
          - 'jonathan-2518f5u:9100'
          - 'audrey:9100'
          - 'raspberrypi:9100'
    scrape_interval: 10s
    metrics_path: /metrics

  # cAdvisor for container metrics
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'omv800:8080'
          - 'fedora:8080'
          - 'surface:8080'
          - 'jonathan-2518f5u:8080'
          - 'audrey:8080'
    scrape_interval: 10s
    metrics_path: /metrics

  # Docker daemon metrics
  - job_name: 'docker-daemon'
    static_configs:
      - targets:
          - 'omv800:9323'
          - 'fedora:9323'
          - 'surface:9323'
          - 'jonathan-2518f5u:9323'
          - 'audrey:9323'
    scrape_interval: 30s

  # Traefik metrics
  - job_name: 'traefik'
    static_configs:
      - targets: ['omv800:8080']
    metrics_path: /metrics
    scrape_interval: 5s

  # Application health checks
  - job_name: 'service-health'
    static_configs:
      - targets:
          - 'omv800:3001' # Immich
          - 'omv800:8096' # Jellyfin
          - 'jonathan-2518f5u:8123' # Home Assistant
          - 'surface:8000' # AppFlowy
    scrape_interval: 30s
    metrics_path: /metrics

  # PostgreSQL metrics (if pg_exporter is available)
  - job_name: 'postgres'
    static_configs:
      - targets: ['omv800:9187']
    scrape_interval: 30s

  # Redis metrics (if redis_exporter is available)
  - job_name: 'redis'
    static_configs:
      - targets: ['omv800:9121']
    scrape_interval: 30s

  # Migration-specific monitoring
  - job_name: 'migration-health'
    static_configs:
      - targets: ['localhost:9999'] # Custom migration health endpoint
    scrape_interval: 10s
EOF

    # Create alert rules.
    # Thresholds here mirror the ALERT_THRESHOLDS map at the top of this
    # script (85% CPU, 90% memory, 85% disk, 5s latency, 5% errors).
    cat > "$MONITORING_CONFIG_DIR/prometheus/alert_rules.yml" << 'EOF'
groups:
  - name: infrastructure_alerts
    rules:
      # Node health alerts
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 1m
        labels:
          severity: critical
          component: infrastructure
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 85% on {{ $labels.instance }} for more than 5 minutes."

      - alert: HighMemoryUsage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90% on {{ $labels.instance }} for more than 5 minutes."

      - alert: HighDiskUsage
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 15
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High disk usage on {{ $labels.instance }}"
          description: "Disk usage is above 85% on {{ $labels.instance }} for more than 10 minutes."

  - name: docker_alerts
    rules:
      # Docker Swarm alerts
      - alert: DockerNodeDown
        expr: docker_swarm_node_status != 1
        for: 2m
        labels:
          severity: critical
          component: docker
        annotations:
          summary: "Docker Swarm node {{ $labels.node_name }} is not ready"
          description: "Docker Swarm node {{ $labels.node_name }} has been not ready for more than 2 minutes."

      - alert: ServiceReplicasMismatch
        expr: docker_service_replicas != docker_service_replicas_desired
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "Service {{ $labels.service_name }} has replica mismatch"
          description: "Service {{ $labels.service_name }} has {{ $labels.docker_service_replicas }} replicas running, but {{ $labels.docker_service_replicas_desired }} desired."

      - alert: HighContainerRestarts
        expr: rate(docker_container_restart_count[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "High container restart rate for {{ $labels.name }}"
          description: "Container {{ $labels.name }} is restarting frequently."

  - name: application_alerts
    rules:
      # Application health alerts
      - alert: ServiceDown
        expr: up{job="service-health"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "Service {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 5
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High response time for {{ $labels.instance }}"
          description: "95th percentile response time is above 5 seconds for {{ $labels.instance }}."

      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High error rate for {{ $labels.instance }}"
          description: "Error rate is above 5% for {{ $labels.instance }}."

  - name: migration_alerts
    rules:
      # Migration-specific alerts
      - alert: MigrationStalled
        expr: migration_progress_percent == migration_progress_percent offset 10m
        for: 10m
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration appears to be stalled"
          description: "Migration progress has not changed in the last 10 minutes."

      - alert: MigrationError
        expr: migration_errors_total > 0
        for: 0s
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration errors detected"
          description: "{{ $value }} migration errors have been detected."
EOF

    # Create recording rules for performance metrics
    # (pre-computed series referenced by the Grafana dashboards).
    cat > "$MONITORING_CONFIG_DIR/prometheus/recording_rules.yml" << 'EOF'
groups:
  - name: performance_recording
    interval: 30s
    rules:
      # Node performance metrics
      - record: node:cpu_utilization_percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100)

      - record: node:memory_utilization_percent
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

      - record: node:disk_utilization_percent
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100

      # Container performance metrics
      - record: container:cpu_utilization_percent
        expr: rate(container_cpu_usage_seconds_total[5m]) * 100

      - record: container:memory_utilization_percent
        expr: (container_memory_usage_bytes / container_memory_limit_bytes) * 100

      # Application performance metrics
      - record: app:request_rate
        expr: rate(http_requests_total[5m])

      - record: app:error_rate_percent
        expr: (rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m])) * 100

      - record: app:response_time_95th_percentile
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
EOF

    log_success "Prometheus configuration created"
}
|
|
|
|
# Function to create Grafana dashboards
|
|
#######################################
# Write Grafana provisioning files (datasource + dashboard providers) and two
# JSON dashboards under $MONITORING_CONFIG_DIR/grafana on the local machine.
# Globals:   MONITORING_CONFIG_DIR (read)
# Outputs:   four files; progress via log_step/log_success
#######################################
create_grafana_dashboards() {
    log_step "Creating Grafana dashboards..."

    mkdir -p "$MONITORING_CONFIG_DIR/grafana/dashboards"
    mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards"
    mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources"

    # Create datasource provisioning
    # ('prometheus:9090' resolves via the stack's overlay network)
    cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources/prometheus.yml" << 'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
EOF

    # Create dashboard provisioning
    cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards/dashboards.yml" << 'EOF'
apiVersion: 1

providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
EOF

    # Create Infrastructure Overview dashboard.
    # Panel queries use the node:* recording rules defined in
    # recording_rules.yml by create_prometheus_config.
    cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/infrastructure-overview.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Infrastructure Overview",
    "description": "Home Lab Infrastructure Monitoring",
    "tags": ["infrastructure", "overview"],
    "timezone": "browser",
    "panels": [
      {
        "title": "System Load",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:cpu_utilization_percent)",
            "legendFormat": "Average CPU Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 85}
              ]
            }
          }
        }
      },
      {
        "title": "Memory Usage",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:memory_utilization_percent)",
            "legendFormat": "Average Memory Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 75},
                {"color": "red", "value": 90}
              ]
            }
          }
        }
      },
      {
        "title": "Service Health",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(up{job=\"service-health\"})",
            "legendFormat": "Services Up"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 3},
                {"color": "green", "value": 4}
              ]
            }
          }
        }
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "10s"
  }
}
EOF

    # Create Migration Health dashboard.
    # Queries the custom migration_* metrics served by the exporter created in
    # create_migration_health_exporter.
    cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/migration-health.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Migration Health",
    "description": "Real-time migration monitoring and health checks",
    "tags": ["migration", "health"],
    "panels": [
      {
        "title": "Migration Progress",
        "type": "gauge",
        "targets": [
          {
            "expr": "migration_progress_percent",
            "legendFormat": "Progress %"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 25},
                {"color": "green", "value": 75}
              ]
            }
          }
        }
      },
      {
        "title": "Migration Errors",
        "type": "stat",
        "targets": [
          {
            "expr": "migration_errors_total",
            "legendFormat": "Total Errors"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "title": "Data Transfer Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(migration_data_transferred_bytes[5m])",
            "legendFormat": "Transfer Rate"
          }
        ],
        "yAxes": [
          {
            "unit": "bytes"
          }
        ]
      }
    ],
    "time": {
      "from": "now-2h",
      "to": "now"
    },
    "refresh": "5s"
  }
}
EOF

    log_success "Grafana dashboards created"
}
|
|
|
|
# Function to create Alertmanager configuration
|
|
#######################################
# Write the Alertmanager routing configuration under
# $ALERTING_CONFIG_DIR/alertmanager on the local machine.
# Routing: critical alerts and migration alerts get dedicated webhook
# receivers with tighter group_wait/repeat intervals; everything else goes to
# the default 'web.hook' receiver.
# Globals:   ALERTING_CONFIG_DIR (read)
#######################################
create_alertmanager_config() {
    log_step "Creating Alertmanager configuration..."

    mkdir -p "$ALERTING_CONFIG_DIR/alertmanager"

    # Create Alertmanager configuration
    # (quoted 'EOF' so the {{ ... }} Go templates are not touched by the shell)
    cat > "$ALERTING_CONFIG_DIR/alertmanager/alertmanager.yml" << 'EOF'
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@homelab.local'
  smtp_auth_username: ''
  smtp_auth_password: ''

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 5s
      repeat_interval: 15m
    - match:
        component: migration
      receiver: 'migration-alerts'
      group_wait: 0s
      repeat_interval: 5m

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:9093/webhook'

  - name: 'critical-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/critical'
        send_resolved: true
    # Uncomment and configure email if SMTP is available
    # email_configs:
    #   - to: 'admin@homelab.local'
    #     subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    #     body: |
    #       {{ range .Alerts }}
    #       Alert: {{ .Annotations.summary }}
    #       Description: {{ .Annotations.description }}
    #       {{ end }}

  - name: 'migration-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/migration'
        send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

    log_success "Alertmanager configuration created"
}
|
|
|
|
# Function to create custom migration health exporter
|
|
#######################################
# Write a stdlib-only Python exporter that serves migration metrics in
# Prometheus text format on 0.0.0.0:9999/metrics (matches the
# 'migration-health' scrape job created in create_prometheus_config).
# Globals:   MONITORING_CONFIG_DIR (read)
#######################################
create_migration_health_exporter() {
    log_step "Creating migration health exporter..."

    mkdir -p "$MONITORING_CONFIG_DIR/exporters"

    # Create migration health exporter script
    # (quoted 'EOF' keeps the Python source verbatim — no shell expansion)
    cat > "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py" << 'EOF'
#!/usr/bin/env python3
"""
Migration Health Exporter
Exports custom metrics for migration monitoring
"""

import time
import json
import os
import subprocess
import glob
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import urlparse, parse_qs

class MigrationHealthExporter:
    def __init__(self):
        self.migration_dir = "/opt/migration"
        self.log_dir = f"{self.migration_dir}/logs"
        self.backup_dir = f"{self.migration_dir}/backups"
        self.checkpoint_dir = f"{self.log_dir}/checkpoints"

    def get_migration_progress(self):
        """Calculate migration progress based on checkpoints"""
        try:
            if not os.path.exists(self.checkpoint_dir):
                return 0.0

            checkpoints = glob.glob(f"{self.checkpoint_dir}/*.checkpoint")
            if not checkpoints:
                return 0.0

            # Define expected checkpoints in order
            expected_checkpoints = [
                "snapshot_start", "docker_collected", "database_dumps_complete",
                "config_backups_complete", "security_hardening_complete",
                "swarm_optimization_complete", "migration_complete"
            ]

            completed_checkpoints = []
            for checkpoint_file in checkpoints:
                with open(checkpoint_file, 'r') as f:
                    content = f.read()
                    if "CHECKPOINT_NAME=" in content:
                        name = content.split("CHECKPOINT_NAME=")[1].split("\n")[0]
                        completed_checkpoints.append(name)

            progress = (len(completed_checkpoints) / len(expected_checkpoints)) * 100
            return min(progress, 100.0)

        except Exception:
            return 0.0

    def get_migration_errors(self):
        """Count migration errors from logs"""
        try:
            error_count = 0
            if os.path.exists(self.log_dir):
                for log_file in glob.glob(f"{self.log_dir}/errors_*.log"):
                    with open(log_file, 'r') as f:
                        error_count += len([line for line in f if '[ERROR]' in line or '[CRITICAL]' in line])
            return error_count
        except Exception:
            return 0

    def get_backup_status(self):
        """Check backup health"""
        try:
            if not os.path.exists(self.backup_dir):
                return {"healthy": 0, "total": 0}

            backups = glob.glob(f"{self.backup_dir}/**/*.tar.gz", recursive=True)
            healthy_backups = 0

            for backup in backups:
                # Simple health check - file size > 1MB
                if os.path.getsize(backup) > 1024 * 1024:
                    healthy_backups += 1

            return {"healthy": healthy_backups, "total": len(backups)}
        except Exception:
            return {"healthy": 0, "total": 0}

    def get_service_health(self):
        """Check critical service health"""
        services = {
            "traefik": "docker ps | grep traefik",
            "postgres": "docker ps | grep postgres",
            "redis": "docker ps | grep redis"
        }

        healthy_services = 0
        for service, check_cmd in services.items():
            try:
                result = subprocess.run(check_cmd, shell=True, capture_output=True)
                if result.returncode == 0 and result.stdout.strip():
                    healthy_services += 1
            except Exception:
                pass

        return {"healthy": healthy_services, "total": len(services)}

    def generate_metrics(self):
        """Generate Prometheus metrics"""
        progress = self.get_migration_progress()
        errors = self.get_migration_errors()
        backup_status = self.get_backup_status()
        service_health = self.get_service_health()

        metrics = f"""# HELP migration_progress_percent Current migration progress percentage
# TYPE migration_progress_percent gauge
migration_progress_percent {progress}

# HELP migration_errors_total Total number of migration errors
# TYPE migration_errors_total counter
migration_errors_total {errors}

# HELP migration_backup_healthy Number of healthy backups
# TYPE migration_backup_healthy gauge
migration_backup_healthy {backup_status['healthy']}

# HELP migration_backup_total Total number of backups
# TYPE migration_backup_total gauge
migration_backup_total {backup_status['total']}

# HELP migration_services_healthy Number of healthy critical services
# TYPE migration_services_healthy gauge
migration_services_healthy {service_health['healthy']}

# HELP migration_services_total Total number of critical services
# TYPE migration_services_total gauge
migration_services_total {service_health['total']}

# HELP migration_timestamp_seconds Timestamp of last metric update
# TYPE migration_timestamp_seconds gauge
migration_timestamp_seconds {time.time()}
"""
        return metrics

class MetricsHandler(BaseHTTPRequestHandler):
    def __init__(self, exporter, *args, **kwargs):
        self.exporter = exporter
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path == '/metrics':
            self.send_response(200)
            self.send_header('Content-type', 'text/plain; charset=utf-8')
            self.end_headers()
            metrics = self.exporter.generate_metrics()
            self.wfile.write(metrics.encode('utf-8'))
        else:
            self.send_response(404)
            self.end_headers()

def main():
    exporter = MigrationHealthExporter()
    handler = lambda *args, **kwargs: MetricsHandler(exporter, *args, **kwargs)

    server = HTTPServer(('0.0.0.0', 9999), handler)
    print("Migration health exporter starting on port 9999")
    server.serve_forever()

if __name__ == '__main__':
    main()
EOF

    chmod +x "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py"

    log_success "Migration health exporter created"
}
|
|
|
|
# Function to deploy monitoring stack
|
|
#######################################
# Write the Swarm stack file, distribute the generated configs, and deploy the
# 'monitoring' stack (Prometheus, Alertmanager, Grafana, migration exporter).
# Globals:   MONITORING_CONFIG_DIR, ALERTING_CONFIG_DIR,
#            MONITORING_HOST, MANAGER_HOST (all read)
# Returns:   1 if the stack deploy fails
#######################################
deploy_monitoring_stack() {
    log_step "Deploying comprehensive monitoring stack..."

    # Create monitoring stack configuration.
    # Quoted 'EOF': ${DOMAIN} stays literal here and is substituted by
    # `docker stack deploy` from the environment on the manager at deploy time.
    cat > "$MONITORING_CONFIG_DIR/monitoring-stack.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - prometheus-data:/prometheus
      - prometheus-config:/etc/prometheus
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  alertmanager:
    image: prom/alertmanager:latest
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://alertmanager:9093'
    volumes:
      - alertmanager-data:/alertmanager
      - alertmanager-config:/etc/alertmanager
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  grafana:
    image: grafana/grafana:latest
    environment:
      - GF_SECURITY_ADMIN_PASSWORD_FILE=/run/secrets/grafana_admin_password
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_DOMAIN=grafana.${DOMAIN}
      - GF_SERVER_ROOT_URL=https://grafana.${DOMAIN}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana-data:/var/lib/grafana
      - grafana-config:/etc/grafana/provisioning
    networks:
      - monitoring-zone
      - public-zone
    secrets:
      - grafana_admin_password
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN}`)"
        - "traefik.http.routers.grafana.entrypoints=websecure"
        - "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"

  migration-health-exporter:
    image: python:3.9-alpine
    command:
      - /bin/sh
      - -c
      - |
        pip install --no-cache-dir requests
        python /app/migration_health_exporter.py
    volumes:
      - migration-exporter:/app
      - migration-logs:/opt/migration:ro
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.role == manager
      resources:
        limits:
          memory: 256M
          cpus: '0.25'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

secrets:
  grafana_admin_password:
    external: true

volumes:
  prometheus-data:
    driver: local
  prometheus-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/prometheus
  alertmanager-data:
    driver: local
  alertmanager-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/alerting/alertmanager
  grafana-data:
    driver: local
  grafana-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/grafana/provisioning
  migration-exporter:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/exporters
  migration-logs:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration

networks:
  monitoring-zone:
    external: true
  public-zone:
    external: true
EOF

    # Copy configurations to every host that needs them.
    # BUG FIX: the stack is deployed via ssh on $MANAGER_HOST (cd into
    # $MONITORING_CONFIG_DIR there), but the configs were previously copied
    # only to $MONITORING_HOST — so monitoring-stack.yml never existed on the
    # manager and the deploy failed. Copy to both hosts; the bind-mounted
    # config directories must also exist on whichever node runs each service.
    log_info "Deploying configuration files..."
    local host
    for host in "$MONITORING_HOST" "$MANAGER_HOST"; do
        # NOTE(review): sudo-created dirs may not be writable by the scp
        # user — confirm ownership/permissions on the target hosts.
        ssh "$host" "sudo mkdir -p /opt/migration/configs/monitoring"
        scp -r "$MONITORING_CONFIG_DIR" "$host:/opt/migration/configs/"
        scp -r "$ALERTING_CONFIG_DIR" "$host:/opt/migration/configs/"
    done

    # Deploy the monitoring stack from the Swarm manager
    log_info "Deploying monitoring services..."
    if ssh "$MANAGER_HOST" "cd $MONITORING_CONFIG_DIR && docker stack deploy -c monitoring-stack.yml monitoring"; then
        log_success "Monitoring stack deployed successfully"
    else
        log_error "Failed to deploy monitoring stack"
        return 1
    fi

    # Wait for services to be ready (poll up to 120s, every 10s)
    wait_for_service "Prometheus" "curl -f http://$MONITORING_HOST:9090/-/healthy" 120 10
    wait_for_service "Grafana" "curl -f http://$MONITORING_HOST:3000/api/health" 120 10

    log_success "Monitoring stack deployment completed"
}
|
|
|
|
# Main execution function
|
|
#######################################
# Entry point: dispatch on the requested action.
# Arguments: $1 - action: full (default) | config-only | deploy-only | help
# Globals:   MONITORING_HOST, MANAGER_HOST (read, for the summary URLs)
#######################################
main() {
    local action=${1:-"full"}

    # Register cleanup and rollback functions so a failure mid-setup leaves
    # no half-deployed monitoring stack behind.
    register_cleanup cleanup_monitoring
    register_rollback rollback_monitoring

    # FIX: quote the selector — an unquoted $action would be word-split and
    # glob-expanded before matching.
    case "$action" in
        "full")
            log_step "Setting up comprehensive monitoring system..."

            # Validate prerequisites
            validate_prerequisites ssh curl jq python3

            # Create configurations
            create_prometheus_config
            create_grafana_dashboards
            create_alertmanager_config
            create_migration_health_exporter

            # Deploy monitoring stack
            deploy_monitoring_stack

            log_success "✅ Comprehensive monitoring setup completed!"
            log_info "📊 Prometheus: http://$MONITORING_HOST:9090"
            log_info "📈 Grafana: http://$MONITORING_HOST:3000"
            log_info "🚨 Alertmanager: http://$MONITORING_HOST:9093"
            log_info "🔍 Migration Health: http://$MANAGER_HOST:9999/metrics"
            ;;

        "config-only")
            create_prometheus_config
            create_grafana_dashboards
            create_alertmanager_config
            create_migration_health_exporter
            ;;

        "deploy-only")
            deploy_monitoring_stack
            ;;

        "help"|*)
            # Unknown actions fall through to help (unquoted EOF: $0 expands).
            cat << EOF
Comprehensive Monitoring Setup

Usage: $0 <action>

Actions:
    full        - Complete monitoring setup (default)
    config-only - Only create configurations
    deploy-only - Only deploy services
    help        - Show this help

Examples:
    $0 full
    $0 config-only
EOF
            ;;
    esac
}
|
|
|
|
# Execute main function with all CLI arguments (action defaults to "full")
main "$@"