Files
HomeAudit/migration_scripts/scripts/comprehensive_monitoring_setup.sh
2025-08-24 11:13:39 -04:00

1058 lines
30 KiB
Bash
Executable File

#!/bin/bash
# Comprehensive Monitoring Setup
# Deploys real-time monitoring, alerting, and performance tracking for migration health

# Import error handling library (provides log_*, register_cleanup,
# register_rollback, validate_prerequisites, wait_for_service).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Fail fast with a clear message instead of continuing with undefined helpers
# when the library is missing.
if [[ ! -f "$SCRIPT_DIR/lib/error_handling.sh" ]]; then
  echo "FATAL: required library not found: $SCRIPT_DIR/lib/error_handling.sh" >&2
  exit 1
fi
source "$SCRIPT_DIR/lib/error_handling.sh"

# Configuration
readonly MANAGER_HOST="omv800"        # Docker Swarm manager (stack deploys run here)
readonly MONITORING_HOST="audrey"     # host serving Prometheus/Grafana/Alertmanager
readonly MONITORING_CONFIG_DIR="/opt/migration/configs/monitoring"
readonly MONITORING_DATA_DIR="/opt/monitoring/data"
readonly ALERTING_CONFIG_DIR="/opt/migration/configs/alerting"
# Service endpoints for monitoring: health-check URL keyed by service name.
# NOTE(review): this map is not read anywhere visible in this script --
# presumably consumed by companion tooling; confirm before removing.
declare -A MONITORING_ENDPOINTS=(
  ["traefik"]="http://omv800:8080/ping"
  ["immich"]="http://omv800:3001/api/server-info/ping"
  ["jellyfin"]="http://omv800:8096/health"
  ["homeassistant"]="http://jonathan-2518f5u:8123/"
  ["appflowy"]="http://surface:8000/health"
  ["grafana"]="http://audrey:3000/api/health"
  ["prometheus"]="http://audrey:9090/-/healthy"
)

# Alert thresholds; units are implied by the key name (percent, ms, seconds).
# NOTE(review): also not read in the visible code -- the hard-coded values in
# the generated alert_rules.yml should be kept in sync with these by hand.
declare -A ALERT_THRESHOLDS=(
  ["cpu_usage"]="85"
  ["memory_usage"]="90"
  ["disk_usage"]="85"
  ["response_time_ms"]="5000"
  ["error_rate_percent"]="5"
  ["service_downtime_seconds"]="60"
)
# Cleanup function: removes scratch files left in /tmp by the setup steps.
cleanup_monitoring() {
  log_info "Cleaning up monitoring setup..."
  # Each setup phase writes /tmp/<prefix>_*.tmp scratch files; sweep them all.
  local prefix
  for prefix in monitoring prometheus grafana; do
    rm -f /tmp/"${prefix}"_*.tmp 2>/dev/null || true
  done
  log_info "Monitoring cleanup completed"
}
# Rollback function: removes the deployed Swarm stacks and cleans up.
rollback_monitoring() {
  log_info "Rolling back monitoring setup..."
  # Tear down both stacks on the Swarm manager; "not found" errors are ignored
  # so rollback is safe to run even if deployment never happened.
  local stack
  for stack in monitoring alerting; do
    ssh "$MANAGER_HOST" "docker stack rm ${stack} 2>/dev/null || true"
  done
  cleanup_monitoring
  log_info "Monitoring rollback completed"
}
# Function to create Prometheus configuration.
# Writes prometheus.yml, alert_rules.yml, and recording_rules.yml under
# $MONITORING_CONFIG_DIR/prometheus. Heredocs use a quoted 'EOF' delimiter so
# nothing inside is shell-expanded; the YAML below carries the indentation the
# Prometheus config parser requires.
create_prometheus_config() {
  log_step "Creating Prometheus configuration..."
  mkdir -p "$MONITORING_CONFIG_DIR/prometheus"

  # Create main Prometheus configuration
  cat > "$MONITORING_CONFIG_DIR/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'homelab'
    environment: 'production'

rule_files:
  - "alert_rules.yml"
  - "recording_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporters on all hosts
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'omv800:9100'
          - 'fedora:9100'
          - 'surface:9100'
          - 'jonathan-2518f5u:9100'
          - 'audrey:9100'
          - 'raspberrypi:9100'
    scrape_interval: 10s
    metrics_path: /metrics

  # cAdvisor for container metrics
  # NOTE(review): omv800:8080 is also the Traefik target below -- confirm which
  # service actually listens on that port on omv800.
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'omv800:8080'
          - 'fedora:8080'
          - 'surface:8080'
          - 'jonathan-2518f5u:8080'
          - 'audrey:8080'
    scrape_interval: 10s
    metrics_path: /metrics

  # Docker daemon metrics
  - job_name: 'docker-daemon'
    static_configs:
      - targets:
          - 'omv800:9323'
          - 'fedora:9323'
          - 'surface:9323'
          - 'jonathan-2518f5u:9323'
          - 'audrey:9323'
    scrape_interval: 30s

  # Traefik metrics
  - job_name: 'traefik'
    static_configs:
      - targets: ['omv800:8080']
    metrics_path: /metrics
    scrape_interval: 5s

  # Application health checks
  - job_name: 'service-health'
    static_configs:
      - targets:
          - 'omv800:3001'           # Immich
          - 'omv800:8096'           # Jellyfin
          - 'jonathan-2518f5u:8123' # Home Assistant
          - 'surface:8000'          # AppFlowy
    scrape_interval: 30s
    metrics_path: /metrics

  # PostgreSQL metrics (if pg_exporter is available)
  - job_name: 'postgres'
    static_configs:
      - targets: ['omv800:9187']
    scrape_interval: 30s

  # Redis metrics (if redis_exporter is available)
  - job_name: 'redis'
    static_configs:
      - targets: ['omv800:9121']
    scrape_interval: 30s

  # Migration-specific monitoring
  - job_name: 'migration-health'
    static_configs:
      - targets: ['localhost:9999'] # Custom migration health endpoint
    scrape_interval: 10s
EOF

  # Create alert rules
  cat > "$MONITORING_CONFIG_DIR/prometheus/alert_rules.yml" << 'EOF'
groups:
  - name: infrastructure_alerts
    rules:
      # Node health alerts
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 1m
        labels:
          severity: critical
          component: infrastructure
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node {{ $labels.instance }} has been down for more than 1 minute."
      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 85% on {{ $labels.instance }} for more than 5 minutes."
      - alert: HighMemoryUsage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90% on {{ $labels.instance }} for more than 5 minutes."
      - alert: HighDiskUsage
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 15
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High disk usage on {{ $labels.instance }}"
          description: "Disk usage is above 85% on {{ $labels.instance }} for more than 10 minutes."

  - name: docker_alerts
    rules:
      # Docker Swarm alerts
      - alert: DockerNodeDown
        expr: docker_swarm_node_status != 1
        for: 2m
        labels:
          severity: critical
          component: docker
        annotations:
          summary: "Docker Swarm node {{ $labels.node_name }} is not ready"
          description: "Docker Swarm node {{ $labels.node_name }} has been not ready for more than 2 minutes."
      # NOTE(review): docker_service_replicas{,_desired} are metric names, not
      # labels -- the {{ $labels.docker_service_replicas }} references in the
      # description will render empty; verify against the exporter in use.
      - alert: ServiceReplicasMismatch
        expr: docker_service_replicas != docker_service_replicas_desired
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "Service {{ $labels.service_name }} has replica mismatch"
          description: "Service {{ $labels.service_name }} has {{ $labels.docker_service_replicas }} replicas running, but {{ $labels.docker_service_replicas_desired }} desired."
      - alert: HighContainerRestarts
        expr: rate(docker_container_restart_count[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "High container restart rate for {{ $labels.name }}"
          description: "Container {{ $labels.name }} is restarting frequently."

  - name: application_alerts
    rules:
      # Application health alerts
      - alert: ServiceDown
        expr: up{job="service-health"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "Service {{ $labels.instance }} has been down for more than 1 minute."
      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 5
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High response time for {{ $labels.instance }}"
          description: "95th percentile response time is above 5 seconds for {{ $labels.instance }}."
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High error rate for {{ $labels.instance }}"
          description: "Error rate is above 5% for {{ $labels.instance }}."

  - name: migration_alerts
    rules:
      # Migration-specific alerts
      # NOTE(review): this also fires while progress legitimately sits at 0%
      # (not started) or 100% (finished) -- consider bounding the expression.
      - alert: MigrationStalled
        expr: migration_progress_percent == migration_progress_percent offset 10m
        for: 10m
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration appears to be stalled"
          description: "Migration progress has not changed in the last 10 minutes."
      - alert: MigrationError
        expr: migration_errors_total > 0
        for: 0s
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration errors detected"
          description: "{{ $value }} migration errors have been detected."
EOF

  # Create recording rules for performance metrics
  cat > "$MONITORING_CONFIG_DIR/prometheus/recording_rules.yml" << 'EOF'
groups:
  - name: performance_recording
    interval: 30s
    rules:
      # Node performance metrics
      - record: node:cpu_utilization_percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100)
      - record: node:memory_utilization_percent
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
      - record: node:disk_utilization_percent
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100
      # Container performance metrics
      - record: container:cpu_utilization_percent
        expr: rate(container_cpu_usage_seconds_total[5m]) * 100
      - record: container:memory_utilization_percent
        expr: (container_memory_usage_bytes / container_memory_limit_bytes) * 100
      # Application performance metrics
      - record: app:request_rate
        expr: rate(http_requests_total[5m])
      - record: app:error_rate_percent
        expr: (rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m])) * 100
      - record: app:response_time_95th_percentile
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
EOF

  log_success "Prometheus configuration created"
}
# Function to create Grafana dashboards and provisioning files.
# Writes datasource/dashboard provisioning YAML (which requires proper
# indentation to parse) and two dashboard JSON files under
# $MONITORING_CONFIG_DIR/grafana. Quoted 'EOF' keeps the heredocs literal.
create_grafana_dashboards() {
  log_step "Creating Grafana dashboards..."
  mkdir -p "$MONITORING_CONFIG_DIR/grafana/dashboards"
  mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards"
  mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources"

  # Create datasource provisioning
  cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources/prometheus.yml" << 'EOF'
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
EOF

  # Create dashboard provisioning
  cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards/dashboards.yml" << 'EOF'
apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
EOF

  # Create Infrastructure Overview dashboard
  cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/infrastructure-overview.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Infrastructure Overview",
    "description": "Home Lab Infrastructure Monitoring",
    "tags": ["infrastructure", "overview"],
    "timezone": "browser",
    "panels": [
      {
        "title": "System Load",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:cpu_utilization_percent)",
            "legendFormat": "Average CPU Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 85}
              ]
            }
          }
        }
      },
      {
        "title": "Memory Usage",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:memory_utilization_percent)",
            "legendFormat": "Average Memory Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 75},
                {"color": "red", "value": 90}
              ]
            }
          }
        }
      },
      {
        "title": "Service Health",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(up{job=\"service-health\"})",
            "legendFormat": "Services Up"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 3},
                {"color": "green", "value": 4}
              ]
            }
          }
        }
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "10s"
  }
}
EOF

  # Create Migration Health dashboard
  cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/migration-health.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Migration Health",
    "description": "Real-time migration monitoring and health checks",
    "tags": ["migration", "health"],
    "panels": [
      {
        "title": "Migration Progress",
        "type": "gauge",
        "targets": [
          {
            "expr": "migration_progress_percent",
            "legendFormat": "Progress %"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 25},
                {"color": "green", "value": 75}
              ]
            }
          }
        }
      },
      {
        "title": "Migration Errors",
        "type": "stat",
        "targets": [
          {
            "expr": "migration_errors_total",
            "legendFormat": "Total Errors"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "title": "Data Transfer Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(migration_data_transferred_bytes[5m])",
            "legendFormat": "Transfer Rate"
          }
        ],
        "yAxes": [
          {
            "unit": "bytes"
          }
        ]
      }
    ],
    "time": {
      "from": "now-2h",
      "to": "now"
    },
    "refresh": "5s"
  }
}
EOF

  log_success "Grafana dashboards created"
}
# Function to create Alertmanager configuration.
# Writes alertmanager.yml with routing (critical and migration alerts get
# dedicated receivers), webhook receivers, and an inhibit rule that mutes
# warnings while a matching critical alert is firing. Quoted 'EOF' keeps the
# heredoc literal; the YAML nesting below is required for Alertmanager to
# parse routes/receivers correctly.
create_alertmanager_config() {
  log_step "Creating Alertmanager configuration..."
  mkdir -p "$ALERTING_CONFIG_DIR/alertmanager"

  # Create Alertmanager configuration
  cat > "$ALERTING_CONFIG_DIR/alertmanager/alertmanager.yml" << 'EOF'
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@homelab.local'
  smtp_auth_username: ''
  smtp_auth_password: ''

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 5s
      repeat_interval: 15m
    - match:
        component: migration
      receiver: 'migration-alerts'
      group_wait: 0s
      repeat_interval: 5m

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:9093/webhook'
  - name: 'critical-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/critical'
        send_resolved: true
    # Uncomment and configure email if SMTP is available
    # email_configs:
    #   - to: 'admin@homelab.local'
    #     subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    #     body: |
    #       {{ range .Alerts }}
    #       Alert: {{ .Annotations.summary }}
    #       Description: {{ .Annotations.description }}
    #       {{ end }}
  - name: 'migration-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/migration'
        send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

  log_success "Alertmanager configuration created"
}
# Function to create the custom migration health exporter.
# Writes a self-contained Python 3 HTTP exporter that serves Prometheus-format
# metrics on port 9999. Quoted 'EOF' keeps the Python source literal (the
# f-string braces must not be shell-expanded).
create_migration_health_exporter() {
  log_step "Creating migration health exporter..."
  mkdir -p "$MONITORING_CONFIG_DIR/exporters"

  # Create migration health exporter script
  cat > "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py" << 'EOF'
#!/usr/bin/env python3
"""
Migration Health Exporter

Serves Prometheus text-format metrics about migration progress, errors,
backup health, and critical-service health at http://0.0.0.0:9999/metrics.
"""
import glob
import os
import subprocess
import time
from http.server import BaseHTTPRequestHandler, HTTPServer


class MigrationHealthExporter:
    """Collects migration health data from artifacts under /opt/migration."""

    def __init__(self):
        self.migration_dir = "/opt/migration"
        self.log_dir = f"{self.migration_dir}/logs"
        self.backup_dir = f"{self.migration_dir}/backups"
        self.checkpoint_dir = f"{self.log_dir}/checkpoints"

    def get_migration_progress(self):
        """Return migration progress (0.0-100.0) based on checkpoint files."""
        try:
            if not os.path.exists(self.checkpoint_dir):
                return 0.0
            checkpoints = glob.glob(f"{self.checkpoint_dir}/*.checkpoint")
            if not checkpoints:
                return 0.0
            # Expected checkpoints, in execution order.
            expected_checkpoints = [
                "snapshot_start", "docker_collected", "database_dumps_complete",
                "config_backups_complete", "security_hardening_complete",
                "swarm_optimization_complete", "migration_complete",
            ]
            completed = set()
            for checkpoint_file in checkpoints:
                with open(checkpoint_file, 'r') as f:
                    content = f.read()
                if "CHECKPOINT_NAME=" in content:
                    name = content.split("CHECKPOINT_NAME=")[1].split("\n")[0]
                    completed.add(name)
            # Only count recognized checkpoint names so duplicate or stray
            # checkpoint files cannot inflate the percentage past reality.
            done = completed & set(expected_checkpoints)
            return (len(done) / len(expected_checkpoints)) * 100
        except Exception:
            return 0.0

    def get_migration_errors(self):
        """Count [ERROR]/[CRITICAL] lines across all errors_*.log files."""
        try:
            error_count = 0
            if os.path.exists(self.log_dir):
                for log_file in glob.glob(f"{self.log_dir}/errors_*.log"):
                    with open(log_file, 'r') as f:
                        error_count += sum(
                            1 for line in f
                            if '[ERROR]' in line or '[CRITICAL]' in line
                        )
            return error_count
        except Exception:
            return 0

    def get_backup_status(self):
        """Return {"healthy", "total"} backup counts; healthy means > 1 MiB."""
        try:
            if not os.path.exists(self.backup_dir):
                return {"healthy": 0, "total": 0}
            backups = glob.glob(f"{self.backup_dir}/**/*.tar.gz", recursive=True)
            # Simple health heuristic: a real backup archive exceeds 1 MiB.
            healthy_backups = sum(
                1 for backup in backups
                if os.path.getsize(backup) > 1024 * 1024
            )
            return {"healthy": healthy_backups, "total": len(backups)}
        except Exception:
            return {"healthy": 0, "total": 0}

    def get_service_health(self):
        """Return {"healthy", "total"} for critical containers via docker ps."""
        services = {
            "traefik": "docker ps | grep traefik",
            "postgres": "docker ps | grep postgres",
            "redis": "docker ps | grep redis",
        }
        healthy_services = 0
        for service, check_cmd in services.items():
            try:
                result = subprocess.run(check_cmd, shell=True, capture_output=True)
                if result.returncode == 0 and result.stdout.strip():
                    healthy_services += 1
            except Exception:
                pass
        return {"healthy": healthy_services, "total": len(services)}

    def generate_metrics(self):
        """Render all metrics in the Prometheus text exposition format."""
        progress = self.get_migration_progress()
        errors = self.get_migration_errors()
        backup_status = self.get_backup_status()
        service_health = self.get_service_health()
        # Exposition format requires metric lines to start at column 0, so the
        # f-string body is intentionally not indented.
        metrics = f"""# HELP migration_progress_percent Current migration progress percentage
# TYPE migration_progress_percent gauge
migration_progress_percent {progress}
# HELP migration_errors_total Total number of migration errors
# TYPE migration_errors_total counter
migration_errors_total {errors}
# HELP migration_backup_healthy Number of healthy backups
# TYPE migration_backup_healthy gauge
migration_backup_healthy {backup_status['healthy']}
# HELP migration_backup_total Total number of backups
# TYPE migration_backup_total gauge
migration_backup_total {backup_status['total']}
# HELP migration_services_healthy Number of healthy critical services
# TYPE migration_services_healthy gauge
migration_services_healthy {service_health['healthy']}
# HELP migration_services_total Total number of critical services
# TYPE migration_services_total gauge
migration_services_total {service_health['total']}
# HELP migration_timestamp_seconds Timestamp of last metric update
# TYPE migration_timestamp_seconds gauge
migration_timestamp_seconds {time.time()}
"""
        return metrics


class MetricsHandler(BaseHTTPRequestHandler):
    """Serves GET /metrics; every other path returns 404."""

    def __init__(self, exporter, *args, **kwargs):
        self.exporter = exporter
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path == '/metrics':
            self.send_response(200)
            self.send_header('Content-type', 'text/plain; charset=utf-8')
            self.end_headers()
            metrics = self.exporter.generate_metrics()
            self.wfile.write(metrics.encode('utf-8'))
        else:
            self.send_response(404)
            self.end_headers()


def main():
    exporter = MigrationHealthExporter()
    # Bind the shared exporter instance into each per-request handler.
    handler = lambda *args, **kwargs: MetricsHandler(exporter, *args, **kwargs)
    server = HTTPServer(('0.0.0.0', 9999), handler)
    print("Migration health exporter starting on port 9999")
    server.serve_forever()


if __name__ == '__main__':
    main()
EOF

  chmod +x "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py"
  log_success "Migration health exporter created"
}
# Function to deploy the monitoring stack.
# Writes the Swarm stack file, copies configs to the monitoring host, deploys
# via the manager, and waits for Prometheus/Grafana health endpoints.
# NOTE(review): the heredoc delimiter is quoted, so ${DOMAIN} is written
# literally into the stack file; `docker stack deploy` does not perform env
# substitution by itself -- verify how DOMAIN gets resolved at deploy time.
deploy_monitoring_stack() {
  log_step "Deploying comprehensive monitoring stack..."

  # Create monitoring stack configuration
  cat > "$MONITORING_CONFIG_DIR/monitoring-stack.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - prometheus-data:/prometheus
      - prometheus-config:/etc/prometheus
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  alertmanager:
    image: prom/alertmanager:latest
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://alertmanager:9093'
    volumes:
      - alertmanager-data:/alertmanager
      - alertmanager-config:/etc/alertmanager
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  grafana:
    image: grafana/grafana:latest
    environment:
      - GF_SECURITY_ADMIN_PASSWORD_FILE=/run/secrets/grafana_admin_password
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_DOMAIN=grafana.${DOMAIN}
      - GF_SERVER_ROOT_URL=https://grafana.${DOMAIN}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana-data:/var/lib/grafana
      - grafana-config:/etc/grafana/provisioning
    networks:
      - monitoring-zone
      - public-zone
    secrets:
      - grafana_admin_password
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN}`)"
        - "traefik.http.routers.grafana.entrypoints=websecure"
        - "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"

  migration-health-exporter:
    image: python:3.9-alpine
    command:
      - /bin/sh
      - -c
      - |
        pip install --no-cache-dir requests
        python /app/migration_health_exporter.py
    volumes:
      - migration-exporter:/app
      - migration-logs:/opt/migration:ro
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.role == manager
      resources:
        limits:
          memory: 256M
          cpus: '0.25'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

secrets:
  grafana_admin_password:
    external: true

volumes:
  prometheus-data:
    driver: local
  prometheus-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/prometheus
  alertmanager-data:
    driver: local
  alertmanager-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/alerting/alertmanager
  grafana-data:
    driver: local
  grafana-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/grafana/provisioning
  migration-exporter:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/exporters
  migration-logs:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration

networks:
  monitoring-zone:
    external: true
  public-zone:
    external: true
EOF

  # Copy configurations to the monitoring host
  log_info "Deploying configuration files..."
  # NOTE(review): the target dir is created with sudo but scp runs as the
  # plain SSH user -- verify that user can write under /opt/migration/configs.
  ssh "$MONITORING_HOST" "sudo mkdir -p /opt/migration/configs/monitoring"
  scp -r "$MONITORING_CONFIG_DIR" "$MONITORING_HOST:/opt/migration/configs/"
  scp -r "$ALERTING_CONFIG_DIR" "$MONITORING_HOST:/opt/migration/configs/"

  # Deploy the monitoring stack
  # NOTE(review): configs are copied to $MONITORING_HOST, but the deploy runs
  # on $MANAGER_HOST and expects $MONITORING_CONFIG_DIR to exist there -- this
  # only works if the script itself runs on the manager. Confirm.
  log_info "Deploying monitoring services..."
  if ssh "$MANAGER_HOST" "cd $MONITORING_CONFIG_DIR && docker stack deploy -c monitoring-stack.yml monitoring"; then
    log_success "Monitoring stack deployed successfully"
  else
    log_error "Failed to deploy monitoring stack"
    return 1
  fi

  # Wait for services to be ready
  wait_for_service "Prometheus" "curl -f http://$MONITORING_HOST:9090/-/healthy" 120 10
  wait_for_service "Grafana" "curl -f http://$MONITORING_HOST:3000/api/health" 120 10
  log_success "Monitoring stack deployment completed"
}
# Generate every configuration artifact (Prometheus, Grafana, Alertmanager,
# migration health exporter). Shared by the "full" and "config-only" actions.
generate_all_configs() {
  create_prometheus_config
  create_grafana_dashboards
  create_alertmanager_config
  create_migration_health_exporter
}

# Main execution function: dispatch on the requested action (default "full").
main() {
  local action="${1:-full}"

  # Register cleanup and rollback handlers from the error-handling library.
  register_cleanup cleanup_monitoring
  register_rollback rollback_monitoring

  case "$action" in
    "full")
      log_step "Setting up comprehensive monitoring system..."
      # Validate prerequisites
      validate_prerequisites ssh curl jq python3
      # Create configurations, then deploy the stack.
      generate_all_configs
      deploy_monitoring_stack
      log_success "✅ Comprehensive monitoring setup completed!"
      log_info "📊 Prometheus: http://$MONITORING_HOST:9090"
      log_info "📈 Grafana: http://$MONITORING_HOST:3000"
      log_info "🚨 Alertmanager: http://$MONITORING_HOST:9093"
      log_info "🔍 Migration Health: http://$MANAGER_HOST:9999/metrics"
      ;;
    "config-only")
      generate_all_configs
      ;;
    "deploy-only")
      deploy_monitoring_stack
      ;;
    "help"|*)
      # Unknown actions fall through to usage, like "help".
      cat << EOF
Comprehensive Monitoring Setup
Usage: $0 <action>
Actions:
full - Complete monitoring setup (default)
config-only - Only create configurations
deploy-only - Only deploy services
help - Show this help
Examples:
$0 full
$0 config-only
EOF
      ;;
  esac
}
# Execute main function, forwarding all CLI arguments (default action "full").
main "$@"