#!/bin/bash
# Comprehensive Monitoring Setup
# Deploys real-time monitoring, alerting, and performance tracking for migration health

# Import error handling library
# (log_*, register_cleanup, register_rollback, validate_prerequisites and
# wait_for_service used below are expected to come from this file —
# TODO confirm against lib/error_handling.sh)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/lib/error_handling.sh"

# Configuration
readonly MANAGER_HOST="omv800"       # Docker Swarm manager; stack deploys run here
readonly MONITORING_HOST="audrey"    # host that serves Prometheus/Grafana/Alertmanager
readonly MONITORING_CONFIG_DIR="/opt/migration/configs/monitoring"  # generated monitoring configs
readonly MONITORING_DATA_DIR="/opt/monitoring/data"  # NOTE(review): not referenced elsewhere in this script
readonly ALERTING_CONFIG_DIR="/opt/migration/configs/alerting"      # generated alerting configs

# Service endpoints for monitoring
# NOTE(review): this map is not referenced elsewhere in this script —
# presumably kept for operators / future health-check loops; verify.
declare -A MONITORING_ENDPOINTS=(
    ["traefik"]="http://omv800:8080/ping"
    ["immich"]="http://omv800:3001/api/server-info/ping"
    ["jellyfin"]="http://omv800:8096/health"
    ["homeassistant"]="http://jonathan-2518f5u:8123/"
    ["appflowy"]="http://surface:8000/health"
    ["grafana"]="http://audrey:3000/api/health"
    ["prometheus"]="http://audrey:9090/-/healthy"
)

# Alert thresholds
# NOTE(review): also not referenced in this script; the effective thresholds
# are hard-coded inside the Prometheus alert rules below — keep the two in sync.
declare -A ALERT_THRESHOLDS=(
    ["cpu_usage"]="85"
    ["memory_usage"]="90"
    ["disk_usage"]="85"
    ["response_time_ms"]="5000"
    ["error_rate_percent"]="5"
    ["service_downtime_seconds"]="60"
)
|
|
|
|
# Cleanup function
|
|
#######################################
# Remove temporary files produced during monitoring setup.
# Globals:   none
# Outputs:   progress messages via log_info
# Returns:   0 always (missing files are ignored)
#######################################
cleanup_monitoring() {
    log_info "Cleaning up monitoring setup..."

    # Each setup stage drops scratch files under /tmp with its own prefix.
    local prefix
    for prefix in monitoring prometheus grafana; do
        rm -f /tmp/${prefix}_*.tmp 2>/dev/null || true
    done

    log_info "Monitoring cleanup completed"
}
|
|
|
|
# Rollback function
|
|
#######################################
# Tear down any partially-deployed monitoring/alerting stacks, then clean up
# local temporary files.
# Globals:   MANAGER_HOST (read)
# Outputs:   progress messages via log_info
#######################################
rollback_monitoring() {
    log_info "Rolling back monitoring setup..."

    # Remove both Swarm stacks; failures (e.g. stack never deployed) are ignored.
    local stack
    for stack in monitoring alerting; do
        ssh "$MANAGER_HOST" "docker stack rm ${stack} 2>/dev/null || true"
    done

    cleanup_monitoring
    log_info "Monitoring rollback completed"
}
|
|
|
|
# Function to create Prometheus configuration
|
|
#######################################
# Write the Prometheus server configuration on the local machine:
# main scrape config, alerting rules, and recording rules, all under
# $MONITORING_CONFIG_DIR/prometheus.
# Globals:   MONITORING_CONFIG_DIR (read)
# Outputs:   three files; progress via log_step/log_success
#######################################
create_prometheus_config() {
    log_step "Creating Prometheus configuration..."

    mkdir -p "$MONITORING_CONFIG_DIR/prometheus"

    # Create main Prometheus configuration.
    # Quoted 'EOF' delimiter: content is written verbatim, no shell expansion.
    # NOTE(review): the 'traefik' job scrapes omv800:8080, which is also the
    # cAdvisor target on that host — confirm the intended port assignment.
    cat > "$MONITORING_CONFIG_DIR/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'homelab'
    environment: 'production'

rule_files:
  - "alert_rules.yml"
  - "recording_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporters on all hosts
  - job_name: 'node-exporter'
    static_configs:
      - targets:
          - 'omv800:9100'
          - 'fedora:9100'
          - 'surface:9100'
          - 'jonathan-2518f5u:9100'
          - 'audrey:9100'
          - 'raspberrypi:9100'
    scrape_interval: 10s
    metrics_path: /metrics

  # cAdvisor for container metrics
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'omv800:8080'
          - 'fedora:8080'
          - 'surface:8080'
          - 'jonathan-2518f5u:8080'
          - 'audrey:8080'
    scrape_interval: 10s
    metrics_path: /metrics

  # Docker daemon metrics
  - job_name: 'docker-daemon'
    static_configs:
      - targets:
          - 'omv800:9323'
          - 'fedora:9323'
          - 'surface:9323'
          - 'jonathan-2518f5u:9323'
          - 'audrey:9323'
    scrape_interval: 30s

  # Traefik metrics
  - job_name: 'traefik'
    static_configs:
      - targets: ['omv800:8080']
    metrics_path: /metrics
    scrape_interval: 5s

  # Application health checks
  - job_name: 'service-health'
    static_configs:
      - targets:
          - 'omv800:3001' # Immich
          - 'omv800:8096' # Jellyfin
          - 'jonathan-2518f5u:8123' # Home Assistant
          - 'surface:8000' # AppFlowy
    scrape_interval: 30s
    metrics_path: /metrics

  # PostgreSQL metrics (if pg_exporter is available)
  - job_name: 'postgres'
    static_configs:
      - targets: ['omv800:9187']
    scrape_interval: 30s

  # Redis metrics (if redis_exporter is available)
  - job_name: 'redis'
    static_configs:
      - targets: ['omv800:9121']
    scrape_interval: 30s

  # Migration-specific monitoring
  - job_name: 'migration-health'
    static_configs:
      - targets: ['localhost:9999'] # Custom migration health endpoint
    scrape_interval: 10s
EOF

    # Create alert rules.
    # Thresholds here mirror the ALERT_THRESHOLDS map at the top of this
    # script (85% CPU, 90% memory, 85% disk, 5s latency, 5% errors).
    cat > "$MONITORING_CONFIG_DIR/prometheus/alert_rules.yml" << 'EOF'
groups:
  - name: infrastructure_alerts
    rules:
      # Node health alerts
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 1m
        labels:
          severity: critical
          component: infrastructure
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 85
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 85% on {{ $labels.instance }} for more than 5 minutes."

      - alert: HighMemoryUsage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 10
        for: 5m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90% on {{ $labels.instance }} for more than 5 minutes."

      - alert: HighDiskUsage
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 15
        for: 10m
        labels:
          severity: warning
          component: infrastructure
        annotations:
          summary: "High disk usage on {{ $labels.instance }}"
          description: "Disk usage is above 85% on {{ $labels.instance }} for more than 10 minutes."

  - name: docker_alerts
    rules:
      # Docker Swarm alerts
      - alert: DockerNodeDown
        expr: docker_swarm_node_status != 1
        for: 2m
        labels:
          severity: critical
          component: docker
        annotations:
          summary: "Docker Swarm node {{ $labels.node_name }} is not ready"
          description: "Docker Swarm node {{ $labels.node_name }} has been not ready for more than 2 minutes."

      - alert: ServiceReplicasMismatch
        expr: docker_service_replicas != docker_service_replicas_desired
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "Service {{ $labels.service_name }} has replica mismatch"
          description: "Service {{ $labels.service_name }} has {{ $labels.docker_service_replicas }} replicas running, but {{ $labels.docker_service_replicas_desired }} desired."

      - alert: HighContainerRestarts
        expr: rate(docker_container_restart_count[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
          component: docker
        annotations:
          summary: "High container restart rate for {{ $labels.name }}"
          description: "Container {{ $labels.name }} is restarting frequently."

  - name: application_alerts
    rules:
      # Application health alerts
      - alert: ServiceDown
        expr: up{job="service-health"} == 0
        for: 1m
        labels:
          severity: critical
          component: application
        annotations:
          summary: "Service {{ $labels.instance }} is down"
          description: "Service {{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighResponseTime
        expr: http_request_duration_seconds{quantile="0.95"} > 5
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High response time for {{ $labels.instance }}"
          description: "95th percentile response time is above 5 seconds for {{ $labels.instance }}."

      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
          component: application
        annotations:
          summary: "High error rate for {{ $labels.instance }}"
          description: "Error rate is above 5% for {{ $labels.instance }}."

  - name: migration_alerts
    rules:
      # Migration-specific alerts
      - alert: MigrationStalled
        expr: migration_progress_percent == migration_progress_percent offset 10m
        for: 10m
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration appears to be stalled"
          description: "Migration progress has not changed in the last 10 minutes."

      - alert: MigrationError
        expr: migration_errors_total > 0
        for: 0s
        labels:
          severity: critical
          component: migration
        annotations:
          summary: "Migration errors detected"
          description: "{{ $value }} migration errors have been detected."
EOF

    # Create recording rules for performance metrics
    # (pre-computed series referenced by the Grafana dashboards).
    cat > "$MONITORING_CONFIG_DIR/prometheus/recording_rules.yml" << 'EOF'
groups:
  - name: performance_recording
    interval: 30s
    rules:
      # Node performance metrics
      - record: node:cpu_utilization_percent
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100)

      - record: node:memory_utilization_percent
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

      - record: node:disk_utilization_percent
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100

      # Container performance metrics
      - record: container:cpu_utilization_percent
        expr: rate(container_cpu_usage_seconds_total[5m]) * 100

      - record: container:memory_utilization_percent
        expr: (container_memory_usage_bytes / container_memory_limit_bytes) * 100

      # Application performance metrics
      - record: app:request_rate
        expr: rate(http_requests_total[5m])

      - record: app:error_rate_percent
        expr: (rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m])) * 100

      - record: app:response_time_95th_percentile
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
EOF

    log_success "Prometheus configuration created"
}
|
|
|
|
# Function to create Grafana dashboards
|
|
#######################################
# Write Grafana provisioning files (datasource + dashboard providers) and two
# JSON dashboards under $MONITORING_CONFIG_DIR/grafana on the local machine.
# Globals:   MONITORING_CONFIG_DIR (read)
# Outputs:   four files; progress via log_step/log_success
#######################################
create_grafana_dashboards() {
    log_step "Creating Grafana dashboards..."

    mkdir -p "$MONITORING_CONFIG_DIR/grafana/dashboards"
    mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards"
    mkdir -p "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources"

    # Create datasource provisioning
    # ('prometheus:9090' resolves via the stack's overlay network)
    cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/datasources/prometheus.yml" << 'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
EOF

    # Create dashboard provisioning
    cat > "$MONITORING_CONFIG_DIR/grafana/provisioning/dashboards/dashboards.yml" << 'EOF'
apiVersion: 1

providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
EOF

    # Create Infrastructure Overview dashboard.
    # Panel queries use the node:* recording rules defined in
    # recording_rules.yml by create_prometheus_config.
    cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/infrastructure-overview.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Infrastructure Overview",
    "description": "Home Lab Infrastructure Monitoring",
    "tags": ["infrastructure", "overview"],
    "timezone": "browser",
    "panels": [
      {
        "title": "System Load",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:cpu_utilization_percent)",
            "legendFormat": "Average CPU Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 70},
                {"color": "red", "value": 85}
              ]
            }
          }
        }
      },
      {
        "title": "Memory Usage",
        "type": "stat",
        "targets": [
          {
            "expr": "avg(node:memory_utilization_percent)",
            "legendFormat": "Average Memory Usage"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 75},
                {"color": "red", "value": 90}
              ]
            }
          }
        }
      },
      {
        "title": "Service Health",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(up{job=\"service-health\"})",
            "legendFormat": "Services Up"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 3},
                {"color": "green", "value": 4}
              ]
            }
          }
        }
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "10s"
  }
}
EOF

    # Create Migration Health dashboard.
    # Queries the custom migration_* metrics served by the exporter created in
    # create_migration_health_exporter.
    cat > "$MONITORING_CONFIG_DIR/grafana/dashboards/migration-health.json" << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Migration Health",
    "description": "Real-time migration monitoring and health checks",
    "tags": ["migration", "health"],
    "panels": [
      {
        "title": "Migration Progress",
        "type": "gauge",
        "targets": [
          {
            "expr": "migration_progress_percent",
            "legendFormat": "Progress %"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 25},
                {"color": "green", "value": 75}
              ]
            }
          }
        }
      },
      {
        "title": "Migration Errors",
        "type": "stat",
        "targets": [
          {
            "expr": "migration_errors_total",
            "legendFormat": "Total Errors"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1},
                {"color": "red", "value": 5}
              ]
            }
          }
        }
      },
      {
        "title": "Data Transfer Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(migration_data_transferred_bytes[5m])",
            "legendFormat": "Transfer Rate"
          }
        ],
        "yAxes": [
          {
            "unit": "bytes"
          }
        ]
      }
    ],
    "time": {
      "from": "now-2h",
      "to": "now"
    },
    "refresh": "5s"
  }
}
EOF

    log_success "Grafana dashboards created"
}
|
|
|
|
# Function to create Alertmanager configuration
|
|
#######################################
# Write the Alertmanager routing configuration under
# $ALERTING_CONFIG_DIR/alertmanager on the local machine.
# Routing: critical alerts and migration alerts get dedicated webhook
# receivers with tighter group_wait/repeat intervals; everything else goes to
# the default 'web.hook' receiver.
# Globals:   ALERTING_CONFIG_DIR (read)
#######################################
create_alertmanager_config() {
    log_step "Creating Alertmanager configuration..."

    mkdir -p "$ALERTING_CONFIG_DIR/alertmanager"

    # Create Alertmanager configuration
    # (quoted 'EOF' so the {{ ... }} Go templates are not touched by the shell)
    cat > "$ALERTING_CONFIG_DIR/alertmanager/alertmanager.yml" << 'EOF'
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@homelab.local'
  smtp_auth_username: ''
  smtp_auth_password: ''

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 5s
      repeat_interval: 15m
    - match:
        component: migration
      receiver: 'migration-alerts'
      group_wait: 0s
      repeat_interval: 5m

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:9093/webhook'

  - name: 'critical-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/critical'
        send_resolved: true
    # Uncomment and configure email if SMTP is available
    # email_configs:
    #   - to: 'admin@homelab.local'
    #     subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    #     body: |
    #       {{ range .Alerts }}
    #       Alert: {{ .Annotations.summary }}
    #       Description: {{ .Annotations.description }}
    #       {{ end }}

  - name: 'migration-alerts'
    webhook_configs:
      - url: 'http://localhost:9093/webhook/migration'
        send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
EOF

    log_success "Alertmanager configuration created"
}
|
|
|
|
# Function to create custom migration health exporter
|
|
#######################################
# Write a stdlib-only Python exporter that serves migration metrics in
# Prometheus text format on 0.0.0.0:9999/metrics (matches the
# 'migration-health' scrape job created in create_prometheus_config).
# Globals:   MONITORING_CONFIG_DIR (read)
#######################################
create_migration_health_exporter() {
    log_step "Creating migration health exporter..."

    mkdir -p "$MONITORING_CONFIG_DIR/exporters"

    # Create migration health exporter script
    # (quoted 'EOF' keeps the Python source verbatim — no shell expansion)
    cat > "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py" << 'EOF'
#!/usr/bin/env python3
"""
Migration Health Exporter
Exports custom metrics for migration monitoring
"""

import time
import json
import os
import subprocess
import glob
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import urlparse, parse_qs

class MigrationHealthExporter:
    def __init__(self):
        self.migration_dir = "/opt/migration"
        self.log_dir = f"{self.migration_dir}/logs"
        self.backup_dir = f"{self.migration_dir}/backups"
        self.checkpoint_dir = f"{self.log_dir}/checkpoints"

    def get_migration_progress(self):
        """Calculate migration progress based on checkpoints"""
        try:
            if not os.path.exists(self.checkpoint_dir):
                return 0.0

            checkpoints = glob.glob(f"{self.checkpoint_dir}/*.checkpoint")
            if not checkpoints:
                return 0.0

            # Define expected checkpoints in order
            expected_checkpoints = [
                "snapshot_start", "docker_collected", "database_dumps_complete",
                "config_backups_complete", "security_hardening_complete",
                "swarm_optimization_complete", "migration_complete"
            ]

            completed_checkpoints = []
            for checkpoint_file in checkpoints:
                with open(checkpoint_file, 'r') as f:
                    content = f.read()
                    if "CHECKPOINT_NAME=" in content:
                        name = content.split("CHECKPOINT_NAME=")[1].split("\n")[0]
                        completed_checkpoints.append(name)

            progress = (len(completed_checkpoints) / len(expected_checkpoints)) * 100
            return min(progress, 100.0)

        except Exception:
            return 0.0

    def get_migration_errors(self):
        """Count migration errors from logs"""
        try:
            error_count = 0
            if os.path.exists(self.log_dir):
                for log_file in glob.glob(f"{self.log_dir}/errors_*.log"):
                    with open(log_file, 'r') as f:
                        error_count += len([line for line in f if '[ERROR]' in line or '[CRITICAL]' in line])
            return error_count
        except Exception:
            return 0

    def get_backup_status(self):
        """Check backup health"""
        try:
            if not os.path.exists(self.backup_dir):
                return {"healthy": 0, "total": 0}

            backups = glob.glob(f"{self.backup_dir}/**/*.tar.gz", recursive=True)
            healthy_backups = 0

            for backup in backups:
                # Simple health check - file size > 1MB
                if os.path.getsize(backup) > 1024 * 1024:
                    healthy_backups += 1

            return {"healthy": healthy_backups, "total": len(backups)}
        except Exception:
            return {"healthy": 0, "total": 0}

    def get_service_health(self):
        """Check critical service health"""
        services = {
            "traefik": "docker ps | grep traefik",
            "postgres": "docker ps | grep postgres",
            "redis": "docker ps | grep redis"
        }

        healthy_services = 0
        for service, check_cmd in services.items():
            try:
                result = subprocess.run(check_cmd, shell=True, capture_output=True)
                if result.returncode == 0 and result.stdout.strip():
                    healthy_services += 1
            except Exception:
                pass

        return {"healthy": healthy_services, "total": len(services)}

    def generate_metrics(self):
        """Generate Prometheus metrics"""
        progress = self.get_migration_progress()
        errors = self.get_migration_errors()
        backup_status = self.get_backup_status()
        service_health = self.get_service_health()

        metrics = f"""# HELP migration_progress_percent Current migration progress percentage
# TYPE migration_progress_percent gauge
migration_progress_percent {progress}

# HELP migration_errors_total Total number of migration errors
# TYPE migration_errors_total counter
migration_errors_total {errors}

# HELP migration_backup_healthy Number of healthy backups
# TYPE migration_backup_healthy gauge
migration_backup_healthy {backup_status['healthy']}

# HELP migration_backup_total Total number of backups
# TYPE migration_backup_total gauge
migration_backup_total {backup_status['total']}

# HELP migration_services_healthy Number of healthy critical services
# TYPE migration_services_healthy gauge
migration_services_healthy {service_health['healthy']}

# HELP migration_services_total Total number of critical services
# TYPE migration_services_total gauge
migration_services_total {service_health['total']}

# HELP migration_timestamp_seconds Timestamp of last metric update
# TYPE migration_timestamp_seconds gauge
migration_timestamp_seconds {time.time()}
"""
        return metrics

class MetricsHandler(BaseHTTPRequestHandler):
    def __init__(self, exporter, *args, **kwargs):
        self.exporter = exporter
        super().__init__(*args, **kwargs)

    def do_GET(self):
        if self.path == '/metrics':
            self.send_response(200)
            self.send_header('Content-type', 'text/plain; charset=utf-8')
            self.end_headers()
            metrics = self.exporter.generate_metrics()
            self.wfile.write(metrics.encode('utf-8'))
        else:
            self.send_response(404)
            self.end_headers()

def main():
    exporter = MigrationHealthExporter()
    handler = lambda *args, **kwargs: MetricsHandler(exporter, *args, **kwargs)

    server = HTTPServer(('0.0.0.0', 9999), handler)
    print("Migration health exporter starting on port 9999")
    server.serve_forever()

if __name__ == '__main__':
    main()
EOF

    chmod +x "$MONITORING_CONFIG_DIR/exporters/migration_health_exporter.py"

    log_success "Migration health exporter created"
}
|
|
|
|
# Function to deploy monitoring stack
|
|
#######################################
# Write the Swarm stack file, distribute the generated configs, and deploy the
# 'monitoring' stack (Prometheus, Alertmanager, Grafana, migration exporter).
# Globals:   MONITORING_CONFIG_DIR, ALERTING_CONFIG_DIR,
#            MONITORING_HOST, MANAGER_HOST (all read)
# Returns:   1 if the stack deploy fails
#######################################
deploy_monitoring_stack() {
    log_step "Deploying comprehensive monitoring stack..."

    # Create monitoring stack configuration.
    # Quoted 'EOF': ${DOMAIN} stays literal here and is substituted by
    # `docker stack deploy` from the environment on the manager at deploy time.
    cat > "$MONITORING_CONFIG_DIR/monitoring-stack.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - prometheus-data:/prometheus
      - prometheus-config:/etc/prometheus
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  alertmanager:
    image: prom/alertmanager:latest
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://alertmanager:9093'
    volumes:
      - alertmanager-data:/alertmanager
      - alertmanager-config:/etc/alertmanager
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 512M
          cpus: '0.5'
        reservations:
          memory: 256M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

  grafana:
    image: grafana/grafana:latest
    environment:
      - GF_SECURITY_ADMIN_PASSWORD_FILE=/run/secrets/grafana_admin_password
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_DOMAIN=grafana.${DOMAIN}
      - GF_SERVER_ROOT_URL=https://grafana.${DOMAIN}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana-data:/var/lib/grafana
      - grafana-config:/etc/grafana/provisioning
    networks:
      - monitoring-zone
      - public-zone
    secrets:
      - grafana_admin_password
    deploy:
      placement:
        constraints:
          - node.labels.role.monitoring == true
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN}`)"
        - "traefik.http.routers.grafana.entrypoints=websecure"
        - "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"

  migration-health-exporter:
    image: python:3.9-alpine
    command:
      - /bin/sh
      - -c
      - |
        pip install --no-cache-dir requests
        python /app/migration_health_exporter.py
    volumes:
      - migration-exporter:/app
      - migration-logs:/opt/migration:ro
    networks:
      - monitoring-zone
    deploy:
      placement:
        constraints:
          - node.role == manager
      resources:
        limits:
          memory: 256M
          cpus: '0.25'
        reservations:
          memory: 128M
          cpus: '0.1'
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3

secrets:
  grafana_admin_password:
    external: true

volumes:
  prometheus-data:
    driver: local
  prometheus-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/prometheus
  alertmanager-data:
    driver: local
  alertmanager-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/alerting/alertmanager
  grafana-data:
    driver: local
  grafana-config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/grafana/provisioning
  migration-exporter:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration/configs/monitoring/exporters
  migration-logs:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/migration

networks:
  monitoring-zone:
    external: true
  public-zone:
    external: true
EOF

    # Copy configurations to every host that needs them.
    # BUG FIX: the stack is deployed via ssh on $MANAGER_HOST (cd into
    # $MONITORING_CONFIG_DIR there), but the configs were previously copied
    # only to $MONITORING_HOST — so monitoring-stack.yml never existed on the
    # manager and the deploy failed. Copy to both hosts; the bind-mounted
    # config directories must also exist on whichever node runs each service.
    log_info "Deploying configuration files..."
    local host
    for host in "$MONITORING_HOST" "$MANAGER_HOST"; do
        # NOTE(review): sudo-created dirs may not be writable by the scp
        # user — confirm ownership/permissions on the target hosts.
        ssh "$host" "sudo mkdir -p /opt/migration/configs/monitoring"
        scp -r "$MONITORING_CONFIG_DIR" "$host:/opt/migration/configs/"
        scp -r "$ALERTING_CONFIG_DIR" "$host:/opt/migration/configs/"
    done

    # Deploy the monitoring stack from the Swarm manager
    log_info "Deploying monitoring services..."
    if ssh "$MANAGER_HOST" "cd $MONITORING_CONFIG_DIR && docker stack deploy -c monitoring-stack.yml monitoring"; then
        log_success "Monitoring stack deployed successfully"
    else
        log_error "Failed to deploy monitoring stack"
        return 1
    fi

    # Wait for services to be ready (poll up to 120s, every 10s)
    wait_for_service "Prometheus" "curl -f http://$MONITORING_HOST:9090/-/healthy" 120 10
    wait_for_service "Grafana" "curl -f http://$MONITORING_HOST:3000/api/health" 120 10

    log_success "Monitoring stack deployment completed"
}
|
|
|
|
# Main execution function
|
|
#######################################
# Entry point: dispatch on the requested action.
# Arguments: $1 - action: full (default) | config-only | deploy-only | help
# Globals:   MONITORING_HOST, MANAGER_HOST (read, for the summary URLs)
#######################################
main() {
    local action=${1:-"full"}

    # Register cleanup and rollback functions so a failure mid-setup leaves
    # no half-deployed monitoring stack behind.
    register_cleanup cleanup_monitoring
    register_rollback rollback_monitoring

    # FIX: quote the selector — an unquoted $action would be word-split and
    # glob-expanded before matching.
    case "$action" in
        "full")
            log_step "Setting up comprehensive monitoring system..."

            # Validate prerequisites
            validate_prerequisites ssh curl jq python3

            # Create configurations
            create_prometheus_config
            create_grafana_dashboards
            create_alertmanager_config
            create_migration_health_exporter

            # Deploy monitoring stack
            deploy_monitoring_stack

            log_success "✅ Comprehensive monitoring setup completed!"
            log_info "📊 Prometheus: http://$MONITORING_HOST:9090"
            log_info "📈 Grafana: http://$MONITORING_HOST:3000"
            log_info "🚨 Alertmanager: http://$MONITORING_HOST:9093"
            log_info "🔍 Migration Health: http://$MANAGER_HOST:9999/metrics"
            ;;

        "config-only")
            create_prometheus_config
            create_grafana_dashboards
            create_alertmanager_config
            create_migration_health_exporter
            ;;

        "deploy-only")
            deploy_monitoring_stack
            ;;

        "help"|*)
            # Unknown actions fall through to help (unquoted EOF: $0 expands).
            cat << EOF
Comprehensive Monitoring Setup

Usage: $0 <action>

Actions:
    full        - Complete monitoring setup (default)
    config-only - Only create configurations
    deploy-only - Only deploy services
    help        - Show this help

Examples:
    $0 full
    $0 config-only
EOF
            ;;
    esac
}
|
|
|
|
# Execute main function with all CLI arguments (action defaults to "full")
main "$@"