HomeAudit/backups/stacks-pre-secrets-20250828-092958/comprehensive-monitoring.yml

version: '3.9'

services:
  # Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - prometheus_data:/prometheus
      - prometheus_config:/etc/prometheus
    networks:
      - monitoring-network
      - traefik-public
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      placement:
        constraints:
          - "node.labels.role==monitor"
      labels:
        - traefik.enable=true
        - traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)
        - traefik.http.routers.prometheus.entrypoints=websecure
        - traefik.http.routers.prometheus.tls=true
        - traefik.http.services.prometheus.loadbalancer.server.port=9090

  # Grafana for visualization
  grafana:
    image: grafana/grafana:10.1.2
    environment:
      - GF_SECURITY_ADMIN_PASSWORD_FILE=/run/secrets/grafana_admin_password
      - GF_PROVISIONING_PATH=/etc/grafana/provisioning
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
      - GF_FEATURE_TOGGLES_ENABLE=publicDashboards
    secrets:
      - grafana_admin_password
    volumes:
      - grafana_data:/var/lib/grafana
      - grafana_config:/etc/grafana/provisioning
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - "node.labels.role==monitor"
      labels:
        - traefik.enable=true
        - traefik.http.routers.grafana.rule=Host(`grafana.localhost`)
        - traefik.http.routers.grafana.entrypoints=websecure
        - traefik.http.routers.grafana.tls=true
        - traefik.http.services.grafana.loadbalancer.server.port=3000

  # AlertManager for alerting
  alertmanager:
    image: prom/alertmanager:v0.26.0
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    volumes:
      - alertmanager_data:/alertmanager
      - alertmanager_config:/etc/alertmanager
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 512M
          cpus: '0.25'
        reservations:
          memory: 256M
          cpus: '0.1'
      placement:
        constraints:
          - "node.labels.role==monitor"
      labels:
        - traefik.enable=true
        - traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)
        - traefik.http.routers.alertmanager.entrypoints=websecure
        - traefik.http.routers.alertmanager.tls=true
        - traefik.http.services.alertmanager.loadbalancer.server.port=9093

  # Node Exporter for system metrics (deploy on all nodes)
  node-exporter:
    image: prom/node-exporter:v1.6.1
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.textfile.directory=/var/lib/node_exporter/textfile_collector'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
    networks:
      - monitoring-network
    ports:
      - "9100:9100"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9100/metrics"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'

  # cAdvisor for container metrics
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    networks:
      - monitoring-network
    ports:
      - "8080:8080"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.1'

  # Business metrics collector
  business-metrics:
    image: alpine:3.18
    command: |
      sh -c "
        apk add --no-cache curl jq python3 py3-pip &&
        pip3 install requests pyyaml prometheus_client &&
        while true; do
          echo '[$(date)] Collecting business metrics...' &&
          # Immich metrics
          curl -s http://immich_server:3001/api/server-info/stats > /tmp/immich-stats.json 2>/dev/null || echo '{}' > /tmp/immich-stats.json &&
          # Nextcloud metrics
          curl -s -u admin:\$NEXTCLOUD_ADMIN_PASS http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json > /tmp/nextcloud-stats.json 2>/dev/null || echo '{}' > /tmp/nextcloud-stats.json &&
          # Home Assistant metrics
          curl -s -H 'Authorization: Bearer \$HA_TOKEN' http://homeassistant:8123/api/states > /tmp/ha-stats.json 2>/dev/null || echo '[]' > /tmp/ha-stats.json &&
          # Process and expose metrics via HTTP for Prometheus scraping
          python3 /app/business_metrics_processor.py &&
          sleep 300
        done
      "
    environment:
      - NEXTCLOUD_ADMIN_PASS_FILE=/run/secrets/nextcloud_admin_password
      - HA_TOKEN_FILE=/run/secrets/ha_api_token
    secrets:
      - nextcloud_admin_password
      - ha_api_token
    networks:
      - monitoring-network
      - traefik-public
      - database-network
    ports:
      - "8888:8888"
    volumes:
      - business_metrics_scripts:/app
    deploy:
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'
      placement:
        constraints:
          - "node.labels.role==monitor"

  # Loki for log aggregation
  loki:
    image: grafana/loki:2.9.0
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - loki_data:/tmp/loki
      - loki_config:/etc/loki
    networks:
      - monitoring-network
    ports:
      - "3100:3100"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - "node.labels.role==monitor"

  # Promtail for log collection
  promtail:
    image: grafana/promtail:2.9.0
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - promtail_config:/etc/promtail
    networks:
      - monitoring-network
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9080/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'

volumes:
  prometheus_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/data
  prometheus_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/config
  grafana_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/data
  grafana_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/config
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local
  node_exporter_textfiles:
    driver: local
  business_metrics_scripts:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/business-metrics
  loki_data:
    driver: local
  loki_config:
    driver: local
  promtail_config:
    driver: local

secrets:
  grafana_admin_password:
    external: true
  nextcloud_admin_password:
    external: true
  ha_api_token:
    external: true

networks:
  monitoring-network:
    external: true
  traefik-public:
    external: true
  database-network:
    external: true