# HomeAudit/stacks/monitoring/comprehensive-monitoring.yml

# Compose file format version. The services below carry `deploy:` sections
# (resources, placement constraints, global mode).
version: '3.9'
# Monitoring stack services: metrics, dashboards, alerting, exporters and logs.
services:
  # Prometheus server. Scheduled on nodes labelled role=monitor, published on
  # port 9090 and routed by Traefik as prometheus.localhost.
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
    # Configuration file and TSDB directory; both paths are backed by the
    # volumes mounted below.
    - --config.file=/etc/prometheus/prometheus.yml
    - --storage.tsdb.path=/prometheus
    - --web.console.libraries=/etc/prometheus/console_libraries
    - --web.console.templates=/etc/prometheus/consoles
    # Keep 30 days of samples.
    - --storage.tsdb.retention.time=30d
    # NOTE(review): the lifecycle and admin HTTP APIs are enabled while port
    # 9090 is also published below - confirm that exposure is intended.
    - --web.enable-lifecycle
    - --web.enable-admin-api
    networks:
    - monitoring-network
    - traefik-public
    ports:
    - '9090:9090'
    volumes:
    - prometheus_data:/prometheus
    - prometheus_config:/etc/prometheus
    # Probe the /-/healthy endpoint with wget inside the container.
    healthcheck:
      test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:9090/-/healthy
      start_period: 30s
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      placement:
        constraints:
        - node.labels.role==monitor
      resources:
        reservations:
          cpus: '0.5'
          memory: 1G
        limits:
          cpus: '1.0'
          memory: 2G
      # Traefik routing: TLS router on the websecure entrypoint -> port 9090.
      labels:
      - traefik.enable=true
      - 'traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)'
      - traefik.http.routers.prometheus.entrypoints=websecure
      - traefik.http.routers.prometheus.tls=true
      - traefik.http.services.prometheus.loadbalancer.server.port=9090
  # Grafana dashboards. Scheduled on nodes labelled role=monitor and routed by
  # Traefik as grafana.localhost (container port 3000).
  grafana:
    image: grafana/grafana:10.1.2
    environment:
      GF_PROVISIONING_PATH: /etc/grafana/provisioning
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
      GF_FEATURE_TOGGLES_ENABLE: publicDashboards
      # Grafana reads a setting from a file when the variable name ends in a
      # double-underscore `__FILE` suffix. The previous name
      # (GF_SECURITY_ADMIN_PASSWORD_FILE_FILE) does not match that form, so the
      # secret was never applied.
      # NOTE(review): the `grafana_admin_password` secret is mounted as well;
      # confirm which of the two secrets holds the real admin password.
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/gf_security_admin_password_file
    secrets:
    - grafana_admin_password
    - gf_security_admin_password_file
    volumes:
    - grafana_data:/var/lib/grafana
    - grafana_config:/etc/grafana/provisioning
    networks:
    - monitoring-network
    - traefik-public
    # Probe Grafana's /api/health endpoint; allow 60s for startup.
    healthcheck:
      test:
      - CMD-SHELL
      - curl -f http://localhost:3000/api/health || exit 1
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
        - node.labels.role==monitor
      # Traefik routing: TLS router on the websecure entrypoint -> port 3000.
      labels:
      - traefik.enable=true
      - 'traefik.http.routers.grafana.rule=Host(`grafana.localhost`)'
      - traefik.http.routers.grafana.entrypoints=websecure
      - traefik.http.routers.grafana.tls=true
      - traefik.http.services.grafana.loadbalancer.server.port=3000
  # Alertmanager. Scheduled on nodes labelled role=monitor and routed by
  # Traefik as alerts.localhost (container port 9093).
  alertmanager:
    image: prom/alertmanager:v0.26.0
    command:
    - --config.file=/etc/alertmanager/alertmanager.yml
    - --storage.path=/alertmanager
    # NOTE(review): the external URL is localhost:9093 while Traefik serves
    # this service as alerts.localhost - confirm which one is intended.
    - --web.external-url=http://localhost:9093
    networks:
    - monitoring-network
    - traefik-public
    volumes:
    - alertmanager_data:/alertmanager
    - alertmanager_config:/etc/alertmanager
    # Probe the /-/healthy endpoint with wget inside the container.
    healthcheck:
      test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:9093/-/healthy
      start_period: 30s
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      placement:
        constraints:
        - node.labels.role==monitor
      resources:
        reservations:
          cpus: '0.1'
          memory: 256M
        limits:
          cpus: '0.25'
          memory: 512M
      # Traefik routing: TLS router on the websecure entrypoint -> port 9093.
      labels:
      - traefik.enable=true
      - 'traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)'
      - traefik.http.routers.alertmanager.entrypoints=websecure
      - traefik.http.routers.alertmanager.tls=true
      - traefik.http.services.alertmanager.loadbalancer.server.port=9093
  # node-exporter: host-level metrics, one task per node (mode: global).
  # The host's /proc, /sys and / are mounted read-only into the container.
  node-exporter:
    image: prom/node-exporter:v1.6.1
    command:
    - --path.procfs=/host/proc
    - --path.sysfs=/host/sys
    # Resolve filesystem mount points against the host root mounted at /rootfs
    # (see volumes); without this flag the mount is unused and filesystem
    # metrics are taken from the container's own view.
    - --path.rootfs=/rootfs
    # `mount-points-exclude` replaces the deprecated `ignored-mount-points`
    # flag. `$$` is the Compose escape for a literal `$` in the regex.
    - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
    - --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
    volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
    - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
    networks:
    - monitoring-network
    # NOTE(review): with mode: global and an ingress-published port, a request
    # to <node>:9100 may be answered by another node's task - confirm how
    # Prometheus addresses the individual exporters.
    ports:
    - '9100:9100'
    # Healthy when the /metrics endpoint answers.
    healthcheck:
      test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:9100/metrics
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
  # cAdvisor: container metrics, one task per node (mode: global), published
  # on port 8080. Host paths are mounted read-only.
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    networks:
    - monitoring-network
    ports:
    - '8080:8080'
    volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /dev/disk/:/dev/disk:ro
    # Probe the /healthz endpoint with wget inside the container.
    healthcheck:
      test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:8080/healthz
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        reservations:
          cpus: '0.1'
          memory: 256M
        limits:
          cpus: '0.3'
          memory: 512M
  # business-metrics: every 300 seconds, fetches stats from Immich, Nextcloud
  # and Home Assistant into /tmp/*.json and then runs the processor script
  # from the /app volume. Scheduled on nodes labelled role=monitor.
  business-metrics:
    image: alpine:3.18
    # Exec-form command with a literal block scalar so the script is readable
    # and is not re-split by Compose. Every shell `$` is written as `$$`
    # because Compose interpolates `$...` itself at deploy time; an unescaped
    # `$(date)` is an invalid interpolation and `\$VAR` is not an escape.
    command:
    - sh
    - -c
    - |
      apk add --no-cache curl jq python3 py3-pip &&
      pip3 install requests pyyaml prometheus_client || exit 1

      # Credentials arrive as Docker secret files; only the file paths are in
      # the environment, so read the values here.
      NEXTCLOUD_ADMIN_PASS="$$(cat "$$NEXTCLOUD_ADMIN_PASS_FILE")"
      HA_TOKEN="$$(cat "$$HA_TOKEN_FILE_FILE")"

      while true; do
        echo "[$$(date)] Collecting business metrics..."

        # Immich metrics (fall back to an empty object on failure)
        curl -s http://immich_server:3001/api/server-info/stats \
          > /tmp/immich-stats.json 2>/dev/null \
          || echo '{}' > /tmp/immich-stats.json

        # Nextcloud metrics (fall back to an empty object on failure)
        curl -s -u "admin:$$NEXTCLOUD_ADMIN_PASS" \
          'http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json' \
          > /tmp/nextcloud-stats.json 2>/dev/null \
          || echo '{}' > /tmp/nextcloud-stats.json

        # Home Assistant metrics (fall back to an empty list on failure)
        curl -s -H "Authorization: Bearer $$HA_TOKEN" \
          http://homeassistant:8123/api/states \
          > /tmp/ha-stats.json 2>/dev/null \
          || echo '[]' > /tmp/ha-stats.json

        # Process and expose metrics via HTTP for Prometheus scraping.
        # A failing run is logged, and the sleep below still happens, so a
        # broken processor cannot turn this loop into a busy loop.
        python3 /app/business_metrics_processor.py \
          || echo "[$$(date)] business_metrics_processor.py failed" >&2

        sleep 300
      done
    environment:
      # Paths of the mounted secrets; the script above reads both files.
      # NOTE(review): `HA_TOKEN_FILE_FILE` looks like a doubled suffix; the
      # name is kept unchanged in case the processor script also reads it.
      NEXTCLOUD_ADMIN_PASS_FILE: /run/secrets/nextcloud_admin_password
      HA_TOKEN_FILE_FILE: /run/secrets/ha_token_file
    secrets:
    - nextcloud_admin_password
    - ha_api_token
    - ha_token_file
    networks:
    - monitoring-network
    - traefik-public
    - database-network
    ports:
    - '8888:8888'
    volumes:
    - business_metrics_scripts:/app
    deploy:
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'
      placement:
        constraints:
        - node.labels.role==monitor
  # Loki log store. Scheduled on nodes labelled role=monitor, published on
  # port 3100.
  loki:
    image: grafana/loki:2.9.0
    # Single argument, written in list form.
    command:
    - -config.file=/etc/loki/local-config.yaml
    networks:
    - monitoring-network
    ports:
    - '3100:3100'
    volumes:
    - loki_data:/tmp/loki
    - loki_config:/etc/loki
    # Probe the /ready endpoint; allow 60s for startup.
    healthcheck:
      test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:3100/ready
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      placement:
        constraints:
        - node.labels.role==monitor
      resources:
        reservations:
          cpus: '0.25'
          memory: 512M
        limits:
          cpus: '0.5'
          memory: 1G
  # Promtail log shipper, one task per node (mode: global). Host log
  # directories are mounted read-only.
  promtail:
    image: grafana/promtail:2.9.0
    # Single argument, written in list form.
    command:
    - -config.file=/etc/promtail/config.yml
    networks:
    - monitoring-network
    volumes:
    - /var/log:/var/log:ro
    - /var/lib/docker/containers:/var/lib/docker/containers:ro
    - promtail_config:/etc/promtail
    # Probe the /ready endpoint on port 9080.
    # NOTE(review): confirm that this image actually ships `wget`; if it does
    # not, this check can never pass.
    healthcheck:
      test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:9080/ready
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        reservations:
          cpus: '0.05'
          memory: 128M
        limits:
          cpus: '0.2'
          memory: 256M
# Named volumes used by the services above.
volumes:
  # Volumes bound to fixed host directories under /opt/monitoring
  # (local driver with `type: none` / `o: bind`).
  prometheus_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/data
      o: bind
      type: none
  prometheus_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/config
      o: bind
      type: none
  grafana_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/data
      o: bind
      type: none
  grafana_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/config
      o: bind
      type: none
  business_metrics_scripts:
    driver: local
    driver_opts:
      device: /opt/monitoring/business-metrics
      o: bind
      type: none
  # Plain local volumes with no driver options.
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local
  node_exporter_textfiles:
    driver: local
  loki_data:
    driver: local
  loki_config:
    driver: local
  promtail_config:
    driver: local
# Secrets referenced by the services above. All are `external`, i.e. they are
# not defined by this file.
secrets:
  # Mounted into the grafana service.
  grafana_admin_password:
    external: true
  gf_security_admin_password_file:
    external: true
  # Mounted into the business-metrics service.
  nextcloud_admin_password:
    external: true
  ha_api_token:
    external: true
  ha_token_file:
    external: true
# Networks attached by the services above. All are `external`, i.e. they are
# not created by this file.
networks:
  # Joined by every service in this file.
  monitoring-network:
    external: true
  # Joined by the Traefik-labelled services and by business-metrics.
  traefik-public:
    external: true
  # Joined only by business-metrics.
  database-network:
    external: true