---
# Monitoring stack: Prometheus, Grafana, Alertmanager, node-exporter,
# cAdvisor, a custom business-metrics collector, Loki and Promtail.
#
# The file uses Swarm-only features (`deploy.placement`, `mode: global`,
# external secrets), so it is presumably deployed with `docker stack deploy`.
# Traefik labels are placed under `deploy.labels` on that assumption
# (the Swarm provider reads service labels) - TODO confirm.
#
# NOTE(review): every routed service is attached to two networks. Traefik may
# need an explicit network hint label to choose `traefik-public`; the exact
# label name depends on the Traefik version in use - verify.
version: '3.9'

services:
  # Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      # NOTE(review): the lifecycle and admin APIs allow reload/shutdown and
      # TSDB deletion over plain HTTP. Port 9090 is also published below, so
      # these endpoints are reachable without going through Traefik - confirm
      # that this exposure is intended.
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - prometheus_data:/prometheus
      - prometheus_config:/etc/prometheus
    networks:
      - monitoring-network
      - traefik-public
    ports:
      - '9090:9090'
    healthcheck:
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      placement:
        constraints:
          - 'node.labels.role==monitor'
      labels:
        - 'traefik.enable=true'
        - 'traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)'
        - 'traefik.http.routers.prometheus.entrypoints=websecure'
        - 'traefik.http.routers.prometheus.tls=true'
        - 'traefik.http.services.prometheus.loadbalancer.server.port=9090'

  # Grafana for visualization
  grafana:
    image: grafana/grafana:10.1.2
    environment:
      # Grafana resolves `GF_*__FILE`-style variables itself; this one points
      # at the Swarm secret mounted under /run/secrets.
      # NOTE(review): Grafana documents the suffix as `__FILE` (double
      # underscore); this file uses `_FILE` - verify the password is applied.
      - GF_SECURITY_ADMIN_PASSWORD_FILE=/run/secrets/grafana_admin_password
      - GF_PROVISIONING_PATH=/etc/grafana/provisioning
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
      - GF_FEATURE_TOGGLES_ENABLE=publicDashboards
    secrets:
      - grafana_admin_password
    volumes:
      - grafana_data:/var/lib/grafana
      - grafana_config:/etc/grafana/provisioning
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - 'node.labels.role==monitor'
      labels:
        - 'traefik.enable=true'
        - 'traefik.http.routers.grafana.rule=Host(`grafana.localhost`)'
        - 'traefik.http.routers.grafana.entrypoints=websecure'
        - 'traefik.http.routers.grafana.tls=true'
        - 'traefik.http.services.grafana.loadbalancer.server.port=3000'

  # AlertManager for alerting
  alertmanager:
    image: prom/alertmanager:v0.26.0
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      # NOTE(review): the external URL says http://localhost:9093 while the
      # Traefik router below serves this service at `alerts.localhost` over
      # TLS - confirm which URL links in notifications should use.
      - '--web.external-url=http://localhost:9093'
    volumes:
      - alertmanager_data:/alertmanager
      - alertmanager_config:/etc/alertmanager
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 512M
          cpus: '0.25'
        reservations:
          memory: 256M
          cpus: '0.1'
      placement:
        constraints:
          - 'node.labels.role==monitor'
      labels:
        - 'traefik.enable=true'
        - 'traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)'
        - 'traefik.http.routers.alertmanager.entrypoints=websecure'
        - 'traefik.http.routers.alertmanager.tls=true'
        - 'traefik.http.services.alertmanager.loadbalancer.server.port=9093'

  # Node Exporter for system metrics (deploy on all nodes)
  node-exporter:
    image: prom/node-exporter:v1.6.1
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # The host root is bind-mounted at /rootfs below; without this flag the
      # mount is unused and filesystem stats are resolved against paths
      # inside the container instead of the host.
      - '--path.rootfs=/rootfs'
      # `$$` is a literal `$` escaped for Compose variable interpolation.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.textfile.directory=/var/lib/node_exporter/textfile_collector'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
    networks:
      - monitoring-network
    # NOTE(review): this is a global service with a port published in the
    # default mode; if that goes through the Swarm routing mesh, a request to
    # <node>:9100 may be answered by a different node's exporter - confirm
    # how Prometheus targets the individual tasks.
    ports:
      - '9100:9100'
    healthcheck:
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9100/metrics"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'

  # cAdvisor for container metrics
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    networks:
      - monitoring-network
    ports:
      - '8080:8080'
    healthcheck:
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.1'

  # Business metrics collector: every 5 minutes it dumps stats from Immich,
  # Nextcloud and Home Assistant into /tmp/*.json and then runs
  # /app/business_metrics_processor.py (supplied via the bind-mounted volume).
  business-metrics:
    image: alpine:3.18
    # Exec-form command so the script needs no outer shell quoting.
    # Every `$` meant for the container shell is written `$$`; a single `$`
    # would be consumed by Compose variable interpolation at deploy time.
    command:
      - sh
      - '-c'
      - |
        # Abort (and let the orchestrator restart the task) if setup fails.
        set -e
        apk add --no-cache curl jq python3 py3-pip
        pip3 install requests pyyaml prometheus_client

        # Only the *_FILE variables are set in `environment`; the plain
        # alpine image does not resolve them, so read the secrets here.
        NEXTCLOUD_ADMIN_PASS="$$(cat "$$NEXTCLOUD_ADMIN_PASS_FILE")"
        HA_TOKEN="$$(cat "$$HA_TOKEN_FILE")"

        while true; do
          echo "[$$(date)] Collecting business metrics..."

          # Immich metrics (fall back to an empty object if unreachable)
          curl -s http://immich_server:3001/api/server-info/stats \
            > /tmp/immich-stats.json 2>/dev/null \
            || echo '{}' > /tmp/immich-stats.json

          # Nextcloud metrics (URL quoted: `?` is a shell glob character)
          curl -s -u "admin:$$NEXTCLOUD_ADMIN_PASS" \
            'http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json' \
            > /tmp/nextcloud-stats.json 2>/dev/null \
            || echo '{}' > /tmp/nextcloud-stats.json

          # Home Assistant metrics (double quotes so the token is expanded)
          curl -s -H "Authorization: Bearer $$HA_TOKEN" \
            http://homeassistant:8123/api/states \
            > /tmp/ha-stats.json 2>/dev/null \
            || echo '[]' > /tmp/ha-stats.json

          # Process and expose metrics via HTTP for Prometheus scraping.
          # A processor failure is logged but must not skip the sleep,
          # otherwise the loop would hammer the upstream APIs.
          python3 /app/business_metrics_processor.py \
            || echo "[$$(date)] business_metrics_processor.py failed" >&2

          sleep 300
        done
    environment:
      - NEXTCLOUD_ADMIN_PASS_FILE=/run/secrets/nextcloud_admin_password
      - HA_TOKEN_FILE=/run/secrets/ha_api_token
    secrets:
      - nextcloud_admin_password
      - ha_api_token
    networks:
      - monitoring-network
      - traefik-public
      - database-network
    ports:
      - '8888:8888'
    volumes:
      - business_metrics_scripts:/app
    deploy:
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'
      placement:
        constraints:
          - 'node.labels.role==monitor'

  # Loki for log aggregation
  loki:
    image: grafana/loki:2.9.0
    command: '-config.file=/etc/loki/local-config.yaml'
    volumes:
      - loki_data:/tmp/loki
      - loki_config:/etc/loki
    networks:
      - monitoring-network
    ports:
      - '3100:3100'
    healthcheck:
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - 'node.labels.role==monitor'

  # Promtail for log collection
  promtail:
    image: grafana/promtail:2.9.0
    command: '-config.file=/etc/promtail/config.yml'
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - promtail_config:/etc/promtail
    networks:
      - monitoring-network
    # NOTE(review): this check needs `wget` inside the promtail image and an
    # HTTP listener on 9080 in the promtail config - verify both, otherwise
    # the task will be reported unhealthy and restarted.
    healthcheck:
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9080/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'

# Volumes declared with `o: bind` map a named volume onto a host directory;
# that directory has to exist on the node where the task is scheduled.
volumes:
  prometheus_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/data
  prometheus_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/config
  grafana_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/data
  grafana_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/config
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local
  node_exporter_textfiles:
    driver: local
  business_metrics_scripts:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/business-metrics
  loki_data:
    driver: local
  loki_config:
    driver: local
  promtail_config:
    driver: local

# Secrets are created outside this stack (`external: true`).
secrets:
  grafana_admin_password:
    external: true
  nextcloud_admin_password:
    external: true
  ha_api_token:
    external: true

# All networks are pre-existing and shared with other stacks.
networks:
  monitoring-network:
    external: true
  traefik-public:
    external: true
  database-network:
    external: true