---
# Monitoring stack: Prometheus, Grafana, Alertmanager, node-exporter,
# cAdvisor, a business-metrics collector, Loki and Promtail.
#
# The file uses Swarm-style `deploy:` sections (placement constraints, global
# mode) and external secrets/networks, which must exist before deployment.
#
# Any `$` that must reach a container (shell variables, regex anchors) is
# written as `$$` so that Compose variable interpolation leaves it alone.
version: '3.9'

services:
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --web.console.libraries=/etc/prometheus/console_libraries
      - --web.console.templates=/etc/prometheus/consoles
      - --storage.tsdb.retention.time=30d
      - --web.enable-lifecycle
      # NOTE(review): the admin API is enabled while port 9090 is published
      # and the service is routed through Traefik - confirm access is
      # restricted upstream.
      - --web.enable-admin-api
    volumes:
      - prometheus_data:/prometheus
      - prometheus_config:/etc/prometheus
    networks:
      - monitoring-network
      - traefik-public
    ports:
      - '9090:9090'
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9090/-/healthy
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        - 'traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)'
        - traefik.http.routers.prometheus.entrypoints=websecure
        - traefik.http.routers.prometheus.tls=true
        - traefik.http.services.prometheus.loadbalancer.server.port=9090

  grafana:
    image: grafana/grafana:10.1.2
    environment:
      GF_PROVISIONING_PATH: /etc/grafana/provisioning
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
      GF_FEATURE_TOGGLES_ENABLE: publicDashboards
      # Grafana reads file-based settings from GF_<SECTION>_<KEY>__FILE
      # (double underscore before FILE).
      # NOTE(review): the `grafana_admin_password` secret is also mounted but
      # not referenced by any variable here - confirm which secret holds the
      # real admin password.
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/gf_security_admin_password_file
    secrets:
      - grafana_admin_password
      - gf_security_admin_password_file
    volumes:
      - grafana_data:/var/lib/grafana
      - grafana_config:/etc/grafana/provisioning
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -f http://localhost:3000/api/health || exit 1'
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        - 'traefik.http.routers.grafana.rule=Host(`grafana.localhost`)'
        - traefik.http.routers.grafana.entrypoints=websecure
        - traefik.http.routers.grafana.tls=true
        - traefik.http.services.grafana.loadbalancer.server.port=3000

  alertmanager:
    image: prom/alertmanager:v0.26.0
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
      # NOTE(review): the external URL points at localhost:9093 although the
      # service is routed via Traefik as `alerts.localhost` with TLS - confirm
      # this is the URL that should appear in notification links.
      - --web.external-url=http://localhost:9093
    volumes:
      - alertmanager_data:/alertmanager
      - alertmanager_config:/etc/alertmanager
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9093/-/healthy
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 512M
          cpus: '0.25'
        reservations:
          memory: 256M
          cpus: '0.1'
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        - 'traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)'
        - traefik.http.routers.alertmanager.entrypoints=websecure
        - traefik.http.routers.alertmanager.tls=true
        - traefik.http.services.alertmanager.loadbalancer.server.port=9093

  node-exporter:
    image: prom/node-exporter:v1.6.1
    command:
      - --path.procfs=/host/proc
      - --path.sysfs=/host/sys
      # Point the filesystem collector at the host root mounted below;
      # without this flag the /rootfs mount is not used by the exporter.
      - --path.rootfs=/rootfs
      # `mount-points-exclude` is the current name of the deprecated
      # `ignored-mount-points` flag. `$$` is a literal `$` (regex anchor).
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      - --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
    networks:
      - monitoring-network
    ports:
      - '9100:9100'
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9100/metrics
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      # One task per node.
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    networks:
      - monitoring-network
    ports:
      - '8080:8080'
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:8080/healthz
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      # One task per node.
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.1'

  business-metrics:
    image: alpine:3.18
    # Installs its tooling at start-up, then every 300 seconds fetches stats
    # from Immich, Nextcloud and Home Assistant into /tmp/*.json and runs the
    # processor script from the /app volume.
    # Every shell `$` is doubled (`$$`) to protect it from Compose
    # interpolation.
    command:
      - sh
      - -c
      - |
        set -eu
        apk add --no-cache curl jq python3 py3-pip
        pip3 install requests pyyaml prometheus_client

        # Credentials are provided as secret files; load them once and export
        # them so the processor script can see them as well.
        NEXTCLOUD_ADMIN_PASS="$$(cat "$$NEXTCLOUD_ADMIN_PASS_FILE")"
        HA_TOKEN="$$(cat "$$HA_TOKEN_FILE")"
        export NEXTCLOUD_ADMIN_PASS HA_TOKEN

        while true; do
          echo "[$$(date)] Collecting business metrics..."

          # Immich metrics (fall back to an empty object on any failure)
          curl -sf http://immich_server:3001/api/server-info/stats \
            > /tmp/immich-stats.json 2>/dev/null \
            || echo '{}' > /tmp/immich-stats.json

          # Nextcloud metrics
          curl -sf -u "admin:$$NEXTCLOUD_ADMIN_PASS" \
            'http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json' \
            > /tmp/nextcloud-stats.json 2>/dev/null \
            || echo '{}' > /tmp/nextcloud-stats.json

          # Home Assistant metrics (fall back to an empty list on any failure)
          curl -sf -H "Authorization: Bearer $$HA_TOKEN" \
            http://homeassistant:8123/api/states \
            > /tmp/ha-stats.json 2>/dev/null \
            || echo '[]' > /tmp/ha-stats.json

          # Process and expose metrics via HTTP for Prometheus scraping.
          # A failure is logged but must not skip the sleep below, otherwise
          # the loop would spin without any delay.
          python3 /app/business_metrics_processor.py \
            || echo "[$$(date)] business_metrics_processor.py failed" >&2

          sleep 300
        done
    environment:
      NEXTCLOUD_ADMIN_PASS_FILE: /run/secrets/nextcloud_admin_password
      # NOTE(review): the `ha_api_token` secret is also mounted but not
      # referenced by any variable here - confirm which secret holds the
      # Home Assistant token.
      HA_TOKEN_FILE: /run/secrets/ha_token_file
    secrets:
      - nextcloud_admin_password
      - ha_api_token
      - ha_token_file
    networks:
      - monitoring-network
      - traefik-public
      - database-network
    ports:
      - '8888:8888'
    volumes:
      - business_metrics_scripts:/app
    deploy:
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'
      placement:
        constraints:
          - node.labels.role==monitor

  loki:
    image: grafana/loki:2.9.0
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - loki_data:/tmp/loki
      - loki_config:/etc/loki
    networks:
      - monitoring-network
    ports:
      - '3100:3100'
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:3100/ready
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - node.labels.role==monitor

  promtail:
    image: grafana/promtail:2.9.0
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - promtail_config:/etc/promtail
    networks:
      - monitoring-network
    # NOTE(review): this check needs `wget` inside the promtail image -
    # confirm the binary is present, otherwise the task is always unhealthy.
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9080/ready
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      # One task per node.
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'

volumes:
  # Volumes with `driver_opts` are bind mounts of host directories under
  # /opt/monitoring; the remaining ones are plain local named volumes.
  prometheus_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/data
  prometheus_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/config
  grafana_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/data
  grafana_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/config
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local
  node_exporter_textfiles:
    driver: local
  business_metrics_scripts:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/business-metrics
  loki_data:
    driver: local
  loki_config:
    driver: local
  promtail_config:
    driver: local

secrets:
  # All secrets are created outside this file.
  grafana_admin_password:
    external: true
  nextcloud_admin_password:
    external: true
  ha_api_token:
    external: true
  gf_security_admin_password_file:
    external: true
  ha_token_file:
    external: true

networks:
  # All networks are created outside this file.
  monitoring-network:
    external: true
  traefik-public:
    external: true
  database-network:
    external: true