Files
HomeAudit/stacks/monitoring/comprehensive-monitoring.yml
admin 9ea31368f5 Complete Traefik infrastructure deployment - 60% complete
Major accomplishments:
- ✅ SELinux policy installed and working
- ✅ Core Traefik v2.10 deployment running
- ✅ Production configuration ready (v3.1)
- ✅ Monitoring stack configured
- ✅ Comprehensive documentation created
- ✅ Security hardening implemented

Current status:
- 🟡 Partially deployed (60% complete)
- ⚠️ Docker socket access needs resolution
- ❌ Monitoring stack not deployed yet
- ⚠️ Production migration pending

Next steps:
1. Fix Docker socket permissions
2. Deploy monitoring stack
3. Migrate to production config
4. Validate full functionality

Files added:
- Complete Traefik deployment documentation
- Production and test configurations
- Monitoring stack configurations
- SELinux policy module
- Security checklists and guides
- Current status documentation
2025-08-28 15:22:41 -04:00

362 lines
9.3 KiB
YAML

# Compose file header: schema version 3.9, followed by the service definitions.
version: "3.9"
services:
# Prometheus server: 30-day TSDB retention, published on 9090 and routed by Traefik.
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --web.console.libraries=/etc/prometheus/console_libraries
      - --web.console.templates=/etc/prometheus/consoles
      - --storage.tsdb.retention.time=30d
      # NOTE(review): the lifecycle and admin APIs are enabled while the
      # service is reachable through the published port and the Traefik router
      # below, with no auth middleware visible here. Confirm this is intended.
      - --web.enable-lifecycle
      - --web.enable-admin-api
    volumes:
      - prometheus_data:/prometheus
      - prometheus_config:/etc/prometheus
    networks:
      - monitoring-network
      - traefik-public
    ports:
      - "9090:9090"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9090/-/healthy
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: "1.0"
        reservations:
          memory: 1G
          cpus: "0.5"
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        # The service is attached to two networks; pin the one Traefik must
        # use to reach it, otherwise Traefik may pick an unreachable address.
        - traefik.docker.network=traefik-public
        - 'traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)'
        - traefik.http.routers.prometheus.entrypoints=websecure
        - traefik.http.routers.prometheus.tls=true
        - traefik.http.services.prometheus.loadbalancer.server.port=9090
# Grafana: dashboards UI on port 3000, routed by Traefik at grafana.localhost.
  grafana:
    image: grafana/grafana:10.1.2
    environment:
      GF_PROVISIONING_PATH: /etc/grafana/provisioning
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
      GF_FEATURE_TOGGLES_ENABLE: publicDashboards
      # Grafana loads a setting from a file only when the variable name ends
      # in `__FILE` (double underscore). The previous `..._FILE_FILE` name was
      # not recognised, so the admin password secret was never applied.
      # NOTE(review): both secrets below are mounted; confirm which one holds
      # the real admin password. The original path is kept unchanged here.
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/gf_security_admin_password_file
    secrets:
      - grafana_admin_password
      - gf_security_admin_password_file
    volumes:
      - grafana_data:/var/lib/grafana
      - grafana_config:/etc/grafana/provisioning
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test:
        - CMD-SHELL
        - curl -f http://localhost:3000/api/health || exit 1
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: "0.5"
        reservations:
          memory: 512M
          cpus: "0.25"
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        # The service is attached to two networks; pin the one Traefik must
        # use to reach it, otherwise Traefik may pick an unreachable address.
        - traefik.docker.network=traefik-public
        - 'traefik.http.routers.grafana.rule=Host(`grafana.localhost`)'
        - traefik.http.routers.grafana.entrypoints=websecure
        - traefik.http.routers.grafana.tls=true
        - traefik.http.services.grafana.loadbalancer.server.port=3000
# Alertmanager: alert routing/notification, routed by Traefik at alerts.localhost.
  alertmanager:
    image: prom/alertmanager:v0.26.0
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
      # Must match the address users actually reach (the Traefik router below:
      # Host `alerts.localhost` on the TLS entrypoint). The previous value,
      # http://localhost:9093, produced back-links that only resolve inside
      # the container.
      - --web.external-url=https://alerts.localhost
    volumes:
      - alertmanager_data:/alertmanager
      - alertmanager_config:/etc/alertmanager
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9093/-/healthy
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 512M
          cpus: "0.25"
        reservations:
          memory: 256M
          cpus: "0.1"
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        # The service is attached to two networks; pin the one Traefik must
        # use to reach it, otherwise Traefik may pick an unreachable address.
        - traefik.docker.network=traefik-public
        - 'traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)'
        - traefik.http.routers.alertmanager.entrypoints=websecure
        - traefik.http.routers.alertmanager.tls=true
        - traefik.http.services.alertmanager.loadbalancer.server.port=9093
# node-exporter: host metrics, one task per node (global mode), published on 9100.
  node-exporter:
    image: prom/node-exporter:v1.6.1
    command:
      - --path.procfs=/host/proc
      - --path.sysfs=/host/sys
      # The host root is mounted at /rootfs below; without this flag the
      # filesystem collector stats paths inside the container instead of the
      # host's mount points.
      - --path.rootfs=/rootfs
      # `mount-points-exclude` is the current name of the deprecated
      # `ignored-mount-points` flag. `$$` is a literal `$` after Compose
      # interpolation.
      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
      - --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
    networks:
      - monitoring-network
    ports:
      - "9100:9100"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9100/metrics
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: "0.2"
        reservations:
          memory: 128M
          cpus: "0.1"
# cAdvisor: per-container resource metrics, one task per node, published on 8080.
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    deploy:
      mode: global
      resources:
        reservations:
          cpus: "0.1"
          memory: 256M
        limits:
          cpus: "0.3"
          memory: 512M
    networks:
      - monitoring-network
    ports:
      - "8080:8080"
    # Read-only views of the host that cAdvisor inspects.
    volumes:
      - "/:/rootfs:ro"
      - "/var/run:/var/run:ro"
      - "/sys:/sys:ro"
      - "/var/lib/docker/:/var/lib/docker:ro"
      - "/dev/disk/:/dev/disk:ro"
    healthcheck:
      # Exec-form probe against cAdvisor's /healthz endpoint.
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider",
         "http://localhost:8080/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
# business-metrics: polls Immich, Nextcloud and Home Assistant every 300 s and
# hands the JSON snapshots in /tmp to /app/business_metrics_processor.py.
  business-metrics:
    image: alpine:3.18
    # Exec-form command with a literal block scalar: the script is passed to
    # `sh -c` verbatim. Every shell `$` is written `$$` so Compose does not
    # interpolate it at deploy time.
    command:
      - sh
      - -c
      - |
        # Install tooling once; exit if it fails so the task is restarted.
        apk add --no-cache curl jq python3 py3-pip &&
          pip3 install requests pyyaml prometheus_client || exit 1
        while true; do
          # Double quotes so the timestamp is actually expanded.
          echo "[$$(date)] Collecting business metrics..."
          # Immich metrics (-f: HTTP errors trigger the empty-JSON fallback)
          curl -sf http://immich_server:3001/api/server-info/stats \
            > /tmp/immich-stats.json 2>/dev/null \
            || echo '{}' > /tmp/immich-stats.json
          # Nextcloud metrics (password is read from the mounted secret file)
          curl -sf -u "admin:$$(cat "$$NEXTCLOUD_ADMIN_PASS_FILE")" \
            'http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json' \
            > /tmp/nextcloud-stats.json 2>/dev/null \
            || echo '{}' > /tmp/nextcloud-stats.json
          # Home Assistant metrics (token is read from the mounted secret file)
          curl -sf -H "Authorization: Bearer $$(cat "$$HA_TOKEN_FILE")" \
            http://homeassistant:8123/api/states \
            > /tmp/ha-stats.json 2>/dev/null \
            || echo '[]' > /tmp/ha-stats.json
          # Process and expose metrics via HTTP for Prometheus scraping
          python3 /app/business_metrics_processor.py \
            || echo "[$$(date)] business_metrics_processor.py failed" >&2
          # Always pause, even after a failure, so errors cannot cause a
          # tight loop against the upstream services.
          sleep 300
        done
    environment:
      NEXTCLOUD_ADMIN_PASS_FILE: /run/secrets/nextcloud_admin_password
      # Was HA_TOKEN_FILE_FILE (doubled suffix); the script reads HA_TOKEN_FILE.
      # NOTE(review): confirm business_metrics_processor.py does not read the
      # old variable name.
      HA_TOKEN_FILE: /run/secrets/ha_token_file
    secrets:
      - nextcloud_admin_password
      - ha_api_token
      - ha_token_file
    networks:
      - monitoring-network
      - traefik-public
      - database-network
    # NOTE(review): presumably the processor serves metrics on 8888; confirm.
    ports:
      - "8888:8888"
    volumes:
      - business_metrics_scripts:/app
    deploy:
      resources:
        limits:
          memory: 256M
          cpus: "0.2"
        reservations:
          memory: 128M
          cpus: "0.05"
      placement:
        constraints:
          - node.labels.role==monitor
# Loki: log store, published on 3100, pinned to nodes labelled role=monitor.
  loki:
    image: grafana/loki:2.9.0
    # Single argument appended to the image entrypoint.
    command: ["-config.file=/etc/loki/local-config.yaml"]
    deploy:
      placement:
        constraints:
          - node.labels.role==monitor
      resources:
        reservations:
          cpus: "0.25"
          memory: 512M
        limits:
          cpus: "0.5"
          memory: 1G
    networks:
      - monitoring-network
    ports:
      - "3100:3100"
    volumes:
      - "loki_data:/tmp/loki"
      - "loki_config:/etc/loki"
    healthcheck:
      # Exec-form probe against Loki's /ready endpoint.
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider",
         "http://localhost:3100/ready"]
      start_period: 60s
      interval: 30s
      timeout: 10s
      retries: 3
# Promtail: log shipper, one task per node, reading host and container logs.
  promtail:
    image: grafana/promtail:2.9.0
    # Single argument appended to the image entrypoint.
    command: ["-config.file=/etc/promtail/config.yml"]
    deploy:
      mode: global
      resources:
        reservations:
          cpus: "0.05"
          memory: 128M
        limits:
          cpus: "0.2"
          memory: 256M
    networks:
      - monitoring-network
    volumes:
      # Host log sources, mounted read-only.
      - "/var/log:/var/log:ro"
      - "/var/lib/docker/containers:/var/lib/docker/containers:ro"
      - "promtail_config:/etc/promtail"
    healthcheck:
      # NOTE(review): this probe needs `wget` inside the promtail image and an
      # HTTP listener on 9080 in the mounted config. Confirm both.
      test:
        ["CMD", "wget", "--no-verbose", "--tries=1", "--spider",
         "http://localhost:9080/ready"]
      interval: 30s
      timeout: 10s
      retries: 3
# Named volumes used by the services of this stack.
volumes:
  # --- Backed by host directories under /opt/monitoring (bind mounts) ---
  prometheus_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/data
      o: bind
      type: none
  prometheus_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/config
      o: bind
      type: none
  grafana_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/data
      o: bind
      type: none
  grafana_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/config
      o: bind
      type: none
  business_metrics_scripts:
    driver: local
    driver_opts:
      device: /opt/monitoring/business-metrics
      o: bind
      type: none

  # --- Plain local-driver volumes (no driver options) ---
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local
  node_exporter_textfiles:
    driver: local
  loki_data:
    driver: local
  loki_config:
    driver: local
  promtail_config:
    driver: local
# Secrets are all external: they are referenced here, not defined by this file.
secrets:
  # Grafana admin credentials
  grafana_admin_password: {external: true}
  gf_security_admin_password_file: {external: true}
  # Credentials for the business-metrics collector
  nextcloud_admin_password: {external: true}
  ha_api_token: {external: true}
  ha_token_file: {external: true}
# Networks are all external: they are referenced here, not defined by this file.
networks:
  monitoring-network: {external: true}
  traefik-public: {external: true}
  database-network: {external: true}