Files
HomeAudit/backups/stacks-pre-secrets-20250828-092958/comprehensive-monitoring.yml
admin 9ea31368f5 Complete Traefik infrastructure deployment - 60% complete
Major accomplishments:
- SELinux policy installed and working
- Core Traefik v2.10 deployment running
- Production configuration ready (v3.1)
- Monitoring stack configured
- Comprehensive documentation created
- Security hardening implemented

Current status:
- 🟡 Partially deployed (60% complete)
- ⚠️ Docker socket access needs resolution
- Monitoring stack not deployed yet
- ⚠️ Production migration pending

Next steps:
1. Fix Docker socket permissions
2. Deploy monitoring stack
3. Migrate to production config
4. Validate full functionality

Files added:
- Complete Traefik deployment documentation
- Production and test configurations
- Monitoring stack configurations
- SELinux policy module
- Security checklists and guides
- Current status documentation
2025-08-28 15:22:41 -04:00

342 lines
9.4 KiB
YAML

# NOTE(review): leading indentation was lost in this copy of the file; the
# structure below is reconstructed per Compose/Swarm stack conventions.
# Traefik `labels` are placed under `deploy:` because this is a swarm stack
# (deploy/placement/external secrets are used) — confirm against the
# original file, since service-level labels would be ignored in swarm mode.
version: '3.9'

services:
  # Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      # NOTE(review): the admin API permits TSDB deletion and snapshotting.
      # It is reachable through the Traefik router below with no auth
      # middleware attached — confirm this exposure is intended.
      - '--web.enable-admin-api'
    volumes:
      - prometheus_data:/prometheus
      - prometheus_config:/etc/prometheus
    networks:
      - monitoring-network
      - traefik-public
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      placement:
        constraints:
          - "node.labels.role==monitor"
      labels:
        - traefik.enable=true
        - traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)
        - traefik.http.routers.prometheus.entrypoints=websecure
        - traefik.http.routers.prometheus.tls=true
        - traefik.http.services.prometheus.loadbalancer.server.port=9090
# Grafana for visualization
grafana:
image: grafana/grafana:10.1.2
environment:
- GF_SECURITY_ADMIN_PASSWORD_FILE=/run/secrets/grafana_admin_password
- GF_PROVISIONING_PATH=/etc/grafana/provisioning
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
- GF_FEATURE_TOGGLES_ENABLE=publicDashboards
secrets:
- grafana_admin_password
volumes:
- grafana_data:/var/lib/grafana
- grafana_config:/etc/grafana/provisioning
networks:
- monitoring-network
- traefik-public
healthcheck:
test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
deploy:
resources:
limits:
memory: 1G
cpus: '0.5'
reservations:
memory: 512M
cpus: '0.25'
placement:
constraints:
- "node.labels.role==monitor"
labels:
- traefik.enable=true
- traefik.http.routers.grafana.rule=Host(`grafana.localhost`)
- traefik.http.routers.grafana.entrypoints=websecure
- traefik.http.routers.grafana.tls=true
- traefik.http.services.grafana.loadbalancer.server.port=3000
# AlertManager for alerting
alertmanager:
image: prom/alertmanager:v0.26.0
command:
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
- '--web.external-url=http://localhost:9093'
volumes:
- alertmanager_data:/alertmanager
- alertmanager_config:/etc/alertmanager
networks:
- monitoring-network
- traefik-public
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
deploy:
resources:
limits:
memory: 512M
cpus: '0.25'
reservations:
memory: 256M
cpus: '0.1'
placement:
constraints:
- "node.labels.role==monitor"
labels:
- traefik.enable=true
- traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)
- traefik.http.routers.alertmanager.entrypoints=websecure
- traefik.http.routers.alertmanager.tls=true
- traefik.http.services.alertmanager.loadbalancer.server.port=9093
# Node Exporter for system metrics (deploy on all nodes)
node-exporter:
image: prom/node-exporter:v1.6.1
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
- '--collector.textfile.directory=/var/lib/node_exporter/textfile_collector'
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
- node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
networks:
- monitoring-network
ports:
- "9100:9100"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9100/metrics"]
interval: 30s
timeout: 10s
retries: 3
deploy:
mode: global
resources:
limits:
memory: 256M
cpus: '0.2'
reservations:
memory: 128M
cpus: '0.1'
# cAdvisor for container metrics
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.2
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
networks:
- monitoring-network
ports:
- "8080:8080"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
interval: 30s
timeout: 10s
retries: 3
deploy:
mode: global
resources:
limits:
memory: 512M
cpus: '0.3'
reservations:
memory: 256M
cpus: '0.1'
# Business metrics collector
business-metrics:
image: alpine:3.18
command: |
sh -c "
apk add --no-cache curl jq python3 py3-pip &&
pip3 install requests pyyaml prometheus_client &&
while true; do
echo '[$(date)] Collecting business metrics...' &&
# Immich metrics
curl -s http://immich_server:3001/api/server-info/stats > /tmp/immich-stats.json 2>/dev/null || echo '{}' > /tmp/immich-stats.json &&
# Nextcloud metrics
curl -s -u admin:\$NEXTCLOUD_ADMIN_PASS http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json > /tmp/nextcloud-stats.json 2>/dev/null || echo '{}' > /tmp/nextcloud-stats.json &&
# Home Assistant metrics
curl -s -H 'Authorization: Bearer \$HA_TOKEN' http://homeassistant:8123/api/states > /tmp/ha-stats.json 2>/dev/null || echo '[]' > /tmp/ha-stats.json &&
# Process and expose metrics via HTTP for Prometheus scraping
python3 /app/business_metrics_processor.py &&
sleep 300
done
"
environment:
- NEXTCLOUD_ADMIN_PASS_FILE=/run/secrets/nextcloud_admin_password
- HA_TOKEN_FILE=/run/secrets/ha_api_token
secrets:
- nextcloud_admin_password
- ha_api_token
networks:
- monitoring-network
- traefik-public
- database-network
ports:
- "8888:8888"
volumes:
- business_metrics_scripts:/app
deploy:
resources:
limits:
memory: 256M
cpus: '0.2'
reservations:
memory: 128M
cpus: '0.05'
placement:
constraints:
- "node.labels.role==monitor"
# Loki for log aggregation
loki:
image: grafana/loki:2.9.0
command: -config.file=/etc/loki/local-config.yaml
volumes:
- loki_data:/tmp/loki
- loki_config:/etc/loki
networks:
- monitoring-network
ports:
- "3100:3100"
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
deploy:
resources:
limits:
memory: 1G
cpus: '0.5'
reservations:
memory: 512M
cpus: '0.25'
placement:
constraints:
- "node.labels.role==monitor"
# Promtail for log collection
promtail:
image: grafana/promtail:2.9.0
command: -config.file=/etc/promtail/config.yml
volumes:
- /var/log:/var/log:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
- promtail_config:/etc/promtail
networks:
- monitoring-network
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9080/ready"]
interval: 30s
timeout: 10s
retries: 3
deploy:
mode: global
resources:
limits:
memory: 256M
cpus: '0.2'
reservations:
memory: 128M
cpus: '0.05'
# Named volumes. The bind-backed ones require their host directories under
# /opt/monitoring to exist on the target node before `stack deploy`.
volumes:
  prometheus_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/data
  prometheus_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/config
  grafana_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/data
  grafana_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/config
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local
  node_exporter_textfiles:
    driver: local
  business_metrics_scripts:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/business-metrics
  loki_data:
    driver: local
  loki_config:
    driver: local
  promtail_config:
    driver: local
# Secrets are created out of band (docker secret create) before deploy.
secrets:
  grafana_admin_password:
    external: true
  nextcloud_admin_password:
    external: true
  ha_api_token:
    external: true
# Overlay networks are created out of band before deploy.
networks:
  monitoring-network:
    external: true
  traefik-public:
    external: true
  database-network:
    external: true