Major accomplishments: - ✅ SELinux policy installed and working - ✅ Core Traefik v2.10 deployment running - ✅ Production configuration ready (v3.1) - ✅ Monitoring stack configured - ✅ Comprehensive documentation created - ✅ Security hardening implemented Current status: - 🟡 Partially deployed (60% complete) - ⚠️ Docker socket access needs resolution - ❌ Monitoring stack not deployed yet - ⚠️ Production migration pending Next steps: 1. Fix Docker socket permissions 2. Deploy monitoring stack 3. Migrate to production config 4. Validate full functionality Files added: - Complete Traefik deployment documentation - Production and test configurations - Monitoring stack configurations - SELinux policy module - Security checklists and guides - Current status documentation
362 lines
9.3 KiB
YAML
362 lines
9.3 KiB
YAML
# Compose file format version. Kept quoted so it is parsed as the string
# "3.9" rather than a float.
version: '3.9'

# Monitoring stack services: metrics (Prometheus, node-exporter, cAdvisor,
# business-metrics), dashboards (Grafana), alerting (Alertmanager) and logs
# (Loki, Promtail).
#
# NOTE(review): the Traefik `labels` lists are nested under `deploy:` on the
# assumption that this file is deployed as a Swarm stack (it uses
# `deploy.placement` and `deploy.mode: global`). If it is run with plain
# `docker compose`, move them to service level. Confirm before deploying.
services:
  # Metrics store and query engine.
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --web.console.libraries=/etc/prometheus/console_libraries
      - --web.console.templates=/etc/prometheus/consoles
      - --storage.tsdb.retention.time=30d
      - --web.enable-lifecycle
      # NOTE(review): the admin API allows deleting series and taking
      # snapshots, and port 9090 is also published directly below. Confirm
      # this is intended outside a trusted network.
      - --web.enable-admin-api
    volumes:
      - prometheus_data:/prometheus
      - prometheus_config:/etc/prometheus
    networks:
      - monitoring-network
      - traefik-public
    ports:
      - "9090:9090"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9090/-/healthy
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        - 'traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)'
        - traefik.http.routers.prometheus.entrypoints=websecure
        - traefik.http.routers.prometheus.tls=true
        - traefik.http.services.prometheus.loadbalancer.server.port=9090

  # Dashboards.
  grafana:
    image: grafana/grafana:10.1.2
    environment:
      GF_PROVISIONING_PATH: /etc/grafana/provisioning
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
      GF_FEATURE_TOGGLES_ENABLE: publicDashboards
      # Grafana loads a setting from a file when the variable name ends in
      # `__FILE` (double underscore). The previous name, with a doubled
      # `_FILE_FILE` suffix, is not a variable Grafana recognises.
      # NOTE(review): the path is unchanged; confirm whether this secret or
      # `grafana_admin_password` holds the real admin password.
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/gf_security_admin_password_file
    secrets:
      - grafana_admin_password
      - gf_security_admin_password_file
    volumes:
      - grafana_data:/var/lib/grafana
      - grafana_config:/etc/grafana/provisioning
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test:
        - CMD-SHELL
        - curl -f http://localhost:3000/api/health || exit 1
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        - 'traefik.http.routers.grafana.rule=Host(`grafana.localhost`)'
        - traefik.http.routers.grafana.entrypoints=websecure
        - traefik.http.routers.grafana.tls=true
        - traefik.http.services.grafana.loadbalancer.server.port=3000

  # Alert routing and notification.
  alertmanager:
    image: prom/alertmanager:v0.26.0
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
      - --web.external-url=http://localhost:9093
    volumes:
      - alertmanager_data:/alertmanager
      - alertmanager_config:/etc/alertmanager
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9093/-/healthy
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 512M
          cpus: '0.25'
        reservations:
          memory: 256M
          cpus: '0.1'
      placement:
        constraints:
          - node.labels.role==monitor
      labels:
        - traefik.enable=true
        - 'traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)'
        - traefik.http.routers.alertmanager.entrypoints=websecure
        - traefik.http.routers.alertmanager.tls=true
        - traefik.http.services.alertmanager.loadbalancer.server.port=9093

  # Host-level metrics; one instance per node (deploy.mode: global).
  node-exporter:
    image: prom/node-exporter:v1.6.1
    command:
      - --path.procfs=/host/proc
      - --path.sysfs=/host/sys
      # The host root is mounted at /rootfs below; without this flag the
      # filesystem collector reports the container's own mounts instead.
      - --path.rootfs=/rootfs
      # `$$` is Compose's escape for a literal `$` (the regex end anchor).
      # `mount-points-exclude` replaces the deprecated `ignored-mount-points`.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      - --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
    networks:
      - monitoring-network
    ports:
      - "9100:9100"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9100/metrics
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'

  # Per-container resource metrics; one instance per node.
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    networks:
      - monitoring-network
    ports:
      - "8080:8080"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:8080/healthz
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: '0.3'
        reservations:
          memory: 256M
          cpus: '0.1'

  # Polls Immich, Nextcloud and Home Assistant every 5 minutes, stores the raw
  # JSON under /tmp, then runs /app/business_metrics_processor.py on it.
  business-metrics:
    image: alpine:3.18
    # The script is passed as a literal block so it stays readable. Every shell
    # `$` is written `$$` so Compose does not interpolate it at deploy time.
    command:
      - sh
      - -c
      - |
        # Install runtime dependencies; exit non-zero if that fails.
        apk add --no-cache curl jq python3 py3-pip || exit 1
        pip3 install requests pyyaml prometheus_client || exit 1

        # Credentials are mounted as secret files; load them once and export
        # them so the processor script can also see them.
        NEXTCLOUD_ADMIN_PASS="$$(cat "$$NEXTCLOUD_ADMIN_PASS_FILE")"
        HA_TOKEN="$$(cat "$$HA_TOKEN_FILE")"
        export NEXTCLOUD_ADMIN_PASS HA_TOKEN

        while true; do
          echo "[$$(date)] Collecting business metrics..."

          # Immich metrics
          curl -s http://immich_server:3001/api/server-info/stats \
            > /tmp/immich-stats.json 2>/dev/null \
            || echo '{}' > /tmp/immich-stats.json

          # Nextcloud metrics
          curl -s -u "admin:$$NEXTCLOUD_ADMIN_PASS" \
            'http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json' \
            > /tmp/nextcloud-stats.json 2>/dev/null \
            || echo '{}' > /tmp/nextcloud-stats.json

          # Home Assistant metrics
          curl -s -H "Authorization: Bearer $$HA_TOKEN" \
            http://homeassistant:8123/api/states \
            > /tmp/ha-stats.json 2>/dev/null \
            || echo '[]' > /tmp/ha-stats.json

          # Process and expose metrics via HTTP for Prometheus scraping.
          # A failure is logged but must not skip the sleep, otherwise the
          # loop would spin without delay.
          python3 /app/business_metrics_processor.py \
            || echo "business_metrics_processor.py failed" >&2

          sleep 300
        done
    environment:
      NEXTCLOUD_ADMIN_PASS_FILE: /run/secrets/nextcloud_admin_password
      # Renamed from HA_TOKEN_FILE_FILE (doubled suffix).
      # NOTE(review): path unchanged; confirm whether `ha_token_file` or
      # `ha_api_token` holds the real token.
      HA_TOKEN_FILE: /run/secrets/ha_token_file
    secrets:
      - nextcloud_admin_password
      - ha_api_token
      - ha_token_file
    networks:
      - monitoring-network
      - traefik-public
      - database-network
    ports:
      - "8888:8888"
    volumes:
      - business_metrics_scripts:/app
    deploy:
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'
      placement:
        constraints:
          - node.labels.role==monitor

  # Log store.
  loki:
    image: grafana/loki:2.9.0
    command: '-config.file=/etc/loki/local-config.yaml'
    volumes:
      - loki_data:/tmp/loki
      - loki_config:/etc/loki
    networks:
      - monitoring-network
    ports:
      - "3100:3100"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:3100/ready
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - node.labels.role==monitor

  # Log shipper; one instance per node, reads host and container logs.
  promtail:
    image: grafana/promtail:2.9.0
    command: '-config.file=/etc/promtail/config.yml'
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - promtail_config:/etc/promtail
    networks:
      - monitoring-network
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:9080/ready
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'

# Named volumes. Entries with `driver_opts` (type: none, o: bind) are backed by
# a fixed host directory given in `device`; the rest use the local driver's
# default storage.
# NOTE(review): for the bind-backed volumes the `device` directory presumably
# has to exist on the node before the service starts. Verify on each node.
volumes:
  prometheus_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/data

  prometheus_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/config

  grafana_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/data

  grafana_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/config

  alertmanager_data:
    driver: local

  alertmanager_config:
    driver: local

  node_exporter_textfiles:
    driver: local

  business_metrics_scripts:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/business-metrics

  loki_data:
    driver: local

  loki_config:
    driver: local

  promtail_config:
    driver: local

# Secrets are all `external`: they must already exist in the cluster before
# this stack is deployed.
# NOTE(review): `gf_security_admin_password_file` and `ha_token_file` look like
# duplicates of `grafana_admin_password` and `ha_api_token`. Confirm which
# ones are actually populated and drop the unused pair.
secrets:
  grafana_admin_password:
    external: true

  nextcloud_admin_password:
    external: true

  ha_api_token:
    external: true

  gf_security_admin_password_file:
    external: true

  ha_token_file:
    external: true

# Networks are all `external`: they are created outside this file and must
# exist before the stack is deployed.
networks:
  monitoring-network:
    external: true

  traefik-public:
    external: true

  database-network:
    external: true