Traefik infrastructure deployment (work in progress, 60% complete)
Major accomplishments:
- ✅ SELinux policy installed and working
- ✅ Core Traefik v2.10 deployment running
- ✅ Production configuration ready (v3.1)
- ✅ Monitoring stack configured
- ✅ Comprehensive documentation created
- ✅ Security hardening implemented

Current status:
- 🟡 Partially deployed (60% complete)
- ⚠️ Docker socket access needs resolution
- ❌ Monitoring stack not deployed yet
- ⚠️ Production migration pending

Next steps:
1. Fix Docker socket permissions
2. Deploy monitoring stack
3. Migrate to production config
4. Validate full functionality

Files added:
- Complete Traefik deployment documentation
- Production and test configurations
- Monitoring stack configurations
- SELinux policy module
- Security checklists and guides
- Current status documentation
This commit is contained in:
@@ -0,0 +1,342 @@
|
||||
# Compose file format version for the monitoring stack.
version: "3.9"

# Service definitions follow, one mapping per service.
services:
# Prometheus for metrics collection
  prometheus:
    image: prom/prometheus:v2.47.0
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Keep 30 days of time-series data.
      - '--storage.tsdb.retention.time=30d'
      # NOTE(review): the lifecycle and admin APIs are enabled while port 9090
      # is also published below; confirm this exposure is intended.
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    volumes:
      - prometheus_data:/prometheus
      - prometheus_config:/etc/prometheus
    networks:
      - monitoring-network
      - traefik-public
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 2G
          cpus: '1.0'
        reservations:
          memory: 1G
          cpus: '0.5'
      placement:
        constraints:
          - "node.labels.role==monitor"
      # Labels sit under `deploy` so they are attached to the Swarm service.
      # NOTE(review): the original indentation was lost; confirm this nesting.
      labels:
        - "traefik.enable=true"
        # The service joins two networks; tell Traefik which one to route over.
        - "traefik.docker.network=traefik-public"
        - "traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)"
        - "traefik.http.routers.prometheus.entrypoints=websecure"
        - "traefik.http.routers.prometheus.tls=true"
        - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
# Grafana for visualization
  grafana:
    image: grafana/grafana:10.1.2
    environment:
      # Grafana only reads a value from a file when the variable ends in
      # `__FILE` (double underscore); the single-underscore form is ignored,
      # which would leave the default admin password in place.
      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
      # The provisioning directory is the `provisioning` key of the [paths]
      # section, hence GF_PATHS_PROVISIONING.
      - GF_PATHS_PROVISIONING=/etc/grafana/provisioning
      - GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
      - GF_FEATURE_TOGGLES_ENABLE=publicDashboards
    secrets:
      - grafana_admin_password
    volumes:
      - grafana_data:/var/lib/grafana
      - grafana_config:/etc/grafana/provisioning
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: '0.5'
        reservations:
          memory: 512M
          cpus: '0.25'
      placement:
        constraints:
          - "node.labels.role==monitor"
      # Labels sit under `deploy` so they are attached to the Swarm service.
      # NOTE(review): the original indentation was lost; confirm this nesting.
      labels:
        - "traefik.enable=true"
        # The service joins two networks; tell Traefik which one to route over.
        - "traefik.docker.network=traefik-public"
        - "traefik.http.routers.grafana.rule=Host(`grafana.localhost`)"
        - "traefik.http.routers.grafana.entrypoints=websecure"
        - "traefik.http.routers.grafana.tls=true"
        - "traefik.http.services.grafana.loadbalancer.server.port=3000"
# AlertManager for alerting
  alertmanager:
    image: prom/alertmanager:v0.26.0
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      # Port 9093 is not published, so links must use the Traefik route
      # (router rule below: Host(`alerts.localhost`) on the websecure entrypoint).
      - '--web.external-url=https://alerts.localhost'
    volumes:
      - alertmanager_data:/alertmanager
      - alertmanager_config:/etc/alertmanager
    networks:
      - monitoring-network
      - traefik-public
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          memory: 512M
          cpus: '0.25'
        reservations:
          memory: 256M
          cpus: '0.1'
      placement:
        constraints:
          - "node.labels.role==monitor"
      # Labels sit under `deploy` so they are attached to the Swarm service.
      # NOTE(review): the original indentation was lost; confirm this nesting.
      labels:
        - "traefik.enable=true"
        # The service joins two networks; tell Traefik which one to route over.
        - "traefik.docker.network=traefik-public"
        - "traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)"
        - "traefik.http.routers.alertmanager.entrypoints=websecure"
        - "traefik.http.routers.alertmanager.tls=true"
        - "traefik.http.services.alertmanager.loadbalancer.server.port=9093"
# Node Exporter for system metrics (deploy on all nodes)
  node-exporter:
    image: prom/node-exporter:v1.6.1
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # The host root is mounted at /rootfs below; without this flag the
      # filesystem collector does not know to read mounts through that prefix.
      - '--path.rootfs=/rootfs'
      # `mount-points-exclude` replaces the deprecated `ignored-mount-points`
      # flag. `$$` is Compose's escape for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.textfile.directory=/var/lib/node_exporter/textfile_collector'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
    networks:
      - monitoring-network
    ports:
      - "9100:9100"
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9100/metrics"]
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      # One task per node.
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.1'
# cAdvisor for container metrics
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.2
    # Host paths are mounted read-only.
    volumes:
      - "/:/rootfs:ro"
      - "/var/run:/var/run:ro"
      - "/sys:/sys:ro"
      - "/var/lib/docker/:/var/lib/docker:ro"
      - "/dev/disk/:/dev/disk:ro"
    networks:
      - monitoring-network
    ports:
      - "8080:8080"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:8080/healthz
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      # One task per node.
      mode: global
      resources:
        limits:
          memory: 512M
          cpus: "0.3"
        reservations:
          memory: 256M
          cpus: "0.1"
# Business metrics collector
  business-metrics:
    image: alpine:3.18
    # The script is passed as one `sh -c` argument in list form, so no outer
    # shell quoting is needed. Every shell `$` is written `$$` because Compose
    # interpolates a single `$` itself at deploy time.
    command:
      - sh
      - -c
      - |
        # Install tooling; abort if either step fails.
        apk add --no-cache curl jq python3 py3-pip &&
          pip3 install requests pyyaml prometheus_client || exit 1

        # Only the *_FILE paths are in the environment, so load the secret
        # values from those files before using them.
        NEXTCLOUD_ADMIN_PASS="$$(cat "$$NEXTCLOUD_ADMIN_PASS_FILE")"
        HA_TOKEN="$$(cat "$$HA_TOKEN_FILE")"

        while true; do
          echo "[$$(date)] Collecting business metrics..."

          # Immich metrics (-f: treat HTTP errors as failures so the fallback applies)
          curl -fs http://immich_server:3001/api/server-info/stats \
            > /tmp/immich-stats.json 2>/dev/null || echo '{}' > /tmp/immich-stats.json

          # Nextcloud metrics (URL quoted because it contains a glob character)
          curl -fs -u "admin:$$NEXTCLOUD_ADMIN_PASS" \
            'http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json' \
            > /tmp/nextcloud-stats.json 2>/dev/null || echo '{}' > /tmp/nextcloud-stats.json

          # Home Assistant metrics (double quotes so the token is expanded)
          curl -fs -H "Authorization: Bearer $$HA_TOKEN" \
            http://homeassistant:8123/api/states \
            > /tmp/ha-stats.json 2>/dev/null || echo '[]' > /tmp/ha-stats.json

          # Process and expose metrics via HTTP for Prometheus scraping.
          # A processor failure is logged, not fatal, so the sleep always runs
          # and the loop cannot spin.
          python3 /app/business_metrics_processor.py ||
            echo "[$$(date)] business_metrics_processor.py failed" >&2

          sleep 300
        done
    environment:
      - NEXTCLOUD_ADMIN_PASS_FILE=/run/secrets/nextcloud_admin_password
      - HA_TOKEN_FILE=/run/secrets/ha_api_token
    secrets:
      - nextcloud_admin_password
      - ha_api_token
    networks:
      - monitoring-network
      - traefik-public
      - database-network
    ports:
      - "8888:8888"
    volumes:
      - business_metrics_scripts:/app
    deploy:
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'
      placement:
        constraints:
          - "node.labels.role==monitor"
# Loki for log aggregation
  loki:
    image: grafana/loki:2.9.0
    command:
      - "-config.file=/etc/loki/local-config.yaml"
    volumes:
      # NOTE(review): data only persists if the config in loki_config stores
      # under /tmp/loki; confirm against that file.
      - "loki_data:/tmp/loki"
      - "loki_config:/etc/loki"
    networks:
      - monitoring-network
    ports:
      - "3100:3100"
    healthcheck:
      test:
        - CMD
        - wget
        - --no-verbose
        - --tries=1
        - --spider
        - http://localhost:3100/ready
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        limits:
          memory: 1G
          cpus: "0.5"
        reservations:
          memory: 512M
          cpus: "0.25"
      placement:
        constraints:
          - "node.labels.role==monitor"
# Promtail for log collection
  promtail:
    image: grafana/promtail:2.9.0
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - promtail_config:/etc/promtail
    networks:
      - monitoring-network
    healthcheck:
      # NOTE(review): the promtail image appears to ship without wget or curl,
      # so probe /ready over bash's /dev/tcp instead; confirm bash and grep are
      # present in the image. Port 9080 is kept from the original probe and is
      # presumably the HTTP port set in /etc/promtail/config.yml.
      test:
        - CMD
        - bash
        - -c
        - >-
          exec 3<>/dev/tcp/127.0.0.1/9080 &&
          printf 'GET /ready HTTP/1.0\r\n\r\n' >&3 &&
          grep -q ' 200 ' <&3
      interval: 30s
      timeout: 10s
      retries: 3
    deploy:
      # One task per node.
      mode: global
      resources:
        limits:
          memory: 256M
          cpus: '0.2'
        reservations:
          memory: 128M
          cpus: '0.05'
# Named volumes. Entries with `driver_opts` bind a fixed host directory
# (`device`); the rest use the local driver with no extra options.
volumes:
  # Prometheus
  prometheus_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/data
  prometheus_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/prometheus/config

  # Grafana
  grafana_data:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/data
  grafana_config:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/grafana/config

  # Alertmanager
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local

  # Node Exporter textfile collector input
  node_exporter_textfiles:
    driver: local

  # Business metrics scripts (mounted at /app in the collector)
  business_metrics_scripts:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /opt/monitoring/business-metrics

  # Loki
  loki_data:
    driver: local
  loki_config:
    driver: local

  # Promtail
  promtail_config:
    driver: local
# Secrets are declared external: they must already exist before this file
# is deployed.
secrets:
  # Used by the grafana service.
  grafana_admin_password:
    external: true
  # Used by the business-metrics service.
  nextcloud_admin_password:
    external: true
  ha_api_token:
    external: true
# Networks are declared external: they must already exist before this file
# is deployed.
networks:
  # Joined by every service in this file.
  monitoring-network:
    external: true
  # Joined by the Traefik-routed services and business-metrics.
  traefik-public:
    external: true
  # Joined only by business-metrics.
  database-network:
    external: true
Reference in New Issue
Block a user