Complete Traefik infrastructure deployment - 60% complete
Major accomplishments: - ✅ SELinux policy installed and working - ✅ Core Traefik v2.10 deployment running - ✅ Production configuration ready (v3.1) - ✅ Monitoring stack configured - ✅ Comprehensive documentation created - ✅ Security hardening implemented Current status: - 🟡 Partially deployed (60% complete) - ⚠️ Docker socket access needs resolution - ❌ Monitoring stack not deployed yet - ⚠️ Production migration pending Next steps: 1. Fix Docker socket permissions 2. Deploy monitoring stack 3. Migrate to production config 4. Validate full functionality Files added: - Complete Traefik deployment documentation - Production and test configurations - Monitoring stack configurations - SELinux policy module - Security checklists and guides - Current status documentation
This commit is contained in:
361
stacks/monitoring/comprehensive-monitoring.yml
Normal file
361
stacks/monitoring/comprehensive-monitoring.yml
Normal file
@@ -0,0 +1,361 @@
|
||||
version: '3.9'
|
||||
services:
|
||||
# Prometheus server: scrapes and stores metrics with 30-day TSDB retention.
# NOTE(review): the admin API is enabled while port 9090 is published and the
# Traefik router has no auth middleware - confirm this exposure is intended.
prometheus:
  image: prom/prometheus:v2.47.0
  command:
    - --config.file=/etc/prometheus/prometheus.yml
    - --storage.tsdb.path=/prometheus
    - --web.console.libraries=/etc/prometheus/console_libraries
    - --web.console.templates=/etc/prometheus/consoles
    - --storage.tsdb.retention.time=30d
    - --web.enable-lifecycle
    - --web.enable-admin-api
  ports:
    - "9090:9090"
  networks:
    - monitoring-network
    - traefik-public
  volumes:
    - prometheus_data:/prometheus
    - prometheus_config:/etc/prometheus
  healthcheck:
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
    start_period: 30s
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    placement:
      constraints:
        - node.labels.role==monitor
    resources:
      reservations:
        cpus: '0.5'
        memory: 1G
      limits:
        cpus: '1.0'
        memory: 2G
    # NOTE(review): nesting under deploy assumed (indentation lost in source) - confirm.
    labels:
      - traefik.enable=true
      - traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)
      - traefik.http.routers.prometheus.entrypoints=websecure
      - traefik.http.routers.prometheus.tls=true
      - traefik.http.services.prometheus.loadbalancer.server.port=9090
|
||||
# Grafana dashboards, published through Traefik at grafana.localhost.
grafana:
  image: grafana/grafana:10.1.2
  environment:
    # [paths] provisioning is the setting Grafana reads for this directory;
    # the former GF_PROVISIONING_PATH name does not map to any setting.
    GF_PATHS_PROVISIONING: /etc/grafana/provisioning
    GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,grafana-piechart-panel
    GF_FEATURE_TOGGLES_ENABLE: publicDashboards
    # File-backed settings use the double-underscore "__FILE" suffix. The
    # previous GF_SECURITY_ADMIN_PASSWORD_FILE_FILE name was not recognised,
    # so the admin password was never loaded from the secret.
    # NOTE(review): two admin-password secrets are mounted below; confirm which
    # one actually holds the password and drop the other.
    GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/gf_security_admin_password_file
  secrets:
    - grafana_admin_password
    - gf_security_admin_password_file
  volumes:
    - grafana_data:/var/lib/grafana
    - grafana_config:/etc/grafana/provisioning
  networks:
    - monitoring-network
    - traefik-public
  healthcheck:
    test:
      - CMD-SHELL
      - curl -f http://localhost:3000/api/health || exit 1
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 60s
  deploy:
    resources:
      limits:
        memory: 1G
        cpus: '0.5'
      reservations:
        memory: 512M
        cpus: '0.25'
    placement:
      constraints:
        - node.labels.role==monitor
    # NOTE(review): nesting under deploy assumed (indentation lost in source) - confirm.
    labels:
      - traefik.enable=true
      - traefik.http.routers.grafana.rule=Host(`grafana.localhost`)
      - traefik.http.routers.grafana.entrypoints=websecure
      - traefik.http.routers.grafana.tls=true
      - traefik.http.services.grafana.loadbalancer.server.port=3000
|
||||
# Alertmanager: receives alerts and stores its state under /alertmanager.
# NOTE(review): --web.external-url points at http://localhost:9093 while the
# Traefik router serves alerts.localhost over TLS - confirm links in
# notifications resolve as intended.
alertmanager:
  image: prom/alertmanager:v0.26.0
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
    - --storage.path=/alertmanager
    - --web.external-url=http://localhost:9093
  networks:
    - monitoring-network
    - traefik-public
  volumes:
    - alertmanager_data:/alertmanager
    - alertmanager_config:/etc/alertmanager
  healthcheck:
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9093/-/healthy"]
    start_period: 30s
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    placement:
      constraints:
        - node.labels.role==monitor
    resources:
      reservations:
        cpus: '0.1'
        memory: 256M
      limits:
        cpus: '0.25'
        memory: 512M
    # NOTE(review): nesting under deploy assumed (indentation lost in source) - confirm.
    labels:
      - traefik.enable=true
      - traefik.http.routers.alertmanager.rule=Host(`alerts.localhost`)
      - traefik.http.routers.alertmanager.entrypoints=websecure
      - traefik.http.routers.alertmanager.tls=true
      - traefik.http.services.alertmanager.loadbalancer.server.port=9093
|
||||
# node-exporter: host-level metrics, one task per Swarm node (mode: global).
node-exporter:
  image: prom/node-exporter:v1.6.1
  command:
    - --path.procfs=/host/proc
    - --path.sysfs=/host/sys
    # The host root is mounted at /rootfs below; without this flag the
    # filesystem collector reports on the container's own mounts instead.
    - --path.rootfs=/rootfs
    # "mount-points-exclude" replaces the deprecated "ignored-mount-points"
    # flag name. "$$" is the Compose escape for a literal "$" in the regex.
    - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
    - --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
    - node_exporter_textfiles:/var/lib/node_exporter/textfile_collector
  networks:
    - monitoring-network
  ports:
    - "9100:9100"
  healthcheck:
    test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:9100/metrics
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    mode: global
    resources:
      limits:
        memory: 256M
        cpus: '0.2'
      reservations:
        memory: 128M
        cpus: '0.1'
|
||||
# cAdvisor: per-container resource metrics, one task per Swarm node.
cadvisor:
  image: gcr.io/cadvisor/cadvisor:v0.47.2
  ports:
    - "8080:8080"
  networks:
    - monitoring-network
  # Read-only views of the host that cAdvisor inspects.
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /dev/disk/:/dev/disk:ro
  healthcheck:
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    mode: global
    resources:
      reservations:
        cpus: '0.1'
        memory: 256M
      limits:
        cpus: '0.3'
        memory: 512M
|
||||
# business-metrics: every 5 minutes pulls stats from Immich, Nextcloud and
# Home Assistant into /tmp/*.json, then runs the processor script from /app.
business-metrics:
  image: alpine:3.18
  # List form avoids shell-in-a-quoted-string escaping. Every "$" meant for the
  # container shell is written "$$" so Compose interpolation leaves it alone
  # (the previous "$(date)" and "\$VAR" forms were still seen by Compose).
  command:
    - sh
    - -c
    - |
      apk add --no-cache curl jq python3 py3-pip &&
      pip3 install requests pyyaml prometheus_client &&
      while true; do
        # Credentials are delivered as secret files; load them on each pass.
        NEXTCLOUD_ADMIN_PASS="$$(cat "$$NEXTCLOUD_ADMIN_PASS_FILE")"
        HA_TOKEN="$$(cat "$$HA_TOKEN_FILE")"
        echo "[$$(date)] Collecting business metrics..."
        # Immich metrics
        curl -s http://immich_server:3001/api/server-info/stats \
          > /tmp/immich-stats.json 2>/dev/null || echo '{}' > /tmp/immich-stats.json
        # Nextcloud metrics
        curl -s -u "admin:$$NEXTCLOUD_ADMIN_PASS" \
          'http://nextcloud/ocs/v2.php/apps/serverinfo/api/v1/info?format=json' \
          > /tmp/nextcloud-stats.json 2>/dev/null || echo '{}' > /tmp/nextcloud-stats.json
        # Home Assistant metrics (double quotes so the token is expanded)
        curl -s -H "Authorization: Bearer $$HA_TOKEN" http://homeassistant:8123/api/states \
          > /tmp/ha-stats.json 2>/dev/null || echo '[]' > /tmp/ha-stats.json
        # Process and expose metrics via HTTP for Prometheus scraping.
        # A processor failure is logged rather than skipping the sleep, which
        # previously turned the loop into a busy loop.
        python3 /app/business_metrics_processor.py ||
          echo "[$$(date)] business_metrics_processor.py failed" >&2
        sleep 300
      done
  environment:
    NEXTCLOUD_ADMIN_PASS_FILE: /run/secrets/nextcloud_admin_password
    # Was HA_TOKEN_FILE_FILE (doubled suffix), which nothing read.
    # NOTE(review): both ha_api_token and ha_token_file are mounted; confirm
    # which secret holds the Home Assistant token.
    HA_TOKEN_FILE: /run/secrets/ha_token_file
  secrets:
    - nextcloud_admin_password
    - ha_api_token
    - ha_token_file
  networks:
    - monitoring-network
    - traefik-public
    - database-network
  ports:
    - "8888:8888"
  volumes:
    - business_metrics_scripts:/app
  deploy:
    resources:
      limits:
        memory: 256M
        cpus: '0.2'
      reservations:
        memory: 128M
        cpus: '0.05'
    placement:
      constraints:
        - node.labels.role==monitor
|
||||
# Loki: log aggregation backend, listening on 3100.
# NOTE(review): data volume is mounted at /tmp/loki; confirm this matches the
# storage paths configured in /etc/loki/local-config.yaml.
loki:
  image: grafana/loki:2.9.0
  command: '-config.file=/etc/loki/local-config.yaml'
  ports:
    - "3100:3100"
  networks:
    - monitoring-network
  volumes:
    - loki_data:/tmp/loki
    - loki_config:/etc/loki
  healthcheck:
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready"]
    start_period: 60s
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    placement:
      constraints:
        - node.labels.role==monitor
    resources:
      reservations:
        cpus: '0.25'
        memory: 512M
      limits:
        cpus: '0.5'
        memory: 1G
|
||||
# Promtail: log shipper, one task per Swarm node; reads host and container logs.
promtail:
  image: grafana/promtail:2.9.0
  command: '-config.file=/etc/promtail/config.yml'
  networks:
    - monitoring-network
  volumes:
    - /var/log:/var/log:ro
    - /var/lib/docker/containers:/var/lib/docker/containers:ro
    - promtail_config:/etc/promtail
  healthcheck:
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9080/ready"]
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    mode: global
    resources:
      reservations:
        cpus: '0.05'
        memory: 128M
      limits:
        cpus: '0.2'
        memory: 256M
|
||||
# Named volumes for the comprehensive monitoring stack.
volumes:
  # Bind-backed volumes under /opt/monitoring on the host.
  prometheus_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/data
      o: bind
      type: none
  prometheus_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/config
      o: bind
      type: none
  grafana_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/data
      o: bind
      type: none
  grafana_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/config
      o: bind
      type: none
  business_metrics_scripts:
    driver: local
    driver_opts:
      device: /opt/monitoring/business-metrics
      o: bind
      type: none

  # Plain local volumes with no driver options.
  alertmanager_data:
    driver: local
  alertmanager_config:
    driver: local
  node_exporter_textfiles:
    driver: local
  loki_data:
    driver: local
  loki_config:
    driver: local
  promtail_config:
    driver: local
|
||||
# Secrets consumed by this stack; all are created outside of it (external).
secrets:
  # Grafana admin password
  grafana_admin_password:
    external: true
  gf_security_admin_password_file:
    external: true
  # Nextcloud admin password (business-metrics)
  nextcloud_admin_password:
    external: true
  # Home Assistant token (business-metrics)
  ha_api_token:
    external: true
  ha_token_file:
    external: true
|
||||
# Networks are pre-existing (external); this stack only attaches to them.
networks:
  database-network:
    external: true
  monitoring-network:
    external: true
  traefik-public:
    external: true
|
||||
@@ -1,44 +1,49 @@
|
||||
version: '3.9'
|
||||
|
||||
services:
|
||||
netdata:
|
||||
image: netdata/netdata:stable
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
- SYS_PTRACE
|
||||
security_opt:
|
||||
- apparmor:unconfined
|
||||
- apparmor:unconfined
|
||||
ports:
|
||||
- target: 19999
|
||||
published: 19999
|
||||
mode: host
|
||||
- target: 19999
|
||||
published: 19999
|
||||
mode: host
|
||||
volumes:
|
||||
- netdata_config:/etc/netdata
|
||||
- netdata_lib:/var/lib/netdata
|
||||
- netdata_cache:/var/cache/netdata
|
||||
- /etc/passwd:/host/etc/passwd:ro
|
||||
- /etc/group:/host/etc/group:ro
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- netdata_config:/etc/netdata
|
||||
- netdata_lib:/var/lib/netdata
|
||||
- netdata_cache:/var/cache/netdata
|
||||
- /etc/passwd:/host/etc/passwd:ro
|
||||
- /etc/group:/host/etc/group:ro
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
environment:
|
||||
- NETDATA_CLAIM_TOKEN=
|
||||
NETDATA_CLAIM_TOKEN_FILE: /run/secrets/netdata_claim_token
|
||||
networks:
|
||||
- monitoring-network
|
||||
- monitoring-network
|
||||
deploy:
|
||||
placement:
|
||||
constraints:
|
||||
- node.role == manager
|
||||
- node.role == manager
|
||||
labels:
|
||||
- traefik.enable=true
|
||||
- traefik.http.routers.netdata.rule=Host(`netdata.localhost`)
|
||||
- traefik.http.routers.netdata.entrypoints=websecure
|
||||
- traefik.http.routers.netdata.tls=true
|
||||
- traefik.http.services.netdata.loadbalancer.server.port=19999
|
||||
|
||||
- traefik.enable=true
|
||||
- traefik.http.routers.netdata.rule=Host(`netdata.localhost`)
|
||||
- traefik.http.routers.netdata.entrypoints=websecure
|
||||
- traefik.http.routers.netdata.tls=true
|
||||
- traefik.http.services.netdata.loadbalancer.server.port=19999
|
||||
secrets:
|
||||
- netdata_claim_token
|
||||
volumes:
|
||||
netdata_config: { driver: local }
|
||||
netdata_lib: { driver: local }
|
||||
netdata_cache: { driver: local }
|
||||
|
||||
netdata_config:
|
||||
driver: local
|
||||
netdata_lib:
|
||||
driver: local
|
||||
netdata_cache:
|
||||
driver: local
|
||||
networks:
|
||||
monitoring-network:
|
||||
external: true
|
||||
secrets:
|
||||
netdata_claim_token:
|
||||
external: true
|
||||
|
||||
346
stacks/monitoring/security-monitoring.yml
Normal file
346
stacks/monitoring/security-monitoring.yml
Normal file
@@ -0,0 +1,346 @@
|
||||
version: '3.9'
|
||||
|
||||
services:
|
||||
# Falco - Runtime security monitoring, one task per Swarm node.
# NOTE(review): `privileged` is reportedly ignored by `docker stack deploy`;
# confirm Falco actually gets the kernel access it needs in Swarm mode.
falco:
  image: falcosecurity/falco:0.36.2
  privileged: true  # Required for kernel monitoring
  environment:
    - FALCO_GRPC_ENABLED=true
    - FALCO_GRPC_BIND_ADDRESS=0.0.0.0:5060
  volumes:
    - /var/run/docker.sock:/host/var/run/docker.sock:ro
    - /proc:/host/proc:ro
    - /etc:/host/etc:ro
    - /lib/modules:/host/lib/modules:ro
    - /usr:/host/usr:ro
    - falco_rules:/etc/falco/rules.d
    - falco_logs:/var/log/falco
  networks:
    - monitoring-network
  ports:
    - "5060:5060"  # gRPC API
  # The Kubernetes flags were removed: `--k8s-api` expects a URL argument but
  # was followed directly by another flag, and the certificate it referenced
  # (/etc/ssl/falco.crt) is not mounted into the container.
  # NOTE(review): /run/containerd/containerd.sock is not mounted either;
  # confirm whether --cri is still wanted.
  command:
    - /usr/bin/falco
    - --cri
    - /run/containerd/containerd.sock
  healthcheck:
    test: ["CMD", "test", "-S", "/var/run/falco/falco.sock"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 60s
  deploy:
    mode: global  # Deploy on all nodes
    resources:
      limits:
        memory: 512M
        cpus: '0.5'
      reservations:
        memory: 256M
        cpus: '0.1'
|
||||
|
||||
# Falco Sidekick - fans Falco events out to the web UI, Prometheus and Slack.
falco-sidekick:
  image: falcosecurity/falcosidekick:2.28.0
  environment:
    WEBUI_URL: http://falco-sidekick-ui:2802
    PROMETHEUS_URL: http://prometheus:9090
    # Empty when SLACK_WEBHOOK_URL is not set at deploy time.
    SLACK_WEBHOOKURL: "${SLACK_WEBHOOK_URL:-}"
    SLACK_CHANNEL: "#security-alerts"
    SLACK_USERNAME: Falco
  ports:
    - "2801:2801"
  networks:
    - monitoring-network
  volumes:
    - falco_sidekick_config:/etc/falcosidekick
  healthcheck:
    test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:2801/ping
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    placement:
      constraints:
        - "node.labels.role==monitor"
    resources:
      reservations:
        cpus: '0.05'
        memory: 128M
      limits:
        cpus: '0.25'
        memory: 256M
  depends_on:
    - falco
|
||||
|
||||
# Falco Sidekick UI - web front end for security events, served through
# Traefik at security.localhost and backed by Redis.
falco-sidekick-ui:
  image: falcosecurity/falcosidekick-ui:v2.2.0
  environment:
    FALCOSIDEKICK_UI_REDIS_URL: redis://redis_master:6379
  networks:
    - monitoring-network
    - traefik-public
    - database-network
  healthcheck:
    test:
      - CMD
      - wget
      - --no-verbose
      - --tries=1
      - --spider
      - http://localhost:2802/
    interval: 30s
    timeout: 10s
    retries: 3
  deploy:
    placement:
      constraints:
        - "node.labels.role==monitor"
    resources:
      reservations:
        cpus: '0.05'
        memory: 128M
      limits:
        cpus: '0.25'
        memory: 256M
    # NOTE(review): nesting under deploy assumed (indentation lost in source) - confirm.
    labels:
      - traefik.enable=true
      - traefik.http.routers.falco-ui.rule=Host(`security.localhost`)
      - traefik.http.routers.falco-ui.entrypoints=websecure
      - traefik.http.routers.falco-ui.tls=true
      - traefik.http.services.falco-ui.loadbalancer.server.port=2802
  depends_on:
    - falco-sidekick
|
||||
|
||||
# Suricata - network intrusion detection, one task per node, capturing on
# all interfaces ("-i any") from the host network namespace.
# NOTE(review): network_mode is combined with Swarm `deploy` settings here;
# confirm host networking is actually applied by the deployment tool in use.
suricata:
  image: jasonish/suricata:7.0.2
  network_mode: host
  cap_add:
    - NET_ADMIN
    - SYS_NICE
  environment:
    SURICATA_OPTIONS: -i any
  command:
    - /usr/bin/suricata
    - -c
    - /etc/suricata/suricata.yaml
    - -i
    - any
  volumes:
    - suricata_config:/etc/suricata
    - suricata_logs:/var/log/suricata
    - suricata_rules:/var/lib/suricata/rules
  healthcheck:
    # Healthy while the PID file exists.
    test:
      - CMD
      - test
      - -f
      - /var/run/suricata.pid
    start_period: 120s
    interval: 60s
    timeout: 10s
    retries: 3
  deploy:
    mode: global
    resources:
      reservations:
        cpus: '0.1'
        memory: 512M
      limits:
        cpus: '0.5'
        memory: 1G
|
||||
|
||||
# Trivy - Vulnerability scanner: runs `trivy server` on 8080 and, once every
# 24 hours, scans up to 20 local images into JSON reports under /reports.
trivy-scanner:
  image: aquasec/trivy:0.48.3
  environment:
    - TRIVY_LISTEN=0.0.0.0:8080
    - TRIVY_CACHE_DIR=/tmp/trivy
  volumes:
    - /var/run/docker.sock:/var/run/docker.sock:ro
    - trivy_cache:/tmp/trivy
    - trivy_reports:/reports
  networks:
    - monitoring-network
  # The image's entrypoint is the trivy binary, so a `command: sh -c ...` would
  # be handed to trivy as arguments. Override the entrypoint instead.
  # "$$" is the Compose escape for a literal "$" meant for the container shell.
  # NOTE(review): the loop relies on a `docker` CLI inside the container;
  # confirm the image provides one, otherwise no image is ever scanned.
  entrypoint:
    - sh
    - -c
    - |
      # Start Trivy server
      trivy server --listen 0.0.0.0:8080 &

      # Automated scanning loop
      while true; do
        echo "[$$(date)] Starting vulnerability scan..."

        # Scan all running images
        docker images --format '{{.Repository}}:{{.Tag}}' |
          grep -v '<none>' |
          head -20 |
          while read -r image; do
            # Double quotes so the image name is actually expanded.
            echo "Scanning: $$image"
            trivy image --format json \
              --output "/reports/scan-$$(echo "$$image" | tr '/:' '_')-$$(date +%Y%m%d).json" \
              "$$image" || true
          done

        # Wait 24 hours before next scan
        sleep 86400
      done
  healthcheck:
    # /healthz is the server's liveness endpoint.
    test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080/healthz"]
    interval: 60s
    timeout: 15s
    retries: 3
    start_period: 60s
  deploy:
    resources:
      limits:
        memory: 2G
        cpus: '1.0'
      reservations:
        memory: 1G
        cpus: '0.25'
    placement:
      constraints:
        - "node.labels.role==monitor"
|
||||
|
||||
# ClamAV - antivirus daemon with signature updates; Docker volumes from the
# host are mounted read-only at /scan.
# NOTE(review): the healthcheck only runs `clamdscan --version`; confirm that
# is a sufficient signal that clamd itself is up.
clamav:
  image: clamav/clamav:1.2.1
  environment:
    CLAMAV_NO_CLAMD: "false"
    CLAMAV_NO_FRESHCLAMD: "false"
  networks:
    - monitoring-network
  volumes:
    - clamav_db:/var/lib/clamav
    - clamav_logs:/var/log/clamav
    - /var/lib/docker/volumes:/scan:ro  # Mount volumes for scanning
  healthcheck:
    test:
      - CMD
      - clamdscan
      - --version
    start_period: 300s  # Allow time for signature updates
    interval: 300s
    timeout: 30s
    retries: 3
  deploy:
    placement:
      constraints:
        - "node.labels.role==monitor"
    resources:
      reservations:
        cpus: '0.25'
        memory: 1G
      limits:
        cpus: '1.0'
        memory: 2G
|
||||
|
||||
# Security metrics exporter: writes a small Python exporter at start-up and
# serves Prometheus metrics on 8888 from the Falco log and Trivy reports.
# NOTE(review): business-metrics in comprehensive-monitoring.yml also publishes
# 8888; confirm the two stacks are not deployed with the same published port.
security-metrics-exporter:
  image: alpine:3.18
  # List form plus a literal block keeps the heredoc body and its terminator
  # at column 0 once YAML strips the common indent, so Python sees unindented
  # top-level statements and the shell sees a bare PYEOF line.
  command:
    - sh
    - -c
    - |
      apk add --no-cache python3 py3-pip &&
      pip3 install prometheus_client &&
      # /app is not a mounted volume, so it has to be created first.
      mkdir -p /app &&
      cat > /app/security_metrics.py << 'PYEOF'
      import json
      import os
      import time

      from prometheus_client import Counter, Gauge, start_http_server

      FALCO_LOG = '/var/log/falco/falco.log'
      REPORTS_DIR = '/reports'

      # Prometheus metrics
      falco_alerts = Counter('falco_security_alerts_total', 'Total Falco security alerts', ['rule', 'priority'])
      vuln_count = Gauge('trivy_vulnerabilities_total', 'Total vulnerabilities found', ['severity', 'image'])
      clamav_threats = Counter('clamav_threats_total', 'Total threats detected by ClamAV')
      suricata_alerts = Counter('suricata_network_alerts_total', 'Total network alerts from Suricata')

      # Byte offset of the Falco log already counted, so each alert line is
      # counted once instead of on every collection pass.
      falco_offset = 0


      def collect_falco_metrics():
          """Count new 'Alert' lines appended to the Falco log since the last pass."""
          global falco_offset
          try:
              if not os.path.exists(FALCO_LOG):
                  return
              if os.path.getsize(FALCO_LOG) < falco_offset:
                  falco_offset = 0  # log was truncated or rotated
              # Binary mode so tell() is usable after iterating.
              with open(FALCO_LOG, 'rb') as log:
                  log.seek(falco_offset)
                  for line in log:
                      if b'Alert' in line:
                          # Rule and priority are not parsed yet.
                          falco_alerts.labels(rule='unknown', priority='info').inc()
                  falco_offset = log.tell()
          except Exception as e:
              print(f'Error collecting Falco metrics: {e}')


      def collect_trivy_metrics():
          """Rebuild the vulnerability gauge from the JSON reports on disk."""
          try:
              totals = {}
              if os.path.isdir(REPORTS_DIR):
                  for filename in os.listdir(REPORTS_DIR):
                      if not filename.endswith('.json'):
                          continue
                      try:
                          with open(os.path.join(REPORTS_DIR, filename)) as f:
                              data = json.load(f)
                      except (OSError, ValueError) as e:
                          # Unreadable or partially written report: skip it only.
                          print(f'Skipping Trivy report {filename}: {e}')
                          continue
                      image = data.get('ArtifactName', 'unknown')
                      for result in data.get('Results') or []:
                          for vuln in result.get('Vulnerabilities') or []:
                              key = (vuln.get('Severity', 'unknown').lower(), image)
                              totals[key] = totals.get(key, 0) + 1
              # A gauge holds the current total: reset and set it rather than
              # incrementing, which made the value grow on every pass.
              vuln_count.clear()
              for (severity, image), total in totals.items():
                  vuln_count.labels(severity=severity, image=image).set(total)
          except Exception as e:
              print(f'Error collecting Trivy metrics: {e}')


      # Start metrics server
      start_http_server(8888)
      print('Security metrics server started on port 8888')

      # Collection loop
      while True:
          collect_falco_metrics()
          collect_trivy_metrics()
          time.sleep(60)
      PYEOF
      exec python3 /app/security_metrics.py
  volumes:
    - falco_logs:/var/log/falco:ro
    - trivy_reports:/reports:ro
    - clamav_logs:/var/log/clamav:ro
    - suricata_logs:/var/log/suricata:ro
  networks:
    - monitoring-network
  ports:
    - "8888:8888"  # Prometheus metrics endpoint
  deploy:
    resources:
      limits:
        memory: 256M
        cpus: '0.25'
      reservations:
        memory: 128M
        cpus: '0.05'
    placement:
      constraints:
        - "node.labels.role==monitor"
|
||||
|
||||
# Named volumes for the security monitoring stack.
volumes:
  # Falco
  falco_rules:
    driver: local
  falco_logs:
    driver: local
  falco_sidekick_config:
    driver: local

  # Suricata
  # NOTE(review): suricata_config binds to a path inside one user's home
  # directory; confirm it exists on every node that runs suricata.
  suricata_config:
    driver: local
    driver_opts:
      device: /home/jonathan/Coding/HomeAudit/stacks/monitoring/suricata-config
      o: bind
      type: none
  suricata_logs:
    driver: local
  suricata_rules:
    driver: local

  # Trivy
  trivy_cache:
    driver: local
  trivy_reports:
    driver: local

  # ClamAV
  clamav_db:
    driver: local
  clamav_logs:
    driver: local
|
||||
|
||||
# Networks are pre-existing (external); this stack only attaches to them.
networks:
  database-network:
    external: true
  monitoring-network:
    external: true
  traefik-public:
    external: true
|
||||
193
stacks/monitoring/traefik-monitoring.yml
Normal file
193
stacks/monitoring/traefik-monitoring.yml
Normal file
@@ -0,0 +1,193 @@
|
||||
version: '3.9'
|
||||
|
||||
services:
|
||||
# Prometheus behind Traefik with basic auth; single replica on a manager node.
# NOTE(review): the image is unpinned (:latest) and the basic-auth hash is
# committed in this file - confirm both are acceptable.
prometheus:
  image: prom/prometheus:latest
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--storage.tsdb.retention.time=30d"
    - "--web.console.libraries=/etc/prometheus/console_libraries"
    - "--web.console.templates=/etc/prometheus/consoles"
    - "--web.enable-lifecycle"
    - "--web.enable-admin-api"
  networks:
    - monitoring
    - traefik-public
  volumes:
    - prometheus_data:/prometheus
    - prometheus_config:/etc/prometheus
  deploy:
    replicas: 1
    mode: replicated
    placement:
      constraints:
        - node.role == manager
    resources:
      reservations:
        memory: 512M
      limits:
        memory: 1G
    # NOTE(review): nesting under deploy assumed (indentation lost in source) - confirm.
    labels:
      - traefik.enable=true
      - traefik.docker.network=traefik-public
      - traefik.http.routers.prometheus.rule=Host(`prometheus.${DOMAIN:-localhost}`)
      - traefik.http.routers.prometheus.entrypoints=websecure
      - traefik.http.routers.prometheus.tls=true
      - traefik.http.routers.prometheus.tls.certresolver=letsencrypt
      - traefik.http.routers.prometheus.middlewares=prometheus-auth,security-headers
      # "$$" escapes "$" for Compose so the bcrypt hash reaches Traefik intact.
      - traefik.http.middlewares.prometheus-auth.basicauth.users=admin:$$2y$$10$$xvzBkbKKvRX.jGG6F7L.ReEMyEx.7BkqNGQO2rFt/1aBgx8jPElXW
      - traefik.http.services.prometheus.loadbalancer.server.port=9090
|
||||
|
||||
# Grafana behind Traefik at grafana.${DOMAIN}; sign-up, gravatar and
# analytics are disabled.
grafana:
  image: grafana/grafana:latest
  environment:
    GF_SECURITY_ADMIN_USER: admin
    # The admin password is no longer committed in plain text; it must be
    # supplied through the deploy environment, and deployment fails with the
    # message below when it is missing.
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_SECURITY_DISABLE_GRAVATAR: "true"
    GF_ANALYTICS_REPORTING_ENABLED: "false"
    GF_ANALYTICS_CHECK_FOR_UPDATES: "false"
  volumes:
    - grafana_data:/var/lib/grafana
    # NOTE(review): this bind-backed volume replaces the whole /etc/grafana
    # directory; confirm the host directory provides grafana.ini.
    - grafana_config:/etc/grafana
  networks:
    - monitoring
    - traefik-public
  deploy:
    mode: replicated
    replicas: 1
    resources:
      limits:
        memory: 512M
      reservations:
        memory: 256M
    # NOTE(review): nesting under deploy assumed (indentation lost in source) - confirm.
    labels:
      - traefik.enable=true
      - traefik.docker.network=traefik-public
      - traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN:-localhost}`)
      - traefik.http.routers.grafana.entrypoints=websecure
      - traefik.http.routers.grafana.tls=true
      - traefik.http.routers.grafana.tls.certresolver=letsencrypt
      - traefik.http.routers.grafana.middlewares=security-headers
      - traefik.http.services.grafana.loadbalancer.server.port=3000
|
||||
|
||||
# Alertmanager behind Traefik with basic auth; single replica.
alertmanager:
  image: prom/alertmanager:latest
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
  networks:
    - monitoring
    - traefik-public
  volumes:
    - alertmanager_data:/alertmanager
    - alertmanager_config:/etc/alertmanager
  deploy:
    replicas: 1
    mode: replicated
    resources:
      reservations:
        memory: 128M
      limits:
        memory: 256M
    # NOTE(review): nesting under deploy assumed (indentation lost in source) - confirm.
    labels:
      - traefik.enable=true
      - traefik.docker.network=traefik-public
      - traefik.http.routers.alertmanager.rule=Host(`alertmanager.${DOMAIN:-localhost}`)
      - traefik.http.routers.alertmanager.entrypoints=websecure
      - traefik.http.routers.alertmanager.tls=true
      - traefik.http.routers.alertmanager.tls.certresolver=letsencrypt
      - traefik.http.routers.alertmanager.middlewares=alertmanager-auth,security-headers
      # "$$" escapes "$" for Compose so the bcrypt hash reaches Traefik intact.
      - traefik.http.middlewares.alertmanager-auth.basicauth.users=admin:$$2y$$10$$xvzBkbKKvRX.jGG6F7L.ReEMyEx.7BkqNGQO2rFt/1aBgx8jPElXW
      - traefik.http.services.alertmanager.loadbalancer.server.port=9093
|
||||
|
||||
# Loki: single-replica log store on the internal monitoring network only.
loki:
  image: grafana/loki:latest
  command: '-config.file=/etc/loki/local-config.yaml'
  networks:
    - monitoring
  volumes:
    - loki_data:/loki
  deploy:
    replicas: 1
    mode: replicated
    resources:
      reservations:
        memory: 256M
      limits:
        memory: 512M
|
||||
|
||||
# Promtail: one task per node; reads host logs and the Traefik log directory.
promtail:
  image: grafana/promtail:latest
  command: '-config.file=/etc/promtail/config.yml'
  networks:
    - monitoring
  volumes:
    - /var/log:/var/log:ro
    - /opt/traefik/logs:/traefik-logs:ro
    - promtail_config:/etc/promtail
  deploy:
    mode: global
    resources:
      reservations:
        memory: 64M
      limits:
        memory: 128M
|
||||
|
||||
# Every volume in this stack is a bind-backed local volume under
# /opt/monitoring on the host.
volumes:
  # Prometheus
  prometheus_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/data
      o: bind
      type: none
  prometheus_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/prometheus/config
      o: bind
      type: none

  # Grafana
  grafana_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/data
      o: bind
      type: none
  grafana_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/grafana/config
      o: bind
      type: none

  # Alertmanager
  alertmanager_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/alertmanager/data
      o: bind
      type: none
  alertmanager_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/alertmanager/config
      o: bind
      type: none

  # Loki / Promtail
  loki_data:
    driver: local
    driver_opts:
      device: /opt/monitoring/loki/data
      o: bind
      type: none
  promtail_config:
    driver: local
    driver_opts:
      device: /opt/monitoring/promtail/config
      o: bind
      type: none
|
||||
|
||||
# "monitoring" is created by this stack as an attachable overlay network;
# "traefik-public" already exists and is only joined.
networks:
  traefik-public:
    external: true
  monitoring:
    attachable: true
    driver: overlay
|
||||
Reference in New Issue
Block a user