# Major accomplishments:
# - ✅ SELinux policy installed and working
# - ✅ Core Traefik v2.10 deployment running
# - ✅ Production configuration ready (v3.1)
# - ✅ Monitoring stack configured
# - ✅ Comprehensive documentation created
# - ✅ Security hardening implemented
#
# Current status:
# - 🟡 Partially deployed (60% complete)
# - ⚠️ Docker socket access needs resolution
# - ❌ Monitoring stack not deployed yet
# - ⚠️ Production migration pending
#
# Next steps:
# 1. Fix Docker socket permissions
# 2. Deploy monitoring stack
# 3. Migrate to production config
# 4. Validate full functionality
#
# Files added:
# - Complete Traefik deployment documentation
# - Production and test configurations
# - Monitoring stack configurations
# - SELinux policy module
# - Security checklists and guides
# - Current status documentation
90 lines
3.5 KiB
YAML
90 lines
3.5 KiB
YAML
---
# Prometheus alerting rules for Traefik.
#
# One rule group covering authentication failures, backend availability,
# latency, error ratio, TLS certificate expiry, configuration reload failures
# and rate limiting. Request-based expressions are aggregated with `sum by`
# so that each threshold applies to the service (or entrypoint) named in the
# alert description rather than to a single code/method/protocol series.
#
# NOTE(review): metric and label names follow the Traefik v2 Prometheus
# exporter; confirm them against the /metrics output of the deployed version.
groups:
  - name: traefik.rules
    rules:
      # Authentication failure alerts
      # Sustained 401/403 responses, summed per service.
      - alert: TraefikHighAuthFailureRate
        expr: >-
          sum by (service) (
          rate(traefik_service_requests_total{code=~"401|403"}[5m])
          ) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate detected"
          description: >-
            Traefik is experiencing {{ $value }} authentication failures
            per second on {{ $labels.service }}.

      # Short window and short `for:` so a burst of 401s pages quickly.
      - alert: TraefikAuthenticationCompromiseAttempt
        expr: >-
          sum by (service) (
          rate(traefik_service_requests_total{code="401"}[1m])
          ) > 50
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Possible brute force attack detected"
          description: >-
            Extremely high authentication failure rate: {{ $value }}
            failures per second on {{ $labels.service }}.

      # Service availability
      # traefik_service_server_up is the per-backend-server health gauge
      # (labels: service, url); 0 means the server is considered down.
      - alert: TraefikServiceDown
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik service backend is down"
          description: >-
            Service {{ $labels.service }} backend {{ $labels.url }} has been
            down for more than 1 minute.

      # High response times
      # Buckets are summed by (le, service) before taking the quantile so the
      # result is one p95 per service, not one per code/method/protocol.
      - alert: TraefikHighResponseTime
        expr: >-
          histogram_quantile(0.95, sum by (le, service) (
          rate(traefik_service_request_duration_seconds_bucket[5m])
          )) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: >-
            95th percentile response time is {{ $value }}s for service
            {{ $labels.service }}.

      # Error rate alerts
      # Both sides are aggregated by service: without this the division
      # matches each 5xx series against itself (same `code` label) and the
      # ratio is always 1 whenever any 5xx occurs.
      - alert: TraefikHighErrorRate
        expr: >-
          sum by (service) (
          rate(traefik_service_requests_total{code=~"5.."}[5m])
          )
          /
          sum by (service) (
          rate(traefik_service_requests_total[5m])
          ) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: >-
            Error rate is {{ $value | humanizePercentage }} for service
            {{ $labels.service }}.

      # TLS certificate expiration
      # Fires when fewer than 7 days (expressed in seconds) remain.
      # The certificate metric exposes `cn`, `serial` and `sans` labels.
      - alert: TraefikTLSCertificateExpiringSoon
        expr: traefik_tls_certs_not_after - time() < 7 * 24 * 60 * 60
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS certificate expiring soon"
          description: >-
            TLS certificate for {{ $labels.sans }} will expire in
            {{ $value | humanizeDuration }}.

      - alert: TraefikTLSCertificateExpired
        expr: traefik_tls_certs_not_after - time() <= 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "TLS certificate expired"
          description: "TLS certificate for {{ $labels.sans }} has expired."

      # Docker socket access issues
      # Counts failed configuration reloads over the last 5 minutes.
      # NOTE(review): this counter is global, not Docker-provider specific,
      # and is presumably absent in Traefik v3 — confirm before migrating.
      - alert: TraefikDockerProviderError
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik Docker provider configuration reload failed"
          description: >-
            Traefik failed to reload configuration from Docker provider.
            Check Docker socket permissions.

      # Rate limiting alerts
      # 429 responses per second, summed per entrypoint.
      - alert: TraefikRateLimitReached
        expr: >-
          sum by (entrypoint) (
          rate(traefik_entrypoint_requests_total{code="429"}[5m])
          ) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Rate limit frequently reached"
          description: >-
            Rate limiting is being triggered {{ $value }} times per second
            on entrypoint {{ $labels.entrypoint }}.