Files
HomeAudit/configs/monitoring/traefik_rules.yml
admin 9ea31368f5 Complete Traefik infrastructure deployment - 60% complete
Major accomplishments:
- SELinux policy installed and working
- Core Traefik v2.10 deployment running
- Production configuration ready (v3.1)
- Monitoring stack configured
- Comprehensive documentation created
- Security hardening implemented

Current status:
- 🟡 Partially deployed (60% complete)
- ⚠️ Docker socket access needs resolution
- Monitoring stack not deployed yet
- ⚠️ Production migration pending

Next steps:
1. Fix Docker socket permissions
2. Deploy monitoring stack
3. Migrate to production config
4. Validate full functionality

Files added:
- Complete Traefik deployment documentation
- Production and test configurations
- Monitoring stack configurations
- SELinux policy module
- Security checklists and guides
- Current status documentation
2025-08-28 15:22:41 -04:00

90 lines
3.5 KiB
YAML

# Prometheus alerting rules for Traefik.
#
# Expressions aggregate with `sum by (...)` before comparing against a
# threshold, so each alert evaluates one series per service/entrypoint
# (matching the labels used in its description) instead of one series per
# code/method/protocol combination.
groups:
  - name: traefik.rules
    rules:
      # Authentication failure alerts
      - alert: TraefikHighAuthFailureRate
        # Combined 401 + 403 responses per second, per service.
        expr: |
          sum by (service) (
            rate(traefik_service_requests_total{code=~"401|403"}[5m])
          ) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate detected"
          description: >-
            Traefik is experiencing {{ $value }} authentication failures
            per second on {{ $labels.service }}.

      - alert: TraefikAuthenticationCompromiseAttempt
        # NOTE(review): a 1m rate window needs at least two scrapes inside
        # it; confirm the scrape interval is 30s or shorter.
        expr: |
          sum by (service) (
            rate(traefik_service_requests_total{code="401"}[1m])
          ) > 50
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Possible brute force attack detected"
          description: >-
            Extremely high authentication failure rate: {{ $value }} failures
            per second on {{ $labels.service }}.

      # Service availability
      - alert: TraefikServiceDown
        # Traefik v2+ reports backend health as traefik_service_server_up,
        # labelled with `service` and `url` (there is no `backend` label).
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik service backend is down"
          description: >-
            Service {{ $labels.service }} server {{ $labels.url }} has been
            down for more than 1 minute.

      # High response times
      - alert: TraefikHighResponseTime
        # Buckets are summed per service (keeping `le`) before the quantile
        # is taken, giving one p95 per service.
        expr: |
          histogram_quantile(
            0.95,
            sum by (le, service) (
              rate(traefik_service_request_duration_seconds_bucket[5m])
            )
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: >-
            95th percentile response time is {{ $value }}s for service
            {{ $labels.service }}.

      # Error rate alerts
      - alert: TraefikHighErrorRate
        # Both sides are aggregated per service. Dividing the raw series
        # would match on the `code` label too, so every 5xx series would be
        # divided by itself and the ratio would always be 1.
        expr: |
          sum by (service) (
            rate(traefik_service_requests_total{code=~"5.."}[5m])
          )
          /
          sum by (service) (
            rate(traefik_service_requests_total[5m])
          )
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: >-
            Error rate is {{ $value | humanizePercentage }} for service
            {{ $labels.service }}.

      # TLS certificate expiration
      # The certificate metric is labelled with `cn`, `sans` and `serial`.
      - alert: TraefikTLSCertificateExpiringSoon
        # Fires when fewer than 7 days (in seconds) remain.
        expr: traefik_tls_certs_not_after - time() < 7 * 24 * 60 * 60
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS certificate expiring soon"
          description: >-
            TLS certificate for {{ $labels.cn }} (SANs: {{ $labels.sans }})
            will expire in {{ $value | humanizeDuration }}.

      - alert: TraefikTLSCertificateExpired
        expr: traefik_tls_certs_not_after - time() <= 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "TLS certificate expired"
          description: >-
            TLS certificate for {{ $labels.cn }} (SANs: {{ $labels.sans }})
            has expired.

      # Docker socket access issues
      - alert: TraefikDockerProviderError
        # Counts failed configuration reloads in the last 5 minutes.
        # NOTE(review): the reload-failure counter appears to have been
        # dropped in newer Traefik releases - confirm the deployed version
        # still exports it, otherwise this alert can never fire.
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik Docker provider configuration reload failed"
          description: >-
            Traefik failed to reload configuration from Docker provider.
            Check Docker socket permissions.

      # Rate limiting alerts
      - alert: TraefikRateLimitReached
        # HTTP 429 responses per second, per entrypoint.
        expr: |
          sum by (entrypoint) (
            rate(traefik_entrypoint_requests_total{code="429"}[5m])
          ) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Rate limit frequently reached"
          description: >-
            Rate limiting is being triggered {{ $value }} times per second
            on entrypoint {{ $labels.entrypoint }}.