---
# Prometheus alerting rules for a Traefik (v2+ metric names) reverse proxy.
#
# Request-based rules aggregate with `sum by (...)` so each threshold applies
# to the whole service/entrypoint rather than to every individual
# (code, method, protocol) series Traefik exports.
groups:
  - name: traefik.rules
    rules:
      # Authentication failure alerts
      # Combined 401/403 responses per second, per service.
      - alert: TraefikHighAuthFailureRate
        expr: >-
          sum by (service)
          (rate(traefik_service_requests_total{code=~"401|403"}[5m]))
          > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate detected"
          description: >-
            Traefik is experiencing {{ $value }} authentication failures
            per second on {{ $labels.service }}.

      # Short window and short `for:` so a burst of 401s is reported quickly.
      - alert: TraefikAuthenticationCompromiseAttempt
        expr: >-
          sum by (service)
          (rate(traefik_service_requests_total{code="401"}[1m]))
          > 50
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Possible brute force attack detected"
          description: >-
            Extremely high authentication failure rate: {{ $value }} failures
            per second on {{ $labels.service }}.

      # Service availability
      # traefik_service_server_up is 0 for a down server and carries the
      # `service` and `url` labels.
      # NOTE(review): the gauge is presumably only exported for services with
      # a health check configured - confirm for the services you rely on.
      - alert: TraefikServiceDown
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik service backend is down"
          description: >-
            Service {{ $labels.service }} backend {{ $labels.url }} has been
            down for more than 1 minute.

      # High response times
      # Buckets are summed per service (keeping `le`) before the quantile is
      # taken, so the result is one p95 per service.
      - alert: TraefikHighResponseTime
        expr: >-
          histogram_quantile(0.95,
          sum by (le, service)
          (rate(traefik_service_request_duration_seconds_bucket[5m])))
          > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: >-
            95th percentile response time is {{ $value }}s for service
            {{ $labels.service }}.

      # Error rate alerts
      # Both sides must be aggregated to the same label set: dividing the raw
      # series matches on `code` too, which divides each 5xx series by itself
      # and always yields a ratio of 1.
      - alert: TraefikHighErrorRate
        expr: >-
          sum by (service)
          (rate(traefik_service_requests_total{code=~"5.."}[5m]))
          /
          sum by (service)
          (rate(traefik_service_requests_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: >-
            Error rate is {{ $value | humanizePercentage }} for service
            {{ $labels.service }}.

      # TLS certificate expiration
      # traefik_tls_certs_not_after is a Unix timestamp labelled with `cn`,
      # `sans` and `serial`; the value of the expression is seconds remaining.
      - alert: TraefikTLSCertificateExpiringSoon
        expr: traefik_tls_certs_not_after - time() < 7 * 24 * 60 * 60
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS certificate expiring soon"
          description: >-
            TLS certificate for {{ $labels.cn }} will expire in
            {{ $value | humanizeDuration }}.

      - alert: TraefikTLSCertificateExpired
        expr: traefik_tls_certs_not_after - time() <= 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "TLS certificate expired"
          description: "TLS certificate for {{ $labels.cn }} has expired."

      # Docker socket access issues
      # Fires when the failed-reload counter increased in the last 5 minutes.
      # NOTE(review): the counter is not labelled by provider, so this covers
      # every configuration source, not only Docker. Verify that the deployed
      # Traefik version still exports traefik_config_reloads_failure_total.
      - alert: TraefikDockerProviderError
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik Docker provider configuration reload failed"
          description: >-
            Traefik failed to reload configuration from Docker provider.
            Check Docker socket permissions.

      # Rate limiting alerts
      # HTTP 429 responses per second, per entrypoint.
      - alert: TraefikRateLimitReached
        expr: >-
          sum by (entrypoint)
          (rate(traefik_entrypoint_requests_total{code="429"}[5m]))
          > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Rate limit frequently reached"
          description: >-
            Rate limiting is being triggered {{ $value }} times per second on
            entrypoint {{ $labels.entrypoint }}.