Traefik infrastructure deployment - 60% complete

Major accomplishments:
- ✅ SELinux policy installed and working
- ✅ Core Traefik v2.10 deployment running
- ✅ Production configuration ready (v3.1)
- ✅ Monitoring stack configured
- ✅ Comprehensive documentation created
- ✅ Security hardening implemented

Current status:
- 🟡 Partially deployed (60% complete)
- ⚠️ Docker socket access needs resolution
- ❌ Monitoring stack not deployed yet
- ⚠️ Production migration pending

Next steps:
1. Fix Docker socket permissions
2. Deploy monitoring stack
3. Migrate to production config
4. Validate full functionality

Files added:
- Complete Traefik deployment documentation
- Production and test configurations
- Monitoring stack configurations
- SELinux policy module
- Security checklists and guides
- Current status documentation
This commit is contained in:
admin
2025-08-28 15:22:41 -04:00
parent 5c1d529164
commit 9ea31368f5
72 changed files with 440075 additions and 87 deletions

View File

@@ -0,0 +1,74 @@
# Alertmanager configuration: SMTP delivery settings, a routing tree keyed on
# alert name and severity, three e-mail receivers, and one inhibit rule.
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@homeaudit.local'
  smtp_auth_username: 'alerts@homeaudit.local'
  # NOTE(review): placeholder credential. Never commit the real password;
  # inject it at deploy time (recent Alertmanager releases also accept
  # smtp_auth_password_file) -- confirm against the deployed version.
  smtp_auth_password: 'your_email_password'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    # Routes are evaluated in order and matching stops at the first hit
    # (continue defaults to false). This alert is labelled severity=critical,
    # so its dedicated route has to come before the generic critical route or
    # the security receiver is never reached.
    - match:
        alertname: TraefikAuthenticationCompromiseAttempt
      receiver: 'security-alerts'
      group_wait: 0s
      repeat_interval: 15m
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 0s
      group_interval: 5m
      repeat_interval: 30m

# E-mail receivers. Alertmanager's email_config has no subject/body keys: the
# subject is set through headers and the plain-text body through text. The
# built-in HTML part is still attached unless html is overridden as well.
receivers:
  - name: 'default'
    email_configs:
      - to: 'admin@homeaudit.local'
        headers:
          Subject: '[MONITORING] {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Instance: {{ .Labels.instance }}
          {{ end }}
  - name: 'critical-alerts'
    email_configs:
      - to: 'admin@homeaudit.local'
        headers:
          Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
        text: |
          🚨 CRITICAL ALERT 🚨
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}
  - name: 'security-alerts'
    email_configs:
      - to: 'security@homeaudit.local'
        headers:
          Subject: '[SECURITY ALERT] Possible Authentication Attack'
        text: |
          🔒 SECURITY ALERT 🔒
          Possible brute force or credential stuffing attack detected!
          {{ range .Alerts }}
          Description: {{ .Annotations.description }}
          Service: {{ .Labels.service }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}
          Immediate action may be required to block attacking IPs.

# Mute a warning while a critical alert with the same alertname, cluster and
# service is firing.
# NOTE(review): the visible Traefik rules give each alertname a single fixed
# severity, so requiring equal alertname may keep this rule from ever
# matching -- confirm the intended label set.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'cluster', 'service']

View File

@@ -0,0 +1,54 @@
# Prometheus server configuration: global timings, rule files, the
# Alertmanager endpoint, and the scrape jobs of the monitoring stack.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "traefik_rules.yml"
  - "system_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Traefik metrics
  - job_name: 'traefik'
    static_configs:
      - targets: ['traefik:8080']
    metrics_path: /metrics
    scrape_interval: 10s

  # Docker Swarm services
  - job_name: 'docker-swarm'
    dockerswarm_sd_configs:
      - host: unix:///var/run/docker.sock
        role: services
        port: 9090
    relabel_configs:
      # Use the service's "prometheus-job" Swarm label as the job label when
      # it is set. The regex needs an explicit capture group: the default
      # replacement is $1, and a pattern without a group expands to an empty
      # string, which removes the target label instead of setting it.
      - source_labels: [__meta_dockerswarm_service_label_prometheus_job]
        regex: (.+)
        target_label: job

  # Node exporter for system metrics
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
    scrape_interval: 30s

  # cAdvisor for container metrics
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
    scrape_interval: 30s

  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

View File

@@ -0,0 +1,90 @@
# Prometheus alerting rules for Traefik: authentication failures, backend
# availability, latency, error ratio, TLS certificate expiry, configuration
# reload failures and rate limiting.
groups:
  - name: traefik.rules
    rules:
      # Authentication failure alerts.
      # Rates are summed per service so the threshold applies to the service
      # as a whole rather than to each code/method series separately.
      - alert: TraefikHighAuthFailureRate
        expr: sum by (service) (rate(traefik_service_requests_total{code=~"401|403"}[5m])) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate detected"
          description: "Traefik is experiencing {{ $value }} authentication failures per second on {{ $labels.service }}."

      - alert: TraefikAuthenticationCompromiseAttempt
        expr: sum by (service) (rate(traefik_service_requests_total{code="401"}[1m])) > 50
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Possible brute force attack detected"
          description: "Extremely high authentication failure rate: {{ $value }} failures per second on {{ $labels.service }}."

      # Service availability.
      # Traefik v2 exports per-server health as traefik_service_server_up
      # with the labels "service" and "url".
      - alert: TraefikServiceDown
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik service backend is down"
          description: "Service {{ $labels.service }} backend {{ $labels.url }} has been down for more than 1 minute."

      # High response times.
      # Buckets are summed per service (keeping "le") before the quantile is
      # computed, giving one latency figure per service.
      - alert: TraefikHighResponseTime
        expr: histogram_quantile(0.95, sum by (service, le) (rate(traefik_service_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}s for service {{ $labels.service }}."

      # Error rate alerts.
      # Both sides are aggregated per service. Dividing the raw vectors would
      # match series on every label including "code", so each 5xx series
      # would be divided by itself and the ratio would always be 1.
      - alert: TraefikHighErrorRate
        expr: |
          sum by (service) (rate(traefik_service_requests_total{code=~"5.."}[5m]))
            /
          sum by (service) (rate(traefik_service_requests_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}."

      # TLS certificate expiration.
      # The certificate metric is labelled with "cn", "sans" and "serial".
      - alert: TraefikTLSCertificateExpiringSoon
        expr: traefik_tls_certs_not_after - time() < 7 * 24 * 60 * 60
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS certificate expiring soon"
          description: "TLS certificate for {{ $labels.cn }} will expire in {{ $value | humanizeDuration }}."

      - alert: TraefikTLSCertificateExpired
        expr: traefik_tls_certs_not_after - time() <= 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "TLS certificate expired"
          description: "TLS certificate for {{ $labels.cn }} has expired."

      # Docker socket access issues.
      # NOTE(review): traefik_config_reloads_failure_total is the Traefik v2
      # counter; confirm it is still exported before migrating to v3.
      - alert: TraefikDockerProviderError
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik Docker provider configuration reload failed"
          description: "Traefik failed to reload configuration from Docker provider. Check Docker socket permissions."

      # Rate limiting alerts.
      - alert: TraefikRateLimitReached
        expr: sum by (entrypoint) (rate(traefik_entrypoint_requests_total{code="429"}[5m])) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Rate limit frequently reached"
          description: "Rate limiting is being triggered {{ $value }} times per second on entrypoint {{ $labels.entrypoint }}."