Complete Traefik infrastructure deployment - 60% complete
Major accomplishments: - ✅ SELinux policy installed and working - ✅ Core Traefik v2.10 deployment running - ✅ Production configuration ready (v3.1) - ✅ Monitoring stack configured - ✅ Comprehensive documentation created - ✅ Security hardening implemented Current status: - 🟡 Partially deployed (60% complete) - ⚠️ Docker socket access needs resolution - ❌ Monitoring stack not deployed yet - ⚠️ Production migration pending Next steps: 1. Fix Docker socket permissions 2. Deploy monitoring stack 3. Migrate to production config 4. Validate full functionality Files added: - Complete Traefik deployment documentation - Production and test configurations - Monitoring stack configurations - SELinux policy module - Security checklists and guides - Current status documentation
This commit is contained in:
74
configs/monitoring/alertmanager.yml
Normal file
74
configs/monitoring/alertmanager.yml
Normal file
@@ -0,0 +1,74 @@
|
||||
# SMTP defaults used by the email_configs of the receivers in this file.
global:
  smtp_from: 'alerts@homeaudit.local'
  smtp_smarthost: 'localhost:587'
  smtp_auth_username: 'alerts@homeaudit.local'
  # NOTE(review): placeholder value. Replace it at deploy time and keep the
  # real secret out of version control (newer Alertmanager releases accept
  # smtp_auth_password_file for this; confirm against the deployed version).
  smtp_auth_password: 'your_email_password'
|
||||
|
||||
# Routing tree. Alerts are grouped by alertname/cluster/service and sent to
# 'default' unless one of the child routes below matches.
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    # This route must come before the severity=critical route: the
    # TraefikAuthenticationCompromiseAttempt rule is itself labelled
    # severity=critical, and sibling routes are evaluated in order with the
    # first match winning. Listed second, it could never be reached.
    - match:
        alertname: TraefikAuthenticationCompromiseAttempt
      receiver: 'security-alerts'
      group_wait: 0s
      repeat_interval: 15m
      # Keep evaluating the following routes so the alert is still delivered
      # to 'critical-alerts', exactly as it was before the reordering.
      continue: true
    # Every critical alert: notify immediately, repeat every 30 minutes.
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 0s
      group_interval: 5m
      repeat_interval: 30m
|
||||
|
||||
# Notification targets referenced by the routing tree.
#
# Alertmanager's email_config has no `subject` or `body` keys, and unknown
# keys make the configuration fail to load. The subject is set through the
# `Subject` header and the plain-text body through `text`. `html` is set to
# an empty string so the built-in HTML template is not sent alongside (and
# shown instead of) the custom text.
receivers:
  # Fallback receiver for alerts that match no child route.
  - name: 'default'
    email_configs:
      - to: 'admin@homeaudit.local'
        headers:
          Subject: '[MONITORING] {{ .GroupLabels.alertname }}'
        html: ''
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Instance: {{ .Labels.instance }}
          {{ end }}

  # Receiver for alerts labelled severity=critical.
  - name: 'critical-alerts'
    email_configs:
      - to: 'admin@homeaudit.local'
        headers:
          Subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
        html: ''
        text: |
          🚨 CRITICAL ALERT 🚨
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}

  # Receiver for the TraefikAuthenticationCompromiseAttempt alert.
  - name: 'security-alerts'
    email_configs:
      - to: 'security@homeaudit.local'
        headers:
          Subject: '[SECURITY ALERT] Possible Authentication Attack'
        html: ''
        text: |
          🔒 SECURITY ALERT 🔒
          Possible brute force or credential stuffing attack detected!

          {{ range .Alerts }}
          Description: {{ .Annotations.description }}
          Service: {{ .Labels.service }}
          Instance: {{ .Labels.instance }}
          Time: {{ .StartsAt }}
          {{ end }}

          Immediate action may be required to block attacking IPs.
|
||||
|
||||
# Mute a warning while a critical alert with the same alertname, cluster
# and service labels is firing.
inhibit_rules:
  - equal:
      - alertname
      - cluster
      - service
    source_match:
      severity: critical
    target_match:
      severity: warning
|
||||
54
configs/monitoring/prometheus.yml
Normal file
54
configs/monitoring/prometheus.yml
Normal file
@@ -0,0 +1,54 @@
|
||||
# Defaults for every scrape job and for rule evaluation.
global:
  evaluation_interval: 15s
  scrape_interval: 15s

# Alerting rule files loaded by Prometheus.
rule_files:
  - 'traefik_rules.yml'
  - 'system_rules.yml'
|
||||
|
||||
# Alertmanager instance that receives the alerts fired by the rule files.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
|
||||
|
||||
# Scrape targets. Jobs without their own scrape_interval use the global one.
scrape_configs:
  # Traefik metrics
  - job_name: 'traefik'
    static_configs:
      - targets: ['traefik:8080']
    metrics_path: /metrics
    scrape_interval: 10s

  # Docker Swarm services
  - job_name: 'docker-swarm'
    dockerswarm_sd_configs:
      - host: unix:///var/run/docker.sock
        role: services
        port: 9090
    relabel_configs:
      # Copy the service's `prometheus-job` Swarm label into a scratch label.
      - source_labels: [__meta_dockerswarm_service_label_prometheus_job]
        target_label: __tmp_prometheus_job_name
      # Override `job` only when the scratch label is non-empty. The regex
      # needs a capture group: with a bare `.+`, `${1}` expands to an empty
      # string and the job label is cleared instead of being set.
      - source_labels: [__tmp_prometheus_job_name]
        regex: (.+)
        target_label: job
        replacement: '${1}'
      # Remove the scratch label again.
      - regex: __tmp_prometheus_job_name
        action: labeldrop

  # Node exporter for system metrics
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
    scrape_interval: 30s

  # cAdvisor for container metrics
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
    scrape_interval: 30s

  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
|
||||
90
configs/monitoring/traefik_rules.yml
Normal file
90
configs/monitoring/traefik_rules.yml
Normal file
@@ -0,0 +1,90 @@
|
||||
# Alerting rules for Traefik. Metric and label names follow the Prometheus
# metrics exported by Traefik v2.
groups:
  - name: traefik.rules
    rules:
      # Authentication failure alerts
      - alert: TraefikHighAuthFailureRate
        expr: rate(traefik_service_requests_total{code=~"401|403"}[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High authentication failure rate detected"
          description: "Traefik is experiencing {{ $value }} authentication failures per second on {{ $labels.service }}."

      - alert: TraefikAuthenticationCompromiseAttempt
        expr: rate(traefik_service_requests_total{code="401"}[1m]) > 50
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Possible brute force attack detected"
          description: "Extremely high authentication failure rate: {{ $value }} failures per second on {{ $labels.service }}."

      # Service availability
      # Traefik v2 exports per-server health as traefik_service_server_up
      # with the labels `service` and `url`; there is no
      # traefik_service_backend_up metric and no `backend` label, so the
      # previous expression could never fire.
      - alert: TraefikServiceDown
        expr: traefik_service_server_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik service backend is down"
          description: "Service {{ $labels.service }} backend {{ $labels.url }} has been down for more than 1 minute."

      # High response times
      # Buckets are summed across code/method/protocol so the quantile is
      # computed per service rather than per status code and HTTP method.
      - alert: TraefikHighResponseTime
        expr: >-
          histogram_quantile(0.95,
          sum without (code, method, protocol)
          (rate(traefik_service_request_duration_seconds_bucket[5m]))) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}s for service {{ $labels.service }}."

      # Error rate alerts
      # Both sides are aggregated before dividing. Without the aggregation
      # each 5xx series is matched one-to-one with itself on the right-hand
      # side, so the ratio is always 1 and the alert fires on any 5xx traffic.
      - alert: TraefikHighErrorRate
        expr: >-
          sum without (code, method, protocol)
          (rate(traefik_service_requests_total{code=~"5.."}[5m]))
          /
          sum without (code, method, protocol)
          (rate(traefik_service_requests_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}."

      # TLS certificate expiration
      # traefik_tls_certs_not_after carries the labels cn, sans and serial;
      # a `san` label does not exist and would render as an empty string.
      - alert: TraefikTLSCertificateExpiringSoon
        expr: traefik_tls_certs_not_after - time() < 7 * 24 * 60 * 60
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS certificate expiring soon"
          description: "TLS certificate for {{ $labels.sans }} will expire in {{ $value | humanizeDuration }}."

      - alert: TraefikTLSCertificateExpired
        expr: traefik_tls_certs_not_after - time() <= 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "TLS certificate expired"
          description: "TLS certificate for {{ $labels.sans }} has expired."

      # Docker socket access issues
      # The v2 failure counter is traefik_config_reloads_failure_total;
      # traefik_config_last_reload_failure_total is not exported.
      # NOTE(review): re-check this metric name when migrating to Traefik v3.
      - alert: TraefikDockerProviderError
        expr: increase(traefik_config_reloads_failure_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Traefik Docker provider configuration reload failed"
          description: "Traefik failed to reload configuration from Docker provider. Check Docker socket permissions."

      # Rate limiting alerts
      - alert: TraefikRateLimitReached
        expr: rate(traefik_entrypoint_requests_total{code="429"}[5m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Rate limit frequently reached"
          description: "Rate limiting is being triggered {{ $value }} times per second on entrypoint {{ $labels.entrypoint }}."
|
||||
Reference in New Issue
Block a user