---
# Prometheus alerting rules.
#
# Groups defined here:
#   vaultwarden       - availability / latency of the Vaultwarden job
#   critical_services - scrape health of the HTTP and TCP health-check jobs
#   system_resources  - CPU, memory and filesystem usage thresholds
#   monitoring        - scrape health of Prometheus and Grafana
#   ssl_certificates  - certificate expiry based on probe_ssl_earliest_cert_expiry
groups:
  - name: vaultwarden
    rules:
      - alert: VaultwardenDown
        # The vaultwarden-monitoring job also exposes probe_duration_seconds
        # (see VaultwardenHighResponseTime), so it appears to be a prober-style
        # job. For such jobs `up` only says the exporter scrape worked, not
        # that the probed target answered, so a failed probe is checked too.
        # The `up == 0` branch is kept so an unreachable exporter still alerts.
        expr: >-
          up{job="vaultwarden-monitoring"} == 0
          or
          probe_success{job="vaultwarden-monitoring"} == 0
        for: 1m
        labels:
          severity: critical
          service: vaultwarden
        annotations:
          summary: "🔴 Vaultwarden service is down"
          description: "Vaultwarden password manager service has been down for more than 1 minute"
          runbook_url: "https://grafana.pressmess.duckdns.org/d/vaultwarden"

      - alert: VaultwardenHighResponseTime
        # Fires when a probe has taken longer than 5 seconds for 2 minutes.
        expr: probe_duration_seconds{job="vaultwarden-monitoring"} > 5
        for: 2m
        labels:
          severity: warning
          service: vaultwarden
        annotations:
          summary: "⚠️ Vaultwarden response time is high"
          description: "Vaultwarden is responding slowly ({{ $value }}s)"

  - name: critical_services
    rules:
      - alert: CriticalServiceDown
        # NOTE(review): if http-service-health is a prober-style job like
        # vaultwarden-monitoring, `up` will not reflect target health -
        # confirm and consider probe_success instead.
        expr: up{job="http-service-health"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Critical service {{ $labels.instance }} is down"
          description: "Service {{ $labels.instance }} has been down for more than 2 minutes"

      - alert: DatabaseServiceDown
        # NOTE(review): same caveat as CriticalServiceDown for the
        # tcp-service-health job - confirm what `up` means for it.
        expr: up{job="tcp-service-health"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Database service {{ $labels.instance }} is down"
          description: "Database service {{ $labels.instance }} is not responding"

  - name: system_resources
    rules:
      - alert: HighCPUUsage
        # Percentage of non-idle CPU time per instance, averaged over all
        # idle-mode series. rate() averages over the whole 5m window; irate()
        # would only look at the last two samples and make the alert flap.
        expr: >-
          100 - (avg by (instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: HighMemoryUsage
        # Percentage of total memory that is not reported as available.
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: HighDiskUsage
        # Percentage of each filesystem's size that is not available.
        # NOTE(review): there is no fstype/mountpoint selector, so every
        # filesystem series is evaluated - consider excluding read-only or
        # pseudo filesystems if they cause noise.
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ High disk usage on {{ $labels.instance }}"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }}"

  - name: monitoring
    rules:
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Prometheus is down"
          description: "Prometheus monitoring service is not responding"

      - alert: GrafanaDown
        # Selects the docker-swarm-metrics target(s) whose instance ends in
        # port 3002.
        expr: up{job="docker-swarm-metrics", instance=~".*:3002"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Grafana is down"
          description: "Grafana dashboard service is not responding"

  - name: ssl_certificates
    rules:
      - alert: SSLCertificateExpiringSoon
        # Remaining lifetime (expiry minus current time) below 30 days,
        # expressed in seconds (86400 * 30).
        # NOTE(review): an already-expired certificate (negative value) also
        # satisfies this condition, so this warning fires together with
        # SSLCertificateExpired - confirm that is intended.
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ SSL certificate expiring soon"
          description: "SSL certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}"

      - alert: SSLCertificateExpired
        # Remaining lifetime is zero or negative.
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 SSL certificate expired"
          description: "SSL certificate for {{ $labels.instance }} has expired"