## Major Infrastructure Milestones Achieved

### ✅ Service Migrations Completed
- Jellyfin: Successfully migrated to Docker Swarm with latest version
- Vaultwarden: Running in Docker Swarm on OMV800 (duplicate instance eliminated; deployment sketch below)
- Nextcloud: Operational with database optimization and cron setup
- Paperless services: Both NGX and AI running successfully

### 🚨 Duplicate Service Analysis Complete
- Identified MariaDB conflict (OMV800 Swarm vs lenovo410 standalone)
- Identified Vaultwarden duplication (now resolved)
- Documented PostgreSQL and Redis consolidation opportunities
- Mapped monitoring stack optimization needs

### 🏗️ Infrastructure Status Documentation
- Updated README with current cleanup phase status
- Enhanced Service Analysis with duplicate service inventory
- Updated Quick Start guide with immediate action items
- Documented current container distribution across 6 nodes

### 📋 Action Plan Documentation
- Phase 1: Immediate service conflict resolution (this week)
- Phase 2: Service migration and load balancing (next 2 weeks)
- Phase 3: Database consolidation and optimization (future)

### 🔧 Current Infrastructure Health
- Docker Swarm: All 6 nodes operational and healthy
- Caddy Reverse Proxy: Fully operational with SSL certificates
- Storage: MergerFS healthy, local storage for databases
- Monitoring: Prometheus + Grafana + Uptime Kuma operational

### 📊 Container Distribution Status
- OMV800: 25+ containers (needs load balancing)
- lenovo410: 9 containers (cleanup in progress)
- fedora: 1 container (ready for additional services)
- audrey: 4 containers (well-balanced, monitoring hub)
- lenovo420: 7 containers (balanced, can assist)
- surface: 9 containers (specialized, reverse proxy)

### 🎯 Next Steps
1. Remove lenovo410 MariaDB (eliminate port 3306 conflict)
2. Clean up lenovo410 Vaultwarden (256MB space savings)
3. Verify no service conflicts exist
4. Begin service migration from OMV800 to fedora/audrey

**Status:** Infrastructure 99% complete, entering cleanup and optimization phase
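For reference, the Vaultwarden deployment on the Swarm could look roughly like the minimal stack sketch below. The image tag, published port, data path, and network are illustrative assumptions, not the exact production config; only the OMV800 placement comes from the status above.

```yaml
# Hypothetical stack file: docker stack deploy -c vaultwarden-stack.yml vaultwarden
# Port, volume path, and image tag are assumptions for illustration.
version: "3.8"

services:
  vaultwarden:
    image: vaultwarden/server:latest
    ports:
      - "8080:80"                       # assumed published port
    volumes:
      - /srv/vaultwarden/data:/data     # assumed data path on the node
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.hostname == OMV800     # pin to the node named above
      restart_policy:
        condition: on-failure
```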
## Prometheus Alert Rules

The alert rule file (YAML) loaded by the monitoring stack (Prometheus + Grafana + Uptime Kuma):
```yaml
groups:
  - name: vaultwarden
    rules:
      - alert: VaultwardenDown
        expr: up{job="vaultwarden-monitoring"} == 0
        for: 1m
        labels:
          severity: critical
          service: vaultwarden
        annotations:
          summary: "🔴 Vaultwarden service is down"
          description: "Vaultwarden password manager service has been down for more than 1 minute"
          runbook_url: "https://grafana.pressmess.duckdns.org/d/vaultwarden"

      - alert: VaultwardenHighResponseTime
        expr: probe_duration_seconds{job="vaultwarden-monitoring"} > 5
        for: 2m
        labels:
          severity: warning
          service: vaultwarden
        annotations:
          summary: "⚠️ Vaultwarden response time is high"
          description: "Vaultwarden is responding slowly ({{ $value }}s)"
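  # `up` is Prometheus's built-in per-target health metric (1 = scrape succeeded,
  # 0 = scrape failed), so the rules below fire when a target stops answering.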
  - name: critical_services
    rules:
      - alert: CriticalServiceDown
        expr: up{job="http-service-health"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Critical service {{ $labels.instance }} is down"
          description: "Service {{ $labels.instance }} has been down for more than 2 minutes"

      - alert: DatabaseServiceDown
        expr: up{job="tcp-service-health"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Database service {{ $labels.instance }} is down"
          description: "Database service {{ $labels.instance }} is not responding"
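  # node_* metrics below are exported by node_exporter on each of the 6 nodes.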
  - name: system_resources
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: HighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ High disk usage on {{ $labels.instance }}"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }}"
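  # Self-monitoring: alert if the Prometheus/Grafana stack itself goes dark.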
  - name: monitoring
    rules:
      - alert: PrometheusDown
        expr: up{job="prometheus"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Prometheus is down"
          description: "Prometheus monitoring service is not responding"

      - alert: GrafanaDown
        expr: up{job="docker-swarm-metrics", instance=~".*:3002"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 Grafana is down"
          description: "Grafana dashboard service is not responding"
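  # probe_ssl_earliest_cert_expiry is exported by blackbox_exporter HTTPS probes
  # and holds the Unix timestamp of the earliest-expiring cert in the chain.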
  - name: ssl_certificates
    rules:
      - alert: SSLCertificateExpiringSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "⚠️ SSL certificate expiring soon"
          description: "SSL certificate for {{ $labels.instance }} expires in {{ $value | humanizeDuration }}"

      - alert: SSLCertificateExpired
        expr: probe_ssl_earliest_cert_expiry - time() <= 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔴 SSL certificate expired"
          description: "SSL certificate for {{ $labels.instance }} has expired"
```
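The `probe_*` metrics above come from blackbox_exporter, so the `vaultwarden-monitoring` job is presumably a blackbox probe rather than a direct scrape. A minimal sketch of what that scrape job could look like, assuming an exporter reachable at `blackbox-exporter:9115` and a hypothetical Vaultwarden URL on the same duckdns domain (both assumptions, not the actual config):

```yaml
scrape_configs:
  # Hypothetical blackbox probe job; exporter address and target URL are assumed.
  - job_name: "vaultwarden-monitoring"
    metrics_path: /probe
    params:
      module: [http_2xx]                 # blackbox module: expect an HTTP 2xx
    static_configs:
      - targets:
          - https://vault.pressmess.duckdns.org   # assumed Vaultwarden URL
    relabel_configs:
      - source_labels: [__address__]     # probed URL becomes the ?target= param
        target_label: __param_target
      - source_labels: [__param_target]  # keep the URL as the instance label
        target_label: instance
      - target_label: __address__        # actually scrape the exporter itself
        replacement: blackbox-exporter:9115
```

This is the standard blackbox_exporter relabeling pattern: the listed target is passed to the exporter as a query parameter, while the scrape itself goes to the exporter's address.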