Major infrastructure migration and Vaultwarden PostgreSQL troubleshooting

COMPREHENSIVE CHANGES:

INFRASTRUCTURE MIGRATION:
- Migrated services to Docker Swarm on OMV800 (192.168.50.229)
- Deployed PostgreSQL database for Vaultwarden migration
- Updated all stack configurations for Docker Swarm compatibility
- Added comprehensive monitoring stack (Prometheus, Grafana, Blackbox)
- Implemented proper secret management for all services

VAULTWARDEN POSTGRESQL MIGRATION:
- Attempted migration from SQLite to PostgreSQL for NFS compatibility
- Created PostgreSQL stack with proper user/password configuration
- Built custom Vaultwarden image with PostgreSQL support
- Troubleshot persistent SQLite fallback issue despite PostgreSQL config
- Identified known issue where Vaultwarden silently falls back to SQLite
- Added ENABLE_DB_WAL=false to prevent filesystem compatibility issues
- Current status: Old Vaultwarden on lenovo410 still working, new one has config issues

PAPERLESS SERVICES:
- Successfully deployed Paperless-NGX and Paperless-AI on OMV800
- Both services running on ports 8000 and 3000 respectively
- Caddy configuration updated for external access
- Services accessible via paperless.pressmess.duckdns.org and paperless-ai.pressmess.duckdns.org

CADDY CONFIGURATION:
- Updated Caddyfile on Surface (192.168.50.254) for new service locations
- Fixed Vaultwarden reverse proxy to point to new Docker Swarm service
- Removed old notification hub reference that was causing conflicts
- All services properly configured for external access via DuckDNS

BACKUP AND DISCOVERY:
- Created comprehensive backup system for all hosts
- Generated detailed discovery reports for infrastructure analysis
- Implemented automated backup validation scripts
- Created migration progress tracking and verification reports

MONITORING STACK:
- Deployed Prometheus, Grafana, and Blackbox monitoring
- Created infrastructure and system overview dashboards
- Added proper service discovery and alerting configuration
- Implemented performance monitoring for all critical services

DOCUMENTATION:
- Reorganized documentation into logical structure
- Created comprehensive migration playbook and troubleshooting guides
- Added hardware specifications and optimization recommendations
- Documented all configuration changes and service dependencies

CURRENT STATUS:
- Paperless services: Working and accessible externally
- Vaultwarden: PostgreSQL configuration issues, old instance still working
- Monitoring: Deployed and operational
- Caddy: Updated and working for external access
- PostgreSQL: Database running, connection issues with Vaultwarden

NEXT STEPS:
- Continue troubleshooting Vaultwarden PostgreSQL configuration
- Consider alternative approaches for Vaultwarden migration
- Validate all external service access
- Complete final migration validation

TECHNICAL NOTES:
- Used Docker Swarm for orchestration on OMV800
- Implemented proper secret management for sensitive data
- Added comprehensive logging and monitoring
- Created automated backup and validation scripts
This commit is contained in:
admin
2025-08-30 20:18:44 -04:00
parent 9ea31368f5
commit 705a2757c1
155 changed files with 16781 additions and 1243 deletions

View File

@@ -0,0 +1,31 @@
# Blackbox exporter probe modules.
# A module is chosen per probe by name (the Prometheus jobs pass it as the
# `module` parameter of the /probe request).
modules:
  # HTTP GET probe over IPv4; passes on any of the listed 2xx status codes.
  http_2xx:
    timeout: 5s
    prober: http
    http:
      valid_status_codes: [200, 201, 202, 203, 204, 205, 206, 207, 208, 226]
      preferred_ip_protocol: 'ip4'
      # TLS is neither required nor forbidden for the probe to pass.
      fail_if_not_ssl: false
      fail_if_ssl: false

  # Same checks as http_2xx, but the request is sent as a POST.
  http_post_2xx:
    timeout: 5s
    prober: http
    http:
      method: POST
      valid_status_codes: [200, 201, 202, 203, 204, 205, 206, 207, 208, 226]
      preferred_ip_protocol: 'ip4'
      fail_if_not_ssl: false
      fail_if_ssl: false

  # Plain TCP connection check over IPv4.
  tcp_connect:
    timeout: 5s
    prober: tcp
    tcp:
      preferred_ip_protocol: 'ip4'

  # ICMP echo check over IPv4.
  icmp:
    timeout: 5s
    prober: icmp
    icmp:
      preferred_ip_protocol: 'ip4'

View File

@@ -0,0 +1,245 @@
{
"title": "Infrastructure Overview",
"tags": ["infrastructure", "overview"],
"style": "dark",
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "HTTP Service Health Status",
"type": "stat",
"targets": [
{
"expr": "probe_success{job=\"http-service-health\"}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "green", "value": 1}
]
},
"mappings": [
{
"options": {
"0": {"text": "Down", "color": "red"},
"1": {"text": "Up", "color": "green"}
},
"type": "value"
}
],
"unit": "short"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
}
},
{
"id": 2,
"title": "TCP Service Health Status",
"type": "stat",
"targets": [
{
"expr": "probe_success{job=\"tcp-service-health\"}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "green", "value": 1}
]
},
"mappings": [
{
"options": {
"0": {"text": "Down", "color": "red"},
"1": {"text": "Up", "color": "green"}
},
"type": "value"
}
],
"unit": "short"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
}
},
{
"id": 3,
"title": "Service Response Time",
"type": "timeseries",
"targets": [
{
"expr": "probe_duration_seconds{job=\"http-service-health\"}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"unit": "s"
}
},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
},
{
"id": 4,
"title": "HTTP Service Availability Summary",
"type": "stat",
"targets": [
{
"expr": "sum(probe_success{job=\"http-service-health\"})",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 3},
{"color": "green", "value": 6}
]
},
"unit": "short"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
}
},
{
  "id": 5,
  "title": "Service Details",
  "description": "Current up/down state of every endpoint probed by the http-service-health and tcp-service-health jobs, one row per probed instance. The query is an instant, table-formatted query so that labels such as instance become columns the organize transformation can rename or hide.",
  "type": "table",
  "targets": [
    {
      "expr": "probe_success{job=~\"http-service-health|tcp-service-health\"}",
      "refId": "A",
      "format": "table",
      "instant": true
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "thresholds": {
        "steps": [
          {"color": "red", "value": 0},
          {"color": "green", "value": 1}
        ]
      },
      "mappings": [
        {
          "options": {
            "0": {"text": "Offline", "color": "red"},
            "1": {"text": "Online", "color": "green"}
          },
          "type": "value"
        }
      ]
    }
  },
  "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
  "transformations": [
    {
      "id": "organize",
      "options": {
        "excludeByName": {
          "Time": true,
          "__name__": true,
          "job": true
        },
        "indexByName": {},
        "renameByName": {
          "Value": "Status",
          "instance": "Service"
        }
      }
    }
  ]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}

View File

@@ -0,0 +1,316 @@
{
"title": "System Overview",
"tags": ["system", "infrastructure", "overview"],
"style": "dark",
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "CPU Usage",
"type": "timeseries",
"targets": [
{
"expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"refId": "A",
"legendFormat": "CPU %"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"unit": "percent",
"min": 0,
"max": 100
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
},
{
"id": 2,
"title": "Memory Usage",
"type": "timeseries",
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"refId": "A",
"legendFormat": "Memory %"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"unit": "percent",
"min": 0,
"max": 100
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
},
{
"id": 3,
"title": "Disk Usage",
"type": "timeseries",
"targets": [
{
"expr": "(1 - (node_filesystem_avail_bytes{mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!=\"rootfs\"})) * 100",
"refId": "A",
"legendFormat": "Disk %"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"unit": "percent",
"min": 0,
"max": 100
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 4,
"title": "Network I/O",
"type": "timeseries",
"targets": [
{
"expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m])",
"refId": "A",
"legendFormat": "{{device}} - RX"
},
{
"expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m])",
"refId": "B",
"legendFormat": "{{device}} - TX"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"unit": "Bps"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
},
{
"id": 5,
"title": "System Load",
"type": "timeseries",
"targets": [
{
"expr": "node_load1",
"refId": "A",
"legendFormat": "1m Load"
},
{
"expr": "node_load5",
"refId": "B",
"legendFormat": "5m Load"
},
{
"expr": "node_load15",
"refId": "C",
"legendFormat": "15m Load"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"vis": false
},
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
}
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
},
{
  "id": 6,
  "title": "System Info",
  "description": "Host identity taken from the labels of node_uname_info. The metric value itself is always 1, so the panel displays the series name built from the labels instead of the value.",
  "type": "stat",
  "targets": [
    {
      "expr": "node_uname_info",
      "refId": "A",
      "legendFormat": "{{nodename}} - {{sysname}} {{release}} ({{machine}})"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "thresholds": {
        "steps": [
          {"color": "green", "value": 0}
        ]
      },
      "mappings": [],
      "unit": "short"
    }
  },
  "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "textMode": "name"
  }
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}

View File

@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: a single file-based provider.
apiVersion: 1

providers:
  - name: "default"
    type: file
    orgId: 1
    # Empty folder name: dashboards are not placed in a named folder.
    folder: ""
    allowUiUpdates: true
    disableDeletion: false
    # Interval, in seconds, between checks of the path below for changes.
    updateIntervalSeconds: 10
    options:
      path: /etc/grafana/provisioning/dashboards

View File

@@ -0,0 +1,14 @@
# Grafana datasource provisioning: Prometheus as the default datasource.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://192.168.50.229:9091
    access: proxy
    isDefault: true
    editable: true
    secureJsonData: {}
    jsonData:
      httpMethod: 'POST'
      queryTimeout: '60s'
      # Same value as the global scrape_interval in the Prometheus config.
      timeInterval: '15s'

View File

@@ -0,0 +1,70 @@
# Prometheus configuration: direct metric scrapes plus HTTP/TCP health
# probes that are executed by the blackbox exporter at 192.168.50.229:9115.
global:
  evaluation_interval: 15s
  scrape_interval: 15s

scrape_configs:
  # Prometheus itself
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Blackbox exporter
  - job_name: "blackbox"
    static_configs:
      - targets:
          - "192.168.50.229:9115"

  # Node exporter - system metrics
  - job_name: "node-exporter"
    scrape_interval: 30s
    static_configs:
      - targets:
          - "192.168.50.229:9100"

  # Docker Swarm services that expose metrics
  - job_name: "docker-swarm-metrics"
    scrape_interval: 30s
    static_configs:
      - targets:
          - "192.168.50.229:9091"  # Prometheus
          - "192.168.50.229:3002"  # Grafana

  # HTTP service health checks via blackbox exporter
  - job_name: "http-service-health"
    scrape_interval: 60s
    metrics_path: /probe
    params:
      module:
        - http_2xx
    static_configs:
      - targets:
          - "http://192.168.50.229:8000"  # Paperless-NGX
          - "http://192.168.50.229:3000"  # Paperless-AI
          - "http://192.168.50.229:8081"  # Nextcloud
          - "http://192.168.50.181:8123"  # Home Assistant
          - "http://192.168.50.181:9000"  # Portainer
          - "http://192.168.50.66:9080"  # AppFlowy
    relabel_configs:
      # Pass the listed target to the exporter as the ?target= parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed URL as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to the blackbox exporter.
      - target_label: __address__
        replacement: "192.168.50.229:9115"

  # TCP service health checks via blackbox exporter
  - job_name: "tcp-service-health"
    scrape_interval: 60s
    metrics_path: /probe
    params:
      module:
        - tcp_connect
    static_configs:
      - targets:
          - "192.168.50.229:6379"  # Redis
          - "192.168.50.229:5432"  # PostgreSQL
          - "192.168.50.229:3306"  # MariaDB
          - "192.168.50.229:1883"  # Mosquitto
    relabel_configs:
      # Pass the listed target to the exporter as the ?target= parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed host:port as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to the blackbox exporter.
      - target_label: __address__
        replacement: "192.168.50.229:9115"

View File

@@ -0,0 +1,47 @@
# Simplified Prometheus configuration.
#
# Apart from Prometheus itself, none of the endpoints below serve a
# Prometheus /metrics page (Redis, PostgreSQL, MariaDB, MQTT, SSH and the
# application web ports), so scraping them directly can never succeed.
# They are therefore checked through the blackbox exporter at
# 192.168.50.229:9115: TCP reachability with the tcp_connect module and
# HTTP health with the http_2xx module. Job names and target sets are
# unchanged.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Docker Swarm services (basic) - TCP reachability
  - job_name: 'docker-swarm-services'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          # Prometheus and Grafana use the published ports 9091 and 3002,
          # matching the Grafana datasource and the full Prometheus config;
          # port 3000 on this host is Paperless-AI.
          - '192.168.50.229:9091'  # Prometheus
          - '192.168.50.229:3002'  # Grafana
          - '192.168.50.229:6379'  # Redis
          - '192.168.50.229:5432'  # PostgreSQL
          - '192.168.50.229:3306'  # MariaDB
          - '192.168.50.229:1883'  # Mosquitto
    relabel_configs:
      # Hand the listed target to the exporter as ?target=, keep it as the
      # instance label, and send the scrape to the blackbox exporter.
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.50.229:9115
    scrape_interval: 30s

  # Infrastructure nodes - SSH port reachability
  - job_name: 'infrastructure-nodes'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets:
          - '192.168.50.229:22'  # OMV800
          - '192.168.50.254:22'  # Surface
          - '192.168.50.181:22'  # jonathan-2518f5u
          - '192.168.50.66:22'  # lenovo420
          - '192.168.50.145:22'  # audrey
          - '192.168.50.225:22'  # fedora
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.50.229:9115
    scrape_interval: 60s

  # Application services - HTTP health
  - job_name: 'application-services'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - 'http://192.168.50.229:8081'  # Nextcloud
          - 'http://192.168.50.229:8000'  # Paperless-NGX
          - 'http://192.168.50.229:3000'  # Paperless-AI
          - 'http://192.168.50.181:8123'  # Home Assistant
          - 'http://192.168.50.181:9000'  # Portainer
          - 'http://192.168.50.181:5678'  # n8n
          - 'http://192.168.50.66:9080'  # AppFlowy
          - 'http://192.168.50.254:80'  # Caddy
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 192.168.50.229:9115
    scrape_interval: 30s