## Major Infrastructure Milestones Achieved ### ✅ Service Migrations Completed - Jellyfin: Successfully migrated to Docker Swarm with latest version - Vaultwarden: Running in Docker Swarm on OMV800 (eliminated duplicate) - Nextcloud: Operational with database optimization and cron setup - Paperless services: Both NGX and AI running successfully ### 🚨 Duplicate Service Analysis Complete - Identified MariaDB conflict (OMV800 Swarm vs lenovo410 standalone) - Identified Vaultwarden duplication (now resolved) - Documented PostgreSQL and Redis consolidation opportunities - Mapped monitoring stack optimization needs ### 🏗️ Infrastructure Status Documentation - Updated README with current cleanup phase status - Enhanced Service Analysis with duplicate service inventory - Updated Quick Start guide with immediate action items - Documented current container distribution across 6 nodes ### 📋 Action Plan Documentation - Phase 1: Immediate service conflict resolution (this week) - Phase 2: Service migration and load balancing (next 2 weeks) - Phase 3: Database consolidation and optimization (future) ### 🔧 Current Infrastructure Health - Docker Swarm: All 6 nodes operational and healthy - Caddy Reverse Proxy: Fully operational with SSL certificates - Storage: MergerFS healthy, local storage for databases - Monitoring: Prometheus + Grafana + Uptime Kuma operational ### 📊 Container Distribution Status - OMV800: 25+ containers (needs load balancing) - lenovo410: 9 containers (cleanup in progress) - fedora: 1 container (ready for additional services) - audrey: 4 containers (well-balanced, monitoring hub) - lenovo420: 7 containers (balanced, can assist) - surface: 9 containers (specialized, reverse proxy) ### 🎯 Next Steps 1. Remove lenovo410 MariaDB (eliminate port 3306 conflict) 2. Clean up lenovo410 Vaultwarden (256MB space savings) 3. Verify no service conflicts exist 4. 
Begin service migration from OMV800 to fedora/audrey. Status: Infrastructure 99% complete, entering cleanup and optimization phase.
148 lines
4.7 KiB
Bash
Executable File
148 lines
4.7 KiB
Bash
Executable File
#!/bin/bash
#
# Re-process All Documents Script
#
# Clears existing tags/titles and triggers Paperless AI to re-process all
# documents in a Paperless-ngx instance via its REST API.
#
# Requirements: curl, jq, and SSH access to the Paperless host.

# Strict mode: abort on errors (-e), unset variables (-u), and failures in
# any stage of a pipeline (pipefail) — the original only had -e.
set -euo pipefail

echo "🔄 Re-processing All Documents in Paperless-ngx"
echo "=============================================="
# ANSI color codes for terminal output. These are constants, so mark them
# readonly to catch accidental reassignment later in the script.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
# print_status MESSAGE — informational message to stdout (blue [INFO] prefix).
print_status() {
  echo -e "${BLUE}[INFO]${NC} $1"
}

# print_success MESSAGE — success message to stdout (green [SUCCESS] prefix).
print_success() {
  echo -e "${GREEN}[SUCCESS]${NC} $1"
}

# print_warning MESSAGE — warning to stderr (yellow [WARNING] prefix).
# Diagnostics go to stderr so piping stdout doesn't swallow them.
print_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}

# print_error MESSAGE — error to stderr (red [ERROR] prefix).
print_error() {
  echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Configuration — every value can be overridden via the environment; the
# defaults preserve the script's original behavior.
readonly PAPERLESS_HOST="${PAPERLESS_HOST:-192.168.50.229}"
readonly PAPERLESS_PORT="${PAPERLESS_PORT:-8000}"
# SECURITY: this API token is committed to source control. Rotate it and
# supply a fresh one via the PAPERLESS_API_TOKEN environment variable.
readonly API_TOKEN="${PAPERLESS_API_TOKEN:-e10c341c7c67b9bce7a968e1a3349963a70f800c}"
readonly API_BASE_URL="http://${PAPERLESS_HOST}:${PAPERLESS_PORT}/api"
echo ""
print_status "Step 1: Checking current document count..."

# Ask the API for the total document count. curl -f turns HTTP errors into
# non-zero exit codes; then validate we actually received a number before it
# is used in arithmetic by the batch loop below (the original used whatever
# jq returned, including "null" or empty, without checking).
if ! TOTAL_DOCS=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/" | jq -r '.count') \
  || ! [[ "$TOTAL_DOCS" =~ ^[0-9]+$ ]]; then
  print_error "Could not retrieve a document count from ${API_BASE_URL}/documents/"
  exit 1
fi
print_success "Found ${TOTAL_DOCS} documents to re-process"

echo ""
print_status "Step 2: Clearing existing tags and titles from all documents..."
#######################################
# Clear AI-generated metadata (tags, correspondent, document type) for one
# document so Paperless AI will re-analyze it. The current title is kept
# because the PATCH payload must include one.
# Globals:   API_TOKEN, API_BASE_URL (read)
# Arguments: $1 - numeric document id
# Outputs:   one progress line per document to stdout
# Returns:   always 0, so a single bad document does not abort the whole
#            run under 'set -e'
#######################################
clear_document_metadata() {
    local doc_id=$1
    local doc_data title update_payload

    # Declaration is separated from assignment so curl's exit status is not
    # masked by 'local' (which itself always succeeds).
    if ! doc_data=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/${doc_id}/"); then
        echo "  ❌ Failed to clear metadata for document ${doc_id}"
        return 0
    fi

    title=$(echo "$doc_data" | jq -r '.title // empty')

    # Build the PATCH payload with jq so quotes/backslashes in the title
    # cannot break the JSON (the old here-doc interpolated it raw).
    update_payload=$(jq -n --arg title "$title" \
        '{title: $title, tags: [], correspondent: null, document_type: null}')

    # curl -f returns non-zero on HTTP errors, so this branch reflects the
    # real API result. The original tested '$? -eq 0' right after a
    # 'local response=$(curl ...)' line, which always saw the status of
    # 'local' — i.e. it could never report a failure.
    if curl -sf -X PATCH \
        -H "Authorization: Token ${API_TOKEN}" \
        -H "Content-Type: application/json" \
        -d "$update_payload" \
        "${API_BASE_URL}/documents/${doc_id}/" > /dev/null; then
        echo "  ✅ Cleared metadata for document ${doc_id}"
    else
        echo "  ❌ Failed to clear metadata for document ${doc_id}"
    fi
    return 0
}
# Walk the paginated documents endpoint in batches and clear metadata for
# every document id returned.
BATCH_SIZE=50
PROCESSED=0

for ((i = 0; i < TOTAL_DOCS; i += BATCH_SIZE)); do
    print_status "Processing batch $((i/BATCH_SIZE + 1)) of $((TOTAL_DOCS/BATCH_SIZE + 1))..."

    # BUG FIX: the original declared these with 'local', which is illegal
    # outside a function — bash errors out and, under 'set -e', the script
    # died on the very first batch. Plain assignments are correct here.
    docs_response=$(curl -s -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/?page_size=${BATCH_SIZE}&page=$((i/BATCH_SIZE + 1))")
    doc_ids=$(echo "$docs_response" | jq -r '.results[].id')

    # Intentionally unquoted: $doc_ids is a newline-separated list of
    # numeric ids that we want word-split into individual loop items.
    for doc_id in $doc_ids; do
        if [[ -n "$doc_id" && "$doc_id" != "null" ]]; then
            clear_document_metadata "$doc_id"
            PROCESSED=$((PROCESSED + 1))
        fi
    done

    print_success "Processed ${PROCESSED}/${TOTAL_DOCS} documents"
done
echo ""
print_success "Step 2 Complete: Cleared tags and titles from all ${TOTAL_DOCS} documents"

echo ""
print_status "Step 3: Triggering Paperless AI to re-process all documents..."

# Bail out early when the paperless-ai container is not up on the remote
# host — restarting it is what actually triggers the re-processing.
ssh root@"${PAPERLESS_HOST}" "docker ps | grep -q paperless-ai" || {
    print_error "Paperless AI is not running!"
    exit 1
}

# A container restart makes Paperless AI pick up the now-cleared documents.
print_status "Restarting Paperless AI to trigger re-processing..."
ssh root@"${PAPERLESS_HOST}" "docker restart paperless-ai"

echo ""
print_success "Step 3 Complete: Paperless AI restarted and will begin re-processing"
echo ""
print_status "Step 4: Monitoring re-processing progress..."

# Monitor progress — the real work now happens inside the paperless-ai
# container, so print the commands the operator can use to watch it.
# NOTE(review): these instructions echo the API token to the terminal —
# confirm that is acceptable for this environment.
cat <<EOF
📊 Monitoring re-processing progress...
   You can check progress by:
   - Viewing Paperless AI logs: ssh root@${PAPERLESS_HOST} 'docker logs paperless-ai -f'
   - Checking document count with tags: curl -H 'Authorization: Token ${API_TOKEN}' '${API_BASE_URL}/documents/?tags__isnull=false' | jq '.count'

EOF

print_success "Re-processing initiated!"
cat <<EOF

🔧 Next steps:
1. Monitor progress: ssh root@${PAPERLESS_HOST} 'docker logs paperless-ai -f'
2. Check processed documents: curl -H 'Authorization: Token ${API_TOKEN}' '${API_BASE_URL}/documents/?tags__isnull=false' | jq '.count'
3. Wait for completion (this may take several hours depending on document count)

📚 The re-processing will:
   - Clear all existing incorrect tags and titles
   - Re-analyze all documents with Paperless AI
   - Apply correct tags and titles based on document content
   - Update the database so both services see the same information
EOF