Files
HomeAudit/scripts/reprocess_all_documents.sh
admin 45363040f3 feat: Complete infrastructure cleanup phase documentation and status updates
## Major Infrastructure Milestones Achieved

### Service Migrations Completed
- Jellyfin: Successfully migrated to Docker Swarm with latest version
- Vaultwarden: Running in Docker Swarm on OMV800 (eliminated duplicate)
- Nextcloud: Operational with database optimization and cron setup
- Paperless services: Both NGX and AI running successfully

### 🚨 Duplicate Service Analysis Complete
- Identified MariaDB conflict (OMV800 Swarm vs lenovo410 standalone)
- Identified Vaultwarden duplication (now resolved)
- Documented PostgreSQL and Redis consolidation opportunities
- Mapped monitoring stack optimization needs

### 🏗️ Infrastructure Status Documentation
- Updated README with current cleanup phase status
- Enhanced Service Analysis with duplicate service inventory
- Updated Quick Start guide with immediate action items
- Documented current container distribution across 6 nodes

### 📋 Action Plan Documentation
- Phase 1: Immediate service conflict resolution (this week)
- Phase 2: Service migration and load balancing (next 2 weeks)
- Phase 3: Database consolidation and optimization (future)

### 🔧 Current Infrastructure Health
- Docker Swarm: All 6 nodes operational and healthy
- Caddy Reverse Proxy: Fully operational with SSL certificates
- Storage: MergerFS healthy, local storage for databases
- Monitoring: Prometheus + Grafana + Uptime Kuma operational

### 📊 Container Distribution Status
- OMV800: 25+ containers (needs load balancing)
- lenovo410: 9 containers (cleanup in progress)
- fedora: 1 container (ready for additional services)
- audrey: 4 containers (well-balanced, monitoring hub)
- lenovo420: 7 containers (balanced, can assist)
- surface: 9 containers (specialized, reverse proxy)

### 🎯 Next Steps
1. Remove lenovo410 MariaDB (eliminate port 3306 conflict)
2. Clean up lenovo410 Vaultwarden (256MB space savings)
3. Verify no service conflicts exist
4. Begin service migration from OMV800 to fedora/audrey

Status: Infrastructure 99% complete, entering cleanup and optimization phase
2025-09-01 16:50:37 -04:00

148 lines
4.7 KiB
Bash
Executable File

#!/bin/bash
# Re-process All Documents Script
# This script clears existing tags/titles and triggers Paperless AI to re-process all documents
#
# Strict mode: exit on any unhandled error (-e), treat unset variables as
# errors (-u), and fail a pipeline when any stage fails (-o pipefail) so a
# failing `curl | jq` is caught instead of silently producing empty output.
set -euo pipefail
echo "🔄 Re-processing All Documents in Paperless-ngx"
echo "=============================================="
# ANSI color codes used by the print_* helpers below
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Print an informational message with a blue [INFO] prefix.
print_status() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success message with a green [SUCCESS] prefix.
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print a warning message with a yellow [WARNING] prefix.
print_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# Print an error message with a red [ERROR] prefix to stderr, so failures
# remain visible even when stdout is redirected or piped.
print_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
# Configuration — every value can be overridden from the environment.
PAPERLESS_HOST="${PAPERLESS_HOST:-192.168.50.229}"
PAPERLESS_PORT="${PAPERLESS_PORT:-8000}"
# SECURITY: avoid hardcoding credentials in the script — export
# PAPERLESS_API_TOKEN instead. The literal fallback is kept only for
# backward compatibility and should be rotated once callers set the env var.
API_TOKEN="${PAPERLESS_API_TOKEN:-e10c341c7c67b9bce7a968e1a3349963a70f800c}"
API_BASE_URL="http://${PAPERLESS_HOST}:${PAPERLESS_PORT}/api"
echo ""
print_status "Step 1: Checking current document count..."
# Ask the API for the total document count. Validate that the result is
# numeric before using it: if curl or jq fails, TOTAL_DOCS would otherwise
# be "null" or empty and break the batch arithmetic later in the script.
TOTAL_DOCS=$(curl -s -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/" | jq -r '.count')
if ! [[ "$TOTAL_DOCS" =~ ^[0-9]+$ ]]; then
  print_error "Could not retrieve document count from ${API_BASE_URL} (got: '${TOTAL_DOCS}')"
  exit 1
fi
print_success "Found ${TOTAL_DOCS} documents to re-process"
echo ""
print_status "Step 2: Clearing existing tags and titles from all documents..."
#######################################
# Clear AI-assigned metadata for a single Paperless-ngx document.
# Keeps the current title but empties tags and nulls the correspondent and
# document type, so Paperless AI can re-classify the document from scratch.
# Globals:   API_TOKEN, API_BASE_URL (read)
# Arguments: $1 - document id
# Outputs:   per-document progress line to stdout
#######################################
clear_document_metadata() {
  local doc_id=$1
  local doc_data title update_payload
  doc_data=$(curl -s -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/${doc_id}/")
  title=$(echo "$doc_data" | jq -r '.title // empty')
  # Build the PATCH payload with jq so the title is JSON-escaped correctly
  # (a title containing quotes or backslashes would break a hand-built doc).
  update_payload=$(jq -n --arg title "$title" \
    '{title: $title, tags: [], correspondent: null, document_type: null}')
  # Check curl's exit status directly; capturing it via `local x=$(...)`
  # would mask the status (the original `[[ $? -eq 0 ]]` always saw 0).
  if curl -s -X PATCH \
    -H "Authorization: Token ${API_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$update_payload" \
    "${API_BASE_URL}/documents/${doc_id}/" > /dev/null; then
    echo " ✅ Cleared metadata for document ${doc_id}"
  else
    echo " ❌ Failed to clear metadata for document ${doc_id}"
  fi
}
# Process documents page by page via the paginated API.
# NOTE: `local` is only valid inside functions — the original used it at top
# level here, which errors out and (under set -e) aborts the whole script.
BATCH_SIZE=50
PROCESSED=0
# Ceiling division so the "of N" batch count is exact even when TOTAL_DOCS
# is a multiple of BATCH_SIZE.
TOTAL_BATCHES=$(( (TOTAL_DOCS + BATCH_SIZE - 1) / BATCH_SIZE ))
for ((i=0; i<TOTAL_DOCS; i+=BATCH_SIZE)); do
  print_status "Processing batch $((i/BATCH_SIZE + 1)) of ${TOTAL_BATCHES}..."
  # Fetch the ids for the current page. Clearing metadata does not remove
  # documents from the listing, so pagination stays stable across batches
  # (assumes the API's default ordering is unaffected by the PATCHes —
  # TODO confirm against the Paperless-ngx API ordering).
  docs_response=$(curl -s -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/?page_size=${BATCH_SIZE}&page=$((i/BATCH_SIZE + 1))")
  doc_ids=$(echo "$docs_response" | jq -r '.results[].id')
  for doc_id in $doc_ids; do
    if [[ -n "$doc_id" && "$doc_id" != "null" ]]; then
      clear_document_metadata "$doc_id"
      PROCESSED=$((PROCESSED + 1))
    fi
  done
  print_success "Processed ${PROCESSED}/${TOTAL_DOCS} documents"
done
echo ""
print_success "Step 2 Complete: Cleared tags and titles from all ${TOTAL_DOCS} documents"
echo ""
print_status "Step 3: Triggering Paperless AI to re-process all documents..."
# Verify the Paperless AI container is up on the remote host, then restart
# it so it picks up the now-cleared documents for re-processing.
if ssh "root@${PAPERLESS_HOST}" "docker ps | grep -q paperless-ai"; then
  print_status "Restarting Paperless AI to trigger re-processing..."
  ssh "root@${PAPERLESS_HOST}" "docker restart paperless-ai"
else
  print_error "Paperless AI is not running!"
  exit 1
fi
echo ""
print_success "Step 3 Complete: Paperless AI restarted and will begin re-processing"
printf '\n'
print_status "Step 4: Monitoring re-processing progress..."
# From here on the script is purely informational: it prints the commands
# an operator can run to watch Paperless AI work through the backlog.
printf '%s\n' "📊 Monitoring re-processing progress..."
printf '%s\n' " You can check progress by:"
printf '%s\n' " - Viewing Paperless AI logs: ssh root@${PAPERLESS_HOST} 'docker logs paperless-ai -f'"
printf '%s\n' " - Checking document count with tags: curl -H 'Authorization: Token ${API_TOKEN}' '${API_BASE_URL}/documents/?tags__isnull=false' | jq '.count'"
printf '\n'
print_success "Re-processing initiated!"
printf '\n'
printf '%s\n' "🔧 Next steps:"
printf '%s\n' "1. Monitor progress: ssh root@${PAPERLESS_HOST} 'docker logs paperless-ai -f'"
printf '%s\n' "2. Check processed documents: curl -H 'Authorization: Token ${API_TOKEN}' '${API_BASE_URL}/documents/?tags__isnull=false' | jq '.count'"
printf '%s\n' "3. Wait for completion (this may take several hours depending on document count)"
printf '\n'
printf '%s\n' "📚 The re-processing will:"
printf '%s\n' " - Clear all existing incorrect tags and titles"
printf '%s\n' " - Re-analyze all documents with Paperless AI"
printf '%s\n' " - Apply correct tags and titles based on document content"
printf '%s\n' " - Update the database so both services see the same information"