Files
HomeAudit/scripts/reprocess_all_documents.sh
admin 45363040f3 feat: Complete infrastructure cleanup phase documentation and status updates
## Major Infrastructure Milestones Achieved

### Service Migrations Completed
- Jellyfin: Successfully migrated to Docker Swarm with latest version
- Vaultwarden: Running in Docker Swarm on OMV800 (eliminated duplicate)
- Nextcloud: Operational with database optimization and cron setup
- Paperless services: Both NGX and AI running successfully

### 🚨 Duplicate Service Analysis Complete
- Identified MariaDB conflict (OMV800 Swarm vs lenovo410 standalone)
- Identified Vaultwarden duplication (now resolved)
- Documented PostgreSQL and Redis consolidation opportunities
- Mapped monitoring stack optimization needs

### 🏗️ Infrastructure Status Documentation
- Updated README with current cleanup phase status
- Enhanced Service Analysis with duplicate service inventory
- Updated Quick Start guide with immediate action items
- Documented current container distribution across 6 nodes

### 📋 Action Plan Documentation
- Phase 1: Immediate service conflict resolution (this week)
- Phase 2: Service migration and load balancing (next 2 weeks)
- Phase 3: Database consolidation and optimization (future)

### 🔧 Current Infrastructure Health
- Docker Swarm: All 6 nodes operational and healthy
- Caddy Reverse Proxy: Fully operational with SSL certificates
- Storage: MergerFS healthy, local storage for databases
- Monitoring: Prometheus + Grafana + Uptime Kuma operational

### 📊 Container Distribution Status
- OMV800: 25+ containers (needs load balancing)
- lenovo410: 9 containers (cleanup in progress)
- fedora: 1 container (ready for additional services)
- audrey: 4 containers (well-balanced, monitoring hub)
- lenovo420: 7 containers (balanced, can assist)
- surface: 9 containers (specialized, reverse proxy)

### 🎯 Next Steps
1. Remove lenovo410 MariaDB (eliminate port 3306 conflict)
2. Clean up lenovo410 Vaultwarden (256MB space savings)
3. Verify no service conflicts exist
4. Begin service migration from OMV800 to fedora/audrey

Status: Infrastructure 99% complete, entering cleanup and optimization phase
2025-09-01 16:50:37 -04:00

148 lines
4.7 KiB
Bash
Executable File

#!/bin/bash
# Re-process All Documents Script
# This script clears existing tags/titles and triggers Paperless AI to re-process all documents
#
# Strict mode: exit on any unhandled error (-e), treat unset variables as
# errors (-u), and fail a pipeline when any stage fails (-o pipefail) so a
# failing `curl | jq` is caught instead of silently producing empty output.
set -euo pipefail
echo "🔄 Re-processing All Documents in Paperless-ngx"
echo "=============================================="
# ANSI color codes used by the print_* helpers below
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Print an informational message with a blue [INFO] prefix.
print_status() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success message with a green [SUCCESS] prefix.
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print a warning message with a yellow [WARNING] prefix.
print_warning() {
  printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# Print an error message with a red [ERROR] prefix to stderr, so failures
# remain visible even when stdout is redirected or piped.
print_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1" >&2
}
# Configuration — every value can be overridden from the environment.
PAPERLESS_HOST="${PAPERLESS_HOST:-192.168.50.229}"
PAPERLESS_PORT="${PAPERLESS_PORT:-8000}"
# SECURITY: avoid hardcoding credentials in the script — export
# PAPERLESS_API_TOKEN instead. The literal fallback is kept only for
# backward compatibility and should be rotated once callers set the env var.
API_TOKEN="${PAPERLESS_API_TOKEN:-e10c341c7c67b9bce7a968e1a3349963a70f800c}"
API_BASE_URL="http://${PAPERLESS_HOST}:${PAPERLESS_PORT}/api"
echo ""
print_status "Step 1: Checking current document count..."
# Ask the API for the total document count. Validate that the result is
# numeric before using it: if curl or jq fails, TOTAL_DOCS would otherwise
# be "null" or empty and break the batch arithmetic later in the script.
TOTAL_DOCS=$(curl -s -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/" | jq -r '.count')
if ! [[ "$TOTAL_DOCS" =~ ^[0-9]+$ ]]; then
  print_error "Could not retrieve document count from ${API_BASE_URL} (got: '${TOTAL_DOCS}')"
  exit 1
fi
print_success "Found ${TOTAL_DOCS} documents to re-process"
echo ""
print_status "Step 2: Clearing existing tags and titles from all documents..."
#######################################
# Clear AI-assigned metadata for a single Paperless-ngx document.
# Keeps the current title but empties tags and nulls the correspondent and
# document type, so Paperless AI can re-classify the document from scratch.
# Globals:   API_TOKEN, API_BASE_URL (read)
# Arguments: $1 - document id
# Outputs:   per-document progress line to stdout
#######################################
clear_document_metadata() {
  local doc_id=$1
  local doc_data title update_payload
  doc_data=$(curl -s -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/${doc_id}/")
  title=$(echo "$doc_data" | jq -r '.title // empty')
  # Build the PATCH payload with jq so the title is JSON-escaped correctly
  # (a title containing quotes or backslashes would break a hand-built doc).
  update_payload=$(jq -n --arg title "$title" \
    '{title: $title, tags: [], correspondent: null, document_type: null}')
  # Check curl's exit status directly; capturing it via `local x=$(...)`
  # would mask the status (the original `[[ $? -eq 0 ]]` always saw 0).
  if curl -s -X PATCH \
    -H "Authorization: Token ${API_TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$update_payload" \
    "${API_BASE_URL}/documents/${doc_id}/" > /dev/null; then
    echo " ✅ Cleared metadata for document ${doc_id}"
  else
    echo " ❌ Failed to clear metadata for document ${doc_id}"
  fi
}
# Process documents page by page via the paginated API.
# NOTE: `local` is only valid inside functions — the original used it at top
# level here, which errors out and (under set -e) aborts the whole script.
BATCH_SIZE=50
PROCESSED=0
# Ceiling division so the "of N" batch count is exact even when TOTAL_DOCS
# is a multiple of BATCH_SIZE.
TOTAL_BATCHES=$(( (TOTAL_DOCS + BATCH_SIZE - 1) / BATCH_SIZE ))
for ((i=0; i<TOTAL_DOCS; i+=BATCH_SIZE)); do
  print_status "Processing batch $((i/BATCH_SIZE + 1)) of ${TOTAL_BATCHES}..."
  # Fetch the ids for the current page. Clearing metadata does not remove
  # documents from the listing, so pagination stays stable across batches
  # (assumes the API's default ordering is unaffected by the PATCHes —
  # TODO confirm against the Paperless-ngx API ordering).
  docs_response=$(curl -s -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/?page_size=${BATCH_SIZE}&page=$((i/BATCH_SIZE + 1))")
  doc_ids=$(echo "$docs_response" | jq -r '.results[].id')
  for doc_id in $doc_ids; do
    if [[ -n "$doc_id" && "$doc_id" != "null" ]]; then
      clear_document_metadata "$doc_id"
      PROCESSED=$((PROCESSED + 1))
    fi
  done
  print_success "Processed ${PROCESSED}/${TOTAL_DOCS} documents"
done
echo ""
print_success "Step 2 Complete: Cleared tags and titles from all ${TOTAL_DOCS} documents"
echo ""
print_status "Step 3: Triggering Paperless AI to re-process all documents..."
# Verify the Paperless AI container is up on the remote host, then restart
# it so it picks up the now-cleared documents for re-processing.
if ssh "root@${PAPERLESS_HOST}" "docker ps | grep -q paperless-ai"; then
  print_status "Restarting Paperless AI to trigger re-processing..."
  ssh "root@${PAPERLESS_HOST}" "docker restart paperless-ai"
else
  print_error "Paperless AI is not running!"
  exit 1
fi
echo ""
print_success "Step 3 Complete: Paperless AI restarted and will begin re-processing"
printf '\n'
print_status "Step 4: Monitoring re-processing progress..."
# From here on the script is purely informational: it prints the commands
# an operator can run to watch Paperless AI work through the backlog.
printf '%s\n' "📊 Monitoring re-processing progress..."
printf '%s\n' " You can check progress by:"
printf '%s\n' " - Viewing Paperless AI logs: ssh root@${PAPERLESS_HOST} 'docker logs paperless-ai -f'"
printf '%s\n' " - Checking document count with tags: curl -H 'Authorization: Token ${API_TOKEN}' '${API_BASE_URL}/documents/?tags__isnull=false' | jq '.count'"
printf '\n'
print_success "Re-processing initiated!"
printf '\n'
printf '%s\n' "🔧 Next steps:"
printf '%s\n' "1. Monitor progress: ssh root@${PAPERLESS_HOST} 'docker logs paperless-ai -f'"
printf '%s\n' "2. Check processed documents: curl -H 'Authorization: Token ${API_TOKEN}' '${API_BASE_URL}/documents/?tags__isnull=false' | jq '.count'"
printf '%s\n' "3. Wait for completion (this may take several hours depending on document count)"
printf '\n'
printf '%s\n' "📚 The re-processing will:"
printf '%s\n' " - Clear all existing incorrect tags and titles"
printf '%s\n' " - Re-analyze all documents with Paperless AI"
printf '%s\n' " - Apply correct tags and titles based on document content"
printf '%s\n' " - Update the database so both services see the same information"