Files
HomeAudit/scripts/test_reprocess.sh
admin 45363040f3 feat: Complete infrastructure cleanup phase documentation and status updates
## Major Infrastructure Milestones Achieved

### ✅ Service Migrations Completed
- Jellyfin: Successfully migrated to Docker Swarm with latest version
- Vaultwarden: Running in Docker Swarm on OMV800 (eliminated duplicate)
- Nextcloud: Operational with database optimization and cron setup
- Paperless services: Both NGX and AI running successfully

### 🚨 Duplicate Service Analysis Complete
- Identified MariaDB conflict (OMV800 Swarm vs lenovo410 standalone)
- Identified Vaultwarden duplication (now resolved)
- Documented PostgreSQL and Redis consolidation opportunities
- Mapped monitoring stack optimization needs

### 🏗️ Infrastructure Status Documentation
- Updated README with current cleanup phase status
- Enhanced Service Analysis with duplicate service inventory
- Updated Quick Start guide with immediate action items
- Documented current container distribution across 6 nodes

### 📋 Action Plan Documentation
- Phase 1: Immediate service conflict resolution (this week)
- Phase 2: Service migration and load balancing (next 2 weeks)
- Phase 3: Database consolidation and optimization (future)

### 🔧 Current Infrastructure Health
- Docker Swarm: All 6 nodes operational and healthy
- Caddy Reverse Proxy: Fully operational with SSL certificates
- Storage: MergerFS healthy, local storage for databases
- Monitoring: Prometheus + Grafana + Uptime Kuma operational

### 📊 Container Distribution Status
- OMV800: 25+ containers (needs load balancing)
- lenovo410: 9 containers (cleanup in progress)
- fedora: 1 container (ready for additional services)
- audrey: 4 containers (well-balanced, monitoring hub)
- lenovo420: 7 containers (balanced, can assist)
- surface: 9 containers (specialized, reverse proxy)

### 🎯 Next Steps
1. Remove lenovo410 MariaDB (eliminate port 3306 conflict)
2. Clean up lenovo410 Vaultwarden (256MB space savings)
3. Verify no service conflicts exist
4. Begin service migration from OMV800 to fedora/audrey

Status: Infrastructure 99% complete, entering cleanup and optimization phase
2025-09-01 16:50:37 -04:00

186 lines
5.4 KiB
Bash
Executable File

#!/bin/bash
# Test Re-processing Script
#
# Tests the metadata re-processing approach on a small sample of Paperless
# documents before the full re-processing pass is run.
#
# Requirements: curl, jq, and ssh access to the Paperless host.

# -u catches typo'd/unset variables; pipefail makes `curl | jq` pipelines
# fail loudly instead of silently feeding jq empty input (plain `set -e`
# only sees the last stage of a pipeline).
set -euo pipefail

echo "🧪 Testing Document Re-processing (Sample)"
echo "=========================================="

# ANSI color codes used by the print_* helpers below.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
# Logging helpers: colored, labeled progress output.
# Usage: print_status "message" (likewise for success/warning/error).

# Internal: emit "<color>[LABEL]<reset> message" (colors defined above).
_print_labeled() {
  echo -e "${1}[${2}]${NC} ${3}"
}

print_status()  { _print_labeled "${BLUE}" "INFO" "$1"; }
print_success() { _print_labeled "${GREEN}" "SUCCESS" "$1"; }
print_warning() { _print_labeled "${YELLOW}" "WARNING" "$1"; }
print_error()   { _print_labeled "${RED}" "ERROR" "$1"; }
# --- Configuration ---------------------------------------------------------
# SECURITY NOTE(review): the API token used to be hard-coded here. It can now
# be supplied via the PAPERLESS_API_TOKEN environment variable; the old value
# remains as the default for backward compatibility. Rotate this token and
# remove the default once all callers pass it via the environment.
PAPERLESS_HOST="${PAPERLESS_HOST:-192.168.50.229}"
PAPERLESS_PORT="${PAPERLESS_PORT:-8000}"
API_TOKEN="${PAPERLESS_API_TOKEN:-e10c341c7c67b9bce7a968e1a3349963a70f800c}"
API_BASE_URL="http://${PAPERLESS_HOST}:${PAPERLESS_PORT}/api"
TEST_COUNT=5

echo ""
print_status "Step 1: Testing API connectivity..."
# -f makes curl return non-zero on HTTP 4xx/5xx. The old code checked `$?`
# after the assignment, but plain `curl -s` exits 0 even on an auth failure,
# and under `set -e` a transport failure killed the script before the check —
# so the error branch could never fire. Putting the assignment in the `if`
# condition makes the failure path actually reachable.
if API_RESPONSE=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/"); then
    print_success "API connection successful"
else
    print_error "API connection failed"
    exit 1
fi

echo ""
print_status "Step 2: Getting total document count..."
# Query the documents endpoint and pull the total count out of the JSON.
TOTAL_DOCS=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/" | jq -r '.count')
print_success "Found ${TOTAL_DOCS} total documents"

echo ""
print_status "Step 3: Testing with first ${TEST_COUNT} documents..."
# Fetch the first TEST_COUNT documents and extract their IDs (one per line).
DOCS_RESPONSE=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/?page_size=${TEST_COUNT}")
DOC_IDS=$(echo "$DOCS_RESPONSE" | jq -r '.results[].id')
echo "Document IDs to test: $DOC_IDS"
# Function to clear tags and titles for a document
#######################################
# Clear the metadata (tags, correspondent, document type) on one Paperless
# document so it can be re-processed, keeping the title intact.
# Globals:   API_TOKEN, API_BASE_URL (read); print_* helpers (called)
# Arguments: $1 - numeric document ID
# Outputs:   progress and verification details via print_* to stdout
# Returns:   0 on success, 1 if the document could not be fetched or updated
#######################################
clear_document_metadata() {
    local doc_id=$1
    print_status "Processing document ${doc_id}..."

    # Fetch the current document. Declaration is separated from assignment so
    # curl's exit status is not masked by `local` (SC2155) — the old
    # `[[ $? -ne 0 ]]` check was always testing `local` and could never fire.
    # -f additionally turns HTTP 4xx/5xx into a non-zero exit.
    local doc_data
    if ! doc_data=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/${doc_id}/"); then
        print_error "Failed to get document ${doc_id}"
        return 1
    fi

    # Show the current metadata before clearing it.
    local title tags correspondent document_type
    title=$(echo "$doc_data" | jq -r '.title // empty')
    tags=$(echo "$doc_data" | jq -r '.tags // empty')
    correspondent=$(echo "$doc_data" | jq -r '.correspondent // empty')
    document_type=$(echo "$doc_data" | jq -r '.document_type // empty')
    print_status " Current title: ${title}"
    print_status " Current tags: ${tags}"
    print_status " Current correspondent: ${correspondent}"
    print_status " Current document_type: ${document_type}"

    # Build the PATCH payload with jq so a title containing quotes,
    # backslashes, or newlines still yields valid JSON. (The old heredoc
    # interpolated ${title} directly into the JSON and broke on such titles.)
    # Clears tags/correspondent/document_type but keeps the title.
    local update_payload
    update_payload=$(jq -n --arg title "$title" \
        '{title: $title, tags: [], correspondent: null, document_type: null}')

    print_status " Clearing metadata..."
    local response
    if response=$(curl -sf -X PATCH \
        -H "Authorization: Token ${API_TOKEN}" \
        -H "Content-Type: application/json" \
        -d "$update_payload" \
        "${API_BASE_URL}/documents/${doc_id}/"); then
        print_success " ✅ Successfully cleared metadata for document ${doc_id}"
        # Re-fetch the document to confirm the fields were actually cleared.
        local updated_data
        updated_data=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/${doc_id}/")
        print_status " Verification:"
        print_status " Tags: $(echo "$updated_data" | jq -r '.tags // empty')"
        print_status " Correspondent: $(echo "$updated_data" | jq -r '.correspondent // empty')"
        print_status " Document Type: $(echo "$updated_data" | jq -r '.document_type // empty')"
        return 0
    else
        print_error " ❌ Failed to clear metadata for document ${doc_id}"
        print_error " Response: ${response}"
        return 1
    fi
}
# Walk the sampled document IDs, clearing metadata on each and tallying
# successes and failures for the summary below.
SUCCESS_COUNT=0
FAILED_COUNT=0
for candidate_id in $DOC_IDS; do
    # Skip blank entries and literal "null" values jq may have emitted.
    if [[ -z "$candidate_id" || "$candidate_id" == "null" ]]; then
        continue
    fi
    if clear_document_metadata "$candidate_id"; then
        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    else
        FAILED_COUNT=$((FAILED_COUNT + 1))
    fi
    echo ""
done
# --- Step 4: report the tallies from the sample run ------------------------
echo ""
print_status "Step 4: Test Results Summary"
echo "=================================="
print_success "Successfully processed: ${SUCCESS_COUNT} documents"
if (( FAILED_COUNT > 0 )); then
    print_error "Failed to process: ${FAILED_COUNT} documents"
else
    print_success "All test documents processed successfully!"
fi

# --- Step 5: confirm the Paperless AI container is up on the host ----------
echo ""
print_status "Step 5: Testing Paperless AI connection..."
if ssh "root@${PAPERLESS_HOST}" "docker ps | grep -q paperless-ai"; then
    print_success "Paperless AI is running"
    # Surface the most recent log lines so obvious startup errors are visible.
    print_status "Checking Paperless AI logs..."
    ssh "root@${PAPERLESS_HOST}" "docker logs paperless-ai --tail 5"
else
    print_error "Paperless AI is not running!"
fi

echo ""
print_status "Test Complete!"
echo ""
# Only recommend the full run when every sampled document succeeded.
if (( SUCCESS_COUNT == TEST_COUNT )); then
    print_success "✅ All tests passed! The re-processing approach is working correctly."
    echo ""
    echo "🚀 You can now run the full re-processing script:"
    echo " ./scripts/reprocess_all_documents.sh"
else
    print_warning "⚠️ Some tests failed. Please review the errors before running the full script."
fi