Files
HomeAudit/scripts/test_reprocess.sh
admin 45363040f3 feat: Complete infrastructure cleanup phase documentation and status updates
## Major Infrastructure Milestones Achieved

### ✅ Service Migrations Completed
- Jellyfin: Successfully migrated to Docker Swarm with latest version
- Vaultwarden: Running in Docker Swarm on OMV800 (eliminated duplicate)
- Nextcloud: Operational with database optimization and cron setup
- Paperless services: Both NGX and AI running successfully

### 🚨 Duplicate Service Analysis Complete
- Identified MariaDB conflict (OMV800 Swarm vs lenovo410 standalone)
- Identified Vaultwarden duplication (now resolved)
- Documented PostgreSQL and Redis consolidation opportunities
- Mapped monitoring stack optimization needs

### 🏗️ Infrastructure Status Documentation
- Updated README with current cleanup phase status
- Enhanced Service Analysis with duplicate service inventory
- Updated Quick Start guide with immediate action items
- Documented current container distribution across 6 nodes

### 📋 Action Plan Documentation
- Phase 1: Immediate service conflict resolution (this week)
- Phase 2: Service migration and load balancing (next 2 weeks)
- Phase 3: Database consolidation and optimization (future)

### 🔧 Current Infrastructure Health
- Docker Swarm: All 6 nodes operational and healthy
- Caddy Reverse Proxy: Fully operational with SSL certificates
- Storage: MergerFS healthy, local storage for databases
- Monitoring: Prometheus + Grafana + Uptime Kuma operational

### 📊 Container Distribution Status
- OMV800: 25+ containers (needs load balancing)
- lenovo410: 9 containers (cleanup in progress)
- fedora: 1 container (ready for additional services)
- audrey: 4 containers (well-balanced, monitoring hub)
- lenovo420: 7 containers (balanced, can assist)
- surface: 9 containers (specialized, reverse proxy)

### 🎯 Next Steps
1. Remove lenovo410 MariaDB (eliminate port 3306 conflict)
2. Clean up lenovo410 Vaultwarden (256MB space savings)
3. Verify no service conflicts exist
4. Begin service migration from OMV800 to fedora/audrey

Status: Infrastructure 99% complete, entering cleanup and optimization phase
2025-09-01 16:50:37 -04:00

186 lines
5.4 KiB
Bash
Executable File

#!/bin/bash
# Test Re-processing Script
#
# Tests the metadata re-processing approach on a small sample of Paperless
# documents before the full re-processing pass is run.
#
# Requirements: curl, jq, and ssh access to the Paperless host.

# -u catches typo'd/unset variables; pipefail makes `curl | jq` pipelines
# fail loudly instead of silently feeding jq empty input (plain `set -e`
# only sees the last stage of a pipeline).
set -euo pipefail

echo "🧪 Testing Document Re-processing (Sample)"
echo "=========================================="

# ANSI color codes used by the print_* helpers below.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
# Logging helpers: colored, labeled progress output.
# Usage: print_status "message" (likewise for success/warning/error).

# Internal: emit "<color>[LABEL]<reset> message" (colors defined above).
_print_labeled() {
  echo -e "${1}[${2}]${NC} ${3}"
}

print_status()  { _print_labeled "${BLUE}" "INFO" "$1"; }
print_success() { _print_labeled "${GREEN}" "SUCCESS" "$1"; }
print_warning() { _print_labeled "${YELLOW}" "WARNING" "$1"; }
print_error()   { _print_labeled "${RED}" "ERROR" "$1"; }
# --- Configuration ---------------------------------------------------------
# SECURITY NOTE(review): the API token used to be hard-coded here. It can now
# be supplied via the PAPERLESS_API_TOKEN environment variable; the old value
# remains as the default for backward compatibility. Rotate this token and
# remove the default once all callers pass it via the environment.
PAPERLESS_HOST="${PAPERLESS_HOST:-192.168.50.229}"
PAPERLESS_PORT="${PAPERLESS_PORT:-8000}"
API_TOKEN="${PAPERLESS_API_TOKEN:-e10c341c7c67b9bce7a968e1a3349963a70f800c}"
API_BASE_URL="http://${PAPERLESS_HOST}:${PAPERLESS_PORT}/api"
TEST_COUNT=5

echo ""
print_status "Step 1: Testing API connectivity..."
# -f makes curl return non-zero on HTTP 4xx/5xx. The old code checked `$?`
# after the assignment, but plain `curl -s` exits 0 even on an auth failure,
# and under `set -e` a transport failure killed the script before the check —
# so the error branch could never fire. Putting the assignment in the `if`
# condition makes the failure path actually reachable.
if API_RESPONSE=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/"); then
    print_success "API connection successful"
else
    print_error "API connection failed"
    exit 1
fi

echo ""
print_status "Step 2: Getting total document count..."
# Query the documents endpoint and pull the total count out of the JSON.
TOTAL_DOCS=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/" | jq -r '.count')
print_success "Found ${TOTAL_DOCS} total documents"

echo ""
print_status "Step 3: Testing with first ${TEST_COUNT} documents..."
# Fetch the first TEST_COUNT documents and extract their IDs (one per line).
DOCS_RESPONSE=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/?page_size=${TEST_COUNT}")
DOC_IDS=$(echo "$DOCS_RESPONSE" | jq -r '.results[].id')
echo "Document IDs to test: $DOC_IDS"
# Function to clear tags and titles for a document
#######################################
# Clear the metadata (tags, correspondent, document type) on one Paperless
# document so it can be re-processed, keeping the title intact.
# Globals:   API_TOKEN, API_BASE_URL (read); print_* helpers (called)
# Arguments: $1 - numeric document ID
# Outputs:   progress and verification details via print_* to stdout
# Returns:   0 on success, 1 if the document could not be fetched or updated
#######################################
clear_document_metadata() {
    local doc_id=$1
    print_status "Processing document ${doc_id}..."

    # Fetch the current document. Declaration is separated from assignment so
    # curl's exit status is not masked by `local` (SC2155) — the old
    # `[[ $? -ne 0 ]]` check was always testing `local` and could never fire.
    # -f additionally turns HTTP 4xx/5xx into a non-zero exit.
    local doc_data
    if ! doc_data=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/${doc_id}/"); then
        print_error "Failed to get document ${doc_id}"
        return 1
    fi

    # Show the current metadata before clearing it.
    local title tags correspondent document_type
    title=$(echo "$doc_data" | jq -r '.title // empty')
    tags=$(echo "$doc_data" | jq -r '.tags // empty')
    correspondent=$(echo "$doc_data" | jq -r '.correspondent // empty')
    document_type=$(echo "$doc_data" | jq -r '.document_type // empty')
    print_status " Current title: ${title}"
    print_status " Current tags: ${tags}"
    print_status " Current correspondent: ${correspondent}"
    print_status " Current document_type: ${document_type}"

    # Build the PATCH payload with jq so a title containing quotes,
    # backslashes, or newlines still yields valid JSON. (The old heredoc
    # interpolated ${title} directly into the JSON and broke on such titles.)
    # Clears tags/correspondent/document_type but keeps the title.
    local update_payload
    update_payload=$(jq -n --arg title "$title" \
        '{title: $title, tags: [], correspondent: null, document_type: null}')

    print_status " Clearing metadata..."
    local response
    if response=$(curl -sf -X PATCH \
        -H "Authorization: Token ${API_TOKEN}" \
        -H "Content-Type: application/json" \
        -d "$update_payload" \
        "${API_BASE_URL}/documents/${doc_id}/"); then
        print_success " ✅ Successfully cleared metadata for document ${doc_id}"
        # Re-fetch the document to confirm the fields were actually cleared.
        local updated_data
        updated_data=$(curl -sf -H "Authorization: Token ${API_TOKEN}" "${API_BASE_URL}/documents/${doc_id}/")
        print_status " Verification:"
        print_status " Tags: $(echo "$updated_data" | jq -r '.tags // empty')"
        print_status " Correspondent: $(echo "$updated_data" | jq -r '.correspondent // empty')"
        print_status " Document Type: $(echo "$updated_data" | jq -r '.document_type // empty')"
        return 0
    else
        print_error " ❌ Failed to clear metadata for document ${doc_id}"
        print_error " Response: ${response}"
        return 1
    fi
}
# Walk the sampled document IDs, clearing metadata on each and tallying
# successes and failures for the summary below.
SUCCESS_COUNT=0
FAILED_COUNT=0
for candidate_id in $DOC_IDS; do
    # Skip blank entries and literal "null" values jq may have emitted.
    if [[ -z "$candidate_id" || "$candidate_id" == "null" ]]; then
        continue
    fi
    if clear_document_metadata "$candidate_id"; then
        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    else
        FAILED_COUNT=$((FAILED_COUNT + 1))
    fi
    echo ""
done
# --- Step 4: report the tallies from the sample run ------------------------
echo ""
print_status "Step 4: Test Results Summary"
echo "=================================="
print_success "Successfully processed: ${SUCCESS_COUNT} documents"
if (( FAILED_COUNT > 0 )); then
    print_error "Failed to process: ${FAILED_COUNT} documents"
else
    print_success "All test documents processed successfully!"
fi

# --- Step 5: confirm the Paperless AI container is up on the host ----------
echo ""
print_status "Step 5: Testing Paperless AI connection..."
if ssh "root@${PAPERLESS_HOST}" "docker ps | grep -q paperless-ai"; then
    print_success "Paperless AI is running"
    # Surface the most recent log lines so obvious startup errors are visible.
    print_status "Checking Paperless AI logs..."
    ssh "root@${PAPERLESS_HOST}" "docker logs paperless-ai --tail 5"
else
    print_error "Paperless AI is not running!"
fi

echo ""
print_status "Test Complete!"
echo ""
# Only recommend the full run when every sampled document succeeded.
if (( SUCCESS_COUNT == TEST_COUNT )); then
    print_success "✅ All tests passed! The re-processing approach is working correctly."
    echo ""
    echo "🚀 You can now run the full re-processing script:"
    echo " ./scripts/reprocess_all_documents.sh"
else
    print_warning "⚠️ Some tests failed. Please review the errors before running the full script."
fi