Files
cim_summary/backend/src/scripts/monitor-document-processing.ts
admin 9c916d12f4 feat: Production release v2.0.0 - Simple Document Processor
Major release with significant performance improvements and new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
2025-11-09 21:07:22 -05:00

243 lines
7.5 KiB
TypeScript
Executable File

#!/usr/bin/env ts-node
/**
* Monitor Document Processing Script
*
* Usage:
* npx ts-node src/scripts/monitor-document-processing.ts <documentId> [intervalSeconds]
*
* intervalSeconds is optional and defaults to 5.
*
* This script provides real-time monitoring of document processing steps
* and detailed audit information.
*/
import { getSupabaseServiceClient } from '../config/supabase';
import { logger } from '../utils/logger';
/**
 * One row of the "Processing Steps" checklist printed on every poll.
 */
interface ProcessingStep {
  /** Display label of the step, e.g. '1. Document Upload'. */
  step: string;
  /** Progress state of the step; selects the icon printed next to the label. */
  status: 'completed' | 'in_progress' | 'failed' | 'pending';
  /**
   * Extra key/value pairs rendered in parentheses after the label.
   * Use an empty object when there is nothing to show.
   */
  details: Record<string, unknown>;
  /** Optional timestamp associated with the step (populated from the document's created_at for the upload step). */
  timestamp?: string;
}
/**
 * Renders a job error for console output, capped at 100 characters.
 *
 * Non-string values (for example a JSON error object) are serialised instead
 * of being assumed to have string methods.
 */
function formatJobError(error: unknown): string {
  const text = typeof error === 'string' ? error : String(JSON.stringify(error));
  return text.length > 100 ? `${text.substring(0, 100)}...` : text;
}

/**
 * Polls Supabase for the state of one document and prints a status report
 * (document status, latest job, per-step checklist) on every refresh.
 *
 * The process is terminated from inside the poll: exit code 0 once the
 * document status is 'completed' or 'failed', exit code 1 if a poll throws.
 * If the document cannot be loaded, polling stops without forcing an exit.
 *
 * @param documentId - id of the row in the `documents` table to monitor
 * @param intervalSeconds - seconds between polls; values that are not a
 *   positive finite number fall back to 5
 */
async function monitorDocument(documentId: string, intervalSeconds: number = 5): Promise<void> {
  const supabase = getSupabaseServiceClient();

  // A NaN, zero or negative delay makes setInterval fire roughly every
  // millisecond, which would hammer the database; fall back to the default.
  const refreshSeconds =
    Number.isFinite(intervalSeconds) && intervalSeconds > 0 ? intervalSeconds : 5;

  console.log(`\n🔍 Monitoring Document: ${documentId}`);
  console.log(`📊 Refresh interval: ${refreshSeconds} seconds\n`);
  console.log('Press Ctrl+C to stop monitoring\n');
  console.log('='.repeat(80));

  let previousStatus: string | null = null;
  let checkCount = 0;
  // True while a poll is awaiting Supabase, so a slow poll is never overlapped
  // by the next timer tick (which would interleave two reports).
  let pollInFlight = false;

  // Surfaces a failed secondary query instead of silently showing zero / missing data.
  const warnIfFailed = (
    label: string,
    queryError: { message?: string } | null | undefined,
  ): void => {
    if (queryError) {
      console.warn(`   ⚠️  Could not load ${label}: ${queryError.message ?? 'unknown error'}`);
    }
  };

  const poll = async (): Promise<void> => {
    if (pollInFlight) {
      return;
    }
    pollInFlight = true;
    checkCount++;
    const timestamp = new Date().toISOString();

    try {
      // Get document status
      const { data: document, error: docError } = await supabase
        .from('documents')
        .select('*')
        .eq('id', documentId)
        .single();

      if (docError || !document) {
        // Include the underlying error so connectivity or permission problems
        // are not mistaken for a missing row.
        const reason = docError?.message ? ` (${docError.message})` : '';
        console.log(`\n❌ [${timestamp}] Document not found${reason}`);
        clearInterval(monitorInterval);
        return;
      }

      // The remaining lookups are independent of each other, so run them together.
      const [jobsResult, chunksResult, embeddingsResult, reviewResult] = await Promise.all([
        // Latest job
        supabase
          .from('processing_jobs')
          .select('*')
          .eq('document_id', documentId)
          .order('created_at', { ascending: false })
          .limit(1),
        // Total chunks (count only, no rows)
        supabase
          .from('document_chunks')
          .select('*', { count: 'exact', head: true })
          .eq('document_id', documentId),
        // Chunks that already have an embedding
        supabase
          .from('document_chunks')
          .select('*', { count: 'exact', head: true })
          .eq('document_id', documentId)
          .not('embedding', 'is', null),
        // Review: limit(1) + maybeSingle() yields null for "no review yet"
        // and still resolves a row when several reviews exist.
        supabase
          .from('cim_reviews')
          .select('id')
          .eq('document_id', documentId)
          .limit(1)
          .maybeSingle(),
      ]);

      warnIfFailed('processing jobs', jobsResult.error);
      warnIfFailed('chunk count', chunksResult.error);
      warnIfFailed('embedding count', embeddingsResult.error);
      warnIfFailed('review', reviewResult.error);

      const latestJob = jobsResult.data?.[0];
      const chunkCount: number = chunksResult.count ?? 0;
      const embeddingCount: number = embeddingsResult.count ?? 0;
      const review = reviewResult.data;

      // Status change detection
      const statusChanged = previousStatus !== document.status;
      if (statusChanged || checkCount === 1) {
        console.log(`\n📋 [${new Date().toLocaleTimeString()}] Status Update #${checkCount}`);
        console.log('─'.repeat(80));
      }

      // Display current status
      const statusIcon =
        document.status === 'completed' ? '✅' :
        document.status === 'failed' ? '❌' :
        document.status === 'processing_llm' ? '🤖' :
        '⏳';
      console.log(`${statusIcon} Document Status: ${document.status}`);

      if (latestJob) {
        const jobIcon =
          latestJob.status === 'completed' ? '✅' :
          latestJob.status === 'failed' ? '❌' :
          latestJob.status === 'processing' ? '🔄' :
          '⏸️';
        console.log(`${jobIcon} Job Status: ${latestJob.status} (Attempt ${latestJob.attempts}/${latestJob.max_attempts})`);

        if (latestJob.started_at) {
          const elapsed = Math.round((Date.now() - new Date(latestJob.started_at).getTime()) / 1000);
          console.log(` ⏱️ Processing Time: ${elapsed}s (${Math.round(elapsed / 60)}m)`);
        }
        if (latestJob.error) {
          console.log(` ⚠️ Error: ${formatJobError(latestJob.error)}`);
        }
      }

      // Processing steps
      console.log('\n📊 Processing Steps:');
      const steps: ProcessingStep[] = [
        {
          step: '1. Document Upload',
          status: document.upload_status === 'completed' ? 'completed' : 'pending',
          details: {},
          timestamp: document.created_at,
        },
        {
          step: '2. Text Extraction',
          status: document.processing_status ? 'completed' : 'pending',
          details: {},
        },
        {
          step: '3. Document Chunking',
          status: chunkCount > 0 ? 'completed' : 'pending',
          details: { chunks: chunkCount },
        },
        {
          step: '4. Vector Embeddings',
          // Completed only when every chunk has an embedding and there is at least one chunk.
          status: chunkCount > 0 && embeddingCount === chunkCount
            ? 'completed'
            : embeddingCount > 0
              ? 'in_progress'
              : 'pending',
          details: {
            embeddings: embeddingCount,
            chunks: chunkCount,
            progress: chunkCount ? `${Math.round((embeddingCount / chunkCount) * 100)}%` : '0%',
          },
        },
        {
          step: '5. LLM Analysis',
          status: latestJob
            ? latestJob.status === 'completed'
              ? 'completed'
              : latestJob.status === 'failed'
                ? 'failed'
                : 'in_progress'
            : 'pending',
          details: {
            strategy: latestJob?.options?.strategy || 'unknown',
          },
        },
        {
          step: '6. CIM Review',
          // Either a review row or analysis data on the document counts as done.
          status: review || document.analysis_data ? 'completed' : 'pending',
          details: {},
        },
      ];

      for (const step of steps) {
        const icon =
          step.status === 'completed' ? '✅' :
          step.status === 'failed' ? '❌' :
          step.status === 'in_progress' ? '🔄' :
          '⏸️';
        const detailEntries = Object.entries(step.details);
        const detailsStr = detailEntries.length > 0
          ? ` (${detailEntries.map(([k, v]) => `${k}: ${v}`).join(', ')})`
          : '';
        console.log(` ${icon} ${step.step}${detailsStr}`);
      }

      // Completion check: both terminal statuses end the monitor.
      if (document.status === 'completed' || document.status === 'failed') {
        console.log('\n' + '='.repeat(80));
        console.log(`\n${document.status === 'completed' ? '✅' : '❌'} Processing ${document.status}!`);
        if (document.status === 'completed') {
          console.log(`📄 Review ID: ${review?.id || 'N/A'}`);
          console.log(`📝 Has Summary: ${document.generated_summary ? 'Yes' : 'No'}`);
        }
        clearInterval(monitorInterval);
        process.exit(0);
      }

      previousStatus = document.status;
      console.log('\n' + '─'.repeat(80));
    } catch (error) {
      console.error(`\n❌ Error monitoring document:`, error);
      clearInterval(monitorInterval);
      process.exit(1);
    } finally {
      pollInFlight = false;
    }
  };

  // Created before the first poll so that poll() can always clear it.
  const monitorInterval = setInterval(() => {
    void poll();
  }, refreshSeconds * 1000);

  // Initial check
  const initialCheck = async (): Promise<void> => {
    try {
      const { data: document } = await supabase
        .from('documents')
        .select('status, file_path')
        .eq('id', documentId)
        .single();
      if (document) {
        console.log(`📄 File: ${document.file_path?.split('/').pop() || 'Unknown'}`);
        console.log(`📊 Initial Status: ${document.status}\n`);
      }
    } catch (error) {
      console.error('Error in initial check:', error);
    }
  };

  await initialCheck();
  // Report immediately instead of waiting a full interval for the first update.
  await poll();
}
// Main execution: argv[2] is the document id, argv[3] the optional refresh interval in seconds.
const documentId = process.argv[2];

// Parse with an explicit radix and accept only positive values; a missing,
// non-numeric, zero or negative argument falls back to the 5 second default
// (a negative delay would otherwise make the timer fire almost continuously).
const parsedInterval = Number.parseInt(process.argv[3] ?? '', 10);
const interval = Number.isFinite(parsedInterval) && parsedInterval > 0 ? parsedInterval : 5;

if (!documentId) {
  console.error('Usage: npx ts-node src/scripts/monitor-document-processing.ts <documentId> [intervalSeconds]');
  console.error('\nExample:');
  console.error(' npx ts-node src/scripts/monitor-document-processing.ts 5b5a1ab6-ba51-4a... 5');
  process.exit(1);
}

// Any rejection escaping monitorDocument is fatal for the script.
monitorDocument(documentId, interval).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});