Major release with significant performance improvements and a new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
243 lines
7.5 KiB
TypeScript
Executable File
243 lines
7.5 KiB
TypeScript
Executable File
#!/usr/bin/env ts-node
|
|
|
|
/**
|
|
* Monitor Document Processing Script
|
|
*
|
|
* Usage:
|
|
* npx ts-node src/scripts/monitor-document-processing.ts <documentId> [intervalSeconds]
|
|
*
|
|
* This script provides real-time monitoring of document processing steps
|
|
* and detailed audit information.
|
|
*/
|
|
|
|
import { getSupabaseServiceClient } from '../config/supabase';
|
|
import { logger } from '../utils/logger';
|
|
|
|
/**
 * One row of the "Processing Steps" report printed by the monitor.
 */
interface ProcessingStep {
  /** Human-readable step label, e.g. "3. Document Chunking". */
  step: string;
  /** Current state of the step; selects the icon shown next to the label. */
  status: 'completed' | 'in_progress' | 'failed' | 'pending';
  /**
   * Extra key/value pairs rendered after the label as "key: value".
   * An empty object means nothing extra is printed.
   */
  details: Record<string, unknown>;
  /** Optional timestamp associated with the step. */
  timestamp?: string;
}
|
|
|
|
/**
 * Poll Supabase for the processing state of one document and print a
 * progress report to stdout on every poll.
 *
 * Each poll reads the `documents` row, the most recent `processing_jobs` row,
 * the chunk and embedding counts from `document_chunks`, and a `cim_reviews`
 * row for the document. Polling stops and the process exits when the
 * document status becomes `completed` or `failed` (exit code 0), when a poll
 * throws (exit code 1), or when the document cannot be loaded (timer stopped,
 * exit code set to 1).
 *
 * @param documentId - Value matched against `documents.id`.
 * @param intervalSeconds - Seconds between polls; defaults to 5.
 */
async function monitorDocument(documentId: string, intervalSeconds: number = 5) {
  const supabase = getSupabaseServiceClient();

  console.log(`\n🔍 Monitoring Document: ${documentId}`);
  console.log(`📊 Refresh interval: ${intervalSeconds} seconds\n`);
  console.log('Press Ctrl+C to stop monitoring\n');
  console.log('='.repeat(80));

  // Initial check: runs before the timer is started so its output can never
  // interleave with the first poll, even for very small intervals.
  try {
    const { data: initialDocument } = await supabase
      .from('documents')
      .select('status, file_path')
      .eq('id', documentId)
      .single();

    if (initialDocument) {
      console.log(`📄 File: ${initialDocument.file_path?.split('/').pop() || 'Unknown'}`);
      console.log(`📊 Initial Status: ${initialDocument.status}\n`);
    }
  } catch (error) {
    // Best-effort only: the polling loop below reports a missing document.
    console.error('Error in initial check:', error);
  }

  let previousStatus: string | null = null;
  let checkCount = 0;
  // setInterval does not wait for an async callback to settle; this flag
  // keeps a slow poll from overlapping with the next tick.
  let pollInFlight = false;

  const monitorInterval = setInterval(async () => {
    if (pollInFlight) {
      return;
    }
    pollInFlight = true;
    checkCount++;
    const timestamp = new Date().toISOString();

    try {
      // Get document status
      const { data: document, error: docError } = await supabase
        .from('documents')
        .select('*')
        .eq('id', documentId)
        .single();

      if (docError || !document) {
        console.log(`\n❌ [${timestamp}] Document not found`);
        clearInterval(monitorInterval);
        // Signal failure to the caller without cutting off pending output.
        process.exitCode = 1;
        return;
      }

      // Get latest job
      const { data: jobs } = await supabase
        .from('processing_jobs')
        .select('*')
        .eq('document_id', documentId)
        .order('created_at', { ascending: false })
        .limit(1);

      const latestJob = jobs?.[0];

      // Get chunks
      const { count: chunkCount } = await supabase
        .from('document_chunks')
        .select('*', { count: 'exact', head: true })
        .eq('document_id', documentId);

      const { count: embeddingCount } = await supabase
        .from('document_chunks')
        .select('*', { count: 'exact', head: true })
        .eq('document_id', documentId)
        .not('embedding', 'is', null);

      // Get review. limit(1) instead of single(): single() yields an error
      // (and null data) when zero or several rows match, which would make a
      // document with more than one review look unreviewed.
      const { data: reviews } = await supabase
        .from('cim_reviews')
        .select('id')
        .eq('document_id', documentId)
        .limit(1);

      const review = reviews?.[0];

      const chunks = chunkCount ?? 0;
      const embeddings = embeddingCount ?? 0;

      // Status change detection
      const statusChanged = previousStatus !== document.status;
      if (statusChanged || checkCount === 1) {
        console.log(`\n📋 [${new Date().toLocaleTimeString()}] Status Update #${checkCount}`);
        console.log('─'.repeat(80));
      }

      // Display current status
      const statusIcon =
        document.status === 'completed' ? '✅' :
        document.status === 'failed' ? '❌' :
        document.status === 'processing_llm' ? '🤖' :
        '⏳';

      console.log(`${statusIcon} Document Status: ${document.status}`);

      if (latestJob) {
        const jobIcon =
          latestJob.status === 'completed' ? '✅' :
          latestJob.status === 'failed' ? '❌' :
          latestJob.status === 'processing' ? '🔄' :
          '⏸️';

        console.log(`${jobIcon} Job Status: ${latestJob.status} (Attempt ${latestJob.attempts}/${latestJob.max_attempts})`);

        if (latestJob.started_at) {
          const elapsed = Math.round((Date.now() - new Date(latestJob.started_at).getTime()) / 1000);
          console.log(` ⏱️ Processing Time: ${elapsed}s (${Math.round(elapsed / 60)}m)`);
        }

        if (latestJob.error) {
          // The stored error may not be a plain string; coerce before
          // truncating so the monitor itself cannot throw here.
          const errorText: string =
            typeof latestJob.error === 'string'
              ? latestJob.error
              : JSON.stringify(latestJob.error) ?? String(latestJob.error);
          console.log(` ⚠️ Error: ${errorText.substring(0, 100)}${errorText.length > 100 ? '...' : ''}`);
        }
      }

      // Processing steps
      console.log('\n📊 Processing Steps:');
      const steps: ProcessingStep[] = [
        {
          step: '1. Document Upload',
          status: document.upload_status === 'completed' ? 'completed' : 'pending',
          details: {},
          timestamp: document.created_at,
        },
        {
          step: '2. Text Extraction',
          status: document.processing_status ? 'completed' : 'pending',
          details: {},
        },
        {
          step: '3. Document Chunking',
          status: chunks > 0 ? 'completed' : 'pending',
          details: { chunks },
        },
        {
          step: '4. Vector Embeddings',
          // Completed only once every chunk has an embedding.
          status: embeddings === chunks && chunks > 0
            ? 'completed'
            : embeddings > 0
              ? 'in_progress'
              : 'pending',
          details: {
            embeddings,
            chunks,
            progress: chunks > 0 ? `${Math.round((embeddings / chunks) * 100)}%` : '0%',
          },
        },
        {
          step: '5. LLM Analysis',
          status: latestJob
            ? latestJob.status === 'completed'
              ? 'completed'
              : latestJob.status === 'failed'
                ? 'failed'
                : 'in_progress'
            : 'pending',
          details: {
            strategy: latestJob?.options?.strategy || 'unknown',
          },
        },
        {
          step: '6. CIM Review',
          status: review ? 'completed' : document.analysis_data ? 'completed' : 'pending',
          details: {},
        },
      ];

      steps.forEach((step) => {
        const icon =
          step.status === 'completed' ? '✅' :
          step.status === 'failed' ? '❌' :
          step.status === 'in_progress' ? '🔄' :
          '⏸️';

        const detailsStr = Object.keys(step.details).length > 0
          ? ` (${Object.entries(step.details).map(([k, v]) => `${k}: ${v}`).join(', ')})`
          : '';

        console.log(` ${icon} ${step.step}${detailsStr}`);
      });

      // Completion check
      if (document.status === 'completed' || document.status === 'failed') {
        console.log('\n' + '='.repeat(80));
        console.log(`\n${document.status === 'completed' ? '✅' : '❌'} Processing ${document.status}!`);

        if (document.status === 'completed') {
          console.log(`📄 Review ID: ${review?.id || 'N/A'}`);
          console.log(`📝 Has Summary: ${document.generated_summary ? 'Yes' : 'No'}`);
        }

        clearInterval(monitorInterval);
        process.exit(0);
      }

      previousStatus = document.status;
      console.log('\n' + '─'.repeat(80));
    } catch (error) {
      console.error(`\n❌ Error monitoring document:`, error);
      clearInterval(monitorInterval);
      process.exit(1);
    } finally {
      pollInFlight = false;
    }
  }, intervalSeconds * 1000);
}
|
|
|
|
// Main execution: argv[2] is the document id, argv[3] the optional poll
// interval in seconds.
const documentId = process.argv[2];

// Parse with an explicit radix and accept only strictly positive values; a
// missing, non-numeric, zero or negative argument falls back to 5 seconds so
// the poll timer is never scheduled with a nonsensical delay.
const parsedInterval = Number.parseInt(process.argv[3] ?? '', 10);
const interval = Number.isFinite(parsedInterval) && parsedInterval > 0 ? parsedInterval : 5;

if (!documentId) {
  console.error('Usage: npx ts-node src/scripts/monitor-document-processing.ts <documentId> [intervalSeconds]');
  console.error('\nExample:');
  console.error(' npx ts-node src/scripts/monitor-document-processing.ts 5b5a1ab6-ba51-4a... 5');
  process.exit(1);
}

// Any rejection that escapes monitorDocument is fatal for this script.
monitorDocument(documentId, interval).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
|
|
|