Files
cim_summary/backend/src/routes/documentAudit.ts
admin 9c916d12f4 feat: Production release v2.0.0 - Simple Document Processor
Major release with significant performance improvements and new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
2025-11-09 21:07:22 -05:00

362 lines
12 KiB
TypeScript

import { Router, Request, Response } from 'express';
import { getSupabaseServiceClient } from '../config/supabase';
import { logger } from '../utils/logger';
import { addCorrelationId } from '../middleware/validation';
// Express router that carries the document-audit endpoint(s) defined in this file.
const router = Router();
// Run addCorrelationId before every route on this router; the handlers below
// echo req.correlationId in their responses and log entries.
// NOTE(review): presumably this middleware is what populates req.correlationId — confirm in ../middleware/validation.
router.use(addCorrelationId);
/**
 * Number of dimensions in a stored chunk embedding.
 *
 * The embedding may arrive either as an array-like value or as a JSON-encoded
 * string. Anything missing, unparsable, or not array-shaped counts as 0 so a
 * single bad row cannot fail the whole audit response.
 */
function auditEmbeddingDimensions(embedding: unknown): number {
  if (!embedding) {
    return 0;
  }
  if (typeof embedding === 'string') {
    try {
      const parsed: unknown = JSON.parse(embedding);
      return Array.isArray(parsed) ? parsed.length : 0;
    } catch {
      return 0;
    }
  }
  const length = (embedding as { length?: unknown }).length;
  return typeof length === 'number' ? length : 0;
}

/**
 * Short preview of a chunk's text for the audit listing.
 *
 * Returns 'No content' for missing/empty content, and appends an ellipsis only
 * when the text was actually truncated.
 */
function auditContentPreview(content: unknown, maxLength = 200): string {
  if (typeof content !== 'string' || content.length === 0) {
    return 'No content';
  }
  return content.length > maxLength ? `${content.substring(0, maxLength)}...` : content;
}

/**
 * GET /api/audit/document/:documentId
 * Get detailed step-by-step audit trail for a document processing
 *
 * Responds with:
 * - 404 when the document row cannot be loaded,
 * - 200 with `{ success: true, data: auditTrail }` otherwise,
 * - 500 when anything throws while assembling the trail.
 *
 * Failures of the secondary queries (jobs, chunks, review) do not fail the
 * request; the affected section is reported as empty and the error is logged.
 */
router.get('/document/:documentId', async (req: Request, res: Response): Promise<void> => {
  try {
    const { documentId } = req.params;
    const supabase = getSupabaseServiceClient();

    // Get document details
    const { data: document, error: docError } = await supabase
      .from('documents')
      .select('*')
      .eq('id', documentId)
      .single();

    if (docError || !document) {
      res.status(404).json({
        success: false,
        error: 'Document not found',
        documentId,
        correlationId: req.correlationId || undefined,
      });
      return;
    }

    // The three lookups below are independent of each other, so run them concurrently.
    const [jobsResult, chunksResult, reviewResult] = await Promise.all([
      // All processing jobs for this document, newest first
      supabase
        .from('processing_jobs')
        .select('*')
        .eq('document_id', documentId)
        .order('created_at', { ascending: false }),
      // Document chunks (embeddings), in chunk order
      supabase
        .from('document_chunks')
        .select('id, chunk_index, content, metadata, created_at, embedding')
        .eq('document_id', documentId)
        .order('chunk_index', { ascending: true }),
      // CIM review, if one exists
      supabase
        .from('cim_reviews')
        .select('*')
        .eq('document_id', documentId)
        .single(),
    ]);

    const jobs = jobsResult.data ?? [];
    const chunks = chunksResult.data ?? [];
    const review = reviewResult.data;

    // Surface secondary-query failures instead of silently presenting them as "no data".
    // .single() reports PostgREST code PGRST116 when it does not get exactly one row;
    // for cim_reviews that is the ordinary "no review yet" case, so it is not logged.
    const queryFailures = [
      { table: 'processing_jobs', error: jobsResult.error },
      { table: 'document_chunks', error: chunksResult.error },
      {
        table: 'cim_reviews',
        error: reviewResult.error && reviewResult.error.code !== 'PGRST116' ? reviewResult.error : null,
      },
    ];
    for (const failure of queryFailures) {
      if (failure.error) {
        logger.error('Audit trail query failed; section will be reported as empty', {
          table: failure.table,
          error: failure.error.message,
          documentId,
          correlationId: req.correlationId || undefined,
        });
      }
    }

    const chunksWithEmbeddings = chunks.filter(c => c.embedding).length;

    // Build comprehensive audit trail
    const auditTrail = {
      document: {
        id: document.id,
        filePath: document.file_path,
        fileName: document.file_path?.split('/').pop() || 'Unknown',
        status: document.status,
        uploadStatus: document.upload_status,
        processingStatus: document.processing_status,
        createdAt: document.created_at,
        updatedAt: document.updated_at,
        processingCompletedAt: document.processing_completed_at,
        generatedSummary: document.generated_summary ? 'Yes' : 'No',
        hasAnalysisData: !!document.analysis_data,
      },
      processingJobs: jobs.map(job => ({
        id: job.id,
        status: job.status,
        strategy: job.options?.strategy || 'unknown',
        attempts: job.attempts,
        maxAttempts: job.max_attempts,
        createdAt: job.created_at,
        startedAt: job.started_at,
        completedAt: job.completed_at,
        error: job.error,
        // Seconds from start to completion; for a job still running, seconds elapsed so far.
        processingDuration: job.started_at && job.completed_at
          ? Math.round((new Date(job.completed_at).getTime() - new Date(job.started_at).getTime()) / 1000)
          : job.started_at
            ? Math.round((Date.now() - new Date(job.started_at).getTime()) / 1000)
            : null,
        options: job.options,
      })),
      vectorEmbeddings: {
        totalChunks: chunks.length,
        chunksWithEmbeddings,
        chunks: chunks.map(chunk => ({
          index: chunk.chunk_index,
          contentLength: chunk.content?.length || 0,
          contentPreview: auditContentPreview(chunk.content),
          hasEmbedding: !!chunk.embedding,
          embeddingDimensions: auditEmbeddingDimensions(chunk.embedding),
          createdAt: chunk.created_at,
          metadata: chunk.metadata,
        })),
      },
      cimReview: review ? {
        id: review.id,
        exists: true,
        createdAt: review.created_at,
        updatedAt: review.updated_at,
        hasData: true,
      } : {
        exists: false,
        message: 'No CIM review generated yet',
      },
      processingSteps: buildProcessingSteps(document, jobs, chunks, review),
      timeline: buildTimeline(document, jobs, chunks, review),
      summary: {
        overallStatus: document.status,
        // Seconds between document creation and processing completion, when both are known.
        totalProcessingTime: document.processing_completed_at && document.created_at
          ? Math.round((new Date(document.processing_completed_at).getTime() - new Date(document.created_at).getTime()) / 1000)
          : null,
        totalJobs: jobs.length,
        successfulJobs: jobs.filter(j => j.status === 'completed').length,
        failedJobs: jobs.filter(j => j.status === 'failed').length,
        totalChunks: chunks.length,
        chunksWithEmbeddings,
        hasReview: !!review,
        // Jobs are ordered newest first, so this is the most recent recorded error.
        lastError: jobs.find(j => j.error)?.error || null,
      },
    };

    logger.info('Document audit trail retrieved', {
      documentId,
      status: document.status,
      totalJobs: jobs.length,
      totalChunks: chunks.length,
      correlationId: req.correlationId || undefined,
    });

    res.json({
      success: true,
      data: auditTrail,
      correlationId: req.correlationId || undefined,
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Unknown error';
    logger.error('Failed to get document audit trail', {
      error: message,
      documentId: req.params.documentId,
      correlationId: req.correlationId || undefined,
    });
    res.status(500).json({
      success: false,
      error: 'Failed to retrieve document audit trail',
      message,
      correlationId: req.correlationId || undefined,
    });
  }
});
/**
 * Build detailed processing steps from audit data.
 *
 * Produces a fixed list of seven steps (upload, text extraction, chunking,
 * embedding generation, LLM analysis, review storage, final status), each with
 * a status inferred from the rows passed in.
 *
 * @param document Row from the documents table.
 * @param jobs     Processing jobs for the document; jobs[0] is treated as the latest job.
 * @param chunks   Chunk rows for the document (may be empty).
 * @param review   CIM review row, or a falsy value when none exists.
 * @returns The seven steps in pipeline order.
 */
function buildProcessingSteps(
  document: any,
  jobs: any[],
  chunks: any[],
  review: any
): Array<{ step: string; status: 'completed' | 'in_progress' | 'failed' | 'pending'; details: any; timestamp?: string }> {
  const steps: ReturnType<typeof buildProcessingSteps> = [];

  // Dimension count of an embedding that may be an array-like value or a JSON string.
  // Unparsable or non-array values count as 0 instead of throwing, so one bad row
  // cannot break the whole audit.
  const dimensionsOf = (embedding: unknown): number => {
    if (!embedding) {
      return 0;
    }
    if (typeof embedding === 'string') {
      try {
        const parsed: unknown = JSON.parse(embedding);
        return Array.isArray(parsed) ? parsed.length : 0;
      } catch {
        return 0;
      }
    }
    const length = (embedding as { length?: unknown }).length;
    return typeof length === 'number' ? length : 0;
  };

  // Step 1: Document Upload
  steps.push({
    step: '1. Document Upload',
    status: document.upload_status === 'completed' ? 'completed' : document.upload_status === 'failed' ? 'failed' : 'pending',
    details: {
      filePath: document.file_path,
      uploadStatus: document.upload_status,
    },
    timestamp: document.created_at,
  });

  // Step 2: Document AI Text Extraction
  // Treated as done once the document has any processing status or has left 'pending'.
  const hasExtractedText = document.processing_status || document.status !== 'pending';
  steps.push({
    step: '2. Document AI Text Extraction',
    status: hasExtractedText ? 'completed' : 'pending',
    details: {
      processingStatus: document.processing_status,
      documentStatus: document.status,
    },
    timestamp: document.updated_at,
  });

  // Step 3: Chunking
  const totalChunks = chunks.length;
  steps.push({
    step: '3. Document Chunking',
    status: totalChunks > 0 ? 'completed' : 'pending',
    details: {
      totalChunks,
      averageChunkSize: totalChunks > 0
        ? Math.round(chunks.reduce((sum, c) => sum + (c.content?.length || 0), 0) / totalChunks)
        : 0,
    },
    timestamp: totalChunks > 0 ? chunks[0].created_at : undefined,
  });

  // Step 4: Vector Embedding Generation
  const chunksWithEmbeddings = chunks.filter(c => c.embedding).length;
  // Looked up once; it supplies both the sample dimension count and the step timestamp.
  const firstEmbeddedChunk = chunks.find(c => c.embedding);
  steps.push({
    step: '4. Vector Embedding Generation',
    status: chunksWithEmbeddings === totalChunks && totalChunks > 0
      ? 'completed'
      : chunksWithEmbeddings > 0
        ? 'in_progress'
        : 'pending',
    details: {
      chunksWithEmbeddings,
      totalChunks,
      completionRate: totalChunks > 0 ? ((chunksWithEmbeddings / totalChunks) * 100).toFixed(1) + '%' : '0%',
      embeddingDimensions: dimensionsOf(firstEmbeddedChunk?.embedding),
    },
    timestamp: firstEmbeddedChunk?.created_at,
  });

  // Step 5: LLM Analysis
  const latestJob = jobs[0];
  const llmStepStatus = latestJob
    ? latestJob.status === 'completed'
      ? 'completed'
      : latestJob.status === 'failed'
        ? 'failed'
        : 'in_progress'
    : 'pending';
  steps.push({
    step: '5. LLM Analysis & CIM Review Generation',
    status: llmStepStatus,
    details: {
      jobStatus: latestJob?.status,
      attempts: latestJob ? `${latestJob.attempts}/${latestJob.max_attempts}` : '0/0',
      strategy: latestJob?.options?.strategy || 'unknown',
      error: latestJob?.error || null,
      hasAnalysisData: !!document.analysis_data,
    },
    timestamp: latestJob?.started_at || latestJob?.created_at,
  });

  // Step 6: CIM Review Storage
  // Either a review row or analysis data stored on the document counts as stored.
  steps.push({
    step: '6. CIM Review Storage',
    status: review || document.analysis_data ? 'completed' : 'pending',
    details: {
      reviewExists: !!review,
      hasAnalysisData: !!document.analysis_data,
      reviewId: review?.id || null,
    },
    timestamp: review?.created_at || document.processing_completed_at,
  });

  // Step 7: Final Status
  steps.push({
    step: '7. Processing Complete',
    status: document.status === 'completed' ? 'completed' : document.status === 'failed' ? 'failed' : 'in_progress',
    details: {
      finalStatus: document.status,
      processingCompletedAt: document.processing_completed_at,
      hasSummary: !!document.generated_summary,
    },
    timestamp: document.processing_completed_at || document.updated_at,
  });

  return steps;
}
/**
 * Assemble every timestamped event known for a document into one list,
 * ordered from earliest to latest.
 *
 * Events come from the document row itself, each processing job (created /
 * started / finished), the first chunk, and the CIM review when present.
 * Jobs are labelled "Job N" by their position in `jobs`, not by age.
 */
function buildTimeline(
  document: any,
  jobs: any[],
  chunks: any[],
  review: any
): Array<{ timestamp: string; event: string; details: any }> {
  type TimelineEntry = { timestamp: string; event: string; details: any };

  const toMillis = (value: string): number => new Date(value).getTime();

  // Up to three entries per job: always "Created", then "Started" and the
  // outcome entry only when the corresponding timestamp is set.
  const jobEntries = jobs.flatMap((job, position): TimelineEntry[] => {
    const jobNumber = position + 1;
    const entries: TimelineEntry[] = [
      {
        timestamp: job.created_at,
        event: `Job ${jobNumber} Created`,
        details: { jobId: job.id, strategy: job.options?.strategy },
      },
    ];

    if (job.started_at) {
      entries.push({
        timestamp: job.started_at,
        event: `Job ${jobNumber} Started`,
        details: { jobId: job.id },
      });
    }

    if (job.completed_at) {
      // Any finished job whose status is not 'completed' is reported as failed.
      const outcome = job.status === 'completed' ? 'Completed' : 'Failed';
      entries.push({
        timestamp: job.completed_at,
        event: `Job ${jobNumber} ${outcome}`,
        details: { jobId: job.id, status: job.status, error: job.error || null },
      });
    }

    return entries;
  });

  const entries: TimelineEntry[] = [
    {
      timestamp: document.created_at,
      event: 'Document Created',
      details: { filePath: document.file_path },
    },
    ...jobEntries,
  ];

  // Only the first chunk is recorded; it stands in for the start of chunking.
  const [firstChunk] = chunks;
  if (chunks.length > 0) {
    entries.push({
      timestamp: firstChunk.created_at,
      event: 'First Chunk Created',
      details: { totalChunks: chunks.length },
    });
  }

  if (review) {
    entries.push({
      timestamp: review.created_at,
      event: 'CIM Review Created',
      details: { reviewId: review.id },
    });
  }

  // An update entry is only meaningful when the row changed after creation.
  const wasUpdated = document.updated_at !== document.created_at;
  if (wasUpdated) {
    entries.push({
      timestamp: document.updated_at,
      event: 'Document Updated',
      details: { status: document.status },
    });
  }

  if (document.processing_completed_at) {
    entries.push({
      timestamp: document.processing_completed_at,
      event: 'Processing Completed',
      details: { finalStatus: document.status },
    });
  }

  // Chronological order; entries with equal timestamps keep the order in which they were added.
  return entries.sort((earlier, later) => toMillis(earlier.timestamp) - toMillis(later.timestamp));
}
// Expose the configured router as this module's default export.
// NOTE(review): the mount point is not visible in this file; the route's doc comment implies an /api/audit prefix — confirm where the app registers it.
export default router;