Major release with significant performance improvements and a new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
362 lines
12 KiB
TypeScript
362 lines
12 KiB
TypeScript
import { Router, Request, Response } from 'express';
|
|
import { getSupabaseServiceClient } from '../config/supabase';
|
|
import { logger } from '../utils/logger';
|
|
import { addCorrelationId } from '../middleware/validation';
|
|
|
|
// Express router that carries the audit routes registered in this file.
const router = Router();
|
|
// Run addCorrelationId before every route on this router.
// NOTE(review): presumably this populates req.correlationId, which the handlers
// below echo back in responses and logs — confirm in ../middleware/validation.
router.use(addCorrelationId);
|
|
|
|
/** Maximum number of chunk-content characters echoed back in an audit preview. */
const AUDIT_PREVIEW_MAX_CHARS = 200;

/**
 * Build the short content preview reported for a chunk.
 *
 * Returns 'No content' when the chunk has no usable text. An ellipsis is only
 * appended when the text was actually truncated.
 */
function buildChunkContentPreview(content: unknown): string {
  if (typeof content !== 'string' || content.length === 0) {
    return 'No content';
  }
  return content.length > AUDIT_PREVIEW_MAX_CHARS
    ? `${content.substring(0, AUDIT_PREVIEW_MAX_CHARS)}...`
    : content;
}

/**
 * Count the dimensions of a stored embedding.
 *
 * The embedding may arrive either as an array-like value or as a JSON string.
 * Anything missing, unparseable or without a numeric `length` counts as 0 so a
 * single bad row cannot fail the whole audit request.
 */
function getChunkEmbeddingDimensions(embedding: unknown): number {
  if (!embedding) {
    return 0;
  }

  let value: unknown = embedding;
  if (typeof embedding === 'string') {
    try {
      value = JSON.parse(embedding);
    } catch {
      return 0;
    }
  }

  const length = (value as { length?: unknown } | null)?.length;
  return typeof length === 'number' ? length : 0;
}

/**
 * GET /api/audit/document/:documentId
 *
 * Returns a step-by-step audit trail for one document: the document row, its
 * processing jobs (newest first), its chunks and embedding coverage, the CIM
 * review (if any), derived processing steps, a chronological timeline and a
 * summary.
 *
 * Responses:
 * - 200 `{ success: true, data, correlationId }`
 * - 404 when the document row cannot be loaded
 * - 500 when an unexpected error is thrown
 *
 * Failures of the secondary queries (jobs, chunks, review) do not fail the
 * request; they are logged and the corresponding section is reported as empty.
 */
router.get('/document/:documentId', async (req: Request, res: Response): Promise<void> => {
  try {
    const { documentId } = req.params;
    const supabase = getSupabaseServiceClient();

    // The document row is mandatory: without it there is nothing to audit.
    const { data: document, error: docError } = await supabase
      .from('documents')
      .select('*')
      .eq('id', documentId)
      .single();

    if (docError || !document) {
      res.status(404).json({
        success: false,
        error: 'Document not found',
        documentId,
        correlationId: req.correlationId || undefined,
      });
      return;
    }

    // The remaining lookups are independent of each other, so run them concurrently.
    const [jobsResult, chunksResult, reviewResult] = await Promise.all([
      supabase
        .from('processing_jobs')
        .select('*')
        .eq('document_id', documentId)
        .order('created_at', { ascending: false }),
      supabase
        .from('document_chunks')
        .select('id, chunk_index, content, metadata, created_at, embedding')
        .eq('document_id', documentId)
        .order('chunk_index', { ascending: true }),
      // maybeSingle(): a document without a review is a normal state, not an error.
      supabase
        .from('cim_reviews')
        .select('*')
        .eq('document_id', documentId)
        .maybeSingle(),
    ]);

    // Surface secondary query failures instead of silently reporting empty data.
    const secondaryQueries: Array<[string, { message?: string } | null]> = [
      ['processing_jobs', jobsResult.error],
      ['document_chunks', chunksResult.error],
      ['cim_reviews', reviewResult.error],
    ];
    for (const [table, queryError] of secondaryQueries) {
      if (queryError) {
        logger.error('Audit trail query failed; returning partial data', {
          table,
          error: queryError.message,
          documentId,
          correlationId: req.correlationId || undefined,
        });
      }
    }

    const jobs = jobsResult.data ?? [];
    const chunks = chunksResult.data ?? [];
    const review = reviewResult.data ?? null;
    const chunksWithEmbeddings = chunks.filter(c => c.embedding).length;

    // Build comprehensive audit trail
    const auditTrail = {
      document: {
        id: document.id,
        filePath: document.file_path,
        fileName: document.file_path?.split('/').pop() || 'Unknown',
        status: document.status,
        uploadStatus: document.upload_status,
        processingStatus: document.processing_status,
        createdAt: document.created_at,
        updatedAt: document.updated_at,
        processingCompletedAt: document.processing_completed_at,
        generatedSummary: document.generated_summary ? 'Yes' : 'No',
        hasAnalysisData: !!document.analysis_data,
      },
      processingJobs: jobs.map(job => {
        // Duration in seconds: finished jobs use completed_at, running jobs use "now".
        const startedMs = job.started_at ? new Date(job.started_at).getTime() : null;
        const endedMs = job.completed_at ? new Date(job.completed_at).getTime() : Date.now();

        return {
          id: job.id,
          status: job.status,
          strategy: job.options?.strategy || 'unknown',
          attempts: job.attempts,
          maxAttempts: job.max_attempts,
          createdAt: job.created_at,
          startedAt: job.started_at,
          completedAt: job.completed_at,
          error: job.error,
          processingDuration: startedMs === null ? null : Math.round((endedMs - startedMs) / 1000),
          options: job.options,
        };
      }),
      vectorEmbeddings: {
        totalChunks: chunks.length,
        chunksWithEmbeddings,
        chunks: chunks.map(chunk => ({
          index: chunk.chunk_index,
          contentLength: chunk.content?.length || 0,
          contentPreview: buildChunkContentPreview(chunk.content),
          hasEmbedding: !!chunk.embedding,
          embeddingDimensions: getChunkEmbeddingDimensions(chunk.embedding),
          createdAt: chunk.created_at,
          metadata: chunk.metadata,
        })),
      },
      cimReview: review
        ? {
            id: review.id,
            exists: true,
            createdAt: review.created_at,
            updatedAt: review.updated_at,
            hasData: true,
          }
        : {
            exists: false,
            message: 'No CIM review generated yet',
          },
      processingSteps: buildProcessingSteps(document, jobs, chunks, review),
      timeline: buildTimeline(document, jobs, chunks, review),
      summary: {
        overallStatus: document.status,
        // Wall-clock seconds from document creation to processing completion.
        totalProcessingTime: document.processing_completed_at && document.created_at
          ? Math.round(
              (new Date(document.processing_completed_at).getTime() - new Date(document.created_at).getTime()) / 1000
            )
          : null,
        totalJobs: jobs.length,
        successfulJobs: jobs.filter(j => j.status === 'completed').length,
        failedJobs: jobs.filter(j => j.status === 'failed').length,
        totalChunks: chunks.length,
        chunksWithEmbeddings,
        hasReview: !!review,
        // Jobs are ordered newest first, so this is the most recent recorded error.
        lastError: jobs.find(j => j.error)?.error || null,
      },
    };

    logger.info('Document audit trail retrieved', {
      documentId,
      status: document.status,
      totalJobs: jobs.length,
      totalChunks: chunks.length,
      correlationId: req.correlationId || undefined,
    });

    res.json({
      success: true,
      data: auditTrail,
      correlationId: req.correlationId || undefined,
    });
  } catch (error) {
    logger.error('Failed to get document audit trail', {
      error: error instanceof Error ? error.message : 'Unknown error',
      documentId: req.params.documentId,
      correlationId: req.correlationId || undefined,
    });

    res.status(500).json({
      success: false,
      error: 'Failed to retrieve document audit trail',
      message: error instanceof Error ? error.message : 'Unknown error',
      correlationId: req.correlationId || undefined,
    });
  }
});
|
|
|
|
/** One entry in the derived processing-step list returned by buildProcessingSteps. */
type AuditProcessingStep = {
  step: string;
  status: 'completed' | 'in_progress' | 'failed' | 'pending';
  details: any;
  timestamp?: string;
};

/**
 * Length of a stored embedding that may be an array-like value or a JSON string.
 * Missing, unparseable or length-less values count as 0 instead of throwing.
 */
function measureEmbeddingLength(embedding: unknown): number {
  if (!embedding) {
    return 0;
  }

  let value: unknown = embedding;
  if (typeof embedding === 'string') {
    try {
      value = JSON.parse(embedding);
    } catch {
      return 0;
    }
  }

  const length = (value as { length?: unknown } | null)?.length;
  return typeof length === 'number' ? length : 0;
}

/**
 * Build detailed processing steps from audit data.
 *
 * Derives seven fixed steps (upload, text extraction, chunking, embedding
 * generation, LLM analysis, review storage, final status) from the rows that
 * were loaded for a document. Nothing is fetched here; every status is inferred
 * from the values passed in.
 *
 * @param document Row from the documents table.
 * @param jobs Processing jobs for the document; `jobs[0]` is treated as the latest job.
 * @param chunks Chunk rows for the document; `chunks[0]` supplies the chunking timestamp.
 * @param review CIM review row, or a falsy value when none exists.
 * @returns The seven steps in pipeline order.
 */
function buildProcessingSteps(
  document: any,
  jobs: any[],
  chunks: any[],
  review: any
): Array<{ step: string; status: 'completed' | 'in_progress' | 'failed' | 'pending'; details: any; timestamp?: string }> {
  const steps: AuditProcessingStep[] = [];

  // Step 1: Document Upload
  steps.push({
    step: '1. Document Upload',
    status:
      document.upload_status === 'completed'
        ? 'completed'
        : document.upload_status === 'failed'
          ? 'failed'
          : 'pending',
    details: {
      filePath: document.file_path,
      uploadStatus: document.upload_status,
    },
    timestamp: document.created_at,
  });

  // Step 2: Document AI Text Extraction
  // Treated as done once any processing status exists or the document has left 'pending'.
  const hasExtractedText = document.processing_status || document.status !== 'pending';
  steps.push({
    step: '2. Document AI Text Extraction',
    status: hasExtractedText ? 'completed' : 'pending',
    details: {
      processingStatus: document.processing_status,
      documentStatus: document.status,
    },
    timestamp: document.updated_at,
  });

  // Step 3: Chunking
  const totalChunks = chunks.length;
  const totalContentLength = chunks.reduce((sum, c) => sum + (c.content?.length || 0), 0);
  steps.push({
    step: '3. Document Chunking',
    status: totalChunks > 0 ? 'completed' : 'pending',
    details: {
      totalChunks,
      averageChunkSize: totalChunks > 0 ? Math.round(totalContentLength / totalChunks) : 0,
    },
    timestamp: totalChunks > 0 ? chunks[0].created_at : undefined,
  });

  // Step 4: Vector Embedding Generation
  const embeddedChunks = chunks.filter(c => c.embedding);
  const chunksWithEmbeddings = embeddedChunks.length;
  // The first embedded chunk is the sample for dimensions and the step timestamp.
  const firstEmbeddedChunk = embeddedChunks[0];
  steps.push({
    step: '4. Vector Embedding Generation',
    status:
      chunksWithEmbeddings === totalChunks && totalChunks > 0
        ? 'completed'
        : chunksWithEmbeddings > 0
          ? 'in_progress'
          : 'pending',
    details: {
      chunksWithEmbeddings,
      totalChunks,
      completionRate: totalChunks > 0 ? ((chunksWithEmbeddings / totalChunks) * 100).toFixed(1) + '%' : '0%',
      embeddingDimensions: measureEmbeddingLength(firstEmbeddedChunk?.embedding),
    },
    timestamp: firstEmbeddedChunk?.created_at,
  });

  // Step 5: LLM Analysis
  const latestJob = jobs[0];
  let llmStepStatus: AuditProcessingStep['status'] = 'pending';
  if (latestJob) {
    llmStepStatus =
      latestJob.status === 'completed'
        ? 'completed'
        : latestJob.status === 'failed'
          ? 'failed'
          : 'in_progress';
  }

  steps.push({
    step: '5. LLM Analysis & CIM Review Generation',
    status: llmStepStatus,
    details: {
      jobStatus: latestJob?.status,
      attempts: latestJob ? `${latestJob.attempts}/${latestJob.max_attempts}` : '0/0',
      strategy: latestJob?.options?.strategy || 'unknown',
      error: latestJob?.error || null,
      hasAnalysisData: !!document.analysis_data,
    },
    timestamp: latestJob?.started_at || latestJob?.created_at,
  });

  // Step 6: CIM Review Storage
  // Either a stored review row or analysis data on the document counts as stored.
  steps.push({
    step: '6. CIM Review Storage',
    status: review || document.analysis_data ? 'completed' : 'pending',
    details: {
      reviewExists: !!review,
      hasAnalysisData: !!document.analysis_data,
      reviewId: review?.id || null,
    },
    timestamp: review?.created_at || document.processing_completed_at,
  });

  // Step 7: Final Status
  steps.push({
    step: '7. Processing Complete',
    status:
      document.status === 'completed'
        ? 'completed'
        : document.status === 'failed'
          ? 'failed'
          : 'in_progress',
    details: {
      finalStatus: document.status,
      processingCompletedAt: document.processing_completed_at,
      hasSummary: !!document.generated_summary,
    },
    timestamp: document.processing_completed_at || document.updated_at,
  });

  return steps;
}
|
|
|
|
/**
 * Sort key for a timeline timestamp, in epoch milliseconds.
 * Missing or unparseable timestamps map to +Infinity so they sort after every
 * dated event instead of producing NaN inside the comparator.
 */
function timelineSortKey(timestamp: unknown): number {
  if (timestamp == null || timestamp === '') {
    return Number.POSITIVE_INFINITY;
  }
  const millis = new Date(timestamp as string).getTime();
  return Number.isNaN(millis) ? Number.POSITIVE_INFINITY : millis;
}

/**
 * Build chronological timeline of events.
 *
 * Collects document, job, chunk and review events and returns them ordered by
 * timestamp (ascending). Events whose timestamp is missing or unparseable are
 * kept but placed at the end, in insertion order.
 *
 * Job labels ("Job 1", "Job 2", ...) follow the order of the `jobs` array as
 * passed in, not chronological order.
 *
 * @param document Row from the documents table.
 * @param jobs Processing jobs for the document.
 * @param chunks Chunk rows for the document; only `chunks[0]` contributes an event.
 * @param review CIM review row, or a falsy value when none exists.
 * @returns Timeline entries sorted by timestamp.
 */
function buildTimeline(
  document: any,
  jobs: any[],
  chunks: any[],
  review: any
): Array<{ timestamp: string; event: string; details: any }> {
  const timeline: Array<{ timestamp: string; event: string; details: any }> = [];

  // Document creation
  timeline.push({
    timestamp: document.created_at,
    event: 'Document Created',
    details: { filePath: document.file_path },
  });

  // Job events
  jobs.forEach((job, index) => {
    const label = `Job ${index + 1}`;

    timeline.push({
      timestamp: job.created_at,
      event: `${label} Created`,
      details: { jobId: job.id, strategy: job.options?.strategy },
    });

    if (job.started_at) {
      timeline.push({
        timestamp: job.started_at,
        event: `${label} Started`,
        details: { jobId: job.id },
      });
    }

    if (job.completed_at) {
      timeline.push({
        timestamp: job.completed_at,
        event: `${label} ${job.status === 'completed' ? 'Completed' : 'Failed'}`,
        details: { jobId: job.id, status: job.status, error: job.error || null },
      });
    }
  });

  // Chunk creation (first chunk)
  if (chunks.length > 0) {
    timeline.push({
      timestamp: chunks[0].created_at,
      event: 'First Chunk Created',
      details: { totalChunks: chunks.length },
    });
  }

  // Review creation
  if (review) {
    timeline.push({
      timestamp: review.created_at,
      event: 'CIM Review Created',
      details: { reviewId: review.id },
    });
  }

  // Document updates: only when an update time exists and differs from creation.
  if (document.updated_at && document.updated_at !== document.created_at) {
    timeline.push({
      timestamp: document.updated_at,
      event: 'Document Updated',
      details: { status: document.status },
    });
  }

  if (document.processing_completed_at) {
    timeline.push({
      timestamp: document.processing_completed_at,
      event: 'Processing Completed',
      details: { finalStatus: document.status },
    });
  }

  // Sort by timestamp. Keys are compared rather than subtracted so that
  // Infinity (untimed events) never yields NaN.
  timeline.sort((a, b) => {
    const left = timelineSortKey(a.timestamp);
    const right = timelineSortKey(b.timestamp);
    if (left === right) {
      return 0;
    }
    return left < right ? -1 : 1;
  });

  return timeline;
}
|
|
|
|
// Default export: the router instance with the audit route registered above.
export default router;
|
|
|