Files
cim_summary/backend/src/services/documentAiProcessor.ts
admin 9c916d12f4 feat: Production release v2.0.0 - Simple Document Processor
Major release with significant performance improvements and new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
2025-11-09 21:07:22 -05:00

676 lines
22 KiB
TypeScript

import { logger } from '../utils/logger';
import { DocumentProcessorServiceClient } from '@google-cloud/documentai';
import { Storage } from '@google-cloud/storage';
import { config } from '../config/env';
import pdf from 'pdf-parse';
import { PDFDocument } from 'pdf-lib';
// Result envelope returned by processDocument(): the extracted/summarized
// content plus processing metadata on success, or an error message when
// success is false (content is then '').
interface ProcessingResult {
success: boolean;
content: string;
// Processing details (strategy, timing, table counts, …); shape varies by code path.
metadata?: any;
error?: string;
}
// A table recovered from a Document AI response, flattened to plain strings
// for downstream (LLM / parser) consumption.
export interface StructuredTable {
// Cell text of the first header row; empty when no header rows were detected.
headers: string[];
// Body rows; rows whose cells are all empty are dropped by the extractor.
rows: string[][];
position: {
// Page number as reported by Document AI (0 when missing).
pageNumber: number;
// Table confidence from Document AI; defaults to 0.9 when not provided.
confidence: number;
};
// Original Document AI table object, kept for consumers needing raw detail.
rawTable?: any;
}
// Normalized output of a single Document AI processDocument call.
interface DocumentAIOutput {
// Full plain text of the processed document.
text: string;
// Detected entities; type falls back to 'UNKNOWN' and confidence to 0.
entities: Array<{
type: string;
mentionText: string;
confidence: number;
}>;
// Structured tables extracted from the response pages.
tables: StructuredTable[];
// Per-page summary info ({ pageNumber, blocksCount }).
pages: Array<any>;
// MIME type echoed from the response, or the request's type when absent.
mimeType: string;
}
/**
 * PDF text and structured-table extraction built on Google Document AI, with
 * pdf-parse as a last-resort fallback.
 *
 * Documents larger than MAX_PAGES_PER_CHUNK pages are split with pdf-lib and
 * each chunk is processed separately, because the synchronous Document AI API
 * enforces a per-request page limit. Input files are staged in GCS for
 * Document AI and deleted afterwards (best effort).
 */
export class DocumentAiProcessor {
  private gcsBucketName: string;
  private documentAiClient: DocumentProcessorServiceClient;
  private storageClient: Storage;
  // Fully-qualified Document AI processor resource name.
  private processorName: string;
  // Reduced to 15 pages to work with non-imageless mode (safer default).
  // If imageless mode is enabled on the processor, this can be raised to 30.
  private readonly MAX_PAGES_PER_CHUNK = 15;

  constructor() {
    this.gcsBucketName = config.googleCloud.gcsBucketName;
    this.documentAiClient = new DocumentProcessorServiceClient();
    this.storageClient = new Storage();
    // projects/{project}/locations/{location}/processors/{processorId}
    this.processorName = `projects/${config.googleCloud.projectId}/locations/${config.googleCloud.documentAiLocation}/processors/${config.googleCloud.documentAiProcessorId}`;
    logger.info('Document AI processor initialized', {
      projectId: config.googleCloud.projectId,
      location: config.googleCloud.documentAiLocation,
      processorId: config.googleCloud.documentAiProcessorId,
      processorName: this.processorName,
      maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK
    });
  }

  /**
   * Extract the text referenced by a Document AI layout object.
   *
   * Layouts carry no text of their own; their textAnchor lists one or more
   * [startIndex, endIndex) segments into the full document text. All valid
   * segments are concatenated — the previous implementation read only the
   * first segment, which truncated multi-segment cells. Invalid segments are
   * logged and skipped. Returns '' when nothing usable is found.
   */
  private getTextFromLayout(layout: any, documentText: string): string {
    try {
      const textAnchor = layout?.textAnchor;
      if (!textAnchor?.textSegments || textAnchor.textSegments.length === 0) {
        return '';
      }
      const parts: string[] = [];
      for (const segment of textAnchor.textSegments) {
        // Indices arrive as strings (int64 in the API); a missing startIndex
        // means 0 and a missing endIndex means end-of-text.
        const startIndex = parseInt(segment.startIndex || '0', 10);
        const endIndex = parseInt(segment.endIndex || documentText.length.toString(), 10);
        if (Number.isNaN(startIndex) || Number.isNaN(endIndex) || startIndex < 0 || endIndex > documentText.length || startIndex >= endIndex) {
          logger.warn('Invalid text anchor indices detected when extracting table cell text', {
            startIndex,
            endIndex,
            documentLength: documentText.length
          });
          continue; // skip the bad segment but keep any valid ones
        }
        parts.push(documentText.substring(startIndex, endIndex));
      }
      return parts.join('').trim();
    } catch (error) {
      logger.error('Failed to extract text from layout', {
        error: error instanceof Error ? error.message : String(error),
        layout
      });
      return '';
    }
  }

  /**
   * Convert Document AI table responses into structured, text-based tables.
   *
   * Only the first header row is flattened into `headers` (additional header
   * rows, when present, are still available via `rawTable`). Body rows whose
   * cells are all empty are dropped. Failures on individual tables are logged
   * and do not abort extraction of the remaining tables.
   */
  private extractStructuredTables(document: any, documentText: string): StructuredTable[] {
    const tables: StructuredTable[] = [];
    try {
      const pages = document?.pages || [];
      logger.info('Extracting structured tables from Document AI response', {
        pageCount: pages.length
      });
      for (const page of pages) {
        const pageTables = page.tables || [];
        const pageNumber = page.pageNumber || 0;
        for (let tableIndex = 0; tableIndex < pageTables.length; tableIndex++) {
          const table = pageTables[tableIndex];
          try {
            const headers: string[] = [];
            if (Array.isArray(table.headerRows) && table.headerRows.length > 0) {
              const headerRow = table.headerRows[0];
              for (const cell of headerRow.cells || []) {
                headers.push(this.getTextFromLayout(cell.layout, documentText));
              }
            }
            const rows: string[][] = [];
            for (const bodyRow of table.bodyRows || []) {
              const row: string[] = [];
              for (const cell of bodyRow.cells || []) {
                row.push(this.getTextFromLayout(cell.layout, documentText));
              }
              // Keep only rows with at least one non-empty cell.
              if (row.some(value => value && value.length > 0)) {
                rows.push(row);
              }
            }
            if (headers.length > 0 || rows.length > 0) {
              tables.push({
                headers,
                rows,
                position: {
                  pageNumber,
                  confidence: typeof table.confidence === 'number' ? table.confidence : 0.9
                },
                rawTable: table
              });
              logger.info('Structured table extracted', {
                pageNumber,
                tableIndex,
                headerCount: headers.length,
                rowCount: rows.length
              });
            }
          } catch (tableError) {
            logger.error('Failed to extract structured table from Document AI response', {
              pageNumber,
              tableIndex,
              error: tableError instanceof Error ? tableError.message : String(tableError)
            });
          }
        }
      }
      logger.info('Structured table extraction completed', {
        totalTables: tables.length
      });
    } catch (error) {
      logger.error('Structured table extraction failed', {
        error: error instanceof Error ? error.message : String(error)
      });
    }
    return tables;
  }

  /**
   * Full pipeline: extract text/tables from the document, then run the
   * Agentic RAG processor over the extracted text.
   *
   * Never throws — failures are reported via { success: false, error }.
   *
   * @returns a ProcessingResult whose content is the RAG summary (or the raw
   *   extracted text when no summary was produced).
   */
  async processDocument(
    documentId: string,
    userId: string,
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<ProcessingResult> {
    const startTime = Date.now();
    try {
      logger.info('Document AI processor: processDocument called (RAG-enabled)', {
        documentId,
        userId,
        fileName,
        fileSize: fileBuffer.length,
        mimeType
      });
      // Step 1: Extract text/structured data using Document AI or fallback
      const { text: extractedText, structuredTables } = await this.extractTextFromDocument(fileBuffer, fileName, mimeType);
      if (!extractedText) {
        throw new Error('Failed to extract text from document');
      }
      logger.info('Text extraction completed', {
        textLength: extractedText.length
      });
      // Step 2: Process extracted text through Agentic RAG
      const agenticRagResult = await this.processWithAgenticRAG(documentId, extractedText, structuredTables);
      const processingTime = Date.now() - startTime;
      return {
        success: true,
        content: agenticRagResult.summary || extractedText,
        metadata: {
          processingStrategy: 'document_ai_agentic_rag',
          processingTime,
          extractedTextLength: extractedText.length,
          agenticRagResult,
          structuredTables,
          structuredTablesFound: structuredTables.length,
          fileSize: fileBuffer.length,
          fileName,
          mimeType
        }
      };
    } catch (error) {
      const processingTime = Date.now() - startTime;
      // Derive the most informative message available from the thrown value.
      let errorMessage: string;
      if (error instanceof Error) {
        errorMessage = error.message;
      } else if (typeof error === 'string') {
        errorMessage = error;
      } else if (error && typeof error === 'object') {
        // Prefer a .message property, then a JSON dump of the object's own
        // properties. NOTE: the previous chain tried error.toString() before
        // JSON.stringify, but a plain object's toString() is the truthy-but-
        // useless "[object Object]", so the stringified details were
        // unreachable.
        let serialized = '';
        try {
          serialized = JSON.stringify(error, Object.getOwnPropertyNames(error)) ?? '';
        } catch {
          // Circular structures cannot be stringified; fall through to String().
        }
        errorMessage = (error as any).message || serialized || String(error);
      } else {
        errorMessage = String(error);
      }
      const errorStack = error instanceof Error ? error.stack : undefined;
      const errorDetails = error instanceof Error ? {
        name: error.name,
        message: error.message,
        stack: error.stack
      } : {
        type: typeof error,
        value: error
      };
      logger.error('Document AI + Agentic RAG processing failed', {
        documentId,
        error: errorMessage,
        errorDetails,
        stack: errorStack,
        processingTime,
        originalError: error
      });
      return {
        success: false,
        content: '',
        error: `Document AI + Agentic RAG processing failed: ${errorMessage}`,
        metadata: {
          processingStrategy: 'document_ai_agentic_rag',
          processingTime,
          error: errorMessage,
          errorDetails,
          stack: errorStack
        }
      };
    }
  }

  /**
   * Extract text only (no RAG processing) - for the simple processor.
   * Thin wrapper over extractTextFromDocument that adds a log entry;
   * userId is accepted for interface parity but not used here.
   */
  async extractTextOnly(
    documentId: string,
    userId: string,
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    logger.info('Document AI processor: extractTextOnly called (text-only, no RAG)', {
      documentId,
      fileName,
      fileSize: fileBuffer.length,
      mimeType
    });
    return await this.extractTextFromDocument(fileBuffer, fileName, mimeType);
  }

  /**
   * Decide how to extract text: single Document AI pass for small documents,
   * chunked processing for large ones, pdf-parse as the final fallback.
   */
  private async extractTextFromDocument(
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    try {
      // Check document size first (pdf-parse is cheap and runs locally).
      const pdfData = await pdf(fileBuffer);
      const totalPages = pdfData.numpages;
      logger.info('PDF analysis completed', {
        totalPages,
        textLength: pdfData.text?.length || 0
      });
      // If the document exceeds the per-request page limit
      // (MAX_PAGES_PER_CHUNK), split it into chunks and process each.
      if (totalPages > this.MAX_PAGES_PER_CHUNK) {
        logger.info('Document exceeds Document AI page limit, splitting into chunks', {
          totalPages,
          maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK,
          estimatedChunks: Math.ceil(totalPages / this.MAX_PAGES_PER_CHUNK)
        });
        return await this.extractDocumentDataFromChunkedPDF(fileBuffer, fileName, mimeType, totalPages);
      }
      // Small enough for a single Document AI request.
      logger.info('Using Document AI for text extraction', {
        totalPages,
        maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK
      });
      // Upload file to GCS
      const gcsFilePath = await this.uploadToGCS(fileBuffer, fileName, mimeType);
      // Process with Document AI
      const documentAiOutput = await this.processWithDocumentAI(gcsFilePath, mimeType);
      // Cleanup GCS file
      await this.cleanupGCSFiles(gcsFilePath);
      return {
        text: documentAiOutput.text,
        structuredTables: documentAiOutput.tables || []
      };
    } catch (error) {
      logger.error('Text extraction failed, using pdf-parse fallback', {
        error: error instanceof Error ? error.message : String(error)
      });
      // Fallback to pdf-parse (no structured tables available on this path).
      try {
        const pdfDataFallback = await pdf(fileBuffer);
        return {
          text: pdfDataFallback.text || '',
          structuredTables: []
        };
      } catch (fallbackError) {
        logger.error('Both Document AI and pdf-parse failed', {
          originalError: error instanceof Error ? error.message : String(error),
          fallbackError: fallbackError instanceof Error ? fallbackError.message : String(fallbackError)
        });
        throw new Error('Failed to extract text from document using any method');
      }
    }
  }

  /**
   * Split a PDF into MAX_PAGES_PER_CHUNK-page chunks via pdf-lib, process
   * each chunk with Document AI (falling back to pdf-parse per chunk), and
   * combine the results with page-range separators.
   *
   * @param totalPages page count from pdf-parse, used for logging and for
   *   computing the page-range headers; pdf-lib's own count drives slicing.
   */
  private async extractDocumentDataFromChunkedPDF(
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string,
    totalPages: number
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    const chunks: string[] = [];
    const structuredTables: StructuredTable[] = [];
    const numChunks = Math.ceil(totalPages / this.MAX_PAGES_PER_CHUNK);
    logger.info('Starting chunked PDF processing', {
      totalPages,
      maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK,
      numChunks
    });
    try {
      // Load the original PDF
      const sourcePdf = await PDFDocument.load(fileBuffer);
      const pageCount = sourcePdf.getPageCount();
      // Process each chunk
      for (let chunkIndex = 0; chunkIndex < numChunks; chunkIndex++) {
        const startPageIndex = chunkIndex * this.MAX_PAGES_PER_CHUNK;
        const endPageIndex = Math.min(startPageIndex + this.MAX_PAGES_PER_CHUNK, pageCount);
        logger.info(`Processing chunk ${chunkIndex + 1}/${numChunks}`, {
          startPage: startPageIndex + 1, // 1-indexed for logging
          endPage: endPageIndex,
          pagesInChunk: endPageIndex - startPageIndex
        });
        // Create a new PDF with pages from this chunk
        const chunkPdf = await PDFDocument.create();
        // Create array of page indices to copy (0-indexed)
        const pageIndices: number[] = [];
        for (let i = startPageIndex; i < endPageIndex; i++) {
          pageIndices.push(i);
        }
        // Copy pages to chunk PDF
        const copiedPages = await chunkPdf.copyPages(sourcePdf, pageIndices);
        copiedPages.forEach((page) => {
          chunkPdf.addPage(page);
        });
        // Serialize chunk PDF to buffer
        const chunkBuffer = Buffer.from(await chunkPdf.save());
        // Strip only a trailing '.pdf' — the old replace('.pdf', '') removed
        // the first occurrence anywhere in the name.
        const chunkFileName = `${fileName.replace(/\.pdf$/i, '')}_chunk_${chunkIndex + 1}.pdf`;
        // Upload chunk to GCS
        const gcsFilePath = await this.uploadToGCS(chunkBuffer, chunkFileName, mimeType);
        try {
          // Process chunk with Document AI
          const chunkOutput = await this.processWithDocumentAI(gcsFilePath, mimeType);
          chunks.push(chunkOutput.text);
          if (Array.isArray(chunkOutput.tables) && chunkOutput.tables.length > 0) {
            structuredTables.push(...chunkOutput.tables);
          }
          logger.info(`Chunk ${chunkIndex + 1}/${numChunks} processed successfully`, {
            textLength: chunkOutput.text.length,
            pagesProcessed: endPageIndex - startPageIndex
          });
        } catch (chunkError) {
          logger.error(`Failed to process chunk ${chunkIndex + 1}/${numChunks}, falling back to pdf-parse`, {
            chunkIndex: chunkIndex + 1,
            error: chunkError instanceof Error ? chunkError.message : String(chunkError)
          });
          // Fallback to pdf-parse for this chunk only; a failure here
          // propagates to the outer catch (whole-document pdf-parse).
          const chunkPdfData = await pdf(chunkBuffer);
          chunks.push(chunkPdfData.text || '');
        } finally {
          // Cleanup chunk file from GCS
          await this.cleanupGCSFiles(gcsFilePath);
        }
      }
      // Combine all chunks with page-range separators so downstream
      // consumers can still attribute text to approximate page locations.
      const combinedText = chunks
        .map((chunk, index) => {
          const startPageNum = (index * this.MAX_PAGES_PER_CHUNK) + 1;
          const endPageNum = Math.min((index + 1) * this.MAX_PAGES_PER_CHUNK, totalPages);
          const chunkHeader = `\n\n--- Page Range ${startPageNum}-${endPageNum} ---\n\n`;
          return chunkHeader + chunk;
        })
        .join('\n\n');
      logger.info('Chunked PDF processing completed', {
        totalPages,
        numChunks,
        combinedTextLength: combinedText.length,
        averageChunkLength: Math.round(combinedText.length / numChunks)
      });
      return {
        text: combinedText,
        structuredTables
      };
    } catch (error) {
      logger.error('Chunked PDF processing failed, falling back to pdf-parse', {
        error: error instanceof Error ? error.message : String(error),
        totalPages
      });
      // Fallback to pdf-parse for entire document
      const pdfData = await pdf(fileBuffer);
      return {
        text: pdfData.text || '',
        structuredTables: []
      };
    }
  }

  /**
   * Run the optimized Agentic RAG processor over the extracted text.
   * The processor module is imported lazily to avoid paying its load cost
   * on text-only code paths. Rethrows on failure after logging.
   */
  private async processWithAgenticRAG(documentId: string, extractedText: string, structuredTables: StructuredTable[]): Promise<any> {
    try {
      logger.info('Processing extracted text with Agentic RAG', {
        documentId,
        textLength: extractedText.length,
        structuredTableCount: structuredTables.length
      });
      // Import and use the optimized agentic RAG processor
      logger.info('Importing optimized agentic RAG processor...');
      const { optimizedAgenticRAGProcessor } = await import('./optimizedAgenticRAGProcessor');
      logger.info('Agentic RAG processor imported successfully', {
        processorType: typeof optimizedAgenticRAGProcessor,
        hasProcessLargeDocument: typeof optimizedAgenticRAGProcessor?.processLargeDocument === 'function'
      });
      logger.info('Calling processLargeDocument...');
      const result = await optimizedAgenticRAGProcessor.processLargeDocument(documentId, extractedText, {
        structuredTables
      });
      logger.info('Agentic RAG processing completed', {
        success: result.success,
        summaryLength: result.summary?.length || 0,
        analysisDataKeys: result.analysisData ? Object.keys(result.analysisData) : [],
        apiCalls: result.apiCalls,
        processingStrategy: result.processingStrategy,
        resultType: typeof result
      });
      return result;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      const errorStack = error instanceof Error ? error.stack : undefined;
      const errorDetails = error instanceof Error ? {
        name: error.name,
        message: error.message,
        stack: error.stack
      } : {
        type: typeof error,
        value: error
      };
      logger.error('Agentic RAG processing failed', {
        documentId,
        error: errorMessage,
        errorDetails,
        stack: errorStack
      });
      throw error;
    }
  }

  /**
   * Upload a buffer to GCS under uploads/ with a timestamp prefix.
   *
   * @param contentType MIME type stored on the object. Defaults to
   *   'application/pdf' for backward compatibility with existing callers.
   * @returns the gs:// URI of the uploaded object.
   */
  private async uploadToGCS(fileBuffer: Buffer, fileName: string, contentType: string = 'application/pdf'): Promise<string> {
    try {
      const bucket = this.storageClient.bucket(this.gcsBucketName);
      // Timestamp prefix avoids collisions between concurrent uploads.
      const file = bucket.file(`uploads/${Date.now()}_${fileName}`);
      logger.info('Uploading file to GCS', {
        fileName,
        fileSize: fileBuffer.length,
        bucket: this.gcsBucketName,
        destination: file.name
      });
      await file.save(fileBuffer, {
        metadata: {
          contentType
        }
      });
      logger.info('File uploaded successfully to GCS', {
        gcsPath: `gs://${this.gcsBucketName}/${file.name}`
      });
      return `gs://${this.gcsBucketName}/${file.name}`;
    } catch (error) {
      logger.error('Failed to upload file to GCS', {
        fileName,
        error: error instanceof Error ? error.message : String(error)
      });
      throw error;
    }
  }

  /**
   * Run a single synchronous Document AI processDocument call over a file
   * staged in GCS and normalize the response into DocumentAIOutput.
   * Rethrows on failure after logging.
   */
  private async processWithDocumentAI(gcsFilePath: string, mimeType: string): Promise<DocumentAIOutput> {
    try {
      logger.info('Processing with Document AI', {
        gcsFilePath,
        processorName: this.processorName,
        mimeType
      });
      // The document source fields on ProcessRequest form a oneof — supply
      // only gcsDocument. (Previously an empty rawDocument was sent alongside
      // it, which conflicts with the GCS source.)
      // Note: For processors that support it, imageless mode can be enabled
      // via processor settings in Google Cloud Console to support up to 30
      // pages; chunks are limited to 15 pages for default processor settings.
      const request = {
        name: this.processorName,
        gcsDocument: {
          gcsUri: gcsFilePath,
          mimeType: mimeType
        }
      };
      logger.info('Sending Document AI request', {
        processorName: this.processorName,
        gcsUri: gcsFilePath
      });
      // Process the document
      const [result] = await this.documentAiClient.processDocument(request);
      const { document } = result;
      if (!document) {
        throw new Error('Document AI returned no document');
      }
      logger.info('Document AI processing successful', {
        textLength: document.text?.length || 0,
        pagesCount: document.pages?.length || 0,
        entitiesCount: document.entities?.length || 0
      });
      // Extract text
      const text = document.text || '';
      // Extract entities
      const entities = document.entities?.map(entity => ({
        type: entity.type || 'UNKNOWN',
        mentionText: entity.mentionText || '',
        confidence: entity.confidence || 0
      })) || [];
      // Extract structured tables
      const structuredTables = this.extractStructuredTables(document, text);
      // Extract pages info
      const pages = document.pages?.map(page => ({
        pageNumber: page.pageNumber || 0,
        blocksCount: page.blocks?.length || 0
      })) || [];
      return {
        text,
        entities,
        tables: structuredTables,
        pages,
        mimeType: document.mimeType || mimeType
      };
    } catch (error) {
      logger.error('Document AI processing failed', {
        gcsFilePath,
        processorName: this.processorName,
        error: error instanceof Error ? error.message : String(error),
        stack: error instanceof Error ? error.stack : undefined
      });
      throw error;
    }
  }

  /**
   * Delete a staged object given its gs:// URI. Cleanup failures are only
   * logged (never thrown) so they cannot mask a processing result.
   */
  private async cleanupGCSFiles(gcsFilePath: string): Promise<void> {
    try {
      // Parse "gs://bucket/path/to/file" into bucket + object name.
      const bucketName = gcsFilePath.replace('gs://', '').split('/')[0];
      const fileName = gcsFilePath.replace(`gs://${bucketName}/`, '');
      logger.info('Cleaning up GCS files', { gcsFilePath, bucketName, fileName });
      const bucket = this.storageClient.bucket(bucketName);
      const file = bucket.file(fileName);
      await file.delete();
      logger.info('GCS file cleanup completed', { gcsFilePath });
    } catch (error) {
      logger.warn('Failed to cleanup GCS files', {
        gcsFilePath,
        error: error instanceof Error ? error.message : String(error)
      });
      // Don't throw error for cleanup failures
    }
  }
}
// Shared singleton instance used by the rest of the backend.
export const documentAiProcessor = new DocumentAiProcessor();