import { logger } from '../utils/logger';
import { DocumentProcessorServiceClient } from '@google-cloud/documentai';
import { Storage } from '@google-cloud/storage';
import { config } from '../config/env';
import pdf from 'pdf-parse';
import { PDFDocument } from 'pdf-lib';

/** Outcome of a full `processDocument` run (extraction + Agentic RAG). */
interface ProcessingResult {
  success: boolean;
  content: string;
  metadata?: any;
  error?: string;
}

/** Text-only representation of one table found by Document AI. */
export interface StructuredTable {
  headers: string[];
  rows: string[][];
  position: {
    pageNumber: number;
    confidence: number;
  };
  rawTable?: any;
}

/** Subset of the Document AI response that this module passes around. */
interface DocumentAIOutput {
  text: string;
  entities: Array<{
    type: string;
    mentionText: string;
    confidence: number;
  }>;
  tables: StructuredTable[];
  pages: Array<{
    pageNumber: number;
    blocksCount: number;
  }>;
  mimeType: string;
}

/**
 * Produce a human-readable message for anything that can be thrown.
 *
 * Prefers `Error.message`, then a string `message` property, then a JSON dump
 * of the object's own properties, and finally `String(error)`.
 */
function toErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  if (typeof error === 'string') {
    return error;
  }
  if (error && typeof error === 'object') {
    const message = (error as { message?: unknown }).message;
    if (typeof message === 'string' && message.length > 0) {
      return message;
    }
    try {
      const serialized = JSON.stringify(error, Object.getOwnPropertyNames(error));
      if (serialized && serialized !== '{}') {
        return serialized;
      }
    } catch {
      // Circular structures cannot be serialized; fall through to String().
    }
  }
  return String(error);
}

/** Structured error description used in log payloads and result metadata. */
function toErrorDetails(error: unknown) {
  return error instanceof Error
    ? { name: error.name, message: error.message, stack: error.stack }
    : { type: typeof error, value: error };
}

/**
 * Parse a text-anchor index into a number.
 *
 * `null`, `undefined` and `''` are treated as "not provided" and yield
 * `fallback`. Every other value (including a numeric `0`) is stringified and
 * parsed as a base-10 integer, so the result may be `NaN` for junk input.
 */
function parseAnchorIndex(value: unknown, fallback: number): number {
  if (value === null || value === undefined || value === '') {
    return fallback;
  }
  return parseInt(String(value), 10);
}

/**
 * Extracts text and tables from PDFs with Google Cloud Document AI (falling
 * back to `pdf-parse`), and optionally feeds the result to the Agentic RAG
 * processor.
 */
export class DocumentAiProcessor {
  private readonly gcsBucketName: string;
  private readonly documentAiClient: DocumentProcessorServiceClient;
  private readonly storageClient: Storage;
  private readonly processorName: string;
  // Reduced to 15 pages to work with non-imageless mode (safer default)
  // If imageless mode is enabled, can increase to 30
  private readonly MAX_PAGES_PER_CHUNK = 15;

  constructor() {
    this.gcsBucketName = config.googleCloud.gcsBucketName;
    this.documentAiClient = new DocumentProcessorServiceClient();
    this.storageClient = new Storage();

    // Construct the processor name
    this.processorName = `projects/${config.googleCloud.projectId}/locations/${config.googleCloud.documentAiLocation}/processors/${config.googleCloud.documentAiProcessorId}`;

    logger.info('Document AI processor initialized', {
      projectId: config.googleCloud.projectId,
      location: config.googleCloud.documentAiLocation,
      processorId: config.googleCloud.documentAiProcessorId,
      processorName: this.processorName,
      maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK
    });
  }

  /**
   * Extract text from a Document AI layout object using text anchors.
   *
   * All text segments of the anchor are concatenated in order; segments with
   * invalid indices are skipped with a warning.
   *
   * @param layout - Layout object carrying a `textAnchor.textSegments` list.
   * @param documentText - Full document text the segment indices refer to.
   * @returns The trimmed text, or `''` when there is no usable segment or an
   *          unexpected error occurs.
   */
  private getTextFromLayout(layout: any, documentText: string): string {
    try {
      const segments = layout?.textAnchor?.textSegments;
      if (!Array.isArray(segments) || segments.length === 0) {
        return '';
      }

      const parts: string[] = [];
      for (const segment of segments) {
        const startIndex = parseAnchorIndex(segment?.startIndex, 0);
        const endIndex = parseAnchorIndex(segment?.endIndex, documentText.length);

        const isInvalid =
          Number.isNaN(startIndex) ||
          Number.isNaN(endIndex) ||
          startIndex < 0 ||
          endIndex > documentText.length ||
          startIndex >= endIndex;

        if (isInvalid) {
          logger.warn('Invalid text anchor indices detected when extracting table cell text', {
            startIndex,
            endIndex,
            documentLength: documentText.length
          });
          continue;
        }

        parts.push(documentText.substring(startIndex, endIndex));
      }

      return parts.join('').trim();
    } catch (error) {
      logger.error('Failed to extract text from layout', {
        error: toErrorMessage(error),
        layout
      });
      return '';
    }
  }

  /** Read the text of every cell in a Document AI table row. */
  private readRowCells(row: any, documentText: string): string[] {
    const values: string[] = [];
    for (const cell of row?.cells ?? []) {
      values.push(this.getTextFromLayout(cell.layout, documentText));
    }
    return values;
  }

  /**
   * Convert Document AI table response into a structured, text-based representation.
   *
   * Only the first header row of each table is used for `headers`; body rows
   * whose cells are all empty are dropped, and tables with neither headers
   * nor rows are skipped. Failures are logged and never thrown: a failing
   * table is skipped, and an outer failure returns whatever was collected.
   *
   * @param document - Document AI document (its `pages[].tables` are read).
   * @param documentText - Full document text used to resolve text anchors.
   * @returns The tables found, in page order.
   */
  private extractStructuredTables(document: any, documentText: string): StructuredTable[] {
    const tables: StructuredTable[] = [];

    try {
      const pages = document?.pages || [];
      logger.info('Extracting structured tables from Document AI response', {
        pageCount: pages.length
      });

      for (const page of pages) {
        const pageTables = page.tables || [];
        const pageNumber = page.pageNumber || 0;

        for (let tableIndex = 0; tableIndex < pageTables.length; tableIndex++) {
          const table = pageTables[tableIndex];

          try {
            const hasHeaderRow = Array.isArray(table.headerRows) && table.headerRows.length > 0;
            const headers = hasHeaderRow ? this.readRowCells(table.headerRows[0], documentText) : [];

            const rows: string[][] = [];
            for (const bodyRow of table.bodyRows || []) {
              const row = this.readRowCells(bodyRow, documentText);
              if (row.some(value => value && value.length > 0)) {
                rows.push(row);
              }
            }

            if (headers.length > 0 || rows.length > 0) {
              tables.push({
                headers,
                rows,
                position: {
                  pageNumber,
                  // NOTE(review): Document AI appears to report confidence on
                  // `table.layout` rather than on the table itself, so this may
                  // always resolve to 0.9 — confirm the intended source.
                  confidence: typeof table.confidence === 'number' ? table.confidence : 0.9
                },
                rawTable: table
              });

              logger.info('Structured table extracted', {
                pageNumber,
                tableIndex,
                headerCount: headers.length,
                rowCount: rows.length
              });
            }
          } catch (tableError) {
            logger.error('Failed to extract structured table from Document AI response', {
              pageNumber,
              tableIndex,
              error: toErrorMessage(tableError)
            });
          }
        }
      }

      logger.info('Structured table extraction completed', {
        totalTables: tables.length
      });
    } catch (error) {
      logger.error('Structured table extraction failed', {
        error: toErrorMessage(error)
      });
    }

    return tables;
  }

  /**
   * Extract text and tables from a document and run them through Agentic RAG.
   *
   * Never throws: every failure is logged and reported as
   * `{ success: false, error, metadata }`.
   *
   * @param documentId - Identifier forwarded to the RAG processor and logs.
   * @param userId - Only used for logging.
   * @param fileBuffer - Raw document bytes.
   * @param fileName - Original file name (used for the GCS object name).
   * @param mimeType - MIME type forwarded to Document AI.
   * @returns The RAG summary (or the extracted text when there is no summary)
   *          together with processing metadata.
   */
  async processDocument(
    documentId: string,
    userId: string,
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<ProcessingResult> {
    const startTime = Date.now();

    try {
      logger.info('Document AI processor: processDocument called (RAG-enabled)', {
        documentId,
        userId,
        fileName,
        fileSize: fileBuffer.length,
        mimeType
      });

      // Step 1: Extract text/structured data using Document AI or fallback
      const { text: extractedText, structuredTables } = await this.extractTextFromDocument(
        fileBuffer,
        fileName,
        mimeType
      );

      if (!extractedText) {
        throw new Error('Failed to extract text from document');
      }

      logger.info('Text extraction completed', {
        textLength: extractedText.length
      });

      // Step 2: Process extracted text through Agentic RAG
      const agenticRagResult = await this.processWithAgenticRAG(documentId, extractedText, structuredTables);

      const processingTime = Date.now() - startTime;

      return {
        success: true,
        content: agenticRagResult.summary || extractedText,
        metadata: {
          processingStrategy: 'document_ai_agentic_rag',
          processingTime,
          extractedTextLength: extractedText.length,
          agenticRagResult,
          structuredTables,
          structuredTablesFound: structuredTables.length,
          fileSize: fileBuffer.length,
          fileName,
          mimeType
        }
      };
    } catch (error) {
      const processingTime = Date.now() - startTime;

      const errorMessage = toErrorMessage(error);
      const errorStack = error instanceof Error ? error.stack : undefined;
      const errorDetails = toErrorDetails(error);

      logger.error('Document AI + Agentic RAG processing failed', {
        documentId,
        error: errorMessage,
        errorDetails,
        stack: errorStack,
        processingTime,
        originalError: error
      });

      return {
        success: false,
        content: '',
        error: `Document AI + Agentic RAG processing failed: ${errorMessage}`,
        metadata: {
          processingStrategy: 'document_ai_agentic_rag',
          processingTime,
          error: errorMessage,
          errorDetails,
          stack: errorStack
        }
      };
    }
  }

  /**
   * Extract text only (no RAG processing) - for simple processor.
   *
   * @param documentId - Only used for logging.
   * @param userId - Unused; kept for signature parity with `processDocument`.
   * @param fileBuffer - Raw document bytes.
   * @param fileName - Original file name (used for the GCS object name).
   * @param mimeType - MIME type forwarded to Document AI.
   * @throws Error when neither Document AI nor pdf-parse can read the file.
   */
  async extractTextOnly(
    documentId: string,
    userId: string,
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    logger.info('Document AI processor: extractTextOnly called (text-only, no RAG)', {
      documentId,
      fileName,
      fileSize: fileBuffer.length,
      mimeType
    });

    return await this.extractTextFromDocument(fileBuffer, fileName, mimeType);
  }

  /**
   * Extract text and tables, choosing between a single Document AI call and
   * chunked processing based on the page count reported by pdf-parse.
   *
   * If anything fails, the pdf-parse text is returned with no tables.
   *
   * @throws Error when both Document AI and pdf-parse fail.
   */
  private async extractTextFromDocument(
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    // Text from the initial pdf-parse pass, reused by the fallback so the file
    // is not parsed a second time when that pass already succeeded.
    let parsedText: string | undefined;

    try {
      // Check document size first
      const pdfData = await pdf(fileBuffer);
      parsedText = pdfData.text || '';
      const totalPages = pdfData.numpages;

      logger.info('PDF analysis completed', {
        totalPages,
        textLength: pdfData.text?.length || 0
      });

      // If the document has more pages than fit in one request, split it into
      // chunks and process each chunk separately
      if (totalPages > this.MAX_PAGES_PER_CHUNK) {
        logger.info('Document exceeds Document AI page limit, splitting into chunks', {
          totalPages,
          maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK,
          estimatedChunks: Math.ceil(totalPages / this.MAX_PAGES_PER_CHUNK)
        });

        return await this.extractDocumentDataFromChunkedPDF(fileBuffer, fileName, mimeType, totalPages);
      }

      // Documents within the per-request page limit go to Document AI directly
      logger.info('Using Document AI for text extraction', {
        totalPages,
        maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK
      });

      // Upload file to GCS
      const gcsFilePath = await this.uploadToGCS(fileBuffer, fileName);

      try {
        // Process with Document AI
        const documentAiOutput = await this.processWithDocumentAI(gcsFilePath, mimeType);

        return {
          text: documentAiOutput.text,
          structuredTables: documentAiOutput.tables || []
        };
      } finally {
        // Always remove the uploaded object, including when Document AI fails
        await this.cleanupGCSFiles(gcsFilePath);
      }
    } catch (error) {
      logger.error('Text extraction failed, using pdf-parse fallback', {
        error: toErrorMessage(error)
      });

      // Fallback to pdf-parse
      if (parsedText !== undefined) {
        return { text: parsedText, structuredTables: [] };
      }

      try {
        const pdfDataFallback = await pdf(fileBuffer);
        return { text: pdfDataFallback.text || '', structuredTables: [] };
      } catch (fallbackError) {
        logger.error('Both Document AI and pdf-parse failed', {
          originalError: toErrorMessage(error),
          fallbackError: toErrorMessage(fallbackError)
        });
        throw new Error('Failed to extract text from document using any method');
      }
    }
  }

  /**
   * Split PDF into chunks and process each chunk with Document AI, then combine results.
   *
   * A chunk that Document AI cannot process falls back to pdf-parse for that
   * chunk only (text, no tables). If chunking itself fails, the whole document
   * is read with pdf-parse instead. Table page numbers are translated from
   * chunk-relative to document-relative.
   *
   * @param totalPages - Page count of the source document as reported by pdf-parse.
   */
  private async extractDocumentDataFromChunkedPDF(
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string,
    totalPages: number
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    const chunks: string[] = [];
    const structuredTables: StructuredTable[] = [];
    const numChunks = Math.ceil(totalPages / this.MAX_PAGES_PER_CHUNK);

    logger.info('Starting chunked PDF processing', {
      totalPages,
      maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK,
      numChunks
    });

    try {
      // Load the original PDF
      const sourcePdf = await PDFDocument.load(fileBuffer);
      const pageCount = sourcePdf.getPageCount();

      // Strip only a trailing ".pdf" (any case), not the first occurrence anywhere
      const baseFileName = fileName.replace(/\.pdf$/i, '');

      // Process each chunk
      for (let chunkIndex = 0; chunkIndex < numChunks; chunkIndex++) {
        const startPageIndex = chunkIndex * this.MAX_PAGES_PER_CHUNK;
        const endPageIndex = Math.min(startPageIndex + this.MAX_PAGES_PER_CHUNK, pageCount);

        logger.info(`Processing chunk ${chunkIndex + 1}/${numChunks}`, {
          startPage: startPageIndex + 1, // 1-indexed for logging
          endPage: endPageIndex,
          pagesInChunk: endPageIndex - startPageIndex
        });

        // Create a new PDF with pages from this chunk
        const chunkPdf = await PDFDocument.create();

        // Create array of page indices to copy (0-indexed)
        const pageIndices: number[] = [];
        for (let i = startPageIndex; i < endPageIndex; i++) {
          pageIndices.push(i);
        }

        // Copy pages to chunk PDF
        const copiedPages = await chunkPdf.copyPages(sourcePdf, pageIndices);
        copiedPages.forEach((page) => {
          chunkPdf.addPage(page);
        });

        // Serialize chunk PDF to buffer
        const chunkBuffer = Buffer.from(await chunkPdf.save());
        const chunkFileName = `${baseFileName}_chunk_${chunkIndex + 1}.pdf`;

        // Upload chunk to GCS
        const gcsFilePath = await this.uploadToGCS(chunkBuffer, chunkFileName);

        try {
          // Process chunk with Document AI
          const chunkOutput = await this.processWithDocumentAI(gcsFilePath, mimeType);
          chunks.push(chunkOutput.text);

          if (Array.isArray(chunkOutput.tables) && chunkOutput.tables.length > 0) {
            // Page numbers come back relative to the chunk PDF; shift them so
            // they refer to pages of the original document. A page number of 0
            // means "unknown" (see extractStructuredTables) and is left as is.
            for (const table of chunkOutput.tables) {
              const chunkPageNumber = table.position.pageNumber;
              structuredTables.push({
                ...table,
                position: {
                  ...table.position,
                  pageNumber: chunkPageNumber > 0 ? chunkPageNumber + startPageIndex : chunkPageNumber
                }
              });
            }
          }

          logger.info(`Chunk ${chunkIndex + 1}/${numChunks} processed successfully`, {
            textLength: chunkOutput.text.length,
            pagesProcessed: endPageIndex - startPageIndex
          });
        } catch (chunkError) {
          logger.error(`Failed to process chunk ${chunkIndex + 1}/${numChunks}, falling back to pdf-parse`, {
            chunkIndex: chunkIndex + 1,
            error: toErrorMessage(chunkError)
          });

          // Fallback to pdf-parse for this chunk
          const chunkPdfData = await pdf(chunkBuffer);
          chunks.push(chunkPdfData.text || '');
        } finally {
          // Cleanup chunk file from GCS
          await this.cleanupGCSFiles(gcsFilePath);
        }
      }

      // Combine all chunks with page separators
      const combinedText = chunks
        .map((chunk, index) => {
          const startPageNum = (index * this.MAX_PAGES_PER_CHUNK) + 1;
          const endPageNum = Math.min((index + 1) * this.MAX_PAGES_PER_CHUNK, totalPages);
          const chunkHeader = `\n\n--- Page Range ${startPageNum}-${endPageNum} ---\n\n`;
          return chunkHeader + chunk;
        })
        .join('\n\n');

      logger.info('Chunked PDF processing completed', {
        totalPages,
        numChunks,
        combinedTextLength: combinedText.length,
        averageChunkLength: Math.round(combinedText.length / numChunks)
      });

      return { text: combinedText, structuredTables };
    } catch (error) {
      logger.error('Chunked PDF processing failed, falling back to pdf-parse', {
        error: toErrorMessage(error),
        totalPages
      });

      // Fallback to pdf-parse for entire document
      const pdfData = await pdf(fileBuffer);
      return { text: pdfData.text || '', structuredTables: [] };
    }
  }

  /**
   * Hand the extracted text and tables to the optimized Agentic RAG processor.
   *
   * The processor module is imported lazily on each call. The return type is
   * inferred from `processLargeDocument`, which is declared in that module.
   *
   * @throws Re-throws any import or processing error after logging it.
   */
  private async processWithAgenticRAG(
    documentId: string,
    extractedText: string,
    structuredTables: StructuredTable[]
  ) {
    try {
      logger.info('Processing extracted text with Agentic RAG', {
        documentId,
        textLength: extractedText.length,
        structuredTableCount: structuredTables.length
      });

      // Import and use the optimized agentic RAG processor
      logger.info('Importing optimized agentic RAG processor...');
      const { optimizedAgenticRAGProcessor } = await import('./optimizedAgenticRAGProcessor');

      logger.info('Agentic RAG processor imported successfully', {
        processorType: typeof optimizedAgenticRAGProcessor,
        hasProcessLargeDocument: typeof optimizedAgenticRAGProcessor?.processLargeDocument === 'function'
      });

      logger.info('Calling processLargeDocument...');
      const result = await optimizedAgenticRAGProcessor.processLargeDocument(documentId, extractedText, {
        structuredTables
      });

      logger.info('Agentic RAG processing completed', {
        success: result.success,
        summaryLength: result.summary?.length || 0,
        analysisDataKeys: result.analysisData ? Object.keys(result.analysisData) : [],
        apiCalls: result.apiCalls,
        processingStrategy: result.processingStrategy,
        resultType: typeof result
      });

      return result;
    } catch (error) {
      logger.error('Agentic RAG processing failed', {
        documentId,
        error: toErrorMessage(error),
        errorDetails: toErrorDetails(error),
        stack: error instanceof Error ? error.stack : undefined
      });

      throw error;
    }
  }

  /**
   * Upload a PDF buffer to the configured bucket under
   * `uploads/<timestamp>_<fileName>`.
   *
   * @returns The `gs://` URI of the uploaded object.
   * @throws Re-throws the storage error after logging it.
   */
  private async uploadToGCS(fileBuffer: Buffer, fileName: string): Promise<string> {
    try {
      const bucket = this.storageClient.bucket(this.gcsBucketName);
      const file = bucket.file(`uploads/${Date.now()}_${fileName}`);

      logger.info('Uploading file to GCS', {
        fileName,
        fileSize: fileBuffer.length,
        bucket: this.gcsBucketName,
        destination: file.name
      });

      await file.save(fileBuffer, {
        metadata: { contentType: 'application/pdf' }
      });

      const gcsPath = `gs://${this.gcsBucketName}/${file.name}`;
      logger.info('File uploaded successfully to GCS', { gcsPath });

      return gcsPath;
    } catch (error) {
      logger.error('Failed to upload file to GCS', {
        fileName,
        error: toErrorMessage(error)
      });
      throw error;
    }
  }

  /**
   * Run the configured Document AI processor on a file stored in GCS.
   *
   * @param gcsFilePath - `gs://` URI of the uploaded document.
   * @param mimeType - MIME type sent with the request; also used as the
   *                   fallback for the returned `mimeType`.
   * @returns Full text, entities, structured tables and per-page block counts.
   * @throws When the request fails or the response contains no document.
   */
  private async processWithDocumentAI(gcsFilePath: string, mimeType: string): Promise<DocumentAIOutput> {
    try {
      logger.info('Processing with Document AI', {
        gcsFilePath,
        processorName: this.processorName,
        mimeType
      });

      // The document is referenced by its GCS URI only. `rawDocument` and
      // `gcsDocument` are alternative sources of the same request, so an empty
      // `rawDocument` must not be sent alongside it.
      //
      // Note: For processors that support it, imageless mode can be enabled
      // via processor settings in Google Cloud Console to support up to 30 pages.
      // For now, we limit chunks to 15 pages to work with default processor settings.
      const request = {
        name: this.processorName,
        gcsDocument: {
          gcsUri: gcsFilePath,
          mimeType: mimeType
        }
      };

      logger.info('Sending Document AI request', {
        processorName: this.processorName,
        gcsUri: gcsFilePath
      });

      // Process the document
      const [result] = await this.documentAiClient.processDocument(request);
      const { document } = result;

      if (!document) {
        throw new Error('Document AI returned no document');
      }

      logger.info('Document AI processing successful', {
        textLength: document.text?.length || 0,
        pagesCount: document.pages?.length || 0,
        entitiesCount: document.entities?.length || 0
      });

      // Extract text
      const text = document.text || '';

      // Extract entities
      const entities = document.entities?.map(entity => ({
        type: entity.type || 'UNKNOWN',
        mentionText: entity.mentionText || '',
        confidence: entity.confidence || 0
      })) || [];

      // Extract structured tables
      const structuredTables = this.extractStructuredTables(document, text);

      // Extract pages info
      const pages = document.pages?.map(page => ({
        pageNumber: page.pageNumber || 0,
        blocksCount: page.blocks?.length || 0
      })) || [];

      return {
        text,
        entities,
        tables: structuredTables,
        pages,
        mimeType: document.mimeType || mimeType
      };
    } catch (error) {
      logger.error('Document AI processing failed', {
        gcsFilePath,
        processorName: this.processorName,
        error: toErrorMessage(error),
        stack: error instanceof Error ? error.stack : undefined
      });

      throw error;
    }
  }

  /**
   * Delete a previously uploaded object identified by its `gs://` URI.
   *
   * Best-effort: failures are logged as warnings and never thrown, so this is
   * safe to call from `finally` blocks.
   */
  private async cleanupGCSFiles(gcsFilePath: string): Promise<void> {
    try {
      const bucketName = gcsFilePath.replace('gs://', '').split('/')[0];
      const fileName = gcsFilePath.replace(`gs://${bucketName}/`, '');

      logger.info('Cleaning up GCS files', {
        gcsFilePath,
        bucketName,
        fileName
      });

      const bucket = this.storageClient.bucket(bucketName);
      const file = bucket.file(fileName);

      await file.delete();

      logger.info('GCS file cleanup completed', { gcsFilePath });
    } catch (error) {
      logger.warn('Failed to cleanup GCS files', {
        gcsFilePath,
        error: toErrorMessage(error)
      });
      // Don't throw error for cleanup failures
    }
  }
}

export const documentAiProcessor = new DocumentAiProcessor();