diff --git a/backend/sql/fix_vector_search_timeout.sql b/backend/sql/fix_vector_search_timeout.sql index 707716c..1bd2387 100644 --- a/backend/sql/fix_vector_search_timeout.sql +++ b/backend/sql/fix_vector_search_timeout.sql @@ -1,12 +1,12 @@ --- Fix vector search timeout by adding document_id filtering and optimizing the query --- This prevents searching across all documents and only searches within a specific document +-- Fix vector search timeout by pre-filtering on document_id BEFORE vector search +-- When document_id is provided, this avoids the full IVFFlat index scan (26K+ rows) +-- and instead computes distances on only ~80 chunks per document. --- Drop the old function (handle all possible signatures) +-- Drop old function signatures DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int); DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int, text); --- Create optimized function with document_id filtering --- document_id is TEXT (varchar) in the actual schema +-- Create optimized function that branches based on whether document_id is provided CREATE OR REPLACE FUNCTION match_document_chunks ( query_embedding vector(1536), match_threshold float, @@ -15,29 +15,51 @@ CREATE OR REPLACE FUNCTION match_document_chunks ( ) RETURNS TABLE ( id UUID, - document_id TEXT, + document_id VARCHAR(255), content text, metadata JSONB, chunk_index INT, similarity float ) -LANGUAGE sql STABLE +LANGUAGE plpgsql STABLE AS $$ - SELECT - document_chunks.id, - document_chunks.document_id, - document_chunks.content, - document_chunks.metadata, - document_chunks.chunk_index, - 1 - (document_chunks.embedding <=> query_embedding) AS similarity - FROM document_chunks - WHERE document_chunks.embedding IS NOT NULL - AND (filter_document_id IS NULL OR document_chunks.document_id = filter_document_id) - AND 1 - (document_chunks.embedding <=> query_embedding) > match_threshold - ORDER BY document_chunks.embedding <=> query_embedding - LIMIT match_count; 
+BEGIN + IF filter_document_id IS NOT NULL THEN + -- FAST PATH: Pre-filter by document_id using btree index, then compute + -- vector distances on only that document's chunks (~80 rows). + -- This completely bypasses the IVFFlat index scan. + RETURN QUERY + SELECT + dc.id, + dc.document_id, + dc.content, + dc.metadata, + dc.chunk_index, + 1 - (dc.embedding <=> query_embedding) AS similarity + FROM document_chunks dc + WHERE dc.document_id = filter_document_id + AND dc.embedding IS NOT NULL + AND 1 - (dc.embedding <=> query_embedding) > match_threshold + ORDER BY dc.embedding <=> query_embedding + LIMIT match_count; + ELSE + -- SLOW PATH: Search across all documents using IVFFlat index. + -- Only used when no document_id filter is provided. + RETURN QUERY + SELECT + dc.id, + dc.document_id, + dc.content, + dc.metadata, + dc.chunk_index, + 1 - (dc.embedding <=> query_embedding) AS similarity + FROM document_chunks dc + WHERE dc.embedding IS NOT NULL + AND 1 - (dc.embedding <=> query_embedding) > match_threshold + ORDER BY dc.embedding <=> query_embedding + LIMIT match_count; + END IF; +END; $$; --- Add comment explaining the optimization -COMMENT ON FUNCTION match_document_chunks IS 'Optimized vector search that filters by document_id first to prevent timeouts. Always pass filter_document_id when searching within a specific document.'; - +COMMENT ON FUNCTION match_document_chunks IS 'Vector search with fast document-scoped path. 
When filter_document_id is provided, uses btree index to pre-filter (~80 rows) instead of scanning the full IVFFlat index (26K+ rows).'; diff --git a/backend/src/__tests__/financial-summary.test.ts b/backend/src/__tests__/financial-summary.test.ts index 0a9e36f..caa687f 100644 --- a/backend/src/__tests__/financial-summary.test.ts +++ b/backend/src/__tests__/financial-summary.test.ts @@ -28,10 +28,10 @@ describe('Financial Summary Fixes', () => { const result = parseFinancialsFromText(text); - expect(result.fy3.revenue).toBeDefined(); - expect(result.fy2.revenue).toBeDefined(); - expect(result.fy1.revenue).toBeDefined(); - expect(result.ltm.revenue).toBeDefined(); + expect(result.data.fy3.revenue).toBeDefined(); + expect(result.data.fy2.revenue).toBeDefined(); + expect(result.data.fy1.revenue).toBeDefined(); + expect(result.data.ltm.revenue).toBeDefined(); }); test('Should parse financial table with year format', () => { @@ -45,7 +45,7 @@ describe('Financial Summary Fixes', () => { const result = parseFinancialsFromText(text); // Should assign years to periods (oldest = FY3, newest = FY1) - expect(result.fy3.revenue || result.fy2.revenue || result.fy1.revenue).toBeDefined(); + expect(result.data.fy3.revenue || result.data.fy2.revenue || result.data.fy1.revenue).toBeDefined(); }); test('Should handle tables with only 2-3 periods', () => { @@ -59,7 +59,7 @@ describe('Financial Summary Fixes', () => { const result = parseFinancialsFromText(text); // Should still parse what's available - expect(result.fy1 || result.fy2).toBeDefined(); + expect(result.data.fy1 || result.data.fy2).toBeDefined(); }); test('Should extract Gross Profit and Gross Margin', () => { @@ -74,8 +74,8 @@ describe('Financial Summary Fixes', () => { const result = parseFinancialsFromText(text); - expect(result.fy1.grossProfit).toBeDefined(); - expect(result.fy1.grossMargin).toBeDefined(); + expect(result.data.fy1.grossProfit).toBeDefined(); + expect(result.data.fy1.grossMargin).toBeDefined(); }); }); 
@@ -91,10 +91,10 @@ describe('Financial Summary Fixes', () => { const result = parseFinancialsFromText(text); // Values should be correctly aligned with their periods - expect(result.fy3.revenue).toBeDefined(); - expect(result.fy2.revenue).toBeDefined(); - expect(result.fy1.revenue).toBeDefined(); - expect(result.ltm.revenue).toBeDefined(); + expect(result.data.fy3.revenue).toBeDefined(); + expect(result.data.fy2.revenue).toBeDefined(); + expect(result.data.fy1.revenue).toBeDefined(); + expect(result.data.ltm.revenue).toBeDefined(); }); }); }); diff --git a/backend/src/config/env.ts b/backend/src/config/env.ts index 83fb448..b3315ba 100644 --- a/backend/src/config/env.ts +++ b/backend/src/config/env.ts @@ -152,7 +152,8 @@ const envSchema = Joi.object({ LOG_FILE: Joi.string().default('logs/app.log'), // Processing Strategy - PROCESSING_STRATEGY: Joi.string().valid('document_ai_agentic_rag').default('document_ai_agentic_rag'), + PROCESSING_STRATEGY: Joi.string().valid('document_ai_agentic_rag', 'single_pass_quality_check').default('single_pass_quality_check'), + BACKGROUND_EMBEDDING_ENABLED: Joi.boolean().default(true), // Agentic RAG Configuration AGENTIC_RAG_ENABLED: Joi.boolean().default(false), @@ -355,7 +356,8 @@ export const config = { }, // Processing Strategy - processingStrategy: envVars['PROCESSING_STRATEGY'] || 'agentic_rag', // 'chunking' | 'rag' | 'agentic_rag' + processingStrategy: envVars['PROCESSING_STRATEGY'] || 'single_pass_quality_check', + backgroundEmbeddingEnabled: envVars['BACKGROUND_EMBEDDING_ENABLED'] !== false, enableRAGProcessing: envVars['ENABLE_RAG_PROCESSING'] === 'true', enableProcessingComparison: envVars['ENABLE_PROCESSING_COMPARISON'] === 'true', diff --git a/backend/src/index.ts b/backend/src/index.ts index 6abc611..2e95491 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -252,7 +252,7 @@ import { onSchedule } from 'firebase-functions/v2/scheduler'; export const processDocumentJobs = onSchedule({ schedule: 'every 1 
minutes', // Minimum interval for Firebase Cloud Scheduler (immediate processing handles most cases) timeoutSeconds: 900, // 15 minutes (max for Gen2 scheduled functions) - increased for large documents - memory: '1GiB', + memory: '2GiB', retryCount: 2, // Retry up to 2 times on failure before waiting for next scheduled run secrets: [ anthropicApiKey, diff --git a/backend/src/scripts/test-haiku-financial-extraction.ts b/backend/src/scripts/test-haiku-financial-extraction.ts index 88b2fe7..1739e0f 100644 --- a/backend/src/scripts/test-haiku-financial-extraction.ts +++ b/backend/src/scripts/test-haiku-financial-extraction.ts @@ -162,13 +162,17 @@ async function testHaikuFinancialExtraction() { // Test 2: Test deterministic parser first console.log('\nšŸ“‹ Test 2: Deterministic Parser'); console.log('-'.repeat(60)); - const parserResults = parseFinancialsFromText(textToUse); + const parseResult = parseFinancialsFromText(textToUse); + const parserResults = parseResult.data; console.log('Parser Results:'); + console.log(` Confidence: ${parseResult.confidence}`); + console.log(` Tables Found: ${parseResult.tablesFound}`); + console.log(` Matched Rows: ${parseResult.matchedRows}`); console.log(` FY3 Revenue: ${parserResults.fy3.revenue || 'Not found'}`); console.log(` FY2 Revenue: ${parserResults.fy2.revenue || 'Not found'}`); console.log(` FY1 Revenue: ${parserResults.fy1.revenue || 'Not found'}`); console.log(` LTM Revenue: ${parserResults.ltm.revenue || 'Not found'}`); - + const parserHasData = !!(parserResults.fy3.revenue || parserResults.fy2.revenue || parserResults.fy1.revenue || parserResults.ltm.revenue); console.log(`\n${parserHasData ? 'āœ…' : 'āš ļø '} Parser ${parserHasData ? 
'found' : 'did not find'} financial data`); diff --git a/backend/src/scripts/test-single-pass-local.ts b/backend/src/scripts/test-single-pass-local.ts new file mode 100644 index 0000000..e48896d --- /dev/null +++ b/backend/src/scripts/test-single-pass-local.ts @@ -0,0 +1,135 @@ +/** + * Local test: Run the single-pass processor on real CIM PDFs + * without deploying to Firebase. + * + * Usage: + * cd backend + * npx ts-node src/scripts/test-single-pass-local.ts + */ + +import * as fs from 'fs'; +import * as path from 'path'; + +// Load .env before any service imports +import dotenv from 'dotenv'; +dotenv.config({ path: path.resolve(__dirname, '../../.env') }); + +async function main() { + const projectRoot = path.resolve(__dirname, '../../../'); + const cimFiles = [ + path.join(projectRoot, 'Project Panther - Confidential Information Memorandum_vBluePoint.pdf'), + path.join(projectRoot, 'Project SNAP - CIP_Blue Point.pdf'), + ]; + + const existingFiles = cimFiles.filter(f => fs.existsSync(f)); + if (existingFiles.length === 0) { + console.error('No CIM PDFs found in project root. 
Expected:'); + cimFiles.forEach(f => console.error(` ${f}`)); + process.exit(1); + } + + console.log(`\n${'='.repeat(70)}`); + console.log('SINGLE-PASS PROCESSOR LOCAL TEST'); + console.log(`${'='.repeat(70)}\n`); + console.log(`Found ${existingFiles.length} CIM file(s) to test.\n`); + + // Lazy-import services so .env is loaded first + const { singlePassProcessor } = await import('../services/singlePassProcessor'); + + for (const filePath of existingFiles) { + const fileName = path.basename(filePath); + const fileBuffer = fs.readFileSync(filePath); + const documentId = `test-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`; + + console.log(`\n${'─'.repeat(70)}`); + console.log(`Processing: ${fileName}`); + console.log(` Size: ${(fileBuffer.length / 1024 / 1024).toFixed(1)} MB`); + console.log(` Document ID: ${documentId}`); + console.log(`${'─'.repeat(70)}\n`); + + const startTime = Date.now(); + try { + const result = await singlePassProcessor.processDocument( + documentId, + 'test-user', + '', // text will be extracted from fileBuffer + { fileBuffer, fileName, mimeType: 'application/pdf' }, + ); + + const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + + console.log(`\n--- RESULT: ${fileName} ---`); + console.log(` Success: ${result.success}`); + console.log(` Processing time: ${elapsed}s`); + console.log(` API calls: ${result.apiCalls}`); + console.log(` Quality check: ${result.qualityCheckPassed ? 
'PASSED' : 'FAILED (unverified)'}`); + console.log(` Was truncated: ${result.wasTruncated}`); + console.log(` Completeness: ${result.completenessScore.toFixed(1)}%`); + console.log(` Error: ${result.error || 'none'}`); + + if (result.success && result.analysisData) { + const data = result.analysisData; + console.log(`\n --- KEY EXTRACTED DATA ---`); + console.log(` Company: ${data.dealOverview?.targetCompanyName || '???'}`); + console.log(` Industry: ${data.dealOverview?.industrySector || '???'}`); + console.log(` Geography: ${data.dealOverview?.geography || '???'}`); + console.log(` Page count: ${data.dealOverview?.cimPageCount || '???'}`); + console.log(` Employees: ${data.dealOverview?.employeeCount || '???'}`); + + const fin = data.financialSummary?.financials; + if (fin) { + console.log(`\n --- FINANCIALS ---`); + for (const period of ['fy3', 'fy2', 'fy1', 'ltm'] as const) { + const p = fin[period]; + if (p) { + console.log(` ${period.toUpperCase().padEnd(4)} Revenue: ${(p.revenue || 'N/A').padEnd(15)} EBITDA: ${(p.ebitda || 'N/A').padEnd(15)} Margin: ${p.ebitdaMargin || 'N/A'}`); + } + } + } + + console.log(`\n --- INVESTMENT THESIS (preview) ---`); + const thesis = data.preliminaryInvestmentThesis; + if (thesis?.keyAttractions) { + // Count items by numbered list, bullets, or newline-separated entries + const text = thesis.keyAttractions; + const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length; + console.log(` Key Attractions: ${count} items (${text.length} chars)`); + } + if (thesis?.potentialRisks) { + const text = thesis.potentialRisks; + const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length; + console.log(` Potential Risks: ${count} items (${text.length} chars)`); + } + if (thesis?.valueCreationLevers) { + const text = thesis.valueCreationLevers; + const count = (text.match(/\d+[\.\)]\s/g) || []).length || 
text.split(/\n/).filter((l: string) => l.trim().length > 10).length; + console.log(` Value Creation: ${count} items (${text.length} chars)`); + } + + const recommendation = data.keyQuestionsNextSteps?.preliminaryRecommendation; + console.log(` Recommendation: ${recommendation || '???'}`); + + // Write full output to a JSON file for inspection + const outPath = path.join(projectRoot, `test-output-${fileName.replace(/\s+/g, '_').replace('.pdf', '')}.json`); + fs.writeFileSync(outPath, JSON.stringify(result, null, 2)); + console.log(`\n Full output written to: ${outPath}`); + } + } catch (error) { + const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); + console.error(`\n FATAL ERROR after ${elapsed}s:`); + console.error(` ${error instanceof Error ? error.message : String(error)}`); + if (error instanceof Error && error.stack) { + console.error(` ${error.stack.split('\n').slice(1, 4).join('\n ')}`); + } + } + } + + console.log(`\n${'='.repeat(70)}`); + console.log('TEST COMPLETE'); + console.log(`${'='.repeat(70)}\n`); +} + +main().catch(err => { + console.error('Unhandled error:', err); + process.exit(1); +}); diff --git a/backend/src/services/financialTableParser.ts b/backend/src/services/financialTableParser.ts index 53a14b7..cef36ee 100644 --- a/backend/src/services/financialTableParser.ts +++ b/backend/src/services/financialTableParser.ts @@ -16,6 +16,16 @@ export interface ParsedFinancials { ltm: FinancialPeriod; } +export interface FinancialParseResult { + data: ParsedFinancials; + /** Number of candidate financial table headers found in the document. */ + tablesFound: number; + /** Number of financial metric rows successfully matched (revenue, EBITDA, etc.). */ + matchedRows: number; + /** Confidence: 'high' if both revenue+EBITDA found, 'low' if partial, 'none' if nothing. 
*/ + confidence: 'high' | 'low' | 'none'; +} + type Bucket = keyof ParsedFinancials; const PERIOD_TOKEN_REGEX = /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|(?:FY[-\s]?[1234])|(?:LTM|TTM))\b/gi; @@ -270,7 +280,7 @@ function assignTokensToBuckets( } } -export function parseFinancialsFromText(fullText: string): ParsedFinancials { +export function parseFinancialsFromText(fullText: string): FinancialParseResult { const startTime = Date.now(); const result: ParsedFinancials = { fy3: {}, @@ -278,12 +288,13 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials { fy1: {}, ltm: {} }; + let candidateHeaders = 0; try { const text = fullText.replace(/\u00A0/g, ' '); const lines = text.split('\n').map((line) => line.trim()).filter(Boolean); if (lines.length === 0) { - return result; + return { data: result, tablesFound: 0, matchedRows: 0, confidence: 'none' }; } let bestHeaderIndex = -1; @@ -295,9 +306,10 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials { for (let i = 0; i < lines.length; i++) { const tokens = tokenizePeriodHeaders(lines[i]); if (tokens.length >= 2) { + candidateHeaders++; const buckets = yearTokensToBuckets(tokens); const validBuckets = buckets.filter(Boolean).length; - + // Score this header: prioritize headers followed by financial metric rows let score = validBuckets; @@ -381,9 +393,10 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials { if (bestHeaderIndex === -1 || bestBuckets.filter(Boolean).length === 0) { logger.info('Financial parser could not identify year header, returning empty result', { totalLines: lines.length, + candidateHeaders, sampleLines: lines.slice(0, 20).join(' | ') }); - return result; + return { data: result, tablesFound: candidateHeaders, matchedRows: 0, confidence: 'none' }; } logger.info('Financial parser selected best header', { @@ -511,19 +524,31 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials { buckets: 
bestBuckets.map((bucket) => bucket || 'skip') }); + // Determine confidence based on what was found + const hasRevenue = !!(result.fy1.revenue || result.fy2.revenue || result.ltm.revenue); + const hasEbitda = !!(result.fy1.ebitda || result.fy2.ebitda || result.ltm.ebitda); + const confidence: 'high' | 'low' | 'none' = + hasRevenue && hasEbitda ? 'high' : + hasRevenue || hasEbitda ? 'low' : 'none'; + logger.info('Financial parser results', { elapsedMs: Date.now() - startTime, headerLine: lines[bestHeaderIndex], + candidateHeaders, + matchedRows, + confidence, fy3: result.fy3, fy2: result.fy2, fy1: result.fy1, ltm: result.ltm }); + + return { data: result, tablesFound: candidateHeaders, matchedRows, confidence }; } catch (error) { logger.warn('Financial parser failed', { error: error instanceof Error ? error.message : String(error) }); } - return result; + return { data: result, tablesFound: candidateHeaders, matchedRows: 0, confidence: 'none' }; } const containsMoneyOrPercent = (text: string): boolean => { resetRegex(MONEY_REGEX); diff --git a/backend/src/services/jobProcessorService.ts b/backend/src/services/jobProcessorService.ts index ae3207d..1ddbd74 100644 --- a/backend/src/services/jobProcessorService.ts +++ b/backend/src/services/jobProcessorService.ts @@ -133,10 +133,11 @@ export class JobProcessorService { await ProcessingJobModel.markAsProcessing(jobId); jobStatusUpdated = true; // Track that we've updated status - // Add timeout protection (14 minutes, leaving 1 minute buffer before scheduled function timeout) - const processingTimeout = 14 * 60 * 1000; // 14 minutes in milliseconds + // Add timeout protection (28 minutes for API immediate processing, 14 for scheduled function) + // API endpoint has 30-min Cloud Run timeout; scheduled function has 15 min + const processingTimeout = 28 * 60 * 1000; // 28 minutes in milliseconds const timeoutPromise = new Promise((_, reject) => { - timeoutId = setTimeout(() => reject(new Error('Job processing timeout after 
14 minutes')), processingTimeout); + timeoutId = setTimeout(() => reject(new Error('Job processing timeout after 28 minutes')), processingTimeout); }); // Wrap processing logic in Promise.race with timeout @@ -217,10 +218,15 @@ export class JobProcessorService { } // Process the document + // Strategy priority: job options > env config > default + const { config: envConfig } = await import('../config/env'); + const processingStrategy = job.options?.strategy || envConfig.processingStrategy || 'document_ai_agentic_rag'; + logger.info('Starting document processing', { jobId, documentId: job.document_id, - strategy: job.options?.strategy || 'document_ai_agentic_rag', + strategy: processingStrategy, + strategySource: job.options?.strategy ? 'job_options' : (envConfig.processingStrategy ? 'env_config' : 'default'), }); const result = await unifiedDocumentProcessor.processDocument( @@ -228,7 +234,7 @@ export class JobProcessorService { job.user_id, '', // Text will be extracted from fileBuffer { - strategy: job.options?.strategy || 'document_ai_agentic_rag', + strategy: processingStrategy, fileBuffer, fileName: document.original_file_name, mimeType: 'application/pdf', diff --git a/backend/src/services/llmService.ts b/backend/src/services/llmService.ts index ef05dd9..9fb9a7b 100644 --- a/backend/src/services/llmService.ts +++ b/backend/src/services/llmService.ts @@ -32,6 +32,12 @@ export interface CIMAnalysisResult { inputTokens: number; outputTokens: number; validationIssues?: z.ZodIssue[]; + /** The text actually sent to the LLM (may be truncated from the original). */ + processedText?: string; + /** True if the input text was truncated to fit token limits. */ + wasTruncated?: boolean; + /** Delta corrections from quality-check (delta mode only). 
*/ + corrections?: Array<{ path: string; value: string; reason?: string }>; } class LLMService { @@ -41,6 +47,34 @@ class LLMService { private maxTokens: number; private temperature: number; + // Concurrency limiter to prevent Anthropic 429 rate limit errors. + // Max 1 concurrent API call (MAX_CONCURRENT_LLM_CALLS); additional calls queue up and wait. + private static readonly MAX_CONCURRENT_LLM_CALLS = 1; + private static activeCalls = 0; + private static waitQueue: Array<() => void> = []; + + private async acquireLLMSlot(): Promise<void> { + if (LLMService.activeCalls < LLMService.MAX_CONCURRENT_LLM_CALLS) { + LLMService.activeCalls++; + return; + } + // Wait for a slot to free up + return new Promise((resolve) => { + LLMService.waitQueue.push(() => { + LLMService.activeCalls++; + resolve(); + }); + }); + } + + private releaseLLMSlot(): void { + LLMService.activeCalls--; + if (LLMService.waitQueue.length > 0) { + const next = LLMService.waitQueue.shift()!; + next(); + } + } + constructor() { // CRITICAL DEBUG: Log what we're reading from config logger.info('LLM Service constructor - Reading provider from config', { @@ -313,6 +347,8 @@ class LLMService { cost: this.estimateCost(estimatedTokens + response.content.length, selectedModel), inputTokens: estimatedTokens, outputTokens: response.content.length, + processedText: processedText, + wasTruncated, }; } else { lastError = new Error(`JSON validation failed: ${validation.error.errors.map(e => e.message).join(', ')}`); @@ -353,10 +389,154 @@ class LLMService { throw lastError || new Error('All LLM processing attempts failed'); } + /** + * Quality-check a first-pass CIM extraction. + * + * A second LLM call reviews the pass-1 JSON against the original document text + * and returns a corrected/enhanced version. Uses the same retry + Zod validation + * logic as processCIMDocument. Falls back to the pass-1 result on total failure. 
+ */ + async qualityCheckCIMDocument( + text: string, + extractedData: CIMReview, + financialContext?: string, + options?: { wasTruncated?: boolean; truncationPercent?: number }, + ): Promise<CIMAnalysisResult & { corrections?: Array<{ path: string; value: string; reason?: string }> }> { + logger.info('Starting quality-check LLM call (delta mode)', { + textLength: text.length, + extractedDataKeys: Object.keys(extractedData), + wasTruncated: options?.wasTruncated, + }); + + // Condense the document: send first 20K + last 20K chars instead of full text. + // The quality check is reviewing the extraction, not re-reading the whole CIM. + const MAX_CONTEXT_CHARS = 40000; + let condensedText: string; + if (text.length > MAX_CONTEXT_CHARS) { + const halfSize = Math.floor(MAX_CONTEXT_CHARS / 2); + condensedText = text.substring(0, halfSize) + + `\n\n[... ${text.length - MAX_CONTEXT_CHARS} characters omitted for brevity ...]\n\n` + + text.substring(text.length - halfSize); + logger.info('Quality-check: condensed document text', { + originalLength: text.length, + condensedLength: condensedText.length, + }); + } else { + condensedText = text; + } + + const extractedJson = JSON.stringify(extractedData, null, 2); + // Use fast model (Haiku) for quality check — it's a review task, not original extraction + const selectedModel = config.llm.fastModel || config.llm.model; + const estimatedTokens = this.estimateTokenCount(condensedText) + this.estimateTokenCount(extractedJson); + + const truncationNote = options?.wasTruncated + ? `\nNote: The original document was truncated before the first-pass extraction. Do NOT invent data not present in the extracted JSON or the text excerpts below.` + : ''; + + const systemPrompt = `You are a senior investment analyst performing a quality review of a first-pass CIM extraction. Your job is to identify ERRORS and GAPS in the extracted data. +${truncationNote} +Return ONLY a JSON array of corrections. Each correction is an object with: +- "path": dot-notation path to the field (e.g. 
"financialSummary.financials.fy1.revenue") +- "value": the corrected value (string) +- "reason": brief explanation of why this correction is needed + +RULES: +1. Only include fields that need changes. If the extraction is correct, return an empty array: [] +2. For financial figures, cross-check against the document text. Fix wrong values or misaligned periods. +3. For "Not specified in CIM" on QUANTITATIVE fields (revenue, headcount, etc.), only replace if you can find the actual data in the text. +4. For "Not specified in CIM" on QUALITATIVE fields (statedReasonForSale, postTransactionIntentions, workingCapitalIntensity, typicalContractLength, etc.), ALWAYS provide a professional inference based on document context. For example: if reason for sale isn't stated but the seller is PE-backed, write "Not explicitly stated; likely PE sponsor exit based on [context]". If working capital intensity isn't stated but the company is asset-light, infer "Low working capital intensity consistent with asset-light business model". +5. For thin list fields (keyAttractions, potentialRisks, etc.), provide enhanced content with 5-8 items, 2-3 sentences each. +6. Do NOT hallucinate. Only use information present in or inferable from the document text. +7. Your ENTIRE response must be a valid JSON array. No markdown, no explanation outside the array.`; + + const financialSection = financialContext + ? `\n\nDETERMINISTIC FINANCIAL DATA (ground truth — do not contradict):\n${financialContext}` + : ''; + + const userPrompt = `Review this first-pass CIM extraction for errors and gaps. + +DOCUMENT TEXT (condensed excerpts): +${condensedText} +${financialSection} + +EXTRACTED JSON TO REVIEW: +${extractedJson} + +Return a JSON array of corrections. 
If no corrections needed, return [].`; + + let lastError: Error | null = null; + + for (let attempt = 1; attempt <= 2; attempt++) { + try { + if (lastError && lastError.message.includes('rate limit')) { + const retryDelay = Math.min(60000 * attempt, 300000); + logger.warn(`Quality check: rate limit, waiting ${retryDelay}ms before attempt ${attempt}`); + await new Promise(resolve => setTimeout(resolve, retryDelay)); + } + + logger.info(`Quality-check LLM attempt ${attempt}/2`, { model: selectedModel }); + + const response = await this.callLLM({ + prompt: userPrompt, + systemPrompt, + model: selectedModel, + maxTokens: 8000, // Delta output is much smaller than full JSON + temperature: 0.1, // Low temperature for consistency + }); + + if (!response.success) { + throw new Error(response.error || 'Quality-check LLM call failed'); + } + + // Parse the corrections array from the response + const corrections = this.parseCorrectionsArray(response.content); + + if (corrections !== null) { + logger.info(`Quality-check completed on attempt ${attempt}`, { + correctionsCount: corrections.length, + }); + return { + success: true, + corrections: corrections.length > 0 ? corrections : undefined, + model: selectedModel, + cost: this.estimateCost(estimatedTokens + response.content.length, selectedModel), + inputTokens: estimatedTokens, + outputTokens: response.content.length, + }; + } else { + lastError = new Error('Quality-check response was not a valid JSON array'); + logger.warn(`Quality-check parse failed on attempt ${attempt}`, { + responsePreview: response.content.substring(0, 200), + }); + } + } catch (error) { + lastError = error instanceof Error ? 
error : new Error('Unknown error'); + logger.error(`Quality-check attempt ${attempt} failed`, { error: lastError.message }); + } + } + + // All quality-check attempts failed — return failure (caller falls back to pass-1) + logger.warn('All quality-check attempts failed — caller should use pass-1 result', { + lastError: lastError?.message, + }); + + return { + success: false, + error: lastError?.message || 'All quality-check attempts failed', + model: selectedModel, + cost: 0, + inputTokens: estimatedTokens, + outputTokens: 0, + }; + } + /** * Call the appropriate LLM API */ private async callLLM(request: LLMRequest): Promise<LLMResponse> { + // Acquire a concurrency slot (max 1 concurrent LLM call to avoid Anthropic 429s) + await this.acquireLLMSlot(); try { // Use configured timeout from config.llm.timeoutMs (default 6 minutes for complex analysis) // Increased from 3 minutes to handle complex CIM analysis even with RAG reduction @@ -367,7 +547,7 @@ class LLMService { ...request, model: normalizedModel }; - + // Add a timeout wrapper to prevent hanging const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error(`LLM call timeout after ${timeoutMinutes} minutes`)), timeoutMs); @@ -380,9 +560,11 @@ class LLMService { model: normalizedRequest.model || this.defaultModel, willCallOpenRouter: this.provider === 'openrouter', willCallAnthropic: this.provider === 'anthropic', - willCallOpenAI: this.provider === 'openai' + willCallOpenAI: this.provider === 'openai', + queuedCalls: LLMService.waitQueue.length, + activeCalls: LLMService.activeCalls }); - + if (this.provider === 'openai') { return await this.callOpenAI(normalizedRequest); } else if (this.provider === 'openrouter') { @@ -405,6 +587,8 @@ class LLMService { content: '', error: error instanceof Error ? error.message : 'Unknown error', }; + } finally { + this.releaseLLMSlot(); + } } @@ -910,13 +1094,13 @@ class LLMService { CRITICAL REQUIREMENTS: 1. 
**JSON OUTPUT ONLY**: Your entire response MUST be a single, valid JSON object. Do not include any text or explanation before or after the JSON object. 2. **BPCP TEMPLATE FORMAT**: The JSON object MUST follow the BPCP CIM Review Template structure exactly as specified. -3. **COMPLETE ALL FIELDS**: You MUST provide a value for every field. Use "Not specified in CIM" for any information that is not available in the document. -4. **NO PLACEHOLDERS**: Do not use placeholders like "..." or "TBD". Use "Not specified in CIM" instead. +3. **COMPLETE ALL FIELDS**: You MUST provide a value for every field. For qualitative/assessment fields, ALWAYS provide your professional inference or best judgment based on context — do NOT default to "Not specified in CIM" for fields like statedReasonForSale, postTransactionIntentions, workingCapitalIntensity, or typicalContractLength. For example: if reason for sale is not stated but the seller is PE-backed, write "Not explicitly stated; likely PE sponsor exit/liquidity event based on [context]". Only use "Not specified in CIM" for specific quantitative data (exact financial figures, headcounts) that truly cannot be determined or reasonably inferred. +4. **NO PLACEHOLDERS**: Do not use placeholders like "..." or "TBD". 5. **PROFESSIONAL ANALYSIS**: The content should be high-quality and suitable for BPCP's investment committee. 6. **BPCP FOCUS**: Focus on companies in 5+MM EBITDA range in consumer and industrial end markets, with emphasis on M&A, technology & data usage, supply chain and human capital optimization. 7. **BPCP PREFERENCES**: BPCP prefers companies which are founder/family-owned and within driving distance of Cleveland and Charlotte. 8. **EXACT FIELD NAMES**: Use the exact field names and descriptions from the BPCP CIM Review Template. -9. **FINANCIAL DATA**: For financial metrics, use actual numbers if available, otherwise use "Not specified in CIM". +9. 
**FINANCIAL DATA**: For financial metrics, use actual numbers if available. Only use "Not specified in CIM" for specific financial line items that are truly absent from the document. 10. **VALID JSON**: Ensure your response is valid JSON that can be parsed without errors. ANALYSIS QUALITY REQUIREMENTS: @@ -1235,6 +1419,54 @@ SPECIAL REQUIREMENTS FOR PRELIMINARY INVESTMENT THESIS: `; } + /** + * Parse a corrections array from quality-check LLM response. + * Expects a JSON array like: [{"path":"...", "value":"...", "reason":"..."}] + * Returns null if parsing fails. + */ + private parseCorrectionsArray(content: string): Array<{ path: string; value: string; reason?: string }> | null { + try { + // Strip markdown fences if present + let cleaned = content.trim(); + if (cleaned.startsWith('```')) { + const firstNewline = cleaned.indexOf('\n'); + cleaned = cleaned.substring(firstNewline + 1); + const lastFence = cleaned.lastIndexOf('```'); + if (lastFence > 0) cleaned = cleaned.substring(0, lastFence); + cleaned = cleaned.trim(); + } + + const parsed = JSON.parse(cleaned); + if (!Array.isArray(parsed)) { + logger.warn('Quality-check response is not an array', { type: typeof parsed }); + return null; + } + + // Validate each correction has at minimum path + value + const valid = parsed.filter((item: any) => + item && typeof item.path === 'string' && item.path.length > 0 && + (typeof item.value === 'string' || typeof item.value === 'number') + ).map((item: any) => ({ + path: item.path, + value: String(item.value), + reason: item.reason ? String(item.reason) : undefined, + })); + + logger.info('Quality-check corrections parsed', { + rawCount: parsed.length, + validCount: valid.length, + }); + + return valid; + } catch (error) { + logger.warn('Failed to parse quality-check corrections array', { + error: error instanceof Error ? 
error.message : String(error), + contentPreview: content.substring(0, 200), + }); + return null; + } + } + /** * Extract JSON from LLM response */ @@ -2536,33 +2768,34 @@ CRITICAL REQUIREMENTS: const trimmed = model.trim(); const lower = trimmed.toLowerCase(); - const canonicalSonnet46 = 'claude-sonnet-4-6'; - const canonicalHaiku45 = 'claude-haiku-4-5'; + // Valid Anthropic API model IDs (date-stamped or aliased) + const canonicalSonnet46 = 'claude-sonnet-4-20250514'; + const canonicalHaiku45 = 'claude-haiku-4-5-20251001'; const legacySonnet35 = 'claude-3-5-sonnet-latest'; const legacyHaiku35 = 'claude-3-5-haiku-latest'; - - // Keep modern 4.6/4.5 identifiers as-is - if (lower.includes('sonnet-4-6')) { + + // Keep valid date-stamped identifiers as-is + if (lower.match(/sonnet-4-\d{8}/)) { return trimmed; } - if (lower.includes('haiku-4-5')) { + if (lower.match(/haiku-4-\d+-\d{8}/)) { return trimmed; } - - // Map older Claude 4.x labels (4.0/4.5) to the active 4.6/4.5 SKUs + + // Map shorthand Claude 4.x labels to valid API model IDs if (lower.includes('sonnet-4')) { if (trimmed !== canonicalSonnet46) { - logger.warn('Normalizing Claude Sonnet 4.x model to 4.6', { + logger.warn('Normalizing Claude Sonnet 4.x model to canonical API ID', { requestedModel: trimmed, normalizedModel: canonicalSonnet46 }); } return canonicalSonnet46; } - + if (lower.includes('haiku-4')) { if (trimmed !== canonicalHaiku45) { - logger.warn('Normalizing Claude Haiku 4.x model to 4.5', { + logger.warn('Normalizing Claude Haiku 4.x model to canonical API ID', { requestedModel: trimmed, normalizedModel: canonicalHaiku45 }); diff --git a/backend/src/services/optimizedAgenticRAGProcessor.ts b/backend/src/services/optimizedAgenticRAGProcessor.ts index a2ea987..8965647 100644 --- a/backend/src/services/optimizedAgenticRAGProcessor.ts +++ b/backend/src/services/optimizedAgenticRAGProcessor.ts @@ -125,7 +125,7 @@ export class OptimizedAgenticRAGProcessor { /** * Create intelligent chunks with 
semantic boundaries */ - private async createIntelligentChunks( + async createIntelligentChunks( text: string, documentId: string, enableSemanticChunking: boolean = true, @@ -515,7 +515,7 @@ export class OptimizedAgenticRAGProcessor { * Store chunks with optimized batching * Returns the number of API calls made for embeddings */ - private async storeChunksOptimized( + async storeChunksOptimized( chunks: ProcessingChunk[], documentId: string ): Promise { @@ -1216,20 +1216,27 @@ export class OptimizedAgenticRAGProcessor { try { const { parseFinancialsFromText } = await import('./financialTableParser'); - const parsedFinancials = parseFinancialsFromText(text); + const parseResult = parseFinancialsFromText(text); + const parsedFinancials = parseResult.data; if (this.hasStructuredFinancialData(parsedFinancials)) { deterministicFinancials = parsedFinancials; deterministicFinancialChunk = this.buildDeterministicFinancialChunk(documentId, parsedFinancials); logger.info('Deterministic financial parser prepared structured data', { documentId, + confidence: parseResult.confidence, + tablesFound: parseResult.tablesFound, fy3: parsedFinancials.fy3, fy2: parsedFinancials.fy2, fy1: parsedFinancials.fy1, ltm: parsedFinancials.ltm }); } else { - logger.info('Deterministic financial parser did not find structured data', { documentId }); + logger.info('Deterministic financial parser did not find structured data', { + documentId, + confidence: parseResult.confidence, + tablesFound: parseResult.tablesFound, + }); } } catch (parserError) { logger.warn('Deterministic financial parser failed during initialization', { @@ -1241,31 +1248,23 @@ export class OptimizedAgenticRAGProcessor { let totalApiCalls = 0; const partialResults: Partial[] = []; - // OPTIMIZED SEQUENTIAL EXECUTION: Combined passes to reduce API calls - logger.info('Starting agentic RAG extraction (optimized with combined passes)'); - const sequentialStart = Date.now(); - - // Run combined passes sequentially to reduce total 
API calls from 5 to 3 - logger.info('Pass 1: Metadata + Financial Data (Combined)'); + // PARALLEL EXECUTION: Run all 3 independent extraction passes concurrently + logger.info('Starting agentic RAG extraction (3 passes in parallel)'); + const parallelStart = Date.now(); + const deterministicPinnedChunks = deterministicFinancialChunk ? [deterministicFinancialChunk] : []; - const pass1CombinedResult = await this.extractPass1CombinedMetadataFinancial( + + const [pass1CombinedResult, pass2CombinedResult, pass3Result] = await Promise.all([ + this.extractPass1CombinedMetadataFinancial(documentId, text, chunks, deterministicPinnedChunks), + this.extractPass2CombinedMarketBusiness(documentId, text, chunks), + this.extractPass5InvestmentThesis(documentId, text, chunks) + ]); + + const parallelTime = Date.now() - parallelStart; + logger.info('Parallel extraction completed', { documentId, - text, - chunks, - deterministicPinnedChunks - ); - - logger.info('Pass 2: Market Analysis + Business Operations (Combined)'); - const pass2CombinedResult = await this.extractPass2CombinedMarketBusiness(documentId, text, chunks); - - logger.info('Pass 3: Investment Thesis'); - const pass3Result = await this.extractPass5InvestmentThesis(documentId, text, chunks); - - const sequentialTime = Date.now() - sequentialStart; - logger.info('Sequential extraction completed', { - documentId, - sequentialTimeMs: sequentialTime, - sequentialTimeSec: (sequentialTime / 1000).toFixed(1) + parallelTimeMs: parallelTime, + parallelTimeSec: (parallelTime / 1000).toFixed(1) }); partialResults.push(pass1CombinedResult.data); @@ -2135,49 +2134,53 @@ IMPORTANT EXTRACTION RULES: const batches = this.groupMissingFieldsIntoBatches(missingFields); let totalApiCalls = 0; - for (const batch of batches) { - // Create a targeted query for this batch of missing fields - const query = this.createGapFillingQuery(batch); - - try { - const { chunks: relevantChunks } = await this.findRelevantChunks( - documentId, - query, - 
chunks, - 30000 // Smaller context for gap-filling - ); + // Process all gap-filling batches in parallel + const batchResults = await Promise.all( + batches.map(async (batch) => { + const query = this.createGapFillingQuery(batch); + try { + const { chunks: relevantChunks } = await this.findRelevantChunks( + documentId, query, chunks, 30000 + ); if (relevantChunks.length === 0) { - logger.info('No relevant chunks found for gap-filling batch', { batch }); - continue; + logger.info('No relevant chunks found for gap-filling batch', { batch }); + return { apiCalls: 0, data: null }; } const reducedText = relevantChunks - .slice(0, 15) // Limit to 15 chunks for gap-filling + .slice(0, 15) .map((chunk, index) => { const separator = index > 0 ? '\n\n---\n\n' : ''; - return `${separator}[Section ${chunk.chunkIndex + 1}]\n${chunk.content}`; + return `${separator}[Section ${chunk.chunkIndex + 1}]\n${chunk.content}`; }) .join('\n\n'); - // Make LLM call for this batch const result = await llmService.processCIMDocument(reducedText, 'BPCP CIM Review Template'); - totalApiCalls++; if (result.success && result.jsonOutput) { - // Merge gap-filled data (only for the missing fields) - this.deepMerge(currentData, result.jsonOutput); - logger.info('Gap-filling batch completed', { - batch: batch.slice(0, 5), - batchSize: batch.length - }); + logger.info('Gap-filling batch completed', { + batch: batch.slice(0, 5), + batchSize: batch.length + }); + return { apiCalls: 1, data: result.jsonOutput }; } - + return { apiCalls: 1, data: null }; } catch (error) { logger.error('Gap-filling batch failed', { error: error instanceof Error ? 
error.message : String(error), - batch: batch.slice(0, 5) - }); + batch: batch.slice(0, 5) + }); + return { apiCalls: 0, data: null }; + } + }) + ); + + // Merge results sequentially to maintain deterministic merge order + for (const result of batchResults) { + totalApiCalls += result.apiCalls; + if (result.data) { + this.deepMerge(currentData, result.data); } } @@ -2249,54 +2252,49 @@ Extract exact values, numbers, percentages, names, and detailed information.`; { path: 'keyQuestionsNextSteps.missingInformation', name: 'Missing Information' } ]; - for (const { path, name } of listFieldPaths) { + // Identify which fields need repair + const fieldsNeedingRepair = listFieldPaths.filter(({ path, name }) => { const value = this.getNestedField(validatedData, path); - if (value && typeof value === 'string') { const itemCount = (value.match(/^\d+\.\s/gm) || []).length; - if (itemCount < 5 || itemCount > 8) { logger.warn(`List field validation failed: ${name}`, { - documentId, - field: path, - currentCount: itemCount, - required: '5-8 items' + documentId, field: path, currentCount: itemCount, required: '5-8 items' }); + return true; + } + logger.debug(`List field validated: ${name}`, { documentId, field: path, itemCount }); + } + return false; + }); + // Repair all failing fields in parallel + if (fieldsNeedingRepair.length > 0) { + const repairResults = await Promise.all( + fieldsNeedingRepair.map(async ({ path, name }) => { + const value = this.getNestedField(validatedData, path) as string; + const itemCount = (value.match(/^\d+\.\s/gm) || []).length; try { - // Attempt to repair the list - const repairedValue = await this.repairListField( - name, - value, - itemCount, - documentId, - chunks - ); - apiCalls++; - - // Update the nested field - this.setNestedField(validatedData, path, repairedValue); - + const repairedValue = await this.repairListField(name, value, itemCount, documentId, chunks); logger.info(`List field repaired: ${name}`, { - documentId, - field: path, - 
originalCount: itemCount, + documentId, field: path, originalCount: itemCount, newCount: (repairedValue.match(/^\d+\.\s/gm) || []).length }); + return { path, repairedValue, success: true }; } catch (error) { logger.error(`Failed to repair list field: ${name}`, { - documentId, - field: path, + documentId, field: path, error: error instanceof Error ? error.message : String(error) }); - // Continue with original value if repair fails + return { path, repairedValue: null, success: false }; } - } else { - logger.debug(`List field validated: ${name}`, { - documentId, - field: path, - itemCount - }); + }) + ); + + for (const result of repairResults) { + if (result.success && result.repairedValue) { + this.setNestedField(validatedData, result.path, result.repairedValue); + apiCalls++; } } } diff --git a/backend/src/services/simpleDocumentProcessor.ts b/backend/src/services/simpleDocumentProcessor.ts index d926042..3db41ff 100644 --- a/backend/src/services/simpleDocumentProcessor.ts +++ b/backend/src/services/simpleDocumentProcessor.ts @@ -79,12 +79,13 @@ class SimpleDocumentProcessor { let deterministicFinancials: any = null; try { const { parseFinancialsFromText } = await import('./financialTableParser'); - const parsedFinancials = parseFinancialsFromText(extractedText); - + const parseResult = parseFinancialsFromText(extractedText); + const parsedFinancials = parseResult.data; + // Check if parser found structured data - const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue || + const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue || parsedFinancials.fy1?.revenue || parsedFinancials.ltm?.revenue; - + if (hasData) { deterministicFinancials = parsedFinancials; logger.info('Deterministic financial parser found structured data', { diff --git a/backend/src/services/singlePassProcessor.ts b/backend/src/services/singlePassProcessor.ts new file mode 100644 index 0000000..ae44279 --- /dev/null +++ 
b/backend/src/services/singlePassProcessor.ts @@ -0,0 +1,376 @@ +import { logger } from '../utils/logger'; +import { config } from '../config/env'; +import { llmService } from './llmService'; +import { CIMReview } from './llmSchemas'; +import { documentAiProcessor } from './documentAiProcessor'; + +interface SinglePassResult { + success: boolean; + summary: string; + analysisData: CIMReview; + processingStrategy: 'single_pass_quality_check'; + processingTime: number; + apiCalls: number; + error: string | undefined; + /** Whether the quality-check LLM pass succeeded. False means pass-1 result is unverified. */ + qualityCheckPassed: boolean; + /** Whether the input text was truncated before LLM processing. */ + wasTruncated: boolean; + /** Completeness score (0-100) from final validation. */ + completenessScore: number; +} + +/** + * Single-pass CIM processor: 2 LLM calls total (extraction + quality check). + * + * Flow: + * Document AI OCR (if needed) -> deterministic financial parse -> + * single LLM extraction -> quality-check LLM -> PDF generation -> done + * + * Background: chunk + embed for future search (non-blocking, fire-and-forget) + */ +class SinglePassProcessor { + /** + * Main entry point — processes a CIM document with 2 LLM calls. 
+ */ + async processDocument( + documentId: string, + userId: string, + text: string, + options: { + fileBuffer?: Buffer; + fileName?: string; + mimeType?: string; + } = {} + ): Promise<SinglePassResult> { + const startTime = Date.now(); + let apiCalls = 0; + + try { + // ── Step 0: Extract text via Document AI if not provided ── + let extractedText = text; + if (!extractedText || extractedText.length === 0) { + const { fileBuffer, fileName, mimeType } = options; + if (!fileBuffer || !fileName || !mimeType) { + throw new Error('Missing required options: fileBuffer, fileName, mimeType'); + } + + const docAiResult = await documentAiProcessor.extractTextOnly( + documentId, userId, fileBuffer, fileName, mimeType + ); + if (!docAiResult.text || docAiResult.text.length === 0) { + throw new Error('Document AI text extraction returned no text'); + } + extractedText = docAiResult.text; + + logger.info('Single-pass: Document AI text extraction completed', { + documentId, textLength: extractedText.length, + }); + } + + // ── Step 1: Single LLM extraction ── + // Note: Deterministic financial parser disabled for single-pass mode. + // The LLM consistently outperforms it — the parser matches narrative text + // and footnotes, producing wrong values that override correct LLM output. + logger.info('Single-pass: Starting LLM extraction (pass 1)', { documentId }); + const pass1Result = await llmService.processCIMDocument( + extractedText, + '', // template is embedded in the prompt + ); + apiCalls++; + + if (!pass1Result.success || !pass1Result.jsonOutput) { + throw new Error(pass1Result.error || 'LLM extraction (pass 1) failed'); + } + + // Use the same text the LLM actually saw (may be truncated) + const llmText = pass1Result.processedText || extractedText; + const wasTruncated = pass1Result.wasTruncated || false; + const truncationPercent = wasTruncated + ? 
(llmText.length / extractedText.length) * 100 + : 100; + + if (wasTruncated) { + logger.warn('Single-pass: Pass 1 text was truncated', { + documentId, + originalLength: extractedText.length, + truncatedLength: llmText.length, + truncationPercent: truncationPercent.toFixed(1), + }); + } + + let analysisData: CIMReview = pass1Result.jsonOutput; + + logger.info('Single-pass: Pass 1 complete', { + documentId, + model: pass1Result.model, + inputTokens: pass1Result.inputTokens, + outputTokens: pass1Result.outputTokens, + cost: pass1Result.cost, + wasTruncated, + }); + + // ── Step 3: Quality-check LLM (pass 2) — Step 2 (deterministic financial parse) is intentionally disabled in single-pass mode ── + // Uses delta-only approach: quality check returns only corrections, not full JSON. + // Uses Haiku for speed. Sends condensed document (first/last 20K chars) not full text. + logger.info('Single-pass: Starting quality-check LLM (pass 2)', { documentId }); + let qualityCheckPassed = false; + try { + const qualityResult = await llmService.qualityCheckCIMDocument( + llmText, + analysisData, + undefined, // no deterministic financial context + { wasTruncated, truncationPercent }, + ); + apiCalls++; + + if (qualityResult.success && qualityResult.corrections) { + // Apply delta corrections to the existing analysis data + analysisData = this.applyCorrections(analysisData, qualityResult.corrections); + qualityCheckPassed = true; + + logger.info('Single-pass: Quality check complete — corrections applied', { + documentId, + model: qualityResult.model, + correctionsCount: qualityResult.corrections.length, + inputTokens: qualityResult.inputTokens, + outputTokens: qualityResult.outputTokens, + cost: qualityResult.cost, + }); + } else if (qualityResult.success && !qualityResult.corrections) { + // Quality check returned success with no corrections — pass 1 was good + qualityCheckPassed = true; + logger.info('Single-pass: Quality check found no corrections needed', { documentId }); + } else { + logger.error('Single-pass: Quality check returned no valid output — using pass 1 
result', { + documentId, + error: qualityResult.error, + }); + } + } catch (qualityError) { + // Quality check failed — fall back to pass 1 but log at ERROR + logger.error('Single-pass: Quality check failed — using unverified pass 1 result', { + documentId, + error: qualityError instanceof Error ? qualityError.message : String(qualityError), + }); + } + + // ── Step 4: Completeness gate ── + const completenessScore = this.scoreCompleteness(analysisData); + const COMPLETENESS_THRESHOLD = 85; + + if (completenessScore < COMPLETENESS_THRESHOLD) { + const processingTime = Date.now() - startTime; + const msg = `Completeness score ${completenessScore.toFixed(1)}% is below ${COMPLETENESS_THRESHOLD}% threshold`; + logger.error('Single-pass: Completeness gate FAILED', { + documentId, + completenessScore, + threshold: COMPLETENESS_THRESHOLD, + qualityCheckPassed, + wasTruncated, + }); + + return { + success: false, + summary: '', + analysisData, + processingStrategy: 'single_pass_quality_check', + processingTime, + apiCalls, + error: msg, + qualityCheckPassed, + wasTruncated, + completenessScore, + }; + } + + // ── Step 5: Fire-and-forget background embedding ── + if (config.backgroundEmbeddingEnabled !== false) { + setImmediate(() => { + this.backgroundChunkAndEmbed(documentId, extractedText).catch((err) => { + logger.warn('Single-pass: Background embedding failed (non-blocking)', { + documentId, + error: err instanceof Error ? 
err.message : String(err), + }); + }); + }); + } + + const processingTime = Date.now() - startTime; + const summary = this.buildSummary(analysisData); + + logger.info('Single-pass: Processing complete', { + documentId, + processingTime, + apiCalls, + textLength: extractedText.length, + qualityCheckPassed, + wasTruncated, + completenessScore, + }); + + return { + success: true, + summary, + analysisData, + processingStrategy: 'single_pass_quality_check', + processingTime, + apiCalls, + error: undefined, + qualityCheckPassed, + wasTruncated, + completenessScore, + }; + } catch (error) { + const processingTime = Date.now() - startTime; + const errorMessage = error instanceof Error ? error.message : String(error); + + logger.error('Single-pass: Processing failed', { + documentId, + error: errorMessage, + stack: error instanceof Error ? error.stack : undefined, + processingTime, + apiCalls, + }); + + return { + success: false, + summary: '', + analysisData: {} as CIMReview, + processingStrategy: 'single_pass_quality_check', + processingTime, + apiCalls, + error: `Single-pass processing failed: ${errorMessage}`, + qualityCheckPassed: false, + wasTruncated: false, + completenessScore: 0, + }; + } + } + + /** + * Apply delta corrections from the quality check to the existing analysis data. + * Each correction has a dot-path (e.g. "financialSummary.financials.fy1.revenue") + * and a new value. 
+ */ + private applyCorrections( + data: CIMReview, + corrections: Array<{ path: string; value: string; reason?: string }>, + ): CIMReview { + const patched = JSON.parse(JSON.stringify(data)) as any; + let applied = 0; + + for (const { path, value, reason } of corrections) { + const parts = path.split('.'); + let target = patched; + for (let i = 0; i < parts.length - 1; i++) { + if (target[parts[i]] === undefined || target[parts[i]] === null) { + target[parts[i]] = {}; + } + target = target[parts[i]]; + } + const lastKey = parts[parts.length - 1]; + const oldValue = target[lastKey]; + target[lastKey] = value; + applied++; + + logger.info('Quality-check correction applied', { + path, + oldValue: typeof oldValue === 'string' ? oldValue.substring(0, 80) : String(oldValue), + newValue: typeof value === 'string' ? value.substring(0, 80) : String(value), + reason: reason || '', + }); + } + + logger.info('Quality-check corrections summary', { total: corrections.length, applied }); + return patched as CIMReview; + } + + /** + * Build a short executive summary from the analysis data. + */ + private buildSummary(data: CIMReview): string { + const companyName = data.dealOverview?.targetCompanyName || 'Unknown Company'; + const sector = data.dealOverview?.industrySector || 'Unknown Sector'; + const recommendation = data.keyQuestionsNextSteps?.preliminaryRecommendation || 'No recommendation'; + return `CIM Review: ${companyName} (${sector}) — ${recommendation}`; + } + + /** + * Quick completeness score (0-100): counts non-empty string leaf fields. + * BPCP-internal fields (reviewers, dateReviewed, dateCIMReceived) are excluded. 
+ */ + private scoreCompleteness(data: CIMReview): number { + const excludedPaths = new Set([ + 'dealOverview.reviewers', + 'dealOverview.dateReviewed', + 'dealOverview.dateCIMReceived', + ]); + + let total = 0; + let filled = 0; + const emptyFields: string[] = []; + + const walk = (obj: any, path: string) => { + if (obj === null || obj === undefined) return; + if (typeof obj === 'string') { + if (excludedPaths.has(path)) return; + total++; + const trimmed = obj.trim(); + if (trimmed !== '' && trimmed !== 'Not specified in CIM') { + filled++; + } else { + emptyFields.push(path); + } + } else if (typeof obj === 'object' && !Array.isArray(obj)) { + for (const key of Object.keys(obj)) { + walk(obj[key], path ? `${path}.${key}` : key); + } + } + }; + + walk(data, ''); + const score = total > 0 ? (filled / total) * 100 : 0; + + if (emptyFields.length > 0) { + logger.info('Completeness scoring: empty fields', { + score: score.toFixed(1), + total, + filled, + emptyCount: emptyFields.length, + emptyFields, + }); + } + + return score; + } + + /** + * Background chunking and embedding for future vector search. + * Runs asynchronously after the main processing completes. + */ + private async backgroundChunkAndEmbed(documentId: string, text: string): Promise { + try { + const { OptimizedAgenticRAGProcessor } = await import('./optimizedAgenticRAGProcessor'); + const ragProcessor = new OptimizedAgenticRAGProcessor(); + + logger.info('Single-pass: Starting background chunk & embed', { + documentId, textLength: text.length, + }); + + const chunks = await ragProcessor.createIntelligentChunks(text, documentId, true, []); + await ragProcessor.storeChunksOptimized(chunks, documentId); + + logger.info('Single-pass: Background chunk & embed complete', { + documentId, chunkCount: chunks.length, + }); + } catch (error) { + logger.warn('Single-pass: Background chunk & embed failed', { + documentId, + error: error instanceof Error ? 
error.message : String(error), + }); + } + } +} + +export const singlePassProcessor = new SinglePassProcessor(); diff --git a/backend/src/services/unifiedDocumentProcessor.ts b/backend/src/services/unifiedDocumentProcessor.ts index 7285292..5cdfe49 100644 --- a/backend/src/services/unifiedDocumentProcessor.ts +++ b/backend/src/services/unifiedDocumentProcessor.ts @@ -3,6 +3,7 @@ import { config } from '../config/env'; import { optimizedAgenticRAGProcessor } from './optimizedAgenticRAGProcessor'; import { simpleDocumentProcessor } from './simpleDocumentProcessor'; import { documentAiProcessor } from './documentAiProcessor'; +import { singlePassProcessor } from './singlePassProcessor'; import { CIMReview } from './llmSchemas'; // Default empty CIMReview object @@ -111,7 +112,7 @@ interface ProcessingResult { success: boolean; summary: string; analysisData: CIMReview; - processingStrategy: 'document_ai_agentic_rag' | 'simple_full_document'; + processingStrategy: 'document_ai_agentic_rag' | 'simple_full_document' | 'single_pass_quality_check'; processingTime: number; apiCalls: number; error: string | undefined; @@ -159,12 +160,101 @@ class UnifiedDocumentProcessor { } else if (strategy === 'document_ai_agentic_rag') { logger.info('Unified processor: Routing to RAG processor', { documentId, strategy }); return await this.processWithDocumentAiAgenticRag(documentId, userId, text, options); + } else if (strategy === 'single_pass_quality_check') { + logger.info('Unified processor: Routing to single-pass processor', { documentId, strategy }); + return await this.processWithSinglePass(documentId, userId, text, options); } else { logger.error('Unified processor: Unsupported strategy', { documentId, strategy }); - throw new Error(`Unsupported processing strategy: ${strategy}. Supported: 'simple_full_document', 'document_ai_agentic_rag'`); + throw new Error(`Unsupported processing strategy: ${strategy}. 
Supported: 'simple_full_document', 'document_ai_agentic_rag', 'single_pass_quality_check'`); } } + /** + * Process document using single-pass extraction + quality check (2 LLM calls) + */ + private async processWithSinglePass( + documentId: string, + userId: string, + text: string, + options: any + ): Promise { + logger.info('Using single-pass quality-check processing strategy', { documentId }); + + const result = await singlePassProcessor.processDocument( + documentId, userId, text, { + fileBuffer: options.fileBuffer, + fileName: options.fileName, + mimeType: options.mimeType, + } + ); + + if (!result.success) { + return { + success: false, + summary: '', + analysisData: defaultCIMReview, + processingStrategy: 'single_pass_quality_check', + processingTime: result.processingTime, + apiCalls: result.apiCalls, + error: result.error, + }; + } + + // Calculate page count from PDF if available + if (options.fileBuffer && options.fileName?.toLowerCase().endsWith('.pdf')) { + try { + const pdf = require('pdf-parse'); + const pdfData = await pdf(options.fileBuffer); + if (pdfData.numpages > 0) { + if (!result.analysisData.dealOverview) { + result.analysisData.dealOverview = {} as any; + } + result.analysisData.dealOverview.cimPageCount = pdfData.numpages.toString(); + } + } catch (error) { + logger.warn('Failed to calculate page count from PDF', { + documentId, + error: error instanceof Error ? 
error.message : String(error), + }); + } + } + + // Log quality metadata for monitoring + logger.info('Single-pass: Quality metadata', { + documentId, + qualityCheckPassed: result.qualityCheckPassed, + wasTruncated: result.wasTruncated, + completenessScore: result.completenessScore, + }); + + if (!result.qualityCheckPassed) { + logger.error('Single-pass: Quality check did NOT pass — extraction is unverified', { + documentId, + }); + } + + // Run the unified processor's own validation as well + const finalValidation = this.validateFinalData(result.analysisData); + if (!finalValidation.isValid) { + logger.warn('Single-pass: Final validation found issues', { + documentId, + completenessScore: finalValidation.completenessScore, + emptyFields: finalValidation.emptyFields.length, + lowQualityFields: finalValidation.lowQualityFields.length, + }); + } + + return { + success: true, + summary: result.summary, + analysisData: result.analysisData, + processingStrategy: 'single_pass_quality_check', + processingTime: result.processingTime, + apiCalls: result.apiCalls, + error: undefined, + }; + } + /** * Process document using Document AI + Agentic RAG approach */ diff --git a/backend/src/services/vectorDatabaseService.ts b/backend/src/services/vectorDatabaseService.ts index dab19d0..f4d039d 100644 --- a/backend/src/services/vectorDatabaseService.ts +++ b/backend/src/services/vectorDatabaseService.ts @@ -119,12 +119,12 @@ class VectorDatabaseService { rpcParams.filter_document_id = documentId; } - // Set a timeout for the RPC call (10 seconds) + // Set a timeout for the RPC call (30 seconds) const searchPromise = this.supabaseClient .rpc('match_document_chunks', rpcParams); const timeoutPromise = new Promise<{ data: null; error: { message: string } }>((_, reject) => { - setTimeout(() => reject(new Error('Vector search timeout after 10s')), 10000); + setTimeout(() => reject(new Error('Vector search timeout after 30s')), 30000); }); let result: any; @@ -132,8 +132,8 @@ class 
VectorDatabaseService { result = await Promise.race([searchPromise, timeoutPromise]); } catch (timeoutError: any) { if (timeoutError.message?.includes('timeout')) { - logger.error('Vector search timed out', { documentId, timeout: '10s' }); - throw new Error('Vector search timeout after 10s'); + logger.error('Vector search timed out', { documentId, timeout: '30s' }); + throw new Error('Vector search timeout after 30s'); } throw timeoutError; }