Add single-pass CIM processor: 2 LLM calls, ~2.5 min processing

New processing strategy `single_pass_quality_check` replaces the multi-pass
agentic RAG pipeline (15-25 min) with a streamlined 2-call approach:

1. Full-document LLM extraction (Sonnet) — single call with complete CIM text
2. Delta quality-check (Haiku) — reviews extraction, returns only corrections

Key changes:
- New singlePassProcessor.ts with extraction + quality check flow
- llmService: qualityCheckCIMDocument() with delta-only corrections array
- llmService: improved prompt requiring professional inferences for qualitative
  fields instead of defaulting to "Not specified in CIM"
- Removed deterministic financial parser from single-pass flow (LLM outperforms
  it — parser matched footnotes and narrative text as financials)
- Default strategy changed to single_pass_quality_check
- Completeness scoring with diagnostic logging of empty fields

Tested on 2 real CIMs: 100% completeness, correct financials, ~150s each.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
admin
2026-02-23 22:28:45 -05:00
parent f4bd60ca38
commit 5cfb136484
14 changed files with 1054 additions and 162 deletions

View File

@@ -1,12 +1,12 @@
-- Fix vector search timeout by pre-filtering on document_id BEFORE vector search.
-- When document_id is provided, this avoids the full IVFFlat index scan (26K+ rows)
-- and instead computes distances on only ~80 chunks per document.

-- Drop old function signatures (handle all prior overloads)
DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int);
DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int, text);

-- Create optimized function that branches based on whether document_id is provided.
-- NOTE(review): the match_count / filter_document_id parameter lines were not
-- visible in the diff view; reconstructed from the DROP signatures above and the
-- body's usage — confirm against the deployed function definition.
CREATE OR REPLACE FUNCTION match_document_chunks (
    query_embedding vector(1536),
    match_threshold float,
    match_count int,
    filter_document_id text DEFAULT NULL
)
RETURNS TABLE (
    id UUID,
    document_id VARCHAR(255),
    content text,
    metadata JSONB,
    chunk_index INT,
    similarity float
)
LANGUAGE plpgsql STABLE
AS $$
BEGIN
    IF filter_document_id IS NOT NULL THEN
        -- FAST PATH: Pre-filter by document_id using the btree index, then compute
        -- vector distances on only that document's chunks (~80 rows).
        -- This completely bypasses the IVFFlat index scan.
        RETURN QUERY
        SELECT
            dc.id,
            dc.document_id,
            dc.content,
            dc.metadata,
            dc.chunk_index,
            1 - (dc.embedding <=> query_embedding) AS similarity
        FROM document_chunks dc
        WHERE dc.document_id = filter_document_id
          AND dc.embedding IS NOT NULL
          AND 1 - (dc.embedding <=> query_embedding) > match_threshold
        ORDER BY dc.embedding <=> query_embedding
        LIMIT match_count;
    ELSE
        -- SLOW PATH: Search across all documents using the IVFFlat index.
        -- Only used when no document_id filter is provided.
        RETURN QUERY
        SELECT
            dc.id,
            dc.document_id,
            dc.content,
            dc.metadata,
            dc.chunk_index,
            1 - (dc.embedding <=> query_embedding) AS similarity
        FROM document_chunks dc
        WHERE dc.embedding IS NOT NULL
          AND 1 - (dc.embedding <=> query_embedding) > match_threshold
        ORDER BY dc.embedding <=> query_embedding
        LIMIT match_count;
    END IF;
END;
$$;

COMMENT ON FUNCTION match_document_chunks IS 'Vector search with fast document-scoped path. When filter_document_id is provided, uses btree index to pre-filter (~80 rows) instead of scanning the full IVFFlat index (26K+ rows).';

View File

@@ -28,10 +28,10 @@ describe('Financial Summary Fixes', () => {
const result = parseFinancialsFromText(text);
expect(result.fy3.revenue).toBeDefined();
expect(result.fy2.revenue).toBeDefined();
expect(result.fy1.revenue).toBeDefined();
expect(result.ltm.revenue).toBeDefined();
expect(result.data.fy3.revenue).toBeDefined();
expect(result.data.fy2.revenue).toBeDefined();
expect(result.data.fy1.revenue).toBeDefined();
expect(result.data.ltm.revenue).toBeDefined();
});
test('Should parse financial table with year format', () => {
@@ -45,7 +45,7 @@ describe('Financial Summary Fixes', () => {
const result = parseFinancialsFromText(text);
// Should assign years to periods (oldest = FY3, newest = FY1)
expect(result.fy3.revenue || result.fy2.revenue || result.fy1.revenue).toBeDefined();
expect(result.data.fy3.revenue || result.data.fy2.revenue || result.data.fy1.revenue).toBeDefined();
});
test('Should handle tables with only 2-3 periods', () => {
@@ -59,7 +59,7 @@ describe('Financial Summary Fixes', () => {
const result = parseFinancialsFromText(text);
// Should still parse what's available
expect(result.fy1 || result.fy2).toBeDefined();
expect(result.data.fy1 || result.data.fy2).toBeDefined();
});
test('Should extract Gross Profit and Gross Margin', () => {
@@ -74,8 +74,8 @@ describe('Financial Summary Fixes', () => {
const result = parseFinancialsFromText(text);
expect(result.fy1.grossProfit).toBeDefined();
expect(result.fy1.grossMargin).toBeDefined();
expect(result.data.fy1.grossProfit).toBeDefined();
expect(result.data.fy1.grossMargin).toBeDefined();
});
});
@@ -91,10 +91,10 @@ describe('Financial Summary Fixes', () => {
const result = parseFinancialsFromText(text);
// Values should be correctly aligned with their periods
expect(result.fy3.revenue).toBeDefined();
expect(result.fy2.revenue).toBeDefined();
expect(result.fy1.revenue).toBeDefined();
expect(result.ltm.revenue).toBeDefined();
expect(result.data.fy3.revenue).toBeDefined();
expect(result.data.fy2.revenue).toBeDefined();
expect(result.data.fy1.revenue).toBeDefined();
expect(result.data.ltm.revenue).toBeDefined();
});
});
});

View File

@@ -152,7 +152,8 @@ const envSchema = Joi.object({
LOG_FILE: Joi.string().default('logs/app.log'),
// Processing Strategy
PROCESSING_STRATEGY: Joi.string().valid('document_ai_agentic_rag').default('document_ai_agentic_rag'),
PROCESSING_STRATEGY: Joi.string().valid('document_ai_agentic_rag', 'single_pass_quality_check').default('single_pass_quality_check'),
BACKGROUND_EMBEDDING_ENABLED: Joi.boolean().default(true),
// Agentic RAG Configuration
AGENTIC_RAG_ENABLED: Joi.boolean().default(false),
@@ -355,7 +356,8 @@ export const config = {
},
// Processing Strategy
processingStrategy: envVars['PROCESSING_STRATEGY'] || 'agentic_rag', // 'chunking' | 'rag' | 'agentic_rag'
processingStrategy: envVars['PROCESSING_STRATEGY'] || 'single_pass_quality_check',
backgroundEmbeddingEnabled: envVars['BACKGROUND_EMBEDDING_ENABLED'] !== false,
enableRAGProcessing: envVars['ENABLE_RAG_PROCESSING'] === 'true',
enableProcessingComparison: envVars['ENABLE_PROCESSING_COMPARISON'] === 'true',

View File

@@ -252,7 +252,7 @@ import { onSchedule } from 'firebase-functions/v2/scheduler';
export const processDocumentJobs = onSchedule({
schedule: 'every 1 minutes', // Minimum interval for Firebase Cloud Scheduler (immediate processing handles most cases)
timeoutSeconds: 900, // 15 minutes (max for Gen2 scheduled functions) - increased for large documents
memory: '1GiB',
memory: '2GiB',
retryCount: 2, // Retry up to 2 times on failure before waiting for next scheduled run
secrets: [
anthropicApiKey,

View File

@@ -162,13 +162,17 @@ async function testHaikuFinancialExtraction() {
// Test 2: Test deterministic parser first
console.log('\n📋 Test 2: Deterministic Parser');
console.log('-'.repeat(60));
const parserResults = parseFinancialsFromText(textToUse);
const parseResult = parseFinancialsFromText(textToUse);
const parserResults = parseResult.data;
console.log('Parser Results:');
console.log(` Confidence: ${parseResult.confidence}`);
console.log(` Tables Found: ${parseResult.tablesFound}`);
console.log(` Matched Rows: ${parseResult.matchedRows}`);
console.log(` FY3 Revenue: ${parserResults.fy3.revenue || 'Not found'}`);
console.log(` FY2 Revenue: ${parserResults.fy2.revenue || 'Not found'}`);
console.log(` FY1 Revenue: ${parserResults.fy1.revenue || 'Not found'}`);
console.log(` LTM Revenue: ${parserResults.ltm.revenue || 'Not found'}`);
const parserHasData = !!(parserResults.fy3.revenue || parserResults.fy2.revenue || parserResults.fy1.revenue || parserResults.ltm.revenue);
console.log(`\n${parserHasData ? '✅' : '⚠️ '} Parser ${parserHasData ? 'found' : 'did not find'} financial data`);

View File

@@ -0,0 +1,135 @@
/**
* Local test: Run the single-pass processor on real CIM PDFs
* without deploying to Firebase.
*
* Usage:
* cd backend
* npx ts-node src/scripts/test-single-pass-local.ts
*/
import * as fs from 'fs';
import * as path from 'path';
// Load .env before any service imports
import dotenv from 'dotenv';
dotenv.config({ path: path.resolve(__dirname, '../../.env') });
/**
 * Entry point: locates the test CIM PDFs in the repo root, runs the single-pass
 * processor on each, prints a human-readable summary (timings, completeness,
 * key extracted fields, financials), and writes the full result JSON next to
 * the PDFs for manual inspection.
 */
async function main() {
  const projectRoot = path.resolve(__dirname, '../../../');
  // Hard-coded local test fixtures; any that are missing are simply skipped.
  const cimFiles = [
    path.join(projectRoot, 'Project Panther - Confidential Information Memorandum_vBluePoint.pdf'),
    path.join(projectRoot, 'Project SNAP - CIP_Blue Point.pdf'),
  ];
  const existingFiles = cimFiles.filter(f => fs.existsSync(f));
  if (existingFiles.length === 0) {
    console.error('No CIM PDFs found in project root. Expected:');
    cimFiles.forEach(f => console.error(` ${f}`));
    process.exit(1);
  }
  console.log(`\n${'='.repeat(70)}`);
  console.log('SINGLE-PASS PROCESSOR LOCAL TEST');
  console.log(`${'='.repeat(70)}\n`);
  console.log(`Found ${existingFiles.length} CIM file(s) to test.\n`);
  // Lazy-import services so .env is loaded first
  const { singlePassProcessor } = await import('../services/singlePassProcessor');
  for (const filePath of existingFiles) {
    const fileName = path.basename(filePath);
    const fileBuffer = fs.readFileSync(filePath);
    // Unique throwaway document id so repeated runs never collide.
    const documentId = `test-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    console.log(`\n${'─'.repeat(70)}`);
    console.log(`Processing: ${fileName}`);
    console.log(` Size: ${(fileBuffer.length / 1024 / 1024).toFixed(1)} MB`);
    console.log(` Document ID: ${documentId}`);
    console.log(`${'─'.repeat(70)}\n`);
    const startTime = Date.now();
    try {
      const result = await singlePassProcessor.processDocument(
        documentId,
        'test-user',
        '', // text will be extracted from fileBuffer
        { fileBuffer, fileName, mimeType: 'application/pdf' },
      );
      const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
      console.log(`\n--- RESULT: ${fileName} ---`);
      console.log(` Success: ${result.success}`);
      console.log(` Processing time: ${elapsed}s`);
      console.log(` API calls: ${result.apiCalls}`);
      console.log(` Quality check: ${result.qualityCheckPassed ? 'PASSED' : 'FAILED (unverified)'}`);
      console.log(` Was truncated: ${result.wasTruncated}`);
      console.log(` Completeness: ${result.completenessScore.toFixed(1)}%`);
      console.log(` Error: ${result.error || 'none'}`);
      if (result.success && result.analysisData) {
        const data = result.analysisData;
        console.log(`\n --- KEY EXTRACTED DATA ---`);
        console.log(` Company: ${data.dealOverview?.targetCompanyName || '???'}`);
        console.log(` Industry: ${data.dealOverview?.industrySector || '???'}`);
        console.log(` Geography: ${data.dealOverview?.geography || '???'}`);
        console.log(` Page count: ${data.dealOverview?.cimPageCount || '???'}`);
        console.log(` Employees: ${data.dealOverview?.employeeCount || '???'}`);
        const fin = data.financialSummary?.financials;
        if (fin) {
          console.log(`\n --- FINANCIALS ---`);
          // Print oldest-to-newest fiscal years, then the LTM period.
          for (const period of ['fy3', 'fy2', 'fy1', 'ltm'] as const) {
            const p = fin[period];
            if (p) {
              console.log(` ${period.toUpperCase().padEnd(4)} Revenue: ${(p.revenue || 'N/A').padEnd(15)} EBITDA: ${(p.ebitda || 'N/A').padEnd(15)} Margin: ${p.ebitdaMargin || 'N/A'}`);
            }
          }
        }
        console.log(`\n --- INVESTMENT THESIS (preview) ---`);
        const thesis = data.preliminaryInvestmentThesis;
        if (thesis?.keyAttractions) {
          // Count items by numbered list, bullets, or newline-separated entries
          const text = thesis.keyAttractions;
          const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length;
          console.log(` Key Attractions: ${count} items (${text.length} chars)`);
        }
        if (thesis?.potentialRisks) {
          const text = thesis.potentialRisks;
          const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length;
          console.log(` Potential Risks: ${count} items (${text.length} chars)`);
        }
        if (thesis?.valueCreationLevers) {
          const text = thesis.valueCreationLevers;
          const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length;
          console.log(` Value Creation: ${count} items (${text.length} chars)`);
        }
        const recommendation = data.keyQuestionsNextSteps?.preliminaryRecommendation;
        console.log(` Recommendation: ${recommendation || '???'}`);
        // Write full output to a JSON file for inspection
        const outPath = path.join(projectRoot, `test-output-${fileName.replace(/\s+/g, '_').replace('.pdf', '')}.json`);
        fs.writeFileSync(outPath, JSON.stringify(result, null, 2));
        console.log(`\n Full output written to: ${outPath}`);
      }
    } catch (error) {
      // One failing PDF should not abort the remaining test files.
      const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
      console.error(`\n FATAL ERROR after ${elapsed}s:`);
      console.error(` ${error instanceof Error ? error.message : String(error)}`);
      if (error instanceof Error && error.stack) {
        console.error(` ${error.stack.split('\n').slice(1, 4).join('\n ')}`);
      }
    }
  }
  console.log(`\n${'='.repeat(70)}`);
  console.log('TEST COMPLETE');
  console.log(`${'='.repeat(70)}\n`);
}
main().catch(err => {
  console.error('Unhandled error:', err);
  process.exit(1);
});

View File

@@ -16,6 +16,16 @@ export interface ParsedFinancials {
ltm: FinancialPeriod;
}
export interface FinancialParseResult {
data: ParsedFinancials;
/** Number of candidate financial table headers found in the document. */
tablesFound: number;
/** Number of financial metric rows successfully matched (revenue, EBITDA, etc.). */
matchedRows: number;
/** Confidence: 'high' if both revenue+EBITDA found, 'low' if partial, 'none' if nothing. */
confidence: 'high' | 'low' | 'none';
}
type Bucket = keyof ParsedFinancials;
const PERIOD_TOKEN_REGEX = /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|(?:FY[-\s]?[1234])|(?:LTM|TTM))\b/gi;
@@ -270,7 +280,7 @@ function assignTokensToBuckets(
}
}
export function parseFinancialsFromText(fullText: string): ParsedFinancials {
export function parseFinancialsFromText(fullText: string): FinancialParseResult {
const startTime = Date.now();
const result: ParsedFinancials = {
fy3: {},
@@ -278,12 +288,13 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
fy1: {},
ltm: {}
};
let candidateHeaders = 0;
try {
const text = fullText.replace(/\u00A0/g, ' ');
const lines = text.split('\n').map((line) => line.trim()).filter(Boolean);
if (lines.length === 0) {
return result;
return { data: result, tablesFound: 0, matchedRows: 0, confidence: 'none' };
}
let bestHeaderIndex = -1;
@@ -295,9 +306,10 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
for (let i = 0; i < lines.length; i++) {
const tokens = tokenizePeriodHeaders(lines[i]);
if (tokens.length >= 2) {
candidateHeaders++;
const buckets = yearTokensToBuckets(tokens);
const validBuckets = buckets.filter(Boolean).length;
// Score this header: prioritize headers followed by financial metric rows
let score = validBuckets;
@@ -381,9 +393,10 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
if (bestHeaderIndex === -1 || bestBuckets.filter(Boolean).length === 0) {
logger.info('Financial parser could not identify year header, returning empty result', {
totalLines: lines.length,
candidateHeaders,
sampleLines: lines.slice(0, 20).join(' | ')
});
return result;
return { data: result, tablesFound: candidateHeaders, matchedRows: 0, confidence: 'none' };
}
logger.info('Financial parser selected best header', {
@@ -511,19 +524,31 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
buckets: bestBuckets.map((bucket) => bucket || 'skip')
});
// Determine confidence based on what was found
const hasRevenue = !!(result.fy1.revenue || result.fy2.revenue || result.ltm.revenue);
const hasEbitda = !!(result.fy1.ebitda || result.fy2.ebitda || result.ltm.ebitda);
const confidence: 'high' | 'low' | 'none' =
hasRevenue && hasEbitda ? 'high' :
hasRevenue || hasEbitda ? 'low' : 'none';
logger.info('Financial parser results', {
elapsedMs: Date.now() - startTime,
headerLine: lines[bestHeaderIndex],
candidateHeaders,
matchedRows,
confidence,
fy3: result.fy3,
fy2: result.fy2,
fy1: result.fy1,
ltm: result.ltm
});
return { data: result, tablesFound: candidateHeaders, matchedRows, confidence };
} catch (error) {
logger.warn('Financial parser failed', { error: error instanceof Error ? error.message : String(error) });
}
return result;
return { data: result, tablesFound: candidateHeaders, matchedRows: 0, confidence: 'none' };
}
const containsMoneyOrPercent = (text: string): boolean => {
resetRegex(MONEY_REGEX);

View File

@@ -133,10 +133,11 @@ export class JobProcessorService {
await ProcessingJobModel.markAsProcessing(jobId);
jobStatusUpdated = true; // Track that we've updated status
// Add timeout protection (14 minutes, leaving 1 minute buffer before scheduled function timeout)
const processingTimeout = 14 * 60 * 1000; // 14 minutes in milliseconds
// Add timeout protection (28 minutes for API immediate processing, 14 for scheduled function)
// API endpoint has 30-min Cloud Run timeout; scheduled function has 15 min
const processingTimeout = 28 * 60 * 1000; // 28 minutes in milliseconds
const timeoutPromise = new Promise<never>((_, reject) => {
timeoutId = setTimeout(() => reject(new Error('Job processing timeout after 14 minutes')), processingTimeout);
timeoutId = setTimeout(() => reject(new Error('Job processing timeout after 28 minutes')), processingTimeout);
});
// Wrap processing logic in Promise.race with timeout
@@ -217,10 +218,15 @@ export class JobProcessorService {
}
// Process the document
// Strategy priority: job options > env config > default
const { config: envConfig } = await import('../config/env');
const processingStrategy = job.options?.strategy || envConfig.processingStrategy || 'document_ai_agentic_rag';
logger.info('Starting document processing', {
jobId,
documentId: job.document_id,
strategy: job.options?.strategy || 'document_ai_agentic_rag',
strategy: processingStrategy,
strategySource: job.options?.strategy ? 'job_options' : (envConfig.processingStrategy ? 'env_config' : 'default'),
});
const result = await unifiedDocumentProcessor.processDocument(
@@ -228,7 +234,7 @@ export class JobProcessorService {
job.user_id,
'', // Text will be extracted from fileBuffer
{
strategy: job.options?.strategy || 'document_ai_agentic_rag',
strategy: processingStrategy,
fileBuffer,
fileName: document.original_file_name,
mimeType: 'application/pdf',

View File

@@ -32,6 +32,12 @@ export interface CIMAnalysisResult {
inputTokens: number;
outputTokens: number;
validationIssues?: z.ZodIssue[];
/** The text actually sent to the LLM (may be truncated from the original). */
processedText?: string;
/** True if the input text was truncated to fit token limits. */
wasTruncated?: boolean;
/** Delta corrections from quality-check (delta mode only). */
corrections?: Array<{ path: string; value: string; reason?: string }>;
}
class LLMService {
@@ -41,6 +47,34 @@ class LLMService {
private maxTokens: number;
private temperature: number;
// Concurrency limiter to prevent Anthropic 429 rate limit errors.
// At most MAX_CONCURRENT_LLM_CALLS (currently 1) API call runs at a time;
// additional calls queue up (FIFO) and wait for a slot.
private static readonly MAX_CONCURRENT_LLM_CALLS = 1;
// Number of LLM calls currently in flight. Static: shared across all instances.
private static activeCalls = 0;
// FIFO queue of resolvers for callers waiting on a free slot.
private static waitQueue: Array<() => void> = [];

/**
 * Acquire a concurrency slot before making an LLM API call.
 * Resolves immediately when a slot is free; otherwise enqueues and resolves
 * once releaseLLMSlot() hands a slot over. Callers must pair every acquire
 * with a releaseLLMSlot() (e.g. in a finally block) or the queue stalls.
 */
private async acquireLLMSlot(): Promise<void> {
if (LLMService.activeCalls < LLMService.MAX_CONCURRENT_LLM_CALLS) {
LLMService.activeCalls++;
return;
}
// Wait for a slot to free up
return new Promise<void>((resolve) => {
LLMService.waitQueue.push(() => {
LLMService.activeCalls++;
resolve();
});
});
}

/**
 * Release a concurrency slot after an LLM call completes and wake the
 * oldest queued waiter, if any. The waiter's callback re-increments
 * activeCalls, so the net count stays within the limit.
 */
private releaseLLMSlot(): void {
LLMService.activeCalls--;
if (LLMService.waitQueue.length > 0) {
const next = LLMService.waitQueue.shift()!;
next();
}
}
constructor() {
// CRITICAL DEBUG: Log what we're reading from config
logger.info('LLM Service constructor - Reading provider from config', {
@@ -313,6 +347,8 @@ class LLMService {
cost: this.estimateCost(estimatedTokens + response.content.length, selectedModel),
inputTokens: estimatedTokens,
outputTokens: response.content.length,
processedText: processedText,
wasTruncated,
};
} else {
lastError = new Error(`JSON validation failed: ${validation.error.errors.map(e => e.message).join(', ')}`);
@@ -353,10 +389,154 @@ class LLMService {
throw lastError || new Error('All LLM processing attempts failed');
}
/**
 * Quality-check a first-pass CIM extraction (delta mode).
 *
 * A second, cheaper LLM call (fast model) reviews the pass-1 JSON against
 * condensed excerpts of the original document and returns ONLY a list of
 * corrections ({ path, value, reason }) — not a full re-extraction. The
 * response is parsed with parseCorrectionsArray (plain JSON-array parsing;
 * no Zod schema validation is applied here). Up to 2 attempts are made with
 * a rate-limit backoff; on total failure the result has success:false so the
 * caller can fall back to the pass-1 extraction.
 *
 * @param text - Full original document text (will be condensed if long).
 * @param extractedData - Pass-1 extraction to be reviewed.
 * @param financialContext - Optional deterministic financial data treated as ground truth.
 * @param options - Truncation flags from pass 1. NOTE(review): truncationPercent
 *                  is accepted but currently unused in this method.
 */
async qualityCheckCIMDocument(
  text: string,
  extractedData: CIMReview,
  financialContext?: string,
  options?: { wasTruncated?: boolean; truncationPercent?: number },
): Promise<CIMAnalysisResult & { corrections?: Array<{ path: string; value: string; reason?: string }> }> {
  logger.info('Starting quality-check LLM call (delta mode)', {
    textLength: text.length,
    extractedDataKeys: Object.keys(extractedData),
    wasTruncated: options?.wasTruncated,
  });
  // Condense the document: send first 20K + last 20K chars instead of full text.
  // The quality check is reviewing the extraction, not re-reading the whole CIM.
  const MAX_CONTEXT_CHARS = 40000;
  let condensedText: string;
  if (text.length > MAX_CONTEXT_CHARS) {
    const halfSize = Math.floor(MAX_CONTEXT_CHARS / 2);
    condensedText = text.substring(0, halfSize) +
      `\n\n[... ${text.length - MAX_CONTEXT_CHARS} characters omitted for brevity ...]\n\n` +
      text.substring(text.length - halfSize);
    logger.info('Quality-check: condensed document text', {
      originalLength: text.length,
      condensedLength: condensedText.length,
    });
  } else {
    condensedText = text;
  }
  const extractedJson = JSON.stringify(extractedData, null, 2);
  // Use fast model (Haiku) for quality check — it's a review task, not original extraction
  const selectedModel = config.llm.fastModel || config.llm.model;
  // Rough input-size estimate used only for cost/telemetry reporting below.
  const estimatedTokens = this.estimateTokenCount(condensedText) + this.estimateTokenCount(extractedJson);
  const truncationNote = options?.wasTruncated
    ? `\nNote: The original document was truncated before the first-pass extraction. Do NOT invent data not present in the extracted JSON or the text excerpts below.`
    : '';
  const systemPrompt = `You are a senior investment analyst performing a quality review of a first-pass CIM extraction. Your job is to identify ERRORS and GAPS in the extracted data.
${truncationNote}
Return ONLY a JSON array of corrections. Each correction is an object with:
- "path": dot-notation path to the field (e.g. "financialSummary.financials.fy1.revenue")
- "value": the corrected value (string)
- "reason": brief explanation of why this correction is needed
RULES:
1. Only include fields that need changes. If the extraction is correct, return an empty array: []
2. For financial figures, cross-check against the document text. Fix wrong values or misaligned periods.
3. For "Not specified in CIM" on QUANTITATIVE fields (revenue, headcount, etc.), only replace if you can find the actual data in the text.
4. For "Not specified in CIM" on QUALITATIVE fields (statedReasonForSale, postTransactionIntentions, workingCapitalIntensity, typicalContractLength, etc.), ALWAYS provide a professional inference based on document context. For example: if reason for sale isn't stated but the seller is PE-backed, write "Not explicitly stated; likely PE sponsor exit based on [context]". If working capital intensity isn't stated but the company is asset-light, infer "Low working capital intensity consistent with asset-light business model".
5. For thin list fields (keyAttractions, potentialRisks, etc.), provide enhanced content with 5-8 items, 2-3 sentences each.
6. Do NOT hallucinate. Only use information present in or inferable from the document text.
7. Your ENTIRE response must be a valid JSON array. No markdown, no explanation outside the array.`;
  const financialSection = financialContext
    ? `\n\nDETERMINISTIC FINANCIAL DATA (ground truth — do not contradict):\n${financialContext}`
    : '';
  const userPrompt = `Review this first-pass CIM extraction for errors and gaps.
DOCUMENT TEXT (condensed excerpts):
${condensedText}
${financialSection}
EXTRACTED JSON TO REVIEW:
${extractedJson}
Return a JSON array of corrections. If no corrections needed, return [].`;
  let lastError: Error | null = null;
  for (let attempt = 1; attempt <= 2; attempt++) {
    try {
      // Back off before retrying only when the previous failure was a rate limit.
      if (lastError && lastError.message.includes('rate limit')) {
        const retryDelay = Math.min(60000 * attempt, 300000);
        logger.warn(`Quality check: rate limit, waiting ${retryDelay}ms before attempt ${attempt}`);
        await new Promise(resolve => setTimeout(resolve, retryDelay));
      }
      logger.info(`Quality-check LLM attempt ${attempt}/2`, { model: selectedModel });
      const response = await this.callLLM({
        prompt: userPrompt,
        systemPrompt,
        model: selectedModel,
        maxTokens: 8000, // Delta output is much smaller than full JSON
        temperature: 0.1, // Low temperature for consistency
      });
      if (!response.success) {
        throw new Error(response.error || 'Quality-check LLM call failed');
      }
      // Parse the corrections array from the response
      const corrections = this.parseCorrectionsArray(response.content);
      if (corrections !== null) {
        logger.info(`Quality-check completed on attempt ${attempt}`, {
          correctionsCount: corrections.length,
        });
        return {
          success: true,
          corrections: corrections.length > 0 ? corrections : undefined,
          model: selectedModel,
          // NOTE(review): outputTokens uses response character count, not real
          // token usage — appears to match processCIMDocument's convention; confirm.
          cost: this.estimateCost(estimatedTokens + response.content.length, selectedModel),
          inputTokens: estimatedTokens,
          outputTokens: response.content.length,
        };
      } else {
        lastError = new Error('Quality-check response was not a valid JSON array');
        logger.warn(`Quality-check parse failed on attempt ${attempt}`, {
          responsePreview: response.content.substring(0, 200),
        });
      }
    } catch (error) {
      lastError = error instanceof Error ? error : new Error('Unknown error');
      logger.error(`Quality-check attempt ${attempt} failed`, { error: lastError.message });
    }
  }
  // All quality-check attempts failed — return failure (caller falls back to pass-1)
  logger.warn('All quality-check attempts failed — caller should use pass-1 result', {
    lastError: lastError?.message,
  });
  return {
    success: false,
    error: lastError?.message || 'All quality-check attempts failed',
    model: selectedModel,
    cost: 0,
    inputTokens: estimatedTokens,
    outputTokens: 0,
  };
}
/**
* Call the appropriate LLM API
*/
private async callLLM(request: LLMRequest): Promise<LLMResponse> {
// Acquire a concurrency slot (max 2 concurrent LLM calls to avoid Anthropic 429s)
await this.acquireLLMSlot();
try {
// Use configured timeout from config.llm.timeoutMs (default 6 minutes for complex analysis)
// Increased from 3 minutes to handle complex CIM analysis even with RAG reduction
@@ -367,7 +547,7 @@ class LLMService {
...request,
model: normalizedModel
};
// Add a timeout wrapper to prevent hanging
const timeoutPromise = new Promise<LLMResponse>((_, reject) => {
setTimeout(() => reject(new Error(`LLM call timeout after ${timeoutMinutes} minutes`)), timeoutMs);
@@ -380,9 +560,11 @@ class LLMService {
model: normalizedRequest.model || this.defaultModel,
willCallOpenRouter: this.provider === 'openrouter',
willCallAnthropic: this.provider === 'anthropic',
willCallOpenAI: this.provider === 'openai'
willCallOpenAI: this.provider === 'openai',
queuedCalls: LLMService.waitQueue.length,
activeCalls: LLMService.activeCalls
});
if (this.provider === 'openai') {
return await this.callOpenAI(normalizedRequest);
} else if (this.provider === 'openrouter') {
@@ -405,6 +587,8 @@ class LLMService {
content: '',
error: error instanceof Error ? error.message : 'Unknown error',
};
} finally {
this.releaseLLMSlot();
}
}
@@ -910,13 +1094,13 @@ class LLMService {
CRITICAL REQUIREMENTS:
1. **JSON OUTPUT ONLY**: Your entire response MUST be a single, valid JSON object. Do not include any text or explanation before or after the JSON object.
2. **BPCP TEMPLATE FORMAT**: The JSON object MUST follow the BPCP CIM Review Template structure exactly as specified.
3. **COMPLETE ALL FIELDS**: You MUST provide a value for every field. Use "Not specified in CIM" for any information that is not available in the document.
4. **NO PLACEHOLDERS**: Do not use placeholders like "..." or "TBD". Use "Not specified in CIM" instead.
3. **COMPLETE ALL FIELDS**: You MUST provide a value for every field. For qualitative/assessment fields, ALWAYS provide your professional inference or best judgment based on context — do NOT default to "Not specified in CIM" for fields like statedReasonForSale, postTransactionIntentions, workingCapitalIntensity, or typicalContractLength. For example: if reason for sale is not stated but the seller is PE-backed, write "Not explicitly stated; likely PE sponsor exit/liquidity event based on [context]". Only use "Not specified in CIM" for specific quantitative data (exact financial figures, headcounts) that truly cannot be determined or reasonably inferred.
4. **NO PLACEHOLDERS**: Do not use placeholders like "..." or "TBD".
5. **PROFESSIONAL ANALYSIS**: The content should be high-quality and suitable for BPCP's investment committee.
6. **BPCP FOCUS**: Focus on companies in 5+MM EBITDA range in consumer and industrial end markets, with emphasis on M&A, technology & data usage, supply chain and human capital optimization.
7. **BPCP PREFERENCES**: BPCP prefers companies which are founder/family-owned and within driving distance of Cleveland and Charlotte.
8. **EXACT FIELD NAMES**: Use the exact field names and descriptions from the BPCP CIM Review Template.
9. **FINANCIAL DATA**: For financial metrics, use actual numbers if available, otherwise use "Not specified in CIM".
9. **FINANCIAL DATA**: For financial metrics, use actual numbers if available. Only use "Not specified in CIM" for specific financial line items that are truly absent from the document.
10. **VALID JSON**: Ensure your response is valid JSON that can be parsed without errors.
ANALYSIS QUALITY REQUIREMENTS:
@@ -1235,6 +1419,54 @@ SPECIAL REQUIREMENTS FOR PRELIMINARY INVESTMENT THESIS:
`;
}
/**
 * Parse the quality-check LLM response into a corrections array.
 * Expects a JSON array of objects shaped like
 * [{"path":"...", "value":"...", "reason":"..."}]; entries missing a non-empty
 * string path or a string/number value are dropped. Returns null when the
 * response is not parseable JSON or is not an array at all.
 */
private parseCorrectionsArray(content: string): Array<{ path: string; value: string; reason?: string }> | null {
  try {
    // Strip markdown fences if present
    let body = content.trim();
    if (body.startsWith('```')) {
      // Drop the opening fence line (and language tag), then the closing fence.
      body = body.substring(body.indexOf('\n') + 1);
      const closingFence = body.lastIndexOf('```');
      if (closingFence > 0) {
        body = body.substring(0, closingFence);
      }
      body = body.trim();
    }
    const parsed = JSON.parse(body);
    if (!Array.isArray(parsed)) {
      logger.warn('Quality-check response is not an array', { type: typeof parsed });
      return null;
    }
    // Keep only well-formed corrections, normalizing value/reason to strings.
    const valid: Array<{ path: string; value: string; reason?: string }> = [];
    for (const item of parsed as any[]) {
      const hasPath = !!item && typeof item.path === 'string' && item.path.length > 0;
      const hasValue = !!item && (typeof item.value === 'string' || typeof item.value === 'number');
      if (hasPath && hasValue) {
        valid.push({
          path: item.path,
          value: String(item.value),
          reason: item.reason ? String(item.reason) : undefined,
        });
      }
    }
    logger.info('Quality-check corrections parsed', {
      rawCount: parsed.length,
      validCount: valid.length,
    });
    return valid;
  } catch (error) {
    logger.warn('Failed to parse quality-check corrections array', {
      error: error instanceof Error ? error.message : String(error),
      contentPreview: content.substring(0, 200),
    });
    return null;
  }
}
/**
* Extract JSON from LLM response
*/
@@ -2536,33 +2768,34 @@ CRITICAL REQUIREMENTS:
const trimmed = model.trim();
const lower = trimmed.toLowerCase();
const canonicalSonnet46 = 'claude-sonnet-4-6';
const canonicalHaiku45 = 'claude-haiku-4-5';
// Valid Anthropic API model IDs (date-stamped or aliased)
const canonicalSonnet46 = 'claude-sonnet-4-20250514';
const canonicalHaiku45 = 'claude-haiku-4-5-20251001';
const legacySonnet35 = 'claude-3-5-sonnet-latest';
const legacyHaiku35 = 'claude-3-5-haiku-latest';
// Keep modern 4.6/4.5 identifiers as-is
if (lower.includes('sonnet-4-6')) {
// Keep valid date-stamped identifiers as-is
if (lower.match(/sonnet-4-\d{8}/)) {
return trimmed;
}
if (lower.includes('haiku-4-5')) {
if (lower.match(/haiku-4-\d+-\d{8}/)) {
return trimmed;
}
// Map older Claude 4.x labels (4.0/4.5) to the active 4.6/4.5 SKUs
// Map shorthand Claude 4.x labels to valid API model IDs
if (lower.includes('sonnet-4')) {
if (trimmed !== canonicalSonnet46) {
logger.warn('Normalizing Claude Sonnet 4.x model to 4.6', {
logger.warn('Normalizing Claude Sonnet 4.x model to canonical API ID', {
requestedModel: trimmed,
normalizedModel: canonicalSonnet46
});
}
return canonicalSonnet46;
}
if (lower.includes('haiku-4')) {
if (trimmed !== canonicalHaiku45) {
logger.warn('Normalizing Claude Haiku 4.x model to 4.5', {
logger.warn('Normalizing Claude Haiku 4.x model to canonical API ID', {
requestedModel: trimmed,
normalizedModel: canonicalHaiku45
});

View File

@@ -125,7 +125,7 @@ export class OptimizedAgenticRAGProcessor {
/**
* Create intelligent chunks with semantic boundaries
*/
private async createIntelligentChunks(
async createIntelligentChunks(
text: string,
documentId: string,
enableSemanticChunking: boolean = true,
@@ -515,7 +515,7 @@ export class OptimizedAgenticRAGProcessor {
* Store chunks with optimized batching
* Returns the number of API calls made for embeddings
*/
private async storeChunksOptimized(
async storeChunksOptimized(
chunks: ProcessingChunk[],
documentId: string
): Promise<number> {
@@ -1216,20 +1216,27 @@ export class OptimizedAgenticRAGProcessor {
try {
const { parseFinancialsFromText } = await import('./financialTableParser');
const parsedFinancials = parseFinancialsFromText(text);
const parseResult = parseFinancialsFromText(text);
const parsedFinancials = parseResult.data;
if (this.hasStructuredFinancialData(parsedFinancials)) {
deterministicFinancials = parsedFinancials;
deterministicFinancialChunk = this.buildDeterministicFinancialChunk(documentId, parsedFinancials);
logger.info('Deterministic financial parser prepared structured data', {
documentId,
confidence: parseResult.confidence,
tablesFound: parseResult.tablesFound,
fy3: parsedFinancials.fy3,
fy2: parsedFinancials.fy2,
fy1: parsedFinancials.fy1,
ltm: parsedFinancials.ltm
});
} else {
logger.info('Deterministic financial parser did not find structured data', { documentId });
logger.info('Deterministic financial parser did not find structured data', {
documentId,
confidence: parseResult.confidence,
tablesFound: parseResult.tablesFound,
});
}
} catch (parserError) {
logger.warn('Deterministic financial parser failed during initialization', {
@@ -1241,31 +1248,23 @@ export class OptimizedAgenticRAGProcessor {
let totalApiCalls = 0;
const partialResults: Partial<CIMReview>[] = [];
// OPTIMIZED SEQUENTIAL EXECUTION: Combined passes to reduce API calls
logger.info('Starting agentic RAG extraction (optimized with combined passes)');
const sequentialStart = Date.now();
// Run combined passes sequentially to reduce total API calls from 5 to 3
logger.info('Pass 1: Metadata + Financial Data (Combined)');
// PARALLEL EXECUTION: Run all 3 independent extraction passes concurrently
logger.info('Starting agentic RAG extraction (3 passes in parallel)');
const parallelStart = Date.now();
const deterministicPinnedChunks = deterministicFinancialChunk ? [deterministicFinancialChunk] : [];
const pass1CombinedResult = await this.extractPass1CombinedMetadataFinancial(
const [pass1CombinedResult, pass2CombinedResult, pass3Result] = await Promise.all([
this.extractPass1CombinedMetadataFinancial(documentId, text, chunks, deterministicPinnedChunks),
this.extractPass2CombinedMarketBusiness(documentId, text, chunks),
this.extractPass5InvestmentThesis(documentId, text, chunks)
]);
const parallelTime = Date.now() - parallelStart;
logger.info('Parallel extraction completed', {
documentId,
text,
chunks,
deterministicPinnedChunks
);
logger.info('Pass 2: Market Analysis + Business Operations (Combined)');
const pass2CombinedResult = await this.extractPass2CombinedMarketBusiness(documentId, text, chunks);
logger.info('Pass 3: Investment Thesis');
const pass3Result = await this.extractPass5InvestmentThesis(documentId, text, chunks);
const sequentialTime = Date.now() - sequentialStart;
logger.info('Sequential extraction completed', {
documentId,
sequentialTimeMs: sequentialTime,
sequentialTimeSec: (sequentialTime / 1000).toFixed(1)
parallelTimeMs: parallelTime,
parallelTimeSec: (parallelTime / 1000).toFixed(1)
});
partialResults.push(pass1CombinedResult.data);
@@ -2135,49 +2134,53 @@ IMPORTANT EXTRACTION RULES:
const batches = this.groupMissingFieldsIntoBatches(missingFields);
let totalApiCalls = 0;
for (const batch of batches) {
// Create a targeted query for this batch of missing fields
const query = this.createGapFillingQuery(batch);
try {
const { chunks: relevantChunks } = await this.findRelevantChunks(
documentId,
query,
chunks,
30000 // Smaller context for gap-filling
);
// Process all gap-filling batches in parallel
const batchResults = await Promise.all(
batches.map(async (batch) => {
const query = this.createGapFillingQuery(batch);
try {
const { chunks: relevantChunks } = await this.findRelevantChunks(
documentId, query, chunks, 30000
);
if (relevantChunks.length === 0) {
logger.info('No relevant chunks found for gap-filling batch', { batch });
continue;
logger.info('No relevant chunks found for gap-filling batch', { batch });
return { apiCalls: 0, data: null };
}
const reducedText = relevantChunks
.slice(0, 15) // Limit to 15 chunks for gap-filling
.slice(0, 15)
.map((chunk, index) => {
const separator = index > 0 ? '\n\n---\n\n' : '';
return `${separator}[Section ${chunk.chunkIndex + 1}]\n${chunk.content}`;
return `${separator}[Section ${chunk.chunkIndex + 1}]\n${chunk.content}`;
})
.join('\n\n');
// Make LLM call for this batch
const result = await llmService.processCIMDocument(reducedText, 'BPCP CIM Review Template');
totalApiCalls++;
if (result.success && result.jsonOutput) {
// Merge gap-filled data (only for the missing fields)
this.deepMerge(currentData, result.jsonOutput);
logger.info('Gap-filling batch completed', {
batch: batch.slice(0, 5),
batchSize: batch.length
});
logger.info('Gap-filling batch completed', {
batch: batch.slice(0, 5),
batchSize: batch.length
});
return { apiCalls: 1, data: result.jsonOutput };
}
return { apiCalls: 1, data: null };
} catch (error) {
logger.error('Gap-filling batch failed', {
error: error instanceof Error ? error.message : String(error),
batch: batch.slice(0, 5)
});
batch: batch.slice(0, 5)
});
return { apiCalls: 0, data: null };
}
})
);
// Merge results sequentially to maintain deterministic merge order
for (const result of batchResults) {
totalApiCalls += result.apiCalls;
if (result.data) {
this.deepMerge(currentData, result.data);
}
}
@@ -2249,54 +2252,49 @@ Extract exact values, numbers, percentages, names, and detailed information.`;
{ path: 'keyQuestionsNextSteps.missingInformation', name: 'Missing Information' }
];
for (const { path, name } of listFieldPaths) {
// Identify which fields need repair
const fieldsNeedingRepair = listFieldPaths.filter(({ path, name }) => {
const value = this.getNestedField(validatedData, path);
if (value && typeof value === 'string') {
const itemCount = (value.match(/^\d+\.\s/gm) || []).length;
if (itemCount < 5 || itemCount > 8) {
logger.warn(`List field validation failed: ${name}`, {
documentId,
field: path,
currentCount: itemCount,
required: '5-8 items'
documentId, field: path, currentCount: itemCount, required: '5-8 items'
});
return true;
}
logger.debug(`List field validated: ${name}`, { documentId, field: path, itemCount });
}
return false;
});
// Repair all failing fields in parallel
if (fieldsNeedingRepair.length > 0) {
const repairResults = await Promise.all(
fieldsNeedingRepair.map(async ({ path, name }) => {
const value = this.getNestedField(validatedData, path) as string;
const itemCount = (value.match(/^\d+\.\s/gm) || []).length;
try {
// Attempt to repair the list
const repairedValue = await this.repairListField(
name,
value,
itemCount,
documentId,
chunks
);
apiCalls++;
// Update the nested field
this.setNestedField(validatedData, path, repairedValue);
const repairedValue = await this.repairListField(name, value, itemCount, documentId, chunks);
logger.info(`List field repaired: ${name}`, {
documentId,
field: path,
originalCount: itemCount,
documentId, field: path, originalCount: itemCount,
newCount: (repairedValue.match(/^\d+\.\s/gm) || []).length
});
return { path, repairedValue, success: true };
} catch (error) {
logger.error(`Failed to repair list field: ${name}`, {
documentId,
field: path,
documentId, field: path,
error: error instanceof Error ? error.message : String(error)
});
// Continue with original value if repair fails
return { path, repairedValue: null, success: false };
}
} else {
logger.debug(`List field validated: ${name}`, {
documentId,
field: path,
itemCount
});
})
);
for (const result of repairResults) {
if (result.success && result.repairedValue) {
this.setNestedField(validatedData, result.path, result.repairedValue);
apiCalls++;
}
}
}

View File

@@ -79,12 +79,13 @@ class SimpleDocumentProcessor {
let deterministicFinancials: any = null;
try {
const { parseFinancialsFromText } = await import('./financialTableParser');
const parsedFinancials = parseFinancialsFromText(extractedText);
const parseResult = parseFinancialsFromText(extractedText);
const parsedFinancials = parseResult.data;
// Check if parser found structured data
const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue ||
const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue ||
parsedFinancials.fy1?.revenue || parsedFinancials.ltm?.revenue;
if (hasData) {
deterministicFinancials = parsedFinancials;
logger.info('Deterministic financial parser found structured data', {

View File

@@ -0,0 +1,376 @@
import { logger } from '../utils/logger';
import { config } from '../config/env';
import { llmService } from './llmService';
import { CIMReview } from './llmSchemas';
import { documentAiProcessor } from './documentAiProcessor';
/** Result envelope for a single-pass CIM processing run (2 LLM calls). */
interface SinglePassResult {
// True when processing produced usable analysis data.
success: boolean;
// Short executive summary line built from the analysis data ('' on failure).
summary: string;
// Full structured CIM review extracted (and possibly corrected) by the LLMs.
analysisData: CIMReview;
// Fixed strategy tag identifying this processing path.
processingStrategy: 'single_pass_quality_check';
// Wall-clock processing time in milliseconds.
processingTime: number;
// Number of LLM API calls made during processing.
apiCalls: number;
// Error message when success is false; undefined on success.
error: string | undefined;
/** Whether the quality-check LLM pass succeeded. False means pass-1 result is unverified. */
qualityCheckPassed: boolean;
/** Whether the input text was truncated before LLM processing. */
wasTruncated: boolean;
/** Completeness score (0-100) from final validation. */
completenessScore: number;
}
/**
 * Single-pass CIM processor: 2 LLM calls total (extraction + quality check).
 *
 * Flow:
 *   Document AI OCR (if needed) ->
 *   single LLM extraction -> delta quality-check LLM -> completeness gate -> done
 *
 * The deterministic financial parser is intentionally NOT part of this flow
 * (see Step 1 note below).
 *
 * Background: chunk + embed for future search (non-blocking, fire-and-forget).
 */
class SinglePassProcessor {
  /**
   * Main entry point — processes a CIM document with 2 LLM calls.
   *
   * @param documentId - Identifier of the document being processed.
   * @param userId - Owner of the document (forwarded to Document AI).
   * @param text - Pre-extracted text; when empty, Document AI OCR runs first.
   * @param options - fileBuffer/fileName/mimeType, required only when `text` is empty.
   * @returns SinglePassResult; never throws — failures return success=false.
   */
  async processDocument(
    documentId: string,
    userId: string,
    text: string,
    options: {
      fileBuffer?: Buffer;
      fileName?: string;
      mimeType?: string;
    } = {}
  ): Promise<SinglePassResult> {
    const startTime = Date.now();
    let apiCalls = 0;
    try {
      // ── Step 0: Extract text via Document AI if not provided ──
      let extractedText = text;
      if (!extractedText || extractedText.length === 0) {
        const { fileBuffer, fileName, mimeType } = options;
        if (!fileBuffer || !fileName || !mimeType) {
          throw new Error('Missing required options: fileBuffer, fileName, mimeType');
        }
        const docAiResult = await documentAiProcessor.extractTextOnly(
          documentId, userId, fileBuffer, fileName, mimeType
        );
        if (!docAiResult.text || docAiResult.text.length === 0) {
          throw new Error('Document AI text extraction returned no text');
        }
        extractedText = docAiResult.text;
        logger.info('Single-pass: Document AI text extraction completed', {
          documentId, textLength: extractedText.length,
        });
      }
      // ── Step 1: Single LLM extraction ──
      // Note: Deterministic financial parser disabled for single-pass mode.
      // The LLM consistently outperforms it — the parser matches narrative text
      // and footnotes, producing wrong values that override correct LLM output.
      logger.info('Single-pass: Starting LLM extraction (pass 1)', { documentId });
      const pass1Result = await llmService.processCIMDocument(
        extractedText,
        '', // template is embedded in the prompt
      );
      apiCalls++;
      if (!pass1Result.success || !pass1Result.jsonOutput) {
        throw new Error(pass1Result.error || 'LLM extraction (pass 1) failed');
      }
      // Use the same text the LLM actually saw (may be truncated)
      const llmText = pass1Result.processedText || extractedText;
      const wasTruncated = pass1Result.wasTruncated || false;
      const truncationPercent = wasTruncated
        ? (llmText.length / extractedText.length) * 100
        : 100;
      if (wasTruncated) {
        logger.warn('Single-pass: Pass 1 text was truncated', {
          documentId,
          originalLength: extractedText.length,
          truncatedLength: llmText.length,
          truncationPercent: truncationPercent.toFixed(1),
        });
      }
      let analysisData: CIMReview = pass1Result.jsonOutput;
      logger.info('Single-pass: Pass 1 complete', {
        documentId,
        model: pass1Result.model,
        inputTokens: pass1Result.inputTokens,
        outputTokens: pass1Result.outputTokens,
        cost: pass1Result.cost,
        wasTruncated,
      });
      // ── Step 2: Quality-check LLM (pass 2) ──
      // Uses delta-only approach: quality check returns only corrections, not full JSON.
      // Uses Haiku for speed. Sends condensed document (first/last 20K chars) not full text.
      logger.info('Single-pass: Starting quality-check LLM (pass 2)', { documentId });
      let qualityCheckPassed = false;
      try {
        const qualityResult = await llmService.qualityCheckCIMDocument(
          llmText,
          analysisData,
          undefined, // no deterministic financial context
          { wasTruncated, truncationPercent },
        );
        apiCalls++;
        if (qualityResult.success && qualityResult.corrections) {
          // Apply delta corrections to the existing analysis data
          analysisData = this.applyCorrections(analysisData, qualityResult.corrections);
          qualityCheckPassed = true;
          logger.info('Single-pass: Quality check complete — corrections applied', {
            documentId,
            model: qualityResult.model,
            correctionsCount: qualityResult.corrections.length,
            inputTokens: qualityResult.inputTokens,
            outputTokens: qualityResult.outputTokens,
            cost: qualityResult.cost,
          });
        } else if (qualityResult.success && !qualityResult.corrections) {
          // Quality check returned success with no corrections — pass 1 was good
          qualityCheckPassed = true;
          logger.info('Single-pass: Quality check found no corrections needed', { documentId });
        } else {
          logger.error('Single-pass: Quality check returned no valid output — using pass 1 result', {
            documentId,
            error: qualityResult.error,
          });
        }
      } catch (qualityError) {
        // Quality check failed — fall back to pass 1 but log at ERROR
        logger.error('Single-pass: Quality check failed — using unverified pass 1 result', {
          documentId,
          error: qualityError instanceof Error ? qualityError.message : String(qualityError),
        });
      }
      // ── Step 3: Completeness gate ──
      const completenessScore = this.scoreCompleteness(analysisData);
      const COMPLETENESS_THRESHOLD = 85;
      if (completenessScore < COMPLETENESS_THRESHOLD) {
        const processingTime = Date.now() - startTime;
        const msg = `Completeness score ${completenessScore.toFixed(1)}% is below ${COMPLETENESS_THRESHOLD}% threshold`;
        logger.error('Single-pass: Completeness gate FAILED', {
          documentId,
          completenessScore,
          threshold: COMPLETENESS_THRESHOLD,
          qualityCheckPassed,
          wasTruncated,
        });
        return {
          success: false,
          summary: '',
          analysisData,
          processingStrategy: 'single_pass_quality_check',
          processingTime,
          apiCalls,
          error: msg,
          qualityCheckPassed,
          wasTruncated,
          completenessScore,
        };
      }
      // ── Step 4: Fire-and-forget background embedding ──
      if (config.backgroundEmbeddingEnabled !== false) {
        setImmediate(() => {
          this.backgroundChunkAndEmbed(documentId, extractedText).catch((err) => {
            logger.warn('Single-pass: Background embedding failed (non-blocking)', {
              documentId,
              error: err instanceof Error ? err.message : String(err),
            });
          });
        });
      }
      const processingTime = Date.now() - startTime;
      const summary = this.buildSummary(analysisData);
      logger.info('Single-pass: Processing complete', {
        documentId,
        processingTime,
        apiCalls,
        textLength: extractedText.length,
        qualityCheckPassed,
        wasTruncated,
        completenessScore,
      });
      return {
        success: true,
        summary,
        analysisData,
        processingStrategy: 'single_pass_quality_check',
        processingTime,
        apiCalls,
        error: undefined,
        qualityCheckPassed,
        wasTruncated,
        completenessScore,
      };
    } catch (error) {
      const processingTime = Date.now() - startTime;
      const errorMessage = error instanceof Error ? error.message : String(error);
      logger.error('Single-pass: Processing failed', {
        documentId,
        error: errorMessage,
        stack: error instanceof Error ? error.stack : undefined,
        processingTime,
        apiCalls,
      });
      return {
        success: false,
        summary: '',
        analysisData: {} as CIMReview,
        processingStrategy: 'single_pass_quality_check',
        processingTime,
        apiCalls,
        error: `Single-pass processing failed: ${errorMessage}`,
        qualityCheckPassed: false,
        wasTruncated: false,
        completenessScore: 0,
      };
    }
  }
  /**
   * Apply delta corrections from the quality check to the existing analysis data.
   * Each correction has a dot-path (e.g. "financialSummary.financials.fy1.revenue")
   * and a new value. Works on a deep copy; the input object is not mutated.
   */
  private applyCorrections(
    data: CIMReview,
    corrections: Array<{ path: string; value: string; reason?: string }>,
  ): CIMReview {
    const patched = JSON.parse(JSON.stringify(data)) as any;
    let applied = 0;
    for (const { path, value, reason } of corrections) {
      const parts = path.split('.');
      let target = patched;
      for (let i = 0; i < parts.length - 1; i++) {
        // Replace missing OR non-object intermediates. Without the typeof guard,
        // a path that descends through a string/number leaf leaves `target` a
        // primitive and the final assignment throws in strict-mode modules.
        // (typeof null === 'object', so null is checked explicitly first.)
        if (target[parts[i]] === null || typeof target[parts[i]] !== 'object') {
          target[parts[i]] = {};
        }
        target = target[parts[i]];
      }
      const lastKey = parts[parts.length - 1];
      const oldValue = target[lastKey];
      target[lastKey] = value;
      applied++;
      logger.info('Quality-check correction applied', {
        path,
        oldValue: typeof oldValue === 'string' ? oldValue.substring(0, 80) : String(oldValue),
        newValue: typeof value === 'string' ? value.substring(0, 80) : String(value),
        reason: reason || '',
      });
    }
    logger.info('Quality-check corrections summary', { total: corrections.length, applied });
    return patched as CIMReview;
  }
  /**
   * Build a short executive summary from the analysis data.
   */
  private buildSummary(data: CIMReview): string {
    const companyName = data.dealOverview?.targetCompanyName || 'Unknown Company';
    const sector = data.dealOverview?.industrySector || 'Unknown Sector';
    const recommendation = data.keyQuestionsNextSteps?.preliminaryRecommendation || 'No recommendation';
    return `CIM Review: ${companyName} (${sector}) — ${recommendation}`;
  }
  /**
   * Quick completeness score (0-100): counts non-empty string leaf fields.
   * BPCP-internal fields (reviewers, dateReviewed, dateCIMReceived) are excluded.
   * Note: array values are not walked — only string leaves reachable through
   * plain objects contribute to the score.
   */
  private scoreCompleteness(data: CIMReview): number {
    const excludedPaths = new Set([
      'dealOverview.reviewers',
      'dealOverview.dateReviewed',
      'dealOverview.dateCIMReceived',
    ]);
    let total = 0;
    let filled = 0;
    const emptyFields: string[] = [];
    const walk = (obj: any, path: string) => {
      if (obj === null || obj === undefined) return;
      if (typeof obj === 'string') {
        if (excludedPaths.has(path)) return;
        total++;
        const trimmed = obj.trim();
        // A field counts as filled only if it has real content — the literal
        // "Not specified in CIM" placeholder counts as empty.
        if (trimmed !== '' && trimmed !== 'Not specified in CIM') {
          filled++;
        } else {
          emptyFields.push(path);
        }
      } else if (typeof obj === 'object' && !Array.isArray(obj)) {
        for (const key of Object.keys(obj)) {
          walk(obj[key], path ? `${path}.${key}` : key);
        }
      }
    };
    walk(data, '');
    const score = total > 0 ? (filled / total) * 100 : 0;
    // Diagnostic logging: surfaces exactly which fields dragged the score down.
    if (emptyFields.length > 0) {
      logger.info('Completeness scoring: empty fields', {
        score: score.toFixed(1),
        total,
        filled,
        emptyCount: emptyFields.length,
        emptyFields,
      });
    }
    return score;
  }
  /**
   * Background chunking and embedding for future vector search.
   * Runs asynchronously after the main processing completes; errors are
   * logged and swallowed so they can never fail the main flow.
   */
  private async backgroundChunkAndEmbed(documentId: string, text: string): Promise<void> {
    try {
      const { OptimizedAgenticRAGProcessor } = await import('./optimizedAgenticRAGProcessor');
      const ragProcessor = new OptimizedAgenticRAGProcessor();
      logger.info('Single-pass: Starting background chunk & embed', {
        documentId, textLength: text.length,
      });
      const chunks = await ragProcessor.createIntelligentChunks(text, documentId, true, []);
      await ragProcessor.storeChunksOptimized(chunks, documentId);
      logger.info('Single-pass: Background chunk & embed complete', {
        documentId, chunkCount: chunks.length,
      });
    } catch (error) {
      logger.warn('Single-pass: Background chunk & embed failed', {
        documentId,
        error: error instanceof Error ? error.message : String(error),
      });
    }
  }
}
export const singlePassProcessor = new SinglePassProcessor();

View File

@@ -3,6 +3,7 @@ import { config } from '../config/env';
import { optimizedAgenticRAGProcessor } from './optimizedAgenticRAGProcessor';
import { simpleDocumentProcessor } from './simpleDocumentProcessor';
import { documentAiProcessor } from './documentAiProcessor';
import { singlePassProcessor } from './singlePassProcessor';
import { CIMReview } from './llmSchemas';
// Default empty CIMReview object
@@ -111,7 +112,7 @@ interface ProcessingResult {
success: boolean;
summary: string;
analysisData: CIMReview;
processingStrategy: 'document_ai_agentic_rag' | 'simple_full_document';
processingStrategy: 'document_ai_agentic_rag' | 'simple_full_document' | 'single_pass_quality_check';
processingTime: number;
apiCalls: number;
error: string | undefined;
@@ -159,12 +160,101 @@ class UnifiedDocumentProcessor {
} else if (strategy === 'document_ai_agentic_rag') {
logger.info('Unified processor: Routing to RAG processor', { documentId, strategy });
return await this.processWithDocumentAiAgenticRag(documentId, userId, text, options);
} else if (strategy === 'single_pass_quality_check') {
logger.info('Unified processor: Routing to single-pass processor', { documentId, strategy });
return await this.processWithSinglePass(documentId, userId, text, options);
} else {
logger.error('Unified processor: Unsupported strategy', { documentId, strategy });
throw new Error(`Unsupported processing strategy: ${strategy}. Supported: 'simple_full_document', 'document_ai_agentic_rag'`);
throw new Error(`Unsupported processing strategy: ${strategy}. Supported: 'simple_full_document', 'document_ai_agentic_rag', 'single_pass_quality_check'`);
}
}
/**
* Process document using single-pass extraction + quality check (2 LLM calls)
*/
/**
 * Process document using single-pass extraction + quality check (2 LLM calls).
 * Delegates to singlePassProcessor, then enriches the result with a PDF page
 * count (when a PDF buffer is available) and runs the unified processor's
 * own final validation for monitoring purposes.
 */
private async processWithSinglePass(
  documentId: string,
  userId: string,
  text: string,
  options: any
): Promise<ProcessingResult> {
  logger.info('Using single-pass quality-check processing strategy', { documentId });
  const { fileBuffer, fileName, mimeType } = options;
  const result = await singlePassProcessor.processDocument(documentId, userId, text, {
    fileBuffer,
    fileName,
    mimeType,
  });
  if (!result.success) {
    return {
      success: false,
      summary: '',
      analysisData: defaultCIMReview,
      processingStrategy: 'single_pass_quality_check',
      processingTime: result.processingTime,
      apiCalls: result.apiCalls,
      error: result.error,
    };
  }
  // Derive page count from the PDF buffer when one was supplied.
  const looksLikePdf = Boolean(fileBuffer) && typeof fileName === 'string' && fileName.toLowerCase().endsWith('.pdf');
  if (looksLikePdf) {
    try {
      const pdf = require('pdf-parse');
      const parsedPdf = await pdf(fileBuffer);
      if (parsedPdf.numpages > 0) {
        if (!result.analysisData.dealOverview) {
          result.analysisData.dealOverview = {} as any;
        }
        result.analysisData.dealOverview.cimPageCount = String(parsedPdf.numpages);
      }
    } catch (error) {
      logger.warn('Failed to calculate page count from PDF', {
        documentId,
        error: error instanceof Error ? error.message : String(error),
      });
    }
  }
  // Surface quality metadata for monitoring.
  logger.info('Single-pass: Quality metadata', {
    documentId,
    qualityCheckPassed: result.qualityCheckPassed,
    wasTruncated: result.wasTruncated,
    completenessScore: result.completenessScore,
  });
  if (!result.qualityCheckPassed) {
    logger.error('Single-pass: Quality check did NOT pass — extraction is unverified', {
      documentId,
    });
  }
  // Run the unified processor's own validation as well (logging only).
  const finalValidation = this.validateFinalData(result.analysisData);
  if (!finalValidation.isValid) {
    logger.warn('Single-pass: Final validation found issues', {
      documentId,
      completenessScore: finalValidation.completenessScore,
      emptyFields: finalValidation.emptyFields.length,
      lowQualityFields: finalValidation.lowQualityFields.length,
    });
  }
  return {
    success: true,
    summary: result.summary,
    analysisData: result.analysisData,
    processingStrategy: 'single_pass_quality_check',
    processingTime: result.processingTime,
    apiCalls: result.apiCalls,
    error: undefined,
  };
}
/**
* Process document using Document AI + Agentic RAG approach
*/

View File

@@ -119,12 +119,12 @@ class VectorDatabaseService {
rpcParams.filter_document_id = documentId;
}
// Set a timeout for the RPC call (10 seconds)
// Set a timeout for the RPC call (30 seconds)
const searchPromise = this.supabaseClient
.rpc('match_document_chunks', rpcParams);
const timeoutPromise = new Promise<{ data: null; error: { message: string } }>((_, reject) => {
setTimeout(() => reject(new Error('Vector search timeout after 10s')), 10000);
setTimeout(() => reject(new Error('Vector search timeout after 30s')), 30000);
});
let result: any;
@@ -132,8 +132,8 @@ class VectorDatabaseService {
result = await Promise.race([searchPromise, timeoutPromise]);
} catch (timeoutError: any) {
if (timeoutError.message?.includes('timeout')) {
logger.error('Vector search timed out', { documentId, timeout: '10s' });
throw new Error('Vector search timeout after 10s');
logger.error('Vector search timed out', { documentId, timeout: '30s' });
throw new Error('Vector search timeout after 30s');
}
throw timeoutError;
}