Add single-pass CIM processor: 2 LLM calls, ~2.5 min processing
New processing strategy `single_pass_quality_check` replaces the multi-pass agentic RAG pipeline (15-25 min) with a streamlined 2-call approach: 1. Full-document LLM extraction (Sonnet) — single call with complete CIM text 2. Delta quality-check (Haiku) — reviews extraction, returns only corrections Key changes: - New singlePassProcessor.ts with extraction + quality check flow - llmService: qualityCheckCIMDocument() with delta-only corrections array - llmService: improved prompt requiring professional inferences for qualitative fields instead of defaulting to "Not specified in CIM" - Removed deterministic financial parser from single-pass flow (LLM outperforms it — parser matched footnotes and narrative text as financials) - Default strategy changed to single_pass_quality_check - Completeness scoring with diagnostic logging of empty fields Tested on 2 real CIMs: 100% completeness, correct financials, ~150s each. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
-- Fix vector search timeout by adding document_id filtering and optimizing the query
|
||||
-- This prevents searching across all documents and only searches within a specific document
|
||||
-- Fix vector search timeout by pre-filtering on document_id BEFORE vector search
|
||||
-- When document_id is provided, this avoids the full IVFFlat index scan (26K+ rows)
|
||||
-- and instead computes distances on only ~80 chunks per document.
|
||||
|
||||
-- Drop the old function (handle all possible signatures)
|
||||
-- Drop old function signatures
|
||||
DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int);
|
||||
DROP FUNCTION IF EXISTS match_document_chunks(vector(1536), float, int, text);
|
||||
|
||||
-- Create optimized function with document_id filtering
|
||||
-- document_id is TEXT (varchar) in the actual schema
|
||||
-- Create optimized function that branches based on whether document_id is provided
|
||||
CREATE OR REPLACE FUNCTION match_document_chunks (
|
||||
query_embedding vector(1536),
|
||||
match_threshold float,
|
||||
@@ -15,29 +15,51 @@ CREATE OR REPLACE FUNCTION match_document_chunks (
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id UUID,
|
||||
document_id TEXT,
|
||||
document_id VARCHAR(255),
|
||||
content text,
|
||||
metadata JSONB,
|
||||
chunk_index INT,
|
||||
similarity float
|
||||
)
|
||||
LANGUAGE sql STABLE
|
||||
LANGUAGE plpgsql STABLE
|
||||
AS $$
|
||||
SELECT
|
||||
document_chunks.id,
|
||||
document_chunks.document_id,
|
||||
document_chunks.content,
|
||||
document_chunks.metadata,
|
||||
document_chunks.chunk_index,
|
||||
1 - (document_chunks.embedding <=> query_embedding) AS similarity
|
||||
FROM document_chunks
|
||||
WHERE document_chunks.embedding IS NOT NULL
|
||||
AND (filter_document_id IS NULL OR document_chunks.document_id = filter_document_id)
|
||||
AND 1 - (document_chunks.embedding <=> query_embedding) > match_threshold
|
||||
ORDER BY document_chunks.embedding <=> query_embedding
|
||||
LIMIT match_count;
|
||||
BEGIN
|
||||
IF filter_document_id IS NOT NULL THEN
|
||||
-- FAST PATH: Pre-filter by document_id using btree index, then compute
|
||||
-- vector distances on only that document's chunks (~80 rows).
|
||||
-- This completely bypasses the IVFFlat index scan.
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
dc.id,
|
||||
dc.document_id,
|
||||
dc.content,
|
||||
dc.metadata,
|
||||
dc.chunk_index,
|
||||
1 - (dc.embedding <=> query_embedding) AS similarity
|
||||
FROM document_chunks dc
|
||||
WHERE dc.document_id = filter_document_id
|
||||
AND dc.embedding IS NOT NULL
|
||||
AND 1 - (dc.embedding <=> query_embedding) > match_threshold
|
||||
ORDER BY dc.embedding <=> query_embedding
|
||||
LIMIT match_count;
|
||||
ELSE
|
||||
-- SLOW PATH: Search across all documents using IVFFlat index.
|
||||
-- Only used when no document_id filter is provided.
|
||||
RETURN QUERY
|
||||
SELECT
|
||||
dc.id,
|
||||
dc.document_id,
|
||||
dc.content,
|
||||
dc.metadata,
|
||||
dc.chunk_index,
|
||||
1 - (dc.embedding <=> query_embedding) AS similarity
|
||||
FROM document_chunks dc
|
||||
WHERE dc.embedding IS NOT NULL
|
||||
AND 1 - (dc.embedding <=> query_embedding) > match_threshold
|
||||
ORDER BY dc.embedding <=> query_embedding
|
||||
LIMIT match_count;
|
||||
END IF;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Add comment explaining the optimization
|
||||
COMMENT ON FUNCTION match_document_chunks IS 'Optimized vector search that filters by document_id first to prevent timeouts. Always pass filter_document_id when searching within a specific document.';
|
||||
|
||||
COMMENT ON FUNCTION match_document_chunks IS 'Vector search with fast document-scoped path. When filter_document_id is provided, uses btree index to pre-filter (~80 rows) instead of scanning the full IVFFlat index (26K+ rows).';
|
||||
|
||||
@@ -28,10 +28,10 @@ describe('Financial Summary Fixes', () => {
|
||||
|
||||
const result = parseFinancialsFromText(text);
|
||||
|
||||
expect(result.fy3.revenue).toBeDefined();
|
||||
expect(result.fy2.revenue).toBeDefined();
|
||||
expect(result.fy1.revenue).toBeDefined();
|
||||
expect(result.ltm.revenue).toBeDefined();
|
||||
expect(result.data.fy3.revenue).toBeDefined();
|
||||
expect(result.data.fy2.revenue).toBeDefined();
|
||||
expect(result.data.fy1.revenue).toBeDefined();
|
||||
expect(result.data.ltm.revenue).toBeDefined();
|
||||
});
|
||||
|
||||
test('Should parse financial table with year format', () => {
|
||||
@@ -45,7 +45,7 @@ describe('Financial Summary Fixes', () => {
|
||||
const result = parseFinancialsFromText(text);
|
||||
|
||||
// Should assign years to periods (oldest = FY3, newest = FY1)
|
||||
expect(result.fy3.revenue || result.fy2.revenue || result.fy1.revenue).toBeDefined();
|
||||
expect(result.data.fy3.revenue || result.data.fy2.revenue || result.data.fy1.revenue).toBeDefined();
|
||||
});
|
||||
|
||||
test('Should handle tables with only 2-3 periods', () => {
|
||||
@@ -59,7 +59,7 @@ describe('Financial Summary Fixes', () => {
|
||||
const result = parseFinancialsFromText(text);
|
||||
|
||||
// Should still parse what's available
|
||||
expect(result.fy1 || result.fy2).toBeDefined();
|
||||
expect(result.data.fy1 || result.data.fy2).toBeDefined();
|
||||
});
|
||||
|
||||
test('Should extract Gross Profit and Gross Margin', () => {
|
||||
@@ -74,8 +74,8 @@ describe('Financial Summary Fixes', () => {
|
||||
|
||||
const result = parseFinancialsFromText(text);
|
||||
|
||||
expect(result.fy1.grossProfit).toBeDefined();
|
||||
expect(result.fy1.grossMargin).toBeDefined();
|
||||
expect(result.data.fy1.grossProfit).toBeDefined();
|
||||
expect(result.data.fy1.grossMargin).toBeDefined();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -91,10 +91,10 @@ describe('Financial Summary Fixes', () => {
|
||||
const result = parseFinancialsFromText(text);
|
||||
|
||||
// Values should be correctly aligned with their periods
|
||||
expect(result.fy3.revenue).toBeDefined();
|
||||
expect(result.fy2.revenue).toBeDefined();
|
||||
expect(result.fy1.revenue).toBeDefined();
|
||||
expect(result.ltm.revenue).toBeDefined();
|
||||
expect(result.data.fy3.revenue).toBeDefined();
|
||||
expect(result.data.fy2.revenue).toBeDefined();
|
||||
expect(result.data.fy1.revenue).toBeDefined();
|
||||
expect(result.data.ltm.revenue).toBeDefined();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -152,7 +152,8 @@ const envSchema = Joi.object({
|
||||
LOG_FILE: Joi.string().default('logs/app.log'),
|
||||
|
||||
// Processing Strategy
|
||||
PROCESSING_STRATEGY: Joi.string().valid('document_ai_agentic_rag').default('document_ai_agentic_rag'),
|
||||
PROCESSING_STRATEGY: Joi.string().valid('document_ai_agentic_rag', 'single_pass_quality_check').default('single_pass_quality_check'),
|
||||
BACKGROUND_EMBEDDING_ENABLED: Joi.boolean().default(true),
|
||||
|
||||
// Agentic RAG Configuration
|
||||
AGENTIC_RAG_ENABLED: Joi.boolean().default(false),
|
||||
@@ -355,7 +356,8 @@ export const config = {
|
||||
},
|
||||
|
||||
// Processing Strategy
|
||||
processingStrategy: envVars['PROCESSING_STRATEGY'] || 'agentic_rag', // 'chunking' | 'rag' | 'agentic_rag'
|
||||
processingStrategy: envVars['PROCESSING_STRATEGY'] || 'single_pass_quality_check',
|
||||
backgroundEmbeddingEnabled: envVars['BACKGROUND_EMBEDDING_ENABLED'] !== false,
|
||||
enableRAGProcessing: envVars['ENABLE_RAG_PROCESSING'] === 'true',
|
||||
enableProcessingComparison: envVars['ENABLE_PROCESSING_COMPARISON'] === 'true',
|
||||
|
||||
|
||||
@@ -252,7 +252,7 @@ import { onSchedule } from 'firebase-functions/v2/scheduler';
|
||||
export const processDocumentJobs = onSchedule({
|
||||
schedule: 'every 1 minutes', // Minimum interval for Firebase Cloud Scheduler (immediate processing handles most cases)
|
||||
timeoutSeconds: 900, // 15 minutes (max for Gen2 scheduled functions) - increased for large documents
|
||||
memory: '1GiB',
|
||||
memory: '2GiB',
|
||||
retryCount: 2, // Retry up to 2 times on failure before waiting for next scheduled run
|
||||
secrets: [
|
||||
anthropicApiKey,
|
||||
|
||||
@@ -162,13 +162,17 @@ async function testHaikuFinancialExtraction() {
|
||||
// Test 2: Test deterministic parser first
|
||||
console.log('\n📋 Test 2: Deterministic Parser');
|
||||
console.log('-'.repeat(60));
|
||||
const parserResults = parseFinancialsFromText(textToUse);
|
||||
const parseResult = parseFinancialsFromText(textToUse);
|
||||
const parserResults = parseResult.data;
|
||||
console.log('Parser Results:');
|
||||
console.log(` Confidence: ${parseResult.confidence}`);
|
||||
console.log(` Tables Found: ${parseResult.tablesFound}`);
|
||||
console.log(` Matched Rows: ${parseResult.matchedRows}`);
|
||||
console.log(` FY3 Revenue: ${parserResults.fy3.revenue || 'Not found'}`);
|
||||
console.log(` FY2 Revenue: ${parserResults.fy2.revenue || 'Not found'}`);
|
||||
console.log(` FY1 Revenue: ${parserResults.fy1.revenue || 'Not found'}`);
|
||||
console.log(` LTM Revenue: ${parserResults.ltm.revenue || 'Not found'}`);
|
||||
|
||||
|
||||
const parserHasData = !!(parserResults.fy3.revenue || parserResults.fy2.revenue || parserResults.fy1.revenue || parserResults.ltm.revenue);
|
||||
console.log(`\n${parserHasData ? '✅' : '⚠️ '} Parser ${parserHasData ? 'found' : 'did not find'} financial data`);
|
||||
|
||||
|
||||
135
backend/src/scripts/test-single-pass-local.ts
Normal file
135
backend/src/scripts/test-single-pass-local.ts
Normal file
@@ -0,0 +1,135 @@
|
||||
/**
|
||||
* Local test: Run the single-pass processor on real CIM PDFs
|
||||
* without deploying to Firebase.
|
||||
*
|
||||
* Usage:
|
||||
* cd backend
|
||||
* npx ts-node src/scripts/test-single-pass-local.ts
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
// Load .env before any service imports
|
||||
import dotenv from 'dotenv';
|
||||
dotenv.config({ path: path.resolve(__dirname, '../../.env') });
|
||||
|
||||
async function main() {
|
||||
const projectRoot = path.resolve(__dirname, '../../../');
|
||||
const cimFiles = [
|
||||
path.join(projectRoot, 'Project Panther - Confidential Information Memorandum_vBluePoint.pdf'),
|
||||
path.join(projectRoot, 'Project SNAP - CIP_Blue Point.pdf'),
|
||||
];
|
||||
|
||||
const existingFiles = cimFiles.filter(f => fs.existsSync(f));
|
||||
if (existingFiles.length === 0) {
|
||||
console.error('No CIM PDFs found in project root. Expected:');
|
||||
cimFiles.forEach(f => console.error(` ${f}`));
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log(`\n${'='.repeat(70)}`);
|
||||
console.log('SINGLE-PASS PROCESSOR LOCAL TEST');
|
||||
console.log(`${'='.repeat(70)}\n`);
|
||||
console.log(`Found ${existingFiles.length} CIM file(s) to test.\n`);
|
||||
|
||||
// Lazy-import services so .env is loaded first
|
||||
const { singlePassProcessor } = await import('../services/singlePassProcessor');
|
||||
|
||||
for (const filePath of existingFiles) {
|
||||
const fileName = path.basename(filePath);
|
||||
const fileBuffer = fs.readFileSync(filePath);
|
||||
const documentId = `test-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
|
||||
|
||||
console.log(`\n${'─'.repeat(70)}`);
|
||||
console.log(`Processing: ${fileName}`);
|
||||
console.log(` Size: ${(fileBuffer.length / 1024 / 1024).toFixed(1)} MB`);
|
||||
console.log(` Document ID: ${documentId}`);
|
||||
console.log(`${'─'.repeat(70)}\n`);
|
||||
|
||||
const startTime = Date.now();
|
||||
try {
|
||||
const result = await singlePassProcessor.processDocument(
|
||||
documentId,
|
||||
'test-user',
|
||||
'', // text will be extracted from fileBuffer
|
||||
{ fileBuffer, fileName, mimeType: 'application/pdf' },
|
||||
);
|
||||
|
||||
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
|
||||
|
||||
console.log(`\n--- RESULT: ${fileName} ---`);
|
||||
console.log(` Success: ${result.success}`);
|
||||
console.log(` Processing time: ${elapsed}s`);
|
||||
console.log(` API calls: ${result.apiCalls}`);
|
||||
console.log(` Quality check: ${result.qualityCheckPassed ? 'PASSED' : 'FAILED (unverified)'}`);
|
||||
console.log(` Was truncated: ${result.wasTruncated}`);
|
||||
console.log(` Completeness: ${result.completenessScore.toFixed(1)}%`);
|
||||
console.log(` Error: ${result.error || 'none'}`);
|
||||
|
||||
if (result.success && result.analysisData) {
|
||||
const data = result.analysisData;
|
||||
console.log(`\n --- KEY EXTRACTED DATA ---`);
|
||||
console.log(` Company: ${data.dealOverview?.targetCompanyName || '???'}`);
|
||||
console.log(` Industry: ${data.dealOverview?.industrySector || '???'}`);
|
||||
console.log(` Geography: ${data.dealOverview?.geography || '???'}`);
|
||||
console.log(` Page count: ${data.dealOverview?.cimPageCount || '???'}`);
|
||||
console.log(` Employees: ${data.dealOverview?.employeeCount || '???'}`);
|
||||
|
||||
const fin = data.financialSummary?.financials;
|
||||
if (fin) {
|
||||
console.log(`\n --- FINANCIALS ---`);
|
||||
for (const period of ['fy3', 'fy2', 'fy1', 'ltm'] as const) {
|
||||
const p = fin[period];
|
||||
if (p) {
|
||||
console.log(` ${period.toUpperCase().padEnd(4)} Revenue: ${(p.revenue || 'N/A').padEnd(15)} EBITDA: ${(p.ebitda || 'N/A').padEnd(15)} Margin: ${p.ebitdaMargin || 'N/A'}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n --- INVESTMENT THESIS (preview) ---`);
|
||||
const thesis = data.preliminaryInvestmentThesis;
|
||||
if (thesis?.keyAttractions) {
|
||||
// Count items by numbered list, bullets, or newline-separated entries
|
||||
const text = thesis.keyAttractions;
|
||||
const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length;
|
||||
console.log(` Key Attractions: ${count} items (${text.length} chars)`);
|
||||
}
|
||||
if (thesis?.potentialRisks) {
|
||||
const text = thesis.potentialRisks;
|
||||
const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length;
|
||||
console.log(` Potential Risks: ${count} items (${text.length} chars)`);
|
||||
}
|
||||
if (thesis?.valueCreationLevers) {
|
||||
const text = thesis.valueCreationLevers;
|
||||
const count = (text.match(/\d+[\.\)]\s/g) || []).length || text.split(/\n/).filter((l: string) => l.trim().length > 10).length;
|
||||
console.log(` Value Creation: ${count} items (${text.length} chars)`);
|
||||
}
|
||||
|
||||
const recommendation = data.keyQuestionsNextSteps?.preliminaryRecommendation;
|
||||
console.log(` Recommendation: ${recommendation || '???'}`);
|
||||
|
||||
// Write full output to a JSON file for inspection
|
||||
const outPath = path.join(projectRoot, `test-output-${fileName.replace(/\s+/g, '_').replace('.pdf', '')}.json`);
|
||||
fs.writeFileSync(outPath, JSON.stringify(result, null, 2));
|
||||
console.log(`\n Full output written to: ${outPath}`);
|
||||
}
|
||||
} catch (error) {
|
||||
const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
|
||||
console.error(`\n FATAL ERROR after ${elapsed}s:`);
|
||||
console.error(` ${error instanceof Error ? error.message : String(error)}`);
|
||||
if (error instanceof Error && error.stack) {
|
||||
console.error(` ${error.stack.split('\n').slice(1, 4).join('\n ')}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\n${'='.repeat(70)}`);
|
||||
console.log('TEST COMPLETE');
|
||||
console.log(`${'='.repeat(70)}\n`);
|
||||
}
|
||||
|
||||
main().catch(err => {
|
||||
console.error('Unhandled error:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
@@ -16,6 +16,16 @@ export interface ParsedFinancials {
|
||||
ltm: FinancialPeriod;
|
||||
}
|
||||
|
||||
/**
 * Result of the deterministic financial-table parser, pairing the parsed
 * period buckets with diagnostics so callers can judge how trustworthy
 * the extraction is before using it.
 */
export interface FinancialParseResult {
  /** Parsed period buckets (fy3/fy2/fy1/ltm); buckets are empty objects when nothing matched. */
  data: ParsedFinancials;
  /** Number of candidate financial table headers found in the document. */
  tablesFound: number;
  /** Number of financial metric rows successfully matched (revenue, EBITDA, etc.). */
  matchedRows: number;
  /** Confidence: 'high' if both revenue+EBITDA found, 'low' if partial, 'none' if nothing. */
  confidence: 'high' | 'low' | 'none';
}
|
||||
|
||||
type Bucket = keyof ParsedFinancials;
|
||||
|
||||
const PERIOD_TOKEN_REGEX = /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|(?:FY[-\s]?[1234])|(?:LTM|TTM))\b/gi;
|
||||
@@ -270,7 +280,7 @@ function assignTokensToBuckets(
|
||||
}
|
||||
}
|
||||
|
||||
export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
export function parseFinancialsFromText(fullText: string): FinancialParseResult {
|
||||
const startTime = Date.now();
|
||||
const result: ParsedFinancials = {
|
||||
fy3: {},
|
||||
@@ -278,12 +288,13 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
fy1: {},
|
||||
ltm: {}
|
||||
};
|
||||
let candidateHeaders = 0;
|
||||
|
||||
try {
|
||||
const text = fullText.replace(/\u00A0/g, ' ');
|
||||
const lines = text.split('\n').map((line) => line.trim()).filter(Boolean);
|
||||
if (lines.length === 0) {
|
||||
return result;
|
||||
return { data: result, tablesFound: 0, matchedRows: 0, confidence: 'none' };
|
||||
}
|
||||
|
||||
let bestHeaderIndex = -1;
|
||||
@@ -295,9 +306,10 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
for (let i = 0; i < lines.length; i++) {
|
||||
const tokens = tokenizePeriodHeaders(lines[i]);
|
||||
if (tokens.length >= 2) {
|
||||
candidateHeaders++;
|
||||
const buckets = yearTokensToBuckets(tokens);
|
||||
const validBuckets = buckets.filter(Boolean).length;
|
||||
|
||||
|
||||
// Score this header: prioritize headers followed by financial metric rows
|
||||
let score = validBuckets;
|
||||
|
||||
@@ -381,9 +393,10 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
if (bestHeaderIndex === -1 || bestBuckets.filter(Boolean).length === 0) {
|
||||
logger.info('Financial parser could not identify year header, returning empty result', {
|
||||
totalLines: lines.length,
|
||||
candidateHeaders,
|
||||
sampleLines: lines.slice(0, 20).join(' | ')
|
||||
});
|
||||
return result;
|
||||
return { data: result, tablesFound: candidateHeaders, matchedRows: 0, confidence: 'none' };
|
||||
}
|
||||
|
||||
logger.info('Financial parser selected best header', {
|
||||
@@ -511,19 +524,31 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
buckets: bestBuckets.map((bucket) => bucket || 'skip')
|
||||
});
|
||||
|
||||
// Determine confidence based on what was found
|
||||
const hasRevenue = !!(result.fy1.revenue || result.fy2.revenue || result.ltm.revenue);
|
||||
const hasEbitda = !!(result.fy1.ebitda || result.fy2.ebitda || result.ltm.ebitda);
|
||||
const confidence: 'high' | 'low' | 'none' =
|
||||
hasRevenue && hasEbitda ? 'high' :
|
||||
hasRevenue || hasEbitda ? 'low' : 'none';
|
||||
|
||||
logger.info('Financial parser results', {
|
||||
elapsedMs: Date.now() - startTime,
|
||||
headerLine: lines[bestHeaderIndex],
|
||||
candidateHeaders,
|
||||
matchedRows,
|
||||
confidence,
|
||||
fy3: result.fy3,
|
||||
fy2: result.fy2,
|
||||
fy1: result.fy1,
|
||||
ltm: result.ltm
|
||||
});
|
||||
|
||||
return { data: result, tablesFound: candidateHeaders, matchedRows, confidence };
|
||||
} catch (error) {
|
||||
logger.warn('Financial parser failed', { error: error instanceof Error ? error.message : String(error) });
|
||||
}
|
||||
|
||||
return result;
|
||||
return { data: result, tablesFound: candidateHeaders, matchedRows: 0, confidence: 'none' };
|
||||
}
|
||||
const containsMoneyOrPercent = (text: string): boolean => {
|
||||
resetRegex(MONEY_REGEX);
|
||||
|
||||
@@ -133,10 +133,11 @@ export class JobProcessorService {
|
||||
await ProcessingJobModel.markAsProcessing(jobId);
|
||||
jobStatusUpdated = true; // Track that we've updated status
|
||||
|
||||
// Add timeout protection (14 minutes, leaving 1 minute buffer before scheduled function timeout)
|
||||
const processingTimeout = 14 * 60 * 1000; // 14 minutes in milliseconds
|
||||
// Add timeout protection (28 minutes for API immediate processing, 14 for scheduled function)
|
||||
// API endpoint has 30-min Cloud Run timeout; scheduled function has 15 min
|
||||
const processingTimeout = 28 * 60 * 1000; // 28 minutes in milliseconds
|
||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||
timeoutId = setTimeout(() => reject(new Error('Job processing timeout after 14 minutes')), processingTimeout);
|
||||
timeoutId = setTimeout(() => reject(new Error('Job processing timeout after 28 minutes')), processingTimeout);
|
||||
});
|
||||
|
||||
// Wrap processing logic in Promise.race with timeout
|
||||
@@ -217,10 +218,15 @@ export class JobProcessorService {
|
||||
}
|
||||
|
||||
// Process the document
|
||||
// Strategy priority: job options > env config > default
|
||||
const { config: envConfig } = await import('../config/env');
|
||||
const processingStrategy = job.options?.strategy || envConfig.processingStrategy || 'document_ai_agentic_rag';
|
||||
|
||||
logger.info('Starting document processing', {
|
||||
jobId,
|
||||
documentId: job.document_id,
|
||||
strategy: job.options?.strategy || 'document_ai_agentic_rag',
|
||||
strategy: processingStrategy,
|
||||
strategySource: job.options?.strategy ? 'job_options' : (envConfig.processingStrategy ? 'env_config' : 'default'),
|
||||
});
|
||||
|
||||
const result = await unifiedDocumentProcessor.processDocument(
|
||||
@@ -228,7 +234,7 @@ export class JobProcessorService {
|
||||
job.user_id,
|
||||
'', // Text will be extracted from fileBuffer
|
||||
{
|
||||
strategy: job.options?.strategy || 'document_ai_agentic_rag',
|
||||
strategy: processingStrategy,
|
||||
fileBuffer,
|
||||
fileName: document.original_file_name,
|
||||
mimeType: 'application/pdf',
|
||||
|
||||
@@ -32,6 +32,12 @@ export interface CIMAnalysisResult {
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
validationIssues?: z.ZodIssue[];
|
||||
/** The text actually sent to the LLM (may be truncated from the original). */
|
||||
processedText?: string;
|
||||
/** True if the input text was truncated to fit token limits. */
|
||||
wasTruncated?: boolean;
|
||||
/** Delta corrections from quality-check (delta mode only). */
|
||||
corrections?: Array<{ path: string; value: string; reason?: string }>;
|
||||
}
|
||||
|
||||
class LLMService {
|
||||
@@ -41,6 +47,34 @@ class LLMService {
|
||||
private maxTokens: number;
|
||||
private temperature: number;
|
||||
|
||||
  // Concurrency limiter to prevent Anthropic 429 rate limit errors.
  // At most MAX_CONCURRENT_LLM_CALLS (currently 1) API call runs at a time;
  // additional calls queue up in FIFO order and wait for a slot.
  // NOTE: these are static, so the limit is shared across every
  // LLMService instance in the process.
  private static readonly MAX_CONCURRENT_LLM_CALLS = 1;
  private static activeCalls = 0;
  private static waitQueue: Array<() => void> = [];

  /**
   * Acquire a concurrency slot: resolves immediately when a slot is free,
   * otherwise parks the caller in waitQueue until releaseLLMSlot() wakes it.
   * The queued resolver performs the activeCalls++ itself, so the slot is
   * handed directly from releaser to the next waiter with no window in
   * which another caller could claim it (single-threaded event loop).
   */
  private async acquireLLMSlot(): Promise<void> {
    if (LLMService.activeCalls < LLMService.MAX_CONCURRENT_LLM_CALLS) {
      LLMService.activeCalls++;
      return;
    }
    // Wait for a slot to free up
    return new Promise<void>((resolve) => {
      LLMService.waitQueue.push(() => {
        LLMService.activeCalls++;
        resolve();
      });
    });
  }

  /**
   * Release a concurrency slot and wake the oldest waiter, if any.
   * Must be called exactly once per successful acquireLLMSlot();
   * callers should pair the two with try/finally so a thrown LLM error
   * cannot leak the slot — TODO confirm all call sites do this.
   */
  private releaseLLMSlot(): void {
    LLMService.activeCalls--;
    if (LLMService.waitQueue.length > 0) {
      const next = LLMService.waitQueue.shift()!;
      next();
    }
  }
|
||||
|
||||
constructor() {
|
||||
// CRITICAL DEBUG: Log what we're reading from config
|
||||
logger.info('LLM Service constructor - Reading provider from config', {
|
||||
@@ -313,6 +347,8 @@ class LLMService {
|
||||
cost: this.estimateCost(estimatedTokens + response.content.length, selectedModel),
|
||||
inputTokens: estimatedTokens,
|
||||
outputTokens: response.content.length,
|
||||
processedText: processedText,
|
||||
wasTruncated,
|
||||
};
|
||||
} else {
|
||||
lastError = new Error(`JSON validation failed: ${validation.error.errors.map(e => e.message).join(', ')}`);
|
||||
@@ -353,10 +389,154 @@ class LLMService {
|
||||
throw lastError || new Error('All LLM processing attempts failed');
|
||||
}
|
||||
|
||||
  /**
   * Quality-check a first-pass CIM extraction (pass 2 of the single-pass
   * strategy).
   *
   * A second LLM call reviews the pass-1 JSON against the original document
   * text and returns ONLY a delta: an array of {path, value, reason}
   * corrections to apply on top of the pass-1 result. Uses the fast model
   * (Haiku) with two attempts; on total failure it returns success:false so
   * the caller falls back to the unmodified pass-1 result.
   *
   * @param text             Full extracted document text (condensed to
   *                         first/last 20K chars before sending).
   * @param extractedData    The pass-1 extraction to review.
   * @param financialContext Optional deterministic financial data treated as
   *                         ground truth in the prompt.
   * @param options          wasTruncated flags that pass 1 saw truncated
   *                         input, which tightens the anti-hallucination note.
   * @returns CIMAnalysisResult whose `corrections` is the delta array
   *          (undefined when the reviewer found nothing to fix).
   */
  async qualityCheckCIMDocument(
    text: string,
    extractedData: CIMReview,
    financialContext?: string,
    options?: { wasTruncated?: boolean; truncationPercent?: number },
  ): Promise<CIMAnalysisResult & { corrections?: Array<{ path: string; value: string; reason?: string }> }> {
    logger.info('Starting quality-check LLM call (delta mode)', {
      textLength: text.length,
      extractedDataKeys: Object.keys(extractedData),
      wasTruncated: options?.wasTruncated,
    });

    // Condense the document: send first 20K + last 20K chars instead of full text.
    // The quality check is reviewing the extraction, not re-reading the whole CIM.
    const MAX_CONTEXT_CHARS = 40000;
    let condensedText: string;
    if (text.length > MAX_CONTEXT_CHARS) {
      const halfSize = Math.floor(MAX_CONTEXT_CHARS / 2);
      condensedText = text.substring(0, halfSize) +
        `\n\n[... ${text.length - MAX_CONTEXT_CHARS} characters omitted for brevity ...]\n\n` +
        text.substring(text.length - halfSize);
      logger.info('Quality-check: condensed document text', {
        originalLength: text.length,
        condensedLength: condensedText.length,
      });
    } else {
      condensedText = text;
    }

    const extractedJson = JSON.stringify(extractedData, null, 2);
    // Use fast model (Haiku) for quality check — it's a review task, not original extraction
    const selectedModel = config.llm.fastModel || config.llm.model;
    // NOTE(review): token estimate covers the condensed text + JSON only,
    // not the fixed prompt scaffolding — treat cost figures as approximate.
    const estimatedTokens = this.estimateTokenCount(condensedText) + this.estimateTokenCount(extractedJson);

    const truncationNote = options?.wasTruncated
      ? `\nNote: The original document was truncated before the first-pass extraction. Do NOT invent data not present in the extracted JSON or the text excerpts below.`
      : '';

    const systemPrompt = `You are a senior investment analyst performing a quality review of a first-pass CIM extraction. Your job is to identify ERRORS and GAPS in the extracted data.
${truncationNote}
Return ONLY a JSON array of corrections. Each correction is an object with:
- "path": dot-notation path to the field (e.g. "financialSummary.financials.fy1.revenue")
- "value": the corrected value (string)
- "reason": brief explanation of why this correction is needed

RULES:
1. Only include fields that need changes. If the extraction is correct, return an empty array: []
2. For financial figures, cross-check against the document text. Fix wrong values or misaligned periods.
3. For "Not specified in CIM" on QUANTITATIVE fields (revenue, headcount, etc.), only replace if you can find the actual data in the text.
4. For "Not specified in CIM" on QUALITATIVE fields (statedReasonForSale, postTransactionIntentions, workingCapitalIntensity, typicalContractLength, etc.), ALWAYS provide a professional inference based on document context. For example: if reason for sale isn't stated but the seller is PE-backed, write "Not explicitly stated; likely PE sponsor exit based on [context]". If working capital intensity isn't stated but the company is asset-light, infer "Low working capital intensity consistent with asset-light business model".
5. For thin list fields (keyAttractions, potentialRisks, etc.), provide enhanced content with 5-8 items, 2-3 sentences each.
6. Do NOT hallucinate. Only use information present in or inferable from the document text.
7. Your ENTIRE response must be a valid JSON array. No markdown, no explanation outside the array.`;

    const financialSection = financialContext
      ? `\n\nDETERMINISTIC FINANCIAL DATA (ground truth — do not contradict):\n${financialContext}`
      : '';

    const userPrompt = `Review this first-pass CIM extraction for errors and gaps.

DOCUMENT TEXT (condensed excerpts):
${condensedText}
${financialSection}

EXTRACTED JSON TO REVIEW:
${extractedJson}

Return a JSON array of corrections. If no corrections needed, return [].`;

    let lastError: Error | null = null;

    // Two attempts max; a prior rate-limit failure triggers a backoff wait
    // before the retry (60s on attempt 2, capped at 5 min).
    // NOTE(review): detection relies on the error message containing the
    // literal "rate limit" — verify against the provider's actual error text.
    for (let attempt = 1; attempt <= 2; attempt++) {
      try {
        if (lastError && lastError.message.includes('rate limit')) {
          const retryDelay = Math.min(60000 * attempt, 300000);
          logger.warn(`Quality check: rate limit, waiting ${retryDelay}ms before attempt ${attempt}`);
          await new Promise(resolve => setTimeout(resolve, retryDelay));
        }

        logger.info(`Quality-check LLM attempt ${attempt}/2`, { model: selectedModel });

        const response = await this.callLLM({
          prompt: userPrompt,
          systemPrompt,
          model: selectedModel,
          maxTokens: 8000, // Delta output is much smaller than full JSON
          temperature: 0.1, // Low temperature for consistency
        });

        if (!response.success) {
          throw new Error(response.error || 'Quality-check LLM call failed');
        }

        // Parse the corrections array from the response
        // (null means the content was not a parseable JSON array).
        const corrections = this.parseCorrectionsArray(response.content);

        if (corrections !== null) {
          logger.info(`Quality-check completed on attempt ${attempt}`, {
            correctionsCount: corrections.length,
          });
          return {
            success: true,
            // Empty delta is reported as undefined, not [].
            corrections: corrections.length > 0 ? corrections : undefined,
            model: selectedModel,
            // NOTE(review): outputTokens is a character count, mirroring the
            // convention used by the main extraction path's result.
            cost: this.estimateCost(estimatedTokens + response.content.length, selectedModel),
            inputTokens: estimatedTokens,
            outputTokens: response.content.length,
          };
        } else {
          lastError = new Error('Quality-check response was not a valid JSON array');
          logger.warn(`Quality-check parse failed on attempt ${attempt}`, {
            responsePreview: response.content.substring(0, 200),
          });
        }
      } catch (error) {
        lastError = error instanceof Error ? error : new Error('Unknown error');
        logger.error(`Quality-check attempt ${attempt} failed`, { error: lastError.message });
      }
    }

    // All quality-check attempts failed — return failure (caller falls back to pass-1)
    logger.warn('All quality-check attempts failed — caller should use pass-1 result', {
      lastError: lastError?.message,
    });

    return {
      success: false,
      error: lastError?.message || 'All quality-check attempts failed',
      model: selectedModel,
      cost: 0,
      inputTokens: estimatedTokens,
      outputTokens: 0,
    };
  }
|
||||
|
||||
/**
|
||||
* Call the appropriate LLM API
|
||||
*/
|
||||
private async callLLM(request: LLMRequest): Promise<LLMResponse> {
|
||||
// Acquire a concurrency slot (max 2 concurrent LLM calls to avoid Anthropic 429s)
|
||||
await this.acquireLLMSlot();
|
||||
try {
|
||||
// Use configured timeout from config.llm.timeoutMs (default 6 minutes for complex analysis)
|
||||
// Increased from 3 minutes to handle complex CIM analysis even with RAG reduction
|
||||
@@ -367,7 +547,7 @@ class LLMService {
|
||||
...request,
|
||||
model: normalizedModel
|
||||
};
|
||||
|
||||
|
||||
// Add a timeout wrapper to prevent hanging
|
||||
const timeoutPromise = new Promise<LLMResponse>((_, reject) => {
|
||||
setTimeout(() => reject(new Error(`LLM call timeout after ${timeoutMinutes} minutes`)), timeoutMs);
|
||||
@@ -380,9 +560,11 @@ class LLMService {
|
||||
model: normalizedRequest.model || this.defaultModel,
|
||||
willCallOpenRouter: this.provider === 'openrouter',
|
||||
willCallAnthropic: this.provider === 'anthropic',
|
||||
willCallOpenAI: this.provider === 'openai'
|
||||
willCallOpenAI: this.provider === 'openai',
|
||||
queuedCalls: LLMService.waitQueue.length,
|
||||
activeCalls: LLMService.activeCalls
|
||||
});
|
||||
|
||||
|
||||
if (this.provider === 'openai') {
|
||||
return await this.callOpenAI(normalizedRequest);
|
||||
} else if (this.provider === 'openrouter') {
|
||||
@@ -405,6 +587,8 @@ class LLMService {
|
||||
content: '',
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
};
|
||||
} finally {
|
||||
this.releaseLLMSlot();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -910,13 +1094,13 @@ class LLMService {
|
||||
CRITICAL REQUIREMENTS:
|
||||
1. **JSON OUTPUT ONLY**: Your entire response MUST be a single, valid JSON object. Do not include any text or explanation before or after the JSON object.
|
||||
2. **BPCP TEMPLATE FORMAT**: The JSON object MUST follow the BPCP CIM Review Template structure exactly as specified.
|
||||
3. **COMPLETE ALL FIELDS**: You MUST provide a value for every field. Use "Not specified in CIM" for any information that is not available in the document.
|
||||
4. **NO PLACEHOLDERS**: Do not use placeholders like "..." or "TBD". Use "Not specified in CIM" instead.
|
||||
3. **COMPLETE ALL FIELDS**: You MUST provide a value for every field. For qualitative/assessment fields, ALWAYS provide your professional inference or best judgment based on context — do NOT default to "Not specified in CIM" for fields like statedReasonForSale, postTransactionIntentions, workingCapitalIntensity, or typicalContractLength. For example: if reason for sale is not stated but the seller is PE-backed, write "Not explicitly stated; likely PE sponsor exit/liquidity event based on [context]". Only use "Not specified in CIM" for specific quantitative data (exact financial figures, headcounts) that truly cannot be determined or reasonably inferred.
|
||||
4. **NO PLACEHOLDERS**: Do not use placeholders like "..." or "TBD".
|
||||
5. **PROFESSIONAL ANALYSIS**: The content should be high-quality and suitable for BPCP's investment committee.
|
||||
6. **BPCP FOCUS**: Focus on companies in 5+MM EBITDA range in consumer and industrial end markets, with emphasis on M&A, technology & data usage, supply chain and human capital optimization.
|
||||
7. **BPCP PREFERENCES**: BPCP prefers companies which are founder/family-owned and within driving distance of Cleveland and Charlotte.
|
||||
8. **EXACT FIELD NAMES**: Use the exact field names and descriptions from the BPCP CIM Review Template.
|
||||
9. **FINANCIAL DATA**: For financial metrics, use actual numbers if available, otherwise use "Not specified in CIM".
|
||||
9. **FINANCIAL DATA**: For financial metrics, use actual numbers if available. Only use "Not specified in CIM" for specific financial line items that are truly absent from the document.
|
||||
10. **VALID JSON**: Ensure your response is valid JSON that can be parsed without errors.
|
||||
|
||||
ANALYSIS QUALITY REQUIREMENTS:
|
||||
@@ -1235,6 +1419,54 @@ SPECIAL REQUIREMENTS FOR PRELIMINARY INVESTMENT THESIS:
|
||||
`;
|
||||
}
|
||||
|
||||
/**
 * Parse a corrections array from the quality-check LLM response.
 * Expects a JSON array like: [{"path":"...", "value":"...", "reason":"..."}]
 *
 * Strips surrounding markdown code fences, validates that each entry has a
 * non-empty string `path` and a string/number `value`, and normalizes values
 * to strings. Returns null if the content cannot be parsed as a JSON array.
 */
private parseCorrectionsArray(content: string): Array<{ path: string; value: string; reason?: string }> | null {
  try {
    // Strip markdown fences if present (```json ... ``` or bare ``` ... ```)
    let cleaned = content.trim();
    if (cleaned.startsWith('```')) {
      const firstNewline = cleaned.indexOf('\n');
      // Fix: a single-line fenced response (e.g. "```[]```") has no newline;
      // previously substring(-1 + 1) kept the opening fence and JSON.parse
      // always failed. Drop just the fence token in that case.
      cleaned = firstNewline >= 0 ? cleaned.substring(firstNewline + 1) : cleaned.substring(3);
      const lastFence = cleaned.lastIndexOf('```');
      if (lastFence > 0) cleaned = cleaned.substring(0, lastFence);
      cleaned = cleaned.trim();
    }

    const parsed = JSON.parse(cleaned);
    if (!Array.isArray(parsed)) {
      logger.warn('Quality-check response is not an array', { type: typeof parsed });
      return null;
    }

    // Validate each correction has at minimum path + value; coerce value to string
    const valid = parsed
      .filter((item: any) =>
        item && typeof item.path === 'string' && item.path.length > 0 &&
        (typeof item.value === 'string' || typeof item.value === 'number')
      )
      .map((item: any) => ({
        path: item.path,
        value: String(item.value),
        reason: item.reason ? String(item.reason) : undefined,
      }));

    logger.info('Quality-check corrections parsed', {
      rawCount: parsed.length,
      validCount: valid.length,
    });

    return valid;
  } catch (error) {
    logger.warn('Failed to parse quality-check corrections array', {
      error: error instanceof Error ? error.message : String(error),
      contentPreview: content.substring(0, 200),
    });
    return null;
  }
}
|
||||
|
||||
/**
|
||||
* Extract JSON from LLM response
|
||||
*/
|
||||
@@ -2536,33 +2768,34 @@ CRITICAL REQUIREMENTS:
|
||||
|
||||
const trimmed = model.trim();
|
||||
const lower = trimmed.toLowerCase();
|
||||
const canonicalSonnet46 = 'claude-sonnet-4-6';
|
||||
const canonicalHaiku45 = 'claude-haiku-4-5';
|
||||
// Valid Anthropic API model IDs (date-stamped or aliased)
|
||||
const canonicalSonnet46 = 'claude-sonnet-4-20250514';
|
||||
const canonicalHaiku45 = 'claude-haiku-4-5-20251001';
|
||||
const legacySonnet35 = 'claude-3-5-sonnet-latest';
|
||||
const legacyHaiku35 = 'claude-3-5-haiku-latest';
|
||||
|
||||
// Keep modern 4.6/4.5 identifiers as-is
|
||||
if (lower.includes('sonnet-4-6')) {
|
||||
|
||||
// Keep valid date-stamped identifiers as-is
|
||||
if (lower.match(/sonnet-4-\d{8}/)) {
|
||||
return trimmed;
|
||||
}
|
||||
if (lower.includes('haiku-4-5')) {
|
||||
if (lower.match(/haiku-4-\d+-\d{8}/)) {
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
// Map older Claude 4.x labels (4.0/4.5) to the active 4.6/4.5 SKUs
|
||||
|
||||
// Map shorthand Claude 4.x labels to valid API model IDs
|
||||
if (lower.includes('sonnet-4')) {
|
||||
if (trimmed !== canonicalSonnet46) {
|
||||
logger.warn('Normalizing Claude Sonnet 4.x model to 4.6', {
|
||||
logger.warn('Normalizing Claude Sonnet 4.x model to canonical API ID', {
|
||||
requestedModel: trimmed,
|
||||
normalizedModel: canonicalSonnet46
|
||||
});
|
||||
}
|
||||
return canonicalSonnet46;
|
||||
}
|
||||
|
||||
|
||||
if (lower.includes('haiku-4')) {
|
||||
if (trimmed !== canonicalHaiku45) {
|
||||
logger.warn('Normalizing Claude Haiku 4.x model to 4.5', {
|
||||
logger.warn('Normalizing Claude Haiku 4.x model to canonical API ID', {
|
||||
requestedModel: trimmed,
|
||||
normalizedModel: canonicalHaiku45
|
||||
});
|
||||
|
||||
@@ -125,7 +125,7 @@ export class OptimizedAgenticRAGProcessor {
|
||||
/**
|
||||
* Create intelligent chunks with semantic boundaries
|
||||
*/
|
||||
private async createIntelligentChunks(
|
||||
async createIntelligentChunks(
|
||||
text: string,
|
||||
documentId: string,
|
||||
enableSemanticChunking: boolean = true,
|
||||
@@ -515,7 +515,7 @@ export class OptimizedAgenticRAGProcessor {
|
||||
* Store chunks with optimized batching
|
||||
* Returns the number of API calls made for embeddings
|
||||
*/
|
||||
private async storeChunksOptimized(
|
||||
async storeChunksOptimized(
|
||||
chunks: ProcessingChunk[],
|
||||
documentId: string
|
||||
): Promise<number> {
|
||||
@@ -1216,20 +1216,27 @@ export class OptimizedAgenticRAGProcessor {
|
||||
|
||||
try {
|
||||
const { parseFinancialsFromText } = await import('./financialTableParser');
|
||||
const parsedFinancials = parseFinancialsFromText(text);
|
||||
const parseResult = parseFinancialsFromText(text);
|
||||
const parsedFinancials = parseResult.data;
|
||||
|
||||
if (this.hasStructuredFinancialData(parsedFinancials)) {
|
||||
deterministicFinancials = parsedFinancials;
|
||||
deterministicFinancialChunk = this.buildDeterministicFinancialChunk(documentId, parsedFinancials);
|
||||
logger.info('Deterministic financial parser prepared structured data', {
|
||||
documentId,
|
||||
confidence: parseResult.confidence,
|
||||
tablesFound: parseResult.tablesFound,
|
||||
fy3: parsedFinancials.fy3,
|
||||
fy2: parsedFinancials.fy2,
|
||||
fy1: parsedFinancials.fy1,
|
||||
ltm: parsedFinancials.ltm
|
||||
});
|
||||
} else {
|
||||
logger.info('Deterministic financial parser did not find structured data', { documentId });
|
||||
logger.info('Deterministic financial parser did not find structured data', {
|
||||
documentId,
|
||||
confidence: parseResult.confidence,
|
||||
tablesFound: parseResult.tablesFound,
|
||||
});
|
||||
}
|
||||
} catch (parserError) {
|
||||
logger.warn('Deterministic financial parser failed during initialization', {
|
||||
@@ -1241,31 +1248,23 @@ export class OptimizedAgenticRAGProcessor {
|
||||
let totalApiCalls = 0;
|
||||
const partialResults: Partial<CIMReview>[] = [];
|
||||
|
||||
// OPTIMIZED SEQUENTIAL EXECUTION: Combined passes to reduce API calls
|
||||
logger.info('Starting agentic RAG extraction (optimized with combined passes)');
|
||||
const sequentialStart = Date.now();
|
||||
|
||||
// Run combined passes sequentially to reduce total API calls from 5 to 3
|
||||
logger.info('Pass 1: Metadata + Financial Data (Combined)');
|
||||
// PARALLEL EXECUTION: Run all 3 independent extraction passes concurrently
|
||||
logger.info('Starting agentic RAG extraction (3 passes in parallel)');
|
||||
const parallelStart = Date.now();
|
||||
|
||||
const deterministicPinnedChunks = deterministicFinancialChunk ? [deterministicFinancialChunk] : [];
|
||||
const pass1CombinedResult = await this.extractPass1CombinedMetadataFinancial(
|
||||
|
||||
const [pass1CombinedResult, pass2CombinedResult, pass3Result] = await Promise.all([
|
||||
this.extractPass1CombinedMetadataFinancial(documentId, text, chunks, deterministicPinnedChunks),
|
||||
this.extractPass2CombinedMarketBusiness(documentId, text, chunks),
|
||||
this.extractPass5InvestmentThesis(documentId, text, chunks)
|
||||
]);
|
||||
|
||||
const parallelTime = Date.now() - parallelStart;
|
||||
logger.info('Parallel extraction completed', {
|
||||
documentId,
|
||||
text,
|
||||
chunks,
|
||||
deterministicPinnedChunks
|
||||
);
|
||||
|
||||
logger.info('Pass 2: Market Analysis + Business Operations (Combined)');
|
||||
const pass2CombinedResult = await this.extractPass2CombinedMarketBusiness(documentId, text, chunks);
|
||||
|
||||
logger.info('Pass 3: Investment Thesis');
|
||||
const pass3Result = await this.extractPass5InvestmentThesis(documentId, text, chunks);
|
||||
|
||||
const sequentialTime = Date.now() - sequentialStart;
|
||||
logger.info('Sequential extraction completed', {
|
||||
documentId,
|
||||
sequentialTimeMs: sequentialTime,
|
||||
sequentialTimeSec: (sequentialTime / 1000).toFixed(1)
|
||||
parallelTimeMs: parallelTime,
|
||||
parallelTimeSec: (parallelTime / 1000).toFixed(1)
|
||||
});
|
||||
|
||||
partialResults.push(pass1CombinedResult.data);
|
||||
@@ -2135,49 +2134,53 @@ IMPORTANT EXTRACTION RULES:
|
||||
const batches = this.groupMissingFieldsIntoBatches(missingFields);
|
||||
let totalApiCalls = 0;
|
||||
|
||||
for (const batch of batches) {
|
||||
// Create a targeted query for this batch of missing fields
|
||||
const query = this.createGapFillingQuery(batch);
|
||||
|
||||
try {
|
||||
const { chunks: relevantChunks } = await this.findRelevantChunks(
|
||||
documentId,
|
||||
query,
|
||||
chunks,
|
||||
30000 // Smaller context for gap-filling
|
||||
);
|
||||
// Process all gap-filling batches in parallel
|
||||
const batchResults = await Promise.all(
|
||||
batches.map(async (batch) => {
|
||||
const query = this.createGapFillingQuery(batch);
|
||||
try {
|
||||
const { chunks: relevantChunks } = await this.findRelevantChunks(
|
||||
documentId, query, chunks, 30000
|
||||
);
|
||||
|
||||
if (relevantChunks.length === 0) {
|
||||
logger.info('No relevant chunks found for gap-filling batch', { batch });
|
||||
continue;
|
||||
logger.info('No relevant chunks found for gap-filling batch', { batch });
|
||||
return { apiCalls: 0, data: null };
|
||||
}
|
||||
|
||||
const reducedText = relevantChunks
|
||||
.slice(0, 15) // Limit to 15 chunks for gap-filling
|
||||
.slice(0, 15)
|
||||
.map((chunk, index) => {
|
||||
const separator = index > 0 ? '\n\n---\n\n' : '';
|
||||
return `${separator}[Section ${chunk.chunkIndex + 1}]\n${chunk.content}`;
|
||||
return `${separator}[Section ${chunk.chunkIndex + 1}]\n${chunk.content}`;
|
||||
})
|
||||
.join('\n\n');
|
||||
|
||||
// Make LLM call for this batch
|
||||
const result = await llmService.processCIMDocument(reducedText, 'BPCP CIM Review Template');
|
||||
totalApiCalls++;
|
||||
|
||||
if (result.success && result.jsonOutput) {
|
||||
// Merge gap-filled data (only for the missing fields)
|
||||
this.deepMerge(currentData, result.jsonOutput);
|
||||
logger.info('Gap-filling batch completed', {
|
||||
batch: batch.slice(0, 5),
|
||||
batchSize: batch.length
|
||||
});
|
||||
logger.info('Gap-filling batch completed', {
|
||||
batch: batch.slice(0, 5),
|
||||
batchSize: batch.length
|
||||
});
|
||||
return { apiCalls: 1, data: result.jsonOutput };
|
||||
}
|
||||
|
||||
return { apiCalls: 1, data: null };
|
||||
} catch (error) {
|
||||
logger.error('Gap-filling batch failed', {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
batch: batch.slice(0, 5)
|
||||
});
|
||||
batch: batch.slice(0, 5)
|
||||
});
|
||||
return { apiCalls: 0, data: null };
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
// Merge results sequentially to maintain deterministic merge order
|
||||
for (const result of batchResults) {
|
||||
totalApiCalls += result.apiCalls;
|
||||
if (result.data) {
|
||||
this.deepMerge(currentData, result.data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2249,54 +2252,49 @@ Extract exact values, numbers, percentages, names, and detailed information.`;
|
||||
{ path: 'keyQuestionsNextSteps.missingInformation', name: 'Missing Information' }
|
||||
];
|
||||
|
||||
for (const { path, name } of listFieldPaths) {
|
||||
// Identify which fields need repair
|
||||
const fieldsNeedingRepair = listFieldPaths.filter(({ path, name }) => {
|
||||
const value = this.getNestedField(validatedData, path);
|
||||
|
||||
if (value && typeof value === 'string') {
|
||||
const itemCount = (value.match(/^\d+\.\s/gm) || []).length;
|
||||
|
||||
if (itemCount < 5 || itemCount > 8) {
|
||||
logger.warn(`List field validation failed: ${name}`, {
|
||||
documentId,
|
||||
field: path,
|
||||
currentCount: itemCount,
|
||||
required: '5-8 items'
|
||||
documentId, field: path, currentCount: itemCount, required: '5-8 items'
|
||||
});
|
||||
return true;
|
||||
}
|
||||
logger.debug(`List field validated: ${name}`, { documentId, field: path, itemCount });
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
// Repair all failing fields in parallel
|
||||
if (fieldsNeedingRepair.length > 0) {
|
||||
const repairResults = await Promise.all(
|
||||
fieldsNeedingRepair.map(async ({ path, name }) => {
|
||||
const value = this.getNestedField(validatedData, path) as string;
|
||||
const itemCount = (value.match(/^\d+\.\s/gm) || []).length;
|
||||
try {
|
||||
// Attempt to repair the list
|
||||
const repairedValue = await this.repairListField(
|
||||
name,
|
||||
value,
|
||||
itemCount,
|
||||
documentId,
|
||||
chunks
|
||||
);
|
||||
apiCalls++;
|
||||
|
||||
// Update the nested field
|
||||
this.setNestedField(validatedData, path, repairedValue);
|
||||
|
||||
const repairedValue = await this.repairListField(name, value, itemCount, documentId, chunks);
|
||||
logger.info(`List field repaired: ${name}`, {
|
||||
documentId,
|
||||
field: path,
|
||||
originalCount: itemCount,
|
||||
documentId, field: path, originalCount: itemCount,
|
||||
newCount: (repairedValue.match(/^\d+\.\s/gm) || []).length
|
||||
});
|
||||
return { path, repairedValue, success: true };
|
||||
} catch (error) {
|
||||
logger.error(`Failed to repair list field: ${name}`, {
|
||||
documentId,
|
||||
field: path,
|
||||
documentId, field: path,
|
||||
error: error instanceof Error ? error.message : String(error)
|
||||
});
|
||||
// Continue with original value if repair fails
|
||||
return { path, repairedValue: null, success: false };
|
||||
}
|
||||
} else {
|
||||
logger.debug(`List field validated: ${name}`, {
|
||||
documentId,
|
||||
field: path,
|
||||
itemCount
|
||||
});
|
||||
})
|
||||
);
|
||||
|
||||
for (const result of repairResults) {
|
||||
if (result.success && result.repairedValue) {
|
||||
this.setNestedField(validatedData, result.path, result.repairedValue);
|
||||
apiCalls++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,12 +79,13 @@ class SimpleDocumentProcessor {
|
||||
let deterministicFinancials: any = null;
|
||||
try {
|
||||
const { parseFinancialsFromText } = await import('./financialTableParser');
|
||||
const parsedFinancials = parseFinancialsFromText(extractedText);
|
||||
|
||||
const parseResult = parseFinancialsFromText(extractedText);
|
||||
const parsedFinancials = parseResult.data;
|
||||
|
||||
// Check if parser found structured data
|
||||
const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue ||
|
||||
const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue ||
|
||||
parsedFinancials.fy1?.revenue || parsedFinancials.ltm?.revenue;
|
||||
|
||||
|
||||
if (hasData) {
|
||||
deterministicFinancials = parsedFinancials;
|
||||
logger.info('Deterministic financial parser found structured data', {
|
||||
|
||||
376
backend/src/services/singlePassProcessor.ts
Normal file
376
backend/src/services/singlePassProcessor.ts
Normal file
@@ -0,0 +1,376 @@
|
||||
import { logger } from '../utils/logger';
|
||||
import { config } from '../config/env';
|
||||
import { llmService } from './llmService';
|
||||
import { CIMReview } from './llmSchemas';
|
||||
import { documentAiProcessor } from './documentAiProcessor';
|
||||
|
||||
/**
 * Result of a single-pass CIM processing run (2 LLM calls).
 * Returned by SinglePassProcessor.processDocument in both success and
 * failure cases — on failure, `error` is set and `success` is false.
 */
interface SinglePassResult {
  /** True only when extraction succeeded AND the completeness gate passed. */
  success: boolean;
  /** Short executive summary line built from the analysis data ('' on failure). */
  summary: string;
  /** The extracted (and possibly quality-corrected) CIM review. */
  analysisData: CIMReview;
  /** Fixed strategy tag so callers can distinguish this pipeline from others. */
  processingStrategy: 'single_pass_quality_check';
  /** Wall-clock processing time in milliseconds. */
  processingTime: number;
  /** Number of LLM API calls made (expected: 2 on the happy path). */
  apiCalls: number;
  /** Error message on failure; undefined on success. */
  error: string | undefined;
  /** Whether the quality-check LLM pass succeeded. False means pass-1 result is unverified. */
  qualityCheckPassed: boolean;
  /** Whether the input text was truncated before LLM processing. */
  wasTruncated: boolean;
  /** Completeness score (0-100) from final validation. */
  completenessScore: number;
}
|
||||
|
||||
/**
|
||||
* Single-pass CIM processor: 2 LLM calls total (extraction + quality check).
|
||||
*
|
||||
* Flow:
|
||||
* Document AI OCR (if needed) -> deterministic financial parse ->
|
||||
* single LLM extraction -> quality-check LLM -> PDF generation -> done
|
||||
*
|
||||
* Background: chunk + embed for future search (non-blocking, fire-and-forget)
|
||||
*/
|
||||
class SinglePassProcessor {
|
||||
/**
|
||||
* Main entry point — processes a CIM document with 2 LLM calls.
|
||||
*/
|
||||
async processDocument(
|
||||
documentId: string,
|
||||
userId: string,
|
||||
text: string,
|
||||
options: {
|
||||
fileBuffer?: Buffer;
|
||||
fileName?: string;
|
||||
mimeType?: string;
|
||||
} = {}
|
||||
): Promise<SinglePassResult> {
|
||||
const startTime = Date.now();
|
||||
let apiCalls = 0;
|
||||
|
||||
try {
|
||||
// ── Step 0: Extract text via Document AI if not provided ──
|
||||
let extractedText = text;
|
||||
if (!extractedText || extractedText.length === 0) {
|
||||
const { fileBuffer, fileName, mimeType } = options;
|
||||
if (!fileBuffer || !fileName || !mimeType) {
|
||||
throw new Error('Missing required options: fileBuffer, fileName, mimeType');
|
||||
}
|
||||
|
||||
const docAiResult = await documentAiProcessor.extractTextOnly(
|
||||
documentId, userId, fileBuffer, fileName, mimeType
|
||||
);
|
||||
if (!docAiResult.text || docAiResult.text.length === 0) {
|
||||
throw new Error('Document AI text extraction returned no text');
|
||||
}
|
||||
extractedText = docAiResult.text;
|
||||
|
||||
logger.info('Single-pass: Document AI text extraction completed', {
|
||||
documentId, textLength: extractedText.length,
|
||||
});
|
||||
}
|
||||
|
||||
// ── Step 1: Single LLM extraction ──
|
||||
// Note: Deterministic financial parser disabled for single-pass mode.
|
||||
// The LLM consistently outperforms it — the parser matches narrative text
|
||||
// and footnotes, producing wrong values that override correct LLM output.
|
||||
logger.info('Single-pass: Starting LLM extraction (pass 1)', { documentId });
|
||||
const pass1Result = await llmService.processCIMDocument(
|
||||
extractedText,
|
||||
'', // template is embedded in the prompt
|
||||
);
|
||||
apiCalls++;
|
||||
|
||||
if (!pass1Result.success || !pass1Result.jsonOutput) {
|
||||
throw new Error(pass1Result.error || 'LLM extraction (pass 1) failed');
|
||||
}
|
||||
|
||||
// Use the same text the LLM actually saw (may be truncated)
|
||||
const llmText = pass1Result.processedText || extractedText;
|
||||
const wasTruncated = pass1Result.wasTruncated || false;
|
||||
const truncationPercent = wasTruncated
|
||||
? (llmText.length / extractedText.length) * 100
|
||||
: 100;
|
||||
|
||||
if (wasTruncated) {
|
||||
logger.warn('Single-pass: Pass 1 text was truncated', {
|
||||
documentId,
|
||||
originalLength: extractedText.length,
|
||||
truncatedLength: llmText.length,
|
||||
truncationPercent: truncationPercent.toFixed(1),
|
||||
});
|
||||
}
|
||||
|
||||
let analysisData: CIMReview = pass1Result.jsonOutput;
|
||||
|
||||
logger.info('Single-pass: Pass 1 complete', {
|
||||
documentId,
|
||||
model: pass1Result.model,
|
||||
inputTokens: pass1Result.inputTokens,
|
||||
outputTokens: pass1Result.outputTokens,
|
||||
cost: pass1Result.cost,
|
||||
wasTruncated,
|
||||
});
|
||||
|
||||
// ── Step 3: Quality-check LLM (pass 2) ──
|
||||
// Uses delta-only approach: quality check returns only corrections, not full JSON.
|
||||
// Uses Haiku for speed. Sends condensed document (first/last 20K chars) not full text.
|
||||
logger.info('Single-pass: Starting quality-check LLM (pass 2)', { documentId });
|
||||
let qualityCheckPassed = false;
|
||||
try {
|
||||
const qualityResult = await llmService.qualityCheckCIMDocument(
|
||||
llmText,
|
||||
analysisData,
|
||||
undefined, // no deterministic financial context
|
||||
{ wasTruncated, truncationPercent },
|
||||
);
|
||||
apiCalls++;
|
||||
|
||||
if (qualityResult.success && qualityResult.corrections) {
|
||||
// Apply delta corrections to the existing analysis data
|
||||
analysisData = this.applyCorrections(analysisData, qualityResult.corrections);
|
||||
qualityCheckPassed = true;
|
||||
|
||||
logger.info('Single-pass: Quality check complete — corrections applied', {
|
||||
documentId,
|
||||
model: qualityResult.model,
|
||||
correctionsCount: qualityResult.corrections.length,
|
||||
inputTokens: qualityResult.inputTokens,
|
||||
outputTokens: qualityResult.outputTokens,
|
||||
cost: qualityResult.cost,
|
||||
});
|
||||
} else if (qualityResult.success && !qualityResult.corrections) {
|
||||
// Quality check returned success with no corrections — pass 1 was good
|
||||
qualityCheckPassed = true;
|
||||
logger.info('Single-pass: Quality check found no corrections needed', { documentId });
|
||||
} else {
|
||||
logger.error('Single-pass: Quality check returned no valid output — using pass 1 result', {
|
||||
documentId,
|
||||
error: qualityResult.error,
|
||||
});
|
||||
}
|
||||
} catch (qualityError) {
|
||||
// Quality check failed — fall back to pass 1 but log at ERROR
|
||||
logger.error('Single-pass: Quality check failed — using unverified pass 1 result', {
|
||||
documentId,
|
||||
error: qualityError instanceof Error ? qualityError.message : String(qualityError),
|
||||
});
|
||||
}
|
||||
|
||||
// ── Step 4: Completeness gate ──
|
||||
const completenessScore = this.scoreCompleteness(analysisData);
|
||||
const COMPLETENESS_THRESHOLD = 85;
|
||||
|
||||
if (completenessScore < COMPLETENESS_THRESHOLD) {
|
||||
const processingTime = Date.now() - startTime;
|
||||
const msg = `Completeness score ${completenessScore.toFixed(1)}% is below ${COMPLETENESS_THRESHOLD}% threshold`;
|
||||
logger.error('Single-pass: Completeness gate FAILED', {
|
||||
documentId,
|
||||
completenessScore,
|
||||
threshold: COMPLETENESS_THRESHOLD,
|
||||
qualityCheckPassed,
|
||||
wasTruncated,
|
||||
});
|
||||
|
||||
return {
|
||||
success: false,
|
||||
summary: '',
|
||||
analysisData,
|
||||
processingStrategy: 'single_pass_quality_check',
|
||||
processingTime,
|
||||
apiCalls,
|
||||
error: msg,
|
||||
qualityCheckPassed,
|
||||
wasTruncated,
|
||||
completenessScore,
|
||||
};
|
||||
}
|
||||
|
||||
// ── Step 5: Fire-and-forget background embedding ──
|
||||
if (config.backgroundEmbeddingEnabled !== false) {
|
||||
setImmediate(() => {
|
||||
this.backgroundChunkAndEmbed(documentId, extractedText).catch((err) => {
|
||||
logger.warn('Single-pass: Background embedding failed (non-blocking)', {
|
||||
documentId,
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
const processingTime = Date.now() - startTime;
|
||||
const summary = this.buildSummary(analysisData);
|
||||
|
||||
logger.info('Single-pass: Processing complete', {
|
||||
documentId,
|
||||
processingTime,
|
||||
apiCalls,
|
||||
textLength: extractedText.length,
|
||||
qualityCheckPassed,
|
||||
wasTruncated,
|
||||
completenessScore,
|
||||
});
|
||||
|
||||
return {
|
||||
success: true,
|
||||
summary,
|
||||
analysisData,
|
||||
processingStrategy: 'single_pass_quality_check',
|
||||
processingTime,
|
||||
apiCalls,
|
||||
error: undefined,
|
||||
qualityCheckPassed,
|
||||
wasTruncated,
|
||||
completenessScore,
|
||||
};
|
||||
} catch (error) {
|
||||
const processingTime = Date.now() - startTime;
|
||||
const errorMessage = error instanceof Error ? error.message : String(error);
|
||||
|
||||
logger.error('Single-pass: Processing failed', {
|
||||
documentId,
|
||||
error: errorMessage,
|
||||
stack: error instanceof Error ? error.stack : undefined,
|
||||
processingTime,
|
||||
apiCalls,
|
||||
});
|
||||
|
||||
return {
|
||||
success: false,
|
||||
summary: '',
|
||||
analysisData: {} as CIMReview,
|
||||
processingStrategy: 'single_pass_quality_check',
|
||||
processingTime,
|
||||
apiCalls,
|
||||
error: `Single-pass processing failed: ${errorMessage}`,
|
||||
qualityCheckPassed: false,
|
||||
wasTruncated: false,
|
||||
completenessScore: 0,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply delta corrections from the quality check to the existing analysis data.
|
||||
* Each correction has a dot-path (e.g. "financialSummary.financials.fy1.revenue")
|
||||
* and a new value.
|
||||
*/
|
||||
private applyCorrections(
|
||||
data: CIMReview,
|
||||
corrections: Array<{ path: string; value: string; reason?: string }>,
|
||||
): CIMReview {
|
||||
const patched = JSON.parse(JSON.stringify(data)) as any;
|
||||
let applied = 0;
|
||||
|
||||
for (const { path, value, reason } of corrections) {
|
||||
const parts = path.split('.');
|
||||
let target = patched;
|
||||
for (let i = 0; i < parts.length - 1; i++) {
|
||||
if (target[parts[i]] === undefined || target[parts[i]] === null) {
|
||||
target[parts[i]] = {};
|
||||
}
|
||||
target = target[parts[i]];
|
||||
}
|
||||
const lastKey = parts[parts.length - 1];
|
||||
const oldValue = target[lastKey];
|
||||
target[lastKey] = value;
|
||||
applied++;
|
||||
|
||||
logger.info('Quality-check correction applied', {
|
||||
path,
|
||||
oldValue: typeof oldValue === 'string' ? oldValue.substring(0, 80) : String(oldValue),
|
||||
newValue: typeof value === 'string' ? value.substring(0, 80) : String(value),
|
||||
reason: reason || '',
|
||||
});
|
||||
}
|
||||
|
||||
logger.info('Quality-check corrections summary', { total: corrections.length, applied });
|
||||
return patched as CIMReview;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a short executive summary from the analysis data.
|
||||
*/
|
||||
private buildSummary(data: CIMReview): string {
|
||||
const companyName = data.dealOverview?.targetCompanyName || 'Unknown Company';
|
||||
const sector = data.dealOverview?.industrySector || 'Unknown Sector';
|
||||
const recommendation = data.keyQuestionsNextSteps?.preliminaryRecommendation || 'No recommendation';
|
||||
return `CIM Review: ${companyName} (${sector}) — ${recommendation}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Quick completeness score (0-100): counts non-empty string leaf fields.
|
||||
* BPCP-internal fields (reviewers, dateReviewed, dateCIMReceived) are excluded.
|
||||
*/
|
||||
private scoreCompleteness(data: CIMReview): number {
|
||||
const excludedPaths = new Set([
|
||||
'dealOverview.reviewers',
|
||||
'dealOverview.dateReviewed',
|
||||
'dealOverview.dateCIMReceived',
|
||||
]);
|
||||
|
||||
let total = 0;
|
||||
let filled = 0;
|
||||
const emptyFields: string[] = [];
|
||||
|
||||
const walk = (obj: any, path: string) => {
|
||||
if (obj === null || obj === undefined) return;
|
||||
if (typeof obj === 'string') {
|
||||
if (excludedPaths.has(path)) return;
|
||||
total++;
|
||||
const trimmed = obj.trim();
|
||||
if (trimmed !== '' && trimmed !== 'Not specified in CIM') {
|
||||
filled++;
|
||||
} else {
|
||||
emptyFields.push(path);
|
||||
}
|
||||
} else if (typeof obj === 'object' && !Array.isArray(obj)) {
|
||||
for (const key of Object.keys(obj)) {
|
||||
walk(obj[key], path ? `${path}.${key}` : key);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
walk(data, '');
|
||||
const score = total > 0 ? (filled / total) * 100 : 0;
|
||||
|
||||
if (emptyFields.length > 0) {
|
||||
logger.info('Completeness scoring: empty fields', {
|
||||
score: score.toFixed(1),
|
||||
total,
|
||||
filled,
|
||||
emptyCount: emptyFields.length,
|
||||
emptyFields,
|
||||
});
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Background chunking and embedding for future vector search.
|
||||
* Runs asynchronously after the main processing completes.
|
||||
*/
|
||||
private async backgroundChunkAndEmbed(documentId: string, text: string): Promise<void> {
|
||||
try {
|
||||
const { OptimizedAgenticRAGProcessor } = await import('./optimizedAgenticRAGProcessor');
|
||||
const ragProcessor = new OptimizedAgenticRAGProcessor();
|
||||
|
||||
logger.info('Single-pass: Starting background chunk & embed', {
|
||||
documentId, textLength: text.length,
|
||||
});
|
||||
|
||||
const chunks = await ragProcessor.createIntelligentChunks(text, documentId, true, []);
|
||||
await ragProcessor.storeChunksOptimized(chunks, documentId);
|
||||
|
||||
logger.info('Single-pass: Background chunk & embed complete', {
|
||||
documentId, chunkCount: chunks.length,
|
||||
});
|
||||
} catch (error) {
|
||||
logger.warn('Single-pass: Background chunk & embed failed', {
|
||||
documentId,
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shared singleton instance — consumed by the unified document processor's routing.
export const singlePassProcessor = new SinglePassProcessor();
|
||||
@@ -3,6 +3,7 @@ import { config } from '../config/env';
|
||||
import { optimizedAgenticRAGProcessor } from './optimizedAgenticRAGProcessor';
|
||||
import { simpleDocumentProcessor } from './simpleDocumentProcessor';
|
||||
import { documentAiProcessor } from './documentAiProcessor';
|
||||
import { singlePassProcessor } from './singlePassProcessor';
|
||||
import { CIMReview } from './llmSchemas';
|
||||
|
||||
// Default empty CIMReview object
|
||||
@@ -111,7 +112,7 @@ interface ProcessingResult {
|
||||
success: boolean;
|
||||
summary: string;
|
||||
analysisData: CIMReview;
|
||||
processingStrategy: 'document_ai_agentic_rag' | 'simple_full_document';
|
||||
processingStrategy: 'document_ai_agentic_rag' | 'simple_full_document' | 'single_pass_quality_check';
|
||||
processingTime: number;
|
||||
apiCalls: number;
|
||||
error: string | undefined;
|
||||
@@ -159,12 +160,101 @@ class UnifiedDocumentProcessor {
|
||||
} else if (strategy === 'document_ai_agentic_rag') {
|
||||
logger.info('Unified processor: Routing to RAG processor', { documentId, strategy });
|
||||
return await this.processWithDocumentAiAgenticRag(documentId, userId, text, options);
|
||||
} else if (strategy === 'single_pass_quality_check') {
|
||||
logger.info('Unified processor: Routing to single-pass processor', { documentId, strategy });
|
||||
return await this.processWithSinglePass(documentId, userId, text, options);
|
||||
} else {
|
||||
logger.error('Unified processor: Unsupported strategy', { documentId, strategy });
|
||||
throw new Error(`Unsupported processing strategy: ${strategy}. Supported: 'simple_full_document', 'document_ai_agentic_rag'`);
|
||||
throw new Error(`Unsupported processing strategy: ${strategy}. Supported: 'simple_full_document', 'document_ai_agentic_rag', 'single_pass_quality_check'`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process document using single-pass extraction + quality check (2 LLM calls)
|
||||
*/
|
||||
private async processWithSinglePass(
|
||||
documentId: string,
|
||||
userId: string,
|
||||
text: string,
|
||||
options: any
|
||||
): Promise<ProcessingResult> {
|
||||
logger.info('Using single-pass quality-check processing strategy', { documentId });
|
||||
|
||||
const result = await singlePassProcessor.processDocument(
|
||||
documentId, userId, text, {
|
||||
fileBuffer: options.fileBuffer,
|
||||
fileName: options.fileName,
|
||||
mimeType: options.mimeType,
|
||||
}
|
||||
);
|
||||
|
||||
if (!result.success) {
|
||||
return {
|
||||
success: false,
|
||||
summary: '',
|
||||
analysisData: defaultCIMReview,
|
||||
processingStrategy: 'single_pass_quality_check',
|
||||
processingTime: result.processingTime,
|
||||
apiCalls: result.apiCalls,
|
||||
error: result.error,
|
||||
};
|
||||
}
|
||||
|
||||
// Calculate page count from PDF if available
|
||||
if (options.fileBuffer && options.fileName?.toLowerCase().endsWith('.pdf')) {
|
||||
try {
|
||||
const pdf = require('pdf-parse');
|
||||
const pdfData = await pdf(options.fileBuffer);
|
||||
if (pdfData.numpages > 0) {
|
||||
if (!result.analysisData.dealOverview) {
|
||||
result.analysisData.dealOverview = {} as any;
|
||||
}
|
||||
result.analysisData.dealOverview.cimPageCount = pdfData.numpages.toString();
|
||||
}
|
||||
} catch (error) {
|
||||
logger.warn('Failed to calculate page count from PDF', {
|
||||
documentId,
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Log quality metadata for monitoring
|
||||
logger.info('Single-pass: Quality metadata', {
|
||||
documentId,
|
||||
qualityCheckPassed: result.qualityCheckPassed,
|
||||
wasTruncated: result.wasTruncated,
|
||||
completenessScore: result.completenessScore,
|
||||
});
|
||||
|
||||
if (!result.qualityCheckPassed) {
|
||||
logger.error('Single-pass: Quality check did NOT pass — extraction is unverified', {
|
||||
documentId,
|
||||
});
|
||||
}
|
||||
|
||||
// Run the unified processor's own validation as well
|
||||
const finalValidation = this.validateFinalData(result.analysisData);
|
||||
if (!finalValidation.isValid) {
|
||||
logger.warn('Single-pass: Final validation found issues', {
|
||||
documentId,
|
||||
completenessScore: finalValidation.completenessScore,
|
||||
emptyFields: finalValidation.emptyFields.length,
|
||||
lowQualityFields: finalValidation.lowQualityFields.length,
|
||||
});
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
summary: result.summary,
|
||||
analysisData: result.analysisData,
|
||||
processingStrategy: 'single_pass_quality_check',
|
||||
processingTime: result.processingTime,
|
||||
apiCalls: result.apiCalls,
|
||||
error: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Process document using Document AI + Agentic RAG approach
|
||||
*/
|
||||
|
||||
@@ -119,12 +119,12 @@ class VectorDatabaseService {
|
||||
rpcParams.filter_document_id = documentId;
|
||||
}
|
||||
|
||||
// Set a timeout for the RPC call (10 seconds)
|
||||
// Set a timeout for the RPC call (30 seconds)
|
||||
const searchPromise = this.supabaseClient
|
||||
.rpc('match_document_chunks', rpcParams);
|
||||
|
||||
const timeoutPromise = new Promise<{ data: null; error: { message: string } }>((_, reject) => {
|
||||
setTimeout(() => reject(new Error('Vector search timeout after 10s')), 10000);
|
||||
setTimeout(() => reject(new Error('Vector search timeout after 30s')), 30000);
|
||||
});
|
||||
|
||||
let result: any;
|
||||
@@ -132,8 +132,8 @@ class VectorDatabaseService {
|
||||
result = await Promise.race([searchPromise, timeoutPromise]);
|
||||
} catch (timeoutError: any) {
|
||||
if (timeoutError.message?.includes('timeout')) {
|
||||
logger.error('Vector search timed out', { documentId, timeout: '10s' });
|
||||
throw new Error('Vector search timeout after 10s');
|
||||
logger.error('Vector search timed out', { documentId, timeout: '30s' });
|
||||
throw new Error('Vector search timeout after 30s');
|
||||
}
|
||||
throw timeoutError;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user