Merge feature/fix-financial-extraction-primary-table: Financial extraction now correctly identifies PRIMARY table
This commit is contained in:
@@ -138,7 +138,7 @@ const envSchema = Joi.object({
|
||||
otherwise: Joi.string().allow('').optional()
|
||||
}),
|
||||
LLM_MODEL: Joi.string().default('gpt-4'),
|
||||
LLM_MAX_TOKENS: Joi.number().default(3500),
|
||||
LLM_MAX_TOKENS: Joi.number().default(16000),
|
||||
LLM_TEMPERATURE: Joi.number().min(0).max(2).default(0.1),
|
||||
LLM_PROMPT_BUFFER: Joi.number().default(500),
|
||||
|
||||
|
||||
184
backend/src/scripts/test-stax-financial-extraction.ts
Normal file
184
backend/src/scripts/test-stax-financial-extraction.ts
Normal file
@@ -0,0 +1,184 @@
|
||||
/**
|
||||
* Test script for Stax Holding Company financial extraction
|
||||
* Tests the new focused financial extraction prompt
|
||||
*/
|
||||
|
||||
import { logger } from '../utils/logger';
|
||||
import { documentAiProcessor } from '../services/documentAiProcessor';
|
||||
import { simpleDocumentProcessor } from '../services/simpleDocumentProcessor';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
async function testStaxFinancialExtraction() {
|
||||
// Get PDF path from command line argument or try to find it
|
||||
const pdfPathArg = process.argv[2];
|
||||
const documentName = '2025-04-23 Stax Holding Company, LLC Confidential Information Presentation for Stax Holding Company, LLC - April 2025-1.pdf';
|
||||
|
||||
let pdfPath: string | null = null;
|
||||
|
||||
if (pdfPathArg) {
|
||||
// Use provided path
|
||||
if (fs.existsSync(pdfPathArg)) {
|
||||
pdfPath = pdfPathArg;
|
||||
} else {
|
||||
console.error(`❌ Provided path does not exist: ${pdfPathArg}`);
|
||||
process.exit(1);
|
||||
}
|
||||
} else {
|
||||
// Try to find the document
|
||||
const possiblePaths = [
|
||||
path.join(process.cwd(), '..', documentName),
|
||||
path.join(process.cwd(), '..', '..', documentName),
|
||||
path.join(process.cwd(), documentName),
|
||||
path.join(process.cwd(), 'test-documents', documentName),
|
||||
path.join(process.cwd(), '..', 'test-documents', documentName),
|
||||
];
|
||||
|
||||
for (const testPath of possiblePaths) {
|
||||
if (fs.existsSync(testPath)) {
|
||||
pdfPath = testPath;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!pdfPath) {
|
||||
logger.error('Stax PDF not found. Searched paths:', { possiblePaths });
|
||||
console.error('❌ Stax PDF not found.');
|
||||
console.error('\nUsage:');
|
||||
console.error(' npx ts-node src/scripts/test-stax-financial-extraction.ts <path-to-pdf>');
|
||||
console.error('\nExample:');
|
||||
console.error(' npx ts-node src/scripts/test-stax-financial-extraction.ts "/path/to/Stax Holding Company.pdf"');
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
logger.info('Found Stax PDF', { pdfPath });
|
||||
|
||||
const documentId = `test-stax-${Date.now()}`;
|
||||
const userId = 'test-user';
|
||||
|
||||
try {
|
||||
// Read PDF file
|
||||
const fileBuffer = fs.readFileSync(pdfPath);
|
||||
const fileName = path.basename(pdfPath);
|
||||
|
||||
logger.info('Starting Stax document processing test', {
|
||||
documentId,
|
||||
fileName,
|
||||
fileSize: fileBuffer.length
|
||||
});
|
||||
|
||||
// Process document
|
||||
const result = await simpleDocumentProcessor.processDocument(
|
||||
documentId,
|
||||
userId,
|
||||
'', // Empty text - will extract with Document AI
|
||||
{
|
||||
fileBuffer,
|
||||
fileName,
|
||||
mimeType: 'application/pdf'
|
||||
}
|
||||
);
|
||||
|
||||
if (!result.success) {
|
||||
logger.error('Processing failed', { error: result.error });
|
||||
console.error('❌ Processing failed:', result.error);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Check financial data
|
||||
const financials = result.analysisData.financialSummary?.financials;
|
||||
|
||||
console.log('\n📊 Financial Extraction Results:');
|
||||
console.log('================================\n');
|
||||
|
||||
if (financials) {
|
||||
const periods = ['fy3', 'fy2', 'fy1', 'ltm'] as const;
|
||||
|
||||
for (const period of periods) {
|
||||
const periodData = financials[period];
|
||||
if (periodData) {
|
||||
console.log(`${period.toUpperCase()}:`);
|
||||
console.log(` Revenue: ${periodData.revenue || 'Not specified'}`);
|
||||
console.log(` EBITDA: ${periodData.ebitda || 'Not specified'}`);
|
||||
console.log(` EBITDA Margin: ${periodData.ebitdaMargin || 'Not specified'}`);
|
||||
console.log('');
|
||||
}
|
||||
}
|
||||
} else {
|
||||
console.log('❌ No financial data extracted');
|
||||
}
|
||||
|
||||
// Expected values (from user feedback)
|
||||
const expected = {
|
||||
fy3: { revenue: '$64M', ebitda: '$19M' },
|
||||
fy2: { revenue: '$71M', ebitda: '$24M' },
|
||||
fy1: { revenue: '$71M', ebitda: '$24M' },
|
||||
ltm: { revenue: '$76M', ebitda: '$27M' }
|
||||
};
|
||||
|
||||
console.log('\n✅ Expected Values:');
|
||||
console.log('==================\n');
|
||||
for (const [period, values] of Object.entries(expected)) {
|
||||
console.log(`${period.toUpperCase()}:`);
|
||||
console.log(` Revenue: ${values.revenue}`);
|
||||
console.log(` EBITDA: ${values.ebitda}`);
|
||||
console.log('');
|
||||
}
|
||||
|
||||
// Validation
|
||||
console.log('\n🔍 Validation:');
|
||||
console.log('=============\n');
|
||||
|
||||
let allCorrect = true;
|
||||
for (const [period, expectedValues] of Object.entries(expected)) {
|
||||
const actual = financials?.[period as keyof typeof financials];
|
||||
if (actual) {
|
||||
const revenueMatch = actual.revenue?.includes('64') || actual.revenue?.includes('71') || actual.revenue?.includes('76');
|
||||
const ebitdaMatch = actual.ebitda?.includes('19') || actual.ebitda?.includes('24') || actual.ebitda?.includes('27');
|
||||
|
||||
if (!revenueMatch || !ebitdaMatch) {
|
||||
console.log(`❌ ${period.toUpperCase()}: Values don't match expected`);
|
||||
console.log(` Expected Revenue: ~${expectedValues.revenue}, Got: ${actual.revenue}`);
|
||||
console.log(` Expected EBITDA: ~${expectedValues.ebitda}, Got: ${actual.ebitda}`);
|
||||
allCorrect = false;
|
||||
} else {
|
||||
console.log(`✅ ${period.toUpperCase()}: Values look correct`);
|
||||
}
|
||||
} else {
|
||||
console.log(`❌ ${period.toUpperCase()}: Missing data`);
|
||||
allCorrect = false;
|
||||
}
|
||||
}
|
||||
|
||||
console.log('\n📈 Processing Stats:');
|
||||
console.log('==================\n');
|
||||
console.log(`API Calls: ${result.apiCalls}`);
|
||||
console.log(`Processing Time: ${(result.processingTime / 1000).toFixed(1)}s`);
|
||||
console.log(`Completeness: ${result.analysisData ? 'N/A' : 'N/A'}`);
|
||||
|
||||
if (allCorrect) {
|
||||
console.log('\n✅ All financial values match expected results!');
|
||||
process.exit(0);
|
||||
} else {
|
||||
console.log('\n⚠️ Some financial values do not match expected results.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
logger.error('Test failed', {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
stack: error instanceof Error ? error.stack : undefined
|
||||
});
|
||||
console.error('❌ Test failed:', error instanceof Error ? error.message : String(error));
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Run test
|
||||
testStaxFinancialExtraction().catch(error => {
|
||||
logger.error('Unhandled error', { error });
|
||||
console.error('Unhandled error:', error);
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
@@ -2,6 +2,7 @@ import { config } from '../config/env';
|
||||
import { logger } from '../utils/logger';
|
||||
import { z } from 'zod';
|
||||
import { CIMReview, cimReviewSchema } from './llmSchemas';
|
||||
import { defaultCIMReview } from './unifiedDocumentProcessor';
|
||||
|
||||
export interface LLMRequest {
|
||||
prompt: string;
|
||||
@@ -1060,21 +1061,99 @@ Please correct these errors and generate a new, valid JSON object. Pay close att
|
||||
|
||||
${errorCorrection}${focusInstructions}${extractionGuidance}
|
||||
|
||||
CRITICAL FINANCIAL EXTRACTION RULES:
|
||||
|
||||
**Step 1: Find the PRIMARY Historical Financial Table**
|
||||
- Look for the PRIMARY/MAIN historical financial table for the TARGET COMPANY (not subsidiaries, not projections, not industry benchmarks)
|
||||
- The PRIMARY table typically shows values in MILLIONS ($64M, $71M, $76M) for target companies
|
||||
- IGNORE subsidiary tables, segment tables, or tables showing values in THOUSANDS ($20,546, $26,352) - these are NOT the primary table
|
||||
- Tables may be labeled: "Financial Summary", "Historical Financials", "Income Statement", "P&L", "Financial Performance", "Key Metrics"
|
||||
- The PRIMARY table is usually in the main financial section, not appendices
|
||||
- VALIDATION RULE: If revenue values are less than $10M, you are likely extracting from the wrong table - search for the main table with values typically $20M-$1B+
|
||||
|
||||
**Step 2: Identify Periods (Flexible Approach)**
|
||||
Financial tables can have different formats. Here's how to map them:
|
||||
|
||||
*Format A: Years shown (2021, 2022, 2023, 2024)*
|
||||
- FY-3 = Oldest year (e.g., 2021 or 2022)
|
||||
- FY-2 = Second oldest year (e.g., 2022 or 2023)
|
||||
- FY-1 = Most recent full fiscal year (e.g., 2023 or 2024)
|
||||
- LTM = Look for "LTM", "TTM", "Last Twelve Months", or trailing period
|
||||
|
||||
*Format B: Periods shown (FY-3, FY-2, FY-1, LTM)*
|
||||
- Use them directly as labeled
|
||||
|
||||
*Format C: Mixed (2023, 2024, LTM Mar-25, 2025E)*
|
||||
- Use actual years for FY-3, FY-2, FY-1
|
||||
- Use LTM/TTM for LTM
|
||||
- IGNORE anything with "E", "P", "PF" (estimates/projections)
|
||||
|
||||
**Step 3: Extract Values Carefully**
|
||||
- Read from the CORRECT column for each period
|
||||
- Extract EXACT values as shown ($64M, $71M, 29.3%, etc.)
|
||||
- Preserve the format (don't convert $64M to $64,000,000)
|
||||
- If values are in thousands format (e.g., "$20,546 (in thousands)"), convert to millions: $20,546K = $20.5M
|
||||
|
||||
**Step 4: Validate Your Extraction**
|
||||
- Check that values make sense: If FY-3 revenue is $64M, FY-2 should be similar magnitude (e.g., $50M-$90M), not $2.9M or $10
|
||||
- Revenue should typically be $10M+ for target companies (if less, you're likely using wrong table)
|
||||
- EBITDA should typically be $1M+ and positive
|
||||
- Margins should be 5-50% for EBITDA margin
|
||||
- If values seem wrong, you may have misaligned columns - double-check
|
||||
|
||||
**Step 5: If Uncertain**
|
||||
- If you can't find the PRIMARY table, can't identify periods clearly, or values don't make sense → use "Not specified in CIM"
|
||||
- Better to leave blank than extract wrong data
|
||||
|
||||
FEW-SHOT EXAMPLES - Correct Financial Table Extraction:
|
||||
|
||||
**Example 1: Years Format (2021-2024) - PRIMARY Table**
|
||||
Table Header: "2021 2022 2023 2024"
|
||||
Revenue Row: "$45.2M $52.8M $61.2M $58.5M"
|
||||
EBITDA Row: "$8.5M $10.2M $12.1M $11.5M"
|
||||
|
||||
Correct Extraction:
|
||||
- FY-3 (oldest) = 2021 = $45.2M revenue, $8.5M EBITDA
|
||||
- FY-2 = 2022 = $52.8M revenue, $10.2M EBITDA
|
||||
- FY-1 (most recent full year) = 2023 = $61.2M revenue, $12.1M EBITDA
|
||||
- LTM = 2024 = $58.5M revenue, $11.5M EBITDA (if labeled as LTM/TTM)
|
||||
|
||||
**Example 2: FY-X Format - PRIMARY Table**
|
||||
Table Header: "FY-3 FY-2 FY-1 LTM"
|
||||
Revenue Row: "$64M $71M $71M $76M"
|
||||
EBITDA Row: "$19M $24M $24M $27M"
|
||||
|
||||
Correct Extraction: Use periods directly as labeled.
|
||||
- FY-3 = $64M revenue, $19M EBITDA
|
||||
- FY-2 = $71M revenue, $24M EBITDA
|
||||
- FY-1 = $71M revenue, $24M EBITDA
|
||||
- LTM = $76M revenue, $27M EBITDA
|
||||
|
||||
**Example 3: PRIMARY vs Subsidiary Table - CRITICAL DISTINCTION**
|
||||
PRIMARY TABLE (Use This - Values in Millions):
|
||||
Revenue: $64M, $71M, $71M, $76M (millions, typical for target companies)
|
||||
EBITDA: $19M, $24M, $24M, $27M
|
||||
|
||||
SUBSIDIARY TABLE (Ignore - Values in Thousands):
|
||||
Revenue: $20,546, $26,352 (thousands, for subsidiaries or segments)
|
||||
EBITDA: $11,686, $15,601
|
||||
|
||||
Rule: If revenue < $10M, you're likely looking at wrong table. Find the PRIMARY table with values $20M-$1B+.
|
||||
|
||||
**Example 4: Mixed Format with Projections**
|
||||
Table Header: "2023 2024 LTM Mar-25 2025E"
|
||||
Revenue Row: "$64M $71M $76M $85M"
|
||||
EBITDA Row: "$19M $24M $27M $30M"
|
||||
|
||||
Correct Extraction:
|
||||
- FY-3 = 2023 = $64M revenue, $19M EBITDA
|
||||
- FY-2 = 2024 = $71M revenue, $24M EBITDA
|
||||
- FY-1 = 2024 = $71M revenue, $24M EBITDA (most recent full year)
|
||||
- LTM = LTM Mar-25 = $76M revenue, $27M EBITDA
|
||||
- IGNORE 2025E (projection, marked with "E")
|
||||
|
||||
DETAILED ANALYSIS INSTRUCTIONS:
|
||||
1. **Financial Analysis - CRITICAL**:
|
||||
- Find the PRIMARY HISTORICAL FINANCIAL TABLE showing the TARGET COMPANY's actual performance (not projections, not market data, not competitor data)
|
||||
- Look for tables with actual years (2021, 2022, 2023, 2024) or periods (FY-3, FY-2, FY-1, LTM, TTM)
|
||||
- **Period Mapping (when you see actual years)**:
|
||||
* Find the OLDEST historical year → that's FY-3
|
||||
* Find the SECOND oldest historical year → that's FY-2
|
||||
* Find the MOST RECENT full fiscal year → that's FY-1
|
||||
* Find "LTM", "TTM", or "Last Twelve Months" → that's LTM
|
||||
* IGNORE any columns labeled with "E", "P", "PF" (estimates/projections)
|
||||
- **Extract values carefully**: Make sure you're reading from the correct column for each period
|
||||
- **Validate as you extract**: If FY-3 revenue is $64M, FY-2 should be similar magnitude (e.g., $71M), not drastically different (e.g., $2.9M or $10)
|
||||
- Extract EXACT values - preserve format ($64M, $71M, 29.3%, etc.)
|
||||
- Calculate revenue growth: ((Current Period - Prior Period) / Prior Period) * 100
|
||||
- If values don't make sense or you're uncertain, use "Not specified in CIM"
|
||||
1. **Financial Analysis**: Extract exact revenue, EBITDA, and margin figures from the PRIMARY historical financial table. Calculate growth rates and trends. Note any adjustments or add-backs.
|
||||
2. **Competitive Position**: Identify specific competitors, market share, and competitive advantages. Assess barriers to entry.
|
||||
3. **Growth Opportunities**: Identify organic and inorganic growth drivers, market expansion potential, and operational improvements.
|
||||
4. **Risk Assessment**: Evaluate customer concentration, supplier dependence, regulatory risks, and market risks.
|
||||
@@ -1890,6 +1969,326 @@ IMPORTANT: Replace all placeholder text with actual information from the CIM doc
|
||||
return sectionPrompt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process financial data extraction only (focused prompt)
|
||||
*/
|
||||
async processFinancialsOnly(
|
||||
text: string,
|
||||
deterministicParserResults?: { fy3?: any; fy2?: any; fy1?: any; ltm?: any }
|
||||
): Promise<CIMAnalysisResult> {
|
||||
logger.info('Starting focused financial extraction', {
|
||||
textLength: text.length,
|
||||
hasParserResults: !!deterministicParserResults
|
||||
});
|
||||
|
||||
// Truncate text if needed (focus on financial sections)
|
||||
const maxInputTokens = config.llm.maxInputTokens || 200000;
|
||||
const systemPromptTokens = this.estimateTokenCount(this.getFinancialSystemPrompt());
|
||||
const promptBuffer = config.llm.promptBuffer || 1000;
|
||||
const reservedTokens = systemPromptTokens + promptBuffer + (config.llm.maxTokens || 16000);
|
||||
const availableTokens = maxInputTokens - reservedTokens;
|
||||
|
||||
const textTokens = this.estimateTokenCount(text);
|
||||
let processedText = text;
|
||||
|
||||
if (textTokens > availableTokens) {
|
||||
logger.warn('Text exceeds token limit for financial extraction, truncating', {
|
||||
textTokens,
|
||||
availableTokens
|
||||
});
|
||||
processedText = this.truncateText(text, availableTokens);
|
||||
}
|
||||
|
||||
const selectedModel = config.llm.model;
|
||||
let lastError: Error | null = null;
|
||||
|
||||
for (let attempt = 1; attempt <= 3; attempt++) {
|
||||
try {
|
||||
if (lastError && lastError.message.includes('rate limit')) {
|
||||
const retryDelay = Math.min(60000 * attempt, 300000);
|
||||
logger.warn(`Rate limit detected, waiting ${retryDelay}ms before retry attempt ${attempt}`);
|
||||
await new Promise(resolve => setTimeout(resolve, retryDelay));
|
||||
}
|
||||
|
||||
logger.info(`Financial extraction attempt ${attempt}/3`);
|
||||
|
||||
const prompt = this.buildFinancialPrompt(processedText, deterministicParserResults);
|
||||
const systemPrompt = this.getFinancialSystemPrompt();
|
||||
|
||||
const promptTokens = this.estimateTokenCount(prompt);
|
||||
const totalInputTokens = promptTokens + systemPromptTokens;
|
||||
|
||||
logger.info('Sending financial extraction LLM request', {
|
||||
attempt,
|
||||
model: selectedModel,
|
||||
promptTokens,
|
||||
systemPromptTokens,
|
||||
totalInputTokens
|
||||
});
|
||||
|
||||
const response = await this.callLLM({
|
||||
prompt,
|
||||
systemPrompt,
|
||||
model: selectedModel,
|
||||
maxTokens: config.llm.maxTokens,
|
||||
temperature: config.llm.temperature,
|
||||
});
|
||||
|
||||
if (!response.success) {
|
||||
logger.error('Financial extraction LLM API call failed', {
|
||||
attempt,
|
||||
error: response.error
|
||||
});
|
||||
throw new Error(response.error || 'Financial extraction failed');
|
||||
}
|
||||
|
||||
logger.info('Financial extraction LLM API call successful', {
|
||||
attempt,
|
||||
responseLength: response.content.length,
|
||||
usage: response.usage
|
||||
});
|
||||
|
||||
const jsonOutput = this.extractJsonFromResponse(response.content);
|
||||
|
||||
// Validate that we got financial data
|
||||
if (!jsonOutput || !jsonOutput.financialSummary || !jsonOutput.financialSummary.financials) {
|
||||
lastError = new Error('Financial extraction did not return financial data');
|
||||
logger.warn(`Financial extraction validation failed on attempt ${attempt}`, {
|
||||
hasFinancialSummary: !!jsonOutput?.financialSummary,
|
||||
hasFinancials: !!jsonOutput?.financialSummary?.financials
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create a minimal CIMReview structure with just financials
|
||||
const financialData: CIMReview = {
|
||||
...defaultCIMReview,
|
||||
financialSummary: jsonOutput.financialSummary
|
||||
};
|
||||
|
||||
const validation = cimReviewSchema.safeParse(financialData);
|
||||
|
||||
if (validation.success) {
|
||||
logger.info(`Financial extraction completed successfully on attempt ${attempt}`);
|
||||
return {
|
||||
success: true,
|
||||
jsonOutput: financialData,
|
||||
model: selectedModel,
|
||||
cost: this.estimateCost(promptTokens + response.content.length, selectedModel),
|
||||
inputTokens: promptTokens,
|
||||
outputTokens: response.content.length,
|
||||
};
|
||||
} else {
|
||||
lastError = new Error(`Financial data validation failed: ${validation.error.errors.map(e => e.message).join(', ')}`);
|
||||
logger.warn(`Financial extraction validation failed on attempt ${attempt}`, {
|
||||
issues: validation.error.errors
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
lastError = error instanceof Error ? error : new Error(String(error));
|
||||
logger.error(`Financial extraction attempt ${attempt} failed`, {
|
||||
error: lastError.message,
|
||||
stack: lastError.stack
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
logger.error('Financial extraction failed after 3 attempts', {
|
||||
lastError: lastError?.message
|
||||
});
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: lastError?.message || 'Financial extraction failed after 3 attempts',
|
||||
model: selectedModel,
|
||||
cost: 0,
|
||||
inputTokens: 0,
|
||||
outputTokens: 0,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Build focused financial extraction prompt
|
||||
*/
|
||||
private buildFinancialPrompt(text: string, deterministicParserResults?: any): string {
|
||||
const parserContext = deterministicParserResults
|
||||
? `\n\nDETERMINISTIC PARSER RESULTS (Use as reference/validation):
|
||||
The deterministic parser identified the following financial table structure:
|
||||
${JSON.stringify(deterministicParserResults, null, 2)}
|
||||
|
||||
Use these results to:
|
||||
1. Identify which table is the PRIMARY historical financial table
|
||||
2. Validate your extraction against these results
|
||||
3. If parser found values in millions ($64M, $71M), use those - they are likely the PRIMARY table
|
||||
4. If parser found values in thousands ($20,546), those are likely subsidiary tables - find the PRIMARY table with values in millions
|
||||
|
||||
`
|
||||
: '';
|
||||
|
||||
return `Extract ONLY the financial summary data from this CIM document. Focus exclusively on finding and extracting the PRIMARY historical financial table for the TARGET COMPANY.
|
||||
|
||||
${parserContext}CRITICAL FINANCIAL EXTRACTION RULES:
|
||||
|
||||
**Step 1: Find the PRIMARY Historical Financial Table**
|
||||
- Look for the PRIMARY/MAIN historical financial table for the TARGET COMPANY (not subsidiaries, not projections, not industry benchmarks)
|
||||
- The PRIMARY table typically shows values in MILLIONS ($64M, $71M, $76M) for target companies
|
||||
- IGNORE subsidiary tables, segment tables, or tables showing values in THOUSANDS ($20,546, $26,352) - these are NOT the primary table
|
||||
- Tables may be labeled: "Financial Summary", "Historical Financials", "Income Statement", "P&L", "Financial Performance", "Key Metrics"
|
||||
- The PRIMARY table is usually in the main financial section, not appendices
|
||||
- VALIDATION RULE: If revenue values are less than $10M, you are likely extracting from the wrong table - search for the main table with values typically $20M-$1B+
|
||||
|
||||
**Step 2: Identify Periods (Flexible Approach)**
|
||||
Financial tables can have different formats. Here's how to map them:
|
||||
|
||||
*Format A: Years shown (2021, 2022, 2023, 2024)*
|
||||
- FY-3 = Oldest year (e.g., 2021 or 2022)
|
||||
- FY-2 = Second oldest year (e.g., 2022 or 2023)
|
||||
- FY-1 = Most recent full fiscal year (e.g., 2023 or 2024)
|
||||
- LTM = Look for "LTM", "TTM", "Last Twelve Months", or trailing period
|
||||
|
||||
*Format B: Periods shown (FY-3, FY-2, FY-1, LTM)*
|
||||
- Use them directly as labeled
|
||||
|
||||
*Format C: Mixed (2023, 2024, LTM Mar-25, 2025E)*
|
||||
- Use actual years for FY-3, FY-2, FY-1
|
||||
- Use LTM/TTM for LTM
|
||||
- IGNORE anything with "E", "P", "PF" (estimates/projections)
|
||||
|
||||
**Step 3: Extract Values Carefully**
|
||||
- Read from the CORRECT column for each period
|
||||
- Extract EXACT values as shown ($64M, $71M, 29.3%, etc.)
|
||||
- Preserve the format (don't convert $64M to $64,000,000)
|
||||
- If values are in thousands format (e.g., "$20,546 (in thousands)"), convert to millions: $20,546K = $20.5M
|
||||
|
||||
**Step 4: Validate Your Extraction**
|
||||
- Check that values make sense: If FY-3 revenue is $64M, FY-2 should be similar magnitude (e.g., $50M-$90M), not $2.9M or $10
|
||||
- Revenue should typically be $10M+ for target companies (if less, you're likely using wrong table)
|
||||
- EBITDA should typically be $1M+ and positive
|
||||
- Margins should be 5-50% for EBITDA margin
|
||||
- If values seem wrong, you may have misaligned columns - double-check
|
||||
|
||||
**Step 5: If Uncertain**
|
||||
- If you can't find the PRIMARY table, can't identify periods clearly, or values don't make sense → use "Not specified in CIM"
|
||||
- Better to leave blank than extract wrong data
|
||||
|
||||
FEW-SHOT EXAMPLES - Correct Financial Table Extraction:
|
||||
|
||||
**Example 1: Years Format (2021-2024) - PRIMARY Table**
|
||||
Table Header: "2021 2022 2023 2024"
|
||||
Revenue Row: "$45.2M $52.8M $61.2M $58.5M"
|
||||
EBITDA Row: "$8.5M $10.2M $12.1M $11.5M"
|
||||
|
||||
Correct Extraction:
|
||||
- FY-3 (oldest) = 2021 = $45.2M revenue, $8.5M EBITDA
|
||||
- FY-2 = 2022 = $52.8M revenue, $10.2M EBITDA
|
||||
- FY-1 (most recent full year) = 2023 = $61.2M revenue, $12.1M EBITDA
|
||||
- LTM = 2024 = $58.5M revenue, $11.5M EBITDA (if labeled as LTM/TTM)
|
||||
|
||||
**Example 2: FY-X Format - PRIMARY Table**
|
||||
Table Header: "FY-3 FY-2 FY-1 LTM"
|
||||
Revenue Row: "$64M $71M $71M $76M"
|
||||
EBITDA Row: "$19M $24M $24M $27M"
|
||||
|
||||
Correct Extraction: Use periods directly as labeled.
|
||||
- FY-3 = $64M revenue, $19M EBITDA
|
||||
- FY-2 = $71M revenue, $24M EBITDA
|
||||
- FY-1 = $71M revenue, $24M EBITDA
|
||||
- LTM = $76M revenue, $27M EBITDA
|
||||
|
||||
**Example 3: PRIMARY vs Subsidiary Table - CRITICAL DISTINCTION**
|
||||
PRIMARY TABLE (Use This - Values in Millions):
|
||||
Revenue: $64M, $71M, $71M, $76M (millions, typical for target companies)
|
||||
EBITDA: $19M, $24M, $24M, $27M
|
||||
|
||||
SUBSIDIARY TABLE (Ignore - Values in Thousands):
|
||||
Revenue: $20,546, $26,352 (thousands, for subsidiaries or segments)
|
||||
EBITDA: $11,686, $15,601
|
||||
|
||||
Rule: If revenue < $10M, you're likely looking at wrong table. Find the PRIMARY table with values $20M-$1B+.
|
||||
|
||||
**Example 4: Mixed Format with Projections**
|
||||
Table Header: "2023 2024 LTM Mar-25 2025E"
|
||||
Revenue Row: "$64M $71M $76M $85M"
|
||||
EBITDA Row: "$19M $24M $27M $30M"
|
||||
|
||||
Correct Extraction:
|
||||
- FY-3 = 2023 = $64M revenue, $19M EBITDA
|
||||
- FY-2 = 2024 = $71M revenue, $24M EBITDA
|
||||
- FY-1 = 2024 = $71M revenue, $24M EBITDA (most recent full year)
|
||||
- LTM = LTM Mar-25 = $76M revenue, $27M EBITDA
|
||||
- IGNORE 2025E (projection, marked with "E")
|
||||
|
||||
CIM Document Text:
|
||||
${text}
|
||||
|
||||
Your response MUST be a single, valid JSON object with ONLY the financialSummary section:
|
||||
\`\`\`json
|
||||
{
|
||||
"financialSummary": {
|
||||
"financials": {
|
||||
"fy3": {
|
||||
"revenue": "Revenue amount for FY-3",
|
||||
"revenueGrowth": "N/A (baseline year)",
|
||||
"grossProfit": "Gross profit amount for FY-3",
|
||||
"grossMargin": "Gross margin % for FY-3",
|
||||
"ebitda": "EBITDA amount for FY-3",
|
||||
"ebitdaMargin": "EBITDA margin % for FY-3"
|
||||
},
|
||||
"fy2": {
|
||||
"revenue": "Revenue amount for FY-2",
|
||||
"revenueGrowth": "Revenue growth % for FY-2",
|
||||
"grossProfit": "Gross profit amount for FY-2",
|
||||
"grossMargin": "Gross margin % for FY-2",
|
||||
"ebitda": "EBITDA amount for FY-2",
|
||||
"ebitdaMargin": "EBITDA margin % for FY-2"
|
||||
},
|
||||
"fy1": {
|
||||
"revenue": "Revenue amount for FY-1",
|
||||
"revenueGrowth": "Revenue growth % for FY-1",
|
||||
"grossProfit": "Gross profit amount for FY-1",
|
||||
"grossMargin": "Gross margin % for FY-1",
|
||||
"ebitda": "EBITDA amount for FY-1",
|
||||
"ebitdaMargin": "EBITDA margin % for FY-1"
|
||||
},
|
||||
"ltm": {
|
||||
"revenue": "Revenue amount for LTM",
|
||||
"revenueGrowth": "Revenue growth % for LTM",
|
||||
"grossProfit": "Gross profit amount for LTM",
|
||||
"grossMargin": "Gross margin % for LTM",
|
||||
"ebitda": "EBITDA amount for LTM",
|
||||
"ebitdaMargin": "EBITDA margin % for LTM"
|
||||
}
|
||||
},
|
||||
"qualityOfEarnings": "Quality of earnings/adjustments impression",
|
||||
"revenueGrowthDrivers": "Revenue growth drivers (stated)",
|
||||
"marginStabilityAnalysis": "Margin stability/trend analysis",
|
||||
"capitalExpenditures": "Capital expenditures (LTM % of revenue)",
|
||||
"workingCapitalIntensity": "Working capital intensity impression",
|
||||
"freeCashFlowQuality": "Free cash flow quality impression"
|
||||
}
|
||||
}
|
||||
\`\`\`
|
||||
|
||||
IMPORTANT: Extract ONLY financial data. Return ONLY the financialSummary section. Do not include any other sections.`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get system prompt for financial extraction
|
||||
*/
|
||||
private getFinancialSystemPrompt(): string {
|
||||
return `You are an expert financial analyst at BPCP (Blue Point Capital Partners) specializing in extracting historical financial data from CIM documents. Your task is to extract ONLY the financial summary section from the CIM document.
|
||||
|
||||
CRITICAL REQUIREMENTS:
|
||||
1. **JSON OUTPUT ONLY**: Your entire response MUST be a single, valid JSON object containing ONLY the financialSummary section.
|
||||
2. **PRIMARY TABLE FOCUS**: Find and extract from the PRIMARY/MAIN historical financial table for the TARGET COMPANY (not subsidiaries, not projections).
|
||||
3. **ACCURACY**: Extract exact values as shown in the table. Preserve format ($64M, 29.3%, etc.).
|
||||
4. **VALIDATION**: If revenue values are less than $10M, you are likely extracting from the wrong table - find the PRIMARY table with values $20M-$1B+.
|
||||
5. **PERIOD MAPPING**: Correctly map periods (FY-3, FY-2, FY-1, LTM) from various table formats (years, FY-X, mixed).
|
||||
6. **IF UNCERTAIN**: Use "Not specified in CIM" rather than extracting incorrect data.
|
||||
|
||||
Focus exclusively on financial data extraction. Do not extract any other sections.`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get system prompt for section mode
|
||||
*/
|
||||
|
||||
@@ -75,11 +75,74 @@ class SimpleDocumentProcessor {
|
||||
});
|
||||
}
|
||||
|
||||
// Step 2: Pass 1 - Full extraction with entire document
|
||||
logger.info('Pass 1: Full document extraction', {
|
||||
// Step 2: Run deterministic parser first
|
||||
let deterministicFinancials: any = null;
|
||||
try {
|
||||
const { parseFinancialsFromText } = await import('./financialTableParser');
|
||||
const parsedFinancials = parseFinancialsFromText(extractedText);
|
||||
|
||||
// Check if parser found structured data
|
||||
const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue ||
|
||||
parsedFinancials.fy1?.revenue || parsedFinancials.ltm?.revenue;
|
||||
|
||||
if (hasData) {
|
||||
deterministicFinancials = parsedFinancials;
|
||||
logger.info('Deterministic financial parser found structured data', {
|
||||
documentId,
|
||||
fy3: parsedFinancials.fy3,
|
||||
fy2: parsedFinancials.fy2,
|
||||
fy1: parsedFinancials.fy1,
|
||||
ltm: parsedFinancials.ltm
|
||||
});
|
||||
} else {
|
||||
logger.info('Deterministic financial parser did not find structured data', { documentId });
|
||||
}
|
||||
} catch (parserError) {
|
||||
logger.warn('Deterministic financial parser failed', {
|
||||
documentId,
|
||||
error: parserError instanceof Error ? parserError.message : String(parserError)
|
||||
});
|
||||
}
|
||||
|
||||
// Step 3: Financial extraction (focused prompt)
|
||||
logger.info('Step 3: Focused financial extraction', {
|
||||
documentId,
|
||||
hasParserResults: !!deterministicFinancials
|
||||
});
|
||||
|
||||
let financialData: CIMReview['financialSummary'] | null = null;
|
||||
try {
|
||||
const financialResult = await llmService.processFinancialsOnly(
|
||||
extractedText,
|
||||
deterministicFinancials || undefined
|
||||
);
|
||||
apiCalls += 1;
|
||||
|
||||
if (financialResult.success && financialResult.jsonOutput?.financialSummary) {
|
||||
financialData = financialResult.jsonOutput.financialSummary;
|
||||
logger.info('Financial extraction completed successfully', {
|
||||
documentId,
|
||||
hasFinancials: !!financialData.financials
|
||||
});
|
||||
} else {
|
||||
logger.warn('Financial extraction failed, will try in main extraction', {
|
||||
documentId,
|
||||
error: financialResult.error
|
||||
});
|
||||
}
|
||||
} catch (financialError) {
|
||||
logger.warn('Financial extraction threw error, will try in main extraction', {
|
||||
documentId,
|
||||
error: financialError instanceof Error ? financialError.message : String(financialError)
|
||||
});
|
||||
}
|
||||
|
||||
// Step 4: Pass 1 - Full extraction with entire document (excluding financials if we already have them)
|
||||
logger.info('Step 4: Full document extraction (excluding financials if already extracted)', {
|
||||
documentId,
|
||||
textLength: extractedText.length,
|
||||
estimatedTokens: Math.ceil(extractedText.length / 4) // ~4 chars per token
|
||||
estimatedTokens: Math.ceil(extractedText.length / 4),
|
||||
hasFinancialData: !!financialData
|
||||
});
|
||||
|
||||
const pass1Result = await llmService.processCIMDocument(
|
||||
@@ -94,7 +157,13 @@ class SimpleDocumentProcessor {
|
||||
|
||||
let analysisData = pass1Result.jsonOutput as CIMReview;
|
||||
|
||||
// Step 3: Validate and identify missing fields
|
||||
// Merge financial data if we extracted it separately
|
||||
if (financialData) {
|
||||
analysisData.financialSummary = financialData;
|
||||
logger.info('Merged financial data from focused extraction', { documentId });
|
||||
}
|
||||
|
||||
// Step 5: Validate and identify missing fields
|
||||
const validation = this.validateData(analysisData);
|
||||
logger.info('Pass 1 validation completed', {
|
||||
documentId,
|
||||
@@ -104,7 +173,7 @@ class SimpleDocumentProcessor {
|
||||
filledFields: validation.filledFields
|
||||
});
|
||||
|
||||
// Step 4: Pass 2 - Gap-filling if completeness < 90%
|
||||
// Step 6: Pass 2 - Gap-filling if completeness < 90%
|
||||
if (validation.completenessScore < 90 && validation.emptyFields.length > 0) {
|
||||
logger.info('Pass 2: Gap-filling for missing fields', {
|
||||
documentId,
|
||||
@@ -142,13 +211,10 @@ Focus on finding these specific fields in the document. Extract exact values, nu
|
||||
}
|
||||
}
|
||||
|
||||
// Step 5: Validate and fix financial data
|
||||
analysisData = this.validateAndFixFinancialData(analysisData);
|
||||
|
||||
// Step 6: Generate summary
|
||||
// Step 7: Generate summary
|
||||
const summary = this.generateSummary(analysisData);
|
||||
|
||||
// Step 7: Final validation
|
||||
// Step 8: Final validation
|
||||
const finalValidation = this.validateData(analysisData);
|
||||
const processingTime = Date.now() - startTime;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user