// Change notes:
// - Added auto-correction logic for EBITDA margins when the stated and calculated values differ by more than 15pp.
// - Fixed a missing closing brace in the revenue validation block.
// - Enhanced margin validation to catch cases such as 95% -> 22.3%.
789 lines
30 KiB
TypeScript
789 lines
30 KiB
TypeScript
import { logger } from '../utils/logger';
|
|
import { config } from '../config/env';
|
|
import { documentAiProcessor } from './documentAiProcessor';
|
|
import { llmService } from './llmService';
|
|
import { CIMReview } from './llmSchemas';
|
|
import { cimReviewSchema } from './llmSchemas';
|
|
import { defaultCIMReview } from './unifiedDocumentProcessor';
|
|
|
|
/**
 * Outcome of one `SimpleDocumentProcessor.processDocument` run.
 * The same shape is returned for both successful and failed runs.
 */
interface ProcessingResult {
  /** `true` when processing reached the end without throwing. */
  success: boolean;

  /** Short pipe-separated headline built from the analysis; empty string on failure. */
  summary: string;

  /** Structured review data (the default review object when processing failed). */
  analysisData: CIMReview;

  /** Identifies the strategy that produced this result. */
  processingStrategy: 'simple_full_document';

  /** Wall-clock duration of the run in milliseconds. */
  processingTime: number;

  /** Number of LLM calls issued during the run. */
  apiCalls: number;

  /** Failure message, or `undefined` when the run succeeded. */
  error: string | undefined;
}
|
|
|
|
/** Placeholder used throughout the review data for values the CIM does not provide. */
const NOT_SPECIFIED = 'Not specified in CIM';

/** Financial periods in chronological order (oldest first). */
const FINANCIAL_PERIODS = ['fy3', 'fy2', 'fy1', 'ltm'] as const;
type FinancialPeriod = (typeof FINANCIAL_PERIODS)[number];

/** The per-period fields that the financial sanity checks read and rewrite. */
interface PeriodFinancials {
  revenue?: string;
  ebitda?: string;
  ebitdaMargin?: string;
  revenueGrowth?: string;
}

type FinancialsByPeriod = Partial<Record<FinancialPeriod, PeriodFinancials>>;

/** Revenue below this amount (in dollars) is treated as an extraction error. */
const MIN_PLAUSIBLE_REVENUE = 5000000;
/** EBITDA whose magnitude is below this amount (in dollars) is treated as an extraction error. */
const MIN_PLAUSIBLE_EBITDA = 500000;
/** Inclusive bounds, in percent, for an EBITDA margin to be considered reasonable. */
const MIN_REASONABLE_MARGIN = -10;
const MAX_REASONABLE_MARGIN = 60;

/**
 * Magnitude suffixes understood by the amount parser.
 * Longer suffixes come first so "mm"/"million" are not mistaken for a bare "m".
 */
const MAGNITUDE_SUFFIXES: ReadonlyArray<readonly [string, number]> = [
  ['thousand', 1000],
  ['million', 1000000],
  ['billion', 1000000000],
  ['mm', 1000000],
  ['bn', 1000000000],
  ['k', 1000],
  ['m', 1000000],
  ['b', 1000000000],
];

/** Amount strings that are rejected outright as known extraction mistakes. */
const KNOWN_BAD_AMOUNT_PATTERNS: readonly RegExp[] = [
  /^\$?3\.?0?0?$/, // "$3", "$3.00", "3"
  /^\$?10\.?0?0?$/, // "$10", "10" (too small)
  /^-\d+M$/, // "-25M", "-5M"
  /^\$-?\d+M$/, // "$-25M", "$-5M"
  /^\$?\d{1,2}$/, // Single or double digit dollar amounts (too small)
];

/** Path segments that must never be written through, to avoid prototype pollution. */
const UNSAFE_PATH_KEYS: ReadonlySet<string> = new Set(['__proto__', 'prototype', 'constructor']);

/**
 * Simple Document Processor
 *
 * Strategy: extract the full text and send the entire document to the LLM in 1-2 passes.
 * - Pass 1: full extraction with a comprehensive prompt
 * - Pass 2 (if needed): validation and gap-filling
 *
 * Intended as a simpler, faster, and more reliable alternative to complex RAG chunking.
 */
class SimpleDocumentProcessor {
  /**
   * Process a document using the simple full-document approach.
   *
   * Steps: (1) obtain text (Document AI when `text` is empty), (2) deterministic financial
   * parsing, (3) focused LLM financial extraction, (4) full LLM extraction, (5) financial
   * sanity checks and completeness validation, (6) optional gap-filling pass, (7) summary.
   *
   * @param documentId - Identifier used for logging and passed to Document AI.
   * @param userId - Identifier passed to Document AI.
   * @param text - Pre-extracted document text; when empty, `options` must describe the file.
   * @param options - May carry `fileBuffer`, `fileName` and `mimeType` for text extraction.
   * @returns A result object; failures are reported via `success: false` and `error`
   *          rather than by throwing.
   */
  async processDocument(
    documentId: string,
    userId: string,
    text: string,
    options: any = {}
  ): Promise<ProcessingResult> {
    const startTime = Date.now();
    let apiCalls = 0;

    try {
      // Callers may pass no text at all; measure it defensively so logging cannot
      // abort the run before the Document AI fallback below gets a chance.
      const hasText = typeof text === 'string' && text.length > 0;
      logger.info('Simple processor: Starting', {
        documentId,
        textProvided: hasText,
        textLength: hasText ? text.length : 0,
        hasFileBuffer: !!options?.fileBuffer,
        hasFileName: !!options?.fileName
      });

      // Step 1: Extract text if not provided
      let extractedText = text;
      if (!extractedText || extractedText.length === 0) {
        const { fileBuffer, fileName, mimeType } = options ?? {};
        if (!fileBuffer || !fileName || !mimeType) {
          throw new Error('Missing required options: fileBuffer, fileName, mimeType');
        }

        logger.info('Extracting text with Document AI (text only, no RAG)', { documentId, fileName });
        const extractionResult = await documentAiProcessor.extractTextOnly(
          documentId,
          userId,
          fileBuffer,
          fileName,
          mimeType
        );

        if (!extractionResult || !extractionResult.text) {
          throw new Error(`Document AI text extraction failed`);
        }

        extractedText = extractionResult.text;
        logger.info('Text extraction completed', {
          documentId,
          textLength: extractedText.length
        });
      }

      // Step 2: Run deterministic parser first (best effort - failures only produce a warning)
      let deterministicFinancials: any = null;
      try {
        const { parseFinancialsFromText } = await import('./financialTableParser');
        const parsedFinancials = parseFinancialsFromText(extractedText);

        // The parser is considered successful when any period has a revenue figure
        const hasData = parsedFinancials.fy3?.revenue || parsedFinancials.fy2?.revenue ||
          parsedFinancials.fy1?.revenue || parsedFinancials.ltm?.revenue;

        if (hasData) {
          deterministicFinancials = parsedFinancials;
          logger.info('Deterministic financial parser found structured data', {
            documentId,
            fy3: parsedFinancials.fy3,
            fy2: parsedFinancials.fy2,
            fy1: parsedFinancials.fy1,
            ltm: parsedFinancials.ltm
          });
        } else {
          logger.info('Deterministic financial parser did not find structured data', { documentId });
        }
      } catch (parserError) {
        logger.warn('Deterministic financial parser failed', {
          documentId,
          error: parserError instanceof Error ? parserError.message : String(parserError)
        });
      }

      // Step 3: Financial extraction (focused prompt)
      logger.info('Step 3: Focused financial extraction', {
        documentId,
        hasParserResults: !!deterministicFinancials
      });

      let financialData: CIMReview['financialSummary'] | null = null;
      try {
        const financialResult = await llmService.processFinancialsOnly(
          extractedText,
          deterministicFinancials || undefined
        );
        apiCalls += 1;

        if (financialResult.success && financialResult.jsonOutput?.financialSummary) {
          financialData = financialResult.jsonOutput.financialSummary;
          logger.info('Financial extraction completed successfully', {
            documentId,
            hasFinancials: !!financialData.financials
          });
        } else {
          logger.warn('Financial extraction failed, will try in main extraction', {
            documentId,
            error: financialResult.error
          });
        }
      } catch (financialError) {
        logger.warn('Financial extraction threw error, will try in main extraction', {
          documentId,
          error: financialError instanceof Error ? financialError.message : String(financialError)
        });
      }

      // Step 4: Pass 1 - Full extraction with entire document
      logger.info('Step 4: Full document extraction (excluding financials if already extracted)', {
        documentId,
        textLength: extractedText.length,
        estimatedTokens: Math.ceil(extractedText.length / 4),
        hasFinancialData: !!financialData
      });

      const pass1Result = await llmService.processCIMDocument(
        extractedText,
        'BPCP CIM Review Template'
      );
      apiCalls += 1;

      if (!pass1Result.success || !pass1Result.jsonOutput) {
        throw new Error(`Pass 1 extraction failed: ${pass1Result.error || 'Unknown error'}`);
      }

      let analysisData = pass1Result.jsonOutput as CIMReview;

      // The focused extraction takes precedence over whatever pass 1 produced for financials
      if (financialData) {
        analysisData.financialSummary = financialData;
        logger.info('Merged financial data from focused extraction', { documentId });
      }

      // Sanity-check the financial figures before measuring completeness, so that rejected
      // values are reported as missing and become candidates for the gap-filling pass.
      analysisData = this.validateAndFixFinancialData(analysisData);

      // Step 5: Validate and identify missing fields
      const validation = this.validateData(analysisData);
      logger.info('Pass 1 validation completed', {
        documentId,
        completeness: validation.completenessScore.toFixed(1) + '%',
        emptyFields: validation.emptyFields.length,
        totalFields: validation.totalFields,
        filledFields: validation.filledFields
      });

      // Step 6: Pass 2 - Gap-filling if completeness < 90%
      if (validation.completenessScore < 90 && validation.emptyFields.length > 0) {
        logger.info('Pass 2: Gap-filling for missing fields', {
          documentId,
          missingFields: validation.emptyFields.length,
          sampleFields: validation.emptyFields.slice(0, 5)
        });

        // Only the first 20 missing fields are targeted in the focused prompt
        const focusedFields = validation.emptyFields.slice(0, 20);
        const gapFillPrompt = [
          'The following fields are missing or incomplete. Please extract them from the document:',
          focusedFields.join(', '),
          '',
          'Focus on finding these specific fields in the document. Extract exact values, numbers, and details.'
        ].join('\n');

        const pass2Result = await llmService.processCIMDocument(
          extractedText,
          'BPCP CIM Review Template',
          analysisData,
          focusedFields, // focusedFields
          gapFillPrompt // extractionInstructions
        );
        apiCalls += 1;

        if (pass2Result.success && pass2Result.jsonOutput) {
          // Merge pass 2 results into pass 1, preferring pass 2 values for missing fields,
          // then re-check the financial figures the gap-fill may have introduced.
          analysisData = this.validateAndFixFinancialData(
            this.mergeResults(analysisData, pass2Result.jsonOutput as CIMReview, validation.emptyFields)
          );

          const pass2Validation = this.validateData(analysisData);
          logger.info('Pass 2 validation completed', {
            documentId,
            completeness: pass2Validation.completenessScore.toFixed(1) + '%',
            emptyFields: pass2Validation.emptyFields.length
          });
        }
      }

      // Step 7: Generate summary
      const summary = this.generateSummary(analysisData);

      // Step 8: Final validation
      const finalValidation = this.validateData(analysisData);
      const processingTime = Date.now() - startTime;

      logger.info('Simple processing completed', {
        documentId,
        completeness: finalValidation.completenessScore.toFixed(1) + '%',
        totalFields: finalValidation.totalFields,
        filledFields: finalValidation.filledFields,
        emptyFields: finalValidation.emptyFields.length,
        apiCalls,
        processingTimeMs: processingTime
      });

      return {
        success: true,
        summary,
        analysisData,
        processingStrategy: 'simple_full_document',
        processingTime,
        apiCalls,
        error: undefined
      };
    } catch (error) {
      const processingTime = Date.now() - startTime;
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';

      logger.error('Simple processing failed', {
        documentId,
        error: errorMessage,
        processingTimeMs: processingTime
      });

      return {
        success: false,
        summary: '',
        analysisData: defaultCIMReview,
        processingStrategy: 'simple_full_document',
        processingTime,
        apiCalls,
        error: errorMessage
      };
    }
  }

  /**
   * Merge pass 2 results into pass 1, preferring pass 2 for missing fields.
   *
   * @param pass1 - Base review; it is deep-copied via JSON and not mutated.
   * @param pass2 - Gap-fill review supplying candidate values.
   * @param missingFields - Dot-separated paths that pass 1 left empty.
   * @returns A copy of `pass1` with every usable pass 2 value written at its path.
   */
  private mergeResults(
    pass1: CIMReview,
    pass2: CIMReview,
    missingFields: string[]
  ): CIMReview {
    const merged = JSON.parse(JSON.stringify(pass1)) as CIMReview;

    for (const fieldPath of missingFields) {
      const value = this.getNestedValue(pass2, fieldPath);
      if (value === undefined || value === null) {
        continue;
      }
      // Explicit emptiness test: a plain truthiness check would discard valid 0 / false values
      if (typeof value === 'string') {
        const trimmed = value.trim();
        if (trimmed === '' || trimmed === NOT_SPECIFIED) {
          continue;
        }
      }
      this.setNestedValue(merged, fieldPath, value);
    }

    return merged;
  }

  /**
   * Get nested value by path (e.g., "dealOverview.dealSource").
   * Only own properties are followed; returns `undefined` when any segment is missing.
   */
  private getNestedValue(obj: any, path: string): any {
    let current = obj;
    for (const key of path.split('.')) {
      const isContainer = current !== null && typeof current === 'object';
      if (!isContainer || !Object.prototype.hasOwnProperty.call(current, key)) {
        return undefined;
      }
      current = current[key];
    }
    return current;
  }

  /**
   * Set nested value by path, creating intermediate objects as needed.
   * Intermediate values that are missing, `null` or primitive are replaced by `{}`.
   * Paths containing `__proto__`, `prototype` or `constructor` are ignored because the
   * paths originate from model output and must not be able to reach shared prototypes.
   */
  private setNestedValue(obj: any, path: string, value: any): void {
    const keys = path.split('.');
    if (keys.some(key => UNSAFE_PATH_KEYS.has(key))) {
      return;
    }

    const leafKey = keys[keys.length - 1] ?? '';
    let current = obj;
    for (const key of keys.slice(0, -1)) {
      const next = current[key];
      // typeof null is 'object', so null needs its own test before descending
      if (next === null || typeof next !== 'object') {
        current[key] = {};
      }
      current = current[key];
    }
    current[leafKey] = value;
  }

  /**
   * Validate data and calculate completeness.
   *
   * Walks every nested object (arrays and non-string primitives are not counted), treating
   * `null`, `undefined`, blank strings and the "Not specified in CIM" placeholder as empty.
   * BPCP-internal fields are ignored entirely; optional fields count as filled even when
   * empty. The data is also checked against `cimReviewSchema`.
   *
   * @returns `completenessScore` as a percentage (0 when no fields were counted), the
   *          paths of empty required fields, and human-readable `issues`.
   */
  private validateData(data: CIMReview): {
    isValid: boolean;
    completenessScore: number;
    totalFields: number;
    filledFields: number;
    emptyFields: string[];
    issues: string[];
  } {
    const emptyFields: string[] = [];
    const issues: string[] = [];
    let totalFields = 0;
    let filledFields = 0;

    // BPCP internal fields (not in CIM)
    const bpcpInternalFields = [
      'dealOverview.reviewers',
      'dealOverview.dateReviewed',
      'dealOverview.dateCIMReceived',
    ];

    // Optional fields (allowed to be empty)
    const optionalFields = [
      'dealOverview.transactionType',
      'dealOverview.statedReasonForSale',
      'businessDescription.customerBaseOverview.customerConcentrationRisk',
      'businessDescription.customerBaseOverview.typicalContractLength',
    ];

    // Fields exempt from the minimum-length check (e.g. page count)
    const shortFields = ['dealOverview.cimPageCount'];

    const matchesAny = (fields: string[], path: string): boolean =>
      fields.some(field => path === field || path.startsWith(field + '.'));

    // Optional fields count as filled even when empty; required ones are reported as missing
    const recordEmpty = (path: string): void => {
      if (matchesAny(optionalFields, path)) {
        filledFields++;
      } else {
        emptyFields.push(path);
      }
    };

    const checkValue = (value: any, path: string = ''): void => {
      if (matchesAny(bpcpInternalFields, path)) {
        return;
      }

      if (value === null || value === undefined) {
        totalFields++;
        recordEmpty(path);
        return;
      }

      if (typeof value === 'string') {
        totalFields++;
        const trimmed = value.trim();

        if (trimmed === '' || trimmed === NOT_SPECIFIED) {
          recordEmpty(path);
          return;
        }

        if (!matchesAny(shortFields, path) && trimmed.length < 10) {
          issues.push(`${path}: Too short (${trimmed.length} chars, min 10)`);
        }

        filledFields++;
        return;
      }

      if (typeof value === 'object' && !Array.isArray(value)) {
        for (const key of Object.keys(value)) {
          checkValue(value[key], path ? `${path}.${key}` : key);
        }
      }
    };

    checkValue(data);

    const completenessScore = totalFields > 0
      ? (filledFields / totalFields) * 100
      : 0;

    // Validate schema
    const schemaValidation = cimReviewSchema.safeParse(data);
    const isValid = schemaValidation.success;

    if (!isValid) {
      issues.push(`Schema validation failed: ${schemaValidation.error?.errors.map(e => e.message).join(', ')}`);
    }

    return {
      isValid,
      completenessScore,
      totalFields,
      filledFields,
      emptyFields,
      issues
    };
  }

  /**
   * Validate and fix financial data - reject obviously wrong values.
   *
   * For each period (fy3, fy2, fy1, ltm) this checks revenue, EBITDA, EBITDA margin and
   * revenue growth. Implausible values are replaced with "Not specified in CIM"; an EBITDA
   * margin that disagrees with revenue/EBITDA by more than 15pp is replaced by the
   * calculated margin. Softer anomalies are only logged.
   *
   * @param data - Review to check. It is mutated in place.
   * @returns The same `data` object, for call chaining.
   */
  private validateAndFixFinancialData(data: CIMReview): CIMReview {
    const financials = data.financialSummary?.financials as FinancialsByPeriod | undefined;
    if (!financials) {
      return data;
    }

    for (const period of FINANCIAL_PERIODS) {
      const periodData = financials[period];
      if (!periodData) continue;

      this.validateRevenue(financials, period, periodData);
      this.validateEbitda(period, periodData);
      this.validateEbitdaMargin(financials, period, periodData);
      this.validateRevenueGrowth(period, periodData);
    }

    return data;
  }

  /** True when `value` is a non-empty string other than the "Not specified in CIM" placeholder. */
  private hasValue(value: unknown): value is string {
    return typeof value === 'string' && value !== '' && value !== NOT_SPECIFIED;
  }

  /**
   * Parse a financial amount such as "$64.5M", "1,250K", "($5.5M)" or "$1.5 billion".
   * Currency symbols, commas and whitespace are ignored; K/M/MM/B/BN and spelled-out
   * magnitudes are applied; a leading minus or accounting-style parentheses mean negative.
   *
   * @returns The amount in dollars, or `null` when no number can be read.
   */
  private parseFinancialAmount(value: unknown): number | null {
    if (typeof value !== 'string') return null;

    const trimmed = value.trim();
    // Accounting notation wraps negatives in parentheses, e.g. "($5.5M)" or "$(5.5M)"
    const parenthesized = /^-?\$?\s*\(.*\)$/.test(trimmed);

    let cleaned = trimmed.replace(/[$,\s()]/g, '').toLowerCase();

    let multiplier = 1;
    for (const [suffix, factor] of MAGNITUDE_SUFFIXES) {
      if (cleaned.endsWith(suffix)) {
        multiplier = factor;
        cleaned = cleaned.slice(0, -suffix.length);
        break;
      }
    }

    const hasMinus = cleaned.startsWith('-');
    if (hasMinus) cleaned = cleaned.substring(1);

    const magnitude = parseFloat(cleaned);
    if (Number.isNaN(magnitude)) return null;

    return (parenthesized || hasMinus ? -1 : 1) * magnitude * multiplier;
  }

  /**
   * Read the first number out of a percentage string such as "22.3%" or "-4%".
   * A fully parenthesized value, e.g. "(10.5%)", is treated as negative.
   *
   * @returns The numeric percentage, or `null` when the string contains no number.
   */
  private parsePercentage(value: unknown): number | null {
    if (typeof value !== 'string') return null;

    const trimmed = value.trim();
    const match = trimmed.match(/-?\d+(?:\.\d+)?/);
    if (!match) return null;

    const parsed = parseFloat(match[0]);
    return /^\(.*\)$/.test(trimmed) && parsed > 0 ? -parsed : parsed;
  }

  /**
   * Decide whether an amount string is obviously wrong: shorter than 3 characters, matching
   * a known bad pattern, or below the minimum plausible size for its field type.
   * NOTE(review): the bad-pattern list rejects values like "-25M" for EBITDA as well, even
   * though EBITDA can legitimately be negative - confirm this is intended.
   */
  private isImplausibleAmount(value: string, fieldType: 'revenue' | 'ebitda' = 'revenue'): boolean {
    const trimmed = value.trim();
    if (trimmed.length < 3) return true;
    if (KNOWN_BAD_AMOUNT_PATTERNS.some(pattern => pattern.test(trimmed))) return true;

    const amount = this.parseFinancialAmount(trimmed);
    if (amount === null) return false;

    return fieldType === 'revenue'
      ? amount < MIN_PLAUSIBLE_REVENUE
      : Math.abs(amount) < MIN_PLAUSIBLE_EBITDA;
  }

  /**
   * Check one period's revenue: reject implausible values, reject values below 20% of the
   * other periods' average, and log (without rejecting) suspicious drops and unusual growth.
   */
  private validateRevenue(
    financials: FinancialsByPeriod,
    period: FinancialPeriod,
    periodData: PeriodFinancials
  ): void {
    const revenue = periodData.revenue;
    if (!this.hasValue(revenue)) return;

    if (this.isImplausibleAmount(revenue, 'revenue')) {
      logger.warn('Rejecting invalid revenue value', {
        period,
        value: revenue,
        reason: 'Value is clearly wrong (too small or invalid pattern)'
      });
      periodData.revenue = NOT_SPECIFIED;
      return;
    }

    // Cross-validate: check consistency across periods to detect misaligned columns
    const currentValue = this.parseFinancialAmount(revenue);
    if (currentValue === null || currentValue <= 0) return;

    const otherValues = FINANCIAL_PERIODS
      .filter(p => p !== period)
      .map(p => this.parseFinancialAmount(financials[p]?.revenue))
      .filter((v): v is number => v !== null && v > 0);
    if (otherValues.length === 0) return;

    const avgOtherValue = otherValues.reduce((sum, v) => sum + v, 0) / otherValues.length;
    const maxOtherValue = Math.max(...otherValues);
    const minOtherValue = Math.min(...otherValues);
    const inMillions = (amount: number): string => (amount / 1000000).toFixed(1);

    // Check 1: Value is too small compared to other periods (likely wrong column)
    if (currentValue < avgOtherValue * 0.2) {
      logger.warn('Rejecting revenue value - inconsistent with other periods', {
        period,
        value: revenue,
        numericValue: currentValue,
        avgOtherPeriods: avgOtherValue,
        maxOtherPeriods: maxOtherValue,
        minOtherPeriods: minOtherValue,
        reason: `Value ($${inMillions(currentValue)}M) is <20% of average ($${inMillions(avgOtherValue)}M) - likely wrong column or misaligned extraction`
      });
      periodData.revenue = NOT_SPECIFIED;
      // The value is gone; the remaining trend checks would only add noise about it
      return;
    }

    // Check 2: Revenue should generally increase or be stable (FY-1/LTM shouldn't be much
    // lower than FY-2/FY-3). Exception: the oldest period being lower is normal.
    if (period !== 'fy3' && currentValue < minOtherValue * 0.5 && currentValue < avgOtherValue * 0.6) {
      logger.warn('Revenue value suspiciously low compared to other periods - possible column misalignment', {
        period,
        value: revenue,
        numericValue: currentValue,
        avgOtherPeriods: avgOtherValue,
        minOtherPeriods: minOtherValue,
        reason: `Revenue for ${period} ($${inMillions(currentValue)}M) is <50% of minimum other period ($${inMillions(minOtherValue)}M) - may indicate column misalignment`
      });
      // Don't reject automatically, but flag for review - this often indicates wrong column
    }

    // Check 3: Detect unusual growth against the immediately preceding period
    const currentIndex = FINANCIAL_PERIODS.indexOf(period);
    const prevPeriod = currentIndex > 0 ? FINANCIAL_PERIODS[currentIndex - 1] : undefined;
    if (prevPeriod === undefined) return;

    const prevValue = this.parseFinancialAmount(financials[prevPeriod]?.revenue);
    if (prevValue === null || prevValue <= 0) return;

    const growth = ((currentValue - prevValue) / prevValue) * 100;
    // Flag if growth is >200% or < -50% (unusual for year-over-year)
    if (growth > 200 || growth < -50) {
      logger.warn('Detected unusual revenue growth pattern - may indicate misaligned columns', {
        period,
        prevPeriod,
        currentValue: currentValue,
        prevValue: prevValue,
        growth: `${growth.toFixed(1)}%`,
        reason: `Unusual growth (${growth > 0 ? '+' : ''}${growth.toFixed(1)}%) between ${prevPeriod} and ${period} - may indicate column misalignment`
      });
      // Don't reject - just log as warning, as this might be legitimate
    }
  }

  /** Reject an EBITDA value that is malformed or implausibly small in magnitude. */
  private validateEbitda(period: FinancialPeriod, periodData: PeriodFinancials): void {
    const ebitda = periodData.ebitda;
    if (!this.hasValue(ebitda)) return;

    if (this.isImplausibleAmount(ebitda, 'ebitda')) {
      logger.warn('Rejecting invalid EBITDA value', {
        period,
        value: ebitda,
        reason: 'Value is clearly wrong (too small or invalid pattern)'
      });
      periodData.ebitda = NOT_SPECIFIED;
    }
  }

  /**
   * Validate the stated EBITDA margin against the margin calculated from revenue and
   * EBITDA (when both parse), against the reasonable range, and against other periods.
   */
  private validateEbitdaMargin(
    financials: FinancialsByPeriod,
    period: FinancialPeriod,
    periodData: PeriodFinancials
  ): void {
    const statedMargin = periodData.ebitdaMargin;
    if (!this.hasValue(statedMargin)) return;

    const marginStr = statedMargin.trim();
    const marginValue = this.parsePercentage(marginStr);
    if (marginValue === null) return;

    const isReasonable = (margin: number): boolean =>
      margin >= MIN_REASONABLE_MARGIN && margin <= MAX_REASONABLE_MARGIN;

    // First, try to calculate margin from revenue and EBITDA to validate
    const revValue = this.parseFinancialAmount(periodData.revenue);
    const ebitdaValue = this.parseFinancialAmount(periodData.ebitda);

    if (revValue !== null && ebitdaValue !== null && revValue > 0) {
      const calculatedMargin = (ebitdaValue / revValue) * 100;
      const marginDiff = Math.abs(calculatedMargin - marginValue);
      const mismatchDetails = {
        period,
        statedMargin: `${marginValue}%`,
        calculatedMargin: `${calculatedMargin.toFixed(1)}%`,
        difference: `${marginDiff.toFixed(1)}pp`,
        revenue: periodData.revenue,
        ebitda: periodData.ebitda
      };

      if (marginDiff > 15) {
        // This catches cases like 95% when it should be 22%, or 15% when it should be 75%
        if (isReasonable(calculatedMargin)) {
          logger.warn('EBITDA margin mismatch detected - auto-correcting', {
            ...mismatchDetails,
            action: 'Auto-correcting margin to calculated value',
            reason: `Stated margin (${marginValue}%) differs significantly from calculated margin (${calculatedMargin.toFixed(1)}%) - likely extraction error`
          });
          periodData.ebitdaMargin = `${calculatedMargin.toFixed(1)}%`;
        } else if (isReasonable(marginValue)) {
          // An out-of-range calculated margin points at bad revenue/EBITDA inputs, so
          // overwriting a plausible stated margin with it would make the data worse.
          logger.warn('EBITDA margin mismatch detected - calculated margin implausible, keeping stated margin', {
            ...mismatchDetails,
            reason: `Calculated margin (${calculatedMargin.toFixed(1)}%) outside reasonable range (-10% to 60%) - revenue or EBITDA may be misextracted`
          });
        } else {
          logger.warn('Rejecting invalid EBITDA margin', {
            period,
            value: marginStr,
            numericValue: marginValue,
            reason: `Margin (${marginValue}%) outside reasonable range (-10% to 60%)`
          });
          periodData.ebitdaMargin = NOT_SPECIFIED;
        }
      } else if (marginDiff > 10) {
        // If difference is 10-15pp, log warning but don't auto-correct (might be legitimate)
        logger.warn('EBITDA margin mismatch detected', {
          ...mismatchDetails,
          reason: `Stated margin (${marginValue}%) differs from calculated margin (${calculatedMargin.toFixed(1)}%) - may indicate data extraction error`
        });
      } else if (!isReasonable(marginValue)) {
        // Margin matches the calculated value but is itself outside the reasonable range
        logger.warn('EBITDA margin outside reasonable range - using calculated value', {
          period,
          value: marginStr,
          numericValue: marginValue,
          calculatedMargin: `${calculatedMargin.toFixed(1)}%`,
          reason: `Stated margin (${marginValue}%) outside reasonable range (-10% to 60%), but calculated margin (${calculatedMargin.toFixed(1)}%) is valid - using calculated`
        });
        periodData.ebitdaMargin = isReasonable(calculatedMargin)
          ? `${calculatedMargin.toFixed(1)}%`
          : NOT_SPECIFIED;
      }
    } else if (!isReasonable(marginValue)) {
      // Can't calculate margin, so just check if stated margin is in reasonable range
      logger.warn('Rejecting invalid EBITDA margin', {
        period,
        value: marginStr,
        numericValue: marginValue,
        reason: `Margin (${marginValue}%) outside reasonable range (-10% to 60%)`
      });
      periodData.ebitdaMargin = NOT_SPECIFIED;
    }

    // Check margin consistency across periods (margins should be relatively stable)
    const currentMargin = periodData.ebitdaMargin;
    if (!this.hasValue(currentMargin)) return;

    // Re-read the margin because it may have been auto-corrected above
    const finalMarginValue = this.parsePercentage(currentMargin) ?? marginValue;
    const otherMargins = FINANCIAL_PERIODS
      .filter(p => p !== period)
      .map(p => {
        const margin = financials[p]?.ebitdaMargin;
        return this.hasValue(margin) ? this.parsePercentage(margin) : null;
      })
      .filter((v): v is number => v !== null);
    if (otherMargins.length === 0) return;

    const avgOtherMargin = otherMargins.reduce((sum, v) => sum + v, 0) / otherMargins.length;
    const crossPeriodDiff = Math.abs(finalMarginValue - avgOtherMargin);
    // Flag if margin differs by > 20 percentage points from average
    if (crossPeriodDiff > 20) {
      logger.warn('EBITDA margin inconsistency across periods', {
        period,
        margin: `${finalMarginValue}%`,
        avgOtherPeriods: `${avgOtherMargin.toFixed(1)}%`,
        difference: `${crossPeriodDiff.toFixed(1)}pp`,
        reason: `Margin for ${period} (${finalMarginValue}%) differs significantly from average of other periods (${avgOtherMargin.toFixed(1)}%) - may indicate extraction error`
      });
      // Don't reject - just log as warning
    }
  }

  /** Reject a revenue growth rate outside the reasonable range (-50% to 500%). */
  private validateRevenueGrowth(period: FinancialPeriod, periodData: PeriodFinancials): void {
    const growth = periodData.revenueGrowth;
    if (!this.hasValue(growth) || growth === 'N/A') return;

    const growthStr = growth.trim();
    const growthValue = this.parsePercentage(growthStr);
    if (growthValue === null) return;

    if (growthValue < -50 || growthValue > 500) {
      logger.warn('Rejecting invalid revenue growth', {
        period,
        value: growthStr,
        numericValue: growthValue,
        reason: 'Growth rate outside reasonable range'
      });
      periodData.revenueGrowth = NOT_SPECIFIED;
    }
  }

  /**
   * Generate summary from analysis data.
   *
   * Joins the target name, industry, location, LTM revenue and LTM EBITDA with " | ".
   * Missing values and the "Not specified in CIM" placeholder are skipped; when nothing
   * is available the summary is "CIM analysis completed".
   */
  private generateSummary(data: CIMReview): string {
    const ltm = data.financialSummary?.financials?.ltm;
    const candidates: Array<[label: string, value: unknown]> = [
      ['Target', data.dealOverview?.targetCompanyName],
      ['Industry', data.dealOverview?.industrySector],
      ['Location', data.dealOverview?.geography],
      ['LTM Revenue', ltm?.revenue],
      ['LTM EBITDA', ltm?.ebitda],
    ];

    const isMeaningful = (value: unknown): boolean => {
      if (typeof value === 'string') {
        const trimmed = value.trim();
        return trimmed !== '' && trimmed !== NOT_SPECIFIED;
      }
      return Boolean(value);
    };

    const parts = candidates
      .filter(([, value]) => isMeaningful(value))
      .map(([label, value]) => `${label}: ${String(value)}`);

    return parts.join(' | ') || 'CIM analysis completed';
  }
}
|
|
|
|
/** Module-level `SimpleDocumentProcessor` instance exported for importers of this file. */
export const simpleDocumentProcessor = new SimpleDocumentProcessor();
|
|
|