[\s\S]*?([^<]+)<\/strong><\/td>([\s\S]*?)<\/tr>/g);
+
+ for (const rowMatch of rowMatches) {
+ const metric = rowMatch[1].trim();
+ const valuesRow = rowMatch[2];
+ const values: string[] = [];
+ const valueMatches = valuesRow.matchAll(/([^<]+)<\/td>/g);
+ for (const valueMatch of valueMatches) {
+ values.push(valueMatch[1].trim());
+ }
+ rows.push({ metric, values });
+ }
+
+ return { periods, rows };
+}
+
+function testFinancialSummary(testName: string, data: CIMReview) {
+ console.log(`\n${'='.repeat(60)}`);
+ console.log(`Test: ${testName}`);
+ console.log('='.repeat(60));
+
+ try {
+ // Generate financial summary table directly
+ const summary = generateFinancialSummaryTable(data);
+
+ // Extract financial table
+ const table = extractFinancialTable(summary);
+
+ if (!table) {
+ console.log('❌ FAILED: No financial table found in summary');
+ return false;
+ }
+
+ console.log('\n📊 Financial Table Structure:');
+ console.log(`Periods: ${table.periods.join(' → ')}`);
+ console.log(`\nRows found:`);
+ table.rows.forEach(row => {
+ console.log(` - ${row.metric}: ${row.values.join(' | ')}`);
+ });
+
+ // Test 1: Period ordering (should be in chronological order: FY3 → FY2 → FY1 → LTM)
+ // But only include periods that have data
+ const expectedOrder = ['FY3', 'FY2', 'FY1', 'LTM'];
+ const actualOrder = table.periods.filter(p => expectedOrder.includes(p));
+
+ // Check that the periods which are present appear in chronological order
+ // If we have FY2, FY1, LTM, that's correct - they're in order
+ // FY3, FY1, LTM also passes (a gap is not detected); only a reversed or repeated period fails
+ let isOrderCorrect = true;
+ for (let i = 0; i < actualOrder.length - 1; i++) {
+ const currentIndex = expectedOrder.indexOf(actualOrder[i]);
+ const nextIndex = expectedOrder.indexOf(actualOrder[i + 1]);
+ if (nextIndex <= currentIndex) {
+ isOrderCorrect = false;
+ break;
+ }
+ }
+
+ console.log(`\n✅ Period Order Check:`);
+ console.log(` Expected order: ${expectedOrder.join(' → ')}`);
+ console.log(` Actual periods: ${table.periods.join(' → ')}`);
+ console.log(` ${isOrderCorrect ? '✅ PASS (periods in correct chronological order)' : '❌ FAIL (periods out of order)'}`);
+
+ // Test 2: Check for required metrics
+ const requiredMetrics = ['Revenue', 'Gross Profit', 'Gross Margin', 'EBITDA', 'EBITDA Margin', 'Revenue Growth'];
+ const foundMetrics = table.rows.map(r => r.metric);
+ const missingMetrics = requiredMetrics.filter(m => !foundMetrics.includes(m));
+
+ console.log(`\n✅ Required Metrics Check:`);
+ console.log(` Found: ${foundMetrics.join(', ')}`);
+ if (missingMetrics.length > 0) {
+ console.log(` Missing: ${missingMetrics.join(', ')}`);
+ console.log(` ⚠️ WARNING: Some metrics missing (may be intentional if data not available)`);
+ } else {
+ console.log(` ✅ PASS: All required metrics present`);
+ }
+
+ // Test 3: Check that values align with periods
+ const allRowsHaveCorrectValueCount = table.rows.every(row => row.values.length === table.periods.length);
+ console.log(`\n✅ Value Alignment Check:`);
+ console.log(` Each row has ${table.periods.length} values (one per period)`);
+ console.log(` ${allRowsHaveCorrectValueCount ? '✅ PASS' : '❌ FAIL'}`);
+
+ // Test 4: Check for "Not specified" or empty values
+ const hasEmptyValues = table.rows.some(row => row.values.some(v => v === '-' || v === 'Not specified in CIM'));
+ if (hasEmptyValues) {
+ console.log(`\n⚠️ Note: Some values are marked as '-' or 'Not specified in CIM'`);
+ }
+
+ return isOrderCorrect && allRowsHaveCorrectValueCount;
+ } catch (error) {
+ console.log(`\n❌ ERROR: ${error instanceof Error ? error.message : String(error)}`);
+ if (error instanceof Error && error.stack) {
+ console.log(`\nStack trace:\n${error.stack}`);
+ }
+ return false;
+ }
+}
+
+async function runTests() {
+ console.log('\n🧪 Financial Summary Workflow Test');
+ console.log('===================================\n');
+
+ const results: Array<{ name: string; passed: boolean }> = [];
+
+ // Test 1: Complete financial data
+ results.push({
+ name: 'Complete Financial Data (All Periods & Metrics)',
+ passed: testFinancialSummary('Complete Financial Data', sampleFinancialData)
+ });
+
+ // Test 2: Partial periods
+ results.push({
+ name: 'Partial Periods (Missing FY3)',
+ passed: testFinancialSummary('Partial Periods', sampleFinancialDataPartial)
+ });
+
+ // Test 3: Missing some metrics
+ results.push({
+ name: 'Missing Some Metrics (No Gross Profit/Margin)',
+ passed: testFinancialSummary('Missing Metrics', sampleFinancialDataMissingMetrics)
+ });
+
+ // Summary
+ console.log(`\n${'='.repeat(60)}`);
+ console.log('Test Summary');
+ console.log('='.repeat(60));
+ results.forEach((result, index) => {
+ console.log(`${index + 1}. ${result.name}: ${result.passed ? '✅ PASS' : '❌ FAIL'}`);
+ });
+
+ const allPassed = results.every(r => r.passed);
+ console.log(`\n${allPassed ? '✅ All tests passed!' : '❌ Some tests failed'}\n`);
+
+ process.exit(allPassed ? 0 : 1);
+}
+
+// Run tests
+runTests().catch(error => {
+ logger.error('Test execution failed', { error: error instanceof Error ? error.message : String(error) });
+ console.error('❌ Test execution failed:', error);
+ process.exit(1);
+});
+
diff --git a/backend/src/services/financialTableParser.ts b/backend/src/services/financialTableParser.ts
index ef825ca..d73f7e7 100644
--- a/backend/src/services/financialTableParser.ts
+++ b/backend/src/services/financialTableParser.ts
@@ -85,6 +85,7 @@ function yearTokensToBuckets(tokens: string[]): Array {
const bucketAssignments: Array = new Array(tokens.length).fill(null);
const ltmIndices: number[] = [];
+ // First pass: Identify LTM/TTM periods
tokens.forEach((token, index) => {
if (token.includes('LTM') || token.includes('TTM')) {
bucketAssignments[index] = 'ltm';
@@ -92,19 +93,43 @@ function yearTokensToBuckets(tokens: string[]): Array {
}
});
+ // Get non-LTM indices (these should be fiscal years)
const nonLtmIndices = tokens
.map((token, index) => ({ token, index }))
.filter(({ index }) => !ltmIndices.includes(index));
+ // Handle edge cases: tables with only 2-3 periods (not all 4)
+ // Strategy: Assign FY buckets from most recent to oldest (FY1, FY2, FY3)
+ // If we have 3 years: assign FY1, FY2, FY3
+ // If we have 2 years: assign FY1, FY2
+ // If we have 1 year: assign FY1
const fyBuckets: Bucket[] = ['fy1', 'fy2', 'fy3'];
let fyIndex = 0;
+ // Assign from most recent (rightmost) to oldest (leftmost)
+ // This matches typical table layout: oldest year on left, newest on right
for (let i = nonLtmIndices.length - 1; i >= 0 && fyIndex < fyBuckets.length; i--) {
const { index } = nonLtmIndices[i];
bucketAssignments[index] = fyBuckets[fyIndex];
fyIndex++;
}
+ // Validation: Log unusual period counts (at most 3 FY buckets are assigned above, so > 4 requires two or more LTM/TTM tokens)
+ const assignedBuckets = bucketAssignments.filter(Boolean);
+ if (assignedBuckets.length < 2) {
+ logger.debug('Financial parser: Few periods detected', {
+ totalTokens: tokens.length,
+ assignedBuckets: assignedBuckets.length,
+ tokens: tokens.slice(0, 10)
+ });
+ } else if (assignedBuckets.length > 4) {
+ logger.debug('Financial parser: Many periods detected - may include projections', {
+ totalTokens: tokens.length,
+ assignedBuckets: assignedBuckets.length,
+ tokens: tokens.slice(0, 10)
+ });
+ }
+
return bucketAssignments;
}
@@ -160,21 +185,80 @@ function isPercentLike(value?: string): boolean {
function assignTokensToBuckets(
tokens: string[],
buckets: Array,
- mapper: (bucket: Bucket, value: string) => void
+ mapper: (bucket: Bucket, value: string) => void,
+ fieldName?: string,
+ lineIndex?: number
) {
- // Only assign tokens that align with non-null buckets (skip columns)
- // This ensures we don't assign data to skipped columns (like projections)
+ // Count non-null buckets (actual periods we want to extract)
+ const validBuckets = buckets.filter(Boolean).length;
+
+ // Validation: Check if token count matches expected bucket count
+ // Allow some flexibility - tokens can be within 1 of valid buckets (handles missing values)
+ if (tokens.length < validBuckets - 1) {
+ logger.debug('Financial parser: Token count mismatch - too few tokens', {
+ field: fieldName,
+ lineIndex,
+ tokensFound: tokens.length,
+ validBuckets,
+ tokens: tokens.slice(0, 10),
+ buckets: buckets.map(b => b || 'skip')
+ });
+ // Still try to assign what we have, but log the issue
+ } else if (tokens.length > validBuckets + 1) {
+ logger.debug('Financial parser: Token count mismatch - too many tokens', {
+ field: fieldName,
+ lineIndex,
+ tokensFound: tokens.length,
+ validBuckets,
+ tokens: tokens.slice(0, 10),
+ buckets: buckets.map(b => b || 'skip')
+ });
+ // Take only the first N tokens that match buckets
+ }
+
+ // Map tokens to buckets by position
+ // Strategy: Match tokens sequentially to non-null buckets
let tokenIndex = 0;
for (let i = 0; i < buckets.length && tokenIndex < tokens.length; i++) {
const bucket = buckets[i];
if (!bucket) {
// Skip this column (it's a projection or irrelevant period)
- // Don't increment tokenIndex - the token might belong to the next bucket
+ // NOTE(review): a skipped bucket does NOT consume a token here. tokenIndex only
+ // advances when a non-null bucket is assigned, so it always equals the number of
+ // non-null buckets before position i and the guard below can never be true.
+ // Tokens are therefore matched sequentially to non-null buckets, exactly as before this change.
+ const nonNullBucketsBefore = buckets.slice(0, i).filter(Boolean).length;
+ if (tokenIndex < nonNullBucketsBefore) {
+ // We're behind - this might be a missing value, skip the token
+ tokenIndex++;
+ }
continue;
}
+
// Assign the token to this bucket
- mapper(bucket, tokens[tokenIndex]);
- tokenIndex++;
+ if (tokenIndex < tokens.length) {
+ mapper(bucket, tokens[tokenIndex]);
+ tokenIndex++;
+ } else {
+ // NOTE(review): unreachable - the loop condition already guarantees tokenIndex < tokens.length here
+ logger.debug('Financial parser: Missing token for bucket', {
+ field: fieldName,
+ bucket,
+ bucketIndex: i,
+ tokensFound: tokens.length
+ });
+ }
+ }
+
+ // Log if we didn't use all tokens (might indicate misalignment)
+ if (tokenIndex < tokens.length && tokens.length > validBuckets) {
+ logger.debug('Financial parser: Unused tokens detected', {
+ field: fieldName,
+ tokensUsed: tokenIndex,
+ tokensTotal: tokens.length,
+ validBuckets,
+ unusedTokens: tokens.slice(tokenIndex)
+ });
}
}
@@ -384,12 +468,19 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
line: line.substring(0, 150),
nextLine: nextLine.substring(0, 100),
tokensFound: tokens.length,
- tokens: tokens.slice(0, 10) // Limit token logging
+ tokens: tokens.slice(0, 10), // Limit token logging
+ buckets: bestBuckets.map(b => b || 'skip')
});
- assignTokensToBuckets(tokens, bestBuckets, (bucket, value) => {
- bucketSetters[field](bucket, value);
- });
+ assignTokensToBuckets(
+ tokens,
+ bestBuckets,
+ (bucket, value) => {
+ bucketSetters[field](bucket, value);
+ },
+ field,
+ i
+ );
}
}
diff --git a/backend/src/services/llmService.ts b/backend/src/services/llmService.ts
index f87a8c9..d61b124 100644
--- a/backend/src/services/llmService.ts
+++ b/backend/src/services/llmService.ts
@@ -2069,6 +2069,103 @@ IMPORTANT: Replace all placeholder text with actual information from the CIM doc
const validation = cimReviewSchema.safeParse(financialData);
if (validation.success) {
+ // Post-extraction validation: Check that values make sense
+ const financials = financialData.financialSummary?.financials;
+ if (financials) {
+ const validationIssues: string[] = [];
+
+ // Helper to extract numeric value from financial string
+ const extractNumericValue = (value: string): number | null => {
+ if (!value || value === 'Not specified in CIM' || value.includes('Not specified')) {
+ return null;
+ }
+ let cleaned = value.replace(/[$,\s()]/g, '');
+ let multiplier = 1;
+ if (cleaned.toLowerCase().endsWith('k')) {
+ multiplier = 1000;
+ cleaned = cleaned.slice(0, -1);
+ } else if (cleaned.toLowerCase().endsWith('m')) {
+ multiplier = 1000000;
+ cleaned = cleaned.slice(0, -1);
+ } else if (cleaned.toLowerCase().endsWith('b')) {
+ multiplier = 1000000000;
+ cleaned = cleaned.slice(0, -1);
+ }
+ const isNegative = cleaned.startsWith('-');
+ if (isNegative) cleaned = cleaned.substring(1);
+ const num = parseFloat(cleaned);
+ return isNaN(num) ? null : (isNegative ? -1 : 1) * num * multiplier;
+ };
+
+ // Cross-period validation: Check revenue trends
+ const revenues: Array<{ period: string; value: number }> = [];
+ ['fy3', 'fy2', 'fy1', 'ltm'].forEach(period => {
+ const rev = financials[period as keyof typeof financials]?.revenue;
+ if (rev) {
+ const numValue = extractNumericValue(rev);
+ if (numValue !== null && numValue > 0) {
+ revenues.push({ period, value: numValue });
+ }
+ }
+ });
+
+ // Check for unreasonable revenue values (< $5M suggests wrong table)
+ revenues.forEach(({ period, value }) => {
+ if (value < 5000000) {
+ validationIssues.push(`Revenue for ${period} is suspiciously low ($${(value / 1000000).toFixed(1)}M) - may be from wrong table`);
+ }
+ });
+
+ // Check for unreasonable growth rates (suggests misaligned columns)
+ for (let i = 1; i < revenues.length; i++) {
+ const prev = revenues[i - 1];
+ const curr = revenues[i];
+ const growth = ((curr.value - prev.value) / prev.value) * 100;
+ if (Math.abs(growth) > 200) {
+ validationIssues.push(`Unusual revenue growth between ${prev.period} and ${curr.period} (${growth.toFixed(1)}%) - may indicate misaligned columns`);
+ }
+ }
+
+ // Check EBITDA margins are reasonable
+ ['fy3', 'fy2', 'fy1', 'ltm'].forEach(period => {
+ const periodData = financials[period as keyof typeof financials];
+ if (periodData?.revenue && periodData?.ebitda && periodData?.ebitdaMargin) {
+ const revValue = extractNumericValue(periodData.revenue);
+ const ebitdaValue = extractNumericValue(periodData.ebitda);
+ const marginValue = parseFloat(periodData.ebitdaMargin.replace('%', ''));
+
+ if (revValue !== null && ebitdaValue !== null && !isNaN(marginValue)) {
+ const calculatedMargin = (ebitdaValue / revValue) * 100;
+ const marginDiff = Math.abs(calculatedMargin - marginValue);
+
+ // If margin difference is > 5 percentage points, there may be an issue
+ if (marginDiff > 5 && revValue > 0) {
+ validationIssues.push(`EBITDA margin mismatch for ${period}: stated ${marginValue}% vs calculated ${calculatedMargin.toFixed(1)}%`);
+ }
+
+ // Check margin is in reasonable range
+ if (marginValue < 0 || marginValue > 60) {
+ validationIssues.push(`EBITDA margin for ${period} is outside typical range (${marginValue}%)`);
+ }
+ }
+ }
+ });
+
+ if (validationIssues.length > 0) {
+ logger.warn('Financial extraction post-validation found issues', {
+ attempt,
+ issues: validationIssues,
+ financials: {
+ fy3: financials.fy3,
+ fy2: financials.fy2,
+ fy1: financials.fy1,
+ ltm: financials.ltm
+ }
+ });
+ // Don't fail - just log the issues. The values might still be usable.
+ }
+ }
+
logger.info(`Financial extraction completed successfully on attempt ${attempt}`);
return {
success: true,
@@ -2137,35 +2234,80 @@ ${parserContext}CRITICAL FINANCIAL EXTRACTION RULES:
- The PRIMARY table is usually in the main financial section, not appendices
- VALIDATION RULE: If revenue values are less than $10M, you are likely extracting from the wrong table - search for the main table with values typically $20M-$1B+
-**Step 2: Identify Periods (Flexible Approach)**
+**Step 2: Identify Periods (CRITICAL - Chronological Order)**
Financial tables can have different formats. Here's how to map them:
+IMPORTANT: Periods must be in chronological order (oldest to newest):
+- FY-3 = Oldest year (3 years ago)
+- FY-2 = Second oldest year (2 years ago)
+- FY-1 = Most recent full fiscal year (1 year ago, most recent complete year)
+- LTM = Look for "LTM", "TTM", "Last Twelve Months", or trailing period (most recent)
+
*Format A: Years shown (2021, 2022, 2023, 2024)*
-- FY-3 = Oldest year (e.g., 2021 or 2022)
-- FY-2 = Second oldest year (e.g., 2022 or 2023)
-- FY-1 = Most recent full fiscal year (e.g., 2023 or 2024)
-- LTM = Look for "LTM", "TTM", "Last Twelve Months", or trailing period
+- Identify the OLDEST year = FY-3
+- Identify the SECOND OLDEST year = FY-2
+- Identify the MOST RECENT FULL YEAR = FY-1
+- Identify LTM/TTM if present = LTM
+- Example: "2021 2022 2023 2024" → FY-3=2021, FY-2=2022, FY-1=2023, LTM=2024 (if labeled as LTM)
*Format B: Periods shown (FY-3, FY-2, FY-1, LTM)*
-- Use them directly as labeled
+- Use them directly as labeled (they're already in correct format)
*Format C: Mixed (2023, 2024, LTM Mar-25, 2025E)*
-- Use actual years for FY-3, FY-2, FY-1
+- Use actual years for FY-3, FY-2, FY-1 (oldest to newest)
- Use LTM/TTM for LTM
- IGNORE anything with "E", "P", "PF" (estimates/projections)
-**Step 3: Extract Values Carefully**
-- Read from the CORRECT column for each period
+*Format D: Only 2-3 periods (not all 4)*
+- If only 2 years: assign FY-1 (most recent) and FY-2 (older)
+- If only 3 years: assign FY-1 (most recent), FY-2 (middle), FY-3 (oldest)
+
+**Step 3: Extract Values Carefully - Column Alignment is CRITICAL**
+- Read from the CORRECT column for each period - this is the most common error!
+- Tables are typically laid out: [Oldest Year] [Second Oldest] [Most Recent] [LTM]
+- Match each value to its correct period by column position
- Extract EXACT values as shown ($64M, $71M, 29.3%, etc.)
- Preserve the format (don't convert $64M to $64,000,000)
- If values are in thousands format (e.g., "$20,546 (in thousands)"), convert to millions: $20,546K = $20.5M
-**Step 4: Validate Your Extraction**
-- Check that values make sense: If FY-3 revenue is $64M, FY-2 should be similar magnitude (e.g., $50M-$90M), not $2.9M or $10
-- Revenue should typically be $10M+ for target companies (if less, you're likely using wrong table)
-- EBITDA should typically be $1M+ and positive
-- Margins should be 5-50% for EBITDA margin
-- If values seem wrong, you may have misaligned columns - double-check
+COLUMN ALIGNMENT CHECKLIST:
+1. Count the columns in the header row
+2. Count the values in each data row
+3. Ensure values align with their corresponding period columns
+4. If a row has fewer values than columns, the missing values are likely at the start (oldest periods, leftmost columns)
+5. If values seem misaligned, double-check by comparing revenue trends (should generally increase or be stable)
+
+**Step 4: Validate Your Extraction - Run These Checks**
+
+CRITICAL VALIDATION CHECKS (run these before finalizing):
+
+1. **Magnitude Check:**
+ - Revenue should typically be $10M+ for target companies (if less, you're likely using wrong table)
+ - EBITDA should typically be $1M+ and positive
+ - If FY-3 revenue is $64M, FY-2 should be similar magnitude (e.g., $50M-$90M), not $2.9M or $10
+
+2. **Trend Check:**
+ - Revenue should generally increase or be stable year-over-year (FY-3 → FY-2 → FY-1)
+ - Large sudden drops (>50%) or increases (>200%) may indicate misaligned columns
+ - EBITDA should follow similar trends to revenue
+
+3. **Margin Check:**
+ - EBITDA margins should be 5-50% (typical range)
+ - Gross margins should be 20-80% (typical range)
+ - Margins should be relatively stable across periods (within 10-15 percentage points)
+
+4. **Cross-Period Validation:**
+ - If FY-3 revenue = $64M and FY-2 revenue = $71M, growth should be ~11% (not 1000% or -50%)
+ - If revenue values don't make sense relative to each other, you likely misaligned columns
+
+5. **Missing Values:**
+ - If a period has no value, use "Not specified in CIM" (don't make up values)
+ - FY-3 may legitimately have "N/A" for revenueGrowth (it's the baseline year)
+
+If ANY validation check fails, you likely have:
+- Wrong table (subsidiary instead of primary)
+- Misaligned columns (values in wrong period columns)
+- Extraction error (read the table again carefully)
**Step 5: If Uncertain**
- If you can't find the PRIMARY table, can't identify periods clearly, or values don't make sense → use "Not specified in CIM"
@@ -2212,12 +2354,38 @@ Revenue Row: "$64M $71M $76M $85M"
EBITDA Row: "$19M $24M $27M $30M"
Correct Extraction:
-- FY-3 = 2023 = $64M revenue, $19M EBITDA
-- FY-2 = 2024 = $71M revenue, $24M EBITDA
-- FY-1 = 2024 = $71M revenue, $24M EBITDA (most recent full year)
-- LTM = LTM Mar-25 = $76M revenue, $27M EBITDA
+- FY-3 = Not specified in CIM (only 2 full fiscal years provided)
+- FY-2 = 2023 = $64M revenue, $19M EBITDA (older full year)
+- FY-1 = 2024 = $71M revenue, $24M EBITDA (most recent full year)
+- LTM = LTM Mar-25 = $76M revenue, $27M EBITDA (most recent trailing period)
- IGNORE 2025E (projection, marked with "E")
+**Example 5: Column Misalignment Error (WRONG - Don't Do This)**
+Table Header: "FY-3 FY-2 FY-1 LTM"
+Revenue Row: "$64M $71M $76M $85M"
+EBITDA Row: "$19M $24M $27M $30M"
+
+WRONG Extraction (misaligned):
+- FY-3 = $71M revenue (WRONG - this is FY-2's value!)
+- FY-2 = $76M revenue (WRONG - this is FY-1's value!)
+
+CORRECT Extraction (properly aligned):
+- FY-3 = $64M revenue, $19M EBITDA (first column)
+- FY-2 = $71M revenue, $24M EBITDA (second column)
+- FY-1 = $76M revenue, $27M EBITDA (third column)
+- LTM = $85M revenue, $30M EBITDA (fourth column)
+
+**Example 6: Only 2 Periods (Edge Case)**
+Table Header: "2023 2024"
+Revenue Row: "$64M $71M"
+EBITDA Row: "$19M $24M"
+
+Correct Extraction:
+- FY-3 = Not specified in CIM (only 2 years provided)
+- FY-2 = 2023 = $64M revenue, $19M EBITDA (older year)
+- FY-1 = 2024 = $71M revenue, $24M EBITDA (most recent year)
+- LTM = Not specified in CIM (no LTM column)
+
CIM Document Text:
${text}
diff --git a/backend/src/services/optimizedAgenticRAGProcessor.ts b/backend/src/services/optimizedAgenticRAGProcessor.ts
index ba729ef..f875a6b 100644
--- a/backend/src/services/optimizedAgenticRAGProcessor.ts
+++ b/backend/src/services/optimizedAgenticRAGProcessor.ts
@@ -1020,79 +1020,104 @@ export class OptimizedAgenticRAGProcessor {
summary += `## Financial Summary\n\n`;
const financials = analysisData.financialSummary.financials;
- // Create financial table
- summary += `\n`;
- summary += `\n\n| Metric | \n`;
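+ // Helper: true if any metric of the period is a non-empty string (placeholders such as 'Not specified in CIM' still count here, although getValue renders them as '-')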
+ const hasAnyMetric = (period: 'fy3' | 'fy2' | 'fy1' | 'ltm'): boolean => {
+ const periodData = financials[period];
+ if (!periodData) return false;
+ return !!(
+ periodData.revenue ||
+ periodData.revenueGrowth ||
+ periodData.grossProfit ||
+ periodData.grossMargin ||
+ periodData.ebitda ||
+ periodData.ebitdaMargin
+ );
+ };
- const periods: string[] = [];
- if (financials.fy1) periods.push('FY1');
- if (financials.fy2) periods.push('FY2');
- if (financials.fy3) periods.push('FY3');
- if (financials.ltm) periods.push('LTM');
+ // Build periods array in chronological order (oldest to newest): FY3 → FY2 → FY1 → LTM
+ // Only include periods that have at least one non-empty metric
+ const periods: Array<{ key: 'fy3' | 'fy2' | 'fy1' | 'ltm'; label: string }> = [];
+ if (hasAnyMetric('fy3')) periods.push({ key: 'fy3', label: 'FY3' });
+ if (hasAnyMetric('fy2')) periods.push({ key: 'fy2', label: 'FY2' });
+ if (hasAnyMetric('fy1')) periods.push({ key: 'fy1', label: 'FY1' });
+ if (hasAnyMetric('ltm')) periods.push({ key: 'ltm', label: 'LTM' });
- periods.forEach(period => {
- summary += `${period} | \n`;
- });
- summary += ` \n\n\n`;
-
- // Revenue row
- if (financials.fy1?.revenue || financials.fy2?.revenue || financials.fy3?.revenue || financials.ltm?.revenue) {
- summary += `\n| Revenue | \n`;
+ // Only create table if we have at least one period with data
+ if (periods.length > 0) {
+ // Create financial table
+ summary += `\n`;
+ summary += `\n\n| Metric | \n`;
+
periods.forEach(period => {
- let value = '-';
- if (period === 'FY1' && financials.fy1?.revenue) value = financials.fy1.revenue;
- else if (period === 'FY2' && financials.fy2?.revenue) value = financials.fy2.revenue;
- else if (period === 'FY3' && financials.fy3?.revenue) value = financials.fy3.revenue;
- else if (period === 'LTM' && financials.ltm?.revenue) value = financials.ltm.revenue;
- summary += `${value} | \n`;
+ summary += `${period.label} | \n`;
});
- summary += ` \n`;
+ summary += `\n\n\n`;
+
+ // Helper function to get value for a period and metric
+ const getValue = (periodKey: 'fy3' | 'fy2' | 'fy1' | 'ltm', metric: keyof typeof financials.fy1): string => {
+ const periodData = financials[periodKey];
+ if (!periodData) return '-';
+ const value = periodData[metric];
+ return value && value.trim() && value !== 'Not specified in CIM' ? value : '-';
+ };
+
+ // Revenue row
+ if (financials.fy1?.revenue || financials.fy2?.revenue || financials.fy3?.revenue || financials.ltm?.revenue) {
+ summary += `\n| Revenue | \n`;
+ periods.forEach(period => {
+ summary += `${getValue(period.key, 'revenue')} | \n`;
+ });
+ summary += ` \n`;
+ }
+
+ // Gross Profit row
+ if (financials.fy1?.grossProfit || financials.fy2?.grossProfit || financials.fy3?.grossProfit || financials.ltm?.grossProfit) {
+ summary += `\n| Gross Profit | \n`;
+ periods.forEach(period => {
+ summary += `${getValue(period.key, 'grossProfit')} | \n`;
+ });
+ summary += ` \n`;
+ }
+
+ // Gross Margin row
+ if (financials.fy1?.grossMargin || financials.fy2?.grossMargin || financials.fy3?.grossMargin || financials.ltm?.grossMargin) {
+ summary += `\n| Gross Margin | \n`;
+ periods.forEach(period => {
+ summary += `${getValue(period.key, 'grossMargin')} | \n`;
+ });
+ summary += ` \n`;
+ }
+
+ // EBITDA row
+ if (financials.fy1?.ebitda || financials.fy2?.ebitda || financials.fy3?.ebitda || financials.ltm?.ebitda) {
+ summary += `\n| EBITDA | \n`;
+ periods.forEach(period => {
+ summary += `${getValue(period.key, 'ebitda')} | \n`;
+ });
+ summary += ` \n`;
+ }
+
+ // EBITDA Margin row
+ if (financials.fy1?.ebitdaMargin || financials.fy2?.ebitdaMargin || financials.fy3?.ebitdaMargin || financials.ltm?.ebitdaMargin) {
+ summary += `\n| EBITDA Margin | \n`;
+ periods.forEach(period => {
+ summary += `${getValue(period.key, 'ebitdaMargin')} | \n`;
+ });
+ summary += ` \n`;
+ }
+
+ // Revenue Growth row
+ if (financials.fy1?.revenueGrowth || financials.fy2?.revenueGrowth || financials.fy3?.revenueGrowth || financials.ltm?.revenueGrowth) {
+ summary += `\n| Revenue Growth | \n`;
+ periods.forEach(period => {
+ summary += `${getValue(period.key, 'revenueGrowth')} | \n`;
+ });
+ summary += ` \n`;
+ }
+
+ summary += `\n \n\n`;
}
- // EBITDA row
- if (financials.fy1?.ebitda || financials.fy2?.ebitda || financials.fy3?.ebitda || financials.ltm?.ebitda) {
- summary += ` \n| EBITDA | \n`;
- periods.forEach(period => {
- let value = '-';
- if (period === 'FY1' && financials.fy1?.ebitda) value = financials.fy1.ebitda;
- else if (period === 'FY2' && financials.fy2?.ebitda) value = financials.fy2.ebitda;
- else if (period === 'FY3' && financials.fy3?.ebitda) value = financials.fy3.ebitda;
- else if (period === 'LTM' && financials.ltm?.ebitda) value = financials.ltm.ebitda;
- summary += `${value} | \n`;
- });
- summary += ` \n`;
- }
-
- // EBITDA Margin row
- if (financials.fy1?.ebitdaMargin || financials.fy2?.ebitdaMargin || financials.fy3?.ebitdaMargin || financials.ltm?.ebitdaMargin) {
- summary += `\n| EBITDA Margin | \n`;
- periods.forEach(period => {
- let value = '-';
- if (period === 'FY1' && financials.fy1?.ebitdaMargin) value = financials.fy1.ebitdaMargin;
- else if (period === 'FY2' && financials.fy2?.ebitdaMargin) value = financials.fy2.ebitdaMargin;
- else if (period === 'FY3' && financials.fy3?.ebitdaMargin) value = financials.fy3.ebitdaMargin;
- else if (period === 'LTM' && financials.ltm?.ebitdaMargin) value = financials.ltm.ebitdaMargin;
- summary += `${value} | \n`;
- });
- summary += ` \n`;
- }
-
- // Revenue Growth row
- if (financials.fy1?.revenueGrowth || financials.fy2?.revenueGrowth || financials.fy3?.revenueGrowth || financials.ltm?.revenueGrowth) {
- summary += `\n| Revenue Growth | \n`;
- periods.forEach(period => {
- let value = '-';
- if (period === 'FY1' && financials.fy1?.revenueGrowth) value = financials.fy1.revenueGrowth;
- else if (period === 'FY2' && financials.fy2?.revenueGrowth) value = financials.fy2.revenueGrowth;
- else if (period === 'FY3' && financials.fy3?.revenueGrowth) value = financials.fy3.revenueGrowth;
- else if (period === 'LTM' && financials.ltm?.revenueGrowth) value = financials.ltm.revenueGrowth;
- summary += `${value} | \n`;
- });
- summary += ` \n`;
- }
-
- summary += `\n \n\n`;
-
// Add financial notes
if (analysisData.financialSummary.qualityOfEarnings) {
summary += `**Quality of Earnings:** ${analysisData.financialSummary.qualityOfEarnings}\n\n`;
diff --git a/backend/src/services/simpleDocumentProcessor.ts b/backend/src/services/simpleDocumentProcessor.ts
index 549eed2..c62c0c7 100644
--- a/backend/src/services/simpleDocumentProcessor.ts
+++ b/backend/src/services/simpleDocumentProcessor.ts
@@ -527,29 +527,61 @@ Focus on finding these specific fields in the document. Extract exact values, nu
}
}
- // Cross-validate: If we have other periods, check for consistency
- // If FY-3 is $64M but FY-2 is $2.9M, that's a red flag
+ // Cross-validate: Check consistency across periods
+ // Enhanced validation: Check trends and detect misaligned columns
const otherPeriods = periods.filter(p => p !== period && financials[p]?.revenue);
if (otherPeriods.length > 0 && periodData.revenue && periodData.revenue !== 'Not specified in CIM') {
const currentValue = extractNumericValue(periodData.revenue);
- if (currentValue !== null) {
+ if (currentValue !== null && currentValue > 0) {
const otherValues = otherPeriods
- .map(p => extractNumericValue(financials[p]!.revenue || ''))
- .filter((v): v is number => v !== null);
+ .map(p => {
+ const val = extractNumericValue(financials[p]!.revenue || '');
+ return val !== null && val > 0 ? { period: p, value: val } : null;
+ })
+ .filter((v): v is { period: string; value: number } => v !== null);
if (otherValues.length > 0) {
- const avgOtherValue = otherValues.reduce((a, b) => a + b, 0) / otherValues.length;
- // If current value is less than 20% of average, it's likely wrong
- if (currentValue > 0 && avgOtherValue > 0 && currentValue < avgOtherValue * 0.2) {
+ const avgOtherValue = otherValues.reduce((a, b) => a + b.value, 0) / otherValues.length;
+ const maxOtherValue = Math.max(...otherValues.map(v => v.value));
+ const minOtherValue = Math.min(...otherValues.map(v => v.value));
+
+ // Check 1: Value is too small compared to other periods (likely wrong column)
+ if (currentValue < avgOtherValue * 0.2) {
logger.warn('Rejecting revenue value - inconsistent with other periods', {
period,
value: periodData.revenue,
numericValue: currentValue,
avgOtherPeriods: avgOtherValue,
- reason: 'Value is too small compared to other periods - likely wrong column'
+ maxOtherPeriods: maxOtherValue,
+ minOtherPeriods: minOtherValue,
+ reason: `Value ($${(currentValue / 1000000).toFixed(1)}M) is <20% of average ($${(avgOtherValue / 1000000).toFixed(1)}M) - likely wrong column or misaligned extraction`
});
periodData.revenue = 'Not specified in CIM';
}
+
+ // Check 2: Detect unusual growth patterns (suggests misaligned columns)
+ // Find adjacent periods to check growth
+ const periodOrder = ['fy3', 'fy2', 'fy1', 'ltm'];
+ const currentIndex = periodOrder.indexOf(period);
+ if (currentIndex > 0) {
+ const prevPeriod = periodOrder[currentIndex - 1];
+ const prevValue = extractNumericValue(financials[prevPeriod]?.revenue || '');
+ if (prevValue !== null && prevValue > 0) {
+ const growth = ((currentValue - prevValue) / prevValue) * 100;
+ // Flag if growth is >200% or < -50% (unusual for year-over-year)
+ if (growth > 200 || growth < -50) {
+ logger.warn('Detected unusual revenue growth pattern - may indicate misaligned columns', {
+ period,
+ prevPeriod,
+ currentValue: currentValue,
+ prevValue: prevValue,
+ growth: `${growth.toFixed(1)}%`,
+ reason: `Unusual growth (${growth > 0 ? '+' : ''}${growth.toFixed(1)}%) between ${prevPeriod} and ${period} - may indicate column misalignment`
+ });
+ // Don't reject - just log as warning, as this might be legitimate
+ }
+ }
+ }
}
}
}
@@ -581,23 +613,70 @@ Focus on finding these specific fields in the document. Extract exact values, nu
}
}
- // Validate margins - should be reasonable percentages
+ // Validate margins - should be reasonable percentages and consistent across periods
if (periodData.ebitdaMargin && periodData.ebitdaMargin !== 'Not specified in CIM') {
const marginStr = periodData.ebitdaMargin.trim();
// Extract numeric value
const marginMatch = marginStr.match(/(-?\d+(?:\.\d+)?)/);
if (marginMatch) {
const marginValue = parseFloat(marginMatch[1]);
- // Reject margins outside reasonable range (-10% to 100%)
+ // Reject margins outside reasonable range (-10% to 60%)
// Negative margins are possible but should be within reason
- if (marginValue < -10 || marginValue > 100) {
+ if (marginValue < -10 || marginValue > 60) {
logger.warn('Rejecting invalid EBITDA margin', {
period,
value: marginStr,
numericValue: marginValue,
- reason: 'Margin outside reasonable range (-10% to 100%)'
+ reason: `Margin (${marginValue}%) outside reasonable range (-10% to 60%)`
});
periodData.ebitdaMargin = 'Not specified in CIM';
+ } else {
+ // Cross-validate: Check margin consistency with revenue and EBITDA
+ const revValue = extractNumericValue(periodData.revenue || '');
+ const ebitdaValue = extractNumericValue(periodData.ebitda || '');
+ if (revValue !== null && ebitdaValue !== null && revValue > 0) {
+ const calculatedMargin = (ebitdaValue / revValue) * 100;
+ const marginDiff = Math.abs(calculatedMargin - marginValue);
+ // If margin difference is > 10 percentage points, flag it
+ if (marginDiff > 10) {
+ logger.warn('EBITDA margin mismatch detected', {
+ period,
+ statedMargin: `${marginValue}%`,
+ calculatedMargin: `${calculatedMargin.toFixed(1)}%`,
+ difference: `${marginDiff.toFixed(1)}pp`,
+ revenue: periodData.revenue,
+ ebitda: periodData.ebitda,
+ reason: `Stated margin (${marginValue}%) differs significantly from calculated margin (${calculatedMargin.toFixed(1)}%) - may indicate data extraction error`
+ });
+ // Don't reject - just log as warning
+ }
+ }
+
+ // Check margin consistency across periods (margins should be relatively stable)
+ const otherMargins = otherPeriods
+ .map(p => {
+ const margin = financials[p]?.ebitdaMargin;
+ if (!margin || margin === 'Not specified in CIM') return null;
+ const match = margin.match(/(-?\d+(?:\.\d+)?)/);
+ return match ? parseFloat(match[1]) : null;
+ })
+ .filter((v): v is number => v !== null);
+
+ if (otherMargins.length > 0) {
+ const avgOtherMargin = otherMargins.reduce((a, b) => a + b, 0) / otherMargins.length;
+ const marginDiff = Math.abs(marginValue - avgOtherMargin);
+ // Flag if margin differs by > 20 percentage points from average
+ if (marginDiff > 20) {
+ logger.warn('EBITDA margin inconsistency across periods', {
+ period,
+ margin: `${marginValue}%`,
+ avgOtherPeriods: `${avgOtherMargin.toFixed(1)}%`,
+ difference: `${marginDiff.toFixed(1)}pp`,
+ reason: `Margin for ${period} (${marginValue}%) differs significantly from average of other periods (${avgOtherMargin.toFixed(1)}%) - may indicate extraction error`
+ });
+ // Don't reject - just log as warning
+ }
+ }
}
}
}
| |