fix: Improve financial extraction accuracy and validation

- Upgrade to Claude Sonnet 4.5 for better accuracy
- Simplify and clarify financial extraction prompts
- Add flexible period identification (years, FY-X, LTM formats)
- Add cross-validation to catch wrong column extraction
- Reject values that are too small (<$5M revenue, <$500K EBITDA)
- Add monitoring scripts for document processing
- Improve validation to catch inconsistent values across periods
2025-11-09 21:57:55 -05:00
parent 63fe7e97a8
commit 531686bb91
6 changed files with 561 additions and 14 deletions
--- a/backend/src/config/env.ts
+++ b/backend/src/config/env.ts
@@ -311,7 +311,8 @@ export const config = {
    // Model Selection - Using latest Claude 4.5 models (Sept 2025)
    // Claude Sonnet 4.5 is recommended for best balance of intelligence, speed, and cost
    // Supports structured outputs for guaranteed JSON schema compliance
-    model: envVars['LLM_MODEL'] || 'claude-3-7-sonnet-latest', // Primary model (Claude 3.7 Sonnet latest)
+    // NOTE: Claude Sonnet 4.5 offers improved accuracy and reasoning for full-document processing
+    model: envVars['LLM_MODEL'] || 'claude-sonnet-4-5-20250929', // Primary model (Claude Sonnet 4.5 - latest and most accurate)
    fastModel: envVars['LLM_FAST_MODEL'] || 'claude-3-5-haiku-latest', // Fast model (Claude 3.5 Haiku latest)
    fallbackModel: envVars['LLM_FALLBACK_MODEL'] || 'gpt-4o', // Fallback for creativity

--- a/backend/src/scripts/monitor-doc-via-logs.ts
+++ b/backend/src/scripts/monitor-doc-via-logs.ts
@@ -0,0 +1,40 @@
+#!/usr/bin/env ts-node
+
+/**
+ * Print a fixed checklist for following a document's processing in the Firebase Functions logs.
+ * The script itself never reads logs or calls an API: every status line below is hard-coded.
+ */
+
+const DOCUMENT_ID = process.argv[2] || '69236a8b-d8a7-4328-87df-8d6da6f34d8a'; // first CLI argument, else a built-in default ID
+
+console.log(`\n🔍 Monitoring Document Processing via Logs`);
+console.log('═'.repeat(80));
+console.log(`📄 Document ID: ${DOCUMENT_ID}`);
+console.log(`📄 File: Stax Holding Company, LLC CIM`); // NOTE(review): fixed file name, printed whatever DOCUMENT_ID is
+console.log('\n📊 Processing Status:');
+console.log('─'.repeat(80));
+
+console.log('\n✅ Upload completed'); // static text - not derived from any status lookup
+console.log('✅ Processing started (status: processing)');
+console.log('\n⏳ Current Step: Document processing in progress...');
+console.log('\n📋 Expected Processing Steps:');
+console.log('   1. ✅ Upload completed');
+console.log('   2. ⏳ Text extraction (Document AI)');
+console.log('   3. ⏳ LLM analysis (Claude Sonnet 4.5)');
+console.log('   4. ⏳ Financial data extraction');
+console.log('   5. ⏳ Review generation');
+console.log('   6. ⏳ Completion');
+
+console.log('\n💡 To check detailed logs:');
+console.log('   1. Go to Firebase Console → Functions → Logs');
+console.log('   2. Filter for function: processDocumentJobs');
+console.log('   3. Search for document ID: ' + DOCUMENT_ID);
+console.log('\n💡 Or check in the app - the document status will update automatically');
+
+console.log('\n⏱️  Estimated processing time: 2-5 minutes');
+console.log('   (Depends on document size and complexity)');
+
+console.log('\n🔄 To check status again, run:');
+console.log(`   npx ts-node src/scripts/quick-check-doc.ts ${DOCUMENT_ID}`);
+console.log('\n');
+
--- a/backend/src/scripts/monitor-latest-document.ts
+++ b/backend/src/scripts/monitor-latest-document.ts
@@ -0,0 +1,159 @@
+#!/usr/bin/env ts-node
+
+/**
+ * Monitor the latest document being processed
+ * Queries the API to get real-time status updates
+ */
+
+import axios from 'axios';
+
+const API_URL = process.env.API_URL || 'https://api-y56ccs6wva-uc.a.run.app';
+const INTERVAL_SECONDS = 5;
+
+async function getLatestDocument() { // resolves to the newest document record, or null
+  try {
+    // Ask the API for the document list (GET `${API_URL}/api/documents`; no auth header is sent).
+    // NOTE(review): assumes the response body is a bare array of records with `created_at` - confirm.
+    // monitorDocument prints usage and exits when this resolves to null.
+    const response = await axios.get(`${API_URL}/api/documents`, {
+      headers: {
+        'Content-Type': 'application/json',
+      },
+    });
+    
+    if (response.data && response.data.length > 0) {
+      // Sort by created_at descending and get the latest (sort() reorders response.data in place)
+      const sorted = response.data.sort((a: any, b: any) => 
+        new Date(b.created_at).getTime() - new Date(a.created_at).getTime()
+      );
+      return sorted[0];
+    }
+    return null; // empty or missing list
+  } catch (error: any) {
+    if (error.response?.status === 404 || error.response?.status === 401) { // expected failures: explain and return null
+      console.log('⚠️  API endpoint not available or requires auth');
+      console.log('   Please provide the document ID as an argument');
+      return null;
+    }
+    throw error; // any other failure propagates to the caller
+  }
+}
+
+async function getDocumentStatus(documentId: string) { // GET one document record; resolves to null on any request error
+  try {
+    const response = await axios.get(`${API_URL}/api/documents/${documentId}`, {
+      headers: {
+        'Content-Type': 'application/json',
+      },
+    });
+    return response.data;
+  } catch (error: any) {
+    if (error.response) { // the server replied with an error status
+      console.error(`Error fetching document: ${error.response.status} - ${error.response.statusText}`);
+    } else { // no HTTP response is attached to the error
+      console.error(`Error: ${error.message}`);
+    }
+    return null; // the failure is logged above and never rethrown
+  }
+}
+
+async function monitorDocument(documentId?: string) { // polls one document's status until it completes, fails, or a lookup returns null
+  console.log('\n🔍 Monitoring Document Processing');
+  console.log('═'.repeat(80));
+  
+  let docId = documentId;
+  
+  // If no document ID provided, try to get the latest; exit with code 1 when none can be found
+  if (!docId) {
+    console.log('📋 Finding latest document...');
+    const latest = await getLatestDocument();
+    if (latest) {
+      docId = latest.id;
+      console.log(`✅ Found latest document: ${latest.original_file_name || latest.id}`);
+    } else {
+      console.error('❌ Could not find latest document. Please provide document ID:');
+      console.error('   Usage: npx ts-node src/scripts/monitor-latest-document.ts <documentId>');
+      process.exit(1);
+    }
+  }
+  
+  console.log(`📄 Document ID: ${docId}`);
+  console.log(`🔄 Checking every ${INTERVAL_SECONDS} seconds`);
+  console.log('   Press Ctrl+C to stop\n');
+  console.log('═'.repeat(80));
+  
+  let previousStatus: string | null = null; // status seen on the previous tick, used to detect changes
+  let checkCount = 0;
+  const startTime = Date.now();
+  
+  const monitorInterval = setInterval(async () => { // NOTE(review): setInterval does not wait for this async callback - a slow request could overlap the next tick
+    checkCount++;
+    const timestamp = new Date().toLocaleTimeString();
+    
+    try {
+      const document = await getDocumentStatus(docId!);
+      
+      if (!document) { // getDocumentStatus resolves to null on any request error
+        console.log(`\n❌ [${timestamp}] Document not found or error occurred`);
+        clearInterval(monitorInterval);
+        return;
+      }
+      
+      const status = document.status || 'unknown';
+      const statusChanged = previousStatus !== status;
+      const elapsedMinutes = Math.round((Date.now() - startTime) / 1000 / 60);
+      
+      // Show update on the first check, on status change, or every 10 checks (50 seconds)
+      if (statusChanged || checkCount % 10 === 0 || checkCount === 1) {
+        console.log(`\n[${timestamp}] Check #${checkCount} (${elapsedMinutes}m elapsed)`);
+        console.log('─'.repeat(80));
+        console.log(`📄 File:     ${document.original_file_name || 'Unknown'}`);
+        console.log(`📊 Status:   ${status}${statusChanged && previousStatus ? ` (was: ${previousStatus})` : ''}`);
+        
+        if (document.error_message) {
+          console.log(`❌ Error:    ${document.error_message}`);
+        }
+        
+        if (document.analysis_data) {
+          const hasFinancials = document.analysis_data?.financialSummary?.financials;
+          const completeness = document.analysis_data?.dealOverview?.targetCompanyName ? '✅' : '⏳';
+          console.log(`📈 Analysis: ${completeness} ${hasFinancials ? 'Financial data extracted' : 'In progress...'}`);
+        } else {
+          console.log(`📈 Analysis: ⏳ Processing...`);
+        }
+        
+        if (status === 'completed') { // terminal checks live inside the print branch; a change to a terminal status always enters it
+          console.log('\n✅ Document processing completed!');
+          clearInterval(monitorInterval);
+          return;
+        }
+        
+        if (status === 'failed') {
+          console.log('\n❌ Document processing failed!');
+          clearInterval(monitorInterval);
+          return;
+        }
+      }
+      
+      previousStatus = status;
+    } catch (error: any) { // log and keep polling; the interval is not cleared here
+      console.error(`\n❌ [${timestamp}] Error:`, error.message);
+    }
+  }, INTERVAL_SECONDS * 1000);
+  
+  // Handle Ctrl+C: stop the timer and exit with code 0
+  process.on('SIGINT', () => {
+    console.log('\n\n👋 Monitoring stopped');
+    clearInterval(monitorInterval);
+    process.exit(0);
+  });
+}
+
+// Main execution: optional document ID from the first CLI argument; a rejected promise exits with code 1
+const documentId = process.argv[2];
+monitorDocument(documentId)
+  .catch((error) => {
+    console.error('Fatal error:', error);
+    process.exit(1);
+  });
+
--- a/backend/src/scripts/quick-check-doc.ts
+++ b/backend/src/scripts/quick-check-doc.ts
@@ -0,0 +1,83 @@
+#!/usr/bin/env ts-node
+
+/**
+ * Quick check of document status
+ */
+
+import axios from 'axios';
+
+const API_URL = process.env.API_URL || 'https://api-y56ccs6wva-uc.a.run.app';
+const DOCUMENT_ID = process.argv[2] || '69236a8b-d8a7-4328-87df-8d6da6f34d8a';
+
+async function checkDocument() { // one-shot status report for DOCUMENT_ID; exits with code 1 on any request error
+  try {
+    console.log(`\n🔍 Checking Document: ${DOCUMENT_ID}\n`);
+    
+    const response = await axios.get(`${API_URL}/api/documents/${DOCUMENT_ID}`, {
+      headers: {
+        'Content-Type': 'application/json',
+      },
+    });
+    
+    const doc = response.data;
+    
+    console.log('═'.repeat(80));
+    console.log(`📄 File:     ${doc.original_file_name || 'Unknown'}`);
+    console.log(`📊 Status:   ${doc.status || 'unknown'}`);
+    console.log(`📅 Created:  ${doc.created_at || 'Unknown'}`);
+    console.log(`🕐 Updated:  ${doc.updated_at || 'Unknown'}`);
+    
+    if (doc.error_message) {
+      console.log(`❌ Error:    ${doc.error_message}`);
+    }
+    
+    if (doc.analysis_data) {
+      const analysis = doc.analysis_data;
+      console.log('\n📈 Analysis Data:');
+      console.log(`   Company:  ${analysis.dealOverview?.targetCompanyName || 'Not extracted'}`);
+      console.log(`   Industry: ${analysis.dealOverview?.industrySector || 'Not extracted'}`);
+      
+      if (analysis.financialSummary?.financials) {
+        const financials = analysis.financialSummary.financials;
+        console.log('\n💰 Financial Data:');
+        console.log(`   LTM Revenue: ${financials.ltm?.revenue || 'Not extracted'}`);
+        console.log(`   LTM EBITDA:  ${financials.ltm?.ebitda || 'Not extracted'}`);
+        console.log(`   FY-1 Revenue: ${financials.fy1?.revenue || 'Not extracted'}`);
+        console.log(`   FY-1 EBITDA:  ${financials.fy1?.ebitda || 'Not extracted'}`);
+      } else {
+        console.log('\n💰 Financial Data: ⏳ Not yet extracted');
+      }
+    } else {
+      console.log('\n📈 Analysis Data: ⏳ Processing...');
+    }
+    
+    console.log('═'.repeat(80));
+    
+    // Print a follow-up hint based on the document status (no separate job lookup is made)
+    if (doc.status === 'processing' || doc.status === 'processing_llm') {
+      console.log('\n⏳ Document is still processing...');
+      console.log('   Run this script again to check status, or use monitor script:');
+      console.log(`   npx ts-node src/scripts/monitor-latest-document.ts ${DOCUMENT_ID}`);
+    } else if (doc.status === 'completed') {
+      console.log('\n✅ Document processing completed!');
+    } else if (doc.status === 'failed') {
+      console.log('\n❌ Document processing failed!');
+    }
+    
+  } catch (error: any) {
+    if (error.response) { // the server replied with an error status
+      console.error(`❌ Error: ${error.response.status} - ${error.response.statusText}`);
+      if (error.response.status === 404) {
+        console.error('   Document not found. Check the document ID.');
+      } else if (error.response.status === 401) { // NOTE(review): the request above sends no auth header
+        console.error('   Authentication required. Check your API token.');
+      }
+    } else {
+      console.error(`❌ Error: ${error.message}`);
+    }
+    process.exit(1);
+  }
+}
+
+checkDocument(); // errors are handled inside checkDocument, so the returned promise is not awaited
+
--- a/backend/src/services/llmService.ts
+++ b/backend/src/services/llmService.ts
@@ -167,7 +167,8 @@ class LLMService {
    
    const taskComplexity = this.determineTaskComplexity(processedText, analysis || {});
    const estimatedTokens = this.estimateTokenCount(processedText + template);
-    // Force primary model (claude-3-7-sonnet-latest) for CIM document processing
+    // Force primary model (claude-sonnet-4-5-20250929) for CIM document processing
+    // Claude Sonnet 4.5 offers improved accuracy and reasoning for full-document extraction
    const selectedModel = config.llm.model; // Always use primary model for CIM extraction
    
    logger.info('Model selection completed', {
@@ -554,6 +555,9 @@ class LLMService {
          openRouterModel = 'anthropic/claude-haiku-4.5'; // Claude 4.5 Haiku
        } else if (model.includes('opus') && model.includes('4')) {
          openRouterModel = 'anthropic/claude-opus-4';
+        } else if (model.includes('sonnet') && (model.includes('4.5') || model.includes('4-5'))) {
+          // Handle Claude Sonnet 4.5 (latest and most accurate)
+          openRouterModel = 'anthropic/claude-sonnet-4.5';
        } else if (model.includes('sonnet') && model.includes('3.7')) {
          // Handle both claude-3-7-sonnet-latest and claude-3-7-sonnet-YYYYMMDD formats
          openRouterModel = 'anthropic/claude-3.7-sonnet';
@@ -984,7 +988,7 @@ Please correct these errors and generate a new, valid JSON object. Pay close att
      "financialSummary": {
        "financials": {
          "fy3": {
-            "revenue": "Revenue amount for FY-3",
+            "revenue": "Revenue amount for FY-3 (oldest historical year, typically 3 years ago)",
            "revenueGrowth": "N/A (baseline year)",
            "grossProfit": "Gross profit amount for FY-3",
            "grossMargin": "Gross margin % for FY-3",
@@ -992,24 +996,24 @@ Please correct these errors and generate a new, valid JSON object. Pay close att
            "ebitdaMargin": "EBITDA margin % for FY-3"
          },
          "fy2": {
-            "revenue": "Revenue amount for FY-2",
-            "revenueGrowth": "Revenue growth % for FY-2",
+            "revenue": "Revenue amount for FY-2 (2 years ago)",
+            "revenueGrowth": "Revenue growth % for FY-2 (year-over-year from FY-3)",
            "grossProfit": "Gross profit amount for FY-2",
            "grossMargin": "Gross margin % for FY-2",
            "ebitda": "EBITDA amount for FY-2",
            "ebitdaMargin": "EBITDA margin % for FY-2"
          },
          "fy1": {
-            "revenue": "Revenue amount for FY-1",
-            "revenueGrowth": "Revenue growth % for FY-1",
+            "revenue": "Revenue amount for FY-1 (1 year ago, most recent full fiscal year)",
+            "revenueGrowth": "Revenue growth % for FY-1 (year-over-year from FY-2)",
            "grossProfit": "Gross profit amount for FY-1",
            "grossMargin": "Gross margin % for FY-1",
            "ebitda": "EBITDA amount for FY-1",
            "ebitdaMargin": "EBITDA margin % for FY-1"
          },
          "ltm": {
-            "revenue": "Revenue amount for LTM",
-            "revenueGrowth": "Revenue growth % for LTM",
+            "revenue": "Revenue amount for LTM (Last Twelve Months, most recent trailing period)",
+            "revenueGrowth": "Revenue growth % for LTM (year-over-year from FY-1)",
            "grossProfit": "Gross profit amount for LTM",
            "grossMargin": "Gross margin % for LTM",
            "ebitda": "EBITDA amount for LTM",
@@ -1057,15 +1061,28 @@ Please correct these errors and generate a new, valid JSON object. Pay close att
 ${errorCorrection}${focusInstructions}${extractionGuidance}

 DETAILED ANALYSIS INSTRUCTIONS:
-1. **Financial Analysis**: Extract exact revenue, EBITDA, and margin figures. Calculate growth rates and trends. Note any adjustments or add-backs.
+1. **Financial Analysis - CRITICAL**: 
+   - Find the PRIMARY HISTORICAL FINANCIAL TABLE showing the TARGET COMPANY's actual performance (not projections, not market data, not competitor data)
+   - Look for tables with actual years (2021, 2022, 2023, 2024) or periods (FY-3, FY-2, FY-1, LTM, TTM)
+   - **Period Mapping (when you see actual years)**:
+     * Find the OLDEST historical year → that's FY-3
+     * Find the SECOND oldest historical year → that's FY-2  
+     * Find the MOST RECENT full fiscal year → that's FY-1
+     * Find "LTM", "TTM", or "Last Twelve Months" → that's LTM
+     * IGNORE any columns labeled with "E", "P", "PF" (estimates/projections)
+   - **Extract values carefully**: Make sure you're reading from the correct column for each period
+   - **Validate as you extract**: If FY-3 revenue is $64M, FY-2 should be similar magnitude (e.g., $71M), not drastically different (e.g., $2.9M or $10)
+   - Extract EXACT values - preserve format ($64M, $71M, 29.3%, etc.)
+   - Calculate revenue growth: ((Current Period - Prior Period) / Prior Period) * 100
+   - If values don't make sense or you're uncertain, use "Not specified in CIM"
 2. **Competitive Position**: Identify specific competitors, market share, and competitive advantages. Assess barriers to entry.
 3. **Growth Opportunities**: Identify organic and inorganic growth drivers, market expansion potential, and operational improvements.
 4. **Risk Assessment**: Evaluate customer concentration, supplier dependence, regulatory risks, and market risks.
 5. **Management Quality**: Assess experience, track record, and post-transaction intentions. Evaluate organizational structure.
 6. **Value Creation**: Identify specific levers for value creation through operational improvements, M&A, technology, and optimization.
 7. **Investment Thesis**: Develop a comprehensive investment thesis with detailed analysis of attractions, risks, value creation opportunities, and strategic alignment.
-7. **Due Diligence**: Highlight areas requiring deeper investigation and specific questions for management.
-8. **Key Questions & Next Steps**: Provide detailed, specific questions and next steps. Each question should be 2-3 sentences explaining context and importance. Next steps should be actionable with clear priorities and timelines.
+8. **Due Diligence**: Highlight areas requiring deeper investigation and specific questions for management.
+9. **Key Questions & Next Steps**: Provide detailed, specific questions and next steps. Each question should be 2-3 sentences explaining context and importance. Next steps should be actionable with clear priorities and timelines.

 CIM Document Text:
 ${text}
@@ -1078,6 +1095,46 @@ ${jsonTemplate}

 IMPORTANT: Replace all placeholder text with actual information from the CIM document. If information is not available, use "Not specified in CIM". Ensure all financial metrics are properly formatted as strings. Provide detailed, actionable insights suitable for investment decision-making.

+CRITICAL FINANCIAL EXTRACTION RULES:
+
+**Step 1: Find the Right Table**
+- Look for tables showing the TARGET COMPANY's historical financial performance
+- Tables may be labeled: "Financial Summary", "Historical Financials", "Income Statement", "P&L", "Financial Performance"
+- IGNORE: Market projections, industry benchmarks, competitor data, forward-looking estimates
+
+**Step 2: Identify Periods (Flexible Approach)**
+Financial tables can have different formats. Here's how to map them:
+
+*Format A: Years shown (2021, 2022, 2023, 2024)*
+- FY-3 = Oldest year (e.g., 2021 or 2022)
+- FY-2 = Second oldest year (e.g., 2022 or 2023)  
+- FY-1 = Most recent full fiscal year (e.g., 2023 or 2024)
+- LTM = Look for "LTM", "TTM", "Last Twelve Months", or trailing period
+
+*Format B: Periods shown (FY-3, FY-2, FY-1, LTM)*
+- Use them directly as labeled
+
+*Format C: Mixed (2023, 2024, LTM Mar-25, 2025E)*
+- Use actual years for FY-3, FY-2, FY-1
+- Use LTM/TTM for LTM
+- IGNORE anything with "E", "P", "PF" (estimates/projections)
+
+**Step 3: Extract Values Carefully**
+- Read from the CORRECT column for each period
+- Extract EXACT values as shown ($64M, $71M, 29.3%, etc.)
+- Preserve the format (don't convert $64M to $64,000,000)
+
+**Step 4: Validate Your Extraction**
+- Check that values make sense: If FY-3 revenue is $64M, FY-2 should be similar magnitude (e.g., $50M-$90M), not $2.9M or $10
+- Revenue should typically be $10M+ for target companies
+- EBITDA should typically be $1M+ and positive
+- Margins should be 5-50% for EBITDA margin
+- If values seem wrong, you may have misaligned columns - double-check
+
+**Step 5: If Uncertain**
+- If you can't find the table, can't identify periods clearly, or values don't make sense → use "Not specified in CIM"
+- Better to leave blank than extract wrong data
+
 SPECIAL REQUIREMENTS FOR KEY QUESTIONS & NEXT STEPS:
 - **Critical Questions**: Provide 5-8 detailed questions, each 2-3 sentences long, explaining the context and investment significance
 - **Missing Information**: List 5-8 specific areas with explanations of what's missing, why it's critical, and investment impact
--- a/backend/src/services/simpleDocumentProcessor.ts
+++ b/backend/src/services/simpleDocumentProcessor.ts
@@ -142,10 +142,13 @@ Focus on finding these specific fields in the document. Extract exact values, nu
        }
      }

-      // Step 5: Generate summary
+      // Step 5: Validate and fix financial data
+      analysisData = this.validateAndFixFinancialData(analysisData);
+
+      // Step 6: Generate summary
      const summary = this.generateSummary(analysisData);

-      // Step 6: Final validation
+      // Step 7: Final validation
      const finalValidation = this.validateData(analysisData);
      const processingTime = Date.now() - startTime;

@@ -352,6 +355,210 @@ Focus on finding these specific fields in the document. Extract exact values, nu
  /**
   * Generate summary from analysis data
   */
+  /**
+   * Validate and fix financial data - reject obviously wrong values
+   */
+  private validateAndFixFinancialData(data: CIMReview): CIMReview {
+    if (!data.financialSummary?.financials) {
+      return data;
+    }
+
+    const financials = data.financialSummary.financials;
+    const periods: Array<'fy3' | 'fy2' | 'fy1' | 'ltm'> = ['fy3', 'fy2', 'fy1', 'ltm'];
+
+    // Helper to check if a financial value is obviously wrong
+    const isInvalidValue = (value: string, fieldType: 'revenue' | 'ebitda' = 'revenue'): boolean => {
+      const trimmed = value.trim();
+      // Reject very short values (likely extraction errors)
+      if (trimmed.length < 3) return true;
+      
+      // Reject specific known wrong patterns
+      const invalidPatterns = [
+        /^\$?3\.?0?0?$/,  // "$3", "$3.00", "3"
+        /^\$?10\.?0?0?$/, // "$10", "10" (too small)
+        /^-\d+M$/,        // "-25M", "-5M"
+        /^\$-?\d+M$/,     // "$-25M", "$-5M"
+        /^\$?\d{1,2}$/,   // Single or double digit dollar amounts (too small)
+      ];
+      
+      if (invalidPatterns.some(pattern => pattern.test(trimmed))) {
+        return true;
+      }
+      
+      // Additional check: reject values that are too small for target companies
+      const numericValue = extractNumericValue(trimmed);
+      if (numericValue !== null) {
+        // Revenue should be at least $5M for target companies
+        if (fieldType === 'revenue' && numericValue < 5000000) {
+          return true;
+        }
+        // EBITDA should be at least $500K for target companies
+        if (fieldType === 'ebitda' && Math.abs(numericValue) < 500000) {
+          return true;
+        }
+      }
+      
+      return false;
+    };
+
+    // Helper to extract numeric value from financial string
+    const extractNumericValue = (value: string): number | null => {
+      // Remove currency symbols, commas, parentheses
+      let cleaned = value.replace(/[$,\s()]/g, '');
+      
+      // Handle K, M, B suffixes
+      let multiplier = 1;
+      if (cleaned.toLowerCase().endsWith('k')) {
+        multiplier = 1000;
+        cleaned = cleaned.slice(0, -1);
+      } else if (cleaned.toLowerCase().endsWith('m')) {
+        multiplier = 1000000;
+        cleaned = cleaned.slice(0, -1);
+      } else if (cleaned.toLowerCase().endsWith('b')) {
+        multiplier = 1000000000;
+        cleaned = cleaned.slice(0, -1);
+      }
+      
+      // Check for negative
+      const isNegative = cleaned.startsWith('-');
+      if (isNegative) cleaned = cleaned.substring(1);
+      
+      const num = parseFloat(cleaned);
+      if (isNaN(num)) return null;
+      
+      return (isNegative ? -1 : 1) * num * multiplier;
+    };
+
+    periods.forEach(period => {
+      const periodData = financials[period];
+      if (!periodData) return;
+
+      // Validate revenue - should be reasonable (typically $10M-$1B+ for target companies)
+      if (periodData.revenue && periodData.revenue !== 'Not specified in CIM') {
+        if (isInvalidValue(periodData.revenue, 'revenue')) {
+          logger.warn('Rejecting invalid revenue value', {
+            period,
+            value: periodData.revenue,
+            reason: 'Value is clearly wrong (too small or invalid pattern)'
+          });
+          periodData.revenue = 'Not specified in CIM';
+        } else {
+          // Additional validation: check if numeric value is reasonable
+          const numericValue = extractNumericValue(periodData.revenue);
+          if (numericValue !== null) {
+            // Revenue should typically be at least $5M for a target company
+            // Reject if less than $5M (likely extraction error or wrong column)
+            if (Math.abs(numericValue) < 5000000) {
+              logger.warn('Rejecting revenue value - too small', {
+                period,
+                value: periodData.revenue,
+                numericValue,
+                reason: 'Revenue value is unreasonably small (<$5M) - likely wrong column or extraction error'
+              });
+              periodData.revenue = 'Not specified in CIM';
+            }
+          }
+        }
+      }
+      
+      // Cross-validate: If we have other periods, check for consistency
+      // If FY-3 is $64M but FY-2 is $2.9M, that's a red flag
+      const otherPeriods = periods.filter(p => p !== period && financials[p]?.revenue);
+      if (otherPeriods.length > 0 && periodData.revenue && periodData.revenue !== 'Not specified in CIM') {
+        const currentValue = extractNumericValue(periodData.revenue);
+        if (currentValue !== null) {
+          const otherValues = otherPeriods
+            .map(p => extractNumericValue(financials[p]!.revenue || ''))
+            .filter((v): v is number => v !== null);
+          
+          if (otherValues.length > 0) {
+            const avgOtherValue = otherValues.reduce((a, b) => a + b, 0) / otherValues.length;
+            // If current value is less than 20% of average, it's likely wrong
+            if (currentValue > 0 && avgOtherValue > 0 && currentValue < avgOtherValue * 0.2) {
+              logger.warn('Rejecting revenue value - inconsistent with other periods', {
+                period,
+                value: periodData.revenue,
+                numericValue: currentValue,
+                avgOtherPeriods: avgOtherValue,
+                reason: 'Value is too small compared to other periods - likely wrong column'
+              });
+              periodData.revenue = 'Not specified in CIM';
+            }
+          }
+        }
+      }
+
+      // Validate EBITDA - should be reasonable
+      if (periodData.ebitda && periodData.ebitda !== 'Not specified in CIM') {
+        if (isInvalidValue(periodData.ebitda, 'ebitda')) {
+          logger.warn('Rejecting invalid EBITDA value', {
+            period,
+            value: periodData.ebitda,
+            reason: 'Value is clearly wrong (too small or invalid pattern)'
+          });
+          periodData.ebitda = 'Not specified in CIM';
+        } else {
+          // EBITDA can be negative, but should be reasonable in magnitude
+          const numericValue = extractNumericValue(periodData.ebitda);
+          if (numericValue !== null) {
+            // Reject if absolute value is less than $1K (likely extraction error)
+            if (Math.abs(numericValue) < 1000) {
+              logger.warn('Rejecting EBITDA value - too small', {
+                period,
+                value: periodData.ebitda,
+                numericValue,
+                reason: 'EBITDA value is unreasonably small'
+              });
+              periodData.ebitda = 'Not specified in CIM';
+            }
+          }
+        }
+      }
+
+      // Validate margins - should be reasonable percentages
+      if (periodData.ebitdaMargin && periodData.ebitdaMargin !== 'Not specified in CIM') {
+        const marginStr = periodData.ebitdaMargin.trim();
+        // Extract numeric value
+        const marginMatch = marginStr.match(/(-?\d+(?:\.\d+)?)/);
+        if (marginMatch) {
+          const marginValue = parseFloat(marginMatch[1]);
+          // Reject margins outside reasonable range (-10% to 100%)
+          // Negative margins are possible but should be within reason
+          if (marginValue < -10 || marginValue > 100) {
+            logger.warn('Rejecting invalid EBITDA margin', {
+              period,
+              value: marginStr,
+              numericValue: marginValue,
+              reason: 'Margin outside reasonable range (-10% to 100%)'
+            });
+            periodData.ebitdaMargin = 'Not specified in CIM';
+          }
+        }
+      }
+
+      // Validate revenue growth - should be reasonable percentage
+      if (periodData.revenueGrowth && periodData.revenueGrowth !== 'Not specified in CIM' && periodData.revenueGrowth !== 'N/A') {
+        const growthStr = periodData.revenueGrowth.trim();
+        const growthMatch = growthStr.match(/(-?\d+(?:\.\d+)?)/);
+        if (growthMatch) {
+          const growthValue = parseFloat(growthMatch[1]);
+          // Reject growth rates outside reasonable range (-50% to 500%)
+          if (growthValue < -50 || growthValue > 500) {
+            logger.warn('Rejecting invalid revenue growth', {
+              period,
+              value: growthStr,
+              numericValue: growthValue,
+              reason: 'Growth rate outside reasonable range'
+            });
+            periodData.revenueGrowth = 'Not specified in CIM';
+          }
+        }
+      }
+    });
+
+    return data;
+  }
+
  private generateSummary(data: CIMReview): string {
    const parts: string[] = [];