feat: Production release v2.0.0 - Simple Document Processor

Major release with significant performance improvements and a new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
2025-11-09 21:07:22 -05:00
parent 0ec3d1412b
commit 9c916d12f4
106 changed files with 19228 additions and 4420 deletions
--- a/backend/src/services/financialTableParser.ts
+++ b/backend/src/services/financialTableParser.ts
@@ -0,0 +1,415 @@
+import { logger } from '../utils/logger';
+
+/**
+ * Metrics captured for one reporting period of a financial table.
+ *
+ * Each value is the text token taken from the document (no numeric conversion
+ * happens in this file). A field is left undefined when no suitable token was
+ * found for that period.
+ */
+export interface FinancialPeriod {
+  /** Revenue amount; only filled from tokens accepted by isMoneyLike. */
+  revenue?: string;
+  /** Revenue growth; only filled from tokens accepted by isPercentLike. */
+  revenueGrowth?: string;
+  /** Gross profit amount; only filled from tokens accepted by isMoneyLike. */
+  grossProfit?: string;
+  /** Gross margin; only filled from tokens accepted by isPercentLike. */
+  grossMargin?: string;
+  /** EBITDA amount; only filled from tokens accepted by isMoneyLike. */
+  ebitda?: string;
+  /** EBITDA margin; only filled from tokens accepted by isPercentLike. */
+  ebitdaMargin?: string;
+}
+
+/**
+ * Parser output: one FinancialPeriod per period bucket.
+ *
+ * Buckets are assigned from header position: `fy1` is the last non-LTM/TTM
+ * period label on the header line, `fy2` the one before it and `fy3` the one
+ * before that. `ltm` is used for labels containing LTM or TTM.
+ */
+export interface ParsedFinancials {
+  fy3: FinancialPeriod;
+  fy2: FinancialPeriod;
+  fy1: FinancialPeriod;
+  ltm: FinancialPeriod;
+}
+
+/** Name of a period bucket, i.e. a key of ParsedFinancials. */
+type Bucket = keyof ParsedFinancials;
+
+// Period labels in a table header, case-insensitive: "FY23", "FY-1", "2023",
+// "FY 2023A", "LTM", "TTM". A four-digit year may carry a trailing letter suffix.
+const PERIOD_TOKEN_REGEX = /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|(?:FY[-\s]?[1234])|(?:LTM|TTM))\b/gi;
+// A number with optional minus sign, "$", parentheses, thousands separators,
+// decimals and an upper-case K/M/B suffix. Bare numbers (e.g. years) match too.
+// NOTE: this regex and PERCENT_REGEX are global, so `.test()`/`.exec()` on them
+// are stateful: they resume from `lastIndex` left by the previous call.
+const MONEY_REGEX = /-?\$?\(?\d[\d,]*(?:\.\d+)?\)?\s?(?:K|M|B)?/g;
+// One to three digits (optional minus sign and decimals) followed by "%".
+const PERCENT_REGEX = /-?\d{1,3}(?:\.\d+)?\s?%/g;
+
+// Case-insensitive row-label patterns, keyed by the FinancialPeriod field they feed.
+// The patterns are not mutually exclusive: a line containing "EBITDA margin" also
+// satisfies `ebitda`, and a line containing "revenue growth" also satisfies `revenue`.
+const ROW_MATCHERS: Record<string, RegExp> = {
+  revenue: /(revenue|net sales|total sales|top\s+line)/i,
+  grossProfit: /(gross\s+profit)/i,
+  grossMargin: /(gross\s+margin)/i,
+  ebitda: /(ebitda|adjusted\s+ebitda|adj\.*\s*ebitda)/i,
+  ebitdaMargin: /(ebitda\s+margin|adj\.*\s*ebitda\s+margin)/i,
+  revenueGrowth: /(revenue\s+growth|yoy|y\/y|year[-\s]*over[-\s]*year)/i
+};
+
+/**
+ * Tidies a raw numeric match: every whitespace run becomes a single space,
+ * parentheses are then removed, and finally the ends are trimmed.
+ */
+function normalizeToken(token: string): string {
+  const singleSpaced = token.replace(/\s+/g, ' ');
+  const withoutParens = singleSpaced.split(/[()]/).join('');
+  return withoutParens.trim();
+}
+
+/**
+ * Collects the distinct period labels found on a line, in order of first
+ * appearance. Each match is passed through normalizePeriodToken; labels it
+ * rejects (null or empty) are left out.
+ */
+function tokenizePeriodHeaders(line: string): string[] {
+  const distinctLabels = new Set<string>();
+  for (const rawLabel of line.match(PERIOD_TOKEN_REGEX) ?? []) {
+    const label = normalizePeriodToken(rawLabel);
+    if (label) {
+      distinctLabels.add(label);
+    }
+  }
+  return Array.from(distinctLabels);
+}
+
+/**
+ * Canonicalizes one period label matched in a header line.
+ *
+ * - Labels ending in `P` or `PF` (e.g. `2025P`) are treated as projections and rejected.
+ * - Whitespace (including non-breaking spaces) and trailing `.`/`,` are removed.
+ * - Letter suffixes after a four-digit year are dropped (`2023A` becomes `2023`).
+ * - `FY23` becomes `FY-23`; `FY2023`, `FY 2023` and `FY-2023` all become `2023`,
+ *   so the same year never shows up as two different labels.
+ *
+ * NOTE(review): labels with other suffixes such as `E` or `F` are kept as
+ * ordinary years — confirm that this is intended for estimate/forecast columns.
+ *
+ * @param rawToken Label text as matched in the document.
+ * @returns The canonical upper-case label, or `null` when the input is empty or
+ *   a projection.
+ */
+function normalizePeriodToken(rawToken: string): string | null {
+  if (!rawToken) return null;
+
+  const upper = rawToken.trim().toUpperCase();
+  if (upper.endsWith('P') || upper.endsWith('PF')) {
+    return null;
+  }
+
+  let token = upper.replace(/[\u00A0\s]/g, '').replace(/[.,]+$/, '');
+
+  // Drop letter suffixes that follow a four-digit year: 2023A, FY2023E, FY-2023F.
+  token = token.replace(/(20\d{2})[A-Z]+$/, '$1');
+
+  // FY23 -> FY-23 so hyphenated and unhyphenated short forms compare equal.
+  token = token.replace(/^FY(\d{1,2})$/, 'FY-$1');
+
+  // FY2023 and FY-2023 -> 2023 (the hyphen is optional in the header pattern).
+  token = token.replace(/^FY-?(20\d{2})$/, '$1');
+
+  return token;
+}
+
+/**
+ * Maps header labels to period buckets, one entry per label.
+ *
+ * Labels containing LTM or TTM become `ltm`. The remaining labels are walked
+ * from the end of the list backwards and receive `fy1`, `fy2`, `fy3` in that
+ * order; any further labels stay `null`.
+ */
+function yearTokensToBuckets(tokens: string[]): Array<Bucket | null> {
+  const assignments: Array<Bucket | null> = tokens.map((label) =>
+    label.includes('LTM') || label.includes('TTM') ? 'ltm' : null
+  );
+
+  const historicalOrder: Bucket[] = ['fy1', 'fy2', 'fy3'];
+  let nextHistorical = 0;
+
+  for (let pos = tokens.length - 1; pos >= 0; pos--) {
+    if (nextHistorical >= historicalOrder.length) break;
+    if (assignments[pos] !== null) continue; // already an LTM/TTM column
+    assignments[pos] = historicalOrder[nextHistorical];
+    nextHistorical += 1;
+  }
+
+  return assignments;
+}
+
+/**
+ * Extracts the numeric cells (money amounts and percentages) of a table row in
+ * left-to-right order, so they can be matched to header columns by position.
+ *
+ * - One token is produced per cell: a number that is part of a percentage is
+ *   reported once as the percentage, not a second time as a bare amount.
+ * - Repeated values are kept; dropping them would shift later cells into the
+ *   wrong column (e.g. a margin that is 25% in two consecutive years).
+ * - Parentheses are removed by normalizeToken, so `(1,234)` is returned as `1,234`.
+ *
+ * @param line The row's own text.
+ * @param nextLine Optional continuation text searched after `line`. When it
+ *   already begins with `line`, it is used as-is instead of being appended to
+ *   `line`, so a caller passing pre-combined text does not double the row.
+ * @returns Normalized tokens in document order.
+ */
+function extractNumericTokens(line: string, nextLine?: string): string[] {
+  const continuation = nextLine ?? '';
+  const combined = continuation.startsWith(line) ? continuation : `${line} ${continuation}`;
+
+  // matchAll starts at the source regex's current lastIndex. The module-level
+  // patterns are global and may have been advanced by an earlier .test(), so
+  // scan with a fresh copy whose lastIndex is 0.
+  const scan = (pattern: RegExp) =>
+    Array.from(combined.matchAll(new RegExp(pattern.source, pattern.flags)))
+      .map((hit) => {
+        const start = hit.index ?? 0;
+        return { value: normalizeToken(hit[0]), start, end: start + hit[0].length };
+      })
+      .filter((cell) => cell.value !== '' && /\d/.test(cell.value));
+
+  const percentCells = scan(PERCENT_REGEX);
+
+  // The money pattern also matches the digits of "25%"; discard any money match
+  // that overlaps a percentage so each cell yields exactly one token.
+  const moneyCells = scan(MONEY_REGEX).filter(
+    (money) => !percentCells.some((pct) => money.start < pct.end && pct.start < money.end)
+  );
+
+  // Merge both kinds back into document order (critical for column alignment).
+  return [...moneyCells, ...percentCells]
+    .sort((a, b) => a.start - b.start)
+    .map((cell) => cell.value);
+}
+
+/**
+ * True when the token contains a digit and either a `$` sign or one of the
+ * letters K, M, B (any case). Missing or empty input is never money-like.
+ */
+function isMoneyLike(value?: string): boolean {
+  if (!value) {
+    return false;
+  }
+  // Stripping parentheses, commas and whitespace cannot add or remove digits,
+  // so the digit check can run on the token directly.
+  const hasDigit = /\d/.test(value);
+  const hasCurrencyOrUnit = value.includes('$') || /[KMB]/i.test(value);
+  return hasDigit && hasCurrencyOrUnit;
+}
+
+/**
+ * True when the token contains both a digit and a `%` sign. Missing or empty
+ * input is never percent-like.
+ */
+function isPercentLike(value?: string): boolean {
+  return value ? value.includes('%') && /\d/.test(value) : false;
+}
+
+/**
+ * Feeds row tokens to their period buckets by column position.
+ *
+ * `tokens[i]` is taken to be the cell under header column `i`, i.e. under
+ * `buckets[i]`. A column whose bucket is `null` is skipped together with its
+ * cell, so the remaining cells stay aligned with their own columns. Tokens
+ * beyond the last column are ignored, as are columns beyond the last token.
+ *
+ * @param tokens Row cells in left-to-right order.
+ * @param buckets Bucket for each header column, or `null` for a column that is not reported.
+ * @param mapper Called once per (bucket, cell) pair, left to right.
+ */
+function assignTokensToBuckets(
+  tokens: string[],
+  buckets: Array<Bucket | null>,
+  mapper: (bucket: Bucket, value: string) => void
+): void {
+  const columnCount = Math.min(tokens.length, buckets.length);
+  for (let column = 0; column < columnCount; column++) {
+    const bucket = buckets[column];
+    // A skipped column still owns a cell in the row. Consuming it here keeps
+    // every later cell under its own header instead of sliding one column left.
+    if (!bucket) continue;
+    mapper(bucket, tokens[column]);
+  }
+}
+
+/**
+ * Deterministic, best-effort extraction of headline financials from document text.
+ *
+ * 1. The text is split into trimmed, non-empty lines.
+ * 2. Every line holding at least two period labels is scored as a candidate
+ *    table header: number of assigned buckets, +50/+20 when the line sits past
+ *    50%/40% of the document, +100 when both a revenue and an EBITDA row with
+ *    numbers follow within the next 19 lines (+20 for only one of them), and +5
+ *    per recognised financial row. The highest score wins; ties go to the header
+ *    with more assigned buckets.
+ * 3. Starting at the chosen header, up to 100 lines are scanned. A line whose
+ *    label matches a ROW_MATCHERS pattern and which (together with the next two
+ *    lines) yields at least two numeric tokens is mapped onto the header's
+ *    buckets by column position. The first value found for a field is kept.
+ *
+ * Errors are caught and logged as a warning; whatever was collected so far is
+ * returned, so this function does not throw.
+ *
+ * @param fullText Plain text of the document.
+ * @returns Extracted values per bucket; buckets without data are empty objects.
+ */
+export function parseFinancialsFromText(fullText: string): ParsedFinancials {
+  const startTime = Date.now();
+  const result: ParsedFinancials = {
+    fy3: {},
+    fy2: {},
+    fy1: {},
+    ltm: {}
+  };
+
+  // MONEY_REGEX and PERCENT_REGEX carry the global flag, so `.test()` would resume
+  // from the `lastIndex` left by the previous call and give position-dependent
+  // answers on the next string (and leave a stale `lastIndex` behind for later
+  // matchAll scans). `String.prototype.search` always scans from index 0 and
+  // leaves `lastIndex` as it found it.
+  const hasNumericValue = (text: string): boolean =>
+    text.search(MONEY_REGEX) !== -1 || text.search(PERCENT_REGEX) !== -1;
+
+  try {
+    const text = fullText.replace(/\u00A0/g, ' ');
+    const lines = text.split('\n').map((line) => line.trim()).filter(Boolean);
+    if (lines.length === 0) {
+      return result;
+    }
+
+    let bestHeaderIndex = -1;
+    let bestBuckets: Array<Bucket | null> = [];
+    let bestHeaderScore = 0;
+
+    // Locate the best header line: score by period count AND by how much the
+    // following lines look like a financial table.
+    for (let i = 0; i < lines.length; i++) {
+      const tokens = tokenizePeriodHeaders(lines[i]);
+      if (tokens.length >= 2) {
+        const buckets = yearTokensToBuckets(tokens);
+        const validBuckets = buckets.filter(Boolean).length;
+
+        let score = validBuckets;
+
+        // Financial sections are typically in the back half of the document.
+        const documentPosition = i / lines.length;
+        if (documentPosition > 0.5) {
+          score += 50; // Strong boost for headers in the back half
+        } else if (documentPosition > 0.4) {
+          score += 20; // Moderate boost for headers just before the midpoint
+        }
+
+        // Financial tables almost always have BOTH revenue AND EBITDA rows.
+        // Look at the lines following the header for these indicators.
+        const lookAheadStart = Math.min(i + 1, lines.length);
+        const lookAheadEnd = Math.min(i + 20, lines.length);
+        let hasRevenue = false;
+        let hasEBITDA = false;
+        let financialRowCount = 0;
+
+        for (let j = lookAheadStart; j < lookAheadEnd; j++) {
+          const checkLine = lines[j] ?? '';
+          if (!hasNumericValue(checkLine)) continue; // Skip lines without numbers
+
+          // Check for revenue (and variations)
+          if (ROW_MATCHERS.revenue.test(checkLine)) {
+            hasRevenue = true;
+            financialRowCount++;
+          }
+
+          // Check for EBITDA (and variations)
+          if (ROW_MATCHERS.ebitda.test(checkLine)) {
+            hasEBITDA = true;
+            financialRowCount++;
+          }
+
+          // Also count other financial metrics
+          if (ROW_MATCHERS.grossProfit.test(checkLine) ||
+              ROW_MATCHERS.grossMargin.test(checkLine) ||
+              ROW_MATCHERS.ebitdaMargin.test(checkLine) ||
+              ROW_MATCHERS.revenueGrowth.test(checkLine)) {
+            financialRowCount++;
+          }
+        }
+
+        // Large boost if the header is followed by BOTH revenue AND EBITDA rows
+        if (hasRevenue && hasEBITDA) {
+          score += 100;
+        } else if (hasRevenue || hasEBITDA) {
+          score += 20; // Has one key metric
+        }
+
+        // Additional boost for every recognised financial row
+        score += financialRowCount * 5;
+
+        // Log scoring details for debugging (only for headers with potential)
+        if (validBuckets >= 2 && (hasRevenue || hasEBITDA || financialRowCount > 0)) {
+          logger.debug('Financial parser header scoring', {
+            headerIndex: i,
+            headerLine: lines[i].substring(0, 100),
+            validBuckets,
+            hasRevenue,
+            hasEBITDA,
+            financialRowCount,
+            score,
+            lookAheadWindow: `${lookAheadStart}-${lookAheadEnd}`
+          });
+        }
+
+        // On equal scores prefer the header with more assigned buckets
+        if (score > bestHeaderScore || (score === bestHeaderScore && validBuckets > bestBuckets.filter(Boolean).length)) {
+          bestHeaderScore = score;
+          bestBuckets = buckets;
+          bestHeaderIndex = i;
+        }
+      }
+    }
+
+    if (bestHeaderIndex === -1 || bestBuckets.filter(Boolean).length === 0) {
+      logger.info('Financial parser could not identify year header, returning empty result', {
+        totalLines: lines.length,
+        sampleLines: lines.slice(0, 20).join(' | ')
+      });
+      return result;
+    }
+
+    logger.info('Financial parser selected best header', {
+      headerIndex: bestHeaderIndex,
+      headerScore: bestHeaderScore,
+      buckets: bestBuckets.map((bucket) => bucket || 'skip')
+    });
+
+    logger.info('Financial parser found header', {
+      headerIndex: bestHeaderIndex,
+      headerLine: lines[bestHeaderIndex],
+      buckets: bestBuckets.map((bucket) => bucket || 'skip'),
+      totalLines: lines.length
+    });
+
+    // Context window around the header; used for logging only.
+    const windowStart = Math.max(0, bestHeaderIndex - 10);
+    const windowEnd = Math.min(lines.length, bestHeaderIndex + 50);
+    const windowLines = lines.slice(windowStart, windowEnd);
+
+    logger.info('Financial parser window', {
+      windowStart,
+      windowEnd,
+      windowSize: windowLines.length,
+      windowLines: windowLines.join(' | ')
+    });
+
+    // Each setter accepts only tokens of the right kind and keeps the first value found.
+    const bucketSetters: Record<string, (bucket: Bucket, value: string) => void> = {
+      revenue: (bucket, value) => {
+        if (isMoneyLike(value)) result[bucket].revenue = result[bucket].revenue || value;
+      },
+      grossProfit: (bucket, value) => {
+        if (isMoneyLike(value)) result[bucket].grossProfit = result[bucket].grossProfit || value;
+      },
+      ebitda: (bucket, value) => {
+        if (isMoneyLike(value)) result[bucket].ebitda = result[bucket].ebitda || value;
+      },
+      grossMargin: (bucket, value) => {
+        if (isPercentLike(value)) result[bucket].grossMargin = result[bucket].grossMargin || value;
+      },
+      ebitdaMargin: (bucket, value) => {
+        if (isPercentLike(value)) result[bucket].ebitdaMargin = result[bucket].ebitdaMargin || value;
+      },
+      revenueGrowth: (bucket, value) => {
+        if (isPercentLike(value)) result[bucket].revenueGrowth = result[bucket].revenueGrowth || value;
+      }
+    };
+
+    let matchedRows = 0;
+    // Data rows are searched from the header line up to 100 lines after it.
+    const searchStart = bestHeaderIndex;
+    const searchEnd = Math.min(lines.length, bestHeaderIndex + 100);
+
+    for (let i = searchStart; i < searchEnd; i++) {
+      const line = lines[i];
+      if (!line) continue;
+
+      // A row's values may be wrapped onto the following lines.
+      const nextLine = lines[i + 1] || '';
+      const lineAfterNext = lines[i + 2] || '';
+      const combinedForTokens = `${line} ${nextLine} ${lineAfterNext}`;
+
+      // Only consider rows that come with numeric values; this avoids matching
+      // descriptive text that merely mentions financial terms.
+      if (!hasNumericValue(combinedForTokens)) continue;
+
+      for (const [field, matcher] of Object.entries(ROW_MATCHERS)) {
+        if (!matcher.test(line)) continue;
+
+        // Pass only the continuation lines: extractNumericTokens prepends `line`
+        // itself, so handing it the combined text would scan the row twice.
+        const tokens = extractNumericTokens(line, `${nextLine} ${lineAfterNext}`);
+
+        // Only process rows with at least 2 tokens (i.e. more than one period)
+        if (tokens.length < 2) {
+          logger.debug('Financial parser: matched field but insufficient tokens', {
+            field,
+            lineIndex: i,
+            tokensFound: tokens.length,
+            line: line.substring(0, 100)
+          });
+          continue;
+        }
+
+        matchedRows++;
+        logger.info('Financial parser matched row', {
+          field,
+          lineIndex: i,
+          line: line.substring(0, 150),
+          nextLine: nextLine.substring(0, 100),
+          tokensFound: tokens.length,
+          tokens: tokens.slice(0, 10) // Limit token logging
+        });
+
+        assignTokensToBuckets(tokens, bestBuckets, (bucket, value) => {
+          bucketSetters[field](bucket, value);
+        });
+      }
+    }
+
+    logger.info('Financial parser row matching summary', {
+      matchedRows,
+      bestBuckets: bestBuckets.length,
+      buckets: bestBuckets.map((bucket) => bucket || 'skip')
+    });
+
+    logger.info('Financial parser results', {
+      elapsedMs: Date.now() - startTime,
+      headerLine: lines[bestHeaderIndex],
+      fy3: result.fy3,
+      fy2: result.fy2,
+      fy1: result.fy1,
+      ltm: result.ltm
+    });
+  } catch (error) {
+    logger.warn('Financial parser failed', { error: error instanceof Error ? error.message : String(error) });
+  }
+
+  return result;
+}