import { logger } from '../utils/logger';

/**
 * Metrics captured for a single reporting period.
 * Every value is the raw text token found in the document (e.g. "$12.5M", "18.2%").
 */
export interface FinancialPeriod {
  revenue?: string;
  revenueGrowth?: string;
  grossProfit?: string;
  grossMargin?: string;
  ebitda?: string;
  ebitdaMargin?: string;
}

/**
 * Parsed metrics grouped by period bucket.
 * `fy1` is filled from the rightmost non-LTM header column, `fy2` from the one to its
 * left, `fy3` from the one left of that; `ltm` is filled from an LTM/TTM column.
 */
export interface ParsedFinancials {
  fy3: FinancialPeriod;
  fy2: FinancialPeriod;
  fy1: FinancialPeriod;
  ltm: FinancialPeriod;
}

type Bucket = keyof ParsedFinancials;
type MetricField = keyof FinancialPeriod;
/** One entry per header column: the bucket it feeds, or `null` when the column is ignored. */
type BucketAssignment = Bucket | null;

const PERIOD_TOKEN_REGEX =
  /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|(?:FY[-\s]?[1234])|(?:LTM|TTM))\b/gi;

// Optional sign, optional "$", optional opening parenthesis, digits with thousands
// separators, optional decimals, optional closing parenthesis, optional K/M/B suffix.
// The parentheses are matched here and stripped again by normalizeToken().
// NOTE(review): stripping the parentheses drops accounting-style negatives
// ("(5.2)" becomes "5.2") - confirm whether downstream consumers need the sign.
const MONEY_REGEX = /-?\$?\(?\d[\d,]*(?:\.\d+)?\)?\s?(?:K|M|B)?/g;

const PERCENT_REGEX = /-?\d{1,3}(?:\.\d+)?\s?%/g;

/** Rewinds a global regex so the next `test`/`matchAll` starts at the beginning of its input. */
const resetRegex = (regex: RegExp): void => {
  regex.lastIndex = 0;
};

// Row-label patterns per metric. Key order is the order in which fields are tried per line.
const ROW_MATCHERS: Record<MetricField, RegExp> = {
  revenue: /(revenue|net sales|total sales|top\s+line)/i,
  grossProfit: /(gross\s+profit)/i,
  grossMargin: /(gross\s+margin)/i,
  ebitda: /(ebitda|adjusted\s+ebitda|adj\.*\s*ebitda)/i,
  ebitdaMargin: /(ebitda\s+margin|adj\.*\s*ebitda\s+margin)/i,
  revenueGrowth: /(revenue\s+growth|yoy|y\/y|year[-\s]*over[-\s]*year)/i
};

const METRIC_FIELDS = Object.keys(ROW_MATCHERS) as MetricField[];

// Fields whose values must look like money; every other field must look like a percentage.
const MONEY_FIELDS: ReadonlySet<MetricField> = new Set<MetricField>([
  'revenue',
  'grossProfit',
  'ebitda'
]);

// Buckets handed out to fiscal-year columns, walking the header from right to left.
const FY_BUCKETS: readonly Bucket[] = ['fy1', 'fy2', 'fy3'];

/** Collapses whitespace runs, removes parentheses and trims the token. */
function normalizeToken(token: string): string {
  return token.replace(/\s+/g, ' ').replace(/[()]/g, '').trim();
}

/** Returns true when `text` contains at least one money-like or percent-like number. */
function containsMoneyOrPercent(text: string): boolean {
  resetRegex(MONEY_REGEX);
  const hasMoney = MONEY_REGEX.test(text);
  resetRegex(PERCENT_REGEX);
  const hasPercent = PERCENT_REGEX.test(text);
  return hasMoney || hasPercent;
}

/**
 * Extracts the distinct, normalized period tokens (years, FY-n, LTM/TTM) from a line,
 * in the order they first appear. Tokens rejected by normalizePeriodToken are dropped.
 */
function tokenizePeriodHeaders(line: string): string[] {
  const matches = line.match(PERIOD_TOKEN_REGEX);
  if (!matches) return [];

  const normalizedTokens: string[] = [];
  for (const match of matches) {
    const normalized = normalizePeriodToken(match);
    if (normalized && !normalizedTokens.includes(normalized)) {
      normalizedTokens.push(normalized);
    }
  }
  return normalizedTokens;
}

/**
 * Canonicalizes one header token.
 *
 * @returns `null` for empty input or tokens ending in "P"/"PF"; otherwise the
 *   upper-cased token with whitespace, trailing punctuation and trailing letter
 *   suffixes removed ("FY2021A" -> "2021", "FY 21" -> "FY-21").
 */
function normalizePeriodToken(rawToken: string): string | null {
  if (!rawToken) return null;

  const upper = rawToken.trim().toUpperCase();
  // Tokens ending in P / PF are treated as projections and excluded from the header.
  if (upper.endsWith('P') || upper.endsWith('PF')) {
    return null;
  }

  return upper
    .replace(/[\u00A0\s]/g, '')
    // Remove trailing punctuation
    .replace(/[.,]+$/, '')
    // Strip letter suffixes following a four-digit year (A, E, F, ...)
    .replace(/(20\d{2})(?:[A-Z]+)$/i, '$1')
    .replace(/(FY20\d{2})(?:[A-Z]+)$/i, '$1')
    // Normalize FYXX to FY-XX
    .replace(/^FY(\d{1,2})$/, 'FY-$1')
    // Normalize FY20XX to just the year
    .replace(/^FY(20\d{2})$/, '$1');
}

/**
 * Maps header tokens to buckets, one entry per token.
 *
 * LTM/TTM tokens map to `ltm`. The remaining tokens are assumed to be fiscal years laid
 * out oldest-left / newest-right: the rightmost becomes `fy1`, then `fy2`, then `fy3`.
 * Any further columns to the left are `null` (ignored).
 */
function yearTokensToBuckets(tokens: string[]): Array<Bucket | null> {
  if (!tokens.length) return [];

  const bucketAssignments: BucketAssignment[] = new Array<BucketAssignment>(tokens.length).fill(
    null
  );
  const fiscalColumns: number[] = [];

  tokens.forEach((token, index) => {
    if (token.includes('LTM') || token.includes('TTM')) {
      bucketAssignments[index] = 'ltm';
    } else {
      fiscalColumns.push(index);
    }
  });

  // Walk fiscal-year columns from newest (rightmost) to oldest, handing out fy1..fy3.
  fiscalColumns
    .reverse()
    .slice(0, FY_BUCKETS.length)
    .forEach((columnIndex, recency) => {
      bucketAssignments[columnIndex] = FY_BUCKETS[recency] ?? null;
    });

  const assignedCount = bucketAssignments.filter(Boolean).length;
  if (assignedCount < 2) {
    logger.debug('Financial parser: Few periods detected', {
      totalTokens: tokens.length,
      assignedBuckets: assignedCount,
      tokens: tokens.slice(0, 10)
    });
  } else if (tokens.length > 4) {
    // Keyed on the header token count: the assigned count is capped by the bucket set,
    // so it cannot signal an unusually wide header.
    logger.debug('Financial parser: Many periods detected - may include projections', {
      totalTokens: tokens.length,
      assignedBuckets: assignedCount,
      tokens: tokens.slice(0, 10)
    });
  }

  return bucketAssignments;
}

/**
 * Extracts money and percentage tokens in reading order.
 *
 * Tokens found on `line` itself are preferred. Tokens from `additionalContent` are
 * appended only when `line` alone yields fewer than two tokens.
 */
function extractNumericTokens(line: string, additionalContent?: string): string[] {
  const combined = additionalContent ? `${line} ${additionalContent}` : line;
  const lineLength = line.length;

  const collect = (regex: RegExp): Array<{ value: string; index: number }> => {
    resetRegex(regex);
    return Array.from(combined.matchAll(regex))
      .map((m) => ({ value: normalizeToken(m[0]), index: m.index ?? 0 }))
      .filter((m) => m.value && /\d/.test(m.value));
  };

  // Money matches are listed first so that, for equal offsets, the stable sort keeps
  // the money token ahead of the percent token.
  const sortedMatches = [...collect(MONEY_REGEX), ...collect(PERCENT_REGEX)].sort(
    (a, b) => a.index - b.index
  );

  const primaryTokens = sortedMatches
    .filter((match) => match.index < lineLength)
    .map((match) => match.value);

  if (primaryTokens.length >= 2 || !additionalContent) {
    return primaryTokens.length > 0 ? primaryTokens : sortedMatches.map((match) => match.value);
  }

  const secondaryTokens = sortedMatches
    .filter((match) => match.index >= lineLength)
    .map((match) => match.value);
  return primaryTokens.concat(secondaryTokens);
}

/** True when the value has a digit and either a "$" or a K/M/B letter. */
function isMoneyLike(value?: string): boolean {
  if (!value) return false;
  const clean = value.replace(/[(),\s]/g, '');
  return /\d/.test(clean) && (value.includes('$') || /[KMB]/i.test(value));
}

/** True when the value has a digit and a "%" sign. */
function isPercentLike(value?: string): boolean {
  if (!value) return false;
  return /\d/.test(value) && value.includes('%');
}

/**
 * Hands row tokens to `mapper`, one per non-null bucket.
 *
 * When the row has at least as many tokens as the header has columns, tokens are
 * treated as column-aligned: the token under an ignored (`null`) column is skipped so
 * later tokens stay under their own column. With fewer tokens than columns the
 * alignment is unknown, so tokens are assigned sequentially to the non-null buckets.
 *
 * @param tokens Numeric tokens of one row, in reading order.
 * @param buckets Per-column bucket assignment from the header.
 * @param mapper Receives each (bucket, token) pair.
 * @param fieldName Metric name, used only in log output.
 * @param lineIndex Source line index, used only in log output.
 */
function assignTokensToBuckets(
  tokens: string[],
  buckets: Array<Bucket | null>,
  mapper: (bucket: Bucket, value: string) => void,
  fieldName?: string,
  lineIndex?: number
): void {
  const validBuckets = buckets.filter(Boolean).length;

  // Tolerate a difference of one between token count and bucket count before logging.
  if (tokens.length < validBuckets - 1) {
    logger.debug('Financial parser: Token count mismatch - too few tokens', {
      field: fieldName,
      lineIndex,
      tokensFound: tokens.length,
      validBuckets,
      tokens: tokens.slice(0, 10),
      buckets: buckets.map((b) => b || 'skip')
    });
  } else if (tokens.length > validBuckets + 1) {
    logger.debug('Financial parser: Token count mismatch - too many tokens', {
      field: fieldName,
      lineIndex,
      tokensFound: tokens.length,
      validBuckets,
      tokens: tokens.slice(0, 10),
      buckets: buckets.map((b) => b || 'skip')
    });
  }

  const columnAligned = tokens.length >= buckets.length;
  let tokenIndex = 0;

  for (let i = 0; i < buckets.length; i++) {
    const bucket = buckets[i];
    if (!bucket) {
      // Ignored column: consume its token only when every column has one.
      if (columnAligned) tokenIndex++;
      continue;
    }

    const token = tokens[tokenIndex];
    if (token === undefined) {
      logger.debug('Financial parser: Missing token for bucket', {
        field: fieldName,
        bucket,
        bucketIndex: i,
        tokensFound: tokens.length
      });
      continue;
    }

    mapper(bucket, token);
    tokenIndex++;
  }

  if (tokenIndex < tokens.length && tokens.length > validBuckets) {
    logger.debug('Financial parser: Unused tokens detected', {
      field: fieldName,
      tokensUsed: tokenIndex,
      tokensTotal: tokens.length,
      validBuckets,
      unusedTokens: tokens.slice(tokenIndex)
    });
  }
}

/** Outcome of scoring one candidate header line. */
interface HeaderScore {
  score: number;
  hasRevenue: boolean;
  hasEBITDA: boolean;
  financialRowCount: number;
  lookAheadStart: number;
  lookAheadEnd: number;
}

/** Scores a candidate header by period count, document position and the metric rows below it. */
function scoreHeaderCandidate(
  lines: string[],
  headerIndex: number,
  validBuckets: number
): HeaderScore {
  let score = validBuckets;

  // Headers later in the document are favoured: +50 past the midpoint, +20 past 40%.
  const documentPosition = headerIndex / lines.length;
  if (documentPosition > 0.5) {
    score += 50;
  } else if (documentPosition > 0.4) {
    score += 20;
  }

  // Inspect up to the next 19 lines for metric rows that actually carry numbers.
  const lookAheadStart = Math.min(headerIndex + 1, lines.length);
  const lookAheadEnd = Math.min(headerIndex + 20, lines.length);
  let hasRevenue = false;
  let hasEBITDA = false;
  let financialRowCount = 0;

  for (const checkLine of lines.slice(lookAheadStart, lookAheadEnd)) {
    if (!containsMoneyOrPercent(checkLine)) continue;

    if (ROW_MATCHERS.revenue.test(checkLine)) {
      hasRevenue = true;
      financialRowCount++;
    }
    if (ROW_MATCHERS.ebitda.test(checkLine)) {
      hasEBITDA = true;
      financialRowCount++;
    }
    if (
      ROW_MATCHERS.grossProfit.test(checkLine) ||
      ROW_MATCHERS.grossMargin.test(checkLine) ||
      ROW_MATCHERS.ebitdaMargin.test(checkLine) ||
      ROW_MATCHERS.revenueGrowth.test(checkLine)
    ) {
      financialRowCount++;
    }
  }

  // Revenue and EBITDA rows together are the strongest signal; one of them is a weaker one.
  if (hasRevenue && hasEBITDA) {
    score += 100;
  } else if (hasRevenue || hasEBITDA) {
    score += 20;
  }
  score += financialRowCount * 5;

  return { score, hasRevenue, hasEBITDA, financialRowCount, lookAheadStart, lookAheadEnd };
}

/**
 * Best-effort extraction of a historical financial summary table from plain text.
 *
 * Picks the highest-scoring line containing at least two period tokens as the table
 * header, then scans up to 100 lines from that header for metric rows and assigns
 * their numeric tokens to period buckets. The first value found for a field wins.
 *
 * @param fullText Document text; rows are expected to be separated by "\n".
 * @returns The parsed buckets. Buckets are left empty when no header is found, and any
 *   internal error is logged and swallowed, so this function does not throw.
 */
export function parseFinancialsFromText(fullText: string): ParsedFinancials {
  const startTime = Date.now();
  const result: ParsedFinancials = { fy3: {}, fy2: {}, fy1: {}, ltm: {} };

  try {
    const text = fullText.replace(/\u00A0/g, ' ');
    const lines = text
      .split('\n')
      .map((line) => line.trim())
      .filter(Boolean);

    if (lines.length === 0) {
      return result;
    }

    let bestHeaderIndex = -1;
    let bestBuckets: Array<Bucket | null> = [];
    let bestHeaderScore = 0;

    // Locate the best header line containing year-like tokens.
    for (let i = 0; i < lines.length; i++) {
      const headerLine = lines[i] ?? '';
      const tokens = tokenizePeriodHeaders(headerLine);
      if (tokens.length < 2) continue;

      const buckets = yearTokensToBuckets(tokens);
      const validBuckets = buckets.filter(Boolean).length;
      const { score, hasRevenue, hasEBITDA, financialRowCount, lookAheadStart, lookAheadEnd } =
        scoreHeaderCandidate(lines, i, validBuckets);

      if (validBuckets >= 2 && (hasRevenue || hasEBITDA || financialRowCount > 0)) {
        logger.debug('Financial parser header scoring', {
          headerIndex: i,
          headerLine: headerLine.substring(0, 100),
          validBuckets,
          hasRevenue,
          hasEBITDA,
          financialRowCount,
          score,
          lookAheadWindow: `${lookAheadStart}-${lookAheadEnd}`
        });
      }

      // On equal score, prefer the header with more usable periods.
      const bestValidBuckets = bestBuckets.filter(Boolean).length;
      if (
        score > bestHeaderScore ||
        (score === bestHeaderScore && validBuckets > bestValidBuckets)
      ) {
        bestHeaderScore = score;
        bestBuckets = buckets;
        bestHeaderIndex = i;
      }
    }

    if (bestHeaderIndex === -1 || bestBuckets.filter(Boolean).length === 0) {
      logger.info('Financial parser could not identify year header, returning empty result', {
        totalLines: lines.length,
        sampleLines: lines.slice(0, 20).join(' | ')
      });
      return result;
    }

    const bucketLabels = bestBuckets.map((bucket) => bucket || 'skip');

    logger.info('Financial parser selected best header', {
      headerIndex: bestHeaderIndex,
      headerScore: bestHeaderScore,
      buckets: bucketLabels
    });

    logger.info('Financial parser found header', {
      headerIndex: bestHeaderIndex,
      headerLine: lines[bestHeaderIndex],
      buckets: bucketLabels,
      totalLines: lines.length
    });

    // Diagnostic only: log the text surrounding the chosen header.
    const windowStart = Math.max(0, bestHeaderIndex - 10);
    const windowEnd = Math.min(lines.length, bestHeaderIndex + 50);
    const windowLines = lines.slice(windowStart, windowEnd);
    logger.info('Financial parser window', {
      windowStart,
      windowEnd,
      windowSize: windowLines.length,
      windowLines: windowLines.join(' | ')
    });

    let matchedRows = 0;

    // Scan from the header line itself through the following lines (100 lines in total).
    const searchEnd = Math.min(lines.length, bestHeaderIndex + 100);

    for (let i = bestHeaderIndex; i < searchEnd; i++) {
      const line = lines[i];
      if (!line) continue;

      // A row's numbers may wrap onto the next one or two lines.
      const nextLine = lines[i + 1] ?? '';
      const lineAfterNext = lines[i + 2] ?? '';

      // Skip text that mentions a metric but has no numbers nearby.
      if (!containsMoneyOrPercent(`${line} ${nextLine} ${lineAfterNext}`)) continue;

      for (const field of METRIC_FIELDS) {
        if (!ROW_MATCHERS[field].test(line)) continue;

        const accepts = MONEY_FIELDS.has(field) ? isMoneyLike : isPercentLike;

        const extraContent = `${nextLine} ${lineAfterNext}`.trim() || undefined;
        let tokens = extractNumericTokens(line, extraContent);

        // Narrow to the value type this field expects, when any such token exists.
        const typedTokens = tokens.filter((token) => accepts(token));
        if (typedTokens.length > 0) {
          tokens = typedTokens;
        }

        // Require at least two values, i.e. evidence of a multi-period row.
        if (tokens.length < 2) {
          logger.debug('Financial parser: matched field but insufficient tokens', {
            field,
            lineIndex: i,
            tokensFound: tokens.length,
            line: line.substring(0, 100)
          });
          continue;
        }

        matchedRows++;
        logger.info('Financial parser matched row', {
          field,
          lineIndex: i,
          line: line.substring(0, 150),
          nextLine: nextLine.substring(0, 100),
          tokensFound: tokens.length,
          tokens: tokens.slice(0, 10),
          buckets: bucketLabels
        });

        assignTokensToBuckets(
          tokens,
          bestBuckets,
          (bucket, value) => {
            // First value wins; values of the wrong type for this field are ignored.
            const period = result[bucket];
            if (accepts(value) && !period[field]) {
              period[field] = value;
            }
          },
          field,
          i
        );
      }
    }

    logger.info('Financial parser row matching summary', {
      matchedRows,
      bestBuckets: bestBuckets.length,
      buckets: bucketLabels
    });

    logger.info('Financial parser results', {
      elapsedMs: Date.now() - startTime,
      headerLine: lines[bestHeaderIndex],
      fy3: result.fy3,
      fy2: result.fy2,
      fy1: result.fy1,
      ltm: result.ltm
    });
  } catch (error) {
    logger.warn('Financial parser failed', {
      error: error instanceof Error ? error.message : String(error)
    });
  }

  return result;
}