// (File-viewer residue captured with this source: "535 lines · 19 KiB · TypeScript". It is page chrome, not part of the module.)
import { logger } from '../utils/logger';

// ---- Public result types ----
/**
 * Headline metrics captured for a single reporting period.
 *
 * Every field is optional and holds a string token rather than a parsed
 * number. `parseFinancialsFromText` only stores money-like tokens in the
 * amount fields and percent-like tokens in the margin / growth fields.
 */
export interface FinancialPeriod {
  /** Revenue amount token. */
  revenue?: string;
  /** Revenue growth token (a percentage). */
  revenueGrowth?: string;
  /** Gross profit amount token. */
  grossProfit?: string;
  /** Gross margin token (a percentage). */
  grossMargin?: string;
  /** EBITDA amount token. */
  ebitda?: string;
  /** EBITDA margin token (a percentage). */
  ebitdaMargin?: string;
}

/**
 * Result of parsing a financial summary table.
 *
 * `fy1` is the most recent fiscal-year column, `fy2` the one before it and
 * `fy3` the oldest tracked year; `ltm` holds the LTM / TTM column.
 * A period that could not be populated is left as an empty object.
 */
export interface ParsedFinancials {
  /** Oldest tracked fiscal year. */
  fy3: FinancialPeriod;
  /** Fiscal year before the most recent one. */
  fy2: FinancialPeriod;
  /** Most recent fiscal year. */
  fy1: FinancialPeriod;
  /** Last / trailing twelve months. */
  ltm: FinancialPeriod;
}

/** Name of a period slot in {@link ParsedFinancials} (`'fy3' | 'fy2' | 'fy1' | 'ltm'`). */
type Bucket = keyof ParsedFinancials;

// NOTE: all three patterns carry the `g` flag, so `test()` / `exec()` advance
// `lastIndex` on these shared objects. Reset `lastIndex` before reusing them
// with those methods (and before `matchAll`, which starts from `lastIndex`).

/**
 * Period labels in a table header: `FY21`, `FY-1`, `FY 2021`, `2021A`,
 * `LTM`, `TTM` (case-insensitive; a 4-digit year may carry a letter suffix).
 */
const PERIOD_TOKEN_REGEX = /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|(?:FY[-\s]?[1234])|(?:LTM|TTM))\b/gi;

/**
 * Any number, optionally signed, `$`-prefixed, parenthesised, with thousands
 * separators / decimals and an optional upper-case `K`, `M` or `B` suffix.
 * It also matches bare numbers such as years or the digits of a percentage.
 */
const MONEY_REGEX = /-?\$?\(?\d[\d,]*(?:\.\d+)?\)?\s?(?:K|M|B)?/g;

/** A 1-3 digit number (optional sign / decimals) followed by `%`. */
const PERCENT_REGEX = /-?\d{1,3}(?:\.\d+)?\s?%/g;

/**
 * Rewind a regular expression so its next `test` / `exec` / `matchAll`
 * starts scanning from the beginning of the input.
 *
 * @param regex - Pattern whose `lastIndex` is set back to 0.
 */
const resetRegex = (regex: RegExp): void => {
  regex.lastIndex = 0;
};

/**
 * Case-insensitive label patterns, keyed by the `FinancialPeriod` field they
 * feed.
 *
 * The patterns overlap on purpose-agnostic substrings: `revenue` also matches
 * a "Revenue growth" label and `ebitda` also matches an "EBITDA margin" label,
 * so one row can match several fields. None of them uses the `g` flag, so
 * `test()` is stateless here.
 */
const ROW_MATCHERS: Record<string, RegExp> = {
  revenue: /(revenue|net sales|total sales|top\s+line)/i,
  grossProfit: /(gross\s+profit)/i,
  grossMargin: /(gross\s+margin)/i,
  ebitda: /(ebitda|adjusted\s+ebitda|adj\.*\s*ebitda)/i,
  ebitdaMargin: /(ebitda\s+margin|adj\.*\s*ebitda\s+margin)/i,
  revenueGrowth: /(revenue\s+growth|yoy|y\/y|year[-\s]*over[-\s]*year)/i
};

/**
 * Tidy a matched numeric token: collapse whitespace runs to a single space,
 * drop every parenthesis and trim the ends.
 *
 * NOTE(review): removing the parentheses turns an accounting-style negative
 * such as `(1,234)` into `1,234` — confirm that losing the sign is acceptable.
 *
 * @param token - Raw text of a regex match.
 * @returns The cleaned token (may be empty).
 */
function normalizeToken(token: string): string {
  return token
    .replace(/\s+/g, ' ')
    .replace(/[()]/g, '')
    .trim();
}

/**
 * Pull the period labels out of a candidate header line.
 *
 * Each regex match is normalised with `normalizePeriodToken`; matches that
 * normalise to `null` / empty are dropped, and repeated labels are kept only
 * once, at the position of their first occurrence.
 *
 * @param line - One line of document text.
 * @returns Normalised period labels in left-to-right order (empty when the
 *          line contains none).
 */
function tokenizePeriodHeaders(line: string): string[] {
  const rawMatches = line.match(PERIOD_TOKEN_REGEX) ?? [];

  // A Set iterates in insertion order, so it de-duplicates while preserving
  // column order.
  const uniqueTokens = new Set<string>();
  for (const raw of rawMatches) {
    const normalized = normalizePeriodToken(raw);
    if (normalized) {
      uniqueTokens.add(normalized);
    }
  }

  return Array.from(uniqueTokens);
}

/**
 * Bring a period label into canonical form.
 *
 * - Labels ending in `P` or `PF` are rejected (`null`).
 * - Whitespace and trailing `.` / `,` are removed and the label is upper-cased.
 * - A letter suffix after a 4-digit year is dropped (`2021A` -> `2021`).
 * - `FY2021` and `FY-2021` both become `2021`.
 * - `FY21` becomes `FY-21`; `FY-1` is already canonical and is kept.
 * - Anything else (e.g. `LTM`) is returned unchanged apart from the clean-up.
 *
 * NOTE(review): suffixes other than `P` / `PF` (e.g. `E`, `F`) are stripped and
 * the year is kept, so such columns are treated like any other year — confirm
 * that is intended.
 *
 * @param rawToken - Label text as matched in the header line.
 * @returns The canonical label, or `null` for an empty input or a rejected
 *          `P` / `PF` label.
 */
function normalizePeriodToken(rawToken: string): string | null {
  if (!rawToken) return null;

  const upper = rawToken.trim().toUpperCase();
  if (upper.endsWith('P') || upper.endsWith('PF')) {
    return null;
  }

  return upper
    .replace(/[\u00A0\s]/g, '') // drop all whitespace, including NBSP
    .replace(/[.,]+$/, '') // drop trailing punctuation
    .replace(/(20\d{2})[A-Z]+$/, '$1') // 2021A / FY2021E -> 2021 / FY2021
    .replace(/^FY-?(20\d{2})$/, '$1') // FY2021 and FY-2021 -> 2021
    .replace(/^FY(\d{1,2})$/, 'FY-$1'); // FY21 -> FY-21
}

/**
 * Decide which result slot each header column feeds.
 *
 * - Labels containing `LTM` or `TTM` map to `'ltm'`.
 * - Of the remaining columns, the three most recent map to `'fy1'` (newest),
 *   `'fy2'` and `'fy3'`; any further columns map to `null` (ignored).
 * - "Most recent" is decided by year value when every remaining label is a
 *   plain 4-digit year, so a newest-first header (`2023 2022 2021`) is handled.
 *   Otherwise the usual oldest-left / newest-right layout is assumed and the
 *   rightmost column is taken as the newest.
 *
 * @param tokens - Normalised period labels in column order.
 * @returns One entry per token: the slot name, or `null` for a skipped column.
 */
function yearTokensToBuckets(tokens: string[]): Array<Bucket | null> {
  if (!tokens.length) return [];

  const bucketAssignments: Array<Bucket | null> = new Array(tokens.length).fill(null);

  // Split the columns into LTM/TTM (assigned immediately) and fiscal years.
  const fiscalIndices: number[] = [];
  tokens.forEach((token, index) => {
    if (token.includes('LTM') || token.includes('TTM')) {
      bucketAssignments[index] = 'ltm';
    } else {
      fiscalIndices.push(index);
    }
  });

  // Order the fiscal-year columns newest first.
  const allCalendarYears =
    fiscalIndices.length > 0 && fiscalIndices.every((index) => /^20\d{2}$/.test(tokens[index]));
  const newestFirst = [...fiscalIndices].sort((a, b) =>
    allCalendarYears
      ? (Number(tokens[b]) - Number(tokens[a])) || (b - a) // by year; ties fall back to position
      : b - a // positional: rightmost column is the newest
  );

  const fyBuckets: Bucket[] = ['fy1', 'fy2', 'fy3'];
  newestFirst.slice(0, fyBuckets.length).forEach((tokenIndex, rank) => {
    bucketAssignments[tokenIndex] = fyBuckets[rank];
  });

  // Diagnostics for unusual headers. The "many" check looks at the number of
  // columns: the number of assigned slots is capped (3 fiscal years + LTM), so
  // it cannot indicate an over-wide header.
  const assignedBuckets = bucketAssignments.filter(Boolean);
  if (assignedBuckets.length < 2) {
    logger.debug('Financial parser: Few periods detected', {
      totalTokens: tokens.length,
      assignedBuckets: assignedBuckets.length,
      tokens: tokens.slice(0, 10)
    });
  } else if (tokens.length > 4) {
    logger.debug('Financial parser: Many periods detected - may include projections', {
      totalTokens: tokens.length,
      assignedBuckets: assignedBuckets.length,
      tokens: tokens.slice(0, 10)
    });
  }

  return bucketAssignments;
}

/**
 * Extract numeric tokens (money / percentages) from a table row.
 *
 * Tokens are returned in left-to-right order so they can be matched to header
 * columns by position. When the row itself yields fewer than two tokens and
 * `additionalContent` is given, the tokens found in `additionalContent` are
 * appended (values wrapped onto the following lines).
 *
 * The row and the continuation text are scanned separately, so a value at the
 * end of the row cannot absorb the first letter of the continuation as a
 * `K` / `M` / `B` suffix.
 *
 * Note: the money pattern also matches the digits of a percentage, so `12%`
 * produces both `12` and `12%`; callers are expected to filter by kind.
 *
 * @param line - The row being parsed.
 * @param additionalContent - Optional text of the following line(s).
 * @returns Normalised tokens in document order (possibly empty).
 */
function extractNumericTokens(line: string, additionalContent?: string): string[] {
  const scan = (text: string): string[] => {
    const collect = (pattern: RegExp) => {
      // matchAll starts from the pattern's lastIndex, so rewind the shared regex first.
      resetRegex(pattern);
      return Array.from(text.matchAll(pattern))
        .map((match) => ({ value: normalizeToken(match[0]), index: match.index ?? 0 }))
        .filter((match) => match.value && /\d/.test(match.value));
    };

    // Merge both kinds and restore document order (the sort is stable, so a
    // money match precedes a percent match that starts at the same offset).
    return [...collect(MONEY_REGEX), ...collect(PERCENT_REGEX)]
      .sort((a, b) => a.index - b.index)
      .map((match) => match.value);
  };

  const primaryTokens = scan(line);
  if (primaryTokens.length >= 2 || !additionalContent) {
    return primaryTokens;
  }

  return primaryTokens.concat(scan(additionalContent));
}

/**
 * Whether a token looks like a monetary amount: it contains a digit and either
 * a `$` sign or one of the letters K / M / B (any case, anywhere in the token).
 *
 * @param value - Token to classify; `undefined` / empty is never money-like.
 */
function isMoneyLike(value?: string): boolean {
  if (!value) return false;

  const hasDigit = /\d/.test(value);
  const hasCurrencyMarker = value.includes('$') || /[KMB]/i.test(value);
  return hasDigit && hasCurrencyMarker;
}

/**
 * Whether a token looks like a percentage: it contains a digit and a `%` sign.
 *
 * @param value - Token to classify; `undefined` / empty is never percent-like.
 */
function isPercentLike(value?: string): boolean {
  if (!value) return false;
  return value.includes('%') && /\d/.test(value);
}

/**
 * Hand each row value to the period slot of its column.
 *
 * - When the row has at least one token per header column, token `i` belongs
 *   to column `i`; tokens under a skipped (`null`) column are dropped and
 *   tokens beyond the last column are reported as unused. This keeps values
 *   aligned when the header has more year columns than tracked slots.
 * - When the row has fewer tokens than columns, the column of each value is
 *   unknown; tokens are then handed to the non-null slots in order and slots
 *   left without a value are logged.
 *
 * @param tokens - Row values in left-to-right order.
 * @param buckets - Slot per header column (`null` = column is ignored).
 * @param mapper - Receives each `(slot, value)` pair.
 * @param fieldName - Metric name, used only in log output.
 * @param lineIndex - Row index, used only in log output.
 */
function assignTokensToBuckets(
  tokens: string[],
  buckets: Array<Bucket | null>,
  mapper: (bucket: Bucket, value: string) => void,
  fieldName?: string,
  lineIndex?: number
): void {
  const validBuckets = buckets.filter(Boolean).length;

  // Sanity-check the row width; a difference of one is tolerated silently.
  if (tokens.length < validBuckets - 1) {
    logger.debug('Financial parser: Token count mismatch - too few tokens', {
      field: fieldName,
      lineIndex,
      tokensFound: tokens.length,
      validBuckets,
      tokens: tokens.slice(0, 10),
      buckets: buckets.map(b => b || 'skip')
    });
  } else if (tokens.length > buckets.length + 1) {
    logger.debug('Financial parser: Token count mismatch - too many tokens', {
      field: fieldName,
      lineIndex,
      tokensFound: tokens.length,
      validBuckets,
      tokens: tokens.slice(0, 10),
      buckets: buckets.map(b => b || 'skip')
    });
  }

  let tokensConsumed = 0;

  if (tokens.length >= buckets.length) {
    // One value per column: map strictly by column position.
    for (let column = 0; column < buckets.length; column++) {
      const bucket = buckets[column];
      if (bucket) {
        mapper(bucket, tokens[column]);
      }
    }
    tokensConsumed = buckets.length;
  } else {
    // Short row: fill the tracked slots in order with what is available.
    for (let column = 0; column < buckets.length; column++) {
      const bucket = buckets[column];
      if (!bucket) continue;

      if (tokensConsumed < tokens.length) {
        mapper(bucket, tokens[tokensConsumed]);
        tokensConsumed++;
      } else {
        logger.debug('Financial parser: Missing token for bucket', {
          field: fieldName,
          bucket,
          bucketIndex: column,
          tokensFound: tokens.length
        });
      }
    }
  }

  // Left-over values may indicate misalignment or untracked trailing columns.
  if (tokensConsumed < tokens.length) {
    logger.debug('Financial parser: Unused tokens detected', {
      field: fieldName,
      tokensUsed: tokensConsumed,
      tokensTotal: tokens.length,
      validBuckets,
      unusedTokens: tokens.slice(tokensConsumed)
    });
  }
}

/** Which kind of token each tracked metric accepts. */
const FIELD_VALIDATORS: Record<keyof FinancialPeriod, (value?: string) => boolean> = {
  revenue: isMoneyLike,
  grossProfit: isMoneyLike,
  ebitda: isMoneyLike,
  grossMargin: isPercentLike,
  ebitdaMargin: isPercentLike,
  revenueGrowth: isPercentLike
};

/** Narrow a matcher key to a metric that has a validator. */
function isTrackedField(field: string): field is keyof FinancialPeriod {
  return Object.prototype.hasOwnProperty.call(FIELD_VALIDATORS, field);
}

/** A header line chosen as the anchor of the financial table. */
interface HeaderCandidate {
  index: number;
  buckets: Array<Bucket | null>;
  score: number;
}

/**
 * Scan every line for period labels and return the highest-scoring header.
 *
 * Score = number of assigned columns
 *       + 50 if the line sits past 50% of the document, else + 20 if past 40%
 *       + 100 if both a revenue and an EBITDA row with numbers follow within
 *         the next 19 lines, else + 20 if one of them does
 *       + 5 per metric row counted in that look-ahead.
 * Ties go to the header with more assigned columns; otherwise the earlier line wins.
 */
function selectBestHeader(lines: string[]): HeaderCandidate | null {
  let best: HeaderCandidate | null = null;
  let bestValidBuckets = 0;

  for (let i = 0; i < lines.length; i++) {
    const tokens = tokenizePeriodHeaders(lines[i]);
    if (tokens.length < 2) continue;

    const buckets = yearTokensToBuckets(tokens);
    const validBuckets = buckets.filter(Boolean).length;
    let score = validBuckets;

    // Financial sections usually sit in the back half of the document.
    const documentPosition = i / lines.length;
    if (documentPosition > 0.5) {
      score += 50;
    } else if (documentPosition > 0.4) {
      score += 20;
    }

    // Look ahead for metric rows that actually carry numbers.
    const lookAheadStart = Math.min(i + 1, lines.length);
    const lookAheadEnd = Math.min(i + 20, lines.length);
    let hasRevenue = false;
    let hasEBITDA = false;
    let financialRowCount = 0;

    for (let j = lookAheadStart; j < lookAheadEnd; j++) {
      const checkLine = lines[j] || '';
      if (!containsMoneyOrPercent(checkLine)) continue;

      if (ROW_MATCHERS.revenue.test(checkLine)) {
        hasRevenue = true;
        financialRowCount++;
      }
      if (ROW_MATCHERS.ebitda.test(checkLine)) {
        hasEBITDA = true;
        financialRowCount++;
      }
      if (
        ROW_MATCHERS.grossProfit.test(checkLine) ||
        ROW_MATCHERS.grossMargin.test(checkLine) ||
        ROW_MATCHERS.ebitdaMargin.test(checkLine) ||
        ROW_MATCHERS.revenueGrowth.test(checkLine)
      ) {
        financialRowCount++;
      }
    }

    if (hasRevenue && hasEBITDA) {
      score += 100;
    } else if (hasRevenue || hasEBITDA) {
      score += 20;
    }
    score += financialRowCount * 5;

    if (validBuckets >= 2 && (hasRevenue || hasEBITDA || financialRowCount > 0)) {
      logger.debug('Financial parser header scoring', {
        headerIndex: i,
        headerLine: lines[i].substring(0, 100),
        validBuckets,
        hasRevenue,
        hasEBITDA,
        financialRowCount,
        score,
        lookAheadWindow: `${lookAheadStart}-${lookAheadEnd}`
      });
    }

    const bestScore = best ? best.score : 0;
    if (score > bestScore || (score === bestScore && validBuckets > bestValidBuckets)) {
      best = { index: i, buckets, score };
      bestValidBuckets = validBuckets;
    }
  }

  return best;
}

/**
 * Parse revenue, gross profit, EBITDA and their margin / growth rows for up to
 * three fiscal years plus LTM out of plain document text.
 *
 * Steps: normalise the text into trimmed non-empty lines, pick the best period
 * header (`selectBestHeader`), then scan from that header up to 100 lines ahead
 * for rows whose label matches a `ROW_MATCHERS` entry and that carry at least
 * two tokens of the expected kind (money or percent). The first value found
 * for a field / period wins.
 *
 * Any error is logged as a warning and whatever was parsed so far is returned;
 * this function does not throw.
 *
 * @param fullText - Raw text of the document.
 * @returns The populated periods; periods without data stay empty objects.
 */
export function parseFinancialsFromText(fullText: string): ParsedFinancials {
  const startTime = Date.now();
  const result: ParsedFinancials = {
    fy3: {},
    fy2: {},
    fy1: {},
    ltm: {}
  };

  try {
    const lines = fullText
      .replace(/\u00A0/g, ' ')
      .split('\n')
      .map((line) => line.trim())
      .filter(Boolean);
    if (lines.length === 0) {
      return result;
    }

    const header = selectBestHeader(lines);
    if (!header || header.buckets.filter(Boolean).length === 0) {
      logger.info('Financial parser could not identify year header, returning empty result', {
        totalLines: lines.length,
        sampleLines: lines.slice(0, 20).join(' | ')
      });
      return result;
    }

    const { index: bestHeaderIndex, buckets: bestBuckets, score: bestHeaderScore } = header;

    logger.info('Financial parser selected best header', {
      headerIndex: bestHeaderIndex,
      headerScore: bestHeaderScore,
      buckets: bestBuckets.map((bucket) => bucket || 'skip')
    });

    logger.info('Financial parser found header', {
      headerIndex: bestHeaderIndex,
      headerLine: lines[bestHeaderIndex],
      buckets: bestBuckets.map((bucket) => bucket || 'skip'),
      totalLines: lines.length
    });

    // Diagnostic only: the text surrounding the chosen header.
    const windowStart = Math.max(0, bestHeaderIndex - 10);
    const windowEnd = Math.min(lines.length, bestHeaderIndex + 50);
    const windowLines = lines.slice(windowStart, windowEnd);

    logger.info('Financial parser window', {
      windowStart,
      windowEnd,
      windowSize: windowLines.length,
      windowLines: windowLines.join(' | ')
    });

    let matchedRows = 0;
    const searchEnd = Math.min(lines.length, bestHeaderIndex + 100);

    for (let i = bestHeaderIndex; i < searchEnd; i++) {
      const line = lines[i];
      if (!line) continue;

      // Values may wrap onto the next two lines.
      const nextLine = lines[i + 1] || '';
      const lineAfterNext = lines[i + 2] || '';

      // Skip descriptive text that mentions a metric but carries no numbers.
      if (!containsMoneyOrPercent(`${line} ${nextLine} ${lineAfterNext}`)) continue;

      // Extracted lazily, once per row, and shared by every field the row matches.
      let rowTokens: string[] | undefined;

      for (const [field, matcher] of Object.entries(ROW_MATCHERS)) {
        if (!matcher.test(line) || !isTrackedField(field)) continue;

        if (!rowTokens) {
          const extraContent = `${nextLine} ${lineAfterNext}`.trim() || undefined;
          rowTokens = extractNumericTokens(line, extraContent);
        }

        // Keep only tokens of the kind this field stores; anything else would
        // be rejected at assignment time anyway.
        const accepts = FIELD_VALIDATORS[field];
        const tokens = rowTokens.filter((token) => accepts(token));

        // Require at least two values, i.e. more than one period.
        if (tokens.length < 2) {
          logger.debug('Financial parser: matched field but insufficient tokens', {
            field,
            lineIndex: i,
            tokensFound: tokens.length,
            line: line.substring(0, 100)
          });
          continue;
        }

        matchedRows++;
        logger.info('Financial parser matched row', {
          field,
          lineIndex: i,
          line: line.substring(0, 150),
          nextLine: nextLine.substring(0, 100),
          tokensFound: tokens.length,
          tokens: tokens.slice(0, 10), // Limit token logging
          buckets: bestBuckets.map(b => b || 'skip')
        });

        assignTokensToBuckets(
          tokens,
          bestBuckets,
          (bucket, value) => {
            // First value found for a field / period wins.
            const period = result[bucket];
            period[field] = period[field] || value;
          },
          field,
          i
        );
      }
    }

    logger.info('Financial parser row matching summary', {
      matchedRows,
      bestBuckets: bestBuckets.length,
      buckets: bestBuckets.map((bucket) => bucket || 'skip')
    });

    logger.info('Financial parser results', {
      elapsedMs: Date.now() - startTime,
      headerLine: lines[bestHeaderIndex],
      fy3: result.fy3,
      fy2: result.fy2,
      fy1: result.fy1,
      ltm: result.ltm
    });
  } catch (error) {
    // Best effort: report the failure and return what has been collected.
    logger.warn('Financial parser failed', { error: error instanceof Error ? error.message : String(error) });
  }

  return result;
}

/**
 * Whether `text` contains at least one match of the money or percent pattern.
 *
 * Both patterns are shared `g`-flagged regexes, so each is rewound before the
 * probe and again afterwards; a successful `test()` would otherwise leave a
 * non-zero `lastIndex` behind for the next user.
 *
 * @param text - Text to probe (one line or several joined lines).
 */
const containsMoneyOrPercent = (text: string): boolean =>
  [MONEY_REGEX, PERCENT_REGEX].some((pattern) => {
    resetRegex(pattern);
    const found = pattern.test(text);
    resetRegex(pattern);
    return found;
  });