Add acceptance tests and align defaults to Sonnet 4
This commit is contained in:
@@ -22,6 +22,10 @@ const PERIOD_TOKEN_REGEX = /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|
|
||||
/**
 * Matches a money-like numeric token: optional leading `-`, optional `$`,
 * optional opening `(`, digits with `,` separators, optional decimal part,
 * optional closing `)`, one optional whitespace character and an optional
 * `K`/`M`/`B` suffix.
 *
 * The `$` is optional, so any bare run of digits matches — including the
 * digits of a percentage such as `12.5%` (matched here as `12.5`).
 *
 * The pattern is global (`g`), so `test`/`exec` advance `lastIndex` on this
 * shared instance; rewind it before reusing the regex with those methods.
 */
const MONEY_REGEX = /-?\$?\(?\d[\d,]*(?:\.\d+)?\)?\s?(?:K|M|B)?/g;

/**
 * Matches a percentage token: optional leading `-`, one to three digits,
 * optional decimal part, one optional whitespace character, then `%`.
 *
 * Global (`g`) like MONEY_REGEX, so the same `lastIndex` caveat applies.
 */
const PERCENT_REGEX = /-?\d{1,3}(?:\.\d+)?\s?%/g;
|
||||
|
||||
/**
 * Rewinds a regular expression so its next `test`/`exec`/`matchAll` starts
 * scanning from the beginning of the input.
 *
 * Global and sticky regexes remember where the previous match ended in
 * `lastIndex`; reusing such an instance without rewinding can skip matches
 * or report a false negative.
 *
 * @param regex - The regex instance whose `lastIndex` is set back to 0.
 */
const resetRegex = (regex: RegExp): void => {
  regex.lastIndex = 0;
};
|
||||
|
||||
const ROW_MATCHERS: Record<string, RegExp> = {
|
||||
revenue: /(revenue|net sales|total sales|top\s+line)/i,
|
||||
grossProfit: /(gross\s+profit)/i,
|
||||
@@ -137,33 +141,37 @@ function yearTokensToBuckets(tokens: string[]): Array<Bucket | null> {
|
||||
* Extract numeric tokens (money/percentages) from a line or combined lines.
|
||||
* Best practice: Extract all numeric values and preserve their order to match column positions.
|
||||
*/
|
||||
function extractNumericTokens(line: string, additionalContent?: string): string[] {
  // Returns the money/percentage tokens found in `line`, in left-to-right
  // order so they can be mapped onto table columns. Repeated values are kept:
  // two columns may legitimately hold the same number.
  //
  // When `additionalContent` is supplied and `line` itself yields fewer than
  // two tokens, tokens from `additionalContent` are appended after the ones
  // from `line`; otherwise only the tokens from `line` are returned.
  const combined = additionalContent ? `${line} ${additionalContent}` : line;
  const lineLength = line.length;

  // Collects the normalized matches of one pattern together with their
  // [start, end) span inside `combined`.
  const collect = (regex: RegExp) => {
    // matchAll begins at the regex's current lastIndex, so rewind the shared
    // global regex before scanning.
    resetRegex(regex);
    return Array.from(combined.matchAll(regex))
      .map((m) => {
        const start = m.index ?? 0;
        // normalizeToken is defined elsewhere; presumably it returns a cleaned
        // string (possibly empty) — the filter below guards against that.
        return { value: normalizeToken(m[0]), start, end: start + m[0].length };
      })
      .filter((m) => m.value && /\d/.test(m.value));
  };

  const percentMatches = collect(PERCENT_REGEX);

  // MONEY_REGEX treats `$` as optional, so it also matches the digits of every
  // percentage ("20%" -> "20"). Drop money matches that overlap a percentage so
  // one cell is not reported twice.
  const moneyMatches = collect(MONEY_REGEX).filter(
    (money) => !percentMatches.some((pct) => money.start < pct.end && pct.start < money.end),
  );

  // Sort by position to preserve column order (critical for table parsing).
  const sortedMatches = [...moneyMatches, ...percentMatches].sort((a, b) => a.start - b.start);
  const allTokens = sortedMatches.map((m) => m.value);

  // Without extra content every match lies inside `line`.
  if (!additionalContent) {
    return allTokens;
  }

  const primaryTokens = sortedMatches
    .filter((m) => m.start < lineLength)
    .map((m) => m.value);

  // Two or more tokens on the row itself indicate a complete multi-period row;
  // otherwise fall back to the row plus the following content. Because matches
  // are sorted by position, `allTokens` is the primary tokens followed by the
  // secondary ones.
  return primaryTokens.length >= 2 ? primaryTokens : allTokens;
}
|
||||
|
||||
function isMoneyLike(value?: string): boolean {
|
||||
@@ -312,7 +320,7 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
|
||||
for (let j = lookAheadStart; j < lookAheadEnd; j++) {
|
||||
const checkLine = lines[j] || '';
|
||||
const hasNumbers = MONEY_REGEX.test(checkLine) || PERCENT_REGEX.test(checkLine);
|
||||
const hasNumbers = containsMoneyOrPercent(checkLine);
|
||||
|
||||
if (!hasNumbers) continue; // Skip lines without numbers
|
||||
|
||||
@@ -441,14 +449,27 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
|
||||
// CRITICAL: Only match rows that contain BOTH the field name AND numeric values
|
||||
// This prevents matching descriptive text that just mentions financial terms
|
||||
const hasMoneyOrPercent = MONEY_REGEX.test(combinedForTokens) || PERCENT_REGEX.test(combinedForTokens);
|
||||
const hasMoneyOrPercent = containsMoneyOrPercent(combinedForTokens);
|
||||
if (!hasMoneyOrPercent) continue; // Skip lines without actual financial numbers
|
||||
|
||||
for (const [field, matcher] of Object.entries(ROW_MATCHERS)) {
|
||||
if (!matcher.test(line)) continue;
|
||||
|
||||
// Extract tokens from the combined lines
|
||||
const tokens = extractNumericTokens(line, combinedForTokens);
|
||||
const extraContent = `${nextLine} ${lineAfterNext}`.trim() || undefined;
|
||||
let tokens = extractNumericTokens(line, extraContent);
|
||||
|
||||
if (['grossMargin', 'ebitdaMargin', 'revenueGrowth'].includes(field)) {
|
||||
const percentTokens = tokens.filter(isPercentLike);
|
||||
if (percentTokens.length > 0) {
|
||||
tokens = percentTokens;
|
||||
}
|
||||
} else if (['revenue', 'grossProfit', 'ebitda'].includes(field)) {
|
||||
const moneyTokens = tokens.filter(isMoneyLike);
|
||||
if (moneyTokens.length > 0) {
|
||||
tokens = moneyTokens;
|
||||
}
|
||||
}
|
||||
|
||||
// Only process if we found meaningful tokens (at least 2, indicating multiple periods)
|
||||
if (tokens.length < 2) {
|
||||
@@ -504,3 +525,10 @@ export function parseFinancialsFromText(fullText: string): ParsedFinancials {
|
||||
|
||||
return result;
|
||||
}
|
||||
/**
 * Reports whether `text` contains at least one match of MONEY_REGEX or
 * PERCENT_REGEX.
 *
 * Both patterns are shared global regexes, so each is rewound before it is
 * tested (a stale `lastIndex` could cause a false negative) and again
 * afterwards (so a successful match does not leave state behind for the next
 * user of the same instance).
 *
 * @param text - The line or combined lines to inspect.
 * @returns `true` when either pattern matches somewhere in `text`.
 */
const containsMoneyOrPercent = (text: string): boolean => {
  const matches = (regex: RegExp): boolean => {
    resetRegex(regex);
    const found = regex.test(text);
    resetRegex(regex);
    return found;
  };

  // Short-circuit: the percent pattern is only consulted when money did not match.
  return matches(MONEY_REGEX) || matches(PERCENT_REGEX);
};
|
||||
|
||||
Reference in New Issue
Block a user