// cim_summary/backend/src/services/financialTableParser.ts

import { logger } from '../utils/logger';

/**
 * Metrics captured for a single reporting period.
 *
 * Every field is optional and holds the value as the text token taken from
 * the document (for example a money or percentage string); nothing here is
 * converted to a number.
 */
export interface FinancialPeriod {
  /** Revenue; only stored when the token passes the money-like check. */
  revenue?: string;
  /** Revenue growth; only stored when the token passes the percent-like check. */
  revenueGrowth?: string;
  /** Gross profit; only stored when the token passes the money-like check. */
  grossProfit?: string;
  /** Gross margin; only stored when the token passes the percent-like check. */
  grossMargin?: string;
  /** EBITDA; only stored when the token passes the money-like check. */
  ebitda?: string;
  /** EBITDA margin; only stored when the token passes the percent-like check. */
  ebitdaMargin?: string;
}

/**
 * Parser output: one FinancialPeriod per bucket.
 *
 * Buckets are filled from the header columns of the selected table. Among the
 * non-LTM/TTM columns the right-most one is mapped to `fy1`, the one before it
 * to `fy2`, and the one before that to `fy3`. A bucket with no data stays `{}`.
 */
export interface ParsedFinancials {
  /** Third fiscal-year column counting back from the right-most one. */
  fy3: FinancialPeriod;
  /** Second fiscal-year column counting back from the right-most one. */
  fy2: FinancialPeriod;
  /** Right-most fiscal-year column of the header. */
  fy1: FinancialPeriod;
  /** Column whose header token contains LTM or TTM. */
  ltm: FinancialPeriod;
}

/** Name of one output bucket: 'fy3' | 'fy2' | 'fy1' | 'ltm'. */
type Bucket = keyof ParsedFinancials;

// NOTE: all three patterns carry the `g` flag, so `test()` advances `lastIndex`
// on the shared object; callers reset it (see resetRegex) before reusing them.

// Period labels in a header line (case-insensitive): "FY" followed by 1-2
// digits with an optional hyphen/space, a 20xx year with an optional "FY"
// prefix and optional trailing letters (e.g. 2023A), or LTM / TTM.
const PERIOD_TOKEN_REGEX = /\b(?:(?:FY[-\s]?\d{1,2})|(?:FY[-\s]?)?20\d{2}[A-Z]*|(?:FY[-\s]?[1234])|(?:LTM|TTM))\b/gi;
// Number-like tokens: optional "-", optional "$", optional "(", digits with
// commas, optional decimals, optional ")", optional space and an optional
// upper-case K/M/B suffix. Every part except the digits is optional, so a
// bare number (including the numeric part of a percentage) also matches.
const MONEY_REGEX = /-?\$?\(?\d[\d,]*(?:\.\d+)?\)?\s?(?:K|M|B)?/g;
// Percentages: optional "-", 1-3 digits, optional decimals, optional space, "%".
const PERCENT_REGEX = /-?\d{1,3}(?:\.\d+)?\s?%/g;

/**
 * Rewinds a regular expression so its next `test`/`exec`/`matchAll` starts at
 * the beginning of the input. Needed for patterns that carry the `g` flag,
 * because they remember where the previous match ended.
 */
const resetRegex = (regex: RegExp): void => {
  if (regex.lastIndex !== 0) {
    regex.lastIndex = 0;
  }
};

// Case-insensitive row-label patterns, keyed by the FinancialPeriod field
// they feed. Each pattern is an unanchored substring test.
//
// The patterns are not mutually exclusive: a label such as "EBITDA margin"
// satisfies both `ebitda` and `ebitdaMargin`, and "revenue growth" satisfies
// both `revenue` and `revenueGrowth`. In `ebitda` and `ebitdaMargin` the
// "adjusted"/"adj." alternatives are redundant with the first alternative,
// because the plain "ebitda" / "ebitda margin" text is already a substring
// of those labels.
const ROW_MATCHERS: Record<string, RegExp> = {
  revenue: /(revenue|net sales|total sales|top\s+line)/i,
  grossProfit: /(gross\s+profit)/i,
  grossMargin: /(gross\s+margin)/i,
  ebitda: /(ebitda|adjusted\s+ebitda|adj\.*\s*ebitda)/i,
  ebitdaMargin: /(ebitda\s+margin|adj\.*\s*ebitda\s+margin)/i,
  revenueGrowth: /(revenue\s+growth|yoy|y\/y|year[-\s]*over[-\s]*year)/i
};

/**
 * Cleans up a raw numeric token taken from a table row.
 *
 * - Runs of whitespace are collapsed to a single space and the result is trimmed.
 * - Parentheses are removed. When the token is fully wrapped in a matched pair
 *   around a digit (accounting notation for a negative, e.g. "(1,234)" or
 *   "$(5.2)M"), a leading "-" is added so the sign is not lost. A lone "(" or
 *   ")" is treated as stray punctuation and simply dropped.
 *
 * @param token Raw matched text.
 * @returns The cleaned token; may be an empty string.
 */
function normalizeToken(token: string): string {
  const collapsed = token.replace(/\s+/g, ' ');
  // A matched "( ... digit ... )" pair marks an accounting-style negative.
  const isParenNegative = /\([^()]*\d[^()]*\)/.test(collapsed);
  const stripped = collapsed.replace(/[()]/g, '').trim();

  if (isParenNegative && stripped.length > 0 && !stripped.startsWith('-')) {
    return `-${stripped}`;
  }
  return stripped;
}

/**
 * Pulls the period labels out of a candidate header line.
 *
 * Each raw match of PERIOD_TOKEN_REGEX is normalized; labels that normalize
 * to nothing (null or empty) are dropped, and repeated labels are kept only
 * once, in first-seen order.
 *
 * @param line A single line of text.
 * @returns Unique normalized period labels, left to right.
 */
function tokenizePeriodHeaders(line: string): string[] {
  const rawLabels = line.match(PERIOD_TOKEN_REGEX) ?? [];
  const uniqueLabels = new Set<string>();

  rawLabels.forEach((raw) => {
    const label = normalizePeriodToken(raw);
    if (label) {
      uniqueLabels.add(label);
    }
  });

  return Array.from(uniqueLabels);
}

/**
 * Canonicalizes one period label from a table header.
 *
 * - Labels ending in "P" or "PF" are rejected (null).
 * - Whitespace and trailing "." / "," are removed and the label is upper-cased.
 * - Letter suffixes after a 20xx year are stripped ("2023A" -> "2023").
 *   NOTE(review): this also keeps "E"/"F"-suffixed years as ordinary years.
 * - "FY" + 1-2 digits becomes "FY-<digits>"; "FY" + 20xx becomes the bare year.
 * - Anything else (e.g. "LTM") is returned in its cleaned form.
 *
 * @param rawToken Label text as matched in the header line.
 * @returns The canonical label, or null for an empty or rejected input.
 */
function normalizePeriodToken(rawToken: string): string | null {
  if (!rawToken) return null;

  const upper = rawToken.trim().toUpperCase();
  const rejectedSuffixes = ['P', 'PF'];
  if (rejectedSuffixes.some((suffix) => upper.endsWith(suffix))) {
    return null;
  }

  const cleaned = upper
    .replace(/[\u00A0\s]/g, '') // drop every kind of spacing
    .replace(/[.,]+$/, '') // drop trailing punctuation
    .replace(/(20\d{2})(?:[A-Z]+)$/i, '$1') // 2023A -> 2023
    .replace(/(FY20\d{2})(?:[A-Z]+)$/i, '$1');

  // FYxx -> FY-xx
  const shortFiscal = /^FY(\d{1,2})$/.exec(cleaned);
  if (shortFiscal) {
    return `FY-${shortFiscal[1]}`;
  }

  // FY20xx -> 20xx
  const fullFiscal = /^FY(20\d{2})$/.exec(cleaned);
  if (fullFiscal) {
    return fullFiscal[1];
  }

  return cleaned;
}

/**
 * Maps normalized header labels to output buckets, one entry per label.
 *
 * - Labels containing "LTM" or "TTM" map to 'ltm'.
 * - The remaining (fiscal) labels are ranked newest-first and the first three
 *   receive 'fy1', 'fy2', 'fy3'. Any further fiscal labels map to null.
 * - Ranking: when every fiscal label is a plain 20xx year, the year value
 *   decides, so a newest-first header ("2023 2022 2021") is handled as well
 *   as an oldest-first one. Otherwise the right-most label is taken as the
 *   most recent (oldest on the left, newest on the right).
 *
 * @param tokens Normalized period labels in header order.
 * @returns An array parallel to `tokens`; null marks a column to ignore.
 */
function yearTokensToBuckets(tokens: string[]): Array<Bucket | null> {
  if (!tokens.length) return [];

  const bucketAssignments: Array<Bucket | null> = new Array(tokens.length).fill(null);
  const fiscalColumns: Array<{ token: string; index: number }> = [];

  // First pass: split LTM/TTM columns from fiscal-year columns.
  tokens.forEach((token, index) => {
    if (token.includes('LTM') || token.includes('TTM')) {
      bucketAssignments[index] = 'ltm';
    } else {
      fiscalColumns.push({ token, index });
    }
  });

  // Second pass: order fiscal columns from most recent to oldest.
  const calendarYear = /^20\d{2}$/;
  const allCalendarYears =
    fiscalColumns.length > 0 && fiscalColumns.every(({ token }) => calendarYear.test(token));

  const newestFirst = allCalendarYears
    ? // Year value decides; on a tie prefer the right-most column (legacy order).
      [...fiscalColumns].sort(
        (a, b) => Number(b.token) - Number(a.token) || b.index - a.index
      )
    : // Relative labels (e.g. FY-1) carry no reliable ordering: assume newest is right-most.
      [...fiscalColumns].reverse();

  // Tables with fewer than three fiscal columns simply leave fy2/fy3 unassigned.
  const fyBuckets: Bucket[] = ['fy1', 'fy2', 'fy3'];
  newestFirst.slice(0, fyBuckets.length).forEach(({ index }, rank) => {
    bucketAssignments[index] = fyBuckets[rank];
  });

  // Validation: log unusual period counts.
  const assignedBuckets = bucketAssignments.filter(Boolean);
  if (assignedBuckets.length < 2) {
    logger.debug('Financial parser: Few periods detected', {
      totalTokens: tokens.length,
      assignedBuckets: assignedBuckets.length,
      tokens: tokens.slice(0, 10)
    });
  } else if (assignedBuckets.length > 4) {
    logger.debug('Financial parser: Many periods detected - may include projections', {
      totalTokens: tokens.length,
      assignedBuckets: assignedBuckets.length,
      tokens: tokens.slice(0, 10)
    });
  }

  return bucketAssignments;
}

/**
 * Collects the numeric tokens (money-style and percentage-style) of a row.
 *
 * Matches are gathered over `line` plus, when given, `additionalContent`, and
 * ordered by their position in that combined text so they follow the column
 * order. Tokens located inside `line` itself are preferred: continuation
 * tokens from `additionalContent` are appended only when `line` yields fewer
 * than two tokens.
 *
 * @param line The row's own text.
 * @param additionalContent Optional text of the following line(s).
 * @returns Normalized tokens in reading order.
 */
function extractNumericTokens(line: string, additionalContent?: string): string[] {
  const haystack = additionalContent ? `${line} ${additionalContent}` : line;
  const boundary = line.length;

  // Runs one pattern over the combined text and keeps normalized hits that contain a digit.
  const collect = (pattern: RegExp): Array<{ value: string; index: number }> => {
    resetRegex(pattern);
    const hits: Array<{ value: string; index: number }> = [];
    for (const match of Array.from(haystack.matchAll(pattern))) {
      const value = normalizeToken(match[0]);
      if (value && /\d/.test(value)) {
        hits.push({ value, index: match.index ?? 0 });
      }
    }
    return hits;
  };

  const moneyHits = collect(MONEY_REGEX);
  const percentHits = collect(PERCENT_REGEX);
  const ordered = moneyHits.concat(percentHits).sort((a, b) => a.index - b.index);

  // Split by origin: inside the row itself vs. inside the continuation text.
  const fromLine: string[] = [];
  const fromContinuation: string[] = [];
  for (const hit of ordered) {
    if (hit.index < boundary) {
      fromLine.push(hit.value);
    } else {
      fromContinuation.push(hit.value);
    }
  }

  const rowIsSufficient = fromLine.length >= 2 || !additionalContent;
  if (rowIsSufficient) {
    if (fromLine.length > 0) {
      return fromLine;
    }
    return ordered.map((hit) => hit.value);
  }

  return fromLine.concat(fromContinuation);
}

/**
 * Tells whether a token looks like a monetary amount: it must contain a digit
 * and either a "$" or one of the letters K, M, B (any case).
 *
 * @param value Token to inspect; undefined or empty yields false.
 */
function isMoneyLike(value?: string): boolean {
  if (!value) return false;

  const withoutPunctuation = value.replace(/[(),\s]/g, '');
  if (!/\d/.test(withoutPunctuation)) {
    return false;
  }

  const hasCurrencySign = value.includes('$');
  return hasCurrencySign || /[KMB]/i.test(value);
}

/**
 * Tells whether a token looks like a percentage: it must contain both a digit
 * and a "%" sign.
 *
 * @param value Token to inspect; undefined or empty yields false.
 */
function isPercentLike(value?: string): boolean {
  if (!value) {
    return false;
  }
  const hasPercentSign = value.indexOf('%') !== -1;
  const hasDigit = /\d/.test(value);
  return hasDigit && hasPercentSign;
}

/**
 * Hands each row token to the bucket of the header column it belongs to.
 *
 * Alignment rule:
 * - When the row has at least as many tokens as the header has columns, the
 *   i-th token belongs to the i-th column. Tokens under ignored columns
 *   (null buckets) are consumed and discarded, and surplus trailing tokens
 *   are left unused.
 * - When the row has fewer tokens than header columns, the true column of
 *   each token is ambiguous; tokens are then handed, in order, to the
 *   non-null buckets (ignored columns consume nothing).
 *
 * Count mismatches, buckets left without a token and unused tokens are only
 * reported through debug logging; the function never throws for them.
 *
 * @param tokens Row values in reading order.
 * @param buckets Bucket per header column; null marks a column to ignore.
 * @param mapper Callback receiving each (bucket, value) pair.
 * @param fieldName Field being parsed; used for log context only.
 * @param lineIndex Source line index; used for log context only.
 */
function assignTokensToBuckets(
  tokens: string[],
  buckets: Array<Bucket | null>,
  mapper: (bucket: Bucket, value: string) => void,
  fieldName?: string,
  lineIndex?: number
) {
  // Number of columns we actually want to extract.
  const validBuckets = buckets.filter(Boolean).length;

  // Tolerate a difference of one between token count and wanted columns.
  if (tokens.length < validBuckets - 1) {
    logger.debug('Financial parser: Token count mismatch - too few tokens', {
      field: fieldName,
      lineIndex,
      tokensFound: tokens.length,
      validBuckets,
      tokens: tokens.slice(0, 10),
      buckets: buckets.map(b => b || 'skip')
    });
  } else if (tokens.length > validBuckets + 1) {
    logger.debug('Financial parser: Token count mismatch - too many tokens', {
      field: fieldName,
      lineIndex,
      tokensFound: tokens.length,
      validBuckets,
      tokens: tokens.slice(0, 10),
      buckets: buckets.map(b => b || 'skip')
    });
  }

  // With one token per header column (or more), columns and tokens line up
  // one-to-one, so an ignored column must swallow its own token. Otherwise
  // fy3/fy2/fy1 would receive the values of the older, ignored columns.
  const columnAligned = tokens.length >= buckets.length;

  let tokenIndex = 0;
  for (let i = 0; i < buckets.length; i++) {
    const bucket = buckets[i];
    if (!bucket) {
      if (columnAligned) {
        tokenIndex++;
      }
      continue;
    }

    if (tokenIndex < tokens.length) {
      mapper(bucket, tokens[tokenIndex]);
      tokenIndex++;
    } else {
      // Row ran out of values before this column.
      logger.debug('Financial parser: Missing token for bucket', {
        field: fieldName,
        bucket,
        bucketIndex: i,
        tokensFound: tokens.length
      });
    }
  }

  // Leftover tokens may indicate misalignment (or trailing projection columns).
  if (tokenIndex < tokens.length && tokens.length > validBuckets) {
    logger.debug('Financial parser: Unused tokens detected', {
      field: fieldName,
      tokensUsed: tokenIndex,
      tokensTotal: tokens.length,
      validBuckets,
      unusedTokens: tokens.slice(tokenIndex)
    });
  }
}

/**
 * Best-effort extraction of fy3 / fy2 / fy1 / ltm metrics from raw document text.
 *
 * Steps:
 * 1. Replace non-breaking spaces, split on '\n', trim each line and drop empty lines.
 * 2. Treat every line with at least two period tokens as a candidate header and
 *    score it (position in the document plus revenue/EBITDA-looking rows that
 *    follow it); keep the best-scoring one.
 * 3. Starting at the header line, scan up to 100 lines for rows whose label
 *    matches ROW_MATCHERS, pull the numeric tokens of the row (falling back to
 *    the next two lines) and map them onto the header's buckets by position.
 *
 * For each bucket/field the first accepted value wins; later rows do not
 * overwrite it. Errors raised inside the try block are logged as a warning
 * and whatever was collected so far is returned.
 *
 * @param fullText Raw text of the document.
 * @returns The four buckets; buckets/fields without data are left empty.
 */
export function parseFinancialsFromText(fullText: string): ParsedFinancials {
  const startTime = Date.now();
  const result: ParsedFinancials = {
    fy3: {},
    fy2: {},
    fy1: {},
    ltm: {}
  };

  try {
    // Normalize non-breaking spaces so trimming and regex \s handling behave uniformly.
    const text = fullText.replace(/\u00A0/g, ' ');
    const lines = text.split('\n').map((line) => line.trim()).filter(Boolean);
    if (lines.length === 0) {
      return result;
    }

    let bestHeaderIndex = -1;
    let bestBuckets: Array<Bucket | null> = [];
    let bestHeaderScore = 0;

    // Locate the best header line: any line with at least two period tokens
    // is a candidate, scored by bucket count, document position and the
    // financial-looking rows that follow it.
    for (let i = 0; i < lines.length; i++) {
      const tokens = tokenizePeriodHeaders(lines[i]);
      if (tokens.length >= 2) {
        const buckets = yearTokensToBuckets(tokens);
        const validBuckets = buckets.filter(Boolean).length;

        // Base score: number of usable (non-null) buckets.
        let score = validBuckets;

        // Position heuristic: headers past the middle of the document get a
        // strong boost; headers in the 40-50% band get a smaller one.
        const documentPosition = i / lines.length;
        if (documentPosition > 0.5) {
          score += 50; // Strong boost for headers in back half
        } else if (documentPosition > 0.4) {
          score += 20; // Moderate boost for headers between 40% and 50% of the document
        }

        // Content heuristic: inspect the lines after the header (indices
        // i+1 .. i+19, clamped to the document) for revenue / EBITDA rows.
        const lookAheadStart = Math.min(i + 1, lines.length);
        const lookAheadEnd = Math.min(i + 20, lines.length);
        let hasRevenue = false;
        let hasEBITDA = false;
        let financialRowCount = 0;

        for (let j = lookAheadStart; j < lookAheadEnd; j++) {
          const checkLine = lines[j] || '';
          const hasNumbers = containsMoneyOrPercent(checkLine);

          if (!hasNumbers) continue; // Skip lines without numbers

          // Revenue label (and variations)
          if (ROW_MATCHERS.revenue.test(checkLine)) {
            hasRevenue = true;
            financialRowCount++;
          }

          // EBITDA label (and variations)
          if (ROW_MATCHERS.ebitda.test(checkLine)) {
            hasEBITDA = true;
            financialRowCount++;
          }

          // Any other metric label counts once per line, in addition to the above.
          if (ROW_MATCHERS.grossProfit.test(checkLine) ||
              ROW_MATCHERS.grossMargin.test(checkLine) ||
              ROW_MATCHERS.ebitdaMargin.test(checkLine) ||
              ROW_MATCHERS.revenueGrowth.test(checkLine)) {
            financialRowCount++;
          }
        }

        // Largest boost when both a revenue row and an EBITDA row follow the header.
        if (hasRevenue && hasEBITDA) {
          score += 100; // This is almost certainly the financial table
        } else if (hasRevenue || hasEBITDA) {
          score += 20; // Has one key metric
        }

        // Additional boost per counted financial row.
        score += financialRowCount * 5;

        // Log scoring details for debugging (only for headers with potential)
        if (validBuckets >= 2 && (hasRevenue || hasEBITDA || financialRowCount > 0)) {
          logger.debug('Financial parser header scoring', {
            headerIndex: i,
            headerLine: lines[i].substring(0, 100),
            validBuckets,
            hasRevenue,
            hasEBITDA,
            financialRowCount,
            score,
            lookAheadWindow: `${lookAheadStart}-${lookAheadEnd}`
          });
        }

        // Higher score wins; on a tie prefer the header with more usable buckets.
        // Because the comparison is strict, the earliest candidate wins full ties.
        if (score > bestHeaderScore || (score === bestHeaderScore && validBuckets > bestBuckets.filter(Boolean).length)) {
          bestHeaderScore = score;
          bestBuckets = buckets;
          bestHeaderIndex = i;
        }
      }
    }

    // No usable header: return the empty result.
    if (bestHeaderIndex === -1 || bestBuckets.filter(Boolean).length === 0) {
      logger.info('Financial parser could not identify year header, returning empty result', {
        totalLines: lines.length,
        sampleLines: lines.slice(0, 20).join(' | ')
      });
      return result;
    }

    logger.info('Financial parser selected best header', {
      headerIndex: bestHeaderIndex,
      headerScore: bestHeaderScore,
      buckets: bestBuckets.map((bucket) => bucket || 'skip')
    });

    logger.info('Financial parser found header', {
      headerIndex: bestHeaderIndex,
      headerLine: lines[bestHeaderIndex],
      buckets: bestBuckets.map((bucket) => bucket || 'skip'),
      totalLines: lines.length
    });

    // Context window around the header (10 lines before, 50 after).
    // It is only written to the log below; row matching uses its own range.
    const windowStart = Math.max(0, bestHeaderIndex - 10);
    const windowEnd = Math.min(lines.length, bestHeaderIndex + 50); // Increased from 18 to 50 to find data rows
    const windowLines = lines.slice(windowStart, windowEnd);

    logger.info('Financial parser window', {
      windowStart,
      windowEnd,
      windowSize: windowLines.length,
      windowLines: windowLines.join(' | ')
    });

    // One setter per field. Each accepts only tokens of the right kind
    // (money-like or percent-like) and keeps the first value stored for a bucket.
    const bucketSetters: Record<string, (bucket: Bucket, value: string) => void> = {
      revenue: (bucket, value) => {
        if (isMoneyLike(value)) result[bucket].revenue = result[bucket].revenue || value;
      },
      grossProfit: (bucket, value) => {
        if (isMoneyLike(value)) result[bucket].grossProfit = result[bucket].grossProfit || value;
      },
      ebitda: (bucket, value) => {
        if (isMoneyLike(value)) result[bucket].ebitda = result[bucket].ebitda || value;
      },
      grossMargin: (bucket, value) => {
        if (isPercentLike(value)) result[bucket].grossMargin = result[bucket].grossMargin || value;
      },
      ebitdaMargin: (bucket, value) => {
        if (isPercentLike(value)) result[bucket].ebitdaMargin = result[bucket].ebitdaMargin || value;
      },
      revenueGrowth: (bucket, value) => {
        if (isPercentLike(value)) result[bucket].revenueGrowth = result[bucket].revenueGrowth || value;
      }
    };

    let matchedRows = 0;
    // Row matching range: from the header line itself through the 99 lines
    // after it (clamped to the document). Lines before the header are not scanned.
    const searchStart = bestHeaderIndex;
    const searchEnd = Math.min(lines.length, bestHeaderIndex + 100); // Search up to 100 lines after header

    for (let i = searchStart; i < searchEnd; i++) {
      const line = lines[i];
      if (!line || line.trim().length === 0) continue;

      // A row's values may wrap onto the following lines, so the next two
      // lines are considered together with the current one.
      const nextLine = lines[i + 1] || '';
      const lineAfterNext = lines[i + 2] || '';
      const combinedForTokens = `${line} ${nextLine} ${lineAfterNext}`;

      // Require a numeric token somewhere in the three-line group; this keeps
      // descriptive text that merely mentions a metric from being processed.
      const hasMoneyOrPercent = containsMoneyOrPercent(combinedForTokens);
      if (!hasMoneyOrPercent) continue; // Skip lines without actual financial numbers

      // The label is tested against the current line only. A line may match
      // several fields (e.g. "EBITDA margin" matches ebitda and ebitdaMargin);
      // the per-field setters then accept only tokens of the right kind.
      for (const [field, matcher] of Object.entries(ROW_MATCHERS)) {
        if (!matcher.test(line)) continue;

        // Tokens from the row itself, with the following lines as fallback content.
        const extraContent = `${nextLine} ${lineAfterNext}`.trim() || undefined;
        let tokens = extractNumericTokens(line, extraContent);

        // Narrow to the token kind the field expects, but only when that
        // leaves something; otherwise the unfiltered list is kept.
        if (['grossMargin', 'ebitdaMargin', 'revenueGrowth'].includes(field)) {
          const percentTokens = tokens.filter(isPercentLike);
          if (percentTokens.length > 0) {
            tokens = percentTokens;
          }
        } else if (['revenue', 'grossProfit', 'ebitda'].includes(field)) {
          const moneyTokens = tokens.filter(isMoneyLike);
          if (moneyTokens.length > 0) {
            tokens = moneyTokens;
          }
        }

        // Only process if we found meaningful tokens (at least 2, indicating multiple periods)
        if (tokens.length < 2) {
          logger.debug('Financial parser: matched field but insufficient tokens', {
            field,
            lineIndex: i,
            tokensFound: tokens.length,
            line: line.substring(0, 100)
          });
          continue;
        }

        matchedRows++;
        logger.info('Financial parser matched row', {
          field,
          lineIndex: i,
          line: line.substring(0, 150),
          nextLine: nextLine.substring(0, 100),
          tokensFound: tokens.length,
          tokens: tokens.slice(0, 10), // Limit token logging
          buckets: bestBuckets.map(b => b || 'skip')
        });

        // Map the row's tokens onto the header's buckets and store them via the field's setter.
        assignTokensToBuckets(
          tokens,
          bestBuckets,
          (bucket, value) => {
            bucketSetters[field](bucket, value);
          },
          field,
          i
        );
      }
    }

    logger.info('Financial parser row matching summary', {
      matchedRows,
      bestBuckets: bestBuckets.length,
      buckets: bestBuckets.map((bucket) => bucket || 'skip')
    });

    logger.info('Financial parser results', {
      elapsedMs: Date.now() - startTime,
      headerLine: lines[bestHeaderIndex],
      fy3: result.fy3,
      fy2: result.fy2,
      fy1: result.fy1,
      ltm: result.ltm
    });
  } catch (error) {
    // Best effort: report the failure and fall through to return what was collected.
    logger.warn('Financial parser failed', { error: error instanceof Error ? error.message : String(error) });
  }

  return result;
}
/**
 * Reports whether the text contains at least one match of MONEY_REGEX or
 * PERCENT_REGEX. Both shared patterns are rewound before being tested, and
 * both are always evaluated.
 */
const containsMoneyOrPercent = (text: string): boolean => {
  // Rewind, then probe: the shared patterns are global and therefore stateful.
  const probe = (pattern: RegExp): boolean => {
    resetRegex(pattern);
    return pattern.test(text);
  };

  const moneyFound = probe(MONEY_REGEX);
  const percentFound = probe(PERCENT_REGEX);
  return moneyFound || percentFound;
};