// cim_summary/backend/src/services/rag/chunking.ts

import { logger } from '../../utils/logger';
import type { StructuredTable } from '../documentAiProcessor';
import type { ProcessingChunk } from './types';
import { isFinancialTable, formatTableAsMarkdown } from './tableProcessor';
import { detectSectionType, extractMetadata } from './utils';

// Size limit, in string-length units, used when chunking: the window length in
// fixed-size mode and the threshold above which a paragraph is split further.
const MAX_CHUNK_SIZE = 4000;
// Amount by which consecutive fixed-size windows overlap (the window stride is
// MAX_CHUNK_SIZE - OVERLAP_SIZE).
const OVERLAP_SIZE = 200;

/**
 * Intermediate chunk produced by the semantic splitters in this module before
 * it is converted into a ProcessingChunk.
 */
interface SemanticChunk {
  /** Chunk text; the splitters in this module store it trimmed. */
  content: string;
  /** Start offset of the chunk's span, as computed by the splitter. */
  startPosition: number;
  /** End offset of the chunk's span, as computed by the splitter. */
  endPosition: number;
  /** Section label; populated from detectSectionType by the splitters here. */
  sectionType?: string;
  /** Extra chunk metadata; populated from extractMetadata by the splitters here. */
  metadata?: Record<string, any>;
}

/**
 * Build the chunk list for a document: structured-table chunks first, then
 * text chunks.
 *
 * Each structured table becomes one chunk whose content is the table rendered
 * as markdown. Table chunks are not anchored in `text`, so their
 * `startPosition`/`endPosition` are -1 (as is `pageNumber` when the table
 * carries no position).
 *
 * The text is then chunked either on semantic boundaries
 * (`enableSemanticChunking` true) or with fixed-size overlapping windows of
 * MAX_CHUNK_SIZE characters advanced by MAX_CHUNK_SIZE - OVERLAP_SIZE. Text
 * chunks whose trimmed content is 50 characters or shorter are skipped.
 *
 * @param text - Full document text to chunk.
 * @param documentId - Identifier used as the prefix of every chunk id.
 * @param enableSemanticChunking - Use semantic splitting when true, fixed-size
 *   windows when false.
 * @param structuredTables - Tables to emit as dedicated chunks ahead of the
 *   text chunks.
 * @returns Chunks in emission order; `chunkIndex` is each chunk's index in the
 *   returned array.
 */
export async function createIntelligentChunks(
  text: string,
  documentId: string,
  enableSemanticChunking: boolean = true,
  structuredTables: StructuredTable[] = []
): Promise<ProcessingChunk[]> {
  // Text chunks must have more than this many (trimmed) characters to be kept.
  const minContentLength = 50;
  const chunks: ProcessingChunk[] = [];

  if (structuredTables.length > 0) {
    logger.info('Processing structured tables for chunking', {
      documentId,
      tableCount: structuredTables.length
    });

    structuredTables.forEach((table, index) => {
      const isFinancial = isFinancialTable(table);
      const markdownTable = formatTableAsMarkdown(table);
      // Table ids use the table's own index, not the running chunk index.
      const chunkId = `${documentId}-table-${index}`;

      chunks.push({
        id: chunkId,
        content: markdownTable,
        chunkIndex: chunks.length,
        startPosition: -1,
        endPosition: -1,
        sectionType: isFinancial ? 'financial-table' : 'table',
        metadata: {
          isStructuredTable: true,
          isFinancialTable: isFinancial,
          tableIndex: index,
          pageNumber: table.position?.pageNumber ?? -1,
          headerCount: table.headers.length,
          rowCount: table.rows.length,
          structuredData: table
        }
      });

      logger.info('Created chunk for structured table', {
        documentId,
        tableIndex: index,
        isFinancial,
        chunkId,
        headerCount: table.headers.length,
        rowCount: table.rows.length
      });
    });
  }

  if (enableSemanticChunking) {
    for (const chunk of splitBySemanticBoundaries(text)) {
      if (!chunk || chunk.content.length <= minContentLength) {
        continue;
      }

      const chunkIndex = chunks.length;
      chunks.push({
        id: `${documentId}-chunk-${chunkIndex}`,
        content: chunk.content,
        chunkIndex,
        startPosition: chunk.startPosition,
        endPosition: chunk.endPosition,
        sectionType: chunk.sectionType || 'general',
        metadata: {
          ...(chunk.metadata || {}),
          hasStructuredTableContext: false
        }
      });
    }
  } else {
    const stride = MAX_CHUNK_SIZE - OVERLAP_SIZE;

    for (let start = 0; start < text.length; start += stride) {
      const chunkContent = text.substring(start, start + MAX_CHUNK_SIZE);

      if (chunkContent.trim().length > minContentLength) {
        const chunkIndex = chunks.length;
        chunks.push({
          id: `${documentId}-chunk-${chunkIndex}`,
          content: chunkContent,
          chunkIndex,
          startPosition: start,
          endPosition: start + chunkContent.length,
          sectionType: detectSectionType(chunkContent),
          metadata: extractMetadata(chunkContent)
        });
      }

      // This window already reached the end of the text. Another iteration
      // would only re-emit its overlap tail as a duplicate-only chunk.
      if (start + MAX_CHUNK_SIZE >= text.length) {
        break;
      }
    }
  }

  return chunks;
}

/**
 * Split text into paragraph-level chunks.
 *
 * Paragraphs are separated by a blank line (a newline, optional whitespace,
 * and another newline). Whitespace-only paragraphs are skipped. A paragraph
 * longer than MAX_CHUNK_SIZE is handed to splitLargeParagraph; any other
 * paragraph becomes a single chunk.
 *
 * `startPosition`/`endPosition` are offsets into `text` delimiting the raw
 * (untrimmed) paragraph span; `content` is that span trimmed. Offsets are
 * taken from the actual separator matches, so separators of any length
 * (e.g. "\n\n\n" or "\n  \n") do not cause later positions to drift.
 *
 * @param text - Text to split.
 * @returns Chunks in document order.
 */
function splitBySemanticBoundaries(text: string): SemanticChunk[] {
  const chunks: SemanticChunk[] = [];

  // Turn the raw span [start, end) of `text` into zero or more chunks.
  const addParagraph = (start: number, end: number): void => {
    const paragraph = text.slice(start, end);
    if (paragraph.trim().length === 0) {
      return;
    }

    if (paragraph.length > MAX_CHUNK_SIZE) {
      // Too large for one chunk: split it further.
      chunks.push(...splitLargeParagraph(paragraph, start));
      return;
    }

    chunks.push({
      content: paragraph.trim(),
      startPosition: start,
      endPosition: end,
      sectionType: detectSectionType(paragraph),
      metadata: extractMetadata(paragraph)
    });
  };

  // Created per call: a global regex carries lastIndex state between exec calls.
  const separator = /\n\s*\n/g;
  let paragraphStart = 0;
  let match: RegExpExecArray | null;

  // A match always contains at least two newlines, so exec always advances.
  while ((match = separator.exec(text)) !== null) {
    addParagraph(paragraphStart, match.index);
    paragraphStart = match.index + match[0].length;
  }

  // Whatever follows the last separator (or the whole text if there was none).
  addParagraph(paragraphStart, text.length);

  return chunks;
}

/**
 * Split an oversized paragraph into chunks of at most MAX_CHUNK_SIZE
 * characters, breaking at sentence boundaries where possible.
 *
 * The paragraph is cut into sentences (text up to and including a run of
 * `.`, `!` or `?`). Text with no terminating punctuation is kept as a final
 * sentence rather than dropped. A sentence that is itself longer than
 * MAX_CHUNK_SIZE is cut into fixed-size pieces. Sentences are then packed
 * greedily into chunks without exceeding MAX_CHUNK_SIZE.
 *
 * `startPosition`/`endPosition` delimit each chunk's raw (untrimmed) span,
 * offset by the `startPosition` argument; `content` is that span trimmed.
 * Whitespace-only spans produce no chunk but still advance the position.
 *
 * @param paragraph - Paragraph text to split.
 * @param startPosition - Offset of `paragraph` within the enclosing text.
 * @returns Chunks in paragraph order.
 */
function splitLargeParagraph(
  paragraph: string,
  startPosition: number
): SemanticChunk[] {
  const chunks: SemanticChunk[] = [];

  // First alternative: text up to and including a terminator run.
  // Second alternative: a trailing remainder with no terminator.
  // Together the matches cover every character of the paragraph, which keeps
  // the position arithmetic below exact.
  const sentences = paragraph.match(/[^.!?]*[.!?]+|[^.!?]+$/g) ?? [paragraph];

  // Cut any sentence that alone exceeds the limit into fixed-size pieces.
  const segments: string[] = [];
  for (const sentence of sentences) {
    for (let offset = 0; offset < sentence.length; offset += MAX_CHUNK_SIZE) {
      segments.push(sentence.slice(offset, offset + MAX_CHUNK_SIZE));
    }
  }

  let currentChunk = '';
  let chunkStartPosition = startPosition;

  // Emit the accumulated span (if it has content) and move past it.
  const flush = (): void => {
    const content = currentChunk.trim();
    if (content.length > 0) {
      chunks.push({
        content,
        startPosition: chunkStartPosition,
        endPosition: chunkStartPosition + currentChunk.length,
        sectionType: detectSectionType(currentChunk),
        metadata: extractMetadata(currentChunk)
      });
    }
    // Advance by the length of the span just flushed, not by the next segment.
    chunkStartPosition += currentChunk.length;
    currentChunk = '';
  };

  for (const segment of segments) {
    if (currentChunk.length > 0 && currentChunk.length + segment.length > MAX_CHUNK_SIZE) {
      flush();
    }
    currentChunk += segment;
  }

  // Add the last chunk.
  flush();

  return chunks;
}