Files
cim_summary/backend/src/services/rag/chunking.ts
admin 87c6da4225 Refactor LLM service architecture and improve document processing
- Refactor LLM service with provider pattern (Anthropic, OpenAI, OpenRouter)
- Add structured LLM prompts and utilities (token estimation, cost calculation, JSON extraction)
- Implement RAG improvements with optimized chunking and embedding services
- Add financial extraction monitoring service
- Add parallel document processor
- Improve error handling with dedicated error handlers
- Add comprehensive TypeScript types for LLM, document, and processing
- Update optimized agentic RAG processor and simple document processor
2025-11-11 21:04:42 -05:00

192 lines
5.5 KiB
TypeScript

import { logger } from '../../utils/logger';
import type { StructuredTable } from '../documentAiProcessor';
import type { ProcessingChunk } from './types';
import { isFinancialTable, formatTableAsMarkdown } from './tableProcessor';
import { detectSectionType, extractMetadata } from './utils';
/** Maximum length, in characters, of a text chunk; longer paragraphs are split further. */
const MAX_CHUNK_SIZE = 4000;
/** Number of characters shared between consecutive windows when fixed-size chunking is used. */
const OVERLAP_SIZE = 200;
/**
 * Intermediate chunk produced by the semantic splitters in this file before it
 * is converted into a ProcessingChunk.
 */
interface SemanticChunk {
// Chunk text; the splitters in this file store it trimmed.
content: string;
// Character offset at which the chunk begins (intended to index into the text being split).
startPosition: number;
// Character offset just past the chunk's untrimmed source span.
endPosition: number;
// Section label; the splitters in this file fill it from detectSectionType().
sectionType?: string;
// Free-form metadata; the splitters in this file fill it from extractMetadata().
metadata?: Record<string, any>;
}
/**
 * Build the list of chunks for a document.
 *
 * Structured tables, when supplied, are emitted first: one chunk per table,
 * rendered as markdown and labelled `financial-table` or `table`. Table chunks
 * carry -1 for both positions because they are not derived from `text`.
 *
 * The document text is then chunked either along paragraph/sentence boundaries
 * (semantic mode) or with fixed-size overlapping windows. Text chunks of 50
 * characters or fewer are discarded.
 *
 * `chunkIndex` is the chunk's index in the returned array, so table chunks
 * occupy the lowest indices and text chunk ids continue numbering after them.
 *
 * @param text - Full document text to chunk.
 * @param documentId - Used as the prefix of every generated chunk id.
 * @param enableSemanticChunking - When true (default) split on semantic
 *   boundaries; otherwise use fixed-size windows with overlap.
 * @param structuredTables - Tables to emit as dedicated chunks ahead of the text.
 * @returns The table chunks followed by the text chunks.
 */
export async function createIntelligentChunks(
  text: string,
  documentId: string,
  enableSemanticChunking: boolean = true,
  structuredTables: StructuredTable[] = []
): Promise<ProcessingChunk[]> {
  // Text chunks must be strictly longer than this many characters to be kept.
  const minChunkLength = 50;
  const chunks: ProcessingChunk[] = [];

  if (structuredTables.length > 0) {
    logger.info('Processing structured tables for chunking', {
      documentId,
      tableCount: structuredTables.length
    });

    structuredTables.forEach((table, index) => {
      const isFinancial = isFinancialTable(table);
      const chunkId = `${documentId}-table-${index}`;

      chunks.push({
        id: chunkId,
        content: formatTableAsMarkdown(table),
        chunkIndex: chunks.length,
        startPosition: -1,
        endPosition: -1,
        sectionType: isFinancial ? 'financial-table' : 'table',
        metadata: {
          isStructuredTable: true,
          isFinancialTable: isFinancial,
          tableIndex: index,
          pageNumber: table.position?.pageNumber ?? -1,
          headerCount: table.headers.length,
          rowCount: table.rows.length,
          structuredData: table
        }
      });

      logger.info('Created chunk for structured table', {
        documentId,
        tableIndex: index,
        isFinancial,
        chunkId,
        headerCount: table.headers.length,
        rowCount: table.rows.length
      });
    });
  }

  if (enableSemanticChunking) {
    for (const piece of splitBySemanticBoundaries(text)) {
      if (piece.content.length <= minChunkLength) {
        continue;
      }
      const chunkIndex = chunks.length;
      chunks.push({
        id: `${documentId}-chunk-${chunkIndex}`,
        content: piece.content,
        chunkIndex,
        startPosition: piece.startPosition,
        endPosition: piece.endPosition,
        // `||` (not `??`) on purpose: an empty section label also falls back to 'general'.
        sectionType: piece.sectionType || 'general',
        metadata: {
          ...(piece.metadata || {}),
          hasStructuredTableContext: false
        }
      });
    }
    return chunks;
  }

  // Fixed-size mode: each window starts OVERLAP_SIZE characters before the
  // previous window ended.
  const stride = MAX_CHUNK_SIZE - OVERLAP_SIZE;
  for (let start = 0; start < text.length; start += stride) {
    const windowText = text.substring(start, start + MAX_CHUNK_SIZE);
    if (windowText.trim().length > minChunkLength) {
      const chunkIndex = chunks.length;
      chunks.push({
        id: `${documentId}-chunk-${chunkIndex}`,
        content: windowText,
        chunkIndex,
        startPosition: start,
        endPosition: start + windowText.length,
        sectionType: detectSectionType(windowText),
        metadata: extractMetadata(windowText)
      });
    }
    // This window already reached the end of the text. Another step would
    // yield a chunk made only of overlap, i.e. a duplicate of this one's tail.
    if (start + MAX_CHUNK_SIZE >= text.length) {
      break;
    }
  }

  return chunks;
}
/**
 * Split text into paragraph-level chunks.
 *
 * Paragraphs are separated by a newline, optional whitespace, and another
 * newline. Whitespace-only paragraphs are skipped. Paragraphs longer than
 * MAX_CHUNK_SIZE are handed to splitLargeParagraph(); all others become a
 * single chunk whose content is the trimmed paragraph.
 *
 * Positions are character offsets into `text` covering the untrimmed
 * paragraph. They are computed from the actual separator matches, so
 * separators longer than two characters (e.g. "\n\n\n" or "\n  \n") do not
 * shift the offsets of later paragraphs.
 *
 * @param text - Text to split.
 * @returns Chunks in document order.
 */
function splitBySemanticBoundaries(text: string): SemanticChunk[] {
  const chunks: SemanticChunk[] = [];
  // Same pattern String.prototype.split would use, but global so exec() can
  // report where each separator starts and how long it is.
  const paragraphBreak = /\n\s*\n/g;

  const addParagraph = (paragraph: string, offset: number): void => {
    if (paragraph.trim().length === 0) {
      return;
    }
    if (paragraph.length > MAX_CHUNK_SIZE) {
      for (const subChunk of splitLargeParagraph(paragraph, offset)) {
        chunks.push(subChunk);
      }
      return;
    }
    chunks.push({
      content: paragraph.trim(),
      startPosition: offset,
      endPosition: offset + paragraph.length,
      sectionType: detectSectionType(paragraph),
      metadata: extractMetadata(paragraph)
    });
  };

  let paragraphStart = 0;
  let separator: RegExpExecArray | null;
  while ((separator = paragraphBreak.exec(text)) !== null) {
    addParagraph(text.slice(paragraphStart, separator.index), paragraphStart);
    // The separator always spans at least two characters, so this advances.
    paragraphStart = separator.index + separator[0].length;
  }
  // Text after the last separator (or the whole text when there is none).
  addParagraph(text.slice(paragraphStart), paragraphStart);

  return chunks;
}
/**
 * Break an oversized paragraph into chunks of at most MAX_CHUNK_SIZE
 * characters, preferring sentence boundaries.
 *
 * The paragraph is tiled into sentences (text up to and including a run of
 * `.`, `!` or `?`); trailing text without terminal punctuation is kept as a
 * final sentence rather than dropped. Sentences are packed greedily into
 * chunks. A sentence that is itself longer than MAX_CHUNK_SIZE is cut into
 * fixed-size pieces so no chunk exceeds the limit.
 *
 * Because the sentences tile the paragraph exactly, each chunk's positions are
 * `startPosition` plus the number of paragraph characters that precede it.
 * Whitespace-only chunks are not emitted.
 *
 * @param paragraph - Paragraph text to split.
 * @param startPosition - Offset of the paragraph's first character in the source text.
 * @returns Chunks in order, with trimmed content and untrimmed position spans.
 */
function splitLargeParagraph(
  paragraph: string,
  startPosition: number
): SemanticChunk[] {
  const chunks: SemanticChunk[] = [];

  // First alternative: a sentence ending in punctuation (also absorbs leading
  // punctuation). Second alternative: the unterminated remainder, if any.
  const sentences = paragraph.match(/[^.!?]*[.!?]+|[^.!?]+/g) ?? [paragraph];

  let pending = '';
  let pendingStart = startPosition;

  // Emit the accumulated text as a chunk and move the start offset past it.
  const flush = (): void => {
    const content = pending.trim();
    if (content.length > 0) {
      chunks.push({
        content,
        startPosition: pendingStart,
        endPosition: pendingStart + pending.length,
        sectionType: detectSectionType(pending),
        metadata: extractMetadata(pending)
      });
    }
    pendingStart += pending.length;
    pending = '';
  };

  for (const sentence of sentences) {
    let cursor = 0;
    // Runs once for a normal sentence; several times for one that alone
    // exceeds MAX_CHUNK_SIZE.
    while (cursor < sentence.length) {
      let end = Math.min(cursor + MAX_CHUNK_SIZE, sentence.length);
      // Do not cut between the two halves of a surrogate pair.
      if (end < sentence.length && end - cursor > 1) {
        const lastUnit = sentence.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
          end -= 1;
        }
      }
      const piece = sentence.slice(cursor, end);
      if (pending.length > 0 && pending.length + piece.length > MAX_CHUNK_SIZE) {
        flush();
      }
      pending += piece;
      cursor = end;
    }
  }

  // Emit whatever is left after the last sentence.
  flush();

  return chunks;
}