- Refactor LLM service with provider pattern (Anthropic, OpenAI, OpenRouter)
- Add structured LLM prompts and utilities (token estimation, cost calculation, JSON extraction)
- Implement RAG improvements with optimized chunking and embedding services
- Add financial extraction monitoring service
- Add parallel document processor
- Improve error handling with dedicated error handlers
- Add comprehensive TypeScript types for LLM, document, and processing
- Update optimized agentic RAG processor and simple document processor
192 lines
5.5 KiB
TypeScript
192 lines
5.5 KiB
TypeScript
import { logger } from '../../utils/logger';
|
|
import type { StructuredTable } from '../documentAiProcessor';
|
|
import type { ProcessingChunk } from './types';
|
|
import { isFinancialTable, formatTableAsMarkdown } from './tableProcessor';
|
|
import { detectSectionType, extractMetadata } from './utils';
|
|
|
|
// Size limit for a text chunk, measured in string length (characters): it is
// the window length used by the fixed-size splitter and the threshold above
// which a paragraph is split further into sentence groups.
const MAX_CHUNK_SIZE = 4000;
|
|
// Number of characters shared by consecutive fixed-size windows: the window
// start advances by MAX_CHUNK_SIZE - OVERLAP_SIZE on each step.
const OVERLAP_SIZE = 200;
|
|
|
|
/**
 * Intermediate piece of document text produced by the semantic splitters,
 * before it is converted into a `ProcessingChunk`.
 */
interface SemanticChunk {
  /** Text of the piece; the splitters in this file store it trimmed. */
  content: string;

  /** Offset at which the piece starts, as computed by the splitter. */
  startPosition: number;

  /** Offset at which the piece ends (start plus untrimmed length). */
  endPosition: number;

  /** Section label; the splitters fill it from `detectSectionType`. */
  sectionType?: string;

  /** Extra attributes; the splitters fill it from `extractMetadata`. */
  metadata?: Record<string, any>;
}
|
|
|
|
/**
 * Build the list of processing chunks for one document.
 *
 * Chunks are appended in this order:
 * 1. one chunk per entry of `structuredTables`, whose content is the table
 *    rendered by `formatTableAsMarkdown`; both positions are `-1` because
 *    these chunks are not cut from an offset range of `text`;
 * 2. chunks cut from `text`, either on semantic boundaries or as fixed-size
 *    overlapping windows.
 *
 * `chunkIndex` is the chunk's index in the returned array, so text chunks are
 * numbered after any table chunks. Text pieces of 50 characters or fewer are
 * skipped.
 *
 * @param text - Full document text to split.
 * @param documentId - Prefix used when building chunk ids.
 * @param enableSemanticChunking - When true (default) the text is split by
 *   `splitBySemanticBoundaries`; when false it is cut into windows of
 *   `MAX_CHUNK_SIZE` characters that overlap by `OVERLAP_SIZE`.
 * @param structuredTables - Tables to emit as dedicated chunks ahead of the
 *   text chunks.
 * @returns The table chunks followed by the text chunks.
 */
export async function createIntelligentChunks(
  text: string,
  documentId: string,
  enableSemanticChunking: boolean = true,
  structuredTables: StructuredTable[] = []
): Promise<ProcessingChunk[]> {
  const chunks: ProcessingChunk[] = [];

  if (structuredTables.length > 0) {
    logger.info('Processing structured tables for chunking', {
      documentId,
      tableCount: structuredTables.length
    });

    structuredTables.forEach((table, index) => {
      const isFinancial = isFinancialTable(table);
      const markdownTable = formatTableAsMarkdown(table);
      const chunkId = `${documentId}-table-${index}`;

      chunks.push({
        id: chunkId,
        content: markdownTable,
        chunkIndex: chunks.length,
        startPosition: -1,
        endPosition: -1,
        sectionType: isFinancial ? 'financial-table' : 'table',
        metadata: {
          isStructuredTable: true,
          isFinancialTable: isFinancial,
          tableIndex: index,
          pageNumber: table.position?.pageNumber ?? -1,
          headerCount: table.headers.length,
          rowCount: table.rows.length,
          structuredData: table
        }
      });

      logger.info('Created chunk for structured table', {
        documentId,
        tableIndex: index,
        isFinancial,
        chunkId,
        headerCount: table.headers.length,
        rowCount: table.rows.length
      });
    });
  }

  if (enableSemanticChunking) {
    for (const piece of splitBySemanticBoundaries(text)) {
      // Very short pieces (headings, stray lines) are not worth a chunk.
      if (piece.content.length <= 50) {
        continue;
      }

      const chunkIndex = chunks.length;
      chunks.push({
        id: `${documentId}-chunk-${chunkIndex}`,
        content: piece.content,
        chunkIndex,
        startPosition: piece.startPosition,
        endPosition: piece.endPosition,
        sectionType: piece.sectionType || 'general',
        metadata: {
          ...(piece.metadata ?? {}),
          hasStructuredTableContext: false
        }
      });
    }
  } else {
    const stride = MAX_CHUNK_SIZE - OVERLAP_SIZE;

    for (let start = 0; start < text.length; start += stride) {
      const windowText = text.substring(start, start + MAX_CHUNK_SIZE);

      if (windowText.trim().length > 50) {
        const chunkIndex = chunks.length;
        chunks.push({
          id: `${documentId}-chunk-${chunkIndex}`,
          content: windowText,
          chunkIndex,
          startPosition: start,
          endPosition: start + windowText.length,
          sectionType: detectSectionType(windowText),
          metadata: extractMetadata(windowText)
        });
      }

      // This window already reached the end of the text. Stepping again would
      // only re-emit its overlap tail as a duplicate chunk.
      if (start + MAX_CHUNK_SIZE >= text.length) {
        break;
      }
    }
  }

  return chunks;
}
|
|
|
|
/**
 * Split text into paragraph-level chunks on blank-line boundaries.
 *
 * A boundary is a newline, optional whitespace, and another newline, so runs
 * of blank lines, whitespace-only lines and CRLF line endings all count as
 * one separator. Offsets are taken from where each separator actually
 * matches, so `startPosition`/`endPosition` index into `text` whatever the
 * separator's length.
 *
 * Whitespace-only paragraphs are skipped. Paragraphs longer than
 * `MAX_CHUNK_SIZE` are delegated to `splitLargeParagraph`.
 *
 * @param text - Text to split.
 * @returns Chunks in document order. `content` is the trimmed paragraph,
 *   while the positions span the untrimmed paragraph.
 */
function splitBySemanticBoundaries(text: string): SemanticChunk[] {
  const chunks: SemanticChunk[] = [];

  // Adds the paragraph occupying text[start, start + paragraph.length).
  const collect = (paragraph: string, start: number): void => {
    if (paragraph.trim().length === 0) {
      return;
    }

    if (paragraph.length > MAX_CHUNK_SIZE) {
      chunks.push(...splitLargeParagraph(paragraph, start));
      return;
    }

    chunks.push({
      content: paragraph.trim(),
      startPosition: start,
      endPosition: start + paragraph.length,
      sectionType: detectSectionType(paragraph),
      metadata: extractMetadata(paragraph)
    });
  };

  let paragraphStart = 0;
  for (const separator of text.matchAll(/\n\s*\n/g)) {
    const separatorStart = separator.index ?? paragraphStart;
    collect(text.slice(paragraphStart, separatorStart), paragraphStart);
    // Advance by the separator's real length rather than assuming "\n\n".
    paragraphStart = separatorStart + separator[0].length;
  }
  // Whatever follows the last separator (or the whole text if there is none).
  collect(text.slice(paragraphStart), paragraphStart);

  return chunks;
}
|
|
|
|
/**
 * Split an oversized paragraph into chunks made of whole sentences.
 *
 * The paragraph is cut after each run of `.`, `!` or `?`; any trailing text
 * without terminal punctuation is kept as a final piece, so the pieces
 * concatenate back to the full paragraph. Pieces are then packed greedily
 * into chunks of at most `MAX_CHUNK_SIZE` characters.
 *
 * NOTE: a single piece longer than `MAX_CHUNK_SIZE` (for example a paragraph
 * with no sentence punctuation at all) is emitted as one oversized chunk.
 *
 * @param paragraph - Paragraph text to split.
 * @param startPosition - Offset of `paragraph` within the source document.
 * @returns Chunks in order. `content` is trimmed, while the positions span
 *   the untrimmed sentence group, offset by `startPosition`.
 */
function splitLargeParagraph(
  paragraph: string,
  startPosition: number
): SemanticChunk[] {
  const chunks: SemanticChunk[] = [];

  // First alternative: text up to and including a punctuation run.
  // Second alternative: trailing text with no terminal punctuation.
  const sentences = paragraph.match(/[^.!?]*[.!?]+|[^.!?]+$/g) ?? [paragraph];

  let currentChunk = '';
  let chunkStartPosition = startPosition;

  // Emits the sentences accumulated so far, unless they are only whitespace.
  const flush = (): void => {
    if (currentChunk.trim().length === 0) {
      return;
    }
    chunks.push({
      content: currentChunk.trim(),
      startPosition: chunkStartPosition,
      endPosition: chunkStartPosition + currentChunk.length,
      sectionType: detectSectionType(currentChunk),
      metadata: extractMetadata(currentChunk)
    });
  };

  for (const sentence of sentences) {
    const wouldOverflow = currentChunk.length + sentence.length > MAX_CHUNK_SIZE;

    if (wouldOverflow && currentChunk.length > 0) {
      flush();
      // Advance by the length of the chunk just emitted, before replacing it,
      // so the next chunk starts exactly where this one ended.
      chunkStartPosition += currentChunk.length;
      currentChunk = sentence;
    } else {
      currentChunk += sentence;
    }
  }

  flush();

  return chunks;
}
|
|
|