diff --git a/backend/src/controllers/documentController.ts b/backend/src/controllers/documentController.ts index fd28912..f24ea84 100644 --- a/backend/src/controllers/documentController.ts +++ b/backend/src/controllers/documentController.ts @@ -4,7 +4,6 @@ import { DocumentModel } from '../models/DocumentModel'; import { fileStorageService } from '../services/fileStorageService'; import { jobQueueService } from '../services/jobQueueService'; import { uploadProgressService } from '../services/uploadProgressService'; -import config from '../config/env'; export const documentController = { async uploadDocument(req: Request, res: Response): Promise { @@ -22,8 +21,9 @@ export const documentController = { } const file = req.file; - const processImmediately = req.body.processImmediately === 'true'; - const processingStrategy = req.body.processingStrategy || config.processingStrategy; + + // Always use optimized agentic RAG processing - no strategy selection needed + const processingStrategy = 'optimized_agentic_rag'; // Store file and get file path const storageResult = await fileStorageService.storeFile(file, userId); @@ -42,26 +42,27 @@ export const documentController = { status: 'uploaded' }); - // Queue processing job (auto-process all documents when using agentic_rag strategy) - const shouldAutoProcess = config.processingStrategy === 'agentic_rag' || processImmediately; - if (shouldAutoProcess) { - try { - const jobId = await jobQueueService.addJob( - 'document_processing', - { - documentId: document.id, - userId: userId, - options: { strategy: processingStrategy } - }, - 0 // Normal priority - ); - logger.info('Document processing job queued', { documentId: document.id, jobId, strategy: processingStrategy }); - - // Update status to indicate it's queued for processing - await DocumentModel.updateById(document.id, { status: 'extracting_text' }); - } catch (error) { - logger.error('Failed to queue document processing job', { error, documentId: document.id }); - } + // Always auto-process with optimized agentic RAG + try { + const jobId = await jobQueueService.addJob( + 'document_processing', + { + documentId: document.id, + userId: userId, + options: { strategy: processingStrategy } + }, + 0 // Normal priority + ); + logger.info('Document processing job queued with optimized agentic RAG', { + documentId: document.id, + jobId, + strategy: processingStrategy + }); + + // Update status to indicate it's queued for processing + await DocumentModel.updateById(document.id, { status: 'extracting_text' }); + } catch (error) { + logger.error('Failed to queue document processing job', { error, documentId: document.id }); } // Return document info @@ -69,10 +70,11 @@ export const documentController = { id: document.id, name: document.original_file_name, originalName: document.original_file_name, - status: shouldAutoProcess ? 'extracting_text' : 'uploaded', + status: 'extracting_text', uploadedAt: document.created_at, uploadedBy: userId, - fileSize: document.file_size + fileSize: document.file_size, + processingStrategy: processingStrategy }); } catch (error) { @@ -190,10 +192,22 @@ export const documentController = { // Get progress from upload progress service const progress = uploadProgressService.getProgress(id); + // If no progress data from service, calculate based on document status + let calculatedProgress = 0; + if (document.status === 'completed') { + calculatedProgress = 100; + } else if (document.status === 'processing_llm' || document.status === 'generating_pdf') { + calculatedProgress = 75; + } else if (document.status === 'extracting_text') { + calculatedProgress = 25; + } else if (document.status === 'uploaded') { + calculatedProgress = 10; + } + res.json({ id: document.id, status: document.status, - progress: progress || 0, + progress: progress ? progress.progress : calculatedProgress, uploadedAt: document.created_at, processedAt: document.processing_completed_at }); diff --git a/backend/src/routes/vector.ts b/backend/src/routes/vector.ts index 91f764a..887af80 100644 --- a/backend/src/routes/vector.ts +++ b/backend/src/routes/vector.ts @@ -1,5 +1,4 @@ import { Router } from 'express'; -import { authenticateToken } from '../middleware/auth'; import { vectorDocumentProcessor } from '../services/vectorDocumentProcessor'; import { VectorDatabaseModel } from '../models/VectorDatabaseModel'; import { logger } from '../utils/logger'; @@ -65,131 +64,12 @@ const extendedVectorProcessor = { } }; -/** - * POST /api/vector/search - * Search for relevant content in vector database - */ -router.post('/search', authenticateToken, async (req, res) => { - try { - const { query, documentId, limit = 10, similarityThreshold = 0.6 } = req.body; - - if (!query) { - return res.status(400).json({ error: 'Query is required' }); - } - - const results = await vectorDocumentProcessor.searchRelevantContent(query, { - documentId, - limit, - similarityThreshold - }); - - return res.json({ results }); - } catch (error) { - logger.error('Vector search failed', error); - return res.status(500).json({ error: 'Vector search failed' }); - } -}); - -/** - * POST /api/vector/process-document - * Process a document for vector search - */ -router.post('/process-document', async (req, res) => { - try { - const { documentId, text, metadata = {} } = req.body; - - if (!documentId || !text) { - return res.status(400).json({ error: 'Document ID and text are required' }); - } - - const result = await vectorDocumentProcessor.processDocumentForVectorSearch( - documentId, - text, - metadata - ); - - return res.json({ success: true, result }); - } catch (error) { - logger.error('Document processing failed', error); - return res.status(500).json({ error: 'Document processing failed' }); - } -}); - -/** - * GET /api/vector/similar/:documentId - * Find similar documents - */ -router.get('/similar/:documentId', authenticateToken, async (req, res) => { - try { - const { documentId } = req.params; - const { limit = 10, similarityThreshold = 0.6 } = req.query; - - const results = await extendedVectorProcessor.findSimilarDocuments( - documentId || '', - parseInt(limit as string), - parseFloat(similarityThreshold as string) - ); - - return res.json({ results }); - } catch (error) { - logger.error('Similar documents search failed', error); - return res.status(500).json({ error: 'Similar documents search failed' }); - } -}); - -/** - * POST /api/vector/industry-search - * Search by industry - */ -router.post('/industry-search', async (req, res) => { - try { - const { industry, query, limit = 20 } = req.body; - - if (!industry || !query) { - return res.status(400).json({ error: 'Industry and query are required' }); - } - - const results = await extendedVectorProcessor.searchByIndustry( - industry, - query, - limit - ); - - return res.json({ results }); - } catch (error) { - logger.error('Industry search failed', error); - return res.status(500).json({ error: 'Industry search failed' }); - } -}); - -/** - * POST /api/vector/process-cim-sections - * Process CIM-specific sections for enhanced search - */ -router.post('/process-cim-sections', async (req, res) => { - try { - const { documentId, cimData, metadata = {} } = req.body; - - if (!documentId || !cimData) { - return res.status(400).json({ error: 'Document ID and CIM data are required' }); - } - - const result = await extendedVectorProcessor.processCIMSections( - documentId || '', - cimData, - metadata - ); - - return res.json({ success: true, result }); - } catch (error) { - logger.error('CIM sections processing failed', error); - return res.status(500).json({ error: 'CIM sections processing failed' }); - } -}); +// DISABLED: All vector processing routes have been disabled +// Only read-only endpoints for monitoring and analytics are kept /** * GET /api/vector/document-chunks/:documentId - * Get document chunks for a specific document + * Get document chunks for a specific document (read-only) */ router.get('/document-chunks/:documentId', async (req, res) => { try { @@ -206,7 +86,7 @@ router.get('/document-chunks/:documentId', async (req, res) => { /** * GET /api/vector/analytics - * Get search analytics for the current user + * Get search analytics for the current user (read-only) */ router.get('/analytics', async (req, res) => { try { @@ -231,7 +111,7 @@ router.get('/analytics', async (req, res) => { /** * GET /api/vector/stats - * Get vector database statistics + * Get vector database statistics (read-only) */ router.get('/stats', async (_req, res) => { try { @@ -244,36 +124,4 @@ router.get('/stats', async (_req, res) => { } }); -/** - * DELETE /api/vector/document-chunks/:documentId - * Delete document chunks when a document is deleted - */ -router.delete('/document-chunks/:documentId', async (req, res) => { - try { - const { documentId } = req.params; - - await VectorDatabaseModel.deleteDocumentChunks(documentId); - - return res.json({ success: true }); - } catch (error) { - logger.error('Failed to delete document chunks', error); - return res.status(500).json({ error: 'Failed to delete document chunks' }); - } -}); - -/** - * POST /api/vector/update-similarities - * Update document similarity scores - */ -router.post('/update-similarities', async (_req, res) => { - try { - await VectorDatabaseModel.updateDocumentSimilarities(); - - return res.json({ success: true }); - } catch (error) { - logger.error('Failed to update similarities', error); - return res.status(500).json({ error: 'Failed to update similarities' }); - } -}); - export default router; \ No newline at end of file diff --git a/backend/src/services/agenticRAGProcessor.ts b/backend/src/services/agenticRAGProcessor.ts index 2e260f4..ce6f55f 100644 --- a/backend/src/services/agenticRAGProcessor.ts +++ b/backend/src/services/agenticRAGProcessor.ts @@ -612,25 +612,157 @@ class AgenticRAGProcessor { logger.info('Starting comprehensive document vectorization', { documentId, sessionId }); try { - // Strategy 1: Hierarchical chunking with semantic boundaries - const chunks = await this.createIntelligentChunks(text, documentId); + // Strategy 1: Stream processing for large documents + const MAX_TEXT_SIZE = 50000; // 50KB chunks to prevent memory issues + const chunks: Array<{ + content: string; + chunkIndex: number; + startPosition: number; + endPosition: number; + sectionType?: string; + }> = []; + + if (text.length > MAX_TEXT_SIZE) { + logger.info('Large document detected, using streaming chunking', { + documentId, + textLength: text.length, + estimatedChunks: Math.ceil(text.length / MAX_TEXT_SIZE) + }); + + // Stream processing for large documents + let chunkIndex = 0; + let position = 0; + + while (position < text.length) { + // Force garbage collection between chunks + if (global.gc) { + global.gc(); + } + + const chunkSize = Math.min(MAX_TEXT_SIZE, text.length - position); + let chunkEnd = position + chunkSize; + + // Try to end at sentence boundary + if (chunkEnd < text.length) { + const sentenceEnd = this.findSentenceBoundary(text, chunkEnd); + if (sentenceEnd > position + 1000) { // Ensure minimum chunk size + chunkEnd = sentenceEnd; + } + } + + const chunkText = text.substring(position, chunkEnd); + + // Detect section type for this chunk + const sectionType = this.identifySectionType(chunkText); + + chunks.push({ + content: chunkText, + chunkIndex: chunkIndex++, + startPosition: position, + endPosition: chunkEnd, + sectionType + }); + + position = chunkEnd; + + // Log progress for large documents + if (chunkIndex % 10 === 0) { + logger.info('Vectorization progress', { + documentId, + chunkIndex, + progress: Math.round((position / text.length) * 100) + '%' + }); + } + } + } else { + // For smaller documents, use the original intelligent chunking + chunks.push(...await this.createIntelligentChunks(text, documentId)); + } - // Strategy 2: Generate embeddings with metadata enrichment - const enrichedChunks = await this.enrichChunksWithMetadata(chunks); + // Strategy 2: Process chunks in batches to manage memory + const BATCH_SIZE = 5; // Process 5 chunks at a time + const enrichedChunks: Array<{ + content: string; + chunkIndex: number; + startPosition: number; + endPosition: number; + sectionType?: string; + metadata: { + hasFinancialData: boolean; + hasMetrics: boolean; + keyTerms: string[]; + importance: 'high' | 'medium' | 'low'; + conceptDensity: number; + }; + }> = []; - // Strategy 3: Store with optimized indexing - await vectorDocumentProcessor.storeDocumentChunks(enrichedChunks, { - documentId, - indexingStrategy: 'hierarchical', - similarity_threshold: 0.8, - enable_hybrid_search: true - }); + for (let i = 0; i < chunks.length; i += BATCH_SIZE) { + const batch = chunks.slice(i, i + BATCH_SIZE); + + // Process batch + const batchPromises = batch.map(async (chunk) => { + const metadata = { + hasFinancialData: this.containsFinancialData(chunk.content), + hasMetrics: this.containsMetrics(chunk.content), + keyTerms: this.extractKeyTerms(chunk.content), + importance: this.calculateImportance(chunk.content, chunk.sectionType), + conceptDensity: this.calculateConceptDensity(chunk.content) + }; + + return { + ...chunk, + metadata + }; + }); + + const batchResults = await Promise.all(batchPromises); + enrichedChunks.push(...batchResults); + + // Force garbage collection after each batch + if (global.gc) { + global.gc(); + } + + // Log batch progress + logger.info('Enriched chunk batch', { + documentId, + batchNumber: Math.floor(i / BATCH_SIZE) + 1, + totalBatches: Math.ceil(chunks.length / BATCH_SIZE), + processedChunks: enrichedChunks.length + }); + } + + // Strategy 3: Store chunks in batches to prevent memory buildup + const STORE_BATCH_SIZE = 3; + for (let i = 0; i < enrichedChunks.length; i += STORE_BATCH_SIZE) { + const storeBatch = enrichedChunks.slice(i, i + STORE_BATCH_SIZE); + + await vectorDocumentProcessor.storeDocumentChunks(storeBatch, { + documentId, + indexingStrategy: 'hierarchical', + similarity_threshold: 0.8, + enable_hybrid_search: true + }); + + // Force garbage collection after storing each batch + if (global.gc) { + global.gc(); + } + + logger.info('Stored chunk batch', { + documentId, + batchNumber: Math.floor(i / STORE_BATCH_SIZE) + 1, + totalBatches: Math.ceil(enrichedChunks.length / STORE_BATCH_SIZE), + storedChunks: Math.min(i + STORE_BATCH_SIZE, enrichedChunks.length) + }); + } logger.info('Document vectorization completed successfully', { documentId, sessionId, chunksCreated: enrichedChunks.length, - avgChunkSize: Math.round(enrichedChunks.reduce((sum, c) => sum + c.content.length, 0) / enrichedChunks.length) + avgChunkSize: Math.round(enrichedChunks.reduce((sum, c) => sum + c.content.length, 0) / enrichedChunks.length), + totalTextLength: text.length }); } catch (error) { @@ -740,53 +872,53 @@ class AgenticRAGProcessor { return chunks; } - /** - * Enrich chunks with metadata for enhanced retrieval - */ - private async enrichChunksWithMetadata(chunks: Array<{ - content: string; - chunkIndex: number; - startPosition: number; - endPosition: number; - sectionType?: string; - }>): Promise> { - const enrichedChunks = []; + // /** + // * Enrich chunks with metadata for enhanced retrieval + // */ + // private async enrichChunksWithMetadata(chunks: Array<{ + // content: string; + // chunkIndex: number; + // startPosition: number; + // endPosition: number; + // sectionType?: string; + // }>): Promise> { + // const enrichedChunks = []; - for (const chunk of chunks) { - // Analyze chunk content for metadata - const hasFinancialData = this.containsFinancialData(chunk.content); - const hasMetrics = this.containsMetrics(chunk.content); - const keyTerms = this.extractKeyTerms(chunk.content); - const importance = this.calculateImportance(chunk.content, chunk.sectionType); - const conceptDensity = this.calculateConceptDensity(chunk.content); + // for (const chunk of chunks) { + // // Analyze chunk content for metadata + // const hasFinancialData = this.containsFinancialData(chunk.content); + // const hasMetrics = this.containsMetrics(chunk.content); + // const keyTerms = this.extractKeyTerms(chunk.content); + // const importance = this.calculateImportance(chunk.content, chunk.sectionType); + // const conceptDensity = this.calculateConceptDensity(chunk.content); - enrichedChunks.push({ - ...chunk, - metadata: { - hasFinancialData, - hasMetrics, - keyTerms, - importance, - conceptDensity - } - }); - } + // enrichedChunks.push({ + // ...chunk, + // metadata: { + // hasFinancialData, + // hasMetrics, + // keyTerms, + // importance, + // conceptDensity + // } + // }); + // } - return enrichedChunks; - } + // return enrichedChunks; + // } /** * Detect section boundaries in CIM documents diff --git a/backend/src/services/pdfGenerationService.ts b/backend/src/services/pdfGenerationService.ts index 41e82bf..986e243 100644 --- a/backend/src/services/pdfGenerationService.ts +++ b/backend/src/services/pdfGenerationService.ts @@ -389,6 +389,149 @@ class PDFGenerationService { } } + /** + * Generate CIM Review PDF from analysis data + */ + async generateCIMReviewPDF(analysisData: any): Promise { + try { + // Convert analysis data to HTML + const html = this.generateCIMReviewHTML(analysisData); + + // Generate PDF buffer + const pdfBuffer = await this.generatePDFBuffer(html, { + format: 'A4', + margin: { + top: '0.5in', + right: '0.5in', + bottom: '0.5in', + left: '0.5in', + }, + displayHeaderFooter: true, + printBackground: true, + }); + + if (!pdfBuffer) { + throw new Error('Failed to generate PDF buffer'); + } + + return pdfBuffer; + } catch (error) { + logger.error('Failed to generate CIM Review PDF', error); + throw error; + } + } + + /** + * Generate HTML from CIM Review analysis data + */ + private generateCIMReviewHTML(analysisData: any): string { + const sections = [ + { title: 'Deal Overview', data: analysisData.dealOverview }, + { title: 'Business Description', data: analysisData.businessDescription }, + { title: 'Market & Industry Analysis', data: analysisData.marketIndustryAnalysis }, + { title: 'Financial Summary', data: analysisData.financialSummary }, + { title: 'Management Team Overview', data: analysisData.managementTeamOverview }, + { title: 'Preliminary Investment Thesis', data: analysisData.preliminaryInvestmentThesis }, + { title: 'Key Questions & Next Steps', data: analysisData.keyQuestionsNextSteps }, + ]; + + let html = ` + + + + + CIM Review Report + + + +

CIM Review Report

+ `; + + sections.forEach(section => { + if (section.data) { + html += `

${section.title}

`; + + Object.entries(section.data).forEach(([key, value]) => { + if (value && typeof value === 'object' && !Array.isArray(value)) { + // Handle nested objects + html += `

${this.formatFieldName(key)}

`; + Object.entries(value).forEach(([subKey, subValue]) => { + if (subValue) { + html += ` +
+ ${this.formatFieldName(subKey)}: + ${subValue} +
+ `; + } + }); + } else if (key === 'financials' && typeof value === 'object') { + // Handle financial table + html += `

Financial Data

`; + html += ``; + html += ``; + + const periods = ['fy3', 'fy2', 'fy1', 'ltm']; + periods.forEach(period => { + if (value && typeof value === 'object' && value[period as keyof typeof value]) { + const data = value[period as keyof typeof value] as any; + html += ` + + + + + + + + `; + } + }); + html += `
PeriodRevenueGrowthEBITDAMargin
${period.toUpperCase()}${data?.revenue || '-'}${data?.revenueGrowth || '-'}${data?.ebitda || '-'}${data?.ebitdaMargin || '-'}
`; + } else if (value) { + // Handle simple fields + html += ` +
+ ${this.formatFieldName(key)}: + ${value} +
+ `; + } + }); + + html += `
`; + } + }); + + html += ` + + + `; + + return html; + } + + /** + * Format field names for display + */ + private formatFieldName(fieldName: string): string { + return fieldName + .replace(/([A-Z])/g, ' $1') + .replace(/^./, str => str.toUpperCase()) + .replace(/([A-Z]{2,})/g, match => match.charAt(0) + match.slice(1).toLowerCase()); + } + /** * Close browser instance */