Files
cim_summary/backend/test-pdf-extraction-direct.js
Jon 57770fd99d feat: Implement hybrid LLM approach with enhanced prompts for CIM analysis
🎯 Major Features:
- Hybrid LLM configuration: Claude 3.7 Sonnet (primary) + GPT-4.5 (fallback)
- Task-specific model selection for optimal performance
- Enhanced prompts for all analysis types with proven results

🔧 Technical Improvements:
- Enhanced financial analysis with fiscal year mapping (100% success rate)
- Business model analysis with scalability assessment
- Market positioning analysis with TAM/SAM extraction
- Management team assessment with succession planning
- Creative content generation with GPT-4.5

📊 Performance & Cost Optimization:
- Claude 3.7 Sonnet: /5 per 1M tokens (82.2% MATH score)
- GPT-4.5: Premium creative content (5/50 per 1M tokens)
- ~80% cost savings using Claude for analytical tasks
- Automatic fallback system for reliability

 Proven Results:
- Successfully extracted 3-year financial data from STAX CIM
- Correctly mapped fiscal years (2023→FY-3, 2024→FY-2, 2025E→FY-1, LTM Mar-25→LTM)
- Identified revenue: 4M→1M→1M→6M (LTM)
- Identified EBITDA: 8.9M→3.9M→1M→7.2M (LTM)

🚀 Files Added/Modified:
- Enhanced LLM service with task-specific model selection
- Updated environment configuration for hybrid approach
- Enhanced prompt builders for all analysis types
- Comprehensive testing scripts and documentation
- Updated frontend components for improved UX

📚 References:
- Eden AI Model Comparison: Claude 3.7 Sonnet vs GPT-4.5
- Artificial Analysis Benchmarks for performance metrics
- Cost optimization based on model strengths and pricing
2025-07-28 16:46:06 -04:00

129 lines
4.6 KiB
JavaScript

// Test PDF text extraction directly
const { Pool } = require('pg');
const pdfParse = require('pdf-parse');
const fs = require('fs');
async function testPDFExtractionDirect() {
try {
console.log('Testing PDF text extraction directly...');
const pool = new Pool({
connectionString: 'postgresql://postgres:password@localhost:5432/cim_processor'
});
// Find a PDF document
const result = await pool.query(`
SELECT id, original_file_name, file_path
FROM documents
WHERE original_file_name LIKE '%.pdf'
ORDER BY created_at DESC
LIMIT 1
`);
if (result.rows.length === 0) {
console.log('❌ No PDF documents found in database');
await pool.end();
return;
}
const document = result.rows[0];
console.log(`📄 Testing with document: ${document.original_file_name}`);
console.log(`📁 File path: ${document.file_path}`);
// Check if file exists
if (!fs.existsSync(document.file_path)) {
console.log('❌ File not found on disk');
await pool.end();
return;
}
// Test text extraction
console.log('\n🔄 Extracting text from PDF...');
const startTime = Date.now();
try {
const dataBuffer = fs.readFileSync(document.file_path);
const data = await pdfParse(dataBuffer);
const extractionTime = Date.now() - startTime;
console.log('✅ PDF text extraction completed!');
console.log(`⏱️ Extraction time: ${extractionTime}ms`);
console.log(`📊 Text length: ${data.text.length} characters`);
console.log(`📄 Pages: ${data.numpages}`);
console.log(`📁 File size: ${dataBuffer.length} bytes`);
// Show first 500 characters as preview
console.log('\n📋 Text preview (first 500 characters):');
console.log('=' .repeat(50));
console.log(data.text.substring(0, 500) + '...');
console.log('=' .repeat(50));
// Check if text contains expected content
const hasFinancialContent = data.text.toLowerCase().includes('revenue') ||
data.text.toLowerCase().includes('ebitda') ||
data.text.toLowerCase().includes('financial');
const hasCompanyContent = data.text.toLowerCase().includes('company') ||
data.text.toLowerCase().includes('business') ||
data.text.toLowerCase().includes('corporate');
console.log('\n🔍 Content Analysis:');
console.log(`- Contains financial terms: ${hasFinancialContent ? '✅' : '❌'}`);
console.log(`- Contains company/business terms: ${hasCompanyContent ? '✅' : '❌'}`);
if (data.text.length < 100) {
console.log('⚠️ Warning: Extracted text seems too short, may indicate extraction issues');
} else if (data.text.length > 10000) {
console.log('✅ Good: Extracted text is substantial in length');
}
// Test with Agentic RAG
console.log('\n🤖 Testing Agentic RAG with extracted text...');
// Import the agentic RAG processor
require('ts-node/register');
const { agenticRAGProcessor } = require('./src/services/agenticRAGProcessor');
const userId = '4161c088-dfb1-4855-ad34-def1cdc5084e'; // Real user ID
console.log('🔄 Processing with Agentic RAG...');
const agenticStartTime = Date.now();
const agenticResult = await agenticRAGProcessor.processDocument(data.text, document.id, userId);
const agenticTime = Date.now() - agenticStartTime;
console.log('✅ Agentic RAG processing completed!');
console.log(`⏱️ Agentic RAG time: ${agenticTime}ms`);
console.log(`✅ Success: ${agenticResult.success}`);
console.log(`📊 API Calls: ${agenticResult.apiCalls}`);
console.log(`💰 Total Cost: $${agenticResult.totalCost}`);
console.log(`📝 Summary Length: ${agenticResult.summary?.length || 0}`);
if (agenticResult.error) {
console.log(`❌ Error: ${agenticResult.error}`);
} else {
console.log('✅ No errors in Agentic RAG processing');
}
} catch (pdfError) {
console.error('❌ PDF text extraction failed:', pdfError);
console.error('Error details:', {
name: pdfError.name,
message: pdfError.message
});
}
await pool.end();
} catch (error) {
console.error('❌ Test failed:', error);
console.error('Error details:', {
name: error.name,
message: error.message
});
}
}
testPDFExtractionDirect();