Files
cim_summary/backend/process-uploaded-docs.js
Jon c67dab22b4 Add comprehensive CIM processing features and UI improvements
- Add new database migrations for analysis data and job tracking
- Implement enhanced document processing service with LLM integration
- Add processing progress and queue status components
- Create testing guides and utility scripts for CIM processing
- Update frontend components for better user experience
- Add environment configuration and backup files
- Implement job queue service and upload progress tracking
2025-07-27 20:25:46 -04:00

231 lines
6.9 KiB
JavaScript

const { Pool } = require('pg');
const fs = require('fs');
const pdfParse = require('pdf-parse');
const Anthropic = require('@anthropic-ai/sdk');

// Load environment variables from .env before reading any config values.
require('dotenv').config();

// Database connection. Prefer DATABASE_URL from the environment; the
// hard-coded local dev string is kept only as a backward-compatible fallback.
// SECURITY NOTE: credentials should not live in source — move them to .env.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ??
    'postgresql://postgres:password@localhost:5432/cim_processor',
});

// Anthropic client for LLM analysis; requires ANTHROPIC_API_KEY in the env.
const anthropic = new Anthropic({
  apiKey: process.env.ANTHROPIC_API_KEY,
});
/**
 * Analyze extracted CIM text with Anthropic Claude and return a structured
 * analysis object (summary, company profile, financials, risks, etc.).
 *
 * @param {string} text - Raw text extracted from the CIM PDF. Only the first
 *   15,000 characters are sent to the model to bound prompt size.
 * @returns {Promise<object>} Parsed analysis JSON; if the model response
 *   cannot be parsed as JSON, a generic placeholder analysis is returned
 *   instead so the pipeline can still mark the document completed.
 * @throws Rethrows any Anthropic API error (network, auth, rate limit).
 */
async function processWithLLM(text) {
  console.log('🤖 Processing with Anthropic Claude...');
  try {
    const prompt = `You are an expert investment analyst reviewing a Confidential Information Memorandum (CIM).
Please analyze the following CIM document and provide a comprehensive summary and analysis in the following JSON format:
{
"summary": "A concise 2-3 sentence summary of the company and investment opportunity",
"companyName": "The company name",
"industry": "Primary industry/sector",
"revenue": "Annual revenue (if available)",
"ebitda": "EBITDA (if available)",
"employees": "Number of employees (if available)",
"founded": "Year founded (if available)",
"location": "Primary location/headquarters",
"keyMetrics": {
"metric1": "value1",
"metric2": "value2"
},
"financials": {
"revenue": ["year1", "year2", "year3"],
"ebitda": ["year1", "year2", "year3"],
"margins": ["year1", "year2", "year3"]
},
"risks": [
"Risk factor 1",
"Risk factor 2",
"Risk factor 3"
],
"opportunities": [
"Opportunity 1",
"Opportunity 2",
"Opportunity 3"
],
"investmentThesis": "Key investment thesis points",
"keyQuestions": [
"Important question 1",
"Important question 2"
]
}
CIM Document Content:
${text.substring(0, 15000)}
Please provide your analysis in valid JSON format only.`;
    const message = await anthropic.messages.create({
      model: "claude-3-5-sonnet-20241022",
      max_tokens: 2000,
      temperature: 0.3,
      system: "You are an expert investment analyst. Provide analysis in valid JSON format only.",
      messages: [
        {
          role: "user",
          content: prompt
        }
      ]
    });
    const responseText = message.content[0].text;
    try {
      // Models frequently wrap JSON in ```json fences or add surrounding
      // prose. Extract the outermost {...} span before parsing so a valid
      // analysis is not discarded just because of decoration around it.
      const start = responseText.indexOf('{');
      const end = responseText.lastIndexOf('}');
      const jsonText =
        start !== -1 && end > start
          ? responseText.slice(start, end + 1)
          : responseText;
      return JSON.parse(jsonText);
    } catch (parseError) {
      // Fallback: return placeholder analysis rather than failing the job.
      console.log('⚠️ Failed to parse JSON, using fallback analysis');
      return {
        summary: "Document analysis completed",
        companyName: "Company Name",
        industry: "Industry",
        revenue: "Not specified",
        ebitda: "Not specified",
        employees: "Not specified",
        founded: "Not specified",
        location: "Not specified",
        keyMetrics: {
          "Document Type": "CIM",
          "Pages": "Multiple"
        },
        financials: {
          revenue: ["Not specified", "Not specified", "Not specified"],
          ebitda: ["Not specified", "Not specified", "Not specified"],
          margins: ["Not specified", "Not specified", "Not specified"]
        },
        risks: [
          "Analysis completed",
          "Document reviewed"
        ],
        opportunities: [
          "Document contains investment information",
          "Ready for review"
        ],
        investmentThesis: "Document analysis completed",
        keyQuestions: [
          "Review document for specific details",
          "Validate financial information"
        ]
      };
    }
  } catch (error) {
    console.error('❌ Error calling Anthropic API:', error.message);
    throw error;
  }
}
/**
 * Batch-process every document in status 'uploaded': extract PDF text,
 * run LLM analysis, and persist results. Each document is handled
 * independently — one failure marks that document 'error' and the loop
 * continues with the next. Always closes the shared pg pool on exit.
 *
 * @returns {Promise<void>}
 */
async function processUploadedDocs() {
  try {
    console.log('🚀 Processing All Uploaded Documents');
    console.log('====================================');
    // Find all documents with 'uploaded' status
    const uploadedDocs = await pool.query(`
      SELECT id, original_file_name, status, file_path, created_at
      FROM documents
      WHERE status = 'uploaded'
      ORDER BY created_at DESC
    `);
    console.log(`📋 Found ${uploadedDocs.rows.length} documents to process:`);
    uploadedDocs.rows.forEach(doc => {
      console.log(` - ${doc.original_file_name} (${doc.status})`);
    });
    if (uploadedDocs.rows.length === 0) {
      console.log('✅ No documents need processing');
      return;
    }
    // Process each document sequentially (deliberate: avoids hammering the
    // Anthropic API and the DB with concurrent jobs).
    for (const document of uploadedDocs.rows) {
      console.log(`\n🔄 Processing: ${document.original_file_name}`);
      try {
        // Skip (don't fail) documents whose file is missing on disk.
        if (!fs.existsSync(document.file_path)) {
          console.log(`❌ File not found: ${document.file_path}`);
          continue;
        }
        // Update status to processing
        await pool.query(`
          UPDATE documents
          SET status = 'processing_llm',
              updated_at = CURRENT_TIMESTAMP
          WHERE id = $1
        `, [document.id]);
        console.log('📄 Extracting text from PDF...');
        // Extract text from PDF
        const dataBuffer = fs.readFileSync(document.file_path);
        const pdfData = await pdfParse(dataBuffer);
        console.log(`📊 Extracted ${pdfData.text.length} characters from ${pdfData.numpages} pages`);
        // Process with LLM
        console.log('🤖 Starting AI analysis...');
        const llmResult = await processWithLLM(pdfData.text);
        console.log('✅ AI analysis completed!');
        // Guard against a missing summary field — the model is not
        // guaranteed to return one, and .substring on undefined would
        // wrongly send a successful analysis down the error path.
        const summary = llmResult.summary ?? '';
        console.log(`📋 Summary: ${summary.substring(0, 100)}...`);
        // Update document with results
        await pool.query(`
          UPDATE documents
          SET status = 'completed',
              generated_summary = $1,
              updated_at = CURRENT_TIMESTAMP
          WHERE id = $2
        `, [summary, document.id]);
        // Update processing jobs
        await pool.query(`
          UPDATE processing_jobs
          SET status = 'completed',
              progress = 100,
              completed_at = CURRENT_TIMESTAMP
          WHERE document_id = $1
        `, [document.id]);
        console.log('💾 Results saved to database');
      } catch (error) {
        console.error(`❌ Error processing ${document.original_file_name}:`, error.message);
        // Mark as failed. Guarded so a DB error here cannot mask the
        // original failure or abort processing of the remaining documents.
        try {
          await pool.query(`
            UPDATE documents
            SET status = 'error',
                error_message = $1,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = $2
          `, [error.message, document.id]);
        } catch (updateError) {
          console.error(`⚠️ Failed to mark document ${document.id} as error:`, updateError.message);
        }
      }
    }
    console.log('\n🎉 Processing completed!');
    console.log('📊 Next Steps:');
    console.log('1. Go to http://localhost:3000');
    console.log('2. Login with user1@example.com / user123');
    console.log('3. Check the Documents tab');
    console.log('4. All uploaded documents should now show as "Completed"');
  } catch (error) {
    console.error('❌ Error during processing:', error.message);
  } finally {
    await pool.end();
  }
}

// Entry point. The catch handles rejections that escape the function's own
// try/finally (e.g. pool.end() failing) so the process never exits on an
// unhandled promise rejection.
processUploadedDocs().catch((err) => {
  console.error('❌ Fatal error:', err.message);
  process.exitCode = 1;
});