// cim_summary/backend/test-pdf-extraction-direct.js

// Test PDF text extraction directly
const { Pool } = require('pg');
const pdfParse = require('pdf-parse');
const fs = require('fs');

/**
 * Reports whether `haystack` contains at least one of the given substrings.
 *
 * @param {string} haystack - Text to search (callers pass it already lower-cased).
 * @param {string[]} terms - Substrings to look for.
 * @returns {boolean} True when any term occurs in `haystack`.
 */
function containsAnyTerm(haystack, terms) {
  return terms.some((term) => haystack.includes(term));
}

/**
 * Logs a failure headline followed by the error's name and message.
 *
 * @param {string} headline - Message identifying which stage failed.
 * @param {unknown} error - The caught value; may not be an Error instance.
 */
function logErrorDetails(headline, error) {
  console.error(headline, error);
  console.error('Error details:', {
    name: error?.name,
    message: error?.message
  });
}

/**
 * Manual smoke test for PDF text extraction.
 *
 * Looks up the most recently created document whose original file name ends
 * in `.pdf`, reads the file from disk, extracts its text with `pdf-parse`,
 * prints basic statistics and a keyword check, and finally feeds the text to
 * the Agentic RAG processor. All outcomes are reported on the console.
 *
 * The function catches and logs every failure itself, so the returned promise
 * is not expected to reject. The database pool is always closed before the
 * function returns, including on error paths.
 *
 * @returns {Promise<void>}
 */
async function testPDFExtractionDirect() {
  const PREVIEW_LENGTH = 500;
  const SEPARATOR = '='.repeat(50);

  // Declared outside the try so the finally block can release it.
  let pool;

  try {
    console.log('Testing PDF text extraction directly...');

    // NOTE(review): credentials are hard-coded for a local database; consider
    // reading the connection string from configuration instead.
    pool = new Pool({
      connectionString: 'postgresql://postgres:password@localhost:5432/cim_processor'
    });

    // Find a PDF document
    const result = await pool.query(`
      SELECT id, original_file_name, file_path
      FROM documents
      WHERE original_file_name LIKE '%.pdf'
      ORDER BY created_at DESC
      LIMIT 1
    `);

    if (result.rows.length === 0) {
      console.log('❌ No PDF documents found in database');
      return;
    }

    const document = result.rows[0];
    console.log(`📄 Testing with document: ${document.original_file_name}`);
    console.log(`📁 File path: ${document.file_path}`);

    // Check if file exists
    if (!fs.existsSync(document.file_path)) {
      console.log('❌ File not found on disk');
      return;
    }

    // Stage 1: text extraction. Kept in its own try/catch so that only real
    // extraction failures are reported as such.
    console.log('\n🔄 Extracting text from PDF...');
    const startTime = Date.now();

    let dataBuffer;
    let data;
    try {
      dataBuffer = fs.readFileSync(document.file_path);
      data = await pdfParse(dataBuffer);
    } catch (pdfError) {
      logErrorDetails('❌ PDF text extraction failed:', pdfError);
      return;
    }

    const extractionTime = Date.now() - startTime;
    const { text } = data;

    console.log('✅ PDF text extraction completed!');
    console.log(`⏱️  Extraction time: ${extractionTime}ms`);
    console.log(`📊 Text length: ${text.length} characters`);
    console.log(`📄 Pages: ${data.numpages}`);
    console.log(`📁 File size: ${dataBuffer.length} bytes`);

    // Show a preview; only mark it as truncated when text was actually cut.
    const preview = text.length > PREVIEW_LENGTH
      ? `${text.substring(0, PREVIEW_LENGTH)}...`
      : text;
    console.log('\n📋 Text preview (first 500 characters):');
    console.log(SEPARATOR);
    console.log(preview);
    console.log(SEPARATOR);

    // Check if text contains expected content (case-insensitive).
    const lowerText = text.toLowerCase();
    const hasFinancialContent = containsAnyTerm(lowerText, ['revenue', 'ebitda', 'financial']);
    const hasCompanyContent = containsAnyTerm(lowerText, ['company', 'business', 'corporate']);

    console.log('\n🔍 Content Analysis:');
    console.log(`- Contains financial terms: ${hasFinancialContent ? '✅' : '❌'}`);
    console.log(`- Contains company/business terms: ${hasCompanyContent ? '✅' : '❌'}`);

    if (text.length < 100) {
      console.log('⚠️  Warning: Extracted text seems too short, may indicate extraction issues');
    } else if (text.length > 10000) {
      console.log('✅ Good: Extracted text is substantial in length');
    }

    // Stage 2: Agentic RAG. Failures here are reported separately from
    // extraction failures.
    console.log('\n🤖 Testing Agentic RAG with extracted text...');

    try {
      // The processor is loaded from a path under ./src/services; ts-node is
      // registered first so that require can load it.
      require('ts-node/register');
      const { agenticRAGProcessor } = require('./src/services/agenticRAGProcessor');

      // NOTE(review): hard-coded user ID; the original comment calls it a real
      // user ID — confirm it exists in the target database.
      const userId = '4161c088-dfb1-4855-ad34-def1cdc5084e';

      console.log('🔄 Processing with Agentic RAG...');
      const agenticStartTime = Date.now();

      const agenticResult = await agenticRAGProcessor.processDocument(text, document.id, userId);

      const agenticTime = Date.now() - agenticStartTime;

      console.log('✅ Agentic RAG processing completed!');
      console.log(`⏱️  Agentic RAG time: ${agenticTime}ms`);
      console.log(`✅ Success: ${agenticResult.success}`);
      console.log(`📊 API Calls: ${agenticResult.apiCalls}`);
      console.log(`💰 Total Cost: $${agenticResult.totalCost}`);
      console.log(`📝 Summary Length: ${agenticResult.summary?.length ?? 0}`);

      if (agenticResult.error) {
        console.log(`❌ Error: ${agenticResult.error}`);
      } else {
        console.log('✅ No errors in Agentic RAG processing');
      }
    } catch (agenticError) {
      logErrorDetails('❌ Agentic RAG processing failed:', agenticError);
    }
  } catch (error) {
    logErrorDetails('❌ Test failed:', error);
  } finally {
    // Always release the pool, even when a query or later stage threw;
    // otherwise open connections are leaked.
    if (pool) {
      try {
        await pool.end();
      } catch (endError) {
        logErrorDetails('❌ Failed to close database pool:', endError);
      }
    }
  }
}

// Run the test when the script is executed. The promise is handled explicitly
// so that an unexpected rejection is logged and reflected in the exit code
// rather than left floating.
testPDFExtractionDirect().catch((error) => {
  console.error('❌ Unhandled test error:', error);
  process.exitCode = 1;
});