// cim_summary/backend/test-pdf-extraction.js

// Test PDF text extraction functionality
require('ts-node/register');
const { documentController } = require('./src/controllers/documentController');

async function testPDFExtraction() {
  try {
    console.log('Testing PDF text extraction...');

    // Get a real document ID from the database
    const { Pool } = require('pg');
    const pool = new Pool({
      connectionString: 'postgresql://postgres:password@localhost:5432/cim_processor'
    });

    // Find a PDF document
    const result = await pool.query(`
      SELECT id, original_file_name, file_path
      FROM documents
      WHERE original_file_name LIKE '%.pdf'
      ORDER BY created_at DESC
      LIMIT 1
    `);

    if (result.rows.length === 0) {
      console.log('❌ No PDF documents found in database');
      await pool.end();
      return;
    }

    const document = result.rows[0];
    console.log(`📄 Testing with document: ${document.original_file_name}`);
    console.log(`📁 File path: ${document.file_path}`);

    // Test text extraction
    console.log('\n🔄 Extracting text from PDF...');
    const startTime = Date.now();

    const extractedText = await documentController.getDocumentText(document.id);

    const extractionTime = Date.now() - startTime;

    console.log('✅ PDF text extraction completed!');
    console.log(`⏱️  Extraction time: ${extractionTime}ms`);
    console.log(`📊 Text length: ${extractedText.length} characters`);
    console.log(`📄 Estimated pages: ${Math.ceil(extractedText.length / 2000)}`);

    // Show first 500 characters as preview
    console.log('\n📋 Text preview (first 500 characters):');
    console.log('=' .repeat(50));
    console.log(extractedText.substring(0, 500) + '...');
    console.log('=' .repeat(50));

    // Check if text contains expected content
    const hasFinancialContent = extractedText.toLowerCase().includes('revenue') ||
                               extractedText.toLowerCase().includes('ebitda') ||
                               extractedText.toLowerCase().includes('financial');

    const hasCompanyContent = extractedText.toLowerCase().includes('company') ||
                             extractedText.toLowerCase().includes('business') ||
                             extractedText.toLowerCase().includes('corporate');

    console.log('\n🔍 Content Analysis:');
    console.log(`- Contains financial terms: ${hasFinancialContent ? '✅' : '❌'}`);
    console.log(`- Contains company/business terms: ${hasCompanyContent ? '✅' : '❌'}`);

    if (extractedText.length < 100) {
      console.log('⚠️  Warning: Extracted text seems too short, may indicate extraction issues');
    } else if (extractedText.length > 10000) {
      console.log('✅ Good: Extracted text is substantial in length');
    }

    await pool.end();

  } catch (error) {
    console.error('❌ PDF text extraction test failed:', error);
    console.error('Error details:', {
      name: error.name,
      message: error.message,
      stack: error.stack
    });
  }
}

// Run the check when this script is executed. The promise is handled
// explicitly so that an unexpected rejection is reported and fails the
// process instead of surfacing as an unhandled rejection.
testPDFExtraction().catch((error) => {
  console.error('❌ Unexpected failure while running PDF text extraction test:', error);
  process.exitCode = 1;
});