Files
cim_summary/backend/src/services/documentAiProcessor.ts
admin 9c916d12f4 feat: Production release v2.0.0 - Simple Document Processor
Major release with significant performance improvements and new processing strategy.

## Core Changes
- Implemented simple_full_document processing strategy (default)
- Full document → LLM approach: 1-2 passes, ~5-6 minutes processing time
- Achieved 100% completeness with 2 API calls (down from 5+)
- Removed redundant Document AI passes for faster processing

## Financial Data Extraction
- Enhanced deterministic financial table parser
- Improved FY3/FY2/FY1/LTM identification from varying CIM formats
- Automatic merging of parser results with LLM extraction

## Code Quality & Infrastructure
- Cleaned up debug logging (removed emoji markers from production code)
- Fixed Firebase Secrets configuration (using modern defineSecret approach)
- Updated OpenAI API key
- Resolved deployment conflicts (secrets vs environment variables)
- Added .env files to Firebase ignore list

## Deployment
- Firebase Functions v2 deployment successful
- All 7 required secrets verified and configured
- Function URL: https://api-y56ccs6wva-uc.a.run.app

## Performance Improvements
- Processing time: ~5-6 minutes (down from 23+ minutes)
- API calls: 1-2 (down from 5+)
- Completeness: 100% achievable
- LLM Model: claude-3-7-sonnet-latest

## Breaking Changes
- Default processing strategy changed to 'simple_full_document'
- RAG processor available as alternative strategy 'document_ai_agentic_rag'

## Files Changed
- 36 files changed, 5642 insertions(+), 4451 deletions(-)
- Removed deprecated documentation files
- Cleaned up unused services and models

This release represents a major refactoring focused on speed, accuracy, and maintainability.
2025-11-09 21:07:22 -05:00

676 lines
22 KiB
TypeScript

import { logger } from '../utils/logger';
import { DocumentProcessorServiceClient } from '@google-cloud/documentai';
import { Storage } from '@google-cloud/storage';
import { config } from '../config/env';
import pdf from 'pdf-parse';
import { PDFDocument } from 'pdf-lib';
// Result envelope returned by processDocument(): the extracted/summarized
// content plus processing metadata on success, or an error message when
// success is false (content is then '').
interface ProcessingResult {
success: boolean;
content: string;
// Processing details (strategy, timing, table counts, …); shape varies by code path.
metadata?: any;
error?: string;
}
// A table recovered from a Document AI response, flattened to plain strings
// for downstream (LLM / parser) consumption.
export interface StructuredTable {
// Cell text of the first header row; empty when no header rows were detected.
headers: string[];
// Body rows; rows whose cells are all empty are dropped by the extractor.
rows: string[][];
position: {
// Page number as reported by Document AI (0 when missing).
pageNumber: number;
// Table confidence from Document AI; defaults to 0.9 when not provided.
confidence: number;
};
// Original Document AI table object, kept for consumers needing raw detail.
rawTable?: any;
}
// Normalized output of a single Document AI processDocument call.
interface DocumentAIOutput {
// Full plain text of the processed document.
text: string;
// Detected entities; type falls back to 'UNKNOWN' and confidence to 0.
entities: Array<{
type: string;
mentionText: string;
confidence: number;
}>;
// Structured tables extracted from the response pages.
tables: StructuredTable[];
// Per-page summary info ({ pageNumber, blocksCount }).
pages: Array<any>;
// MIME type echoed from the response, or the request's type when absent.
mimeType: string;
}
/**
 * PDF text and structured-table extraction built on Google Document AI, with
 * pdf-parse as a last-resort fallback.
 *
 * Documents larger than MAX_PAGES_PER_CHUNK pages are split with pdf-lib and
 * each chunk is processed separately, because the synchronous Document AI API
 * enforces a per-request page limit. Input files are staged in GCS for
 * Document AI and deleted afterwards (best effort).
 */
export class DocumentAiProcessor {
  private gcsBucketName: string;
  private documentAiClient: DocumentProcessorServiceClient;
  private storageClient: Storage;
  // Fully-qualified Document AI processor resource name.
  private processorName: string;
  // Reduced to 15 pages to work with non-imageless mode (safer default).
  // If imageless mode is enabled on the processor, this can be raised to 30.
  private readonly MAX_PAGES_PER_CHUNK = 15;

  constructor() {
    this.gcsBucketName = config.googleCloud.gcsBucketName;
    this.documentAiClient = new DocumentProcessorServiceClient();
    this.storageClient = new Storage();
    // projects/{project}/locations/{location}/processors/{processorId}
    this.processorName = `projects/${config.googleCloud.projectId}/locations/${config.googleCloud.documentAiLocation}/processors/${config.googleCloud.documentAiProcessorId}`;
    logger.info('Document AI processor initialized', {
      projectId: config.googleCloud.projectId,
      location: config.googleCloud.documentAiLocation,
      processorId: config.googleCloud.documentAiProcessorId,
      processorName: this.processorName,
      maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK
    });
  }

  /**
   * Extract the text referenced by a Document AI layout object.
   *
   * Layouts carry no text of their own; their textAnchor lists one or more
   * [startIndex, endIndex) segments into the full document text. All valid
   * segments are concatenated — the previous implementation read only the
   * first segment, which truncated multi-segment cells. Invalid segments are
   * logged and skipped. Returns '' when nothing usable is found.
   */
  private getTextFromLayout(layout: any, documentText: string): string {
    try {
      const textAnchor = layout?.textAnchor;
      if (!textAnchor?.textSegments || textAnchor.textSegments.length === 0) {
        return '';
      }
      const parts: string[] = [];
      for (const segment of textAnchor.textSegments) {
        // Indices arrive as strings (int64 in the API); a missing startIndex
        // means 0 and a missing endIndex means end-of-text.
        const startIndex = parseInt(segment.startIndex || '0', 10);
        const endIndex = parseInt(segment.endIndex || documentText.length.toString(), 10);
        if (Number.isNaN(startIndex) || Number.isNaN(endIndex) || startIndex < 0 || endIndex > documentText.length || startIndex >= endIndex) {
          logger.warn('Invalid text anchor indices detected when extracting table cell text', {
            startIndex,
            endIndex,
            documentLength: documentText.length
          });
          continue; // skip the bad segment but keep any valid ones
        }
        parts.push(documentText.substring(startIndex, endIndex));
      }
      return parts.join('').trim();
    } catch (error) {
      logger.error('Failed to extract text from layout', {
        error: error instanceof Error ? error.message : String(error),
        layout
      });
      return '';
    }
  }

  /**
   * Convert Document AI table responses into structured, text-based tables.
   *
   * Only the first header row is flattened into `headers` (additional header
   * rows, when present, are still available via `rawTable`). Body rows whose
   * cells are all empty are dropped. Failures on individual tables are logged
   * and do not abort extraction of the remaining tables.
   */
  private extractStructuredTables(document: any, documentText: string): StructuredTable[] {
    const tables: StructuredTable[] = [];
    try {
      const pages = document?.pages || [];
      logger.info('Extracting structured tables from Document AI response', {
        pageCount: pages.length
      });
      for (const page of pages) {
        const pageTables = page.tables || [];
        const pageNumber = page.pageNumber || 0;
        for (let tableIndex = 0; tableIndex < pageTables.length; tableIndex++) {
          const table = pageTables[tableIndex];
          try {
            const headers: string[] = [];
            if (Array.isArray(table.headerRows) && table.headerRows.length > 0) {
              const headerRow = table.headerRows[0];
              for (const cell of headerRow.cells || []) {
                headers.push(this.getTextFromLayout(cell.layout, documentText));
              }
            }
            const rows: string[][] = [];
            for (const bodyRow of table.bodyRows || []) {
              const row: string[] = [];
              for (const cell of bodyRow.cells || []) {
                row.push(this.getTextFromLayout(cell.layout, documentText));
              }
              // Keep only rows with at least one non-empty cell.
              if (row.some(value => value && value.length > 0)) {
                rows.push(row);
              }
            }
            if (headers.length > 0 || rows.length > 0) {
              tables.push({
                headers,
                rows,
                position: {
                  pageNumber,
                  confidence: typeof table.confidence === 'number' ? table.confidence : 0.9
                },
                rawTable: table
              });
              logger.info('Structured table extracted', {
                pageNumber,
                tableIndex,
                headerCount: headers.length,
                rowCount: rows.length
              });
            }
          } catch (tableError) {
            logger.error('Failed to extract structured table from Document AI response', {
              pageNumber,
              tableIndex,
              error: tableError instanceof Error ? tableError.message : String(tableError)
            });
          }
        }
      }
      logger.info('Structured table extraction completed', {
        totalTables: tables.length
      });
    } catch (error) {
      logger.error('Structured table extraction failed', {
        error: error instanceof Error ? error.message : String(error)
      });
    }
    return tables;
  }

  /**
   * Full pipeline: extract text/tables from the document, then run the
   * Agentic RAG processor over the extracted text.
   *
   * Never throws — failures are reported via { success: false, error }.
   *
   * @returns a ProcessingResult whose content is the RAG summary (or the raw
   *   extracted text when no summary was produced).
   */
  async processDocument(
    documentId: string,
    userId: string,
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<ProcessingResult> {
    const startTime = Date.now();
    try {
      logger.info('Document AI processor: processDocument called (RAG-enabled)', {
        documentId,
        userId,
        fileName,
        fileSize: fileBuffer.length,
        mimeType
      });
      // Step 1: Extract text/structured data using Document AI or fallback
      const { text: extractedText, structuredTables } = await this.extractTextFromDocument(fileBuffer, fileName, mimeType);
      if (!extractedText) {
        throw new Error('Failed to extract text from document');
      }
      logger.info('Text extraction completed', {
        textLength: extractedText.length
      });
      // Step 2: Process extracted text through Agentic RAG
      const agenticRagResult = await this.processWithAgenticRAG(documentId, extractedText, structuredTables);
      const processingTime = Date.now() - startTime;
      return {
        success: true,
        content: agenticRagResult.summary || extractedText,
        metadata: {
          processingStrategy: 'document_ai_agentic_rag',
          processingTime,
          extractedTextLength: extractedText.length,
          agenticRagResult,
          structuredTables,
          structuredTablesFound: structuredTables.length,
          fileSize: fileBuffer.length,
          fileName,
          mimeType
        }
      };
    } catch (error) {
      const processingTime = Date.now() - startTime;
      // Derive the most informative message available from the thrown value.
      let errorMessage: string;
      if (error instanceof Error) {
        errorMessage = error.message;
      } else if (typeof error === 'string') {
        errorMessage = error;
      } else if (error && typeof error === 'object') {
        // Prefer a .message property, then a JSON dump of the object's own
        // properties. NOTE: the previous chain tried error.toString() before
        // JSON.stringify, but a plain object's toString() is the truthy-but-
        // useless "[object Object]", so the stringified details were
        // unreachable.
        let serialized = '';
        try {
          serialized = JSON.stringify(error, Object.getOwnPropertyNames(error)) ?? '';
        } catch {
          // Circular structures cannot be stringified; fall through to String().
        }
        errorMessage = (error as any).message || serialized || String(error);
      } else {
        errorMessage = String(error);
      }
      const errorStack = error instanceof Error ? error.stack : undefined;
      const errorDetails = error instanceof Error ? {
        name: error.name,
        message: error.message,
        stack: error.stack
      } : {
        type: typeof error,
        value: error
      };
      logger.error('Document AI + Agentic RAG processing failed', {
        documentId,
        error: errorMessage,
        errorDetails,
        stack: errorStack,
        processingTime,
        originalError: error
      });
      return {
        success: false,
        content: '',
        error: `Document AI + Agentic RAG processing failed: ${errorMessage}`,
        metadata: {
          processingStrategy: 'document_ai_agentic_rag',
          processingTime,
          error: errorMessage,
          errorDetails,
          stack: errorStack
        }
      };
    }
  }

  /**
   * Extract text only (no RAG processing) - for the simple processor.
   * Thin wrapper over extractTextFromDocument that adds a log entry;
   * userId is accepted for interface parity but not used here.
   */
  async extractTextOnly(
    documentId: string,
    userId: string,
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    logger.info('Document AI processor: extractTextOnly called (text-only, no RAG)', {
      documentId,
      fileName,
      fileSize: fileBuffer.length,
      mimeType
    });
    return await this.extractTextFromDocument(fileBuffer, fileName, mimeType);
  }

  /**
   * Decide how to extract text: single Document AI pass for small documents,
   * chunked processing for large ones, pdf-parse as the final fallback.
   */
  private async extractTextFromDocument(
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    try {
      // Check document size first (pdf-parse is cheap and runs locally).
      const pdfData = await pdf(fileBuffer);
      const totalPages = pdfData.numpages;
      logger.info('PDF analysis completed', {
        totalPages,
        textLength: pdfData.text?.length || 0
      });
      // If the document exceeds the per-request page limit
      // (MAX_PAGES_PER_CHUNK), split it into chunks and process each.
      if (totalPages > this.MAX_PAGES_PER_CHUNK) {
        logger.info('Document exceeds Document AI page limit, splitting into chunks', {
          totalPages,
          maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK,
          estimatedChunks: Math.ceil(totalPages / this.MAX_PAGES_PER_CHUNK)
        });
        return await this.extractDocumentDataFromChunkedPDF(fileBuffer, fileName, mimeType, totalPages);
      }
      // Small enough for a single Document AI request.
      logger.info('Using Document AI for text extraction', {
        totalPages,
        maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK
      });
      // Upload file to GCS
      const gcsFilePath = await this.uploadToGCS(fileBuffer, fileName, mimeType);
      // Process with Document AI
      const documentAiOutput = await this.processWithDocumentAI(gcsFilePath, mimeType);
      // Cleanup GCS file
      await this.cleanupGCSFiles(gcsFilePath);
      return {
        text: documentAiOutput.text,
        structuredTables: documentAiOutput.tables || []
      };
    } catch (error) {
      logger.error('Text extraction failed, using pdf-parse fallback', {
        error: error instanceof Error ? error.message : String(error)
      });
      // Fallback to pdf-parse (no structured tables available on this path).
      try {
        const pdfDataFallback = await pdf(fileBuffer);
        return {
          text: pdfDataFallback.text || '',
          structuredTables: []
        };
      } catch (fallbackError) {
        logger.error('Both Document AI and pdf-parse failed', {
          originalError: error instanceof Error ? error.message : String(error),
          fallbackError: fallbackError instanceof Error ? fallbackError.message : String(fallbackError)
        });
        throw new Error('Failed to extract text from document using any method');
      }
    }
  }

  /**
   * Split a PDF into MAX_PAGES_PER_CHUNK-page chunks via pdf-lib, process
   * each chunk with Document AI (falling back to pdf-parse per chunk), and
   * combine the results with page-range separators.
   *
   * @param totalPages page count from pdf-parse, used for logging and for
   *   computing the page-range headers; pdf-lib's own count drives slicing.
   */
  private async extractDocumentDataFromChunkedPDF(
    fileBuffer: Buffer,
    fileName: string,
    mimeType: string,
    totalPages: number
  ): Promise<{ text: string; structuredTables: StructuredTable[] }> {
    const chunks: string[] = [];
    const structuredTables: StructuredTable[] = [];
    const numChunks = Math.ceil(totalPages / this.MAX_PAGES_PER_CHUNK);
    logger.info('Starting chunked PDF processing', {
      totalPages,
      maxPagesPerChunk: this.MAX_PAGES_PER_CHUNK,
      numChunks
    });
    try {
      // Load the original PDF
      const sourcePdf = await PDFDocument.load(fileBuffer);
      const pageCount = sourcePdf.getPageCount();
      // Process each chunk
      for (let chunkIndex = 0; chunkIndex < numChunks; chunkIndex++) {
        const startPageIndex = chunkIndex * this.MAX_PAGES_PER_CHUNK;
        const endPageIndex = Math.min(startPageIndex + this.MAX_PAGES_PER_CHUNK, pageCount);
        logger.info(`Processing chunk ${chunkIndex + 1}/${numChunks}`, {
          startPage: startPageIndex + 1, // 1-indexed for logging
          endPage: endPageIndex,
          pagesInChunk: endPageIndex - startPageIndex
        });
        // Create a new PDF with pages from this chunk
        const chunkPdf = await PDFDocument.create();
        // Create array of page indices to copy (0-indexed)
        const pageIndices: number[] = [];
        for (let i = startPageIndex; i < endPageIndex; i++) {
          pageIndices.push(i);
        }
        // Copy pages to chunk PDF
        const copiedPages = await chunkPdf.copyPages(sourcePdf, pageIndices);
        copiedPages.forEach((page) => {
          chunkPdf.addPage(page);
        });
        // Serialize chunk PDF to buffer
        const chunkBuffer = Buffer.from(await chunkPdf.save());
        // Strip only a trailing '.pdf' — the old replace('.pdf', '') removed
        // the first occurrence anywhere in the name.
        const chunkFileName = `${fileName.replace(/\.pdf$/i, '')}_chunk_${chunkIndex + 1}.pdf`;
        // Upload chunk to GCS
        const gcsFilePath = await this.uploadToGCS(chunkBuffer, chunkFileName, mimeType);
        try {
          // Process chunk with Document AI
          const chunkOutput = await this.processWithDocumentAI(gcsFilePath, mimeType);
          chunks.push(chunkOutput.text);
          if (Array.isArray(chunkOutput.tables) && chunkOutput.tables.length > 0) {
            structuredTables.push(...chunkOutput.tables);
          }
          logger.info(`Chunk ${chunkIndex + 1}/${numChunks} processed successfully`, {
            textLength: chunkOutput.text.length,
            pagesProcessed: endPageIndex - startPageIndex
          });
        } catch (chunkError) {
          logger.error(`Failed to process chunk ${chunkIndex + 1}/${numChunks}, falling back to pdf-parse`, {
            chunkIndex: chunkIndex + 1,
            error: chunkError instanceof Error ? chunkError.message : String(chunkError)
          });
          // Fallback to pdf-parse for this chunk only; a failure here
          // propagates to the outer catch (whole-document pdf-parse).
          const chunkPdfData = await pdf(chunkBuffer);
          chunks.push(chunkPdfData.text || '');
        } finally {
          // Cleanup chunk file from GCS
          await this.cleanupGCSFiles(gcsFilePath);
        }
      }
      // Combine all chunks with page-range separators so downstream
      // consumers can still attribute text to approximate page locations.
      const combinedText = chunks
        .map((chunk, index) => {
          const startPageNum = (index * this.MAX_PAGES_PER_CHUNK) + 1;
          const endPageNum = Math.min((index + 1) * this.MAX_PAGES_PER_CHUNK, totalPages);
          const chunkHeader = `\n\n--- Page Range ${startPageNum}-${endPageNum} ---\n\n`;
          return chunkHeader + chunk;
        })
        .join('\n\n');
      logger.info('Chunked PDF processing completed', {
        totalPages,
        numChunks,
        combinedTextLength: combinedText.length,
        averageChunkLength: Math.round(combinedText.length / numChunks)
      });
      return {
        text: combinedText,
        structuredTables
      };
    } catch (error) {
      logger.error('Chunked PDF processing failed, falling back to pdf-parse', {
        error: error instanceof Error ? error.message : String(error),
        totalPages
      });
      // Fallback to pdf-parse for entire document
      const pdfData = await pdf(fileBuffer);
      return {
        text: pdfData.text || '',
        structuredTables: []
      };
    }
  }

  /**
   * Run the optimized Agentic RAG processor over the extracted text.
   * The processor module is imported lazily to avoid paying its load cost
   * on text-only code paths. Rethrows on failure after logging.
   */
  private async processWithAgenticRAG(documentId: string, extractedText: string, structuredTables: StructuredTable[]): Promise<any> {
    try {
      logger.info('Processing extracted text with Agentic RAG', {
        documentId,
        textLength: extractedText.length,
        structuredTableCount: structuredTables.length
      });
      // Import and use the optimized agentic RAG processor
      logger.info('Importing optimized agentic RAG processor...');
      const { optimizedAgenticRAGProcessor } = await import('./optimizedAgenticRAGProcessor');
      logger.info('Agentic RAG processor imported successfully', {
        processorType: typeof optimizedAgenticRAGProcessor,
        hasProcessLargeDocument: typeof optimizedAgenticRAGProcessor?.processLargeDocument === 'function'
      });
      logger.info('Calling processLargeDocument...');
      const result = await optimizedAgenticRAGProcessor.processLargeDocument(documentId, extractedText, {
        structuredTables
      });
      logger.info('Agentic RAG processing completed', {
        success: result.success,
        summaryLength: result.summary?.length || 0,
        analysisDataKeys: result.analysisData ? Object.keys(result.analysisData) : [],
        apiCalls: result.apiCalls,
        processingStrategy: result.processingStrategy,
        resultType: typeof result
      });
      return result;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      const errorStack = error instanceof Error ? error.stack : undefined;
      const errorDetails = error instanceof Error ? {
        name: error.name,
        message: error.message,
        stack: error.stack
      } : {
        type: typeof error,
        value: error
      };
      logger.error('Agentic RAG processing failed', {
        documentId,
        error: errorMessage,
        errorDetails,
        stack: errorStack
      });
      throw error;
    }
  }

  /**
   * Upload a buffer to GCS under uploads/ with a timestamp prefix.
   *
   * @param contentType MIME type stored on the object. Defaults to
   *   'application/pdf' for backward compatibility with existing callers.
   * @returns the gs:// URI of the uploaded object.
   */
  private async uploadToGCS(fileBuffer: Buffer, fileName: string, contentType: string = 'application/pdf'): Promise<string> {
    try {
      const bucket = this.storageClient.bucket(this.gcsBucketName);
      // Timestamp prefix avoids collisions between concurrent uploads.
      const file = bucket.file(`uploads/${Date.now()}_${fileName}`);
      logger.info('Uploading file to GCS', {
        fileName,
        fileSize: fileBuffer.length,
        bucket: this.gcsBucketName,
        destination: file.name
      });
      await file.save(fileBuffer, {
        metadata: {
          contentType
        }
      });
      logger.info('File uploaded successfully to GCS', {
        gcsPath: `gs://${this.gcsBucketName}/${file.name}`
      });
      return `gs://${this.gcsBucketName}/${file.name}`;
    } catch (error) {
      logger.error('Failed to upload file to GCS', {
        fileName,
        error: error instanceof Error ? error.message : String(error)
      });
      throw error;
    }
  }

  /**
   * Run a single synchronous Document AI processDocument call over a file
   * staged in GCS and normalize the response into DocumentAIOutput.
   * Rethrows on failure after logging.
   */
  private async processWithDocumentAI(gcsFilePath: string, mimeType: string): Promise<DocumentAIOutput> {
    try {
      logger.info('Processing with Document AI', {
        gcsFilePath,
        processorName: this.processorName,
        mimeType
      });
      // The document source fields on ProcessRequest form a oneof — supply
      // only gcsDocument. (Previously an empty rawDocument was sent alongside
      // it, which conflicts with the GCS source.)
      // Note: For processors that support it, imageless mode can be enabled
      // via processor settings in Google Cloud Console to support up to 30
      // pages; chunks are limited to 15 pages for default processor settings.
      const request = {
        name: this.processorName,
        gcsDocument: {
          gcsUri: gcsFilePath,
          mimeType: mimeType
        }
      };
      logger.info('Sending Document AI request', {
        processorName: this.processorName,
        gcsUri: gcsFilePath
      });
      // Process the document
      const [result] = await this.documentAiClient.processDocument(request);
      const { document } = result;
      if (!document) {
        throw new Error('Document AI returned no document');
      }
      logger.info('Document AI processing successful', {
        textLength: document.text?.length || 0,
        pagesCount: document.pages?.length || 0,
        entitiesCount: document.entities?.length || 0
      });
      // Extract text
      const text = document.text || '';
      // Extract entities
      const entities = document.entities?.map(entity => ({
        type: entity.type || 'UNKNOWN',
        mentionText: entity.mentionText || '',
        confidence: entity.confidence || 0
      })) || [];
      // Extract structured tables
      const structuredTables = this.extractStructuredTables(document, text);
      // Extract pages info
      const pages = document.pages?.map(page => ({
        pageNumber: page.pageNumber || 0,
        blocksCount: page.blocks?.length || 0
      })) || [];
      return {
        text,
        entities,
        tables: structuredTables,
        pages,
        mimeType: document.mimeType || mimeType
      };
    } catch (error) {
      logger.error('Document AI processing failed', {
        gcsFilePath,
        processorName: this.processorName,
        error: error instanceof Error ? error.message : String(error),
        stack: error instanceof Error ? error.stack : undefined
      });
      throw error;
    }
  }

  /**
   * Delete a staged object given its gs:// URI. Cleanup failures are only
   * logged (never thrown) so they cannot mask a processing result.
   */
  private async cleanupGCSFiles(gcsFilePath: string): Promise<void> {
    try {
      // Parse "gs://bucket/path/to/file" into bucket + object name.
      const bucketName = gcsFilePath.replace('gs://', '').split('/')[0];
      const fileName = gcsFilePath.replace(`gs://${bucketName}/`, '');
      logger.info('Cleaning up GCS files', { gcsFilePath, bucketName, fileName });
      const bucket = this.storageClient.bucket(bucketName);
      const file = bucket.file(fileName);
      await file.delete();
      logger.info('GCS file cleanup completed', { gcsFilePath });
    } catch (error) {
      logger.warn('Failed to cleanup GCS files', {
        gcsFilePath,
        error: error instanceof Error ? error.message : String(error)
      });
      // Don't throw error for cleanup failures
    }
  }
}
// Shared singleton instance used by the rest of the backend.
export const documentAiProcessor = new DocumentAiProcessor();