// cim_summary/backend/src/services/vectorDatabaseService.ts
// (593 lines, 19 KiB, TypeScript)
import { config } from '../config/env';
import { logger } from '../utils/logger';
import { VectorDatabaseModel, DocumentChunk, VectorSearchResult } from '../models/VectorDatabaseModel';
// Re-export types from the model
export { VectorSearchResult, DocumentChunk } from '../models/VectorDatabaseModel';
/**
 * Provider-agnostic facade over a vector database, used to store document
 * chunks with embeddings and run similarity search for RAG.
 *
 * Supported providers (selected via `config.vector.provider`):
 *  - 'supabase' : the only fully implemented backend (pgvector via Supabase RPC)
 *  - 'pinecone' / 'chroma' : client init is stubbed/commented out; store/search
 *    for pinecone throws, chroma assumes a live client
 *  - 'pgvector' : deprecated; all operations warn and throw/no-op
 *
 * Initialization is lazy: the client is created on first use, and every public
 * entry point degrades gracefully (logs + empty result / skip) when no vector
 * backend is available, so uploads and searches never hard-fail on vector-DB
 * problems alone.
 */
class VectorDatabaseService {
  // Backend selector, read once from config at construction time.
  private provider: 'pinecone' | 'pgvector' | 'chroma' | 'supabase';
  // Underlying provider client. `any` because the concrete type depends on the
  // provider (Supabase client, Chroma client, ...); null when unavailable.
  private client: any;
  // In-memory cache of text -> embedding, keyed by a 32-bit string hash of the
  // text. NOTE(review): entries are only invalidated on read (TTL check) and
  // never evicted, so this map grows without bound under varied inputs.
  // NOTE(review): a 32-bit hash of the full text can collide, which would
  // silently return the wrong cached embedding — consider a cryptographic hash.
  private semanticCache: Map<string, { embedding: number[]; timestamp: number }> = new Map();
  private readonly CACHE_TTL = 3600000; // 1 hour cache TTL

  constructor() {
    this.provider = config.vector.provider;
    // Don't initialize client immediately - do it lazily when needed
  }

  /**
   * Create the provider client on first use. Idempotent: returns immediately
   * if a client already exists. On an unknown provider, logs and leaves the
   * client null (callers treat null as "vector DB unavailable").
   */
  private async initializeClient() {
    if (this.client) return; // Already initialized
    switch (this.provider) {
      case 'pinecone':
        await this.initializePinecone();
        break;
      case 'pgvector':
        await this.initializePgVector();
        break;
      case 'chroma':
        await this.initializeChroma();
        break;
      case 'supabase':
        await this.initializeSupabase();
        break;
      default:
        logger.error(`Unsupported vector database provider: ${this.provider}`);
        this.client = null;
    }
  }

  /**
   * Lazy-init guard used by every public method.
   * @returns true when a usable client exists, false otherwise.
   */
  private async ensureInitialized() {
    if (!this.client) {
      await this.initializeClient();
    }
    return this.client !== null;
  }

  /**
   * Pinecone init is stubbed out (SDK calls commented). NOTE(review): this logs
   * success but leaves `this.client` undefined, so `ensureInitialized` will
   * re-run init on every call and report the DB as unavailable — the log line
   * is misleading. Same applies to `initializeChroma` below.
   */
  private async initializePinecone() {
    // const { Pinecone } = await import('@pinecone-database/pinecone');
    // this.client = new Pinecone({
    //   apiKey: config.vector.pineconeApiKey!,
    // });
    logger.info('Pinecone vector database initialized');
  }

  /** Deprecated backend: explicitly sets client to null so callers skip it. */
  private async initializePgVector() {
    // Note: pgvector is deprecated in favor of Supabase
    // This method is kept for backward compatibility but will not work in Firebase
    logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
    this.client = null;
  }

  /** Chroma init is stubbed out (see NOTE on initializePinecone). */
  private async initializeChroma() {
    // const { ChromaClient } = await import('chromadb');
    // this.client = new ChromaClient({
    //   path: config.vector.chromaUrl || 'http://localhost:8000'
    // });
    logger.info('Chroma vector database initialized');
  }

  /**
   * Create the Supabase service client and attempt to provision the vector
   * tables. Failure is non-fatal: client is nulled and the app continues
   * without vector storage.
   */
  private async initializeSupabase() {
    try {
      const { getSupabaseServiceClient } = await import('../config/supabase');
      this.client = getSupabaseServiceClient();
      // Create the document_chunks table if it doesn't exist
      await this.createSupabaseVectorTables();
      logger.info('Supabase vector database initialized successfully');
    } catch (error) {
      logger.error('Failed to initialize Supabase vector database', error);
      // Don't throw error, just log it and continue without vector DB
      this.client = null;
    }
  }

  /**
   * Best-effort schema setup via server-side RPCs ('enable_pgvector',
   * 'create_document_chunks_table'). These RPC functions must already exist in
   * the Supabase project; if they don't, this logs a warning telling the
   * operator to run the setup SQL manually, and the service carries on.
   */
  private async createSupabaseVectorTables() {
    try {
      // Enable pgvector extension
      await this.client.rpc('enable_pgvector');
      // Create document_chunks table with vector support
      const { error } = await this.client.rpc('create_document_chunks_table');
      if (error && !error.message.includes('already exists')) {
        throw error;
      }
      logger.info('Supabase vector tables created successfully');
    } catch (error) {
      logger.warn('Could not create vector tables automatically. Please run the setup SQL manually:', error);
    }
  }

  /**
   * Generate embeddings for text using OpenAI or Anthropic with caching
   *
   * Prefers OpenAI (real embedding model) when an API key is configured,
   * otherwise falls back to a local pseudo-embedding for "Claude". Results are
   * cached for CACHE_TTL keyed by a hash of the input text.
   *
   * @param text source text; OpenAI path truncates to the first 8000 chars
   * @returns a 1536-dimension embedding vector
   * @throws when neither API key is configured, or the provider call fails
   */
  async generateEmbeddings(text: string): Promise<number[]> {
    try {
      // Check cache first
      const cacheKey = this.generateEmbeddingHash(text);
      const cached = this.semanticCache.get(cacheKey);
      if (cached && Date.now() - cached.timestamp < this.CACHE_TTL) {
        logger.debug('Using cached embedding');
        return cached.embedding;
      }
      // Use OpenAI embeddings by default (more reliable than custom Claude embeddings)
      let embedding: number[];
      if (config.llm.openaiApiKey) {
        embedding = await this.generateOpenAIEmbeddings(text);
      } else if (config.llm.anthropicApiKey) {
        embedding = await this.generateClaudeEmbeddings(text);
      } else {
        throw new Error('No API key available for embedding generation');
      }
      // Cache the result
      this.semanticCache.set(cacheKey, {
        embedding,
        timestamp: Date.now()
      });
      return embedding;
    } catch (error) {
      logger.error('Failed to generate embeddings', error);
      throw error;
    }
  }

  /**
   * Call OpenAI's embeddings endpoint (text-embedding-3-small, 1536 dims).
   * NOTE(review): a failed/empty response yields `[]` rather than an error,
   * and that empty vector would then be cached — verify this is intended.
   */
  private async generateOpenAIEmbeddings(text: string): Promise<number[]> {
    const { OpenAI } = await import('openai');
    const openai = new OpenAI({ apiKey: config.llm.openaiApiKey });
    const response = await openai.embeddings.create({
      model: 'text-embedding-3-small', // Using small model for compatibility with pgvector
      input: text.substring(0, 8000), // Limit text length
    });
    return response.data[0]?.embedding || [];
  }

  /**
   * Hand-rolled, deterministic pseudo-embedding used when only an Anthropic
   * key is available. It is NOT a learned embedding: each word is hashed to
   * one of 1536 buckets, weighted by domain term lists (financial > business >
   * industry terms) and damped by 1/sqrt(position), then L2-normalized.
   * NOTE(review): vectors from this path are not comparable with OpenAI
   * vectors — mixing both providers in one index would degrade search quality.
   */
  private async generateClaudeEmbeddings(text: string): Promise<number[]> {
    // Use a more sophisticated approach for Claude
    // Generate semantic features using text analysis
    const words = text.toLowerCase().match(/\b\w+\b/g) || [];
    const embedding = new Array(1536).fill(0); // Updated to 1536 dimensions to match small model
    // Create semantic clusters for financial, business, and market terms
    const financialTerms = ['revenue', 'ebitda', 'profit', 'margin', 'cash', 'debt', 'equity', 'growth', 'valuation', 'earnings', 'income', 'expenses', 'assets', 'liabilities'];
    const businessTerms = ['customer', 'product', 'service', 'market', 'competition', 'operation', 'management', 'strategy', 'business', 'company', 'industry'];
    const industryTerms = ['manufacturing', 'technology', 'healthcare', 'consumer', 'industrial', 'software', 'retail', 'finance', 'energy', 'telecommunications'];
    // Weight embeddings based on domain relevance
    words.forEach((word, index) => {
      let weight = 1;
      if (financialTerms.includes(word)) weight = 3;
      else if (businessTerms.includes(word)) weight = 2;
      else if (industryTerms.includes(word)) weight = 1.5;
      const hash = this.hashString(word);
      const position = Math.abs(hash) % 1536;
      embedding[position] = Math.min(1, embedding[position] + (weight / Math.sqrt(index + 1)));
    });
    // Normalize embedding
    const magnitude = Math.sqrt(embedding.reduce((sum: number, val: number) => sum + val * val, 0));
    return magnitude > 0 ? embedding.map(val => val / magnitude) : embedding;
  }

  /** Classic 32-bit string hash (djb2-style shift/subtract). May be negative. */
  private hashString(str: string): number {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // Convert to 32-bit integer
    }
    return hash;
  }

  /**
   * Cache key for the embedding cache — same 32-bit hash as `hashString` but
   * over the full text, returned as a decimal string. See the collision NOTE
   * on `semanticCache`.
   */
  private generateEmbeddingHash(text: string): string {
    // Simple hash for caching
    let hash = 0;
    for (let i = 0; i < text.length; i++) {
      const char = text.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash;
    }
    return hash.toString();
  }

  /**
   * Expand query with synonyms and related terms for better search
   *
   * Returns the original query plus static financial synonyms for any matching
   * whole word, plus "<industry> sector"/"<industry> industry" variants when an
   * industry keyword appears anywhere in the query. Duplicates removed.
   */
  async expandQuery(query: string): Promise<string[]> {
    const expandedTerms = [query];
    // Add financial synonyms
    const financialSynonyms: Record<string, string[]> = {
      'revenue': ['sales', 'income', 'top line', 'gross revenue'],
      'profit': ['earnings', 'net income', 'bottom line', 'profitability'],
      'ebitda': ['earnings before interest', 'operating profit', 'operating income'],
      'margin': ['profit margin', 'gross margin', 'operating margin'],
      'growth': ['expansion', 'increase', 'rise', 'improvement'],
      'market': ['industry', 'sector', 'business environment', 'competitive landscape'],
      'customer': ['client', 'buyer', 'end user', 'consumer'],
      'product': ['service', 'offering', 'solution', 'platform']
    };
    const queryWords = query.toLowerCase().split(/\s+/);
    queryWords.forEach(word => {
      if (financialSynonyms[word]) {
        expandedTerms.push(...financialSynonyms[word]);
      }
    });
    // Add industry-specific terms
    const industryTerms = ['technology', 'healthcare', 'manufacturing', 'retail', 'finance'];
    industryTerms.forEach(industry => {
      if (query.toLowerCase().includes(industry)) {
        expandedTerms.push(industry + ' sector', industry + ' industry');
      }
    });
    return [...new Set(expandedTerms)]; // Remove duplicates
  }

  /**
   * Store document chunks with embeddings
   *
   * Dispatches to the provider-specific writer. Deliberately swallows ALL
   * errors (including "DB not initialized") so that document upload never
   * fails because of vector storage — callers get no signal beyond logs.
   */
  async storeDocumentChunks(chunks: DocumentChunk[]): Promise<void> {
    try {
      const isInitialized = await this.ensureInitialized();
      if (!isInitialized) {
        logger.warn('Vector database not initialized, skipping chunk storage');
        return;
      }
      switch (this.provider) {
        case 'pinecone':
          await this.storeInPinecone(chunks);
          break;
        case 'pgvector':
          await this.storeInPgVector(chunks);
          break;
        case 'chroma':
          await this.storeInChroma(chunks);
          break;
        case 'supabase':
          await this.storeInSupabase(chunks);
          break;
        default:
          logger.warn(`Vector database provider ${this.provider} not supported for storage`);
      }
    } catch (error) {
      // Log the error but don't fail the entire upload process
      logger.error('Failed to store document chunks in vector database:', error);
      logger.warn('Continuing with upload process without vector storage');
      // Don't throw the error - let the upload continue
    }
  }

  /**
   * Search for similar content with query expansion
   *
   * Embeds each (optionally expanded) query, runs the provider search per
   * query, then merges + dedupes across all queries. Returns [] when the
   * vector DB is unavailable; throws a generic Error on search failure.
   *
   * @param options.enableQueryExpansion expansion is ON unless explicitly false
   * @param options.similarity minimum similarity threshold (provider-dependent)
   * @param options.filters provider-specific metadata filters
   */
  async search(
    query: string,
    options: {
      documentId?: string;
      limit?: number;
      similarity?: number;
      filters?: Record<string, any>;
      enableQueryExpansion?: boolean;
    } = {}
  ): Promise<VectorSearchResult[]> {
    const initialized = await this.ensureInitialized();
    if (!initialized) {
      logger.warn('Vector database not available, returning empty search results');
      return [];
    }
    try {
      let queries = [query];
      // Enable query expansion by default for better results
      if (options.enableQueryExpansion !== false) {
        queries = await this.expandQuery(query);
      }
      const allResults: VectorSearchResult[] = [];
      // NOTE(review): embeddings/searches run sequentially per expanded query;
      // expansion can produce many queries, multiplying latency and API cost.
      for (const expandedQuery of queries) {
        const embedding = await this.generateEmbeddings(expandedQuery);
        let results: VectorSearchResult[];
        switch (this.provider) {
          case 'pinecone':
            results = await this.searchPinecone(embedding, options);
            break;
          case 'pgvector':
            results = await this.searchPgVector(embedding, options);
            break;
          case 'chroma':
            results = await this.searchChroma(embedding, options);
            break;
          case 'supabase':
            results = await this.searchSupabase(embedding, options);
            break;
          default:
            throw new Error(`Unsupported provider: ${this.provider}`);
        }
        allResults.push(...results);
      }
      // Merge and deduplicate results
      const mergedResults = this.mergeAndDeduplicateResults(allResults, options.limit || 10);
      return mergedResults;
    } catch (error) {
      logger.error('Vector search failed', error);
      throw new Error('Search operation failed');
    }
  }

  /**
   * Merge and deduplicate search results
   *
   * Sorts by descending similarity, dedupes on documentId + first 100 chars of
   * chunk content, truncates to `limit`.
   * NOTE(review): this reads `similarityScore`/`chunkContent`, but the
   * provider mappers (`searchChroma`, `searchSupabase`) build objects with
   * `score`/`content` instead — one side must be wrong against the
   * VectorSearchResult model type; as written the sort key and dedupe key are
   * undefined for those providers. Confirm the model's field names and align.
   * NOTE(review): Chroma returns *distances* (lower = more similar), which
   * would additionally invert this descending sort.
   */
  private mergeAndDeduplicateResults(results: VectorSearchResult[], limit: number): VectorSearchResult[] {
    const seen = new Set<string>();
    const merged: VectorSearchResult[] = [];
    // Sort by similarity score
    results.sort((a, b) => b.similarityScore - a.similarityScore);
    for (const result of results) {
      const key = `${result.documentId}-${result.chunkContent.substring(0, 100)}`;
      if (!seen.has(key)) {
        seen.add(key);
        merged.push(result);
        if (merged.length >= limit) break;
      }
    }
    return merged;
  }

  /**
   * Get relevant sections for RAG processing
   *
   * Thin wrapper over `search` scoped to one document; adapts results into
   * DocumentChunk shape (embedding left empty, timestamps synthesized).
   * NOTE(review): reads `result.content`/`result.metadata` — see the
   * field-name mismatch flagged on `mergeAndDeduplicateResults`.
   */
  async getRelevantSections(
    query: string,
    documentId: string,
    limit: number = 5
  ): Promise<DocumentChunk[]> {
    const results = await this.search(query, {
      documentId,
      limit,
      similarity: 0.7
    });
    return results.map((result: any) => ({
      id: result.id,
      documentId,
      chunkIndex: result.metadata?.chunkIndex || 0,
      content: result.content,
      metadata: result.metadata,
      embedding: [], // Not needed for return
      createdAt: new Date(),
      updatedAt: new Date()
    }));
  }

  /**
   * Find similar documents across the database
   *
   * Uses the document's FIRST chunk as the similarity reference and excludes
   * the source document via a Mongo-style `$ne` filter.
   * NOTE(review): `$ne` filter syntax is not obviously honored by the Supabase
   * RPC path, which ignores `options.filters` entirely — verify.
   */
  async findSimilarDocuments(
    documentId: string,
    limit: number = 10
  ): Promise<VectorSearchResult[]> {
    // Get document chunks
    const documentChunks = await this.getDocumentChunks(documentId);
    if (documentChunks.length === 0) return [];
    // Use the first chunk as a reference
    const referenceChunk = documentChunks[0];
    if (!referenceChunk) return [];
    return await this.search(referenceChunk.content, {
      limit,
      similarity: 0.6,
      filters: { documentId: { $ne: documentId } }
    });
  }

  /**
   * Industry-specific search
   *
   * Convenience wrapper that filters on a lowercased `industry` metadata key.
   */
  async searchByIndustry(
    industry: string,
    query: string,
    limit: number = 20
  ): Promise<VectorSearchResult[]> {
    return await this.search(query, {
      limit,
      filters: { industry: industry.toLowerCase() }
    });
  }

  /**
   * Get vector database statistics
   *
   * Delegates to the model layer; unlike other methods here, this RETHROWS on
   * failure rather than degrading silently.
   */
  async getVectorDatabaseStats(): Promise<{
    totalChunks: number;
    totalDocuments: number;
    averageSimilarity: number;
  }> {
    try {
      const stats = await VectorDatabaseModel.getVectorDatabaseStats();
      return stats;
    } catch (error) {
      logger.error('Failed to get vector database stats', error);
      throw error;
    }
  }

  // Private implementation methods for different providers

  /** Not implemented — always throws (caught and swallowed by storeDocumentChunks). */
  private async storeInPinecone(_chunks: DocumentChunk[]): Promise<void> {
    logger.warn('Pinecone provider not fully implemented');
    throw new Error('Pinecone provider not available');
  }

  /** Deprecated backend — always throws (caught and swallowed by storeDocumentChunks). */
  private async storeInPgVector(_chunks: DocumentChunk[]): Promise<void> {
    logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
    throw new Error('pgvector provider not available in Firebase environment. Use Supabase instead.');
  }

  /**
   * Write chunks to a Chroma collection named 'cim_documents'.
   * NOTE(review): no embeddings are passed to `collection.add`, so Chroma
   * would need its own embedding function configured server-side — confirm.
   */
  private async storeInChroma(chunks: DocumentChunk[]): Promise<void> {
    const collection = await this.client.getOrCreateCollection({
      name: 'cim_documents'
    });
    const documents = chunks.map(chunk => chunk.content);
    const metadatas = chunks.map(chunk => ({
      ...chunk.metadata,
      documentId: chunk.documentId
    }));
    const ids = chunks.map(chunk => chunk.id);
    await collection.add({
      ids,
      documents,
      metadatas
    });
  }

  /** Not implemented — always throws. */
  private async searchPinecone(
    _embedding: number[],
    _options: any
  ): Promise<VectorSearchResult[]> {
    logger.warn('Pinecone provider not fully implemented');
    throw new Error('Pinecone provider not available');
  }

  /** Deprecated backend — always throws. */
  private async searchPgVector(
    _embedding: number[],
    _options: any
  ): Promise<VectorSearchResult[]> {
    logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
    throw new Error('pgvector provider not available in Firebase environment. Use Supabase instead.');
  }

  /**
   * Query Chroma by embedding.
   * NOTE(review): maps Chroma *distances* into `score` and `content` — both
   * names conflict with what `mergeAndDeduplicateResults` consumes, and
   * distance ordering is inverted relative to similarity. See notes there.
   */
  private async searchChroma(
    embedding: number[],
    options: any
  ): Promise<VectorSearchResult[]> {
    const collection = await this.client.getCollection({
      name: 'cim_documents'
    });
    const results = await collection.query({
      queryEmbeddings: [embedding],
      nResults: options.limit || 10,
      where: options.filters
    });
    return results.documents[0].map((doc: string, index: number) => ({
      id: results.ids[0][index],
      score: results.distances[0][index],
      metadata: results.metadatas[0][index],
      content: doc
    }));
  }

  /**
   * Upsert chunk rows (snake_case columns) into Supabase 'document_chunks'.
   * Missing-table/column errors are downgraded to a warning; all other errors
   * are logged and swallowed so uploads continue without vector storage.
   * NOTE(review): the `Promise.all`/`async` wrapper around `chunks.map` is
   * unnecessary — the mapper performs no awaits.
   */
  private async storeInSupabase(chunks: DocumentChunk[]): Promise<void> {
    try {
      // Transform chunks to include embeddings
      const supabaseRows = await Promise.all(
        chunks.map(async (chunk) => ({
          id: chunk.id,
          document_id: chunk.documentId,
          chunk_index: chunk.chunkIndex,
          content: chunk.content,
          embedding: chunk.embedding,
          metadata: chunk.metadata || {}
        }))
      );
      const { error } = await this.client
        .from('document_chunks')
        .upsert(supabaseRows);
      if (error) {
        // Check if it's a table/column missing error
        if (error.message && (error.message.includes('chunkIndex') || error.message.includes('document_chunks'))) {
          logger.warn('Vector database table/columns not available, skipping vector storage:', error.message);
          return; // Don't throw, just skip vector storage
        }
        throw error;
      }
      logger.info(`Successfully stored ${chunks.length} chunks in Supabase`);
    } catch (error) {
      logger.error('Failed to store chunks in Supabase:', error);
      // Don't throw the error - let the upload continue without vector storage
      logger.warn('Continuing upload process without vector storage');
    }
  }

  /**
   * Similarity search via the 'match_documents' Postgres function.
   * Returns [] on any error (never throws).
   * NOTE(review): chaining `.from().select().rpc()` is not standard
   * supabase-js usage — `rpc` is normally called directly on the client
   * (`client.rpc('match_documents', ...)`), and `.eq('document_id', ...)` on
   * an RPC result only works if the function returns a filterable set.
   * Verify against supabase-js docs; `options.filters` is also ignored here.
   * NOTE(review): the result mapping uses `score`/`content` — see the
   * field-name mismatch flagged on `mergeAndDeduplicateResults`.
   */
  private async searchSupabase(
    embedding: number[],
    options: {
      documentId?: string;
      limit?: number;
      similarity?: number;
      filters?: Record<string, any>;
    }
  ): Promise<VectorSearchResult[]> {
    try {
      let query = this.client
        .from('document_chunks')
        .select('id, content, metadata, document_id')
        .rpc('match_documents', {
          query_embedding: embedding,
          match_threshold: options.similarity || 0.7,
          match_count: options.limit || 10
        });
      // Add document filter if specified
      if (options.documentId) {
        query = query.eq('document_id', options.documentId);
      }
      const { data, error } = await query;
      if (error) {
        throw error;
      }
      return data.map((row: any) => ({
        id: row.id,
        score: row.similarity,
        metadata: {
          ...row.metadata,
          documentId: row.document_id
        },
        content: row.content
      }));
    } catch (error) {
      logger.error('Failed to search in Supabase:', error);
      return [];
    }
  }

  /** Fetch a document's stored chunks through the model layer (not the vector client). */
  private async getDocumentChunks(documentId: string): Promise<DocumentChunk[]> {
    return await VectorDatabaseModel.getDocumentChunks(documentId);
  }
}
// Shared singleton instance; provider client is created lazily on first use.
export const vectorDatabaseService = new VectorDatabaseService();