// cim_summary/backend/src/services/vectorDatabaseService.ts
// (593 lines, 19 KiB, TypeScript)
import { config } from '../config/env';
import { logger } from '../utils/logger';
import { VectorDatabaseModel, DocumentChunk, VectorSearchResult } from '../models/VectorDatabaseModel';
// Re-export types from the model
export { VectorSearchResult, DocumentChunk } from '../models/VectorDatabaseModel';
/**
 * Provider-agnostic facade over a vector database, used to store document
 * chunks with embeddings and run similarity search for RAG.
 *
 * Supported providers (selected via `config.vector.provider`):
 *  - 'supabase' : the only fully implemented backend (pgvector via Supabase RPC)
 *  - 'pinecone' / 'chroma' : client init is stubbed/commented out; store/search
 *    for pinecone throws, chroma assumes a live client
 *  - 'pgvector' : deprecated; all operations warn and throw/no-op
 *
 * Initialization is lazy: the client is created on first use, and every public
 * entry point degrades gracefully (logs + empty result / skip) when no vector
 * backend is available, so uploads and searches never hard-fail on vector-DB
 * problems alone.
 */
class VectorDatabaseService {
  // Backend selector, read once from config at construction time.
  private provider: 'pinecone' | 'pgvector' | 'chroma' | 'supabase';
  // Underlying provider client. `any` because the concrete type depends on the
  // provider (Supabase client, Chroma client, ...); null when unavailable.
  private client: any;
  // In-memory cache of text -> embedding, keyed by a 32-bit string hash of the
  // text. NOTE(review): entries are only invalidated on read (TTL check) and
  // never evicted, so this map grows without bound under varied inputs.
  // NOTE(review): a 32-bit hash of the full text can collide, which would
  // silently return the wrong cached embedding — consider a cryptographic hash.
  private semanticCache: Map<string, { embedding: number[]; timestamp: number }> = new Map();
  private readonly CACHE_TTL = 3600000; // 1 hour cache TTL

  constructor() {
    this.provider = config.vector.provider;
    // Don't initialize client immediately - do it lazily when needed
  }

  /**
   * Create the provider client on first use. Idempotent: returns immediately
   * if a client already exists. On an unknown provider, logs and leaves the
   * client null (callers treat null as "vector DB unavailable").
   */
  private async initializeClient() {
    if (this.client) return; // Already initialized
    switch (this.provider) {
      case 'pinecone':
        await this.initializePinecone();
        break;
      case 'pgvector':
        await this.initializePgVector();
        break;
      case 'chroma':
        await this.initializeChroma();
        break;
      case 'supabase':
        await this.initializeSupabase();
        break;
      default:
        logger.error(`Unsupported vector database provider: ${this.provider}`);
        this.client = null;
    }
  }

  /**
   * Lazy-init guard used by every public method.
   * @returns true when a usable client exists, false otherwise.
   */
  private async ensureInitialized() {
    if (!this.client) {
      await this.initializeClient();
    }
    return this.client !== null;
  }

  /**
   * Pinecone init is stubbed out (SDK calls commented). NOTE(review): this logs
   * success but leaves `this.client` undefined, so `ensureInitialized` will
   * re-run init on every call and report the DB as unavailable — the log line
   * is misleading. Same applies to `initializeChroma` below.
   */
  private async initializePinecone() {
    // const { Pinecone } = await import('@pinecone-database/pinecone');
    // this.client = new Pinecone({
    //   apiKey: config.vector.pineconeApiKey!,
    // });
    logger.info('Pinecone vector database initialized');
  }

  /** Deprecated backend: explicitly sets client to null so callers skip it. */
  private async initializePgVector() {
    // Note: pgvector is deprecated in favor of Supabase
    // This method is kept for backward compatibility but will not work in Firebase
    logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
    this.client = null;
  }

  /** Chroma init is stubbed out (see NOTE on initializePinecone). */
  private async initializeChroma() {
    // const { ChromaClient } = await import('chromadb');
    // this.client = new ChromaClient({
    //   path: config.vector.chromaUrl || 'http://localhost:8000'
    // });
    logger.info('Chroma vector database initialized');
  }

  /**
   * Create the Supabase service client and attempt to provision the vector
   * tables. Failure is non-fatal: client is nulled and the app continues
   * without vector storage.
   */
  private async initializeSupabase() {
    try {
      const { getSupabaseServiceClient } = await import('../config/supabase');
      this.client = getSupabaseServiceClient();
      // Create the document_chunks table if it doesn't exist
      await this.createSupabaseVectorTables();
      logger.info('Supabase vector database initialized successfully');
    } catch (error) {
      logger.error('Failed to initialize Supabase vector database', error);
      // Don't throw error, just log it and continue without vector DB
      this.client = null;
    }
  }

  /**
   * Best-effort schema setup via server-side RPCs ('enable_pgvector',
   * 'create_document_chunks_table'). These RPC functions must already exist in
   * the Supabase project; if they don't, this logs a warning telling the
   * operator to run the setup SQL manually, and the service carries on.
   */
  private async createSupabaseVectorTables() {
    try {
      // Enable pgvector extension
      await this.client.rpc('enable_pgvector');
      // Create document_chunks table with vector support
      const { error } = await this.client.rpc('create_document_chunks_table');
      if (error && !error.message.includes('already exists')) {
        throw error;
      }
      logger.info('Supabase vector tables created successfully');
    } catch (error) {
      logger.warn('Could not create vector tables automatically. Please run the setup SQL manually:', error);
    }
  }

  /**
   * Generate embeddings for text using OpenAI or Anthropic with caching
   *
   * Prefers OpenAI (real embedding model) when an API key is configured,
   * otherwise falls back to a local pseudo-embedding for "Claude". Results are
   * cached for CACHE_TTL keyed by a hash of the input text.
   *
   * @param text source text; OpenAI path truncates to the first 8000 chars
   * @returns a 1536-dimension embedding vector
   * @throws when neither API key is configured, or the provider call fails
   */
  async generateEmbeddings(text: string): Promise<number[]> {
    try {
      // Check cache first
      const cacheKey = this.generateEmbeddingHash(text);
      const cached = this.semanticCache.get(cacheKey);
      if (cached && Date.now() - cached.timestamp < this.CACHE_TTL) {
        logger.debug('Using cached embedding');
        return cached.embedding;
      }
      // Use OpenAI embeddings by default (more reliable than custom Claude embeddings)
      let embedding: number[];
      if (config.llm.openaiApiKey) {
        embedding = await this.generateOpenAIEmbeddings(text);
      } else if (config.llm.anthropicApiKey) {
        embedding = await this.generateClaudeEmbeddings(text);
      } else {
        throw new Error('No API key available for embedding generation');
      }
      // Cache the result
      this.semanticCache.set(cacheKey, {
        embedding,
        timestamp: Date.now()
      });
      return embedding;
    } catch (error) {
      logger.error('Failed to generate embeddings', error);
      throw error;
    }
  }

  /**
   * Call OpenAI's embeddings endpoint (text-embedding-3-small, 1536 dims).
   * NOTE(review): a failed/empty response yields `[]` rather than an error,
   * and that empty vector would then be cached — verify this is intended.
   */
  private async generateOpenAIEmbeddings(text: string): Promise<number[]> {
    const { OpenAI } = await import('openai');
    const openai = new OpenAI({ apiKey: config.llm.openaiApiKey });
    const response = await openai.embeddings.create({
      model: 'text-embedding-3-small', // Using small model for compatibility with pgvector
      input: text.substring(0, 8000), // Limit text length
    });
    return response.data[0]?.embedding || [];
  }

  /**
   * Hand-rolled, deterministic pseudo-embedding used when only an Anthropic
   * key is available. It is NOT a learned embedding: each word is hashed to
   * one of 1536 buckets, weighted by domain term lists (financial > business >
   * industry terms) and damped by 1/sqrt(position), then L2-normalized.
   * NOTE(review): vectors from this path are not comparable with OpenAI
   * vectors — mixing both providers in one index would degrade search quality.
   */
  private async generateClaudeEmbeddings(text: string): Promise<number[]> {
    // Use a more sophisticated approach for Claude
    // Generate semantic features using text analysis
    const words = text.toLowerCase().match(/\b\w+\b/g) || [];
    const embedding = new Array(1536).fill(0); // Updated to 1536 dimensions to match small model
    // Create semantic clusters for financial, business, and market terms
    const financialTerms = ['revenue', 'ebitda', 'profit', 'margin', 'cash', 'debt', 'equity', 'growth', 'valuation', 'earnings', 'income', 'expenses', 'assets', 'liabilities'];
    const businessTerms = ['customer', 'product', 'service', 'market', 'competition', 'operation', 'management', 'strategy', 'business', 'company', 'industry'];
    const industryTerms = ['manufacturing', 'technology', 'healthcare', 'consumer', 'industrial', 'software', 'retail', 'finance', 'energy', 'telecommunications'];
    // Weight embeddings based on domain relevance
    words.forEach((word, index) => {
      let weight = 1;
      if (financialTerms.includes(word)) weight = 3;
      else if (businessTerms.includes(word)) weight = 2;
      else if (industryTerms.includes(word)) weight = 1.5;
      const hash = this.hashString(word);
      const position = Math.abs(hash) % 1536;
      embedding[position] = Math.min(1, embedding[position] + (weight / Math.sqrt(index + 1)));
    });
    // Normalize embedding
    const magnitude = Math.sqrt(embedding.reduce((sum: number, val: number) => sum + val * val, 0));
    return magnitude > 0 ? embedding.map(val => val / magnitude) : embedding;
  }

  /** Classic 32-bit string hash (djb2-style shift/subtract). May be negative. */
  private hashString(str: string): number {
    let hash = 0;
    for (let i = 0; i < str.length; i++) {
      const char = str.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // Convert to 32-bit integer
    }
    return hash;
  }

  /**
   * Cache key for the embedding cache — same 32-bit hash as `hashString` but
   * over the full text, returned as a decimal string. See the collision NOTE
   * on `semanticCache`.
   */
  private generateEmbeddingHash(text: string): string {
    // Simple hash for caching
    let hash = 0;
    for (let i = 0; i < text.length; i++) {
      const char = text.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash;
    }
    return hash.toString();
  }

  /**
   * Expand query with synonyms and related terms for better search
   *
   * Returns the original query plus static financial synonyms for any matching
   * whole word, plus "<industry> sector"/"<industry> industry" variants when an
   * industry keyword appears anywhere in the query. Duplicates removed.
   */
  async expandQuery(query: string): Promise<string[]> {
    const expandedTerms = [query];
    // Add financial synonyms
    const financialSynonyms: Record<string, string[]> = {
      'revenue': ['sales', 'income', 'top line', 'gross revenue'],
      'profit': ['earnings', 'net income', 'bottom line', 'profitability'],
      'ebitda': ['earnings before interest', 'operating profit', 'operating income'],
      'margin': ['profit margin', 'gross margin', 'operating margin'],
      'growth': ['expansion', 'increase', 'rise', 'improvement'],
      'market': ['industry', 'sector', 'business environment', 'competitive landscape'],
      'customer': ['client', 'buyer', 'end user', 'consumer'],
      'product': ['service', 'offering', 'solution', 'platform']
    };
    const queryWords = query.toLowerCase().split(/\s+/);
    queryWords.forEach(word => {
      if (financialSynonyms[word]) {
        expandedTerms.push(...financialSynonyms[word]);
      }
    });
    // Add industry-specific terms
    const industryTerms = ['technology', 'healthcare', 'manufacturing', 'retail', 'finance'];
    industryTerms.forEach(industry => {
      if (query.toLowerCase().includes(industry)) {
        expandedTerms.push(industry + ' sector', industry + ' industry');
      }
    });
    return [...new Set(expandedTerms)]; // Remove duplicates
  }

  /**
   * Store document chunks with embeddings
   *
   * Dispatches to the provider-specific writer. Deliberately swallows ALL
   * errors (including "DB not initialized") so that document upload never
   * fails because of vector storage — callers get no signal beyond logs.
   */
  async storeDocumentChunks(chunks: DocumentChunk[]): Promise<void> {
    try {
      const isInitialized = await this.ensureInitialized();
      if (!isInitialized) {
        logger.warn('Vector database not initialized, skipping chunk storage');
        return;
      }
      switch (this.provider) {
        case 'pinecone':
          await this.storeInPinecone(chunks);
          break;
        case 'pgvector':
          await this.storeInPgVector(chunks);
          break;
        case 'chroma':
          await this.storeInChroma(chunks);
          break;
        case 'supabase':
          await this.storeInSupabase(chunks);
          break;
        default:
          logger.warn(`Vector database provider ${this.provider} not supported for storage`);
      }
    } catch (error) {
      // Log the error but don't fail the entire upload process
      logger.error('Failed to store document chunks in vector database:', error);
      logger.warn('Continuing with upload process without vector storage');
      // Don't throw the error - let the upload continue
    }
  }

  /**
   * Search for similar content with query expansion
   *
   * Embeds each (optionally expanded) query, runs the provider search per
   * query, then merges + dedupes across all queries. Returns [] when the
   * vector DB is unavailable; throws a generic Error on search failure.
   *
   * @param options.enableQueryExpansion expansion is ON unless explicitly false
   * @param options.similarity minimum similarity threshold (provider-dependent)
   * @param options.filters provider-specific metadata filters
   */
  async search(
    query: string,
    options: {
      documentId?: string;
      limit?: number;
      similarity?: number;
      filters?: Record<string, any>;
      enableQueryExpansion?: boolean;
    } = {}
  ): Promise<VectorSearchResult[]> {
    const initialized = await this.ensureInitialized();
    if (!initialized) {
      logger.warn('Vector database not available, returning empty search results');
      return [];
    }
    try {
      let queries = [query];
      // Enable query expansion by default for better results
      if (options.enableQueryExpansion !== false) {
        queries = await this.expandQuery(query);
      }
      const allResults: VectorSearchResult[] = [];
      // NOTE(review): embeddings/searches run sequentially per expanded query;
      // expansion can produce many queries, multiplying latency and API cost.
      for (const expandedQuery of queries) {
        const embedding = await this.generateEmbeddings(expandedQuery);
        let results: VectorSearchResult[];
        switch (this.provider) {
          case 'pinecone':
            results = await this.searchPinecone(embedding, options);
            break;
          case 'pgvector':
            results = await this.searchPgVector(embedding, options);
            break;
          case 'chroma':
            results = await this.searchChroma(embedding, options);
            break;
          case 'supabase':
            results = await this.searchSupabase(embedding, options);
            break;
          default:
            throw new Error(`Unsupported provider: ${this.provider}`);
        }
        allResults.push(...results);
      }
      // Merge and deduplicate results
      const mergedResults = this.mergeAndDeduplicateResults(allResults, options.limit || 10);
      return mergedResults;
    } catch (error) {
      logger.error('Vector search failed', error);
      throw new Error('Search operation failed');
    }
  }

  /**
   * Merge and deduplicate search results
   *
   * Sorts by descending similarity, dedupes on documentId + first 100 chars of
   * chunk content, truncates to `limit`.
   * NOTE(review): this reads `similarityScore`/`chunkContent`, but the
   * provider mappers (`searchChroma`, `searchSupabase`) build objects with
   * `score`/`content` instead — one side must be wrong against the
   * VectorSearchResult model type; as written the sort key and dedupe key are
   * undefined for those providers. Confirm the model's field names and align.
   * NOTE(review): Chroma returns *distances* (lower = more similar), which
   * would additionally invert this descending sort.
   */
  private mergeAndDeduplicateResults(results: VectorSearchResult[], limit: number): VectorSearchResult[] {
    const seen = new Set<string>();
    const merged: VectorSearchResult[] = [];
    // Sort by similarity score
    results.sort((a, b) => b.similarityScore - a.similarityScore);
    for (const result of results) {
      const key = `${result.documentId}-${result.chunkContent.substring(0, 100)}`;
      if (!seen.has(key)) {
        seen.add(key);
        merged.push(result);
        if (merged.length >= limit) break;
      }
    }
    return merged;
  }

  /**
   * Get relevant sections for RAG processing
   *
   * Thin wrapper over `search` scoped to one document; adapts results into
   * DocumentChunk shape (embedding left empty, timestamps synthesized).
   * NOTE(review): reads `result.content`/`result.metadata` — see the
   * field-name mismatch flagged on `mergeAndDeduplicateResults`.
   */
  async getRelevantSections(
    query: string,
    documentId: string,
    limit: number = 5
  ): Promise<DocumentChunk[]> {
    const results = await this.search(query, {
      documentId,
      limit,
      similarity: 0.7
    });
    return results.map((result: any) => ({
      id: result.id,
      documentId,
      chunkIndex: result.metadata?.chunkIndex || 0,
      content: result.content,
      metadata: result.metadata,
      embedding: [], // Not needed for return
      createdAt: new Date(),
      updatedAt: new Date()
    }));
  }

  /**
   * Find similar documents across the database
   *
   * Uses the document's FIRST chunk as the similarity reference and excludes
   * the source document via a Mongo-style `$ne` filter.
   * NOTE(review): `$ne` filter syntax is not obviously honored by the Supabase
   * RPC path, which ignores `options.filters` entirely — verify.
   */
  async findSimilarDocuments(
    documentId: string,
    limit: number = 10
  ): Promise<VectorSearchResult[]> {
    // Get document chunks
    const documentChunks = await this.getDocumentChunks(documentId);
    if (documentChunks.length === 0) return [];
    // Use the first chunk as a reference
    const referenceChunk = documentChunks[0];
    if (!referenceChunk) return [];
    return await this.search(referenceChunk.content, {
      limit,
      similarity: 0.6,
      filters: { documentId: { $ne: documentId } }
    });
  }

  /**
   * Industry-specific search
   *
   * Convenience wrapper that filters on a lowercased `industry` metadata key.
   */
  async searchByIndustry(
    industry: string,
    query: string,
    limit: number = 20
  ): Promise<VectorSearchResult[]> {
    return await this.search(query, {
      limit,
      filters: { industry: industry.toLowerCase() }
    });
  }

  /**
   * Get vector database statistics
   *
   * Delegates to the model layer; unlike other methods here, this RETHROWS on
   * failure rather than degrading silently.
   */
  async getVectorDatabaseStats(): Promise<{
    totalChunks: number;
    totalDocuments: number;
    averageSimilarity: number;
  }> {
    try {
      const stats = await VectorDatabaseModel.getVectorDatabaseStats();
      return stats;
    } catch (error) {
      logger.error('Failed to get vector database stats', error);
      throw error;
    }
  }

  // Private implementation methods for different providers

  /** Not implemented — always throws (caught and swallowed by storeDocumentChunks). */
  private async storeInPinecone(_chunks: DocumentChunk[]): Promise<void> {
    logger.warn('Pinecone provider not fully implemented');
    throw new Error('Pinecone provider not available');
  }

  /** Deprecated backend — always throws (caught and swallowed by storeDocumentChunks). */
  private async storeInPgVector(_chunks: DocumentChunk[]): Promise<void> {
    logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
    throw new Error('pgvector provider not available in Firebase environment. Use Supabase instead.');
  }

  /**
   * Write chunks to a Chroma collection named 'cim_documents'.
   * NOTE(review): no embeddings are passed to `collection.add`, so Chroma
   * would need its own embedding function configured server-side — confirm.
   */
  private async storeInChroma(chunks: DocumentChunk[]): Promise<void> {
    const collection = await this.client.getOrCreateCollection({
      name: 'cim_documents'
    });
    const documents = chunks.map(chunk => chunk.content);
    const metadatas = chunks.map(chunk => ({
      ...chunk.metadata,
      documentId: chunk.documentId
    }));
    const ids = chunks.map(chunk => chunk.id);
    await collection.add({
      ids,
      documents,
      metadatas
    });
  }

  /** Not implemented — always throws. */
  private async searchPinecone(
    _embedding: number[],
    _options: any
  ): Promise<VectorSearchResult[]> {
    logger.warn('Pinecone provider not fully implemented');
    throw new Error('Pinecone provider not available');
  }

  /** Deprecated backend — always throws. */
  private async searchPgVector(
    _embedding: number[],
    _options: any
  ): Promise<VectorSearchResult[]> {
    logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
    throw new Error('pgvector provider not available in Firebase environment. Use Supabase instead.');
  }

  /**
   * Query Chroma by embedding.
   * NOTE(review): maps Chroma *distances* into `score` and `content` — both
   * names conflict with what `mergeAndDeduplicateResults` consumes, and
   * distance ordering is inverted relative to similarity. See notes there.
   */
  private async searchChroma(
    embedding: number[],
    options: any
  ): Promise<VectorSearchResult[]> {
    const collection = await this.client.getCollection({
      name: 'cim_documents'
    });
    const results = await collection.query({
      queryEmbeddings: [embedding],
      nResults: options.limit || 10,
      where: options.filters
    });
    return results.documents[0].map((doc: string, index: number) => ({
      id: results.ids[0][index],
      score: results.distances[0][index],
      metadata: results.metadatas[0][index],
      content: doc
    }));
  }

  /**
   * Upsert chunk rows (snake_case columns) into Supabase 'document_chunks'.
   * Missing-table/column errors are downgraded to a warning; all other errors
   * are logged and swallowed so uploads continue without vector storage.
   * NOTE(review): the `Promise.all`/`async` wrapper around `chunks.map` is
   * unnecessary — the mapper performs no awaits.
   */
  private async storeInSupabase(chunks: DocumentChunk[]): Promise<void> {
    try {
      // Transform chunks to include embeddings
      const supabaseRows = await Promise.all(
        chunks.map(async (chunk) => ({
          id: chunk.id,
          document_id: chunk.documentId,
          chunk_index: chunk.chunkIndex,
          content: chunk.content,
          embedding: chunk.embedding,
          metadata: chunk.metadata || {}
        }))
      );
      const { error } = await this.client
        .from('document_chunks')
        .upsert(supabaseRows);
      if (error) {
        // Check if it's a table/column missing error
        if (error.message && (error.message.includes('chunkIndex') || error.message.includes('document_chunks'))) {
          logger.warn('Vector database table/columns not available, skipping vector storage:', error.message);
          return; // Don't throw, just skip vector storage
        }
        throw error;
      }
      logger.info(`Successfully stored ${chunks.length} chunks in Supabase`);
    } catch (error) {
      logger.error('Failed to store chunks in Supabase:', error);
      // Don't throw the error - let the upload continue without vector storage
      logger.warn('Continuing upload process without vector storage');
    }
  }

  /**
   * Similarity search via the 'match_documents' Postgres function.
   * Returns [] on any error (never throws).
   * NOTE(review): chaining `.from().select().rpc()` is not standard
   * supabase-js usage — `rpc` is normally called directly on the client
   * (`client.rpc('match_documents', ...)`), and `.eq('document_id', ...)` on
   * an RPC result only works if the function returns a filterable set.
   * Verify against supabase-js docs; `options.filters` is also ignored here.
   * NOTE(review): the result mapping uses `score`/`content` — see the
   * field-name mismatch flagged on `mergeAndDeduplicateResults`.
   */
  private async searchSupabase(
    embedding: number[],
    options: {
      documentId?: string;
      limit?: number;
      similarity?: number;
      filters?: Record<string, any>;
    }
  ): Promise<VectorSearchResult[]> {
    try {
      let query = this.client
        .from('document_chunks')
        .select('id, content, metadata, document_id')
        .rpc('match_documents', {
          query_embedding: embedding,
          match_threshold: options.similarity || 0.7,
          match_count: options.limit || 10
        });
      // Add document filter if specified
      if (options.documentId) {
        query = query.eq('document_id', options.documentId);
      }
      const { data, error } = await query;
      if (error) {
        throw error;
      }
      return data.map((row: any) => ({
        id: row.id,
        score: row.similarity,
        metadata: {
          ...row.metadata,
          documentId: row.document_id
        },
        content: row.content
      }));
    } catch (error) {
      logger.error('Failed to search in Supabase:', error);
      return [];
    }
  }

  /** Fetch a document's stored chunks through the model layer (not the vector client). */
  private async getDocumentChunks(documentId: string): Promise<DocumentChunk[]> {
    return await VectorDatabaseModel.getDocumentChunks(documentId);
  }
}
// Shared singleton instance; provider client is created lazily on first use.
export const vectorDatabaseService = new VectorDatabaseService();