// (viewer metadata removed: "593 lines, 19 KiB, TypeScript" — not part of the source)
import { config } from '../config/env';
|
|
import { logger } from '../utils/logger';
|
|
import { VectorDatabaseModel, DocumentChunk, VectorSearchResult } from '../models/VectorDatabaseModel';
|
|
|
|
// Re-export types from the model
|
|
export { VectorSearchResult, DocumentChunk } from '../models/VectorDatabaseModel';
|
|
|
|
class VectorDatabaseService {
|
|
private provider: 'pinecone' | 'pgvector' | 'chroma' | 'supabase';
|
|
private client: any;
|
|
private semanticCache: Map<string, { embedding: number[]; timestamp: number }> = new Map();
|
|
private readonly CACHE_TTL = 3600000; // 1 hour cache TTL
|
|
|
|
constructor() {
|
|
this.provider = config.vector.provider;
|
|
// Don't initialize client immediately - do it lazily when needed
|
|
}
|
|
|
|
private async initializeClient() {
|
|
if (this.client) return; // Already initialized
|
|
|
|
switch (this.provider) {
|
|
case 'pinecone':
|
|
await this.initializePinecone();
|
|
break;
|
|
case 'pgvector':
|
|
await this.initializePgVector();
|
|
break;
|
|
case 'chroma':
|
|
await this.initializeChroma();
|
|
break;
|
|
case 'supabase':
|
|
await this.initializeSupabase();
|
|
break;
|
|
default:
|
|
logger.error(`Unsupported vector database provider: ${this.provider}`);
|
|
this.client = null;
|
|
}
|
|
}
|
|
|
|
private async ensureInitialized() {
|
|
if (!this.client) {
|
|
await this.initializeClient();
|
|
}
|
|
return this.client !== null;
|
|
}
|
|
|
|
private async initializePinecone() {
|
|
// const { Pinecone } = await import('@pinecone-database/pinecone');
|
|
// this.client = new Pinecone({
|
|
// apiKey: config.vector.pineconeApiKey!,
|
|
// });
|
|
logger.info('Pinecone vector database initialized');
|
|
}
|
|
|
|
private async initializePgVector() {
|
|
// Note: pgvector is deprecated in favor of Supabase
|
|
// This method is kept for backward compatibility but will not work in Firebase
|
|
logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
|
|
this.client = null;
|
|
}
|
|
|
|
|
|
private async initializeChroma() {
|
|
// const { ChromaClient } = await import('chromadb');
|
|
// this.client = new ChromaClient({
|
|
// path: config.vector.chromaUrl || 'http://localhost:8000'
|
|
// });
|
|
logger.info('Chroma vector database initialized');
|
|
}
|
|
|
|
private async initializeSupabase() {
|
|
try {
|
|
const { getSupabaseServiceClient } = await import('../config/supabase');
|
|
this.client = getSupabaseServiceClient();
|
|
|
|
// Create the document_chunks table if it doesn't exist
|
|
await this.createSupabaseVectorTables();
|
|
|
|
logger.info('Supabase vector database initialized successfully');
|
|
} catch (error) {
|
|
logger.error('Failed to initialize Supabase vector database', error);
|
|
// Don't throw error, just log it and continue without vector DB
|
|
this.client = null;
|
|
}
|
|
}
|
|
|
|
private async createSupabaseVectorTables() {
|
|
try {
|
|
// Enable pgvector extension
|
|
await this.client.rpc('enable_pgvector');
|
|
|
|
// Create document_chunks table with vector support
|
|
const { error } = await this.client.rpc('create_document_chunks_table');
|
|
|
|
if (error && !error.message.includes('already exists')) {
|
|
throw error;
|
|
}
|
|
|
|
logger.info('Supabase vector tables created successfully');
|
|
} catch (error) {
|
|
logger.warn('Could not create vector tables automatically. Please run the setup SQL manually:', error);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate embeddings for text using OpenAI or Anthropic with caching
|
|
*/
|
|
async generateEmbeddings(text: string): Promise<number[]> {
|
|
try {
|
|
// Check cache first
|
|
const cacheKey = this.generateEmbeddingHash(text);
|
|
const cached = this.semanticCache.get(cacheKey);
|
|
if (cached && Date.now() - cached.timestamp < this.CACHE_TTL) {
|
|
logger.debug('Using cached embedding');
|
|
return cached.embedding;
|
|
}
|
|
|
|
// Use OpenAI embeddings by default (more reliable than custom Claude embeddings)
|
|
let embedding: number[];
|
|
|
|
if (config.llm.openaiApiKey) {
|
|
embedding = await this.generateOpenAIEmbeddings(text);
|
|
} else if (config.llm.anthropicApiKey) {
|
|
embedding = await this.generateClaudeEmbeddings(text);
|
|
} else {
|
|
throw new Error('No API key available for embedding generation');
|
|
}
|
|
|
|
// Cache the result
|
|
this.semanticCache.set(cacheKey, {
|
|
embedding,
|
|
timestamp: Date.now()
|
|
});
|
|
|
|
return embedding;
|
|
} catch (error) {
|
|
logger.error('Failed to generate embeddings', error);
|
|
throw error;
|
|
}
|
|
}
|
|
|
|
private async generateOpenAIEmbeddings(text: string): Promise<number[]> {
|
|
const { OpenAI } = await import('openai');
|
|
const openai = new OpenAI({ apiKey: config.llm.openaiApiKey });
|
|
|
|
const response = await openai.embeddings.create({
|
|
model: 'text-embedding-3-small', // Using small model for compatibility with pgvector
|
|
input: text.substring(0, 8000), // Limit text length
|
|
});
|
|
|
|
return response.data[0]?.embedding || [];
|
|
}
|
|
|
|
private async generateClaudeEmbeddings(text: string): Promise<number[]> {
|
|
// Use a more sophisticated approach for Claude
|
|
// Generate semantic features using text analysis
|
|
const words = text.toLowerCase().match(/\b\w+\b/g) || [];
|
|
const embedding = new Array(1536).fill(0); // Updated to 1536 dimensions to match small model
|
|
|
|
// Create semantic clusters for financial, business, and market terms
|
|
const financialTerms = ['revenue', 'ebitda', 'profit', 'margin', 'cash', 'debt', 'equity', 'growth', 'valuation', 'earnings', 'income', 'expenses', 'assets', 'liabilities'];
|
|
const businessTerms = ['customer', 'product', 'service', 'market', 'competition', 'operation', 'management', 'strategy', 'business', 'company', 'industry'];
|
|
const industryTerms = ['manufacturing', 'technology', 'healthcare', 'consumer', 'industrial', 'software', 'retail', 'finance', 'energy', 'telecommunications'];
|
|
|
|
// Weight embeddings based on domain relevance
|
|
words.forEach((word, index) => {
|
|
let weight = 1;
|
|
if (financialTerms.includes(word)) weight = 3;
|
|
else if (businessTerms.includes(word)) weight = 2;
|
|
else if (industryTerms.includes(word)) weight = 1.5;
|
|
|
|
const hash = this.hashString(word);
|
|
const position = Math.abs(hash) % 1536;
|
|
embedding[position] = Math.min(1, embedding[position] + (weight / Math.sqrt(index + 1)));
|
|
});
|
|
|
|
// Normalize embedding
|
|
const magnitude = Math.sqrt(embedding.reduce((sum: number, val: number) => sum + val * val, 0));
|
|
return magnitude > 0 ? embedding.map(val => val / magnitude) : embedding;
|
|
}
|
|
|
|
private hashString(str: string): number {
|
|
let hash = 0;
|
|
for (let i = 0; i < str.length; i++) {
|
|
const char = str.charCodeAt(i);
|
|
hash = ((hash << 5) - hash) + char;
|
|
hash = hash & hash; // Convert to 32-bit integer
|
|
}
|
|
return hash;
|
|
}
|
|
|
|
private generateEmbeddingHash(text: string): string {
|
|
// Simple hash for caching
|
|
let hash = 0;
|
|
for (let i = 0; i < text.length; i++) {
|
|
const char = text.charCodeAt(i);
|
|
hash = ((hash << 5) - hash) + char;
|
|
hash = hash & hash;
|
|
}
|
|
return hash.toString();
|
|
}
|
|
|
|
/**
|
|
* Expand query with synonyms and related terms for better search
|
|
*/
|
|
async expandQuery(query: string): Promise<string[]> {
|
|
const expandedTerms = [query];
|
|
|
|
// Add financial synonyms
|
|
const financialSynonyms: Record<string, string[]> = {
|
|
'revenue': ['sales', 'income', 'top line', 'gross revenue'],
|
|
'profit': ['earnings', 'net income', 'bottom line', 'profitability'],
|
|
'ebitda': ['earnings before interest', 'operating profit', 'operating income'],
|
|
'margin': ['profit margin', 'gross margin', 'operating margin'],
|
|
'growth': ['expansion', 'increase', 'rise', 'improvement'],
|
|
'market': ['industry', 'sector', 'business environment', 'competitive landscape'],
|
|
'customer': ['client', 'buyer', 'end user', 'consumer'],
|
|
'product': ['service', 'offering', 'solution', 'platform']
|
|
};
|
|
|
|
const queryWords = query.toLowerCase().split(/\s+/);
|
|
queryWords.forEach(word => {
|
|
if (financialSynonyms[word]) {
|
|
expandedTerms.push(...financialSynonyms[word]);
|
|
}
|
|
});
|
|
|
|
// Add industry-specific terms
|
|
const industryTerms = ['technology', 'healthcare', 'manufacturing', 'retail', 'finance'];
|
|
industryTerms.forEach(industry => {
|
|
if (query.toLowerCase().includes(industry)) {
|
|
expandedTerms.push(industry + ' sector', industry + ' industry');
|
|
}
|
|
});
|
|
|
|
return [...new Set(expandedTerms)]; // Remove duplicates
|
|
}
|
|
|
|
/**
|
|
* Store document chunks with embeddings
|
|
*/
|
|
async storeDocumentChunks(chunks: DocumentChunk[]): Promise<void> {
|
|
try {
|
|
const isInitialized = await this.ensureInitialized();
|
|
|
|
if (!isInitialized) {
|
|
logger.warn('Vector database not initialized, skipping chunk storage');
|
|
return;
|
|
}
|
|
|
|
switch (this.provider) {
|
|
case 'pinecone':
|
|
await this.storeInPinecone(chunks);
|
|
break;
|
|
case 'pgvector':
|
|
await this.storeInPgVector(chunks);
|
|
break;
|
|
case 'chroma':
|
|
await this.storeInChroma(chunks);
|
|
break;
|
|
case 'supabase':
|
|
await this.storeInSupabase(chunks);
|
|
break;
|
|
default:
|
|
logger.warn(`Vector database provider ${this.provider} not supported for storage`);
|
|
}
|
|
} catch (error) {
|
|
// Log the error but don't fail the entire upload process
|
|
logger.error('Failed to store document chunks in vector database:', error);
|
|
logger.warn('Continuing with upload process without vector storage');
|
|
// Don't throw the error - let the upload continue
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Search for similar content with query expansion
|
|
*/
|
|
async search(
|
|
query: string,
|
|
options: {
|
|
documentId?: string;
|
|
limit?: number;
|
|
similarity?: number;
|
|
filters?: Record<string, any>;
|
|
enableQueryExpansion?: boolean;
|
|
} = {}
|
|
): Promise<VectorSearchResult[]> {
|
|
const initialized = await this.ensureInitialized();
|
|
if (!initialized) {
|
|
logger.warn('Vector database not available, returning empty search results');
|
|
return [];
|
|
}
|
|
|
|
try {
|
|
let queries = [query];
|
|
|
|
// Enable query expansion by default for better results
|
|
if (options.enableQueryExpansion !== false) {
|
|
queries = await this.expandQuery(query);
|
|
}
|
|
|
|
const allResults: VectorSearchResult[] = [];
|
|
|
|
for (const expandedQuery of queries) {
|
|
const embedding = await this.generateEmbeddings(expandedQuery);
|
|
|
|
let results: VectorSearchResult[];
|
|
switch (this.provider) {
|
|
case 'pinecone':
|
|
results = await this.searchPinecone(embedding, options);
|
|
break;
|
|
case 'pgvector':
|
|
results = await this.searchPgVector(embedding, options);
|
|
break;
|
|
case 'chroma':
|
|
results = await this.searchChroma(embedding, options);
|
|
break;
|
|
case 'supabase':
|
|
results = await this.searchSupabase(embedding, options);
|
|
break;
|
|
default:
|
|
throw new Error(`Unsupported provider: ${this.provider}`);
|
|
}
|
|
|
|
allResults.push(...results);
|
|
}
|
|
|
|
// Merge and deduplicate results
|
|
const mergedResults = this.mergeAndDeduplicateResults(allResults, options.limit || 10);
|
|
|
|
return mergedResults;
|
|
} catch (error) {
|
|
logger.error('Vector search failed', error);
|
|
throw new Error('Search operation failed');
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Merge and deduplicate search results
|
|
*/
|
|
private mergeAndDeduplicateResults(results: VectorSearchResult[], limit: number): VectorSearchResult[] {
|
|
const seen = new Set<string>();
|
|
const merged: VectorSearchResult[] = [];
|
|
|
|
// Sort by similarity score
|
|
results.sort((a, b) => b.similarityScore - a.similarityScore);
|
|
|
|
for (const result of results) {
|
|
const key = `${result.documentId}-${result.chunkContent.substring(0, 100)}`;
|
|
if (!seen.has(key)) {
|
|
seen.add(key);
|
|
merged.push(result);
|
|
if (merged.length >= limit) break;
|
|
}
|
|
}
|
|
|
|
return merged;
|
|
}
|
|
|
|
/**
|
|
* Get relevant sections for RAG processing
|
|
*/
|
|
async getRelevantSections(
|
|
query: string,
|
|
documentId: string,
|
|
limit: number = 5
|
|
): Promise<DocumentChunk[]> {
|
|
const results = await this.search(query, {
|
|
documentId,
|
|
limit,
|
|
similarity: 0.7
|
|
});
|
|
|
|
return results.map((result: any) => ({
|
|
id: result.id,
|
|
documentId,
|
|
chunkIndex: result.metadata?.chunkIndex || 0,
|
|
content: result.content,
|
|
metadata: result.metadata,
|
|
embedding: [], // Not needed for return
|
|
createdAt: new Date(),
|
|
updatedAt: new Date()
|
|
}));
|
|
}
|
|
|
|
/**
|
|
* Find similar documents across the database
|
|
*/
|
|
async findSimilarDocuments(
|
|
documentId: string,
|
|
limit: number = 10
|
|
): Promise<VectorSearchResult[]> {
|
|
// Get document chunks
|
|
const documentChunks = await this.getDocumentChunks(documentId);
|
|
if (documentChunks.length === 0) return [];
|
|
|
|
// Use the first chunk as a reference
|
|
const referenceChunk = documentChunks[0];
|
|
if (!referenceChunk) return [];
|
|
|
|
return await this.search(referenceChunk.content, {
|
|
limit,
|
|
similarity: 0.6,
|
|
filters: { documentId: { $ne: documentId } }
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Industry-specific search
|
|
*/
|
|
async searchByIndustry(
|
|
industry: string,
|
|
query: string,
|
|
limit: number = 20
|
|
): Promise<VectorSearchResult[]> {
|
|
return await this.search(query, {
|
|
limit,
|
|
filters: { industry: industry.toLowerCase() }
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Get vector database statistics
|
|
*/
|
|
async getVectorDatabaseStats(): Promise<{
|
|
totalChunks: number;
|
|
totalDocuments: number;
|
|
averageSimilarity: number;
|
|
}> {
|
|
try {
|
|
const stats = await VectorDatabaseModel.getVectorDatabaseStats();
|
|
return stats;
|
|
} catch (error) {
|
|
logger.error('Failed to get vector database stats', error);
|
|
throw error;
|
|
}
|
|
}
|
|
|
|
// Private implementation methods for different providers
|
|
private async storeInPinecone(_chunks: DocumentChunk[]): Promise<void> {
|
|
logger.warn('Pinecone provider not fully implemented');
|
|
throw new Error('Pinecone provider not available');
|
|
}
|
|
|
|
private async storeInPgVector(_chunks: DocumentChunk[]): Promise<void> {
|
|
logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
|
|
throw new Error('pgvector provider not available in Firebase environment. Use Supabase instead.');
|
|
}
|
|
|
|
private async storeInChroma(chunks: DocumentChunk[]): Promise<void> {
|
|
const collection = await this.client.getOrCreateCollection({
|
|
name: 'cim_documents'
|
|
});
|
|
|
|
const documents = chunks.map(chunk => chunk.content);
|
|
const metadatas = chunks.map(chunk => ({
|
|
...chunk.metadata,
|
|
documentId: chunk.documentId
|
|
}));
|
|
const ids = chunks.map(chunk => chunk.id);
|
|
|
|
await collection.add({
|
|
ids,
|
|
documents,
|
|
metadatas
|
|
});
|
|
}
|
|
|
|
private async searchPinecone(
|
|
_embedding: number[],
|
|
_options: any
|
|
): Promise<VectorSearchResult[]> {
|
|
logger.warn('Pinecone provider not fully implemented');
|
|
throw new Error('Pinecone provider not available');
|
|
}
|
|
|
|
private async searchPgVector(
|
|
_embedding: number[],
|
|
_options: any
|
|
): Promise<VectorSearchResult[]> {
|
|
logger.warn('pgvector provider is deprecated. Use Supabase instead for cloud deployment.');
|
|
throw new Error('pgvector provider not available in Firebase environment. Use Supabase instead.');
|
|
}
|
|
|
|
private async searchChroma(
|
|
embedding: number[],
|
|
options: any
|
|
): Promise<VectorSearchResult[]> {
|
|
const collection = await this.client.getCollection({
|
|
name: 'cim_documents'
|
|
});
|
|
|
|
const results = await collection.query({
|
|
queryEmbeddings: [embedding],
|
|
nResults: options.limit || 10,
|
|
where: options.filters
|
|
});
|
|
|
|
return results.documents[0].map((doc: string, index: number) => ({
|
|
id: results.ids[0][index],
|
|
score: results.distances[0][index],
|
|
metadata: results.metadatas[0][index],
|
|
content: doc
|
|
}));
|
|
}
|
|
|
|
private async storeInSupabase(chunks: DocumentChunk[]): Promise<void> {
|
|
try {
|
|
// Transform chunks to include embeddings
|
|
const supabaseRows = await Promise.all(
|
|
chunks.map(async (chunk) => ({
|
|
id: chunk.id,
|
|
document_id: chunk.documentId,
|
|
chunk_index: chunk.chunkIndex,
|
|
content: chunk.content,
|
|
embedding: chunk.embedding,
|
|
metadata: chunk.metadata || {}
|
|
}))
|
|
);
|
|
|
|
const { error } = await this.client
|
|
.from('document_chunks')
|
|
.upsert(supabaseRows);
|
|
|
|
if (error) {
|
|
// Check if it's a table/column missing error
|
|
if (error.message && (error.message.includes('chunkIndex') || error.message.includes('document_chunks'))) {
|
|
logger.warn('Vector database table/columns not available, skipping vector storage:', error.message);
|
|
return; // Don't throw, just skip vector storage
|
|
}
|
|
throw error;
|
|
}
|
|
|
|
logger.info(`Successfully stored ${chunks.length} chunks in Supabase`);
|
|
} catch (error) {
|
|
logger.error('Failed to store chunks in Supabase:', error);
|
|
// Don't throw the error - let the upload continue without vector storage
|
|
logger.warn('Continuing upload process without vector storage');
|
|
}
|
|
}
|
|
|
|
private async searchSupabase(
|
|
embedding: number[],
|
|
options: {
|
|
documentId?: string;
|
|
limit?: number;
|
|
similarity?: number;
|
|
filters?: Record<string, any>;
|
|
}
|
|
): Promise<VectorSearchResult[]> {
|
|
try {
|
|
let query = this.client
|
|
.from('document_chunks')
|
|
.select('id, content, metadata, document_id')
|
|
.rpc('match_documents', {
|
|
query_embedding: embedding,
|
|
match_threshold: options.similarity || 0.7,
|
|
match_count: options.limit || 10
|
|
});
|
|
|
|
// Add document filter if specified
|
|
if (options.documentId) {
|
|
query = query.eq('document_id', options.documentId);
|
|
}
|
|
|
|
const { data, error } = await query;
|
|
|
|
if (error) {
|
|
throw error;
|
|
}
|
|
|
|
return data.map((row: any) => ({
|
|
id: row.id,
|
|
score: row.similarity,
|
|
metadata: {
|
|
...row.metadata,
|
|
documentId: row.document_id
|
|
},
|
|
content: row.content
|
|
}));
|
|
} catch (error) {
|
|
logger.error('Failed to search in Supabase:', error);
|
|
return [];
|
|
}
|
|
}
|
|
|
|
private async getDocumentChunks(documentId: string): Promise<DocumentChunk[]> {
|
|
return await VectorDatabaseModel.getDocumentChunks(documentId);
|
|
}
|
|
}
|
|
|
|
// Shared singleton instance; the underlying provider client is created lazily
// on first use, so importing this module performs no I/O.
export const vectorDatabaseService = new VectorDatabaseService();