""" Qdrant vector database service for the Virtual Board Member AI System. """ import logging from typing import List, Dict, Any, Optional, Tuple from qdrant_client import QdrantClient, models from qdrant_client.http import models as rest import numpy as np from sentence_transformers import SentenceTransformer from app.core.config import settings from app.models.tenant import Tenant logger = logging.getLogger(__name__) class VectorService: """Qdrant vector database service with tenant isolation.""" def __init__(self): self.client = None self.embedding_model = None self._init_client() self._init_embedding_model() def _init_client(self): """Initialize Qdrant client.""" try: self.client = QdrantClient( host=settings.QDRANT_HOST, port=settings.QDRANT_PORT, timeout=settings.QDRANT_TIMEOUT ) logger.info("Qdrant client initialized successfully") except Exception as e: logger.error(f"Failed to initialize Qdrant client: {e}") self.client = None def _init_embedding_model(self): """Initialize embedding model.""" try: self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL) logger.info(f"Embedding model {settings.EMBEDDING_MODEL} loaded successfully") except Exception as e: logger.error(f"Failed to load embedding model: {e}") self.embedding_model = None def _get_collection_name(self, tenant_id: str, collection_type: str = "documents") -> str: """Generate tenant-isolated collection name.""" return f"{tenant_id}_{collection_type}" async def create_tenant_collections(self, tenant: Tenant) -> bool: """Create all necessary collections for a tenant.""" if not self.client: logger.error("Qdrant client not available") return False try: tenant_id = str(tenant.id) # Create main documents collection documents_collection = self._get_collection_name(tenant_id, "documents") await self._create_collection( collection_name=documents_collection, vector_size=settings.EMBEDDING_DIMENSION, description=f"Document embeddings for tenant {tenant.name}" ) # Create tables collection for structured data tables_collection = self._get_collection_name(tenant_id, "tables") await self._create_collection( collection_name=tables_collection, vector_size=settings.EMBEDDING_DIMENSION, description=f"Table embeddings for tenant {tenant.name}" ) # Create charts collection for visual data charts_collection = self._get_collection_name(tenant_id, "charts") await self._create_collection( collection_name=charts_collection, vector_size=settings.EMBEDDING_DIMENSION, description=f"Chart embeddings for tenant {tenant.name}" ) logger.info(f"Created collections for tenant {tenant.name} ({tenant_id})") return True except Exception as e: logger.error(f"Failed to create collections for tenant {tenant.id}: {e}") return False async def _create_collection(self, collection_name: str, vector_size: int, description: str) -> bool: """Create a collection with proper configuration.""" try: # Check if collection already exists collections = self.client.get_collections() existing_collections = [col.name for col in collections.collections] if collection_name in existing_collections: logger.info(f"Collection {collection_name} already exists") return True # Create collection with optimized settings self.client.create_collection( collection_name=collection_name, vectors_config=models.VectorParams( size=vector_size, distance=models.Distance.COSINE, on_disk=True # Store vectors on disk for large collections ), optimizers_config=models.OptimizersConfigDiff( memmap_threshold=10000, # Use memory mapping for collections > 10k points default_segment_number=2 # Optimize for parallel processing ), replication_factor=1 # Single replica for development ) # Add collection description self.client.update_collection( collection_name=collection_name, optimizers_config=models.OptimizersConfigDiff( default_segment_number=2 ) ) logger.info(f"Created collection {collection_name}: {description}") return True except Exception as e: logger.error(f"Failed to create collection {collection_name}: {e}") return False async def delete_tenant_collections(self, tenant_id: str) -> bool: """Delete all collections for a tenant.""" if not self.client: return False try: collections_to_delete = [ self._get_collection_name(tenant_id, "documents"), self._get_collection_name(tenant_id, "tables"), self._get_collection_name(tenant_id, "charts") ] for collection_name in collections_to_delete: try: self.client.delete_collection(collection_name) logger.info(f"Deleted collection {collection_name}") except Exception as e: logger.warning(f"Failed to delete collection {collection_name}: {e}") return True except Exception as e: logger.error(f"Failed to delete collections for tenant {tenant_id}: {e}") return False async def generate_embedding(self, text: str) -> Optional[List[float]]: """Generate embedding for text.""" if not self.embedding_model: logger.error("Embedding model not available") return None try: embedding = self.embedding_model.encode(text) return embedding.tolist() except Exception as e: logger.error(f"Failed to generate embedding: {e}") return None async def add_document_vectors( self, tenant_id: str, document_id: str, chunks: List[Dict[str, Any]], collection_type: str = "documents" ) -> bool: """Add document chunks to vector database.""" if not self.client or not self.embedding_model: return False try: collection_name = self._get_collection_name(tenant_id, collection_type) # Generate embeddings for all chunks points = [] for i, chunk in enumerate(chunks): # Generate embedding embedding = await self.generate_embedding(chunk["text"]) if not embedding: continue # Create point with metadata point = models.PointStruct( id=f"{document_id}_{i}", vector=embedding, payload={ "document_id": document_id, "tenant_id": tenant_id, "chunk_index": i, "text": chunk["text"], "chunk_type": chunk.get("type", "text"), "metadata": chunk.get("metadata", {}), "created_at": chunk.get("created_at") } ) points.append(point) if points: # Upsert points in batches batch_size = 100 for i in range(0, len(points), batch_size): batch = points[i:i + batch_size] self.client.upsert( collection_name=collection_name, points=batch ) logger.info(f"Added {len(points)} vectors to collection {collection_name}") return True return False except Exception as e: logger.error(f"Failed to add document vectors: {e}") return False async def search_similar( self, tenant_id: str, query: str, limit: int = 10, score_threshold: float = 0.7, collection_type: str = "documents", filters: Optional[Dict[str, Any]] = None ) -> List[Dict[str, Any]]: """Search for similar vectors.""" if not self.client or not self.embedding_model: return [] try: collection_name = self._get_collection_name(tenant_id, collection_type) # Generate query embedding query_embedding = await self.generate_embedding(query) if not query_embedding: return [] # Build search filter search_filter = models.Filter( must=[ models.FieldCondition( key="tenant_id", match=models.MatchValue(value=tenant_id) ) ] ) # Add additional filters if filters: for key, value in filters.items(): if isinstance(value, list): search_filter.must.append( models.FieldCondition( key=key, match=models.MatchAny(any=value) ) ) else: search_filter.must.append( models.FieldCondition( key=key, match=models.MatchValue(value=value) ) ) # Perform search search_result = self.client.search( collection_name=collection_name, query_vector=query_embedding, query_filter=search_filter, limit=limit, score_threshold=score_threshold, with_payload=True ) # Format results results = [] for point in search_result: results.append({ "id": point.id, "score": point.score, "payload": point.payload, "text": point.payload.get("text", ""), "document_id": point.payload.get("document_id"), "chunk_type": point.payload.get("chunk_type", "text") }) return results except Exception as e: logger.error(f"Failed to search vectors: {e}") return [] async def delete_document_vectors(self, tenant_id: str, document_id: str, collection_type: str = "documents") -> bool: """Delete all vectors for a specific document.""" if not self.client: return False try: collection_name = self._get_collection_name(tenant_id, collection_type) # Delete points with document_id filter self.client.delete( collection_name=collection_name, points_selector=models.FilterSelector( filter=models.Filter( must=[ models.FieldCondition( key="document_id", match=models.MatchValue(value=document_id) ), models.FieldCondition( key="tenant_id", match=models.MatchValue(value=tenant_id) ) ] ) ) ) logger.info(f"Deleted vectors for document {document_id} from collection {collection_name}") return True except Exception as e: logger.error(f"Failed to delete document vectors: {e}") return False async def get_collection_stats(self, tenant_id: str, collection_type: str = "documents") -> Optional[Dict[str, Any]]: """Get collection statistics.""" if not self.client: return None try: collection_name = self._get_collection_name(tenant_id, collection_type) info = self.client.get_collection(collection_name) count = self.client.count( collection_name=collection_name, count_filter=models.Filter( must=[ models.FieldCondition( key="tenant_id", match=models.MatchValue(value=tenant_id) ) ] ) ) return { "collection_name": collection_name, "tenant_id": tenant_id, "vector_count": count.count, "vector_size": info.config.params.vectors.size, "distance": info.config.params.vectors.distance, "status": info.status } except Exception as e: logger.error(f"Failed to get collection stats: {e}") return None async def health_check(self) -> bool: """Check if vector service is healthy.""" if not self.client: return False try: # Check client connection collections = self.client.get_collections() # Check embedding model if not self.embedding_model: return False # Test embedding generation test_embedding = await self.generate_embedding("test") if not test_embedding: return False return True except Exception as e: logger.error(f"Vector service health check failed: {e}") return False # Global vector service instance vector_service = VectorService()