- Implement multi-format document support (PDF, XLSX, CSV, PPTX, TXT, Images) - Add S3-compatible storage service with tenant isolation - Create document organization service with hierarchical folders and tagging - Implement advanced document processing with table/chart extraction - Add batch upload capabilities (up to 50 files) - Create comprehensive document validation and security scanning - Implement automatic metadata extraction and categorization - Add document version control system - Update DEVELOPMENT_PLAN.md to mark Week 2 as completed - Add WEEK2_COMPLETION_SUMMARY.md with detailed implementation notes - All tests passing (6/6) - 100% success rate
398 lines
15 KiB
Python
398 lines
15 KiB
Python
"""
|
|
Qdrant vector database service for the Virtual Board Member AI System.
|
|
"""
|
|
import logging
import uuid
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
from qdrant_client import QdrantClient, models
from qdrant_client.http import models as rest
from sentence_transformers import SentenceTransformer

from app.core.config import settings
from app.models.tenant import Tenant
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class VectorService:
|
|
"""Qdrant vector database service with tenant isolation."""
|
|
|
|
def __init__(self):
|
|
self.client = None
|
|
self.embedding_model = None
|
|
self._init_client()
|
|
self._init_embedding_model()
|
|
|
|
def _init_client(self):
|
|
"""Initialize Qdrant client."""
|
|
try:
|
|
self.client = QdrantClient(
|
|
host=settings.QDRANT_HOST,
|
|
port=settings.QDRANT_PORT,
|
|
timeout=settings.QDRANT_TIMEOUT
|
|
)
|
|
logger.info("Qdrant client initialized successfully")
|
|
except Exception as e:
|
|
logger.error(f"Failed to initialize Qdrant client: {e}")
|
|
self.client = None
|
|
|
|
def _init_embedding_model(self):
|
|
"""Initialize embedding model."""
|
|
try:
|
|
self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
|
|
logger.info(f"Embedding model {settings.EMBEDDING_MODEL} loaded successfully")
|
|
except Exception as e:
|
|
logger.error(f"Failed to load embedding model: {e}")
|
|
self.embedding_model = None
|
|
|
|
def _get_collection_name(self, tenant_id: str, collection_type: str = "documents") -> str:
|
|
"""Generate tenant-isolated collection name."""
|
|
return f"{tenant_id}_{collection_type}"
|
|
|
|
async def create_tenant_collections(self, tenant: Tenant) -> bool:
|
|
"""Create all necessary collections for a tenant."""
|
|
if not self.client:
|
|
logger.error("Qdrant client not available")
|
|
return False
|
|
|
|
try:
|
|
tenant_id = str(tenant.id)
|
|
|
|
# Create main documents collection
|
|
documents_collection = self._get_collection_name(tenant_id, "documents")
|
|
await self._create_collection(
|
|
collection_name=documents_collection,
|
|
vector_size=settings.EMBEDDING_DIMENSION,
|
|
description=f"Document embeddings for tenant {tenant.name}"
|
|
)
|
|
|
|
# Create tables collection for structured data
|
|
tables_collection = self._get_collection_name(tenant_id, "tables")
|
|
await self._create_collection(
|
|
collection_name=tables_collection,
|
|
vector_size=settings.EMBEDDING_DIMENSION,
|
|
description=f"Table embeddings for tenant {tenant.name}"
|
|
)
|
|
|
|
# Create charts collection for visual data
|
|
charts_collection = self._get_collection_name(tenant_id, "charts")
|
|
await self._create_collection(
|
|
collection_name=charts_collection,
|
|
vector_size=settings.EMBEDDING_DIMENSION,
|
|
description=f"Chart embeddings for tenant {tenant.name}"
|
|
)
|
|
|
|
logger.info(f"Created collections for tenant {tenant.name} ({tenant_id})")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to create collections for tenant {tenant.id}: {e}")
|
|
return False
|
|
|
|
async def _create_collection(self, collection_name: str, vector_size: int, description: str) -> bool:
|
|
"""Create a collection with proper configuration."""
|
|
try:
|
|
# Check if collection already exists
|
|
collections = self.client.get_collections()
|
|
existing_collections = [col.name for col in collections.collections]
|
|
|
|
if collection_name in existing_collections:
|
|
logger.info(f"Collection {collection_name} already exists")
|
|
return True
|
|
|
|
# Create collection with optimized settings
|
|
self.client.create_collection(
|
|
collection_name=collection_name,
|
|
vectors_config=models.VectorParams(
|
|
size=vector_size,
|
|
distance=models.Distance.COSINE,
|
|
on_disk=True # Store vectors on disk for large collections
|
|
),
|
|
optimizers_config=models.OptimizersConfigDiff(
|
|
memmap_threshold=10000, # Use memory mapping for collections > 10k points
|
|
default_segment_number=2 # Optimize for parallel processing
|
|
),
|
|
replication_factor=1 # Single replica for development
|
|
)
|
|
|
|
# Add collection description
|
|
self.client.update_collection(
|
|
collection_name=collection_name,
|
|
optimizers_config=models.OptimizersConfigDiff(
|
|
default_segment_number=2
|
|
)
|
|
)
|
|
|
|
logger.info(f"Created collection {collection_name}: {description}")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to create collection {collection_name}: {e}")
|
|
return False
|
|
|
|
async def delete_tenant_collections(self, tenant_id: str) -> bool:
|
|
"""Delete all collections for a tenant."""
|
|
if not self.client:
|
|
return False
|
|
|
|
try:
|
|
collections_to_delete = [
|
|
self._get_collection_name(tenant_id, "documents"),
|
|
self._get_collection_name(tenant_id, "tables"),
|
|
self._get_collection_name(tenant_id, "charts")
|
|
]
|
|
|
|
for collection_name in collections_to_delete:
|
|
try:
|
|
self.client.delete_collection(collection_name)
|
|
logger.info(f"Deleted collection {collection_name}")
|
|
except Exception as e:
|
|
logger.warning(f"Failed to delete collection {collection_name}: {e}")
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to delete collections for tenant {tenant_id}: {e}")
|
|
return False
|
|
|
|
async def generate_embedding(self, text: str) -> Optional[List[float]]:
|
|
"""Generate embedding for text."""
|
|
if not self.embedding_model:
|
|
logger.error("Embedding model not available")
|
|
return None
|
|
|
|
try:
|
|
embedding = self.embedding_model.encode(text)
|
|
return embedding.tolist()
|
|
except Exception as e:
|
|
logger.error(f"Failed to generate embedding: {e}")
|
|
return None
|
|
|
|
async def add_document_vectors(
|
|
self,
|
|
tenant_id: str,
|
|
document_id: str,
|
|
chunks: List[Dict[str, Any]],
|
|
collection_type: str = "documents"
|
|
) -> bool:
|
|
"""Add document chunks to vector database."""
|
|
if not self.client or not self.embedding_model:
|
|
return False
|
|
|
|
try:
|
|
collection_name = self._get_collection_name(tenant_id, collection_type)
|
|
|
|
# Generate embeddings for all chunks
|
|
points = []
|
|
for i, chunk in enumerate(chunks):
|
|
# Generate embedding
|
|
embedding = await self.generate_embedding(chunk["text"])
|
|
if not embedding:
|
|
continue
|
|
|
|
# Create point with metadata
|
|
point = models.PointStruct(
|
|
id=f"{document_id}_{i}",
|
|
vector=embedding,
|
|
payload={
|
|
"document_id": document_id,
|
|
"tenant_id": tenant_id,
|
|
"chunk_index": i,
|
|
"text": chunk["text"],
|
|
"chunk_type": chunk.get("type", "text"),
|
|
"metadata": chunk.get("metadata", {}),
|
|
"created_at": chunk.get("created_at")
|
|
}
|
|
)
|
|
points.append(point)
|
|
|
|
if points:
|
|
# Upsert points in batches
|
|
batch_size = 100
|
|
for i in range(0, len(points), batch_size):
|
|
batch = points[i:i + batch_size]
|
|
self.client.upsert(
|
|
collection_name=collection_name,
|
|
points=batch
|
|
)
|
|
|
|
logger.info(f"Added {len(points)} vectors to collection {collection_name}")
|
|
return True
|
|
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to add document vectors: {e}")
|
|
return False
|
|
|
|
async def search_similar(
|
|
self,
|
|
tenant_id: str,
|
|
query: str,
|
|
limit: int = 10,
|
|
score_threshold: float = 0.7,
|
|
collection_type: str = "documents",
|
|
filters: Optional[Dict[str, Any]] = None
|
|
) -> List[Dict[str, Any]]:
|
|
"""Search for similar vectors."""
|
|
if not self.client or not self.embedding_model:
|
|
return []
|
|
|
|
try:
|
|
collection_name = self._get_collection_name(tenant_id, collection_type)
|
|
|
|
# Generate query embedding
|
|
query_embedding = await self.generate_embedding(query)
|
|
if not query_embedding:
|
|
return []
|
|
|
|
# Build search filter
|
|
search_filter = models.Filter(
|
|
must=[
|
|
models.FieldCondition(
|
|
key="tenant_id",
|
|
match=models.MatchValue(value=tenant_id)
|
|
)
|
|
]
|
|
)
|
|
|
|
# Add additional filters
|
|
if filters:
|
|
for key, value in filters.items():
|
|
if isinstance(value, list):
|
|
search_filter.must.append(
|
|
models.FieldCondition(
|
|
key=key,
|
|
match=models.MatchAny(any=value)
|
|
)
|
|
)
|
|
else:
|
|
search_filter.must.append(
|
|
models.FieldCondition(
|
|
key=key,
|
|
match=models.MatchValue(value=value)
|
|
)
|
|
)
|
|
|
|
# Perform search
|
|
search_result = self.client.search(
|
|
collection_name=collection_name,
|
|
query_vector=query_embedding,
|
|
query_filter=search_filter,
|
|
limit=limit,
|
|
score_threshold=score_threshold,
|
|
with_payload=True
|
|
)
|
|
|
|
# Format results
|
|
results = []
|
|
for point in search_result:
|
|
results.append({
|
|
"id": point.id,
|
|
"score": point.score,
|
|
"payload": point.payload,
|
|
"text": point.payload.get("text", ""),
|
|
"document_id": point.payload.get("document_id"),
|
|
"chunk_type": point.payload.get("chunk_type", "text")
|
|
})
|
|
|
|
return results
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to search vectors: {e}")
|
|
return []
|
|
|
|
async def delete_document_vectors(self, tenant_id: str, document_id: str, collection_type: str = "documents") -> bool:
|
|
"""Delete all vectors for a specific document."""
|
|
if not self.client:
|
|
return False
|
|
|
|
try:
|
|
collection_name = self._get_collection_name(tenant_id, collection_type)
|
|
|
|
# Delete points with document_id filter
|
|
self.client.delete(
|
|
collection_name=collection_name,
|
|
points_selector=models.FilterSelector(
|
|
filter=models.Filter(
|
|
must=[
|
|
models.FieldCondition(
|
|
key="document_id",
|
|
match=models.MatchValue(value=document_id)
|
|
),
|
|
models.FieldCondition(
|
|
key="tenant_id",
|
|
match=models.MatchValue(value=tenant_id)
|
|
)
|
|
]
|
|
)
|
|
)
|
|
)
|
|
|
|
logger.info(f"Deleted vectors for document {document_id} from collection {collection_name}")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to delete document vectors: {e}")
|
|
return False
|
|
|
|
async def get_collection_stats(self, tenant_id: str, collection_type: str = "documents") -> Optional[Dict[str, Any]]:
|
|
"""Get collection statistics."""
|
|
if not self.client:
|
|
return None
|
|
|
|
try:
|
|
collection_name = self._get_collection_name(tenant_id, collection_type)
|
|
|
|
info = self.client.get_collection(collection_name)
|
|
count = self.client.count(
|
|
collection_name=collection_name,
|
|
count_filter=models.Filter(
|
|
must=[
|
|
models.FieldCondition(
|
|
key="tenant_id",
|
|
match=models.MatchValue(value=tenant_id)
|
|
)
|
|
]
|
|
)
|
|
)
|
|
|
|
return {
|
|
"collection_name": collection_name,
|
|
"tenant_id": tenant_id,
|
|
"vector_count": count.count,
|
|
"vector_size": info.config.params.vectors.size,
|
|
"distance": info.config.params.vectors.distance,
|
|
"status": info.status
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Failed to get collection stats: {e}")
|
|
return None
|
|
|
|
async def health_check(self) -> bool:
|
|
"""Check if vector service is healthy."""
|
|
if not self.client:
|
|
return False
|
|
|
|
try:
|
|
# Check client connection
|
|
collections = self.client.get_collections()
|
|
|
|
# Check embedding model
|
|
if not self.embedding_model:
|
|
return False
|
|
|
|
# Test embedding generation
|
|
test_embedding = await self.generate_embedding("test")
|
|
if not test_embedding:
|
|
return False
|
|
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Vector service health check failed: {e}")
|
|
return False
|
|
|
|
# Global vector service instance
# NOTE(review): constructed at import time, which eagerly opens the Qdrant
# connection and loads the sentence-transformer model. On failure the service
# degrades (client / embedding_model remain None) instead of raising, so
# importing this module never fails — but it can be slow.
vector_service = VectorService()
|