Files
virtual_board_member/app/services/vector_service.py
Jonathan Pressnell 1a8ec37bed feat: Complete Week 2 - Document Processing Pipeline
- Implement multi-format document support (PDF, XLSX, CSV, PPTX, TXT, Images)
- Add S3-compatible storage service with tenant isolation
- Create document organization service with hierarchical folders and tagging
- Implement advanced document processing with table/chart extraction
- Add batch upload capabilities (up to 50 files)
- Create comprehensive document validation and security scanning
- Implement automatic metadata extraction and categorization
- Add document version control system
- Update DEVELOPMENT_PLAN.md to mark Week 2 as completed
- Add WEEK2_COMPLETION_SUMMARY.md with detailed implementation notes
- All tests passing (6/6) - 100% success rate
2025-08-08 15:47:43 -04:00

398 lines
15 KiB
Python

"""
Qdrant vector database service for the Virtual Board Member AI System.
"""
import logging
import uuid
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from qdrant_client import QdrantClient, models
from qdrant_client.http import models as rest
from sentence_transformers import SentenceTransformer

from app.core.config import settings
from app.models.tenant import Tenant
logger = logging.getLogger(__name__)
class VectorService:
    """Qdrant vector database service with tenant isolation.

    Each tenant gets its own set of collections ("documents", "tables",
    "charts"); collection names are prefixed with the tenant id so data
    from different tenants never shares a collection, and every point's
    payload additionally carries ``tenant_id`` for defense-in-depth
    filtering. All public methods fail soft: when the Qdrant client or
    the embedding model failed to initialize, they log and return
    ``False`` / ``None`` / ``[]`` instead of raising.

    Point IDs are deterministic UUIDv5 values derived from
    ``"{document_id}_{chunk_index}"`` — Qdrant only accepts unsigned
    integers or UUIDs as point IDs, so plain strings cannot be used.
    """

    #: Content kinds that get a collection per tenant, with the label
    #: used in each collection's log description.
    _COLLECTION_TYPES = (
        ("documents", "Document"),
        ("tables", "Table"),
        ("charts", "Chart"),
    )

    def __init__(self) -> None:
        self.client: Optional[QdrantClient] = None
        self.embedding_model: Optional[SentenceTransformer] = None
        self._init_client()
        self._init_embedding_model()

    def _init_client(self) -> None:
        """Initialize the Qdrant client from application settings.

        On failure the error is logged and ``self.client`` stays ``None``;
        every public method checks for that and degrades gracefully.
        """
        try:
            self.client = QdrantClient(
                host=settings.QDRANT_HOST,
                port=settings.QDRANT_PORT,
                timeout=settings.QDRANT_TIMEOUT,
            )
            logger.info("Qdrant client initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Qdrant client: {e}")
            self.client = None

    def _init_embedding_model(self) -> None:
        """Load the sentence-transformers model named in settings (soft-fail)."""
        try:
            self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
            logger.info(f"Embedding model {settings.EMBEDDING_MODEL} loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load embedding model: {e}")
            self.embedding_model = None

    def _get_collection_name(self, tenant_id: str, collection_type: str = "documents") -> str:
        """Generate a tenant-isolated collection name ("<tenant_id>_<type>")."""
        return f"{tenant_id}_{collection_type}"

    async def create_tenant_collections(self, tenant: Tenant) -> bool:
        """Create all necessary collections for a tenant.

        Args:
            tenant: Tenant model; ``tenant.id`` keys the collection names
                and ``tenant.name`` is used only for log descriptions.

        Returns:
            True if all collections were created (or already existed).
        """
        if not self.client:
            logger.error("Qdrant client not available")
            return False
        try:
            tenant_id = str(tenant.id)
            # One collection per content kind, all tenant-prefixed.
            for collection_type, label in self._COLLECTION_TYPES:
                await self._create_collection(
                    collection_name=self._get_collection_name(tenant_id, collection_type),
                    vector_size=settings.EMBEDDING_DIMENSION,
                    description=f"{label} embeddings for tenant {tenant.name}",
                )
            logger.info(f"Created collections for tenant {tenant.name} ({tenant_id})")
            return True
        except Exception as e:
            logger.error(f"Failed to create collections for tenant {tenant.id}: {e}")
            return False

    async def _create_collection(self, collection_name: str, vector_size: int, description: str) -> bool:
        """Create a single collection with production-oriented defaults.

        Idempotent: returns True without side effects when the collection
        already exists. The ``description`` only appears in logs — Qdrant
        collections have no description field.
        """
        try:
            existing = {col.name for col in self.client.get_collections().collections}
            if collection_name in existing:
                logger.info(f"Collection {collection_name} already exists")
                return True
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config=models.VectorParams(
                    size=vector_size,
                    distance=models.Distance.COSINE,
                    on_disk=True,  # store vectors on disk for large collections
                ),
                optimizers_config=models.OptimizersConfigDiff(
                    memmap_threshold=10000,  # memory-map segments past 10k points
                    default_segment_number=2,  # allow parallel segment processing
                ),
                replication_factor=1,  # single replica for development
            )
            # (A follow-up update_collection call that re-sent the exact
            # same optimizer config was removed here as a no-op.)
            logger.info(f"Created collection {collection_name}: {description}")
            return True
        except Exception as e:
            logger.error(f"Failed to create collection {collection_name}: {e}")
            return False

    async def delete_tenant_collections(self, tenant_id: str) -> bool:
        """Delete all collections for a tenant.

        Per-collection failures are logged as warnings and do not abort
        the rest; returns True as long as the sweep itself completes.
        """
        if not self.client:
            return False
        try:
            for collection_type, _ in self._COLLECTION_TYPES:
                collection_name = self._get_collection_name(tenant_id, collection_type)
                try:
                    self.client.delete_collection(collection_name)
                    logger.info(f"Deleted collection {collection_name}")
                except Exception as e:
                    logger.warning(f"Failed to delete collection {collection_name}: {e}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete collections for tenant {tenant_id}: {e}")
            return False

    async def generate_embedding(self, text: str) -> Optional[List[float]]:
        """Generate an embedding vector for ``text``.

        Returns None (never raises) when the model is unavailable or
        encoding fails. NOTE(review): ``SentenceTransformer.encode`` is a
        blocking CPU call; despite the async signature it runs on the
        event-loop thread — consider ``run_in_executor`` if this becomes
        a bottleneck.
        """
        if not self.embedding_model:
            logger.error("Embedding model not available")
            return None
        try:
            return self.embedding_model.encode(text).tolist()
        except Exception as e:
            logger.error(f"Failed to generate embedding: {e}")
            return None

    async def add_document_vectors(
        self,
        tenant_id: str,
        document_id: str,
        chunks: List[Dict[str, Any]],
        collection_type: str = "documents",
    ) -> bool:
        """Embed document chunks and upsert them into the tenant's collection.

        Args:
            tenant_id: Owning tenant; baked into the collection name and
                every point payload.
            document_id: Source document identifier.
            chunks: Chunk dicts; each must have a "text" key and may carry
                "type", "metadata" and "created_at".
            collection_type: Target collection kind (default "documents").

        Returns:
            True if at least one chunk was embedded and upserted.
        """
        if not self.client or not self.embedding_model:
            return False
        try:
            collection_name = self._get_collection_name(tenant_id, collection_type)
            points: List[models.PointStruct] = []
            for i, chunk in enumerate(chunks):
                embedding = await self.generate_embedding(chunk["text"])
                if not embedding:
                    # Skip chunks whose embedding failed; the rest still load.
                    continue
                # BUGFIX: Qdrant point IDs must be unsigned integers or
                # UUIDs — raw strings like "{doc}_{i}" are rejected by the
                # server. Derive a deterministic UUIDv5 from the same key
                # so re-ingesting a chunk overwrites (upserts) rather than
                # duplicates it; document_id/chunk_index remain queryable
                # in the payload.
                point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{document_id}_{i}"))
                points.append(
                    models.PointStruct(
                        id=point_id,
                        vector=embedding,
                        payload={
                            "document_id": document_id,
                            "tenant_id": tenant_id,
                            "chunk_index": i,
                            "text": chunk["text"],
                            "chunk_type": chunk.get("type", "text"),
                            "metadata": chunk.get("metadata", {}),
                            "created_at": chunk.get("created_at"),
                        },
                    )
                )
            if not points:
                return False
            # Upsert in batches to keep individual request sizes bounded.
            batch_size = 100
            for start in range(0, len(points), batch_size):
                self.client.upsert(
                    collection_name=collection_name,
                    points=points[start:start + batch_size],
                )
            logger.info(f"Added {len(points)} vectors to collection {collection_name}")
            return True
        except Exception as e:
            logger.error(f"Failed to add document vectors: {e}")
            return False

    async def search_similar(
        self,
        tenant_id: str,
        query: str,
        limit: int = 10,
        score_threshold: float = 0.7,
        collection_type: str = "documents",
        filters: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, Any]]:
        """Semantic search over a tenant's collection.

        Args:
            tenant_id: Tenant whose collection is searched; also enforced
                as a payload filter.
            query: Natural-language query to embed.
            limit: Maximum number of hits.
            score_threshold: Minimum cosine similarity for a hit.
            collection_type: Collection kind to search (default "documents").
            filters: Optional extra payload filters ANDed onto the tenant
                filter; list values mean "match any", scalars exact match.

        Returns:
            Hit dicts with id, score, payload and convenience fields; an
            empty list on any failure.
        """
        if not self.client or not self.embedding_model:
            return []
        try:
            collection_name = self._get_collection_name(tenant_id, collection_type)
            query_embedding = await self.generate_embedding(query)
            if not query_embedding:
                return []
            # Always constrain results to the tenant; caller filters are
            # ANDed on top.
            must_conditions = [
                models.FieldCondition(
                    key="tenant_id",
                    match=models.MatchValue(value=tenant_id),
                )
            ]
            if filters:
                for key, value in filters.items():
                    if isinstance(value, list):
                        match = models.MatchAny(any=value)
                    else:
                        match = models.MatchValue(value=value)
                    must_conditions.append(models.FieldCondition(key=key, match=match))
            search_result = self.client.search(
                collection_name=collection_name,
                query_vector=query_embedding,
                query_filter=models.Filter(must=must_conditions),
                limit=limit,
                score_threshold=score_threshold,
                with_payload=True,
            )
            return [
                {
                    "id": point.id,
                    "score": point.score,
                    "payload": point.payload,
                    "text": point.payload.get("text", ""),
                    "document_id": point.payload.get("document_id"),
                    "chunk_type": point.payload.get("chunk_type", "text"),
                }
                for point in search_result
            ]
        except Exception as e:
            logger.error(f"Failed to search vectors: {e}")
            return []

    async def delete_document_vectors(self, tenant_id: str, document_id: str, collection_type: str = "documents") -> bool:
        """Delete all vectors belonging to one document.

        Deletes by payload filter (document_id AND tenant_id), so it works
        regardless of how many chunks the document produced.
        """
        if not self.client:
            return False
        try:
            collection_name = self._get_collection_name(tenant_id, collection_type)
            self.client.delete(
                collection_name=collection_name,
                points_selector=models.FilterSelector(
                    filter=models.Filter(
                        must=[
                            models.FieldCondition(
                                key="document_id",
                                match=models.MatchValue(value=document_id),
                            ),
                            models.FieldCondition(
                                key="tenant_id",
                                match=models.MatchValue(value=tenant_id),
                            ),
                        ]
                    )
                ),
            )
            logger.info(f"Deleted vectors for document {document_id} from collection {collection_name}")
            return True
        except Exception as e:
            logger.error(f"Failed to delete document vectors: {e}")
            return False

    async def get_collection_stats(self, tenant_id: str, collection_type: str = "documents") -> Optional[Dict[str, Any]]:
        """Return point count, vector config and status for one collection.

        Returns None when the client is unavailable or the collection does
        not exist.
        """
        if not self.client:
            return None
        try:
            collection_name = self._get_collection_name(tenant_id, collection_type)
            info = self.client.get_collection(collection_name)
            # Count only this tenant's points (defense in depth; the
            # collection is already tenant-scoped by name).
            count = self.client.count(
                collection_name=collection_name,
                count_filter=models.Filter(
                    must=[
                        models.FieldCondition(
                            key="tenant_id",
                            match=models.MatchValue(value=tenant_id),
                        )
                    ]
                ),
            )
            return {
                "collection_name": collection_name,
                "tenant_id": tenant_id,
                "vector_count": count.count,
                "vector_size": info.config.params.vectors.size,
                "distance": info.config.params.vectors.distance,
                "status": info.status,
            }
        except Exception as e:
            logger.error(f"Failed to get collection stats: {e}")
            return None

    async def health_check(self) -> bool:
        """Check that both the Qdrant connection and the embedding model work.

        A round-trip to Qdrant proves the connection is alive; embedding a
        short probe string proves the model works end-to-end.
        """
        if not self.client:
            return False
        try:
            self.client.get_collections()
            if not self.embedding_model:
                return False
            return await self.generate_embedding("test") is not None
        except Exception as e:
            logger.error(f"Vector service health check failed: {e}")
            return False
# Global vector service instance. NOTE: instantiating here runs
# _init_client/_init_embedding_model at import time; failures are logged
# and leave the service in a degraded soft-fail state rather than raising.
vector_service = VectorService()