"""Test suite for Week 3 Vector Database & Embedding System functionality.

Comprehensive tests that validate actual functionality, not just test structure.
"""

import asyncio
import json
from typing import Any, Dict, List
from unittest.mock import AsyncMock, MagicMock, Mock, patch

import pytest

from app.services.vector_service import VectorService
from app.services.document_chunking import DocumentChunkingService
from app.models.tenant import Tenant
from app.core.config import settings

# Keys under which the chunking service groups its output, by chunk type.
CHUNK_GROUP_KEYS = ("text_chunks", "table_chunks", "chart_chunks")

# Embedding sizes the embedding tests accept from the real service.
SUPPORTED_EMBEDDING_DIMENSIONS = (1024, 384)

# Size of the fake vectors handed to the mocked embedding calls.
MOCK_EMBEDDING_DIMENSION = 1024


def _mock_embedding(*pattern: float, dimension: int = MOCK_EMBEDDING_DIMENSION) -> List[float]:
    """Return a deterministic fake embedding with exactly ``dimension`` floats.

    ``pattern`` is repeated and then truncated, so the result always has the
    requested length even when ``dimension`` is not a multiple of
    ``len(pattern)`` (``[a, b, c] * 341`` only yields 1023 values, not 1024).
    Every call returns a fresh list, so vectors are never aliased.
    """
    repeats = -(-dimension // len(pattern))  # ceiling division
    return (list(pattern) * repeats)[:dimension]


def _count_chunks(chunks: Dict[str, Any]) -> int:
    """Return the total number of text, table and chart chunks in ``chunks``."""
    return sum(len(chunks[key]) for key in CHUNK_GROUP_KEYS)


@pytest.fixture
def mock_tenant():
    """Create a mock tenant for testing (shared by every test class below)."""
    tenant = Mock(spec=Tenant)
    tenant.id = "test-tenant-123"
    tenant.name = "Test Tenant"
    return tenant


class TestDocumentChunkingService:
    """Test cases for document chunking functionality with real validation."""

    @pytest.fixture
    def chunking_service(self, mock_tenant):
        """Create a document chunking service instance."""
        return DocumentChunkingService(mock_tenant)

    @pytest.fixture
    def sample_document_content(self):
        """Sample document content (two text pages, one table, one chart)."""
        return {
            "text_content": [
                {
                    "text": "This is a sample document for testing purposes. It contains multiple sentences and should be chunked appropriately. The chunking algorithm should respect semantic boundaries and create meaningful chunks that preserve context.",
                    "page_number": 1,
                },
                {
                    "text": "This is the second page of the document. It contains additional content that should also be processed. The system should handle multiple pages correctly and maintain proper page numbering in the chunks.",
                    "page_number": 2,
                },
            ],
            "tables": [
                {
                    "data": [
                        ["Name", "Age", "Department", "Salary"],
                        ["John Doe", "30", "Engineering", "$85,000"],
                        ["Jane Smith", "25", "Marketing", "$65,000"],
                        ["Bob Johnson", "35", "Sales", "$75,000"],
                    ],
                    "metadata": {
                        "page_number": 1,
                        "title": "Employee Information",
                    },
                }
            ],
            "charts": [
                {
                    "data": {
                        "labels": ["Q1", "Q2", "Q3", "Q4"],
                        "values": [100000, 150000, 200000, 250000],
                    },
                    "metadata": {
                        "page_number": 2,
                        "chart_type": "bar",
                        "title": "Quarterly Revenue",
                    },
                }
            ],
        }

    @pytest.mark.asyncio
    async def test_chunk_document_content_structure_and_content(self, chunking_service, sample_document_content):
        """Test document chunking with comprehensive validation of structure and content."""
        document_id = "test-doc-123"

        chunks = await chunking_service.chunk_document_content(document_id, sample_document_content)

        # Verify structure
        assert "text_chunks" in chunks
        assert "table_chunks" in chunks
        assert "chart_chunks" in chunks
        assert "metadata" in chunks

        # Verify metadata content
        metadata = chunks["metadata"]
        assert metadata["document_id"] == document_id
        assert metadata["tenant_id"] == "test-tenant-123"
        assert "chunking_timestamp" in metadata
        assert metadata["chunk_size"] == settings.CHUNK_SIZE
        assert metadata["chunk_overlap"] == settings.CHUNK_OVERLAP

        # Verify chunk counts are reasonable
        assert len(chunks["text_chunks"]) > 0, "Should have text chunks"
        assert len(chunks["table_chunks"]) > 0, "Should have table chunks"
        assert len(chunks["chart_chunks"]) > 0, "Should have chart chunks"

        # The minimum-size bound is only enforced when the text was actually
        # split; a single chunk may legitimately be shorter.
        has_multiple_text_chunks = len(chunks["text_chunks"]) > 1

        # Verify text chunks have meaningful content
        for i, chunk in enumerate(chunks["text_chunks"]):
            assert "id" in chunk, f"Text chunk {i} missing id"
            assert "text" in chunk, f"Text chunk {i} missing text"
            assert chunk["chunk_type"] == "text", f"Text chunk {i} wrong type"
            assert "token_count" in chunk, f"Text chunk {i} missing token_count"
            assert "page_numbers" in chunk, f"Text chunk {i} missing page_numbers"
            assert len(chunk["text"]) > 0, f"Text chunk {i} has empty text"
            assert chunk["token_count"] > 0, f"Text chunk {i} has zero tokens"
            assert len(chunk["page_numbers"]) > 0, f"Text chunk {i} has no page numbers"

            # Verify text content is meaningful (not just whitespace)
            assert chunk["text"].strip(), f"Text chunk {i} contains only whitespace"

            # Verify chunk size is within reasonable bounds
            assert chunk["token_count"] <= settings.CHUNK_MAX_SIZE, f"Text chunk {i} too large"
            if has_multiple_text_chunks:
                assert chunk["token_count"] >= settings.CHUNK_MIN_SIZE, f"Text chunk {i} too small"

    @pytest.mark.asyncio
    async def test_chunk_text_content_semantic_boundaries(self, chunking_service):
        """Test that text chunking respects semantic boundaries."""
        document_id = "test-doc-123"

        # Create text with clear semantic boundaries
        text_content = [
            {
                "text": "This is the first paragraph. It contains multiple sentences. The chunking should respect sentence boundaries. This paragraph should be chunked appropriately.",
                "page_number": 1,
            },
            {
                "text": "This is the second paragraph. It has different content. The system should maintain context between paragraphs. Each chunk should be meaningful.",
                "page_number": 2,
            },
        ]

        chunks = await chunking_service._chunk_text_content(document_id, text_content)

        assert len(chunks) > 0, "Should create chunks"

        # Verify each chunk contains complete sentences
        for i, chunk in enumerate(chunks):
            assert chunk["document_id"] == document_id
            assert chunk["tenant_id"] == "test-tenant-123"
            assert chunk["chunk_type"] == "text"
            assert len(chunk["text"]) > 0

            # Check that chunks don't break in the middle of sentences.
            # This is a very basic heuristic, applied only when the chunk
            # contains at least one sentence terminator.
            text = chunk["text"]
            if "." in text:
                stripped = text.strip()
                assert not stripped.endswith(','), f"Chunk {i} ends with comma"
                assert not stripped.endswith('and'), f"Chunk {i} ends with 'and'"

    @pytest.mark.asyncio
    async def test_chunk_table_content_structure_preservation(self, chunking_service):
        """Test that table chunking preserves table structure and creates meaningful descriptions."""
        document_id = "test-doc-123"
        tables = [
            {
                "data": [
                    ["Product", "Sales", "Revenue", "Growth"],
                    ["Product A", "100", "$10,000", "15%"],
                    ["Product B", "150", "$15,000", "20%"],
                    ["Product C", "200", "$20,000", "25%"],
                ],
                "metadata": {
                    "page_number": 1,
                    "title": "Sales Report Q4",
                },
            }
        ]

        chunks = await chunking_service._chunk_table_content(document_id, tables)

        assert len(chunks) > 0, "Should create table chunks"

        for chunk in chunks:
            assert chunk["document_id"] == document_id
            assert chunk["chunk_type"] == "table"
            assert "table_data" in chunk
            assert "table_metadata" in chunk

            # Verify table data is preserved
            table_data = chunk["table_data"]
            assert len(table_data) > 0, "Table data should not be empty"
            assert len(table_data[0]) == 4, "Should preserve column count"

            # Verify text description is meaningful
            text = chunk["text"]
            assert "table" in text.lower(), "Should mention table in description"
            assert "4 rows" in text or "4 columns" in text, "Should mention dimensions"
            assert "Product" in text, "Should mention column headers"

    @pytest.mark.asyncio
    async def test_chunk_chart_content_description_quality(self, chunking_service):
        """Test that chart chunking creates meaningful descriptions."""
        document_id = "test-doc-123"
        charts = [
            {
                "data": {
                    "labels": ["Jan", "Feb", "Mar", "Apr"],
                    "values": [100, 120, 140, 160],
                },
                "metadata": {
                    "page_number": 1,
                    "chart_type": "line",
                    "title": "Monthly Growth Trend",
                },
            }
        ]

        chunks = await chunking_service._chunk_chart_content(document_id, charts)

        assert len(chunks) > 0, "Should create chart chunks"

        for chunk in chunks:
            assert chunk["document_id"] == document_id
            assert chunk["chunk_type"] == "chart"
            assert "chart_data" in chunk
            assert "chart_metadata" in chunk

            # Verify chart data is preserved
            chart_data = chunk["chart_data"]
            assert "labels" in chart_data
            assert "values" in chart_data
            assert len(chart_data["labels"]) == 4
            assert len(chart_data["values"]) == 4

            # Verify text description is meaningful
            text = chunk["text"]
            assert "chart" in text.lower(), "Should mention chart in description"
            assert "line" in text.lower(), "Should mention chart type"
            assert "Monthly Growth" in text, "Should include chart title"
            assert "Jan" in text or "Feb" in text, "Should mention some labels"

    @pytest.mark.asyncio
    async def test_chunk_statistics_accuracy(self, chunking_service, sample_document_content):
        """Test that chunk statistics are calculated correctly."""
        document_id = "test-doc-123"

        chunks = await chunking_service.chunk_document_content(document_id, sample_document_content)
        stats = await chunking_service.get_chunk_statistics(chunks)

        # Verify all required fields
        assert "total_chunks" in stats
        assert "total_tokens" in stats
        assert "average_tokens_per_chunk" in stats
        assert "chunk_types" in stats
        assert "chunking_parameters" in stats

        # Verify calculations are correct
        assert stats["total_chunks"] == _count_chunks(chunks), "Total chunks count mismatch"

        # Verify token counts are reasonable
        assert stats["total_tokens"] > 0, "Total tokens should be positive"
        assert stats["average_tokens_per_chunk"] > 0, "Average tokens should be positive"

        # Verify chunk type breakdown
        chunk_types = stats["chunk_types"]
        assert "text" in chunk_types
        assert "table" in chunk_types
        assert "chart" in chunk_types
        assert chunk_types["text"] == len(chunks["text_chunks"])
        assert chunk_types["table"] == len(chunks["table_chunks"])
        assert chunk_types["chart"] == len(chunks["chart_chunks"])

    @pytest.mark.asyncio
    async def test_chunking_with_empty_content(self, chunking_service):
        """Test chunking behavior with empty or minimal content."""
        document_id = "test-doc-123"

        # Test with minimal text
        minimal_content = {
            "text_content": [{"text": "Short text.", "page_number": 1}],
            "tables": [],
            "charts": [],
        }

        chunks = await chunking_service.chunk_document_content(document_id, minimal_content)

        # Should still create structure even with minimal content
        assert "text_chunks" in chunks
        assert "table_chunks" in chunks
        assert "chart_chunks" in chunks
        assert "metadata" in chunks

        # Should have at least one text chunk even for short text
        assert len(chunks["text_chunks"]) >= 1

        # Test with completely empty content
        empty_content = {
            "text_content": [],
            "tables": [],
            "charts": [],
        }

        chunks = await chunking_service.chunk_document_content(document_id, empty_content)

        # Should handle empty content gracefully
        assert len(chunks["text_chunks"]) == 0
        assert len(chunks["table_chunks"]) == 0
        assert len(chunks["chart_chunks"]) == 0


class TestVectorService:
    """Test cases for vector service functionality with real validation."""

    @pytest.fixture
    def vector_service(self):
        """Create a vector service instance."""
        return VectorService()

    @pytest.fixture
    def sample_chunks(self):
        """Sample chunks for testing (one text, one table and one chart chunk)."""
        return {
            "text_chunks": [
                {
                    "id": "doc123_text_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "text",
                    "chunk_index": 0,
                    "text": "This is a sample text chunk for testing vector operations.",
                    "token_count": 12,
                    "page_numbers": [1],
                    "metadata": {
                        "content_type": "text",
                        "created_at": "2024-01-01T00:00:00Z",
                    },
                }
            ],
            "table_chunks": [
                {
                    "id": "doc123_table_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "table",
                    "chunk_index": 0,
                    "text": "Table with 3 rows and 3 columns. Columns: Product, Sales, Revenue",
                    "token_count": 15,
                    "page_numbers": [1],
                    "table_data": [["Product", "Sales"], ["A", "100"]],
                    "table_metadata": {"page_number": 1},
                    "metadata": {
                        "content_type": "table",
                        "created_at": "2024-01-01T00:00:00Z",
                    },
                }
            ],
            "chart_chunks": [
                {
                    "id": "doc123_chart_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "chart",
                    "chunk_index": 0,
                    "text": "Chart (bar): Monthly Revenue. Shows Jan, Feb, Mar with values 100, 120, 140",
                    "token_count": 20,
                    "page_numbers": [1],
                    "chart_data": {"labels": ["Jan", "Feb"], "values": [100, 120]},
                    "chart_metadata": {"chart_type": "bar"},
                    "metadata": {
                        "content_type": "chart",
                        "created_at": "2024-01-01T00:00:00Z",
                    },
                }
            ],
        }

    @pytest.mark.asyncio
    async def test_embedding_generation_quality(self, vector_service):
        """Test that embedding generation produces meaningful vectors."""
        test_texts = [
            "This is a test text for embedding generation.",
            "This is a different test text with different content.",
            "This is a third test text that should produce different embeddings.",
        ]

        embeddings = []
        for text in test_texts:
            embedding = await vector_service.generate_embedding(text)
            assert embedding is not None, f"Embedding should not be None for: {text}"
            assert len(embedding) in SUPPORTED_EMBEDDING_DIMENSIONS, f"Embedding dimension should be 1024 or 384, got {len(embedding)}"
            assert all(isinstance(x, float) for x in embedding), "All embedding values should be floats"
            embeddings.append(embedding)

        # Test that different texts produce different embeddings
        # (This is a basic test - in practice, embeddings should be semantically different)
        assert embeddings[0] != embeddings[1], "Different texts should produce different embeddings"
        assert embeddings[1] != embeddings[2], "Different texts should produce different embeddings"

    @pytest.mark.asyncio
    async def test_batch_embedding_consistency(self, vector_service):
        """Test that batch embeddings are consistent with individual embeddings."""
        texts = [
            "First test text for batch embedding.",
            "Second test text for batch embedding.",
            "Third test text for batch embedding.",
        ]

        # Generate individual embeddings (sequentially, one call per text)
        individual_embeddings = [await vector_service.generate_embedding(text) for text in texts]

        # Generate batch embeddings
        batch_embeddings = await vector_service.generate_batch_embeddings(texts)

        assert len(batch_embeddings) == len(texts), "Batch should return same number of embeddings"

        # Verify each embedding has correct dimension
        for i, embedding in enumerate(batch_embeddings):
            assert embedding is not None, f"Batch embedding {i} should not be None"
            assert len(embedding) in SUPPORTED_EMBEDDING_DIMENSIONS, f"Batch embedding {i} wrong dimension"
            assert all(isinstance(x, float) for x in embedding), f"Batch embedding {i} should contain floats"

        # Verify the batch path agrees with the single-text path. Only the
        # dimension is compared: exact values may differ slightly between
        # batched and unbatched inference.
        for i, (single, batched) in enumerate(zip(individual_embeddings, batch_embeddings)):
            assert single is not None, f"Individual embedding {i} should not be None"
            assert len(single) == len(batched), f"Embedding {i} dimension differs between individual and batch generation"

    @pytest.mark.asyncio
    async def test_add_document_vectors_data_integrity(self, vector_service, sample_chunks):
        """Test that adding document vectors preserves data integrity."""
        tenant_id = "test-tenant-123"
        document_id = "doc123"

        # Mock the client and embedding generation
        with patch.object(vector_service, "client") as mock_client, \
                patch.object(vector_service, "generate_batch_embeddings", new_callable=AsyncMock) as mock_embeddings:

            # NOTE(review): this only matters if the service *calls*
            # ``self.client``; attribute access goes to ``mock_client`` itself.
            mock_client.return_value = Mock()
            mock_embeddings.return_value = [
                _mock_embedding(0.1, 0.2, 0.3),  # exactly 1024 dimensions each
                _mock_embedding(0.4, 0.5, 0.6),
                _mock_embedding(0.7, 0.8, 0.9),
            ]

            success = await vector_service.add_document_vectors(tenant_id, document_id, sample_chunks)

            assert success is True, "Should return True on success"

            # Verify that the correct number of embeddings were requested
            # (one for each chunk)
            total_chunks = _count_chunks(sample_chunks)
            assert mock_embeddings.call_count == 1, "Should call batch embeddings once"

            # Verify the call arguments
            requested_texts = mock_embeddings.call_args[0][0]  # First argument (texts)
            assert len(requested_texts) == total_chunks, "Should request embeddings for all chunks"

    @pytest.mark.asyncio
    async def test_search_similar_result_quality(self, vector_service):
        """Test that search returns meaningful results with proper structure."""
        tenant_id = "test-tenant-123"
        query = "test query for search"

        # Mock the client and embedding generation
        with patch.object(vector_service, "client") as mock_client, \
                patch.object(vector_service, "generate_embedding", new_callable=AsyncMock) as mock_embedding:

            mock_client.return_value = Mock()
            mock_embedding.return_value = _mock_embedding(0.1, 0.2, 0.3)  # exactly 1024 dimensions

            # Mock search results with realistic data
            mock_search_result = [
                Mock(
                    id="result1",
                    score=0.85,
                    payload={
                        "text": "This is a search result that matches the query",
                        "document_id": "doc123",
                        "chunk_type": "text",
                        "token_count": 10,
                        "page_numbers": [1],
                        "metadata": {"content_type": "text"},
                    },
                ),
                Mock(
                    id="result2",
                    score=0.75,
                    payload={
                        "text": "Another search result with lower relevance",
                        "document_id": "doc124",
                        "chunk_type": "table",
                        "token_count": 15,
                        "page_numbers": [2],
                        "metadata": {"content_type": "table"},
                    },
                ),
            ]

            mock_client.return_value.search.return_value = mock_search_result

            # Mock the collection name generation
            with patch.object(vector_service, "_get_collection_name", return_value="test_collection"):
                # Install the configured stub as the client; the enclosing
                # patch restores the original attribute on exit.
                vector_service.client = mock_client.return_value

                results = await vector_service.search_similar(tenant_id, query, limit=5)

                assert len(results) == 2, "Should return all search results"

                # Verify result structure and content
                for i, result in enumerate(results):
                    assert "id" in result, f"Result {i} missing id"
                    assert "score" in result, f"Result {i} missing score"
                    assert "text" in result, f"Result {i} missing text"
                    assert "document_id" in result, f"Result {i} missing document_id"
                    assert "chunk_type" in result, f"Result {i} missing chunk_type"
                    assert "token_count" in result, f"Result {i} missing token_count"
                    assert "page_numbers" in result, f"Result {i} missing page_numbers"
                    assert "metadata" in result, f"Result {i} missing metadata"

                    # Verify score is reasonable
                    assert 0 <= result["score"] <= 1, f"Result {i} score should be between 0 and 1"

                    # Verify text is meaningful
                    assert len(result["text"]) > 0, f"Result {i} text should not be empty"

                    # Verify chunk type is valid
                    assert result["chunk_type"] in ["text", "table", "chart"], f"Result {i} invalid chunk type"

                # Verify results are sorted by score (descending)
                scores = [result["score"] for result in results]
                assert scores == sorted(scores, reverse=True), "Results should be sorted by score"

    @pytest.mark.asyncio
    async def test_search_structured_data_filtering(self, vector_service):
        """Test that structured data search properly filters by data type."""
        tenant_id = "test-tenant-123"
        query = "table data query"
        data_type = "table"

        # Mock the search_similar method to verify it's called with correct filters
        with patch.object(vector_service, "search_similar", new_callable=AsyncMock) as mock_search:
            mock_search.return_value = [
                {
                    "id": "table_result",
                    "score": 0.9,
                    "text": "Table with sales data",
                    "document_id": "doc123",
                    "chunk_type": "table",
                }
            ]

            results = await vector_service.search_structured_data(tenant_id, query, data_type)

            assert len(results) > 0, "Should return results"
            assert results[0]["chunk_type"] == "table", "Should only return table results"

            # Verify search_similar was called with correct chunk_types filter
            mock_search.assert_called_once()
            call_kwargs = mock_search.call_args[1]  # Keyword arguments
            assert "chunk_types" in call_kwargs, "Should pass chunk_types filter"
            assert call_kwargs["chunk_types"] == ["table"], "Should filter for table chunks only"

    @pytest.mark.asyncio
    async def test_hybrid_search_combination_logic(self, vector_service):
        """Test that hybrid search properly combines semantic and keyword results."""
        tenant_id = "test-tenant-123"
        query = "hybrid search query"

        # Mock the search methods
        with patch.object(vector_service, "search_similar", new_callable=AsyncMock) as mock_semantic, \
                patch.object(vector_service, "_keyword_search", new_callable=AsyncMock) as mock_keyword, \
                patch.object(vector_service, "_combine_search_results", new_callable=AsyncMock) as mock_combine:

            mock_semantic.return_value = [
                {"id": "semantic1", "score": 0.8, "text": "Semantic result"}
            ]
            mock_keyword.return_value = [
                {"id": "keyword1", "score": 0.7, "text": "Keyword result"}
            ]
            mock_combine.return_value = [
                {"id": "combined1", "score": 0.75, "text": "Combined result"}
            ]

            results = await vector_service.hybrid_search(tenant_id, query, limit=5)

            assert len(results) > 0, "Should return combined results"
            assert mock_semantic.called, "Should call semantic search"
            assert mock_keyword.called, "Should call keyword search"
            assert mock_combine.called, "Should call result combination"

            # Verify the combination was called with correct parameters
            combine_call_args = mock_combine.call_args[0]
            assert len(combine_call_args) == 4, "Should pass 4 arguments to combine"
            assert combine_call_args[0] == mock_semantic.return_value, "Should pass semantic results"
            assert combine_call_args[1] == mock_keyword.return_value, "Should pass keyword results"
            assert combine_call_args[2] == 0.7, "Should pass semantic weight"
            assert combine_call_args[3] == 0.3, "Should pass keyword weight"

    @pytest.mark.asyncio
    async def test_performance_metrics_accuracy(self, vector_service):
        """Test that performance metrics are calculated correctly."""
        tenant_id = "test-tenant-123"

        # Mock the client with realistic data
        with patch.object(vector_service, "client") as mock_client:
            mock_client.return_value = Mock()

            # Mock collection info
            mock_info = Mock()
            mock_info.segments_count = 4
            mock_info.status = "green"
            mock_info.config.params.vectors.size = 1024
            mock_info.config.params.vectors.distance = "cosine"

            # Mock count
            mock_count = Mock()
            mock_count.count = 1000

            # NOTE(review): these stubs hang off ``mock_client.return_value``.
            # If the service calls ``self.client.get_collection(...)``
            # directly, it will receive auto-generated mocks rather than
            # ``mock_info`` / ``mock_count`` - confirm against VectorService.
            mock_client.return_value.get_collection.return_value = mock_info
            mock_client.return_value.count.return_value = mock_count

            metrics = await vector_service.get_performance_metrics(tenant_id)

            # Verify all required fields
            assert "tenant_id" in metrics
            assert "timestamp" in metrics
            assert "collections" in metrics
            assert "embedding_model" in metrics
            assert "embedding_dimension" in metrics

            # Verify values are correct
            assert metrics["tenant_id"] == tenant_id
            assert metrics["embedding_model"] == settings.EMBEDDING_MODEL
            assert metrics["embedding_dimension"] == settings.EMBEDDING_DIMENSION

            # Verify collections data
            collections = metrics["collections"]
            assert "documents" in collections
            assert "tables" in collections
            assert "charts" in collections

    @pytest.mark.asyncio
    async def test_health_check_comprehensive(self, vector_service):
        """Test that health check validates all critical components."""
        # Mock the client and embedding generation
        with patch.object(vector_service, "generate_embedding", new_callable=AsyncMock) as mock_embedding:
            # Create a mock client
            mock_client_instance = Mock()
            mock_client_instance.get_collections.return_value = Mock()
            vector_service.client = mock_client_instance

            mock_embedding.return_value = _mock_embedding(0.1, 0.2, 0.3)

            is_healthy = await vector_service.health_check()

            assert is_healthy is True, "Should return True when all components are healthy"

            # Verify that all health checks were performed
            mock_client_instance.get_collections.assert_called_once()
            mock_embedding.assert_called_once()


class TestIntegration:
    """Integration tests for Week 3 functionality with real end-to-end validation."""

    @pytest.mark.asyncio
    async def test_end_to_end_document_processing_pipeline(self, mock_tenant):
        """Test the complete document processing pipeline from chunking to vector indexing."""
        chunking_service = DocumentChunkingService(mock_tenant)
        vector_service = VectorService()

        # Create realistic document content
        content = {
            "text_content": [
                {
                    "text": "This is a comprehensive document for testing the complete pipeline. " * 50,
                    "page_number": 1,
                }
            ],
            "tables": [
                {
                    "data": [
                        ["Metric", "Value", "Change"],
                        ["Revenue", "$1M", "+15%"],
                        ["Users", "10K", "+25%"],
                    ],
                    "metadata": {
                        "page_number": 1,
                        "title": "Performance Metrics",
                    },
                }
            ],
            "charts": [
                {
                    "data": {
                        "labels": ["Jan", "Feb", "Mar"],
                        "values": [100, 120, 140],
                    },
                    "metadata": {
                        "page_number": 1,
                        "chart_type": "line",
                        "title": "Growth Trend",
                    },
                }
            ],
        }

        # Test chunking
        chunks = await chunking_service.chunk_document_content("test-doc", content)

        assert "text_chunks" in chunks, "Should have text chunks"
        assert "table_chunks" in chunks, "Should have table chunks"
        assert "chart_chunks" in chunks, "Should have chart chunks"
        assert len(chunks["text_chunks"]) > 0, "Should create text chunks"
        assert len(chunks["table_chunks"]) > 0, "Should create table chunks"
        assert len(chunks["chart_chunks"]) > 0, "Should create chart chunks"

        # Test statistics
        stats = await chunking_service.get_chunk_statistics(chunks)
        assert stats["total_chunks"] > 0, "Should have total chunks"
        assert stats["total_tokens"] > 0, "Should have total tokens"

        # Test vector service integration (with mocking)
        with patch.object(vector_service, "client") as mock_client, \
                patch.object(vector_service, "generate_batch_embeddings", new_callable=AsyncMock) as mock_embeddings:

            mock_client.return_value = Mock()
            total_chunks = _count_chunks(chunks)
            # One independent vector per chunk; ``[vec] * n`` would alias a
            # single list object across every entry.
            mock_embeddings.return_value = [_mock_embedding(0.1, 0.2, 0.3) for _ in range(total_chunks)]

            success = await vector_service.add_document_vectors(
                str(mock_tenant.id),
                "test-doc",
                chunks,
            )

            assert success is True, "Vector indexing should succeed"
            assert mock_embeddings.called, "Should generate embeddings for all chunks"

    @pytest.mark.asyncio
    async def test_error_handling_and_edge_cases(self, mock_tenant):
        """Test error handling and edge cases in the pipeline."""
        chunking_service = DocumentChunkingService(mock_tenant)
        vector_service = VectorService()

        # Test with malformed content
        malformed_content = {
            "text_content": [{"text": "", "page_number": 1}],  # Empty text
            "tables": [{"data": [], "metadata": {}}],  # Empty table
            "charts": [{"data": {}, "metadata": {}}],  # Empty chart
        }

        # Should handle gracefully
        chunks = await chunking_service.chunk_document_content("test-doc", malformed_content)
        assert "text_chunks" in chunks, "Should handle empty text"
        assert "table_chunks" in chunks, "Should handle empty tables"
        assert "chart_chunks" in chunks, "Should handle empty charts"

        # Test vector service with invalid data
        vector_service.client = None  # Simulate connection failure

        success = await vector_service.add_document_vectors(
            str(mock_tenant.id),
            "test-doc",
            chunks,
        )

        assert success is False, "Should return False on connection failure"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])