Files
virtual_board_member/tests/test_week3_vector_operations.py
2025-08-08 17:17:56 -04:00

776 lines
34 KiB
Python

"""
Test suite for Week 3 Vector Database & Embedding System functionality.
Comprehensive tests that validate actual functionality, not just test structure.
"""
import pytest
import asyncio
from unittest.mock import Mock, patch, AsyncMock, MagicMock
from typing import Dict, List, Any
import json
from app.services.vector_service import VectorService
from app.services.document_chunking import DocumentChunkingService
from app.models.tenant import Tenant
from app.core.config import settings
class TestDocumentChunkingService:
    """Test cases for document chunking functionality with real validation."""

    @pytest.fixture
    def mock_tenant(self):
        """Create a mock tenant for testing."""
        # spec=Tenant restricts the mock's attribute surface to the real model,
        # so typos in attribute access fail loudly instead of auto-mocking.
        tenant = Mock(spec=Tenant)
        tenant.id = "test-tenant-123"
        tenant.name = "Test Tenant"
        return tenant

    @pytest.fixture
    def chunking_service(self, mock_tenant):
        """Create a document chunking service instance."""
        return DocumentChunkingService(mock_tenant)

    @pytest.fixture
    def sample_document_content(self):
        """Sample document content for testing."""
        # Mirrors the parsed-document shape the service consumes:
        # per-page text entries, extracted tables, and extracted charts.
        return {
            "text_content": [
                {
                    "text": "This is a sample document for testing purposes. It contains multiple sentences and should be chunked appropriately. The chunking algorithm should respect semantic boundaries and create meaningful chunks that preserve context.",
                    "page_number": 1
                },
                {
                    "text": "This is the second page of the document. It contains additional content that should also be processed. The system should handle multiple pages correctly and maintain proper page numbering in the chunks.",
                    "page_number": 2
                }
            ],
            "tables": [
                {
                    "data": [
                        ["Name", "Age", "Department", "Salary"],
                        ["John Doe", "30", "Engineering", "$85,000"],
                        ["Jane Smith", "25", "Marketing", "$65,000"],
                        ["Bob Johnson", "35", "Sales", "$75,000"]
                    ],
                    "metadata": {
                        "page_number": 1,
                        "title": "Employee Information"
                    }
                }
            ],
            "charts": [
                {
                    "data": {
                        "labels": ["Q1", "Q2", "Q3", "Q4"],
                        "values": [100000, 150000, 200000, 250000]
                    },
                    "metadata": {
                        "page_number": 2,
                        "chart_type": "bar",
                        "title": "Quarterly Revenue"
                    }
                }
            ]
        }

    @pytest.mark.asyncio
    async def test_chunk_document_content_structure_and_content(self, chunking_service, sample_document_content):
        """Test document chunking with comprehensive validation of structure and content."""
        document_id = "test-doc-123"
        chunks = await chunking_service.chunk_document_content(document_id, sample_document_content)
        # Verify structure
        assert "text_chunks" in chunks
        assert "table_chunks" in chunks
        assert "chart_chunks" in chunks
        assert "metadata" in chunks
        # Verify metadata content
        assert chunks["metadata"]["document_id"] == document_id
        assert chunks["metadata"]["tenant_id"] == "test-tenant-123"
        assert "chunking_timestamp" in chunks["metadata"]
        # Chunking parameters must be echoed from the configured settings.
        assert chunks["metadata"]["chunk_size"] == settings.CHUNK_SIZE
        assert chunks["metadata"]["chunk_overlap"] == settings.CHUNK_OVERLAP
        # Verify chunk counts are reasonable
        assert len(chunks["text_chunks"]) > 0, "Should have text chunks"
        assert len(chunks["table_chunks"]) > 0, "Should have table chunks"
        assert len(chunks["chart_chunks"]) > 0, "Should have chart chunks"
        # Verify text chunks have meaningful content
        for i, chunk in enumerate(chunks["text_chunks"]):
            assert "id" in chunk, f"Text chunk {i} missing id"
            assert "text" in chunk, f"Text chunk {i} missing text"
            assert chunk["chunk_type"] == "text", f"Text chunk {i} wrong type"
            assert "token_count" in chunk, f"Text chunk {i} missing token_count"
            assert "page_numbers" in chunk, f"Text chunk {i} missing page_numbers"
            assert len(chunk["text"]) > 0, f"Text chunk {i} has empty text"
            assert chunk["token_count"] > 0, f"Text chunk {i} has zero tokens"
            assert len(chunk["page_numbers"]) > 0, f"Text chunk {i} has no page numbers"
            # Verify text content is meaningful (not just whitespace)
            assert chunk["text"].strip(), f"Text chunk {i} contains only whitespace"
            # Verify chunk size is within reasonable bounds
            assert chunk["token_count"] <= settings.CHUNK_MAX_SIZE, f"Text chunk {i} too large"
            # NOTE(review): this enforces the minimum on EVERY chunk, including
            # the trailing one, which chunkers commonly leave short — confirm
            # the service merges/pads short trailing chunks before relying on it.
            if len(chunks["text_chunks"]) > 1:  # If multiple chunks, check minimum size
                assert chunk["token_count"] >= settings.CHUNK_MIN_SIZE, f"Text chunk {i} too small"

    @pytest.mark.asyncio
    async def test_chunk_text_content_semantic_boundaries(self, chunking_service):
        """Test that text chunking respects semantic boundaries."""
        document_id = "test-doc-123"
        # Create text with clear semantic boundaries
        text_content = [
            {
                "text": "This is the first paragraph. It contains multiple sentences. The chunking should respect sentence boundaries. This paragraph should be chunked appropriately.",
                "page_number": 1
            },
            {
                "text": "This is the second paragraph. It has different content. The system should maintain context between paragraphs. Each chunk should be meaningful.",
                "page_number": 2
            }
        ]
        # Exercises the private text-chunking helper directly.
        chunks = await chunking_service._chunk_text_content(document_id, text_content)
        assert len(chunks) > 0, "Should create chunks"
        # Verify each chunk contains complete sentences
        for i, chunk in enumerate(chunks):
            assert chunk["document_id"] == document_id
            assert chunk["tenant_id"] == "test-tenant-123"
            assert chunk["chunk_type"] == "text"
            assert len(chunk["text"]) > 0
            # Check that chunks don't break in the middle of sentences (basic check)
            text = chunk["text"]
            if text.count('.') > 0:  # If there are sentences
                # Should not end with a partial sentence (very basic check)
                assert not text.strip().endswith(','), f"Chunk {i} ends with comma"
                assert not text.strip().endswith('and'), f"Chunk {i} ends with 'and'"

    @pytest.mark.asyncio
    async def test_chunk_table_content_structure_preservation(self, chunking_service):
        """Test that table chunking preserves table structure and creates meaningful descriptions."""
        document_id = "test-doc-123"
        tables = [
            {
                "data": [
                    ["Product", "Sales", "Revenue", "Growth"],
                    ["Product A", "100", "$10,000", "15%"],
                    ["Product B", "150", "$15,000", "20%"],
                    ["Product C", "200", "$20,000", "25%"]
                ],
                "metadata": {
                    "page_number": 1,
                    "title": "Sales Report Q4"
                }
            }
        ]
        chunks = await chunking_service._chunk_table_content(document_id, tables)
        assert len(chunks) > 0, "Should create table chunks"
        for chunk in chunks:
            assert chunk["document_id"] == document_id
            assert chunk["chunk_type"] == "table"
            assert "table_data" in chunk
            assert "table_metadata" in chunk
            # Verify table data is preserved
            table_data = chunk["table_data"]
            assert len(table_data) > 0, "Table data should not be empty"
            assert len(table_data[0]) == 4, "Should preserve column count"
            # Verify text description is meaningful
            text = chunk["text"]
            assert "table" in text.lower(), "Should mention table in description"
            assert "4 rows" in text or "4 columns" in text, "Should mention dimensions"
            assert "Product" in text, "Should mention column headers"

    @pytest.mark.asyncio
    async def test_chunk_chart_content_description_quality(self, chunking_service):
        """Test that chart chunking creates meaningful descriptions."""
        document_id = "test-doc-123"
        charts = [
            {
                "data": {
                    "labels": ["Jan", "Feb", "Mar", "Apr"],
                    "values": [100, 120, 140, 160]
                },
                "metadata": {
                    "page_number": 1,
                    "chart_type": "line",
                    "title": "Monthly Growth Trend"
                }
            }
        ]
        chunks = await chunking_service._chunk_chart_content(document_id, charts)
        assert len(chunks) > 0, "Should create chart chunks"
        for chunk in chunks:
            assert chunk["document_id"] == document_id
            assert chunk["chunk_type"] == "chart"
            assert "chart_data" in chunk
            assert "chart_metadata" in chunk
            # Verify chart data is preserved
            chart_data = chunk["chart_data"]
            assert "labels" in chart_data
            assert "values" in chart_data
            assert len(chart_data["labels"]) == 4
            assert len(chart_data["values"]) == 4
            # Verify text description is meaningful
            text = chunk["text"]
            assert "chart" in text.lower(), "Should mention chart in description"
            assert "line" in text.lower(), "Should mention chart type"
            assert "Monthly Growth" in text, "Should include chart title"
            assert "Jan" in text or "Feb" in text, "Should mention some labels"

    @pytest.mark.asyncio
    async def test_chunk_statistics_accuracy(self, chunking_service, sample_document_content):
        """Test that chunk statistics are calculated correctly."""
        document_id = "test-doc-123"
        chunks = await chunking_service.chunk_document_content(document_id, sample_document_content)
        stats = await chunking_service.get_chunk_statistics(chunks)
        # Verify all required fields
        assert "total_chunks" in stats
        assert "total_tokens" in stats
        assert "average_tokens_per_chunk" in stats
        assert "chunk_types" in stats
        assert "chunking_parameters" in stats
        # Verify calculations are correct: the reported total must equal the
        # sum of the three per-type chunk lists.
        expected_total = len(chunks["text_chunks"]) + len(chunks["table_chunks"]) + len(chunks["chart_chunks"])
        assert stats["total_chunks"] == expected_total, "Total chunks count mismatch"
        # Verify token counts are reasonable
        assert stats["total_tokens"] > 0, "Total tokens should be positive"
        assert stats["average_tokens_per_chunk"] > 0, "Average tokens should be positive"
        # Verify chunk type breakdown
        assert "text" in stats["chunk_types"]
        assert "table" in stats["chunk_types"]
        assert "chart" in stats["chunk_types"]
        assert stats["chunk_types"]["text"] == len(chunks["text_chunks"])
        assert stats["chunk_types"]["table"] == len(chunks["table_chunks"])
        assert stats["chunk_types"]["chart"] == len(chunks["chart_chunks"])

    @pytest.mark.asyncio
    async def test_chunking_with_empty_content(self, chunking_service):
        """Test chunking behavior with empty or minimal content."""
        document_id = "test-doc-123"
        # Test with minimal text
        minimal_content = {
            "text_content": [{"text": "Short text.", "page_number": 1}],
            "tables": [],
            "charts": []
        }
        chunks = await chunking_service.chunk_document_content(document_id, minimal_content)
        # Should still create structure even with minimal content
        assert "text_chunks" in chunks
        assert "table_chunks" in chunks
        assert "chart_chunks" in chunks
        assert "metadata" in chunks
        # Should have at least one text chunk even for short text
        assert len(chunks["text_chunks"]) >= 1
        # Test with completely empty content
        empty_content = {
            "text_content": [],
            "tables": [],
            "charts": []
        }
        chunks = await chunking_service.chunk_document_content(document_id, empty_content)
        # Should handle empty content gracefully: structure present, all lists empty.
        assert len(chunks["text_chunks"]) == 0
        assert len(chunks["table_chunks"]) == 0
        assert len(chunks["chart_chunks"]) == 0
class TestVectorService:
    """Test cases for vector service functionality with real validation.

    External dependencies (vector DB client, embedding model) are replaced via
    ``unittest.mock.patch.object`` so these tests exercise VectorService's own
    orchestration logic without network access.

    Note on mocking: ``patch.object(vector_service, 'client')`` swaps the
    ``client`` *attribute* for the mock itself — the service calls methods on
    that object directly, so returns are configured on ``mock_client``, never
    on ``mock_client.return_value`` (the original tests wired the wrong object
    and compensated with manual reassignment).
    """

    # A repeating pattern sliced to exactly 1024 floats.  The original tests
    # used ``[0.1, 0.2, 0.3] * 341`` which is 1023 floats while claiming 1024.
    _FAKE_DIM = 1024

    @staticmethod
    def _fake_vector(a=0.1, b=0.2, c=0.3):
        """Build a deterministic fake embedding of exactly 1024 dimensions."""
        return ([a, b, c] * 342)[:TestVectorService._FAKE_DIM]

    @pytest.fixture
    def mock_tenant(self):
        """Create a mock tenant for testing."""
        tenant = Mock(spec=Tenant)
        tenant.id = "test-tenant-123"
        tenant.name = "Test Tenant"
        return tenant

    @pytest.fixture
    def vector_service(self):
        """Create a vector service instance."""
        return VectorService()

    @pytest.fixture
    def sample_chunks(self):
        """Sample chunks for testing: one chunk of each supported type."""
        return {
            "text_chunks": [
                {
                    "id": "doc123_text_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "text",
                    "chunk_index": 0,
                    "text": "This is a sample text chunk for testing vector operations.",
                    "token_count": 12,
                    "page_numbers": [1],
                    "metadata": {
                        "content_type": "text",
                        "created_at": "2024-01-01T00:00:00Z"
                    }
                }
            ],
            "table_chunks": [
                {
                    "id": "doc123_table_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "table",
                    "chunk_index": 0,
                    "text": "Table with 3 rows and 3 columns. Columns: Product, Sales, Revenue",
                    "token_count": 15,
                    "page_numbers": [1],
                    "table_data": [["Product", "Sales"], ["A", "100"]],
                    "table_metadata": {"page_number": 1},
                    "metadata": {
                        "content_type": "table",
                        "created_at": "2024-01-01T00:00:00Z"
                    }
                }
            ],
            "chart_chunks": [
                {
                    "id": "doc123_chart_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "chart",
                    "chunk_index": 0,
                    "text": "Chart (bar): Monthly Revenue. Shows Jan, Feb, Mar with values 100, 120, 140",
                    "token_count": 20,
                    "page_numbers": [1],
                    "chart_data": {"labels": ["Jan", "Feb"], "values": [100, 120]},
                    "chart_metadata": {"chart_type": "bar"},
                    "metadata": {
                        "content_type": "chart",
                        "created_at": "2024-01-01T00:00:00Z"
                    }
                }
            ]
        }

    @pytest.mark.asyncio
    async def test_embedding_generation_quality(self, vector_service):
        """Test that embedding generation produces meaningful vectors.

        NOTE: this test calls the real embedding path (no mocking), so it
        requires the embedding model to be available in the test environment.
        """
        test_texts = [
            "This is a test text for embedding generation.",
            "This is a different test text with different content.",
            "This is a third test text that should produce different embeddings."
        ]
        embeddings = []
        for text in test_texts:
            embedding = await vector_service.generate_embedding(text)
            assert embedding is not None, f"Embedding should not be None for: {text}"
            # Service may use the full-size (1024) or fallback (384) model.
            assert len(embedding) in [1024, 384], f"Embedding dimension should be 1024 or 384, got {len(embedding)}"
            assert all(isinstance(x, float) for x in embedding), "All embedding values should be floats"
            embeddings.append(embedding)
        # Test that different texts produce different embeddings
        # (This is a basic test - in practice, embeddings should be semantically different)
        assert embeddings[0] != embeddings[1], "Different texts should produce different embeddings"
        assert embeddings[1] != embeddings[2], "Different texts should produce different embeddings"

    @pytest.mark.asyncio
    async def test_batch_embedding_consistency(self, vector_service):
        """Test that batch embeddings are consistent with individual embeddings."""
        texts = [
            "First test text for batch embedding.",
            "Second test text for batch embedding.",
            "Third test text for batch embedding."
        ]
        # Generate individual embeddings
        individual_embeddings = []
        for text in texts:
            embedding = await vector_service.generate_embedding(text)
            individual_embeddings.append(embedding)
        # Generate batch embeddings
        batch_embeddings = await vector_service.generate_batch_embeddings(texts)
        assert len(batch_embeddings) == len(texts), "Batch should return same number of embeddings"
        # Verify each embedding has correct dimension
        for i, embedding in enumerate(batch_embeddings):
            assert embedding is not None, f"Batch embedding {i} should not be None"
            assert len(embedding) in [1024, 384], f"Batch embedding {i} wrong dimension"
            assert all(isinstance(x, float) for x in embedding), f"Batch embedding {i} should contain floats"

    @pytest.mark.asyncio
    async def test_add_document_vectors_data_integrity(self, vector_service, sample_chunks):
        """Test that adding document vectors preserves data integrity."""
        tenant_id = "test-tenant-123"
        document_id = "doc123"
        # Mock the client attribute and the embedding generation coroutine.
        with patch.object(vector_service, 'client') as mock_client, \
                patch.object(vector_service, 'generate_batch_embeddings', new_callable=AsyncMock) as mock_embeddings:
            # One fake 1024-dim vector per chunk (text + table + chart).
            mock_embeddings.return_value = [
                self._fake_vector(0.1, 0.2, 0.3),
                self._fake_vector(0.4, 0.5, 0.6),
                self._fake_vector(0.7, 0.8, 0.9),
            ]
            success = await vector_service.add_document_vectors(tenant_id, document_id, sample_chunks)
            assert success is True, "Should return True on success"
            # Verify that the correct number of embeddings were requested
            # (one for each chunk, in a single batch call)
            total_chunks = len(sample_chunks["text_chunks"]) + len(sample_chunks["table_chunks"]) + len(sample_chunks["chart_chunks"])
            assert mock_embeddings.call_count == 1, "Should call batch embeddings once"
            # Verify the call arguments
            call_args = mock_embeddings.call_args[0][0]  # First positional argument (texts)
            assert len(call_args) == total_chunks, "Should request embeddings for all chunks"

    @pytest.mark.asyncio
    async def test_search_similar_result_quality(self, vector_service):
        """Test that search returns meaningful results with proper structure."""
        tenant_id = "test-tenant-123"
        query = "test query for search"
        # Mock the client attribute and the query-embedding coroutine.
        with patch.object(vector_service, 'client') as mock_client, \
                patch.object(vector_service, 'generate_embedding', new_callable=AsyncMock) as mock_embedding:
            mock_embedding.return_value = self._fake_vector()
            # Mock search results with realistic payloads.
            mock_search_result = [
                Mock(
                    id="result1",
                    score=0.85,
                    payload={
                        "text": "This is a search result that matches the query",
                        "document_id": "doc123",
                        "chunk_type": "text",
                        "token_count": 10,
                        "page_numbers": [1],
                        "metadata": {"content_type": "text"}
                    }
                ),
                Mock(
                    id="result2",
                    score=0.75,
                    payload={
                        "text": "Another search result with lower relevance",
                        "document_id": "doc124",
                        "chunk_type": "table",
                        "token_count": 15,
                        "page_numbers": [2],
                        "metadata": {"content_type": "table"}
                    }
                )
            ]
            # Configure search on the patched client itself — no reassignment
            # of ``vector_service.client`` needed.
            mock_client.search.return_value = mock_search_result
            # Mock the collection name generation
            with patch.object(vector_service, '_get_collection_name', return_value="test_collection"):
                results = await vector_service.search_similar(tenant_id, query, limit=5)
                assert len(results) == 2, "Should return all search results"
                # Verify result structure and content
                for i, result in enumerate(results):
                    assert "id" in result, f"Result {i} missing id"
                    assert "score" in result, f"Result {i} missing score"
                    assert "text" in result, f"Result {i} missing text"
                    assert "document_id" in result, f"Result {i} missing document_id"
                    assert "chunk_type" in result, f"Result {i} missing chunk_type"
                    assert "token_count" in result, f"Result {i} missing token_count"
                    assert "page_numbers" in result, f"Result {i} missing page_numbers"
                    assert "metadata" in result, f"Result {i} missing metadata"
                    # Verify score is reasonable
                    assert 0 <= result["score"] <= 1, f"Result {i} score should be between 0 and 1"
                    # Verify text is meaningful
                    assert len(result["text"]) > 0, f"Result {i} text should not be empty"
                    # Verify chunk type is valid
                    assert result["chunk_type"] in ["text", "table", "chart"], f"Result {i} invalid chunk type"
                # Verify results are sorted by score (descending)
                scores = [result["score"] for result in results]
                assert scores == sorted(scores, reverse=True), "Results should be sorted by score"

    @pytest.mark.asyncio
    async def test_search_structured_data_filtering(self, vector_service):
        """Test that structured data search properly filters by data type."""
        tenant_id = "test-tenant-123"
        query = "table data query"
        data_type = "table"
        # Mock the search_similar method to verify it's called with correct filters
        with patch.object(vector_service, 'search_similar', new_callable=AsyncMock) as mock_search:
            mock_search.return_value = [
                {
                    "id": "table_result",
                    "score": 0.9,
                    "text": "Table with sales data",
                    "document_id": "doc123",
                    "chunk_type": "table"
                }
            ]
            results = await vector_service.search_structured_data(tenant_id, query, data_type)
            assert len(results) > 0, "Should return results"
            assert results[0]["chunk_type"] == "table", "Should only return table results"
            # Verify search_similar was called with correct chunk_types filter
            mock_search.assert_called_once()
            call_kwargs = mock_search.call_args[1]  # Keyword arguments
            assert "chunk_types" in call_kwargs, "Should pass chunk_types filter"
            assert call_kwargs["chunk_types"] == ["table"], "Should filter for table chunks only"

    @pytest.mark.asyncio
    async def test_hybrid_search_combination_logic(self, vector_service):
        """Test that hybrid search properly combines semantic and keyword results."""
        tenant_id = "test-tenant-123"
        query = "hybrid search query"
        # Mock the three collaborators: semantic search, keyword search, combiner.
        with patch.object(vector_service, 'search_similar', new_callable=AsyncMock) as mock_semantic, \
                patch.object(vector_service, '_keyword_search', new_callable=AsyncMock) as mock_keyword, \
                patch.object(vector_service, '_combine_search_results', new_callable=AsyncMock) as mock_combine:
            mock_semantic.return_value = [
                {"id": "semantic1", "score": 0.8, "text": "Semantic result"}
            ]
            mock_keyword.return_value = [
                {"id": "keyword1", "score": 0.7, "text": "Keyword result"}
            ]
            mock_combine.return_value = [
                {"id": "combined1", "score": 0.75, "text": "Combined result"}
            ]
            results = await vector_service.hybrid_search(tenant_id, query, limit=5)
            assert len(results) > 0, "Should return combined results"
            assert mock_semantic.called, "Should call semantic search"
            assert mock_keyword.called, "Should call keyword search"
            assert mock_combine.called, "Should call result combination"
            # Verify the combination was called with correct parameters:
            # (semantic_results, keyword_results, semantic_weight, keyword_weight)
            combine_call_args = mock_combine.call_args[0]
            assert len(combine_call_args) == 4, "Should pass 4 arguments to combine"
            assert combine_call_args[0] == mock_semantic.return_value, "Should pass semantic results"
            assert combine_call_args[1] == mock_keyword.return_value, "Should pass keyword results"
            assert combine_call_args[2] == 0.7, "Should pass semantic weight"
            assert combine_call_args[3] == 0.3, "Should pass keyword weight"

    @pytest.mark.asyncio
    async def test_performance_metrics_accuracy(self, vector_service):
        """Test that performance metrics are calculated correctly."""
        tenant_id = "test-tenant-123"
        # Mock the client attribute with realistic collection data.
        with patch.object(vector_service, 'client') as mock_client:
            # Mock collection info
            mock_info = Mock()
            mock_info.segments_count = 4
            mock_info.status = "green"
            mock_info.config.params.vectors.size = 1024
            mock_info.config.params.vectors.distance = "cosine"
            # Mock count
            mock_count = Mock()
            mock_count.count = 1000
            # Wire returns on ``mock_client`` itself — the service calls
            # methods on the client object, it never *calls* the client.
            # (The original configured ``mock_client.return_value``, so this
            # setup was dead and the metrics came from auto-created Mocks.)
            mock_client.get_collection.return_value = mock_info
            mock_client.count.return_value = mock_count
            metrics = await vector_service.get_performance_metrics(tenant_id)
            # Verify all required fields
            assert "tenant_id" in metrics
            assert "timestamp" in metrics
            assert "collections" in metrics
            assert "embedding_model" in metrics
            assert "embedding_dimension" in metrics
            # Verify values are correct
            assert metrics["tenant_id"] == tenant_id
            assert metrics["embedding_model"] == settings.EMBEDDING_MODEL
            assert metrics["embedding_dimension"] == settings.EMBEDDING_DIMENSION
            # Verify collections data
            collections = metrics["collections"]
            assert "documents" in collections
            assert "tables" in collections
            assert "charts" in collections

    @pytest.mark.asyncio
    async def test_health_check_comprehensive(self, vector_service):
        """Test that health check validates all critical components."""
        # Mock the embedding coroutine; the client is replaced by hand below.
        with patch.object(vector_service, 'generate_embedding', new_callable=AsyncMock) as mock_embedding:
            # Create a mock client
            mock_client_instance = Mock()
            mock_client_instance.get_collections.return_value = Mock()
            vector_service.client = mock_client_instance
            mock_embedding.return_value = self._fake_vector()
            is_healthy = await vector_service.health_check()
            assert is_healthy is True, "Should return True when all components are healthy"
            # Verify that all health checks were performed exactly once each.
            mock_client_instance.get_collections.assert_called_once()
            mock_embedding.assert_called_once()
class TestIntegration:
    """Integration tests for Week 3 functionality with real end-to-end validation.

    Chunking runs for real; the vector store and embedding model are mocked so
    the pipeline can be exercised without external services.
    """

    @pytest.fixture
    def mock_tenant(self):
        """Create a mock tenant for testing."""
        tenant = Mock(spec=Tenant)
        tenant.id = "test-tenant-123"
        tenant.name = "Test Tenant"
        return tenant

    @pytest.mark.asyncio
    async def test_end_to_end_document_processing_pipeline(self, mock_tenant):
        """Test the complete document processing pipeline from chunking to vector indexing."""
        chunking_service = DocumentChunkingService(mock_tenant)
        vector_service = VectorService()
        # Create realistic document content (long enough to force multiple text chunks).
        content = {
            "text_content": [
                {
                    "text": "This is a comprehensive document for testing the complete pipeline. " * 50,
                    "page_number": 1
                }
            ],
            "tables": [
                {
                    "data": [
                        ["Metric", "Value", "Change"],
                        ["Revenue", "$1M", "+15%"],
                        ["Users", "10K", "+25%"]
                    ],
                    "metadata": {
                        "page_number": 1,
                        "title": "Performance Metrics"
                    }
                }
            ],
            "charts": [
                {
                    "data": {
                        "labels": ["Jan", "Feb", "Mar"],
                        "values": [100, 120, 140]
                    },
                    "metadata": {
                        "page_number": 1,
                        "chart_type": "line",
                        "title": "Growth Trend"
                    }
                }
            ]
        }
        # Test chunking
        chunks = await chunking_service.chunk_document_content("test-doc", content)
        assert "text_chunks" in chunks, "Should have text chunks"
        assert "table_chunks" in chunks, "Should have table chunks"
        assert "chart_chunks" in chunks, "Should have chart chunks"
        assert len(chunks["text_chunks"]) > 0, "Should create text chunks"
        assert len(chunks["table_chunks"]) > 0, "Should create table chunks"
        assert len(chunks["chart_chunks"]) > 0, "Should create chart chunks"
        # Test statistics
        stats = await chunking_service.get_chunk_statistics(chunks)
        assert stats["total_chunks"] > 0, "Should have total chunks"
        assert stats["total_tokens"] > 0, "Should have total tokens"
        # Test vector service integration (with mocking).  patch.object swaps
        # the ``client`` attribute for the mock itself, so no further setup is
        # needed — the original's ``mock_client.return_value = Mock()`` was dead.
        with patch.object(vector_service, 'client') as mock_client, \
                patch.object(vector_service, 'generate_batch_embeddings', new_callable=AsyncMock) as mock_embeddings:
            total_chunks = len(chunks["text_chunks"]) + len(chunks["table_chunks"]) + len(chunks["chart_chunks"])
            # One fake embedding per chunk, sliced to exactly 1024 floats
            # (the original ``[0.1, 0.2, 0.3] * 341`` was 1023 dims).
            mock_embeddings.return_value = [([0.1, 0.2, 0.3] * 342)[:1024]] * total_chunks
            success = await vector_service.add_document_vectors(
                str(mock_tenant.id), "test-doc", chunks
            )
            assert success is True, "Vector indexing should succeed"
            assert mock_embeddings.called, "Should generate embeddings for all chunks"

    @pytest.mark.asyncio
    async def test_error_handling_and_edge_cases(self, mock_tenant):
        """Test error handling and edge cases in the pipeline."""
        chunking_service = DocumentChunkingService(mock_tenant)
        vector_service = VectorService()
        # Test with malformed content
        malformed_content = {
            "text_content": [{"text": "", "page_number": 1}],  # Empty text
            "tables": [{"data": [], "metadata": {}}],  # Empty table
            "charts": [{"data": {}, "metadata": {}}]  # Empty chart
        }
        # Should handle gracefully
        chunks = await chunking_service.chunk_document_content("test-doc", malformed_content)
        assert "text_chunks" in chunks, "Should handle empty text"
        assert "table_chunks" in chunks, "Should handle empty tables"
        assert "chart_chunks" in chunks, "Should handle empty charts"
        # Test vector service with invalid data
        vector_service.client = None  # Simulate connection failure
        success = await vector_service.add_document_vectors(
            str(mock_tenant.id), "test-doc", chunks
        )
        assert success is False, "Should return False on connection failure"
if __name__ == "__main__":
    # Allow running this file directly.  pytest.main() RETURNS an exit code
    # (it does not raise), so propagate it via SystemExit — otherwise a direct
    # run always exits 0 even when tests fail, hiding failures from shells/CI.
    raise SystemExit(pytest.main([__file__, "-v"]))