Files
virtual_board_member/tests/test_week3_vector_operations.py
2025-08-08 17:17:56 -04:00

776 lines
34 KiB
Python

"""
Test suite for Week 3 Vector Database & Embedding System functionality.
Comprehensive tests that validate actual functionality, not just test structure.
"""
import pytest
import asyncio
from unittest.mock import Mock, patch, AsyncMock, MagicMock
from typing import Dict, List, Any
import json
from app.services.vector_service import VectorService
from app.services.document_chunking import DocumentChunkingService
from app.models.tenant import Tenant
from app.core.config import settings
class TestDocumentChunkingService:
    """Test cases for document chunking functionality with real validation."""

    @pytest.fixture
    def mock_tenant(self):
        """Create a mock tenant for testing."""
        # spec=Tenant restricts the mock's attribute surface to the real model,
        # so typos in attribute access fail loudly instead of auto-mocking.
        tenant = Mock(spec=Tenant)
        tenant.id = "test-tenant-123"
        tenant.name = "Test Tenant"
        return tenant

    @pytest.fixture
    def chunking_service(self, mock_tenant):
        """Create a document chunking service instance."""
        return DocumentChunkingService(mock_tenant)

    @pytest.fixture
    def sample_document_content(self):
        """Sample document content for testing."""
        # Mirrors the parsed-document shape the service consumes:
        # per-page text entries, extracted tables, and extracted charts.
        return {
            "text_content": [
                {
                    "text": "This is a sample document for testing purposes. It contains multiple sentences and should be chunked appropriately. The chunking algorithm should respect semantic boundaries and create meaningful chunks that preserve context.",
                    "page_number": 1
                },
                {
                    "text": "This is the second page of the document. It contains additional content that should also be processed. The system should handle multiple pages correctly and maintain proper page numbering in the chunks.",
                    "page_number": 2
                }
            ],
            "tables": [
                {
                    "data": [
                        ["Name", "Age", "Department", "Salary"],
                        ["John Doe", "30", "Engineering", "$85,000"],
                        ["Jane Smith", "25", "Marketing", "$65,000"],
                        ["Bob Johnson", "35", "Sales", "$75,000"]
                    ],
                    "metadata": {
                        "page_number": 1,
                        "title": "Employee Information"
                    }
                }
            ],
            "charts": [
                {
                    "data": {
                        "labels": ["Q1", "Q2", "Q3", "Q4"],
                        "values": [100000, 150000, 200000, 250000]
                    },
                    "metadata": {
                        "page_number": 2,
                        "chart_type": "bar",
                        "title": "Quarterly Revenue"
                    }
                }
            ]
        }

    @pytest.mark.asyncio
    async def test_chunk_document_content_structure_and_content(self, chunking_service, sample_document_content):
        """Test document chunking with comprehensive validation of structure and content."""
        document_id = "test-doc-123"
        chunks = await chunking_service.chunk_document_content(document_id, sample_document_content)
        # Verify structure
        assert "text_chunks" in chunks
        assert "table_chunks" in chunks
        assert "chart_chunks" in chunks
        assert "metadata" in chunks
        # Verify metadata content
        assert chunks["metadata"]["document_id"] == document_id
        assert chunks["metadata"]["tenant_id"] == "test-tenant-123"
        assert "chunking_timestamp" in chunks["metadata"]
        # Chunking parameters must be echoed from the configured settings.
        assert chunks["metadata"]["chunk_size"] == settings.CHUNK_SIZE
        assert chunks["metadata"]["chunk_overlap"] == settings.CHUNK_OVERLAP
        # Verify chunk counts are reasonable
        assert len(chunks["text_chunks"]) > 0, "Should have text chunks"
        assert len(chunks["table_chunks"]) > 0, "Should have table chunks"
        assert len(chunks["chart_chunks"]) > 0, "Should have chart chunks"
        # Verify text chunks have meaningful content
        for i, chunk in enumerate(chunks["text_chunks"]):
            assert "id" in chunk, f"Text chunk {i} missing id"
            assert "text" in chunk, f"Text chunk {i} missing text"
            assert chunk["chunk_type"] == "text", f"Text chunk {i} wrong type"
            assert "token_count" in chunk, f"Text chunk {i} missing token_count"
            assert "page_numbers" in chunk, f"Text chunk {i} missing page_numbers"
            assert len(chunk["text"]) > 0, f"Text chunk {i} has empty text"
            assert chunk["token_count"] > 0, f"Text chunk {i} has zero tokens"
            assert len(chunk["page_numbers"]) > 0, f"Text chunk {i} has no page numbers"
            # Verify text content is meaningful (not just whitespace)
            assert chunk["text"].strip(), f"Text chunk {i} contains only whitespace"
            # Verify chunk size is within reasonable bounds
            assert chunk["token_count"] <= settings.CHUNK_MAX_SIZE, f"Text chunk {i} too large"
            # NOTE(review): this enforces the minimum on EVERY chunk, including
            # the trailing one, which chunkers commonly leave short — confirm
            # the service merges/pads short trailing chunks before relying on it.
            if len(chunks["text_chunks"]) > 1:  # If multiple chunks, check minimum size
                assert chunk["token_count"] >= settings.CHUNK_MIN_SIZE, f"Text chunk {i} too small"

    @pytest.mark.asyncio
    async def test_chunk_text_content_semantic_boundaries(self, chunking_service):
        """Test that text chunking respects semantic boundaries."""
        document_id = "test-doc-123"
        # Create text with clear semantic boundaries
        text_content = [
            {
                "text": "This is the first paragraph. It contains multiple sentences. The chunking should respect sentence boundaries. This paragraph should be chunked appropriately.",
                "page_number": 1
            },
            {
                "text": "This is the second paragraph. It has different content. The system should maintain context between paragraphs. Each chunk should be meaningful.",
                "page_number": 2
            }
        ]
        # Exercises the private text-chunking helper directly.
        chunks = await chunking_service._chunk_text_content(document_id, text_content)
        assert len(chunks) > 0, "Should create chunks"
        # Verify each chunk contains complete sentences
        for i, chunk in enumerate(chunks):
            assert chunk["document_id"] == document_id
            assert chunk["tenant_id"] == "test-tenant-123"
            assert chunk["chunk_type"] == "text"
            assert len(chunk["text"]) > 0
            # Check that chunks don't break in the middle of sentences (basic check)
            text = chunk["text"]
            if text.count('.') > 0:  # If there are sentences
                # Should not end with a partial sentence (very basic check)
                assert not text.strip().endswith(','), f"Chunk {i} ends with comma"
                assert not text.strip().endswith('and'), f"Chunk {i} ends with 'and'"

    @pytest.mark.asyncio
    async def test_chunk_table_content_structure_preservation(self, chunking_service):
        """Test that table chunking preserves table structure and creates meaningful descriptions."""
        document_id = "test-doc-123"
        tables = [
            {
                "data": [
                    ["Product", "Sales", "Revenue", "Growth"],
                    ["Product A", "100", "$10,000", "15%"],
                    ["Product B", "150", "$15,000", "20%"],
                    ["Product C", "200", "$20,000", "25%"]
                ],
                "metadata": {
                    "page_number": 1,
                    "title": "Sales Report Q4"
                }
            }
        ]
        chunks = await chunking_service._chunk_table_content(document_id, tables)
        assert len(chunks) > 0, "Should create table chunks"
        for chunk in chunks:
            assert chunk["document_id"] == document_id
            assert chunk["chunk_type"] == "table"
            assert "table_data" in chunk
            assert "table_metadata" in chunk
            # Verify table data is preserved
            table_data = chunk["table_data"]
            assert len(table_data) > 0, "Table data should not be empty"
            assert len(table_data[0]) == 4, "Should preserve column count"
            # Verify text description is meaningful
            text = chunk["text"]
            assert "table" in text.lower(), "Should mention table in description"
            assert "4 rows" in text or "4 columns" in text, "Should mention dimensions"
            assert "Product" in text, "Should mention column headers"

    @pytest.mark.asyncio
    async def test_chunk_chart_content_description_quality(self, chunking_service):
        """Test that chart chunking creates meaningful descriptions."""
        document_id = "test-doc-123"
        charts = [
            {
                "data": {
                    "labels": ["Jan", "Feb", "Mar", "Apr"],
                    "values": [100, 120, 140, 160]
                },
                "metadata": {
                    "page_number": 1,
                    "chart_type": "line",
                    "title": "Monthly Growth Trend"
                }
            }
        ]
        chunks = await chunking_service._chunk_chart_content(document_id, charts)
        assert len(chunks) > 0, "Should create chart chunks"
        for chunk in chunks:
            assert chunk["document_id"] == document_id
            assert chunk["chunk_type"] == "chart"
            assert "chart_data" in chunk
            assert "chart_metadata" in chunk
            # Verify chart data is preserved
            chart_data = chunk["chart_data"]
            assert "labels" in chart_data
            assert "values" in chart_data
            assert len(chart_data["labels"]) == 4
            assert len(chart_data["values"]) == 4
            # Verify text description is meaningful
            text = chunk["text"]
            assert "chart" in text.lower(), "Should mention chart in description"
            assert "line" in text.lower(), "Should mention chart type"
            assert "Monthly Growth" in text, "Should include chart title"
            assert "Jan" in text or "Feb" in text, "Should mention some labels"

    @pytest.mark.asyncio
    async def test_chunk_statistics_accuracy(self, chunking_service, sample_document_content):
        """Test that chunk statistics are calculated correctly."""
        document_id = "test-doc-123"
        chunks = await chunking_service.chunk_document_content(document_id, sample_document_content)
        stats = await chunking_service.get_chunk_statistics(chunks)
        # Verify all required fields
        assert "total_chunks" in stats
        assert "total_tokens" in stats
        assert "average_tokens_per_chunk" in stats
        assert "chunk_types" in stats
        assert "chunking_parameters" in stats
        # Verify calculations are correct: the reported total must equal the
        # sum of the three per-type chunk lists.
        expected_total = len(chunks["text_chunks"]) + len(chunks["table_chunks"]) + len(chunks["chart_chunks"])
        assert stats["total_chunks"] == expected_total, "Total chunks count mismatch"
        # Verify token counts are reasonable
        assert stats["total_tokens"] > 0, "Total tokens should be positive"
        assert stats["average_tokens_per_chunk"] > 0, "Average tokens should be positive"
        # Verify chunk type breakdown
        assert "text" in stats["chunk_types"]
        assert "table" in stats["chunk_types"]
        assert "chart" in stats["chunk_types"]
        assert stats["chunk_types"]["text"] == len(chunks["text_chunks"])
        assert stats["chunk_types"]["table"] == len(chunks["table_chunks"])
        assert stats["chunk_types"]["chart"] == len(chunks["chart_chunks"])

    @pytest.mark.asyncio
    async def test_chunking_with_empty_content(self, chunking_service):
        """Test chunking behavior with empty or minimal content."""
        document_id = "test-doc-123"
        # Test with minimal text
        minimal_content = {
            "text_content": [{"text": "Short text.", "page_number": 1}],
            "tables": [],
            "charts": []
        }
        chunks = await chunking_service.chunk_document_content(document_id, minimal_content)
        # Should still create structure even with minimal content
        assert "text_chunks" in chunks
        assert "table_chunks" in chunks
        assert "chart_chunks" in chunks
        assert "metadata" in chunks
        # Should have at least one text chunk even for short text
        assert len(chunks["text_chunks"]) >= 1
        # Test with completely empty content
        empty_content = {
            "text_content": [],
            "tables": [],
            "charts": []
        }
        chunks = await chunking_service.chunk_document_content(document_id, empty_content)
        # Should handle empty content gracefully: structure present, all lists empty.
        assert len(chunks["text_chunks"]) == 0
        assert len(chunks["table_chunks"]) == 0
        assert len(chunks["chart_chunks"]) == 0
class TestVectorService:
    """Test cases for vector service functionality with real validation.

    External dependencies (vector DB client, embedding model) are replaced via
    ``unittest.mock.patch.object`` so these tests exercise VectorService's own
    orchestration logic without network access.

    Note on mocking: ``patch.object(vector_service, 'client')`` swaps the
    ``client`` *attribute* for the mock itself — the service calls methods on
    that object directly, so returns are configured on ``mock_client``, never
    on ``mock_client.return_value`` (the original tests wired the wrong object
    and compensated with manual reassignment).
    """

    # A repeating pattern sliced to exactly 1024 floats.  The original tests
    # used ``[0.1, 0.2, 0.3] * 341`` which is 1023 floats while claiming 1024.
    _FAKE_DIM = 1024

    @staticmethod
    def _fake_vector(a=0.1, b=0.2, c=0.3):
        """Build a deterministic fake embedding of exactly 1024 dimensions."""
        return ([a, b, c] * 342)[:TestVectorService._FAKE_DIM]

    @pytest.fixture
    def mock_tenant(self):
        """Create a mock tenant for testing."""
        tenant = Mock(spec=Tenant)
        tenant.id = "test-tenant-123"
        tenant.name = "Test Tenant"
        return tenant

    @pytest.fixture
    def vector_service(self):
        """Create a vector service instance."""
        return VectorService()

    @pytest.fixture
    def sample_chunks(self):
        """Sample chunks for testing: one chunk of each supported type."""
        return {
            "text_chunks": [
                {
                    "id": "doc123_text_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "text",
                    "chunk_index": 0,
                    "text": "This is a sample text chunk for testing vector operations.",
                    "token_count": 12,
                    "page_numbers": [1],
                    "metadata": {
                        "content_type": "text",
                        "created_at": "2024-01-01T00:00:00Z"
                    }
                }
            ],
            "table_chunks": [
                {
                    "id": "doc123_table_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "table",
                    "chunk_index": 0,
                    "text": "Table with 3 rows and 3 columns. Columns: Product, Sales, Revenue",
                    "token_count": 15,
                    "page_numbers": [1],
                    "table_data": [["Product", "Sales"], ["A", "100"]],
                    "table_metadata": {"page_number": 1},
                    "metadata": {
                        "content_type": "table",
                        "created_at": "2024-01-01T00:00:00Z"
                    }
                }
            ],
            "chart_chunks": [
                {
                    "id": "doc123_chart_0",
                    "document_id": "doc123",
                    "tenant_id": "test-tenant-123",
                    "chunk_type": "chart",
                    "chunk_index": 0,
                    "text": "Chart (bar): Monthly Revenue. Shows Jan, Feb, Mar with values 100, 120, 140",
                    "token_count": 20,
                    "page_numbers": [1],
                    "chart_data": {"labels": ["Jan", "Feb"], "values": [100, 120]},
                    "chart_metadata": {"chart_type": "bar"},
                    "metadata": {
                        "content_type": "chart",
                        "created_at": "2024-01-01T00:00:00Z"
                    }
                }
            ]
        }

    @pytest.mark.asyncio
    async def test_embedding_generation_quality(self, vector_service):
        """Test that embedding generation produces meaningful vectors.

        NOTE: this test calls the real embedding path (no mocking), so it
        requires the embedding model to be available in the test environment.
        """
        test_texts = [
            "This is a test text for embedding generation.",
            "This is a different test text with different content.",
            "This is a third test text that should produce different embeddings."
        ]
        embeddings = []
        for text in test_texts:
            embedding = await vector_service.generate_embedding(text)
            assert embedding is not None, f"Embedding should not be None for: {text}"
            # Service may use the full-size (1024) or fallback (384) model.
            assert len(embedding) in [1024, 384], f"Embedding dimension should be 1024 or 384, got {len(embedding)}"
            assert all(isinstance(x, float) for x in embedding), "All embedding values should be floats"
            embeddings.append(embedding)
        # Test that different texts produce different embeddings
        # (This is a basic test - in practice, embeddings should be semantically different)
        assert embeddings[0] != embeddings[1], "Different texts should produce different embeddings"
        assert embeddings[1] != embeddings[2], "Different texts should produce different embeddings"

    @pytest.mark.asyncio
    async def test_batch_embedding_consistency(self, vector_service):
        """Test that batch embeddings are consistent with individual embeddings."""
        texts = [
            "First test text for batch embedding.",
            "Second test text for batch embedding.",
            "Third test text for batch embedding."
        ]
        # Generate individual embeddings
        individual_embeddings = []
        for text in texts:
            embedding = await vector_service.generate_embedding(text)
            individual_embeddings.append(embedding)
        # Generate batch embeddings
        batch_embeddings = await vector_service.generate_batch_embeddings(texts)
        assert len(batch_embeddings) == len(texts), "Batch should return same number of embeddings"
        # Verify each embedding has correct dimension
        for i, embedding in enumerate(batch_embeddings):
            assert embedding is not None, f"Batch embedding {i} should not be None"
            assert len(embedding) in [1024, 384], f"Batch embedding {i} wrong dimension"
            assert all(isinstance(x, float) for x in embedding), f"Batch embedding {i} should contain floats"

    @pytest.mark.asyncio
    async def test_add_document_vectors_data_integrity(self, vector_service, sample_chunks):
        """Test that adding document vectors preserves data integrity."""
        tenant_id = "test-tenant-123"
        document_id = "doc123"
        # Mock the client attribute and the embedding generation coroutine.
        with patch.object(vector_service, 'client') as mock_client, \
                patch.object(vector_service, 'generate_batch_embeddings', new_callable=AsyncMock) as mock_embeddings:
            # One fake 1024-dim vector per chunk (text + table + chart).
            mock_embeddings.return_value = [
                self._fake_vector(0.1, 0.2, 0.3),
                self._fake_vector(0.4, 0.5, 0.6),
                self._fake_vector(0.7, 0.8, 0.9),
            ]
            success = await vector_service.add_document_vectors(tenant_id, document_id, sample_chunks)
            assert success is True, "Should return True on success"
            # Verify that the correct number of embeddings were requested
            # (one for each chunk, in a single batch call)
            total_chunks = len(sample_chunks["text_chunks"]) + len(sample_chunks["table_chunks"]) + len(sample_chunks["chart_chunks"])
            assert mock_embeddings.call_count == 1, "Should call batch embeddings once"
            # Verify the call arguments
            call_args = mock_embeddings.call_args[0][0]  # First positional argument (texts)
            assert len(call_args) == total_chunks, "Should request embeddings for all chunks"

    @pytest.mark.asyncio
    async def test_search_similar_result_quality(self, vector_service):
        """Test that search returns meaningful results with proper structure."""
        tenant_id = "test-tenant-123"
        query = "test query for search"
        # Mock the client attribute and the query-embedding coroutine.
        with patch.object(vector_service, 'client') as mock_client, \
                patch.object(vector_service, 'generate_embedding', new_callable=AsyncMock) as mock_embedding:
            mock_embedding.return_value = self._fake_vector()
            # Mock search results with realistic payloads.
            mock_search_result = [
                Mock(
                    id="result1",
                    score=0.85,
                    payload={
                        "text": "This is a search result that matches the query",
                        "document_id": "doc123",
                        "chunk_type": "text",
                        "token_count": 10,
                        "page_numbers": [1],
                        "metadata": {"content_type": "text"}
                    }
                ),
                Mock(
                    id="result2",
                    score=0.75,
                    payload={
                        "text": "Another search result with lower relevance",
                        "document_id": "doc124",
                        "chunk_type": "table",
                        "token_count": 15,
                        "page_numbers": [2],
                        "metadata": {"content_type": "table"}
                    }
                )
            ]
            # Configure search on the patched client itself — no reassignment
            # of ``vector_service.client`` needed.
            mock_client.search.return_value = mock_search_result
            # Mock the collection name generation
            with patch.object(vector_service, '_get_collection_name', return_value="test_collection"):
                results = await vector_service.search_similar(tenant_id, query, limit=5)
                assert len(results) == 2, "Should return all search results"
                # Verify result structure and content
                for i, result in enumerate(results):
                    assert "id" in result, f"Result {i} missing id"
                    assert "score" in result, f"Result {i} missing score"
                    assert "text" in result, f"Result {i} missing text"
                    assert "document_id" in result, f"Result {i} missing document_id"
                    assert "chunk_type" in result, f"Result {i} missing chunk_type"
                    assert "token_count" in result, f"Result {i} missing token_count"
                    assert "page_numbers" in result, f"Result {i} missing page_numbers"
                    assert "metadata" in result, f"Result {i} missing metadata"
                    # Verify score is reasonable
                    assert 0 <= result["score"] <= 1, f"Result {i} score should be between 0 and 1"
                    # Verify text is meaningful
                    assert len(result["text"]) > 0, f"Result {i} text should not be empty"
                    # Verify chunk type is valid
                    assert result["chunk_type"] in ["text", "table", "chart"], f"Result {i} invalid chunk type"
                # Verify results are sorted by score (descending)
                scores = [result["score"] for result in results]
                assert scores == sorted(scores, reverse=True), "Results should be sorted by score"

    @pytest.mark.asyncio
    async def test_search_structured_data_filtering(self, vector_service):
        """Test that structured data search properly filters by data type."""
        tenant_id = "test-tenant-123"
        query = "table data query"
        data_type = "table"
        # Mock the search_similar method to verify it's called with correct filters
        with patch.object(vector_service, 'search_similar', new_callable=AsyncMock) as mock_search:
            mock_search.return_value = [
                {
                    "id": "table_result",
                    "score": 0.9,
                    "text": "Table with sales data",
                    "document_id": "doc123",
                    "chunk_type": "table"
                }
            ]
            results = await vector_service.search_structured_data(tenant_id, query, data_type)
            assert len(results) > 0, "Should return results"
            assert results[0]["chunk_type"] == "table", "Should only return table results"
            # Verify search_similar was called with correct chunk_types filter
            mock_search.assert_called_once()
            call_kwargs = mock_search.call_args[1]  # Keyword arguments
            assert "chunk_types" in call_kwargs, "Should pass chunk_types filter"
            assert call_kwargs["chunk_types"] == ["table"], "Should filter for table chunks only"

    @pytest.mark.asyncio
    async def test_hybrid_search_combination_logic(self, vector_service):
        """Test that hybrid search properly combines semantic and keyword results."""
        tenant_id = "test-tenant-123"
        query = "hybrid search query"
        # Mock the three collaborators: semantic search, keyword search, combiner.
        with patch.object(vector_service, 'search_similar', new_callable=AsyncMock) as mock_semantic, \
                patch.object(vector_service, '_keyword_search', new_callable=AsyncMock) as mock_keyword, \
                patch.object(vector_service, '_combine_search_results', new_callable=AsyncMock) as mock_combine:
            mock_semantic.return_value = [
                {"id": "semantic1", "score": 0.8, "text": "Semantic result"}
            ]
            mock_keyword.return_value = [
                {"id": "keyword1", "score": 0.7, "text": "Keyword result"}
            ]
            mock_combine.return_value = [
                {"id": "combined1", "score": 0.75, "text": "Combined result"}
            ]
            results = await vector_service.hybrid_search(tenant_id, query, limit=5)
            assert len(results) > 0, "Should return combined results"
            assert mock_semantic.called, "Should call semantic search"
            assert mock_keyword.called, "Should call keyword search"
            assert mock_combine.called, "Should call result combination"
            # Verify the combination was called with correct parameters:
            # (semantic_results, keyword_results, semantic_weight, keyword_weight)
            combine_call_args = mock_combine.call_args[0]
            assert len(combine_call_args) == 4, "Should pass 4 arguments to combine"
            assert combine_call_args[0] == mock_semantic.return_value, "Should pass semantic results"
            assert combine_call_args[1] == mock_keyword.return_value, "Should pass keyword results"
            assert combine_call_args[2] == 0.7, "Should pass semantic weight"
            assert combine_call_args[3] == 0.3, "Should pass keyword weight"

    @pytest.mark.asyncio
    async def test_performance_metrics_accuracy(self, vector_service):
        """Test that performance metrics are calculated correctly."""
        tenant_id = "test-tenant-123"
        # Mock the client attribute with realistic collection data.
        with patch.object(vector_service, 'client') as mock_client:
            # Mock collection info
            mock_info = Mock()
            mock_info.segments_count = 4
            mock_info.status = "green"
            mock_info.config.params.vectors.size = 1024
            mock_info.config.params.vectors.distance = "cosine"
            # Mock count
            mock_count = Mock()
            mock_count.count = 1000
            # Wire returns on ``mock_client`` itself — the service calls
            # methods on the client object, it never *calls* the client.
            # (The original configured ``mock_client.return_value``, so this
            # setup was dead and the metrics came from auto-created Mocks.)
            mock_client.get_collection.return_value = mock_info
            mock_client.count.return_value = mock_count
            metrics = await vector_service.get_performance_metrics(tenant_id)
            # Verify all required fields
            assert "tenant_id" in metrics
            assert "timestamp" in metrics
            assert "collections" in metrics
            assert "embedding_model" in metrics
            assert "embedding_dimension" in metrics
            # Verify values are correct
            assert metrics["tenant_id"] == tenant_id
            assert metrics["embedding_model"] == settings.EMBEDDING_MODEL
            assert metrics["embedding_dimension"] == settings.EMBEDDING_DIMENSION
            # Verify collections data
            collections = metrics["collections"]
            assert "documents" in collections
            assert "tables" in collections
            assert "charts" in collections

    @pytest.mark.asyncio
    async def test_health_check_comprehensive(self, vector_service):
        """Test that health check validates all critical components."""
        # Mock the embedding coroutine; the client is replaced by hand below.
        with patch.object(vector_service, 'generate_embedding', new_callable=AsyncMock) as mock_embedding:
            # Create a mock client
            mock_client_instance = Mock()
            mock_client_instance.get_collections.return_value = Mock()
            vector_service.client = mock_client_instance
            mock_embedding.return_value = self._fake_vector()
            is_healthy = await vector_service.health_check()
            assert is_healthy is True, "Should return True when all components are healthy"
            # Verify that all health checks were performed exactly once each.
            mock_client_instance.get_collections.assert_called_once()
            mock_embedding.assert_called_once()
class TestIntegration:
    """Integration tests for Week 3 functionality with real end-to-end validation.

    Chunking runs for real; the vector store and embedding model are mocked so
    the pipeline can be exercised without external services.
    """

    @pytest.fixture
    def mock_tenant(self):
        """Create a mock tenant for testing."""
        tenant = Mock(spec=Tenant)
        tenant.id = "test-tenant-123"
        tenant.name = "Test Tenant"
        return tenant

    @pytest.mark.asyncio
    async def test_end_to_end_document_processing_pipeline(self, mock_tenant):
        """Test the complete document processing pipeline from chunking to vector indexing."""
        chunking_service = DocumentChunkingService(mock_tenant)
        vector_service = VectorService()
        # Create realistic document content (long enough to force multiple text chunks).
        content = {
            "text_content": [
                {
                    "text": "This is a comprehensive document for testing the complete pipeline. " * 50,
                    "page_number": 1
                }
            ],
            "tables": [
                {
                    "data": [
                        ["Metric", "Value", "Change"],
                        ["Revenue", "$1M", "+15%"],
                        ["Users", "10K", "+25%"]
                    ],
                    "metadata": {
                        "page_number": 1,
                        "title": "Performance Metrics"
                    }
                }
            ],
            "charts": [
                {
                    "data": {
                        "labels": ["Jan", "Feb", "Mar"],
                        "values": [100, 120, 140]
                    },
                    "metadata": {
                        "page_number": 1,
                        "chart_type": "line",
                        "title": "Growth Trend"
                    }
                }
            ]
        }
        # Test chunking
        chunks = await chunking_service.chunk_document_content("test-doc", content)
        assert "text_chunks" in chunks, "Should have text chunks"
        assert "table_chunks" in chunks, "Should have table chunks"
        assert "chart_chunks" in chunks, "Should have chart chunks"
        assert len(chunks["text_chunks"]) > 0, "Should create text chunks"
        assert len(chunks["table_chunks"]) > 0, "Should create table chunks"
        assert len(chunks["chart_chunks"]) > 0, "Should create chart chunks"
        # Test statistics
        stats = await chunking_service.get_chunk_statistics(chunks)
        assert stats["total_chunks"] > 0, "Should have total chunks"
        assert stats["total_tokens"] > 0, "Should have total tokens"
        # Test vector service integration (with mocking).  patch.object swaps
        # the ``client`` attribute for the mock itself, so no further setup is
        # needed — the original's ``mock_client.return_value = Mock()`` was dead.
        with patch.object(vector_service, 'client') as mock_client, \
                patch.object(vector_service, 'generate_batch_embeddings', new_callable=AsyncMock) as mock_embeddings:
            total_chunks = len(chunks["text_chunks"]) + len(chunks["table_chunks"]) + len(chunks["chart_chunks"])
            # One fake embedding per chunk, sliced to exactly 1024 floats
            # (the original ``[0.1, 0.2, 0.3] * 341`` was 1023 dims).
            mock_embeddings.return_value = [([0.1, 0.2, 0.3] * 342)[:1024]] * total_chunks
            success = await vector_service.add_document_vectors(
                str(mock_tenant.id), "test-doc", chunks
            )
            assert success is True, "Vector indexing should succeed"
            assert mock_embeddings.called, "Should generate embeddings for all chunks"

    @pytest.mark.asyncio
    async def test_error_handling_and_edge_cases(self, mock_tenant):
        """Test error handling and edge cases in the pipeline."""
        chunking_service = DocumentChunkingService(mock_tenant)
        vector_service = VectorService()
        # Test with malformed content
        malformed_content = {
            "text_content": [{"text": "", "page_number": 1}],  # Empty text
            "tables": [{"data": [], "metadata": {}}],  # Empty table
            "charts": [{"data": {}, "metadata": {}}]  # Empty chart
        }
        # Should handle gracefully
        chunks = await chunking_service.chunk_document_content("test-doc", malformed_content)
        assert "text_chunks" in chunks, "Should handle empty text"
        assert "table_chunks" in chunks, "Should handle empty tables"
        assert "chart_chunks" in chunks, "Should handle empty charts"
        # Test vector service with invalid data
        vector_service.client = None  # Simulate connection failure
        success = await vector_service.add_document_vectors(
            str(mock_tenant.id), "test-doc", chunks
        )
        assert success is False, "Should return False on connection failure"
if __name__ == "__main__":
    # Allow running this file directly.  pytest.main() RETURNS an exit code
    # (it does not raise), so propagate it via SystemExit — otherwise a direct
    # run always exits 0 even when tests fail, hiding failures from shells/CI.
    raise SystemExit(pytest.main([__file__, "-v"]))