mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-19 18:58:47 +08:00
Major improvements to smart-search, chain-search cascade, ranking pipeline, reranker factory, CLI history store, codex-lens integration, and uv-manager. Simplify command-generator skill by inlining phases. Add comprehensive tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
761 lines
26 KiB
Python
761 lines
26 KiB
Python
"""Tests for ANN (Approximate Nearest Neighbor) index using HNSW."""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
# Skip all tests if semantic dependencies not available
|
|
pytest.importorskip("numpy")
|
|
|
|
|
|
def _hnswlib_available() -> bool:
|
|
"""Check if hnswlib is available."""
|
|
try:
|
|
import hnswlib
|
|
return True
|
|
except ImportError:
|
|
return False
|
|
|
|
|
|
class TestANNIndex:
    """Test suite for ANNIndex class (dense-vector HNSW index)."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database file path (directory auto-removed)."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.fixture
    def sample_vectors(self):
        """Generate sample vectors for testing (seeded for reproducibility)."""
        import numpy as np
        np.random.seed(42)
        # 100 vectors of dimension 384 (matches fast model)
        return np.random.randn(100, 384).astype(np.float32)

    @pytest.fixture
    def sample_ids(self):
        """Generate sample IDs: 1..100, aligned with sample_vectors rows."""
        return list(range(1, 101))

    def test_import_check(self):
        """Test that HNSWLIB_AVAILABLE flag is set correctly."""
        try:
            from codexlens.semantic.ann_index import HNSWLIB_AVAILABLE
            # Should be True if hnswlib is installed, False otherwise
            assert isinstance(HNSWLIB_AVAILABLE, bool)
        except ImportError:
            pytest.skip("ann_index module not available")

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_create_index(self, temp_db):
        """Test creating a new ANN index."""
        from codexlens.semantic.ann_index import ANNIndex

        # A freshly constructed index is empty and not yet loaded.
        index = ANNIndex(temp_db, dim=384)
        assert index.dim == 384
        assert index.count() == 0
        assert not index.is_loaded

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_add_vectors(self, temp_db, sample_vectors, sample_ids):
        """Test adding vectors to the index."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        index.add_vectors(sample_ids, sample_vectors)

        # Adding also marks the index as loaded.
        assert index.count() == 100
        assert index.is_loaded

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_search(self, temp_db, sample_vectors, sample_ids):
        """Test searching for similar vectors."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        index.add_vectors(sample_ids, sample_vectors)

        # Search for the first vector - should find itself
        query = sample_vectors[0]
        ids, distances = index.search(query, top_k=5)

        assert len(ids) == 5
        assert len(distances) == 5
        # First result should be the query vector itself (or very close)
        assert ids[0] == 1  # ID of first vector
        assert distances[0] < 0.01  # Very small distance (almost identical)

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_search_clamps_top_k_to_available_vectors(self, temp_db, sample_vectors, sample_ids):
        """Search should clamp top_k to the loaded vector count."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        # Only 3 vectors loaded, so top_k=10 must be clamped to 3 results.
        index.add_vectors(sample_ids[:3], sample_vectors[:3])

        ids, distances = index.search(sample_vectors[0], top_k=10)

        assert len(ids) == 3
        assert len(distances) == 3
        assert ids[0] == 1

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_save_and_load(self, temp_db, sample_vectors, sample_ids):
        """Test saving and loading index from disk."""
        from codexlens.semantic.ann_index import ANNIndex

        # Create and save index
        index1 = ANNIndex(temp_db, dim=384)
        index1.add_vectors(sample_ids, sample_vectors)
        index1.save()

        # Check that file was created (new naming: {db_stem}_vectors.hnsw)
        hnsw_path = temp_db.parent / f"{temp_db.stem}_vectors.hnsw"
        assert hnsw_path.exists()

        # Load in new instance
        index2 = ANNIndex(temp_db, dim=384)
        loaded = index2.load()

        assert loaded is True
        assert index2.count() == 100
        assert index2.is_loaded

        # Verify search still works
        query = sample_vectors[0]
        ids, distances = index2.search(query, top_k=5)
        assert ids[0] == 1

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_load_nonexistent(self, temp_db):
        """Test loading when index file doesn't exist."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        loaded = index.load()

        # load() reports failure instead of raising when no file exists.
        assert loaded is False
        assert not index.is_loaded

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_remove_vectors(self, temp_db, sample_vectors, sample_ids):
        """Test removing vectors from the index."""
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        index.add_vectors(sample_ids, sample_vectors)

        # Remove first 10 vectors
        index.remove_vectors(list(range(1, 11)))

        # Search for removed vector - should not be in results
        query = sample_vectors[0]
        ids, distances = index.search(query, top_k=5)

        # ID 1 should not be in results (soft deleted)
        assert 1 not in ids

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_incremental_add(self, temp_db):
        """Test adding vectors incrementally."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)

        # Add first batch
        vectors1 = np.random.randn(50, 384).astype(np.float32)
        index.add_vectors(list(range(1, 51)), vectors1)
        assert index.count() == 50

        # Add second batch; IDs are disjoint so count accumulates.
        vectors2 = np.random.randn(50, 384).astype(np.float32)
        index.add_vectors(list(range(51, 101)), vectors2)
        assert index.count() == 100

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_search_empty_index(self, temp_db):
        """Test searching an empty index."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        query = np.random.randn(384).astype(np.float32)

        ids, distances = index.search(query, top_k=5)

        # Empty index yields empty result lists, not an error.
        assert ids == []
        assert distances == []

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_invalid_dimension(self, temp_db, sample_vectors, sample_ids):
        """Test adding vectors with wrong dimension."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)

        # Try to add vectors with wrong dimension (768 != 384)
        wrong_vectors = np.random.randn(10, 768).astype(np.float32)
        with pytest.raises(ValueError, match="dimension"):
            index.add_vectors(list(range(1, 11)), wrong_vectors)

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_auto_resize(self, temp_db):
        """Test that index automatically resizes when capacity is exceeded."""
        import numpy as np
        from codexlens.semantic.ann_index import ANNIndex

        index = ANNIndex(temp_db, dim=384)
        # Override initial capacity to test resize
        # NOTE(review): reaches into the private _max_elements attribute;
        # brittle if ANNIndex internals change.
        index._max_elements = 100

        # Add more vectors than initial capacity
        vectors = np.random.randn(150, 384).astype(np.float32)
        index.add_vectors(list(range(1, 151)), vectors)

        assert index.count() == 150
        assert index._max_elements >= 150
|
|
|
|
|
|
class TestVectorStoreWithANN:
    """Test VectorStore integration with ANN index."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database file.

        ignore_cleanup_errors=True tolerates files still held open on
        teardown (e.g. SQLite handles on Windows).
        """
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.fixture
    def sample_chunks(self):
        """Create sample semantic chunks with embeddings (seeded RNG)."""
        import numpy as np
        from codexlens.entities import SemanticChunk

        np.random.seed(42)
        chunks = []
        for i in range(10):
            chunk = SemanticChunk(
                content=f"def function_{i}(): pass",
                metadata={"symbol_name": f"function_{i}", "symbol_kind": "function"},
            )
            # Embeddings stored as plain lists, matching SemanticChunk's format.
            chunk.embedding = np.random.randn(384).astype(np.float32).tolist()
            chunks.append(chunk)
        return chunks

    def test_vector_store_with_ann(self, temp_db, sample_chunks):
        """Test VectorStore using ANN index for search."""
        from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE

        store = VectorStore(temp_db)

        # Add chunks
        ids = store.add_chunks(sample_chunks, "test.py")
        assert len(ids) == 10

        # Check ANN status
        if HNSWLIB_AVAILABLE:
            # NOTE(review): `ann_count >= 0` makes this disjunction always
            # true when ann_count is non-negative -- a weak assertion.
            assert store.ann_available or store.ann_count >= 0

        # Search
        query_embedding = sample_chunks[0].embedding
        results = store.search_similar(query_embedding, top_k=5)

        assert len(results) <= 5
        if results:
            # First result should have high similarity (query is in the store)
            assert results[0].score > 0.9

    def test_vector_store_rebuild_ann(self, temp_db, sample_chunks):
        """Test rebuilding ANN index from SQLite data."""
        from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE

        if not HNSWLIB_AVAILABLE:
            pytest.skip("hnswlib not installed")

        store = VectorStore(temp_db)

        # Add chunks
        store.add_chunks(sample_chunks, "test.py")

        # Rebuild ANN index; returns the number of vectors indexed.
        count = store.rebuild_ann_index()
        assert count == 10

        # Verify search works
        query_embedding = sample_chunks[0].embedding
        results = store.search_similar(query_embedding, top_k=5)
        assert len(results) > 0

    def test_vector_store_delete_updates_ann(self, temp_db, sample_chunks):
        """Test that deleting chunks updates ANN index."""
        from codexlens.semantic.vector_store import VectorStore, HNSWLIB_AVAILABLE

        if not HNSWLIB_AVAILABLE:
            pytest.skip("hnswlib not installed")

        store = VectorStore(temp_db)

        # Add chunks for two files
        store.add_chunks(sample_chunks[:5], "file1.py")
        store.add_chunks(sample_chunks[5:], "file2.py")

        initial_count = store.count_chunks()
        assert initial_count == 10

        # Delete one file's chunks
        deleted = store.delete_file_chunks("file1.py")
        assert deleted == 5

        # Verify count
        assert store.count_chunks() == 5

    def test_vector_store_batch_add(self, temp_db, sample_chunks):
        """Test batch adding chunks from multiple files."""
        from codexlens.semantic.vector_store import VectorStore

        store = VectorStore(temp_db)

        # Prepare chunks with paths: distribute across 3 file names.
        chunks_with_paths = [
            (chunk, f"file{i % 3}.py")
            for i, chunk in enumerate(sample_chunks)
        ]

        # Batch add
        ids = store.add_chunks_batch(chunks_with_paths)
        assert len(ids) == 10

        # Verify
        assert store.count_chunks() == 10

    def test_vector_store_fallback_search(self, temp_db, sample_chunks):
        """Test that search falls back to brute-force when ANN unavailable."""
        from codexlens.semantic.vector_store import VectorStore

        store = VectorStore(temp_db)
        store.add_chunks(sample_chunks, "test.py")

        # Force disable ANN by clearing the private index reference.
        store._ann_index = None

        # Search should still work (brute-force fallback)
        query_embedding = sample_chunks[0].embedding
        results = store.search_similar(query_embedding, top_k=5)

        assert len(results) > 0
        assert results[0].score > 0.9
|
|
|
|
|
|
class TestSearchAccuracy:
    """Test search accuracy comparing ANN vs brute-force."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database file (cleanup errors ignored)."""
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_ann_vs_brute_force_recall(self, temp_db):
        """Test that ANN search has high recall compared to brute-force."""
        import numpy as np
        from codexlens.entities import SemanticChunk
        from codexlens.semantic.vector_store import VectorStore

        np.random.seed(42)

        # Create larger dataset (100 random chunks) so recall is meaningful.
        chunks = []
        for i in range(100):
            chunk = SemanticChunk(
                content=f"code block {i}",
                metadata={"chunk_id": i},
            )
            chunk.embedding = np.random.randn(384).astype(np.float32).tolist()
            chunks.append(chunk)

        store = VectorStore(temp_db)
        store.add_chunks(chunks, "test.py")

        # Get brute-force results
        store._ann_index = None  # Force brute-force
        store._invalidate_cache()  # Clear cache to force refresh
        query = chunks[0].embedding
        bf_results = store.search_similar(query, top_k=10)
        # Use chunk_id from metadata for comparison (more reliable than path+score)
        bf_chunk_ids = {r.metadata.get("chunk_id") for r in bf_results}

        # Rebuild ANN and get ANN results
        store.rebuild_ann_index()
        ann_results = store.search_similar(query, top_k=10)
        ann_chunk_ids = {r.metadata.get("chunk_id") for r in ann_results}

        # Calculate recall (how many brute-force results are in ANN results)
        # ANN should find at least 80% of the same results
        overlap = len(bf_chunk_ids & ann_chunk_ids)
        recall = overlap / len(bf_chunk_ids) if bf_chunk_ids else 1.0

        assert recall >= 0.8, f"ANN recall too low: {recall} (overlap: {overlap}, bf: {bf_chunk_ids}, ann: {ann_chunk_ids})"
|
|
|
|
|
|
|
|
class TestBinaryANNIndex:
    """Test suite for BinaryANNIndex class (Hamming distance-based search)."""

    @pytest.fixture
    def temp_db(self):
        """Create a temporary database file."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.fixture
    def sample_binary_vectors(self):
        """Generate sample binary vectors for testing.

        Returns a tuple (packed, unpacked): packed is a list of 32-byte
        strings (np.packbits output), unpacked the raw 0/1 uint8 matrix.
        """
        import numpy as np
        np.random.seed(42)
        # 100 binary vectors of dimension 256 (packed as 32 bytes each)
        binary_unpacked = (np.random.rand(100, 256) > 0.5).astype(np.uint8)
        packed = [np.packbits(v).tobytes() for v in binary_unpacked]
        return packed, binary_unpacked

    @pytest.fixture
    def sample_ids(self):
        """Generate sample IDs: 1..100, aligned with vector rows."""
        return list(range(1, 101))

    def test_create_binary_index(self, temp_db):
        """Test creating a new Binary ANN index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        index = BinaryANNIndex(temp_db, dim=256)
        assert index.dim == 256
        # 256 bits pack into 32 bytes.
        assert index.packed_dim == 32
        assert index.count() == 0
        assert not index.is_loaded

    def test_invalid_dimension(self, temp_db):
        """Test that invalid dimensions are rejected."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        # Dimension must be divisible by 8 (bit-packing requirement)
        with pytest.raises(ValueError, match="divisible by 8"):
            BinaryANNIndex(temp_db, dim=255)

        with pytest.raises(ValueError, match="positive"):
            BinaryANNIndex(temp_db, dim=0)

    def test_add_packed_vectors(self, temp_db, sample_binary_vectors, sample_ids):
        """Test adding packed binary vectors to the index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        assert index.count() == 100
        assert index.is_loaded

    def test_add_numpy_vectors(self, temp_db, sample_binary_vectors, sample_ids):
        """Test adding unpacked numpy binary vectors."""
        from codexlens.semantic.ann_index import BinaryANNIndex
        import numpy as np

        _, unpacked = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors_numpy(sample_ids, unpacked)

        assert index.count() == 100

    def test_search_packed(self, temp_db, sample_binary_vectors, sample_ids):
        """Test searching with packed binary query."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Search for the first vector - should find itself with distance 0
        query = packed[0]
        ids, distances = index.search(query, top_k=5)

        assert len(ids) == 5
        assert len(distances) == 5
        # First result should be the query vector itself
        assert ids[0] == 1
        assert distances[0] == 0  # Hamming distance of 0 (identical)

    def test_search_numpy(self, temp_db, sample_binary_vectors, sample_ids):
        """Test searching with unpacked numpy query."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, unpacked = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Search for the first vector using numpy interface
        query = unpacked[0]
        ids, distances = index.search_numpy(query, top_k=5)

        assert len(ids) == 5
        assert ids[0] == 1
        assert distances[0] == 0

    def test_search_batch(self, temp_db, sample_binary_vectors, sample_ids):
        """Test batch search with multiple queries."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Search for first 3 vectors
        queries = packed[:3]
        results = index.search_batch(queries, top_k=5)

        assert len(results) == 3
        # Each result should find itself first (query i matches ID i+1)
        for i, (ids, dists) in enumerate(results):
            assert ids[0] == i + 1
            assert dists[0] == 0

    def test_hamming_distance_ordering(self, temp_db):
        """Test that results are ordered by Hamming distance."""
        from codexlens.semantic.ann_index import BinaryANNIndex
        import numpy as np

        index = BinaryANNIndex(temp_db, dim=256)

        # Create vectors with known Hamming distances from a query
        query = np.zeros(256, dtype=np.uint8)  # All zeros
        v1 = np.zeros(256, dtype=np.uint8)  # Distance 0
        v2 = np.zeros(256, dtype=np.uint8); v2[:10] = 1  # Distance 10
        v3 = np.zeros(256, dtype=np.uint8); v3[:50] = 1  # Distance 50
        v4 = np.ones(256, dtype=np.uint8)  # Distance 256

        index.add_vectors_numpy([1, 2, 3, 4], np.array([v1, v2, v3, v4]))

        query_packed = np.packbits(query).tobytes()
        ids, distances = index.search(query_packed, top_k=4)

        # Exact ordering and exact distances are deterministic here.
        assert ids == [1, 2, 3, 4]
        assert distances == [0, 10, 50, 256]

    def test_save_and_load(self, temp_db, sample_binary_vectors, sample_ids):
        """Test saving and loading binary index from disk."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors

        # Create and save index
        index1 = BinaryANNIndex(temp_db, dim=256)
        index1.add_vectors(sample_ids, packed)
        index1.save()

        # Check that file was created (naming: {db_stem}_binary_vectors.bin)
        binary_path = temp_db.parent / f"{temp_db.stem}_binary_vectors.bin"
        assert binary_path.exists()

        # Load in new instance
        index2 = BinaryANNIndex(temp_db, dim=256)
        loaded = index2.load()

        assert loaded is True
        assert index2.count() == 100
        assert index2.is_loaded

        # Verify search still works
        query = packed[0]
        ids, distances = index2.search(query, top_k=5)
        assert ids[0] == 1
        assert distances[0] == 0

    def test_load_nonexistent(self, temp_db):
        """Test loading when index file doesn't exist."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        index = BinaryANNIndex(temp_db, dim=256)
        loaded = index.load()

        # load() reports failure instead of raising when no file exists.
        assert loaded is False
        assert not index.is_loaded

    def test_remove_vectors(self, temp_db, sample_binary_vectors, sample_ids):
        """Test removing vectors from the index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Remove first 10 vectors
        index.remove_vectors(list(range(1, 11)))

        # Unlike ANNIndex's soft delete, count() drops here.
        assert index.count() == 90

        # Removed vectors should not be findable
        query = packed[0]
        ids, _ = index.search(query, top_k=100)
        for removed_id in range(1, 11):
            assert removed_id not in ids

    def test_get_vector(self, temp_db, sample_binary_vectors, sample_ids):
        """Test retrieving a specific vector by ID."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)

        # Get existing vector -- returned in packed (bytes) form.
        vec = index.get_vector(1)
        assert vec == packed[0]

        # Get non-existing vector
        vec = index.get_vector(9999)
        assert vec is None

    def test_clear(self, temp_db, sample_binary_vectors, sample_ids):
        """Test clearing all vectors from the index."""
        from codexlens.semantic.ann_index import BinaryANNIndex

        packed, _ = sample_binary_vectors
        index = BinaryANNIndex(temp_db, dim=256)
        index.add_vectors(sample_ids, packed)
        assert index.count() == 100

        # clear() also resets the loaded flag.
        index.clear()
        assert index.count() == 0
        assert not index.is_loaded

    def test_search_empty_index(self, temp_db):
        """Test searching an empty index."""
        from codexlens.semantic.ann_index import BinaryANNIndex
        import numpy as np

        index = BinaryANNIndex(temp_db, dim=256)
        query = np.packbits(np.zeros(256, dtype=np.uint8)).tobytes()

        ids, distances = index.search(query, top_k=5)

        # Empty index yields empty result lists, not an error.
        assert ids == []
        assert distances == []

    def test_update_existing_vector(self, temp_db):
        """Test updating an existing vector with new data."""
        from codexlens.semantic.ann_index import BinaryANNIndex
        import numpy as np

        index = BinaryANNIndex(temp_db, dim=256)

        # Add initial vector
        v1 = np.zeros(256, dtype=np.uint8)
        index.add_vectors_numpy([1], v1.reshape(1, -1))

        # Update with different vector under the same ID
        v2 = np.ones(256, dtype=np.uint8)
        index.add_vectors_numpy([1], v2.reshape(1, -1))

        # Count should still be 1 (upsert semantics, not append)
        assert index.count() == 1

        # Retrieved vector should be the updated one
        stored = index.get_vector(1)
        expected = np.packbits(v2).tobytes()
        assert stored == expected
|
|
|
|
|
|
class TestCreateAnnIndexFactory:
    """Tests for the ``create_ann_index`` factory function."""

    @pytest.fixture
    def temp_db(self):
        """Yield a temporary database path; the directory is removed on teardown."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield Path(tmpdir) / "_index.db"

    @pytest.mark.skipif(
        not _hnswlib_available(),
        reason="hnswlib not installed"
    )
    def test_create_hnsw_index(self, temp_db):
        """Factory should build a dense ANNIndex for index_type='hnsw'."""
        from codexlens.semantic.ann_index import ANNIndex, create_ann_index

        built = create_ann_index(temp_db, index_type="hnsw", dim=384)
        assert isinstance(built, ANNIndex)
        assert built.dim == 384

    def test_create_binary_index(self, temp_db):
        """Factory should build a BinaryANNIndex for index_type='binary'."""
        from codexlens.semantic.ann_index import BinaryANNIndex, create_ann_index

        built = create_ann_index(temp_db, index_type="binary", dim=256)
        assert isinstance(built, BinaryANNIndex)
        assert built.dim == 256

    def test_create_binary_index_default_dim(self, temp_db):
        """Binary index should fall back to 256 dims when none is given.

        When the dense default dim would be used with the binary type, the
        factory auto-adjusts it to the binary default of 256.
        """
        from codexlens.semantic.ann_index import BinaryANNIndex, create_ann_index

        built = create_ann_index(temp_db, index_type="binary")
        assert isinstance(built, BinaryANNIndex)
        assert built.dim == 256

    def test_invalid_index_type(self, temp_db):
        """An unknown index_type must raise a descriptive ValueError."""
        from codexlens.semantic.ann_index import create_ann_index

        with pytest.raises(ValueError, match="Invalid index_type"):
            create_ann_index(temp_db, index_type="invalid")

    def test_case_insensitive_index_type(self, temp_db):
        """index_type matching should ignore letter case."""
        from codexlens.semantic.ann_index import BinaryANNIndex, create_ann_index

        built = create_ann_index(temp_db, index_type="BINARY", dim=256)
        assert isinstance(built, BinaryANNIndex)
|