feat: Add multi-type embedding backends for cascade retrieval

- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend for high-precision dense vectors (2048 dimensions) for reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval.
- Introduced utility functions for embedding conversion and distance computation.

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration.
2026-02-05 01:50:27 +08:00 · 2026-01-02 10:52:43 +08:00
parent 195438d26a
commit e21d801523
13 changed files with 3449 additions and 6 deletions
--- a/codex-lens/tests/test_ann_index.py
+++ b/codex-lens/tests/test_ann_index.py
@@ -421,3 +421,323 @@ class TestSearchAccuracy:
        recall = overlap / len(bf_chunk_ids) if bf_chunk_ids else 1.0

        assert recall >= 0.8, f"ANN recall too low: {recall} (overlap: {overlap}, bf: {bf_chunk_ids}, ann: {ann_chunk_ids})"
+
+
+
+class TestBinaryANNIndex:
+    """Test suite for BinaryANNIndex class (Hamming distance-based search)."""
+
+    @pytest.fixture
+    def temp_db(self):
+        """Create a temporary database file."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            yield Path(tmpdir) / "_index.db"
+
+    @pytest.fixture
+    def sample_binary_vectors(self):
+        """Generate sample binary vectors for testing."""
+        import numpy as np
+        np.random.seed(42)
+        # 100 binary vectors of dimension 256 (packed as 32 bytes each)
+        binary_unpacked = (np.random.rand(100, 256) > 0.5).astype(np.uint8)
+        packed = [np.packbits(v).tobytes() for v in binary_unpacked]
+        return packed, binary_unpacked
+
+    @pytest.fixture
+    def sample_ids(self):
+        """Generate sample IDs."""
+        return list(range(1, 101))
+
+    def test_create_binary_index(self, temp_db):
+        """Test creating a new Binary ANN index."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        index = BinaryANNIndex(temp_db, dim=256)
+        assert index.dim == 256
+        assert index.packed_dim == 32
+        assert index.count() == 0
+        assert not index.is_loaded
+
+    def test_invalid_dimension(self, temp_db):
+        """Test that invalid dimensions are rejected."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        # Dimension must be divisible by 8
+        with pytest.raises(ValueError, match="divisible by 8"):
+            BinaryANNIndex(temp_db, dim=255)
+
+        with pytest.raises(ValueError, match="positive"):
+            BinaryANNIndex(temp_db, dim=0)
+
+    def test_add_packed_vectors(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test adding packed binary vectors to the index."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, _ = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors(sample_ids, packed)
+
+        assert index.count() == 100
+        assert index.is_loaded
+
+    def test_add_numpy_vectors(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test adding unpacked numpy binary vectors."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+        import numpy as np
+
+        _, unpacked = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors_numpy(sample_ids, unpacked)
+
+        assert index.count() == 100
+
+    def test_search_packed(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test searching with packed binary query."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, _ = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors(sample_ids, packed)
+
+        # Search for the first vector - should find itself with distance 0
+        query = packed[0]
+        ids, distances = index.search(query, top_k=5)
+
+        assert len(ids) == 5
+        assert len(distances) == 5
+        # First result should be the query vector itself
+        assert ids[0] == 1
+        assert distances[0] == 0  # Hamming distance of 0 (identical)
+
+    def test_search_numpy(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test searching with unpacked numpy query."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, unpacked = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors(sample_ids, packed)
+
+        # Search for the first vector using numpy interface
+        query = unpacked[0]
+        ids, distances = index.search_numpy(query, top_k=5)
+
+        assert len(ids) == 5
+        assert ids[0] == 1
+        assert distances[0] == 0
+
+    def test_search_batch(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test batch search with multiple queries."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, _ = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors(sample_ids, packed)
+
+        # Search for first 3 vectors
+        queries = packed[:3]
+        results = index.search_batch(queries, top_k=5)
+
+        assert len(results) == 3
+        # Each result should find itself first
+        for i, (ids, dists) in enumerate(results):
+            assert ids[0] == i + 1
+            assert dists[0] == 0
+
+    def test_hamming_distance_ordering(self, temp_db):
+        """Test that results are ordered by Hamming distance."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+        import numpy as np
+
+        index = BinaryANNIndex(temp_db, dim=256)
+
+        # Create vectors with known Hamming distances from a query
+        query = np.zeros(256, dtype=np.uint8)  # All zeros
+        v1 = np.zeros(256, dtype=np.uint8)  # Distance 0
+        v2 = np.zeros(256, dtype=np.uint8); v2[:10] = 1  # Distance 10
+        v3 = np.zeros(256, dtype=np.uint8); v3[:50] = 1  # Distance 50
+        v4 = np.ones(256, dtype=np.uint8)  # Distance 256
+
+        index.add_vectors_numpy([1, 2, 3, 4], np.array([v1, v2, v3, v4]))
+
+        query_packed = np.packbits(query).tobytes()
+        ids, distances = index.search(query_packed, top_k=4)
+
+        assert ids == [1, 2, 3, 4]
+        assert distances == [0, 10, 50, 256]
+
+    def test_save_and_load(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test saving and loading binary index from disk."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, _ = sample_binary_vectors
+
+        # Create and save index
+        index1 = BinaryANNIndex(temp_db, dim=256)
+        index1.add_vectors(sample_ids, packed)
+        index1.save()
+
+        # Check that file was created
+        binary_path = temp_db.parent / f"{temp_db.stem}_binary_vectors.bin"
+        assert binary_path.exists()
+
+        # Load in new instance
+        index2 = BinaryANNIndex(temp_db, dim=256)
+        loaded = index2.load()
+
+        assert loaded is True
+        assert index2.count() == 100
+        assert index2.is_loaded
+
+        # Verify search still works
+        query = packed[0]
+        ids, distances = index2.search(query, top_k=5)
+        assert ids[0] == 1
+        assert distances[0] == 0
+
+    def test_load_nonexistent(self, temp_db):
+        """Test loading when index file doesn't exist."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        index = BinaryANNIndex(temp_db, dim=256)
+        loaded = index.load()
+
+        assert loaded is False
+        assert not index.is_loaded
+
+    def test_remove_vectors(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test removing vectors from the index."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, _ = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors(sample_ids, packed)
+
+        # Remove first 10 vectors
+        index.remove_vectors(list(range(1, 11)))
+
+        assert index.count() == 90
+
+        # Removed vectors should not be findable
+        query = packed[0]
+        ids, _ = index.search(query, top_k=100)
+        for removed_id in range(1, 11):
+            assert removed_id not in ids
+
+    def test_get_vector(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test retrieving a specific vector by ID."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, _ = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors(sample_ids, packed)
+
+        # Get existing vector
+        vec = index.get_vector(1)
+        assert vec == packed[0]
+
+        # Get non-existing vector
+        vec = index.get_vector(9999)
+        assert vec is None
+
+    def test_clear(self, temp_db, sample_binary_vectors, sample_ids):
+        """Test clearing all vectors from the index."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+
+        packed, _ = sample_binary_vectors
+        index = BinaryANNIndex(temp_db, dim=256)
+        index.add_vectors(sample_ids, packed)
+        assert index.count() == 100
+
+        index.clear()
+        assert index.count() == 0
+        assert not index.is_loaded
+
+    def test_search_empty_index(self, temp_db):
+        """Test searching an empty index."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+        import numpy as np
+
+        index = BinaryANNIndex(temp_db, dim=256)
+        query = np.packbits(np.zeros(256, dtype=np.uint8)).tobytes()
+
+        ids, distances = index.search(query, top_k=5)
+
+        assert ids == []
+        assert distances == []
+
+    def test_update_existing_vector(self, temp_db):
+        """Test updating an existing vector with new data."""
+        from codexlens.semantic.ann_index import BinaryANNIndex
+        import numpy as np
+
+        index = BinaryANNIndex(temp_db, dim=256)
+
+        # Add initial vector
+        v1 = np.zeros(256, dtype=np.uint8)
+        index.add_vectors_numpy([1], v1.reshape(1, -1))
+
+        # Update with different vector
+        v2 = np.ones(256, dtype=np.uint8)
+        index.add_vectors_numpy([1], v2.reshape(1, -1))
+
+        # Count should still be 1
+        assert index.count() == 1
+
+        # Retrieved vector should be the updated one
+        stored = index.get_vector(1)
+        expected = np.packbits(v2).tobytes()
+        assert stored == expected
+
+
+class TestCreateAnnIndexFactory:
+    """Test suite for create_ann_index factory function."""
+
+    @pytest.fixture
+    def temp_db(self):
+        """Create a temporary database file."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            yield Path(tmpdir) / "_index.db"
+
+    @pytest.mark.skipif(
+        not _hnswlib_available(),
+        reason="hnswlib not installed"
+    )
+    def test_create_hnsw_index(self, temp_db):
+        """Test creating HNSW index via factory."""
+        from codexlens.semantic.ann_index import create_ann_index, ANNIndex
+
+        index = create_ann_index(temp_db, index_type="hnsw", dim=384)
+        assert isinstance(index, ANNIndex)
+        assert index.dim == 384
+
+    def test_create_binary_index(self, temp_db):
+        """Test creating binary index via factory."""
+        from codexlens.semantic.ann_index import create_ann_index, BinaryANNIndex
+
+        index = create_ann_index(temp_db, index_type="binary", dim=256)
+        assert isinstance(index, BinaryANNIndex)
+        assert index.dim == 256
+
+    def test_create_binary_index_default_dim(self, temp_db):
+        """Test that binary index defaults to 256 dim when dense default is used."""
+        from codexlens.semantic.ann_index import create_ann_index, BinaryANNIndex
+
+        # dim is omitted here, so the factory falls back to its own default
+        # (2048, the dense size); for "binary" it should auto-adjust to 256
+        index = create_ann_index(temp_db, index_type="binary")
+        assert isinstance(index, BinaryANNIndex)
+        assert index.dim == 256
+
+    def test_invalid_index_type(self, temp_db):
+        """Test that invalid index type raises error."""
+        from codexlens.semantic.ann_index import create_ann_index
+
+        with pytest.raises(ValueError, match="Invalid index_type"):
+            create_ann_index(temp_db, index_type="invalid")
+
+    def test_case_insensitive_index_type(self, temp_db):
+        """Test that index_type is case-insensitive."""
+        from codexlens.semantic.ann_index import create_ann_index, BinaryANNIndex
+
+        index = create_ann_index(temp_db, index_type="BINARY", dim=256)
+        assert isinstance(index, BinaryANNIndex)
--- a/codex-lens/tests/test_sqlite_store.py
+++ b/codex-lens/tests/test_sqlite_store.py
@@ -201,3 +201,244 @@ def test_add_files_rollback_failure_is_chained(
        assert "boom" in caplog.text
    finally:
        store.close()
+
+
+class TestMultiVectorChunks:
+    """Tests for multi-vector chunk storage operations."""
+
+    def test_add_chunks_basic(self, tmp_path: Path) -> None:
+        """Basic chunk insertion without embeddings."""
+        store = SQLiteStore(tmp_path / "chunks_basic.db")
+        store.initialize()
+
+        try:
+            chunks_data = [
+                {"content": "def hello(): pass", "metadata": {"type": "function"}},
+                {"content": "class World: pass", "metadata": {"type": "class"}},
+            ]
+
+            ids = store.add_chunks("test.py", chunks_data)
+
+            assert len(ids) == 2
+            assert ids == [1, 2]
+            assert store.count_chunks() == 2
+        finally:
+            store.close()
+
+    def test_add_chunks_with_binary_embeddings(self, tmp_path: Path) -> None:
+        """Chunk insertion with binary embeddings for coarse ranking."""
+        store = SQLiteStore(tmp_path / "chunks_binary.db")
+        store.initialize()
+
+        try:
+            chunks_data = [
+                {"content": "content1"},
+                {"content": "content2"},
+            ]
+            # 256-bit binary = 32 bytes
+            binary_embs = [b"\x00" * 32, b"\xff" * 32]
+
+            ids = store.add_chunks(
+                "test.py", chunks_data, embedding_binary=binary_embs
+            )
+
+            assert len(ids) == 2
+
+            retrieved = store.get_binary_embeddings(ids)
+            assert len(retrieved) == 2
+            assert retrieved[ids[0]] == b"\x00" * 32
+            assert retrieved[ids[1]] == b"\xff" * 32
+        finally:
+            store.close()
+
+    def test_add_chunks_with_dense_embeddings(self, tmp_path: Path) -> None:
+        """Chunk insertion with dense embeddings for fine ranking."""
+        store = SQLiteStore(tmp_path / "chunks_dense.db")
+        store.initialize()
+
+        try:
+            chunks_data = [{"content": "content1"}, {"content": "content2"}]
+            # 2048 floats x 4 bytes each (32-bit) = 8192 bytes
+            dense_embs = [b"\x00" * 8192, b"\xff" * 8192]
+
+            ids = store.add_chunks(
+                "test.py", chunks_data, embedding_dense=dense_embs
+            )
+
+            assert len(ids) == 2
+
+            retrieved = store.get_dense_embeddings(ids)
+            assert len(retrieved) == 2
+            assert retrieved[ids[0]] == b"\x00" * 8192
+            assert retrieved[ids[1]] == b"\xff" * 8192
+        finally:
+            store.close()
+
+    def test_add_chunks_with_all_embeddings(self, tmp_path: Path) -> None:
+        """Chunk insertion with all embedding types."""
+        store = SQLiteStore(tmp_path / "chunks_all.db")
+        store.initialize()
+
+        try:
+            chunks_data = [{"content": "full test"}]
+            embedding = [[0.1, 0.2, 0.3]]
+            binary_embs = [b"\xab" * 32]
+            dense_embs = [b"\xcd" * 8192]
+
+            ids = store.add_chunks(
+                "test.py",
+                chunks_data,
+                embedding=embedding,
+                embedding_binary=binary_embs,
+                embedding_dense=dense_embs,
+            )
+
+            assert len(ids) == 1
+
+            binary = store.get_binary_embeddings(ids)
+            dense = store.get_dense_embeddings(ids)
+
+            assert binary[ids[0]] == b"\xab" * 32
+            assert dense[ids[0]] == b"\xcd" * 8192
+        finally:
+            store.close()
+
+    def test_add_chunks_length_mismatch_raises(self, tmp_path: Path) -> None:
+        """Mismatched embedding length should raise ValueError."""
+        store = SQLiteStore(tmp_path / "chunks_mismatch.db")
+        store.initialize()
+
+        try:
+            chunks_data = [{"content": "a"}, {"content": "b"}]
+
+            with pytest.raises(ValueError, match="embedding_binary length"):
+                store.add_chunks(
+                    "test.py", chunks_data, embedding_binary=[b"\x00" * 32]
+                )
+
+            with pytest.raises(ValueError, match="embedding_dense length"):
+                store.add_chunks(
+                    "test.py", chunks_data, embedding_dense=[b"\x00" * 8192]
+                )
+
+            with pytest.raises(ValueError, match="embedding length"):
+                store.add_chunks(
+                    "test.py", chunks_data, embedding=[[0.1]]
+                )
+        finally:
+            store.close()
+
+    def test_get_chunks_by_ids(self, tmp_path: Path) -> None:
+        """Retrieve chunk data by IDs."""
+        store = SQLiteStore(tmp_path / "chunks_get.db")
+        store.initialize()
+
+        try:
+            chunks_data = [
+                {"content": "def foo(): pass", "metadata": {"line": 1}},
+                {"content": "def bar(): pass", "metadata": {"line": 5}},
+            ]
+
+            ids = store.add_chunks("test.py", chunks_data)
+            retrieved = store.get_chunks_by_ids(ids)
+
+            assert len(retrieved) == 2
+            assert retrieved[0]["content"] == "def foo(): pass"
+            assert retrieved[0]["metadata"]["line"] == 1
+            assert retrieved[1]["content"] == "def bar(): pass"
+            assert retrieved[1]["file_path"] == "test.py"
+        finally:
+            store.close()
+
+    def test_delete_chunks_by_file(self, tmp_path: Path) -> None:
+        """Delete all chunks for a file."""
+        store = SQLiteStore(tmp_path / "chunks_delete.db")
+        store.initialize()
+
+        try:
+            store.add_chunks("a.py", [{"content": "a1"}, {"content": "a2"}])
+            store.add_chunks("b.py", [{"content": "b1"}])
+
+            assert store.count_chunks() == 3
+
+            deleted = store.delete_chunks_by_file("a.py")
+            assert deleted == 2
+            assert store.count_chunks() == 1
+
+            deleted = store.delete_chunks_by_file("nonexistent.py")
+            assert deleted == 0
+        finally:
+            store.close()
+
+    def test_get_embeddings_empty_list(self, tmp_path: Path) -> None:
+        """Empty chunk ID list returns empty dict."""
+        store = SQLiteStore(tmp_path / "chunks_empty.db")
+        store.initialize()
+
+        try:
+            assert store.get_binary_embeddings([]) == {}
+            assert store.get_dense_embeddings([]) == {}
+            assert store.get_chunks_by_ids([]) == []
+        finally:
+            store.close()
+
+    def test_add_chunks_empty_list(self, tmp_path: Path) -> None:
+        """Empty chunks list returns empty IDs."""
+        store = SQLiteStore(tmp_path / "chunks_empty_add.db")
+        store.initialize()
+
+        try:
+            ids = store.add_chunks("test.py", [])
+            assert ids == []
+            assert store.count_chunks() == 0
+        finally:
+            store.close()
+
+    def test_chunks_table_migration(self, tmp_path: Path) -> None:
+        """Existing chunks table gets new columns via migration."""
+        db_path = tmp_path / "chunks_migration.db"
+
+        # Create old schema without multi-vector columns
+        conn = sqlite3.connect(db_path)
+        conn.execute(
+            """
+            CREATE TABLE chunks (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                file_path TEXT NOT NULL,
+                content TEXT NOT NULL,
+                embedding BLOB,
+                metadata TEXT,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+            """
+        )
+        conn.execute("CREATE INDEX idx_chunks_file_path ON chunks(file_path)")
+        conn.execute(
+            "INSERT INTO chunks (file_path, content) VALUES ('old.py', 'old content')"
+        )
+        conn.commit()
+        conn.close()
+
+        # Open with SQLiteStore - should migrate
+        store = SQLiteStore(db_path)
+        store.initialize()
+
+        try:
+            # Verify new columns exist by using them
+            ids = store.add_chunks(
+                "new.py",
+                [{"content": "new content"}],
+                embedding_binary=[b"\x00" * 32],
+                embedding_dense=[b"\x00" * 8192],
+            )
+
+            assert len(ids) == 1
+
+            # Old data should still be accessible
+            assert store.count_chunks() == 2
+
+            # New embeddings should work
+            binary = store.get_binary_embeddings(ids)
+            assert binary[ids[0]] == b"\x00" * 32
+        finally:
+            store.close()