feat: Add multi-type embedding backends for cascade retrieval

- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend for high-precision dense vectors (2048 dimensions) for reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval (see the sketch after this list).
- Introduced utility functions for embedding conversion and distance computation.
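
To make the intended flow concrete, here is a minimal sketch of two-stage cascade retrieval over packed binary and dense vectors. It is illustrative only: the function name, parameters, and cosine scoring are assumptions for this note, not the committed backend API.

import numpy as np

def cascade_search(query_bits, query_dense, bit_matrix, dense_matrix, ids,
                   coarse_k=100, top_k=10):
    """Sketch: Hamming-distance prefilter, then cosine rerank of survivors.

    query_bits:   (32,) uint8, packed 256-bit query vector
    query_dense:  (2048,) float32 dense query vector
    bit_matrix:   (N, 32) uint8, packed binary corpus vectors
    dense_matrix: (N, 2048) float32 dense corpus vectors
    """
    # Stage 1: coarse filter via XOR + popcount (Hamming distance)
    hamming = np.unpackbits(np.bitwise_xor(bit_matrix, query_bits), axis=1).sum(axis=1)
    candidates = np.argsort(hamming)[:coarse_k]
    # Stage 2: rerank only the shortlist with high-precision dense vectors
    shortlist = dense_matrix[candidates]
    sims = shortlist @ query_dense / (
        np.linalg.norm(shortlist, axis=1) * np.linalg.norm(query_dense) + 1e-9
    )
    return [ids[i] for i in candidates[np.argsort(-sims)[:top_k]]]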

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration (sketched below).
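
A minimal sketch of what the upgrade/downgrade pair could look like, assuming SQLite with BLOB columns; the 'chunks' table and embedding column names come from this message, while the connection handling and content column are illustrative.

import sqlite3

def upgrade(conn: sqlite3.Connection) -> None:
    # Hypothetical shape of migration 010: one BLOB per embedding type.
    # embedding_binary packs 256 bits into 32 bytes; embedding_dense stores
    # 2048 float32 values (8192 bytes).
    conn.executescript(
        """
        CREATE TABLE IF NOT EXISTS chunks (
            id               INTEGER PRIMARY KEY,
            content          TEXT NOT NULL,
            embedding_binary BLOB,  -- 256-bit packed binary vector (32 bytes)
            embedding_dense  BLOB   -- 2048-dim float32 vector (8192 bytes)
        );
        """
    )

def downgrade(conn: sqlite3.Connection) -> None:
    # Hypothetical inverse: drop the multi-vector table added above.
    conn.execute("DROP TABLE IF EXISTS chunks")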
catlog22
2026-01-02 10:52:43 +08:00
parent 195438d26a
commit e21d801523
13 changed files with 3449 additions and 6 deletions


@@ -412,3 +412,489 @@ class ANNIndex:
"""
with self._lock:
            return self._index is not None and self._current_count > 0


class BinaryANNIndex:
"""Binary vector ANN index using Hamming distance for fast coarse retrieval.
Optimized for binary vectors (256-bit / 32 bytes per vector).
Uses packed binary representation for memory efficiency.
Performance characteristics:
- Storage: 32 bytes per vector (vs ~8KB for dense vectors)
- Distance: Hamming distance via XOR + popcount (CPU-efficient)
- Search: O(N) brute-force with SIMD-accelerated distance computation
Index parameters:
- dim: Binary vector dimension (default: 256)
- packed_dim: Packed bytes size (dim / 8 = 32 for 256-bit)
Usage:
index = BinaryANNIndex(index_path, dim=256)
index.add_vectors([1, 2, 3], packed_vectors) # List of 32-byte packed vectors
ids, distances = index.search(query_packed, top_k=10)
"""
DEFAULT_DIM = 256 # Default binary vector dimension
def __init__(
self,
index_path: Path,
dim: int = 256,
initial_capacity: int = 100000,
auto_save: bool = False,
) -> None:
"""Initialize Binary ANN index.
Args:
index_path: Path to database (index will be saved as _binary_vectors.bin)
dim: Dimension of binary vectors (default: 256)
initial_capacity: Initial capacity hint (default: 100000)
auto_save: Whether to automatically save index after operations
Raises:
ImportError: If required dependencies are not available
ValueError: If dimension is invalid
"""
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
if dim <= 0 or dim % 8 != 0:
raise ValueError(
f"Invalid dimension: {dim}. Must be positive and divisible by 8."
)
self.index_path = Path(index_path)
self.dim = dim
self.packed_dim = dim // 8 # 32 bytes for 256-bit vectors
# Derive binary index path from database path
db_stem = self.index_path.stem
self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin"
# Memory management
self._auto_save = auto_save
self._initial_capacity = initial_capacity
# Thread safety
self._lock = threading.RLock()
# In-memory storage: id -> packed binary vector
self._vectors: dict[int, bytes] = {}
self._id_list: list[int] = [] # Ordered list for efficient iteration
logger.info(
f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}"
        )

    def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None:
"""Add packed binary vectors to the index.
Args:
ids: List of vector IDs (must be unique)
vectors: List of packed binary vectors (each of size packed_dim bytes)
Raises:
ValueError: If shapes don't match or vectors are invalid
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if len(vectors) != len(ids):
raise ValueError(
f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})"
)
# Validate vector sizes
for i, vec in enumerate(vectors):
if len(vec) != self.packed_dim:
raise ValueError(
f"Vector {i} has size {len(vec)}, expected {self.packed_dim}"
)
with self._lock:
try:
for vec_id, vec in zip(ids, vectors):
if vec_id not in self._vectors:
self._id_list.append(vec_id)
self._vectors[vec_id] = vec
logger.debug(
f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})"
)
if self._auto_save:
self.save()
except Exception as e:
raise StorageError(f"Failed to add vectors to Binary ANN index: {e}")
def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None:
"""Add unpacked binary vectors (0/1 values) to the index.
Convenience method that packs the vectors before adding.
Args:
ids: List of vector IDs (must be unique)
vectors: Numpy array of shape (N, dim) with binary values (0 or 1)
Raises:
ValueError: If shapes don't match
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if vectors.shape[0] != len(ids):
raise ValueError(
f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})"
)
if vectors.shape[1] != self.dim:
raise ValueError(
f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})"
)
# Pack vectors
packed_vectors = []
for i in range(vectors.shape[0]):
packed = np.packbits(vectors[i].astype(np.uint8)).tobytes()
packed_vectors.append(packed)
        self.add_vectors(ids, packed_vectors)

    def remove_vectors(self, ids: List[int]) -> None:
"""Remove vectors from the index.
Args:
ids: List of vector IDs to remove
Raises:
StorageError: If index operation fails
Note:
Optimized for batch deletion using set operations instead of
O(N) list.remove() calls for each ID.
"""
if len(ids) == 0:
return
with self._lock:
try:
# Use set for O(1) lookup during filtering
ids_to_remove = set(ids)
removed_count = 0
# Remove from dictionary - O(1) per deletion
for vec_id in ids_to_remove:
if vec_id in self._vectors:
del self._vectors[vec_id]
removed_count += 1
# Rebuild ID list efficiently - O(N) once instead of O(N) per removal
if removed_count > 0:
self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove]
logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index")
if self._auto_save and removed_count > 0:
self.save()
except Exception as e:
raise StorageError(
f"Failed to remove vectors from Binary ANN index: {e}"
                )

    def search(
self, query: bytes, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search for nearest neighbors using Hamming distance.
Args:
query: Packed binary query vector (size: packed_dim bytes)
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances) where:
- ids: List of vector IDs ordered by Hamming distance (ascending)
- distances: List of Hamming distances (lower = more similar)
Raises:
ValueError: If query size is invalid
StorageError: If search operation fails
"""
if len(query) != self.packed_dim:
raise ValueError(
f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})"
)
with self._lock:
try:
if len(self._vectors) == 0:
return [], []
# Compute Hamming distances to all vectors
query_arr = np.frombuffer(query, dtype=np.uint8)
distances = []
for vec_id in self._id_list:
vec = self._vectors[vec_id]
vec_arr = np.frombuffer(vec, dtype=np.uint8)
# XOR and popcount for Hamming distance
xor = np.bitwise_xor(query_arr, vec_arr)
dist = int(np.unpackbits(xor).sum())
distances.append((vec_id, dist))
# Sort by distance (ascending)
distances.sort(key=lambda x: x[1])
# Return top-k
top_results = distances[:top_k]
ids = [r[0] for r in top_results]
dists = [r[1] for r in top_results]
return ids, dists
except Exception as e:
raise StorageError(f"Failed to search Binary ANN index: {e}")
def search_numpy(
self, query: np.ndarray, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search with unpacked binary query vector.
Convenience method that packs the query before searching.
Args:
query: Binary query vector of shape (dim,) with values 0 or 1
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances)
"""
if query.ndim == 2:
query = query.flatten()
if len(query) != self.dim:
raise ValueError(
f"Query dimension ({len(query)}) must match index dimension ({self.dim})"
)
packed_query = np.packbits(query.astype(np.uint8)).tobytes()
        return self.search(packed_query, top_k)

    def search_batch(
self, queries: List[bytes], top_k: int = 10
) -> List[Tuple[List[int], List[int]]]:
"""Batch search for multiple queries.
Args:
queries: List of packed binary query vectors
top_k: Number of nearest neighbors to return per query
Returns:
List of (ids, distances) tuples, one per query
"""
results = []
for query in queries:
ids, dists = self.search(query, top_k)
results.append((ids, dists))
        return results

    def save(self) -> None:
"""Save index to disk.
Binary format:
- 4 bytes: magic number (0x42494E56 = "BINV")
- 4 bytes: version (1)
- 4 bytes: dim
- 4 bytes: packed_dim
- 4 bytes: num_vectors
- For each vector:
- 4 bytes: id
- packed_dim bytes: vector data
Raises:
StorageError: If save operation fails
"""
with self._lock:
try:
if len(self._vectors) == 0:
logger.debug("Skipping save: index is empty")
return
# Ensure parent directory exists
self.binary_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.binary_path, "wb") as f:
# Header
f.write(b"BINV") # Magic number
f.write(np.array([1], dtype=np.uint32).tobytes()) # Version
f.write(np.array([self.dim], dtype=np.uint32).tobytes())
f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes())
f.write(
np.array([len(self._vectors)], dtype=np.uint32).tobytes()
)
# Vectors
for vec_id in self._id_list:
f.write(np.array([vec_id], dtype=np.uint32).tobytes())
f.write(self._vectors[vec_id])
logger.debug(
f"Saved binary index to {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
except Exception as e:
raise StorageError(f"Failed to save Binary ANN index: {e}")
def load(self) -> bool:
"""Load index from disk.
Returns:
True if index was loaded successfully, False if index file doesn't exist
Raises:
StorageError: If load operation fails
"""
with self._lock:
try:
if not self.binary_path.exists():
logger.debug(f"Binary index file not found: {self.binary_path}")
return False
with open(self.binary_path, "rb") as f:
# Read header
magic = f.read(4)
if magic != b"BINV":
                        raise StorageError(
                            "Invalid binary index file: bad magic number"
                        )
version = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if version != 1:
raise StorageError(
f"Unsupported binary index version: {version}"
)
file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if file_dim != self.dim or file_packed_dim != self.packed_dim:
raise StorageError(
f"Dimension mismatch: file has dim={file_dim}, "
f"packed_dim={file_packed_dim}, "
f"expected dim={self.dim}, packed_dim={self.packed_dim}"
)
# Clear existing data
self._vectors.clear()
self._id_list.clear()
# Read vectors
for _ in range(num_vectors):
vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0]
vec_data = f.read(self.packed_dim)
self._vectors[int(vec_id)] = vec_data
self._id_list.append(int(vec_id))
logger.info(
f"Loaded binary index from {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
return True
except StorageError:
raise
except Exception as e:
raise StorageError(f"Failed to load Binary ANN index: {e}")
def count(self) -> int:
"""Get number of vectors in the index.
Returns:
Number of vectors currently in the index
"""
with self._lock:
            return len(self._vectors)

    @property
def is_loaded(self) -> bool:
"""Check if index has vectors.
Returns:
True if index has vectors, False otherwise
"""
with self._lock:
            return len(self._vectors) > 0

    def get_vector(self, vec_id: int) -> Optional[bytes]:
"""Get a specific vector by ID.
Args:
vec_id: Vector ID to retrieve
Returns:
Packed binary vector or None if not found
"""
with self._lock:
            return self._vectors.get(vec_id)

    def clear(self) -> None:
"""Clear all vectors from the index."""
with self._lock:
self._vectors.clear()
self._id_list.clear()
logger.debug("Cleared binary index")
def create_ann_index(
index_path: Path,
index_type: str = "hnsw",
dim: int = 2048,
**kwargs,
) -> ANNIndex | BinaryANNIndex:
"""Factory function to create an ANN index.
Args:
index_path: Path to database file
index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors
dim: Vector dimension (default: 2048 for dense, 256 for binary)
**kwargs: Additional arguments passed to the index constructor
Returns:
ANNIndex for dense vectors or BinaryANNIndex for binary vectors
Raises:
ValueError: If index_type is invalid
Example:
>>> # Dense vector index (HNSW)
>>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048)
>>> dense_index.add_vectors(ids, dense_vectors)
>>>
>>> # Binary vector index (Hamming distance)
>>> binary_index = create_ann_index(path, index_type="binary", dim=256)
>>> binary_index.add_vectors(ids, packed_vectors)
"""
index_type = index_type.lower()
if index_type == "hnsw":
return ANNIndex(index_path=index_path, dim=dim, **kwargs)
elif index_type == "binary":
# Default to 256 for binary if not specified
if dim == 2048: # Default dense dim was used
dim = 256
return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs)
else:
raise ValueError(
f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'."
)
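
As a hedged usage sketch tying the two index types together (the corpus below
is random, db_path is hypothetical, and ANNIndex.add_vectors accepting an
(N, dim) float array is assumed from the docstring example above):

import numpy as np
from pathlib import Path

db_path = Path("project.db")  # hypothetical database path
binary_index = create_ann_index(db_path, index_type="binary", dim=256)
dense_index = create_ann_index(db_path, index_type="hnsw", dim=2048)

ids = list(range(1000))
bits = np.random.randint(0, 2, size=(1000, 256))       # unpacked 0/1 vectors
dense = np.random.rand(1000, 2048).astype(np.float32)
binary_index.add_vectors_numpy(ids, bits)              # packs via np.packbits
dense_index.add_vectors(ids, dense)

# Coarse stage: Hamming shortlist of 100 candidates for later dense reranking.
query = np.packbits(bits[0].astype(np.uint8)).tobytes()
candidate_ids, hamming_dists = binary_index.search(query, top_k=100)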


@@ -29,10 +29,17 @@ except ImportError:
# Try to import ANN index (optional hnswlib dependency)
try:
from codexlens.semantic.ann_index import (
ANNIndex,
BinaryANNIndex,
create_ann_index,
HNSWLIB_AVAILABLE,
)
except ImportError:
HNSWLIB_AVAILABLE = False
ANNIndex = None
BinaryANNIndex = None
    create_ann_index = None

logger = logging.getLogger(__name__)
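
# Illustrative call site (an assumption, not shown in this diff): consumers can
# feature-gate on the optional names so a missing hnswlib degrades gracefully
# instead of raising at import time:
#
#     if HNSWLIB_AVAILABLE and create_ann_index is not None:
#         index = create_ann_index(db_path, index_type="binary", dim=256)
#     else:
#         index = None  # semantic search unavailable; fall back to lexical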