feat: Add multi-type embedding backends for cascade retrieval

- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend for high-precision dense vectors (2048 dimensions) for reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval (see the sketch after this list).
- Introduced utility functions for embedding conversion and distance computation.
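
To make the intended flow concrete, here is a minimal sketch of two-stage cascade retrieval over packed binary and dense vectors. It is illustrative only: the function name, parameters, and cosine scoring are assumptions for this note, not the committed backend API.

import numpy as np

def cascade_search(query_bits, query_dense, bit_matrix, dense_matrix, ids,
                   coarse_k=100, top_k=10):
    """Sketch: Hamming-distance prefilter, then cosine rerank of survivors.

    query_bits:   (32,) uint8, packed 256-bit query vector
    query_dense:  (2048,) float32 dense query vector
    bit_matrix:   (N, 32) uint8, packed binary corpus vectors
    dense_matrix: (N, 2048) float32 dense corpus vectors
    """
    # Stage 1: coarse filter via XOR + popcount (Hamming distance)
    hamming = np.unpackbits(np.bitwise_xor(bit_matrix, query_bits), axis=1).sum(axis=1)
    candidates = np.argsort(hamming)[:coarse_k]
    # Stage 2: rerank only the shortlist with high-precision dense vectors
    shortlist = dense_matrix[candidates]
    sims = shortlist @ query_dense / (
        np.linalg.norm(shortlist, axis=1) * np.linalg.norm(query_dense) + 1e-9
    )
    return [ids[i] for i in candidates[np.argsort(-sims)[:top_k]]]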

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration (sketched below).
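
A minimal sketch of what the upgrade/downgrade pair could look like, assuming SQLite with BLOB columns; the 'chunks' table and embedding column names come from this message, while the connection handling and content column are illustrative.

import sqlite3

def upgrade(conn: sqlite3.Connection) -> None:
    # Hypothetical shape of migration 010: one BLOB per embedding type.
    # embedding_binary packs 256 bits into 32 bytes; embedding_dense stores
    # 2048 float32 values (8192 bytes).
    conn.executescript(
        """
        CREATE TABLE IF NOT EXISTS chunks (
            id               INTEGER PRIMARY KEY,
            content          TEXT NOT NULL,
            embedding_binary BLOB,  -- 256-bit packed binary vector (32 bytes)
            embedding_dense  BLOB   -- 2048-dim float32 vector (8192 bytes)
        );
        """
    )

def downgrade(conn: sqlite3.Connection) -> None:
    # Hypothetical inverse: drop the multi-vector table added above.
    conn.execute("DROP TABLE IF EXISTS chunks")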
catlog22
2026-01-02 10:52:43 +08:00
parent 195438d26a
commit e21d801523
13 changed files with 3449 additions and 6 deletions


@@ -412,3 +412,489 @@ class ANNIndex:
"""
with self._lock:
            return self._index is not None and self._current_count > 0


class BinaryANNIndex:
"""Binary vector ANN index using Hamming distance for fast coarse retrieval.
Optimized for binary vectors (256-bit / 32 bytes per vector).
Uses packed binary representation for memory efficiency.
Performance characteristics:
- Storage: 32 bytes per vector (vs ~8KB for dense vectors)
- Distance: Hamming distance via XOR + popcount (CPU-efficient)
- Search: O(N) brute-force with SIMD-accelerated distance computation
Index parameters:
- dim: Binary vector dimension (default: 256)
- packed_dim: Packed bytes size (dim / 8 = 32 for 256-bit)
Usage:
index = BinaryANNIndex(index_path, dim=256)
index.add_vectors([1, 2, 3], packed_vectors) # List of 32-byte packed vectors
ids, distances = index.search(query_packed, top_k=10)
"""
DEFAULT_DIM = 256 # Default binary vector dimension
def __init__(
self,
index_path: Path,
dim: int = 256,
initial_capacity: int = 100000,
auto_save: bool = False,
) -> None:
"""Initialize Binary ANN index.
Args:
index_path: Path to database (index will be saved as _binary_vectors.bin)
dim: Dimension of binary vectors (default: 256)
initial_capacity: Initial capacity hint (default: 100000)
auto_save: Whether to automatically save index after operations
Raises:
ImportError: If required dependencies are not available
ValueError: If dimension is invalid
"""
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
if dim <= 0 or dim % 8 != 0:
raise ValueError(
f"Invalid dimension: {dim}. Must be positive and divisible by 8."
)
self.index_path = Path(index_path)
self.dim = dim
self.packed_dim = dim // 8 # 32 bytes for 256-bit vectors
# Derive binary index path from database path
db_stem = self.index_path.stem
self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin"
# Memory management
self._auto_save = auto_save
self._initial_capacity = initial_capacity
# Thread safety
self._lock = threading.RLock()
# In-memory storage: id -> packed binary vector
self._vectors: dict[int, bytes] = {}
self._id_list: list[int] = [] # Ordered list for efficient iteration
logger.info(
f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}"
        )

    def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None:
"""Add packed binary vectors to the index.
Args:
ids: List of vector IDs (must be unique)
vectors: List of packed binary vectors (each of size packed_dim bytes)
Raises:
ValueError: If shapes don't match or vectors are invalid
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if len(vectors) != len(ids):
raise ValueError(
f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})"
)
# Validate vector sizes
for i, vec in enumerate(vectors):
if len(vec) != self.packed_dim:
raise ValueError(
f"Vector {i} has size {len(vec)}, expected {self.packed_dim}"
)
with self._lock:
try:
for vec_id, vec in zip(ids, vectors):
if vec_id not in self._vectors:
self._id_list.append(vec_id)
self._vectors[vec_id] = vec
logger.debug(
f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})"
)
if self._auto_save:
self.save()
except Exception as e:
raise StorageError(f"Failed to add vectors to Binary ANN index: {e}")
def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None:
"""Add unpacked binary vectors (0/1 values) to the index.
Convenience method that packs the vectors before adding.
Args:
ids: List of vector IDs (must be unique)
vectors: Numpy array of shape (N, dim) with binary values (0 or 1)
Raises:
ValueError: If shapes don't match
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if vectors.shape[0] != len(ids):
raise ValueError(
f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})"
)
if vectors.shape[1] != self.dim:
raise ValueError(
f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})"
)
# Pack vectors
packed_vectors = []
for i in range(vectors.shape[0]):
packed = np.packbits(vectors[i].astype(np.uint8)).tobytes()
packed_vectors.append(packed)
        self.add_vectors(ids, packed_vectors)

    def remove_vectors(self, ids: List[int]) -> None:
"""Remove vectors from the index.
Args:
ids: List of vector IDs to remove
Raises:
StorageError: If index operation fails
Note:
Optimized for batch deletion using set operations instead of
O(N) list.remove() calls for each ID.
"""
if len(ids) == 0:
return
with self._lock:
try:
# Use set for O(1) lookup during filtering
ids_to_remove = set(ids)
removed_count = 0
# Remove from dictionary - O(1) per deletion
for vec_id in ids_to_remove:
if vec_id in self._vectors:
del self._vectors[vec_id]
removed_count += 1
# Rebuild ID list efficiently - O(N) once instead of O(N) per removal
if removed_count > 0:
self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove]
logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index")
if self._auto_save and removed_count > 0:
self.save()
except Exception as e:
raise StorageError(
f"Failed to remove vectors from Binary ANN index: {e}"
                )

    def search(
self, query: bytes, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search for nearest neighbors using Hamming distance.
Args:
query: Packed binary query vector (size: packed_dim bytes)
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances) where:
- ids: List of vector IDs ordered by Hamming distance (ascending)
- distances: List of Hamming distances (lower = more similar)
Raises:
ValueError: If query size is invalid
StorageError: If search operation fails
"""
if len(query) != self.packed_dim:
raise ValueError(
f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})"
)
with self._lock:
try:
if len(self._vectors) == 0:
return [], []
# Compute Hamming distances to all vectors
query_arr = np.frombuffer(query, dtype=np.uint8)
distances = []
for vec_id in self._id_list:
vec = self._vectors[vec_id]
vec_arr = np.frombuffer(vec, dtype=np.uint8)
# XOR and popcount for Hamming distance
xor = np.bitwise_xor(query_arr, vec_arr)
dist = int(np.unpackbits(xor).sum())
distances.append((vec_id, dist))
# Sort by distance (ascending)
distances.sort(key=lambda x: x[1])
# Return top-k
top_results = distances[:top_k]
ids = [r[0] for r in top_results]
dists = [r[1] for r in top_results]
return ids, dists
except Exception as e:
raise StorageError(f"Failed to search Binary ANN index: {e}")
def search_numpy(
self, query: np.ndarray, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search with unpacked binary query vector.
Convenience method that packs the query before searching.
Args:
query: Binary query vector of shape (dim,) with values 0 or 1
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances)
"""
if query.ndim == 2:
query = query.flatten()
if len(query) != self.dim:
raise ValueError(
f"Query dimension ({len(query)}) must match index dimension ({self.dim})"
)
packed_query = np.packbits(query.astype(np.uint8)).tobytes()
        return self.search(packed_query, top_k)

    def search_batch(
self, queries: List[bytes], top_k: int = 10
) -> List[Tuple[List[int], List[int]]]:
"""Batch search for multiple queries.
Args:
queries: List of packed binary query vectors
top_k: Number of nearest neighbors to return per query
Returns:
List of (ids, distances) tuples, one per query
"""
results = []
for query in queries:
ids, dists = self.search(query, top_k)
results.append((ids, dists))
        return results

    def save(self) -> None:
"""Save index to disk.
Binary format:
- 4 bytes: magic number (0x42494E56 = "BINV")
- 4 bytes: version (1)
- 4 bytes: dim
- 4 bytes: packed_dim
- 4 bytes: num_vectors
- For each vector:
- 4 bytes: id
- packed_dim bytes: vector data
Raises:
StorageError: If save operation fails
"""
with self._lock:
try:
if len(self._vectors) == 0:
logger.debug("Skipping save: index is empty")
return
# Ensure parent directory exists
self.binary_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.binary_path, "wb") as f:
# Header
f.write(b"BINV") # Magic number
f.write(np.array([1], dtype=np.uint32).tobytes()) # Version
f.write(np.array([self.dim], dtype=np.uint32).tobytes())
f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes())
f.write(
np.array([len(self._vectors)], dtype=np.uint32).tobytes()
)
# Vectors
for vec_id in self._id_list:
f.write(np.array([vec_id], dtype=np.uint32).tobytes())
f.write(self._vectors[vec_id])
logger.debug(
f"Saved binary index to {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
except Exception as e:
raise StorageError(f"Failed to save Binary ANN index: {e}")
def load(self) -> bool:
"""Load index from disk.
Returns:
True if index was loaded successfully, False if index file doesn't exist
Raises:
StorageError: If load operation fails
"""
with self._lock:
try:
if not self.binary_path.exists():
logger.debug(f"Binary index file not found: {self.binary_path}")
return False
with open(self.binary_path, "rb") as f:
# Read header
magic = f.read(4)
if magic != b"BINV":
                        raise StorageError(
                            "Invalid binary index file: bad magic number"
                        )
version = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if version != 1:
raise StorageError(
f"Unsupported binary index version: {version}"
)
file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if file_dim != self.dim or file_packed_dim != self.packed_dim:
raise StorageError(
f"Dimension mismatch: file has dim={file_dim}, "
f"packed_dim={file_packed_dim}, "
f"expected dim={self.dim}, packed_dim={self.packed_dim}"
)
# Clear existing data
self._vectors.clear()
self._id_list.clear()
# Read vectors
for _ in range(num_vectors):
vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0]
vec_data = f.read(self.packed_dim)
self._vectors[int(vec_id)] = vec_data
self._id_list.append(int(vec_id))
logger.info(
f"Loaded binary index from {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
return True
except StorageError:
raise
except Exception as e:
raise StorageError(f"Failed to load Binary ANN index: {e}")
def count(self) -> int:
"""Get number of vectors in the index.
Returns:
Number of vectors currently in the index
"""
with self._lock:
            return len(self._vectors)

    @property
def is_loaded(self) -> bool:
"""Check if index has vectors.
Returns:
True if index has vectors, False otherwise
"""
with self._lock:
            return len(self._vectors) > 0

    def get_vector(self, vec_id: int) -> Optional[bytes]:
"""Get a specific vector by ID.
Args:
vec_id: Vector ID to retrieve
Returns:
Packed binary vector or None if not found
"""
with self._lock:
            return self._vectors.get(vec_id)

    def clear(self) -> None:
"""Clear all vectors from the index."""
with self._lock:
self._vectors.clear()
self._id_list.clear()
logger.debug("Cleared binary index")
def create_ann_index(
index_path: Path,
index_type: str = "hnsw",
dim: int = 2048,
**kwargs,
) -> ANNIndex | BinaryANNIndex:
"""Factory function to create an ANN index.
Args:
index_path: Path to database file
index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors
dim: Vector dimension (default: 2048 for dense, 256 for binary)
**kwargs: Additional arguments passed to the index constructor
Returns:
ANNIndex for dense vectors or BinaryANNIndex for binary vectors
Raises:
ValueError: If index_type is invalid
Example:
>>> # Dense vector index (HNSW)
>>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048)
>>> dense_index.add_vectors(ids, dense_vectors)
>>>
>>> # Binary vector index (Hamming distance)
>>> binary_index = create_ann_index(path, index_type="binary", dim=256)
>>> binary_index.add_vectors(ids, packed_vectors)
"""
index_type = index_type.lower()
if index_type == "hnsw":
return ANNIndex(index_path=index_path, dim=dim, **kwargs)
elif index_type == "binary":
# Default to 256 for binary if not specified
if dim == 2048: # Default dense dim was used
dim = 256
return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs)
else:
raise ValueError(
f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'."
)
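
As a hedged usage sketch tying the two index types together (the corpus below
is random, db_path is hypothetical, and ANNIndex.add_vectors accepting an
(N, dim) float array is assumed from the docstring example above):

import numpy as np
from pathlib import Path

db_path = Path("project.db")  # hypothetical database path
binary_index = create_ann_index(db_path, index_type="binary", dim=256)
dense_index = create_ann_index(db_path, index_type="hnsw", dim=2048)

ids = list(range(1000))
bits = np.random.randint(0, 2, size=(1000, 256))       # unpacked 0/1 vectors
dense = np.random.rand(1000, 2048).astype(np.float32)
binary_index.add_vectors_numpy(ids, bits)              # packs via np.packbits
dense_index.add_vectors(ids, dense)

# Coarse stage: Hamming shortlist of 100 candidates for later dense reranking.
query = np.packbits(bits[0].astype(np.uint8)).tobytes()
candidate_ids, hamming_dists = binary_index.search(query, top_k=100)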


@@ -29,10 +29,17 @@ except ImportError:
# Try to import ANN index (optional hnswlib dependency)
try:
from codexlens.semantic.ann_index import (
ANNIndex,
BinaryANNIndex,
create_ann_index,
HNSWLIB_AVAILABLE,
)
except ImportError:
HNSWLIB_AVAILABLE = False
ANNIndex = None
BinaryANNIndex = None
    create_ann_index = None

logger = logging.getLogger(__name__)
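
# Illustrative call site (an assumption, not shown in this diff): consumers can
# feature-gate on the optional names so a missing hnswlib degrades gracefully
# instead of raising at import time:
#
#     if HNSWLIB_AVAILABLE and create_ann_index is not None:
#         index = create_ann_index(db_path, index_type="binary", dim=256)
#     else:
#         index = None  # semantic search unavailable; fall back to lexical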