mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-12 02:37:45 +08:00)
feat: Add multi-type embedding backends for cascade retrieval
- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend for high-precision dense vectors (2048 dimensions) for reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval.
- Introduced utility functions for embedding conversion and distance computation.

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration.
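The two-stage flow these backends implement (binary codes for a cheap coarse pass, dense vectors for a precise rerank) can be sketched in plain NumPy. This is an illustrative sketch only; the function and variable names below are invented for the example and are not part of the commit:

import numpy as np

def cascade_search(query_dense, query_packed, packed_db, dense_db,
                   coarse_k=100, top_k=10):
    # Illustrative only. packed_db: (N, 32) uint8 packed binary codes;
    # dense_db: (N, 2048) float32 embeddings; query_packed: (32,) uint8.
    # Stage 1: Hamming distance via XOR + popcount over all packed codes.
    xor = np.bitwise_xor(packed_db, query_packed)       # (N, 32)
    hamming = np.unpackbits(xor, axis=1).sum(axis=1)    # (N,) differing bits
    candidates = np.argsort(hamming)[:coarse_k]         # cheap coarse filter

    # Stage 2: cosine-similarity rerank over the surviving candidates only.
    cand = dense_db[candidates]
    sims = cand @ query_dense / (
        np.linalg.norm(cand, axis=1) * np.linalg.norm(query_dense) + 1e-9
    )
    order = np.argsort(-sims)[:top_k]
    return candidates[order], sims[order]

Only coarse_k of the N dense vectors ever reach the expensive rerank, which is the point of storing both representations per chunk.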
@@ -412,3 +412,489 @@ class ANNIndex:
        """
        with self._lock:
            return self._index is not None and self._current_count > 0


class BinaryANNIndex:
    """Binary vector ANN index using Hamming distance for fast coarse retrieval.

    Optimized for binary vectors (256-bit / 32 bytes per vector).
    Uses a packed binary representation for memory efficiency.

    Performance characteristics:
        - Storage: 32 bytes per vector (vs ~8 KB for a 2048-dim float32 dense vector)
        - Distance: Hamming distance via XOR + popcount (CPU-efficient)
        - Search: O(N) brute-force scan with NumPy-vectorized distance computation

    Index parameters:
        - dim: Binary vector dimension (default: 256)
        - packed_dim: Packed size in bytes (dim / 8 = 32 for 256-bit)

    Usage:
        index = BinaryANNIndex(index_path, dim=256)
        index.add_vectors([1, 2, 3], packed_vectors)  # List of 32-byte packed vectors
        ids, distances = index.search(query_packed, top_k=10)
    """

    DEFAULT_DIM = 256  # Default binary vector dimension

    def __init__(
        self,
        index_path: Path,
        dim: int = 256,
        initial_capacity: int = 100000,
        auto_save: bool = False,
    ) -> None:
        """Initialize the binary ANN index.

        Args:
            index_path: Path to the database (the index is saved as
                <stem>_binary_vectors.bin alongside it)
            dim: Dimension of binary vectors (default: 256)
            initial_capacity: Initial capacity hint (default: 100000)
            auto_save: Whether to automatically save the index after mutations

        Raises:
            ImportError: If required dependencies are not available
            ValueError: If the dimension is invalid
        """
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        if dim <= 0 or dim % 8 != 0:
            raise ValueError(
                f"Invalid dimension: {dim}. Must be positive and divisible by 8."
            )

        self.index_path = Path(index_path)
        self.dim = dim
        self.packed_dim = dim // 8  # 32 bytes for 256-bit vectors

        # Derive the binary index path from the database path
        db_stem = self.index_path.stem
        self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin"

        # Memory management
        self._auto_save = auto_save
        self._initial_capacity = initial_capacity

        # Thread safety
        self._lock = threading.RLock()

        # In-memory storage: id -> packed binary vector
        self._vectors: dict[int, bytes] = {}
        self._id_list: list[int] = []  # Ordered list for efficient iteration

        logger.info(
            f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}"
        )
    def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None:
        """Add packed binary vectors to the index.

        Args:
            ids: List of vector IDs (must be unique)
            vectors: List of packed binary vectors (each packed_dim bytes long)

        Raises:
            ValueError: If lengths don't match or vectors have the wrong size
            StorageError: If the index operation fails
        """
        if len(ids) == 0:
            return

        if len(vectors) != len(ids):
            raise ValueError(
                f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})"
            )

        # Validate vector sizes
        for i, vec in enumerate(vectors):
            if len(vec) != self.packed_dim:
                raise ValueError(
                    f"Vector {i} has size {len(vec)}, expected {self.packed_dim}"
                )

        with self._lock:
            try:
                for vec_id, vec in zip(ids, vectors):
                    if vec_id not in self._vectors:
                        self._id_list.append(vec_id)
                    self._vectors[vec_id] = vec

                logger.debug(
                    f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})"
                )

                if self._auto_save:
                    self.save()

            except Exception as e:
                raise StorageError(f"Failed to add vectors to Binary ANN index: {e}") from e
    def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None:
        """Add unpacked binary vectors (0/1 values) to the index.

        Convenience method that packs the vectors before adding.

        Args:
            ids: List of vector IDs (must be unique)
            vectors: Numpy array of shape (N, dim) with binary values (0 or 1)

        Raises:
            ValueError: If shapes don't match
            StorageError: If the index operation fails
        """
        if len(ids) == 0:
            return

        if vectors.shape[0] != len(ids):
            raise ValueError(
                f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})"
            )

        if vectors.shape[1] != self.dim:
            raise ValueError(
                f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})"
            )

        # Pack each row of 0/1 values into packed_dim bytes
        packed_vectors = []
        for i in range(vectors.shape[0]):
            packed = np.packbits(vectors[i].astype(np.uint8)).tobytes()
            packed_vectors.append(packed)

        self.add_vectors(ids, packed_vectors)
    def remove_vectors(self, ids: List[int]) -> None:
        """Remove vectors from the index.

        Args:
            ids: List of vector IDs to remove

        Raises:
            StorageError: If the index operation fails

        Note:
            Optimized for batch deletion using set operations instead of
            an O(N) list.remove() call for each ID.
        """
        if len(ids) == 0:
            return

        with self._lock:
            try:
                # Use a set for O(1) lookup during filtering
                ids_to_remove = set(ids)
                removed_count = 0

                # Remove from the dictionary - O(1) per deletion
                for vec_id in ids_to_remove:
                    if vec_id in self._vectors:
                        del self._vectors[vec_id]
                        removed_count += 1

                # Rebuild the ID list in a single O(N) pass instead of O(N) per removal
                if removed_count > 0:
                    self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove]

                logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index")

                if self._auto_save and removed_count > 0:
                    self.save()

            except Exception as e:
                raise StorageError(
                    f"Failed to remove vectors from Binary ANN index: {e}"
                ) from e
    def search(
        self, query: bytes, top_k: int = 10
    ) -> Tuple[List[int], List[int]]:
        """Search for nearest neighbors using Hamming distance.

        Args:
            query: Packed binary query vector (packed_dim bytes)
            top_k: Number of nearest neighbors to return

        Returns:
            Tuple of (ids, distances) where:
                - ids: List of vector IDs ordered by Hamming distance (ascending)
                - distances: List of Hamming distances (lower = more similar)

        Raises:
            ValueError: If the query size is invalid
            StorageError: If the search operation fails
        """
        if len(query) != self.packed_dim:
            raise ValueError(
                f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})"
            )

        with self._lock:
            try:
                if len(self._vectors) == 0:
                    return [], []

                # Compute Hamming distances to all vectors
                query_arr = np.frombuffer(query, dtype=np.uint8)
                distances = []

                for vec_id in self._id_list:
                    vec = self._vectors[vec_id]
                    vec_arr = np.frombuffer(vec, dtype=np.uint8)
                    # XOR, then popcount, for the Hamming distance
                    xor = np.bitwise_xor(query_arr, vec_arr)
                    dist = int(np.unpackbits(xor).sum())
                    distances.append((vec_id, dist))

                # Sort by distance (ascending)
                distances.sort(key=lambda x: x[1])

                # Return the top-k results
                top_results = distances[:top_k]
                ids = [r[0] for r in top_results]
                dists = [r[1] for r in top_results]

                return ids, dists

            except Exception as e:
                raise StorageError(f"Failed to search Binary ANN index: {e}") from e
    def search_numpy(
        self, query: np.ndarray, top_k: int = 10
    ) -> Tuple[List[int], List[int]]:
        """Search with an unpacked binary query vector.

        Convenience method that packs the query before searching.

        Args:
            query: Binary query vector of shape (dim,) with values 0 or 1
            top_k: Number of nearest neighbors to return

        Returns:
            Tuple of (ids, distances)
        """
        if query.ndim == 2:
            query = query.flatten()

        if len(query) != self.dim:
            raise ValueError(
                f"Query dimension ({len(query)}) must match index dimension ({self.dim})"
            )

        packed_query = np.packbits(query.astype(np.uint8)).tobytes()
        return self.search(packed_query, top_k)
    def search_batch(
        self, queries: List[bytes], top_k: int = 10
    ) -> List[Tuple[List[int], List[int]]]:
        """Batch search for multiple queries.

        Args:
            queries: List of packed binary query vectors
            top_k: Number of nearest neighbors to return per query

        Returns:
            List of (ids, distances) tuples, one per query
        """
        results = []
        for query in queries:
            ids, dists = self.search(query, top_k)
            results.append((ids, dists))
        return results
    def save(self) -> None:
        """Save the index to disk.

        Binary format:
            - 4 bytes: magic number (0x42494E56 = "BINV")
            - 4 bytes: version (1)
            - 4 bytes: dim
            - 4 bytes: packed_dim
            - 4 bytes: num_vectors
            - For each vector:
                - 4 bytes: id
                - packed_dim bytes: vector data

        Raises:
            StorageError: If the save operation fails
        """
        with self._lock:
            try:
                if len(self._vectors) == 0:
                    logger.debug("Skipping save: index is empty")
                    return

                # Ensure the parent directory exists
                self.binary_path.parent.mkdir(parents=True, exist_ok=True)

                with open(self.binary_path, "wb") as f:
                    # Header
                    f.write(b"BINV")  # Magic number
                    f.write(np.array([1], dtype=np.uint32).tobytes())  # Version
                    f.write(np.array([self.dim], dtype=np.uint32).tobytes())
                    f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes())
                    f.write(
                        np.array([len(self._vectors)], dtype=np.uint32).tobytes()
                    )

                    # Vectors
                    for vec_id in self._id_list:
                        f.write(np.array([vec_id], dtype=np.uint32).tobytes())
                        f.write(self._vectors[vec_id])

                logger.debug(
                    f"Saved binary index to {self.binary_path} "
                    f"({len(self._vectors)} vectors)"
                )

            except Exception as e:
                raise StorageError(f"Failed to save Binary ANN index: {e}") from e
    def load(self) -> bool:
        """Load the index from disk.

        Returns:
            True if the index was loaded successfully, False if the index
            file doesn't exist

        Raises:
            StorageError: If the load operation fails
        """
        with self._lock:
            try:
                if not self.binary_path.exists():
                    logger.debug(f"Binary index file not found: {self.binary_path}")
                    return False

                with open(self.binary_path, "rb") as f:
                    # Read the header
                    magic = f.read(4)
                    if magic != b"BINV":
                        raise StorageError(
                            "Invalid binary index file: bad magic number"
                        )

                    version = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                    if version != 1:
                        raise StorageError(
                            f"Unsupported binary index version: {version}"
                        )

                    file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                    file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                    num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0]

                    if file_dim != self.dim or file_packed_dim != self.packed_dim:
                        raise StorageError(
                            f"Dimension mismatch: file has dim={file_dim}, "
                            f"packed_dim={file_packed_dim}, "
                            f"expected dim={self.dim}, packed_dim={self.packed_dim}"
                        )

                    # Clear existing data
                    self._vectors.clear()
                    self._id_list.clear()

                    # Read the vectors
                    for _ in range(num_vectors):
                        vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                        vec_data = f.read(self.packed_dim)
                        self._vectors[int(vec_id)] = vec_data
                        self._id_list.append(int(vec_id))

                logger.info(
                    f"Loaded binary index from {self.binary_path} "
                    f"({len(self._vectors)} vectors)"
                )

                return True

            except StorageError:
                raise
            except Exception as e:
                raise StorageError(f"Failed to load Binary ANN index: {e}") from e
    def count(self) -> int:
        """Get the number of vectors in the index.

        Returns:
            Number of vectors currently in the index
        """
        with self._lock:
            return len(self._vectors)

    @property
    def is_loaded(self) -> bool:
        """Check whether the index has any vectors.

        Returns:
            True if the index has vectors, False otherwise
        """
        with self._lock:
            return len(self._vectors) > 0

    def get_vector(self, vec_id: int) -> Optional[bytes]:
        """Get a specific vector by ID.

        Args:
            vec_id: Vector ID to retrieve

        Returns:
            Packed binary vector, or None if not found
        """
        with self._lock:
            return self._vectors.get(vec_id)

    def clear(self) -> None:
        """Clear all vectors from the index."""
        with self._lock:
            self._vectors.clear()
            self._id_list.clear()
            logger.debug("Cleared binary index")

def create_ann_index(
    index_path: Path,
    index_type: str = "hnsw",
    dim: int = 2048,
    **kwargs,
) -> ANNIndex | BinaryANNIndex:
    """Factory function to create an ANN index.

    Args:
        index_path: Path to the database file
        index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors
        dim: Vector dimension (default: 2048 for dense, 256 for binary)
        **kwargs: Additional arguments passed to the index constructor

    Returns:
        ANNIndex for dense vectors or BinaryANNIndex for binary vectors

    Raises:
        ValueError: If index_type is invalid

    Example:
        >>> # Dense vector index (HNSW)
        >>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048)
        >>> dense_index.add_vectors(ids, dense_vectors)
        >>>
        >>> # Binary vector index (Hamming distance)
        >>> binary_index = create_ann_index(path, index_type="binary", dim=256)
        >>> binary_index.add_vectors(ids, packed_vectors)
    """
    index_type = index_type.lower()

    if index_type == "hnsw":
        return ANNIndex(index_path=index_path, dim=dim, **kwargs)
    elif index_type == "binary":
        # Fall back to 256 for binary if the dense default was left in place
        if dim == 2048:
            dim = 256
        return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs)
    else:
        raise ValueError(
            f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'."
        )
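The diff does not show how float embeddings become the 0/1 arrays that add_vectors_numpy expects. One common scheme, assumed here purely for illustration, is to threshold each component at zero:

import numpy as np

# Hypothetical inputs: 1000 embeddings of dimension 256 from some encoder.
embeddings = np.random.randn(1000, 256).astype(np.float32)

# Sign binarization (an assumption, not shown in this commit): positive -> 1, else 0.
binary = (embeddings > 0).astype(np.uint8)   # shape (1000, 256), values 0/1

# index.add_vectors_numpy(list(range(1000)), binary)
# add_vectors_numpy packs each row with np.packbits into 32 bytes before storing.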

@@ -29,10 +29,17 @@ except ImportError:
 
 # Try to import ANN index (optional hnswlib dependency)
 try:
-    from codexlens.semantic.ann_index import ANNIndex, HNSWLIB_AVAILABLE
+    from codexlens.semantic.ann_index import (
+        ANNIndex,
+        BinaryANNIndex,
+        create_ann_index,
+        HNSWLIB_AVAILABLE,
+    )
 except ImportError:
     HNSWLIB_AVAILABLE = False
     ANNIndex = None
+    BinaryANNIndex = None
+    create_ann_index = None
 
 
 logger = logging.getLogger(__name__)
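For scale, the on-disk footprint implied by the save() format above (20-byte header, then a 4-byte id plus a 32-byte code per vector) stays small even for large indexes:

n = 1_000_000
size_bytes = 20 + n * (4 + 32)
print(f"{size_bytes / 1e6:.1f} MB")   # 36.0 MB for one million 256-bit vectors

The same million vectors stored as 2048-dim float32 dense embeddings would take about 8.2 GB, which is why the binary codes carry the coarse pass.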