feat: Add multi-type embedding backends for cascade retrieval

- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend producing 2048-dimensional dense vectors for high-precision reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval.
- Introduced utility functions for embedding conversion and distance computation.

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration.
This commit is contained in:
catlog22
2026-01-02 10:52:43 +08:00
parent 195438d26a
commit e21d801523
13 changed files with 3449 additions and 6 deletions
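
The pieces combine into a two-stage flow. Below is a minimal, self-contained sketch of that flow using toy random data in place of real model embeddings; the array math mirrors the new utilities, but none of these names are the codexlens API:

import numpy as np

rng = np.random.default_rng(0)
docs = rng.standard_normal((1000, 256)).astype(np.float32)   # toy corpus embeddings
query = rng.standard_normal(256).astype(np.float32)

# Stage 1 (coarse): sign-binarize, pack to 32 bytes, rank by Hamming distance.
packed_docs = np.packbits((docs > 0).astype(np.uint8), axis=1)   # shape (1000, 32)
packed_query = np.packbits((query > 0).astype(np.uint8))         # shape (32,)
hamming = np.unpackbits(packed_docs ^ packed_query, axis=1).sum(axis=1)
coarse_ids = np.argsort(hamming)[:100]                           # top-100 candidates

# Stage 2 (fine): cosine-rerank only the coarse candidates using the float vectors.
cand = docs[coarse_ids]
cosine = (cand @ query) / (np.linalg.norm(cand, axis=1) * np.linalg.norm(query))
final_ids = coarse_ids[np.argsort(cosine)[::-1][:10]]            # final top-10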

View File

@@ -131,6 +131,16 @@ class Config:
reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker_top_k: int = 50
# Cascade search configuration (two-stage retrieval)
enable_cascade_search: bool = False # Enable cascade search (coarse + fine ranking)
cascade_coarse_k: int = 100 # Number of coarse candidates from first stage
cascade_fine_k: int = 10 # Number of final results after reranking
cascade_strategy: str = "binary" # "binary" (fast binary+dense) or "hybrid" (FTS+SPLADE+Vector+CrossEncoder)
# RRF fusion configuration
fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
rrf_k: int = 60 # RRF constant (default 60)
# Multi-endpoint configuration for litellm backend
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
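
A hedged sketch of turning the new options on, assuming Config is a plain dataclass that accepts keyword overrides (as the fields above suggest):

from codexlens.config import Config

config = Config(
    enable_cascade_search=True,
    cascade_strategy="binary",   # or "hybrid" for FTS+SPLADE+Vector+CrossEncoder
    cascade_coarse_k=100,        # stage-1 candidate pool
    cascade_fine_k=10,           # stage-2 final results
    fusion_method="rrf",
    rrf_k=60,
)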

View File

@@ -1,4 +1,26 @@
"""Code indexing and symbol extraction."""
from codexlens.indexing.symbol_extractor import SymbolExtractor
from codexlens.indexing.embedding import (
BinaryEmbeddingBackend,
DenseEmbeddingBackend,
CascadeEmbeddingBackend,
get_cascade_embedder,
binarize_embedding,
pack_binary_embedding,
unpack_binary_embedding,
hamming_distance,
)
__all__ = ["SymbolExtractor"]
__all__ = [
"SymbolExtractor",
# Cascade embedding backends
"BinaryEmbeddingBackend",
"DenseEmbeddingBackend",
"CascadeEmbeddingBackend",
"get_cascade_embedder",
# Utility functions
"binarize_embedding",
"pack_binary_embedding",
"unpack_binary_embedding",
"hamming_distance",
]
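
With the re-exports above in place, the cascade API is importable from codexlens.indexing directly; a brief sketch (requires the semantic extras at runtime):

from codexlens.indexing import get_cascade_embedder

embedder = get_cascade_embedder()                       # loads both models lazily
binary, dense = embedder.encode_cascade(["def login(user): ..."])
# binary.shape == (1, 256), dense.shape == (1, 2048)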

View File

@@ -0,0 +1,582 @@
"""Multi-type embedding backends for cascade retrieval.
This module provides embedding backends optimized for cascade retrieval:
1. BinaryEmbeddingBackend - Fast coarse filtering with binary vectors
2. DenseEmbeddingBackend - High-precision dense vectors for reranking
3. CascadeEmbeddingBackend - Combined binary + dense for two-stage retrieval
Cascade retrieval workflow:
1. Binary search (fast, ~32 bytes/vector) -> top-K candidates
2. Dense rerank (precise, ~8KB/vector) -> final results
"""
from __future__ import annotations
import logging
from typing import Iterable, List, Optional, Tuple
import numpy as np
from codexlens.semantic.base import BaseEmbedder
logger = logging.getLogger(__name__)
# =============================================================================
# Utility Functions
# =============================================================================
def binarize_embedding(embedding: np.ndarray) -> np.ndarray:
"""Convert float embedding to binary vector.
Applies sign-based quantization: values > 0 become 1, values <= 0 become 0.
Args:
embedding: Float32 embedding of any dimension
Returns:
Binary vector (uint8 with values 0 or 1) of same dimension
"""
return (embedding > 0).astype(np.uint8)
def pack_binary_embedding(binary_vector: np.ndarray) -> bytes:
"""Pack binary vector into compact bytes format.
Packs 8 binary values into each byte for storage efficiency.
For a 256-dim binary vector, output is 32 bytes.
Args:
binary_vector: Binary vector (uint8 with values 0 or 1)
Returns:
Packed bytes (length = ceil(dim / 8))
"""
# Ensure vector length is multiple of 8 by padding if needed
dim = len(binary_vector)
padded_dim = ((dim + 7) // 8) * 8
if padded_dim > dim:
padded = np.zeros(padded_dim, dtype=np.uint8)
padded[:dim] = binary_vector
binary_vector = padded
# Pack 8 bits per byte
packed = np.packbits(binary_vector)
return packed.tobytes()
def unpack_binary_embedding(packed_bytes: bytes, dim: int = 256) -> np.ndarray:
"""Unpack bytes back to binary vector.
Args:
packed_bytes: Packed binary data
dim: Original vector dimension (default: 256)
Returns:
Binary vector (uint8 with values 0 or 1)
"""
unpacked = np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8))
return unpacked[:dim]
def hamming_distance(a: bytes, b: bytes) -> int:
"""Compute Hamming distance between two packed binary vectors.
Uses XOR and popcount for efficient distance computation.
Args:
a: First packed binary vector
b: Second packed binary vector
Returns:
Hamming distance (number of differing bits)
"""
a_arr = np.frombuffer(a, dtype=np.uint8)
b_arr = np.frombuffer(b, dtype=np.uint8)
xor = np.bitwise_xor(a_arr, b_arr)
return int(np.unpackbits(xor).sum())
# =============================================================================
# Binary Embedding Backend
# =============================================================================
class BinaryEmbeddingBackend(BaseEmbedder):
"""Generate 256-dimensional binary embeddings for fast coarse retrieval.
Uses a lightweight embedding model and applies sign-based quantization
to produce compact binary vectors (32 bytes per embedding).
Suitable for:
- First-stage candidate retrieval
- Hamming distance-based similarity search
- Memory-constrained environments
Model: BAAI/bge-small-en-v1.5 (384 dim) -> projected and quantized to 256 bits
"""
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, fast
BINARY_DIM = 256
def __init__(
self,
model_name: Optional[str] = None,
use_gpu: bool = True,
) -> None:
"""Initialize binary embedding backend.
Args:
model_name: Base embedding model name. Defaults to BAAI/bge-small-en-v1.5
use_gpu: Whether to use GPU acceleration
"""
from codexlens.semantic import SEMANTIC_AVAILABLE
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self._model_name = model_name or self.DEFAULT_MODEL
self._use_gpu = use_gpu
self._model = None
# Projection matrix for dimension reduction (lazily initialized)
self._projection_matrix: Optional[np.ndarray] = None
@property
def model_name(self) -> str:
"""Return model name."""
return self._model_name
@property
def embedding_dim(self) -> int:
"""Return binary embedding dimension (256)."""
return self.BINARY_DIM
@property
def packed_bytes(self) -> int:
"""Return packed bytes size (32 bytes for 256 bits)."""
return self.BINARY_DIM // 8
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
from fastembed import TextEmbedding
from codexlens.semantic.gpu_support import get_optimal_providers
providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
try:
self._model = TextEmbedding(
model_name=self._model_name,
providers=providers,
)
except TypeError:
# Fallback for older fastembed versions
self._model = TextEmbedding(model_name=self._model_name)
logger.debug(f"BinaryEmbeddingBackend loaded model: {self._model_name}")
def _get_projection_matrix(self, input_dim: int) -> np.ndarray:
"""Get or create projection matrix for dimension reduction.
Uses random projection with fixed seed for reproducibility.
Args:
input_dim: Input embedding dimension from base model
Returns:
Projection matrix of shape (input_dim, BINARY_DIM)
"""
if self._projection_matrix is not None:
return self._projection_matrix
# Fixed seed for reproducibility across sessions
rng = np.random.RandomState(42)
# Gaussian random projection
self._projection_matrix = rng.randn(input_dim, self.BINARY_DIM).astype(np.float32)
# Normalize columns for consistent scale
norms = np.linalg.norm(self._projection_matrix, axis=0, keepdims=True)
self._projection_matrix /= (norms + 1e-8)
return self._projection_matrix
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate binary embeddings as numpy array.
Args:
texts: Single text or iterable of texts
Returns:
Binary embeddings of shape (n_texts, 256) with values 0 or 1
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
# Get base float embeddings
float_embeddings = np.array(list(self._model.embed(texts)))
input_dim = float_embeddings.shape[1]
# Project to target dimension if needed
if input_dim != self.BINARY_DIM:
projection = self._get_projection_matrix(input_dim)
float_embeddings = float_embeddings @ projection
# Binarize
return binarize_embedding(float_embeddings)
def embed_packed(self, texts: str | Iterable[str]) -> List[bytes]:
"""Generate packed binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
List of packed bytes (32 bytes each for 256-dim)
"""
binary = self.embed_to_numpy(texts)
return [pack_binary_embedding(vec) for vec in binary]
# =============================================================================
# Dense Embedding Backend
# =============================================================================
class DenseEmbeddingBackend(BaseEmbedder):
"""Generate high-dimensional dense embeddings for precise reranking.
Uses large embedding models to produce 2048-dimensional float32 vectors
for maximum retrieval quality.
Suitable for:
- Second-stage reranking
- High-precision similarity search
- Quality-critical applications
Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion
"""
DEFAULT_MODEL = "BAAI/bge-large-en-v1.5" # 1024 dim, high quality
TARGET_DIM = 2048
def __init__(
self,
model_name: Optional[str] = None,
use_gpu: bool = True,
expand_dim: bool = True,
) -> None:
"""Initialize dense embedding backend.
Args:
model_name: Dense embedding model name. Defaults to BAAI/bge-large-en-v1.5
use_gpu: Whether to use GPU acceleration
expand_dim: If True, expand embeddings to TARGET_DIM using learned expansion
"""
from codexlens.semantic import SEMANTIC_AVAILABLE
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self._model_name = model_name or self.DEFAULT_MODEL
self._use_gpu = use_gpu
self._expand_dim = expand_dim
self._model = None
self._native_dim: Optional[int] = None
# Expansion matrix for dimension expansion (lazily initialized)
self._expansion_matrix: Optional[np.ndarray] = None
@property
def model_name(self) -> str:
"""Return model name."""
return self._model_name
@property
def embedding_dim(self) -> int:
"""Return embedding dimension.
Returns TARGET_DIM if expand_dim is True, otherwise native model dimension.
"""
if self._expand_dim:
return self.TARGET_DIM
# Return cached native dim or estimate based on model
if self._native_dim is not None:
return self._native_dim
# Model dimension estimates
model_dims = {
"BAAI/bge-large-en-v1.5": 1024,
"BAAI/bge-base-en-v1.5": 768,
"BAAI/bge-small-en-v1.5": 384,
"intfloat/multilingual-e5-large": 1024,
}
return model_dims.get(self._model_name, 1024)
@property
def max_tokens(self) -> int:
"""Return maximum token limit."""
return 512 # Conservative default for large models
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
from fastembed import TextEmbedding
from codexlens.semantic.gpu_support import get_optimal_providers
providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
try:
self._model = TextEmbedding(
model_name=self._model_name,
providers=providers,
)
except TypeError:
self._model = TextEmbedding(model_name=self._model_name)
logger.debug(f"DenseEmbeddingBackend loaded model: {self._model_name}")
def _get_expansion_matrix(self, input_dim: int) -> np.ndarray:
"""Get or create expansion matrix for dimension expansion.
Preserves the original dimensions via an identity block and pads the extra dimensions with normalized random projections (semi-orthogonal expansion).
Args:
input_dim: Input embedding dimension from base model
Returns:
Expansion matrix of shape (input_dim, TARGET_DIM)
"""
if self._expansion_matrix is not None:
return self._expansion_matrix
# Fixed seed for reproducibility
rng = np.random.RandomState(123)
# Create semi-orthogonal expansion matrix
# First input_dim columns form identity-like structure
self._expansion_matrix = np.zeros((input_dim, self.TARGET_DIM), dtype=np.float32)
# Copy original dimensions
copy_dim = min(input_dim, self.TARGET_DIM)
self._expansion_matrix[:copy_dim, :copy_dim] = np.eye(copy_dim, dtype=np.float32)
# Fill remaining with random projections
if self.TARGET_DIM > input_dim:
random_part = rng.randn(input_dim, self.TARGET_DIM - input_dim).astype(np.float32)
# Normalize
norms = np.linalg.norm(random_part, axis=0, keepdims=True)
random_part /= (norms + 1e-8)
self._expansion_matrix[:, input_dim:] = random_part
return self._expansion_matrix
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate dense embeddings as numpy array.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, TARGET_DIM) as float32
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
# Get base float embeddings
float_embeddings = np.array(list(self._model.embed(texts)), dtype=np.float32)
self._native_dim = float_embeddings.shape[1]
# Expand to target dimension if needed
if self._expand_dim and self._native_dim < self.TARGET_DIM:
expansion = self._get_expansion_matrix(self._native_dim)
float_embeddings = float_embeddings @ expansion
return float_embeddings
# =============================================================================
# Cascade Embedding Backend
# =============================================================================
class CascadeEmbeddingBackend(BaseEmbedder):
"""Combined binary + dense embedding backend for cascade retrieval.
Generates both binary (for fast coarse filtering) and dense (for precise
reranking) embeddings in a single pass, optimized for two-stage retrieval.
Cascade workflow:
1. encode_cascade() returns (binary_embeddings, dense_embeddings)
2. Binary search: Use Hamming distance on binary vectors -> top-K candidates
3. Dense rerank: Use cosine similarity on dense vectors -> final results
Memory efficiency:
- Binary: 32 bytes per vector (256 bits)
- Dense: 8192 bytes per vector (2048 x float32)
- Total: ~8KB per document for full cascade support
"""
def __init__(
self,
binary_model: Optional[str] = None,
dense_model: Optional[str] = None,
use_gpu: bool = True,
) -> None:
"""Initialize cascade embedding backend.
Args:
binary_model: Model for binary embeddings. Defaults to BAAI/bge-small-en-v1.5
dense_model: Model for dense embeddings. Defaults to BAAI/bge-large-en-v1.5
use_gpu: Whether to use GPU acceleration
"""
self._binary_backend = BinaryEmbeddingBackend(
model_name=binary_model,
use_gpu=use_gpu,
)
self._dense_backend = DenseEmbeddingBackend(
model_name=dense_model,
use_gpu=use_gpu,
expand_dim=True,
)
self._use_gpu = use_gpu
@property
def model_name(self) -> str:
"""Return model names for both backends."""
return f"cascade({self._binary_backend.model_name}, {self._dense_backend.model_name})"
@property
def embedding_dim(self) -> int:
"""Return dense embedding dimension (for compatibility)."""
return self._dense_backend.embedding_dim
@property
def binary_dim(self) -> int:
"""Return binary embedding dimension."""
return self._binary_backend.embedding_dim
@property
def dense_dim(self) -> int:
"""Return dense embedding dimension."""
return self._dense_backend.embedding_dim
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate dense embeddings (for BaseEmbedder compatibility).
For cascade embeddings, use encode_cascade() instead.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, dense_dim)
"""
return self._dense_backend.embed_to_numpy(texts)
def encode_cascade(
self,
texts: str | Iterable[str],
batch_size: int = 32,
) -> Tuple[np.ndarray, np.ndarray]:
"""Generate both binary and dense embeddings.
Args:
texts: Single text or iterable of texts
batch_size: Batch size for processing
Returns:
Tuple of:
- binary_embeddings: Shape (n_texts, 256), uint8 values 0/1
- dense_embeddings: Shape (n_texts, 2048), float32
"""
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
binary_embeddings = self._binary_backend.embed_to_numpy(texts)
dense_embeddings = self._dense_backend.embed_to_numpy(texts)
return binary_embeddings, dense_embeddings
def encode_binary(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate only binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
Binary embeddings of shape (n_texts, 256)
"""
return self._binary_backend.embed_to_numpy(texts)
def encode_dense(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate only dense embeddings.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, 2048)
"""
return self._dense_backend.embed_to_numpy(texts)
def encode_binary_packed(self, texts: str | Iterable[str]) -> List[bytes]:
"""Generate packed binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
List of packed bytes (32 bytes each)
"""
return self._binary_backend.embed_packed(texts)
# =============================================================================
# Factory Function
# =============================================================================
def get_cascade_embedder(
binary_model: Optional[str] = None,
dense_model: Optional[str] = None,
use_gpu: bool = True,
) -> CascadeEmbeddingBackend:
"""Factory function to create a cascade embedder.
Args:
binary_model: Model for binary embeddings (default: BAAI/bge-small-en-v1.5)
dense_model: Model for dense embeddings (default: BAAI/bge-large-en-v1.5)
use_gpu: Whether to use GPU acceleration
Returns:
Configured CascadeEmbeddingBackend instance
Example:
>>> embedder = get_cascade_embedder()
>>> binary, dense = embedder.encode_cascade(["hello world"])
>>> binary.shape # (1, 256)
>>> dense.shape # (1, 2048)
"""
return CascadeEmbeddingBackend(
binary_model=binary_model,
dense_model=dense_model,
use_gpu=use_gpu,
)
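
A quick round trip through the utility functions defined above, assuming codexlens is installed with the semantic extras:

import numpy as np
from codexlens.indexing.embedding import (
    binarize_embedding,
    pack_binary_embedding,
    unpack_binary_embedding,
    hamming_distance,
)

vec_a = np.random.randn(256).astype(np.float32)
vec_b = np.random.randn(256).astype(np.float32)

packed_a = pack_binary_embedding(binarize_embedding(vec_a))
packed_b = pack_binary_embedding(binarize_embedding(vec_b))
assert len(packed_a) == 32                              # 256 bits -> 32 bytes

# Packing is lossless for the binary form.
assert np.array_equal(unpack_binary_embedding(packed_a), binarize_embedding(vec_a))

# Hamming distance is symmetric and bounded by the bit width.
d = hamming_distance(packed_a, packed_b)
assert 0 <= d <= 256 and d == hamming_distance(packed_b, packed_a)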

View File

@@ -9,12 +9,21 @@ from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Dict, Any
from typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING
import logging
import os
import time
from codexlens.entities import SearchResult, Symbol
if TYPE_CHECKING:
import numpy as np
try:
import numpy as np
NUMPY_AVAILABLE = True
except ImportError:
NUMPY_AVAILABLE = False
from codexlens.config import Config
from codexlens.storage.registry import RegistryStore, DirMapping
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
@@ -260,6 +269,672 @@ class ChainSearchEngine:
related_results=related_results,
)
def hybrid_cascade_search(
self,
query: str,
source_path: Path,
k: int = 10,
coarse_k: int = 100,
options: Optional[SearchOptions] = None,
) -> ChainSearchResult:
"""Execute two-stage cascade search with hybrid coarse retrieval and cross-encoder reranking.
Hybrid cascade search process:
1. Stage 1 (Coarse): Fast retrieval using RRF fusion of FTS + SPLADE + Vector
to get coarse_k candidates
2. Stage 2 (Fine): CrossEncoder reranking of candidates to get final k results
This approach balances recall (from broad coarse search) with precision
(from expensive but accurate cross-encoder scoring).
Note: This method is the original hybrid approach. For binary vector cascade,
use binary_cascade_search() instead.
Args:
query: Natural language or keyword query string
source_path: Starting directory path
k: Number of final results to return (default 10)
coarse_k: Number of coarse candidates from first stage (default 100)
options: Search configuration (uses defaults if None)
Returns:
ChainSearchResult with reranked results and statistics
Examples:
>>> engine = ChainSearchEngine(registry, mapper, config=config)
>>> result = engine.hybrid_cascade_search(
... "how to authenticate users",
... Path("D:/project/src"),
... k=10,
... coarse_k=100
... )
>>> for r in result.results:
... print(f"{r.path}: {r.score:.3f}")
"""
options = options or SearchOptions()
start_time = time.time()
stats = SearchStats()
# Use config defaults if available
if self._config is not None:
if hasattr(self._config, "cascade_coarse_k"):
coarse_k = coarse_k or self._config.cascade_coarse_k
if hasattr(self._config, "cascade_fine_k"):
k = k or self._config.cascade_fine_k
# Step 1: Find starting index
start_index = self._find_start_index(source_path)
if not start_index:
self.logger.warning(f"No index found for {source_path}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Step 2: Collect all index paths
index_paths = self._collect_index_paths(start_index, options.depth)
stats.dirs_searched = len(index_paths)
if not index_paths:
self.logger.warning(f"No indexes collected from {start_index}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Stage 1: Coarse retrieval with hybrid search (FTS + SPLADE + Vector)
# Use hybrid mode for multi-signal retrieval
coarse_options = SearchOptions(
depth=options.depth,
max_workers=1, # Single thread for GPU safety
limit_per_dir=max(coarse_k // len(index_paths), 20),
total_limit=coarse_k,
hybrid_mode=True,
enable_fuzzy=options.enable_fuzzy,
enable_vector=True, # Enable vector for semantic matching
pure_vector=False,
hybrid_weights=options.hybrid_weights,
)
self.logger.debug(
"Cascade Stage 1: Coarse retrieval for %d candidates", coarse_k
)
coarse_results, search_stats = self._search_parallel(
index_paths, query, coarse_options
)
stats.errors = search_stats.errors
# Merge and deduplicate coarse results
coarse_merged = self._merge_and_rank(coarse_results, coarse_k)
self.logger.debug(
"Cascade Stage 1 complete: %d candidates retrieved", len(coarse_merged)
)
if not coarse_merged:
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Stage 2: Cross-encoder reranking
self.logger.debug(
"Cascade Stage 2: Cross-encoder reranking %d candidates to top-%d",
len(coarse_merged),
k,
)
final_results = self._cross_encoder_rerank(query, coarse_merged, k)
# Optional: grouping of similar results
if options.group_results:
from codexlens.search.ranking import group_similar_results
final_results = group_similar_results(
final_results, score_threshold_abs=options.grouping_threshold
)
stats.files_matched = len(final_results)
stats.time_ms = (time.time() - start_time) * 1000
self.logger.debug(
"Cascade search complete: %d results in %.2fms",
len(final_results),
stats.time_ms,
)
return ChainSearchResult(
query=query,
results=final_results,
symbols=[],
stats=stats,
)
def binary_cascade_search(
self,
query: str,
source_path: Path,
k: int = 10,
coarse_k: int = 100,
options: Optional[SearchOptions] = None,
) -> ChainSearchResult:
"""Execute binary cascade search with binary coarse ranking and dense fine ranking.
Binary cascade search process:
1. Stage 1 (Coarse): Fast binary vector search using Hamming distance
to quickly filter to coarse_k candidates (256-dim binary, 32 bytes/vector)
2. Stage 2 (Fine): Dense vector cosine similarity for precise reranking
of candidates (2048-dim float32)
This approach leverages the speed of binary search (~100x faster) while
maintaining precision through dense vector reranking.
Performance characteristics:
- Binary search: O(N) with SIMD-accelerated XOR + popcount
- Dense rerank: Only applied to top coarse_k candidates
- Memory: 32 bytes (binary) + 8KB (dense) per chunk
Args:
query: Natural language or keyword query string
source_path: Starting directory path
k: Number of final results to return (default 10)
coarse_k: Number of coarse candidates from first stage (default 100)
options: Search configuration (uses defaults if None)
Returns:
ChainSearchResult with reranked results and statistics
Examples:
>>> engine = ChainSearchEngine(registry, mapper, config=config)
>>> result = engine.binary_cascade_search(
... "how to authenticate users",
... Path("D:/project/src"),
... k=10,
... coarse_k=100
... )
>>> for r in result.results:
... print(f"{r.path}: {r.score:.3f}")
"""
if not NUMPY_AVAILABLE:
self.logger.warning(
"NumPy not available, falling back to hybrid cascade search"
)
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
options = options or SearchOptions()
start_time = time.time()
stats = SearchStats()
# Use config defaults if available
if self._config is not None:
if hasattr(self._config, "cascade_coarse_k"):
coarse_k = coarse_k or self._config.cascade_coarse_k
if hasattr(self._config, "cascade_fine_k"):
k = k or self._config.cascade_fine_k
# Step 1: Find starting index
start_index = self._find_start_index(source_path)
if not start_index:
self.logger.warning(f"No index found for {source_path}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Step 2: Collect all index paths
index_paths = self._collect_index_paths(start_index, options.depth)
stats.dirs_searched = len(index_paths)
if not index_paths:
self.logger.warning(f"No indexes collected from {start_index}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Initialize embedding backends
try:
from codexlens.indexing.embedding import (
BinaryEmbeddingBackend,
DenseEmbeddingBackend,
)
from codexlens.semantic.ann_index import BinaryANNIndex
except ImportError as exc:
self.logger.warning(
"Binary cascade dependencies not available: %s. "
"Falling back to hybrid cascade search.",
exc
)
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
# Stage 1: Binary vector coarse retrieval
self.logger.debug(
"Binary Cascade Stage 1: Binary coarse retrieval for %d candidates",
coarse_k,
)
use_gpu = True
if self._config is not None:
use_gpu = getattr(self._config, "embedding_use_gpu", True)
try:
binary_backend = BinaryEmbeddingBackend(use_gpu=use_gpu)
query_binary_packed = binary_backend.embed_packed([query])[0]
except Exception as exc:
self.logger.warning(
"Failed to generate binary query embedding: %s. "
"Falling back to hybrid cascade search.",
exc
)
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
# Search all indexes for binary candidates
all_candidates: List[Tuple[int, int, Path]] = [] # (chunk_id, distance, index_path)
for index_path in index_paths:
try:
# Get or create binary index for this path
binary_index = self._get_or_create_binary_index(index_path)
if binary_index is None or binary_index.count() == 0:
continue
# Search binary index
ids, distances = binary_index.search(query_binary_packed, coarse_k)
for chunk_id, dist in zip(ids, distances):
all_candidates.append((chunk_id, dist, index_path))
except Exception as exc:
self.logger.debug(
"Binary search failed for %s: %s", index_path, exc
)
stats.errors.append(f"Binary search failed for {index_path}: {exc}")
if not all_candidates:
self.logger.debug("No binary candidates found, falling back to hybrid")
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
# Sort by Hamming distance and take top coarse_k
all_candidates.sort(key=lambda x: x[1])
coarse_candidates = all_candidates[:coarse_k]
self.logger.debug(
"Binary Cascade Stage 1 complete: %d candidates retrieved",
len(coarse_candidates),
)
# Stage 2: Dense vector fine ranking
self.logger.debug(
"Binary Cascade Stage 2: Dense reranking %d candidates to top-%d",
len(coarse_candidates),
k,
)
try:
dense_backend = DenseEmbeddingBackend(use_gpu=use_gpu)
query_dense = dense_backend.embed_to_numpy([query])[0]
except Exception as exc:
self.logger.warning(
"Failed to generate dense query embedding: %s. "
"Using Hamming distance scores only.",
exc
)
# Fall back to using Hamming distance as score
return self._build_results_from_candidates(
coarse_candidates[:k], index_paths, stats, query, start_time
)
# Group candidates by index path for batch retrieval
candidates_by_index: Dict[Path, List[int]] = {}
for chunk_id, _, index_path in coarse_candidates:
if index_path not in candidates_by_index:
candidates_by_index[index_path] = []
candidates_by_index[index_path].append(chunk_id)
# Retrieve dense embeddings and compute cosine similarity
scored_results: List[Tuple[float, SearchResult]] = []
for index_path, chunk_ids in candidates_by_index.items():
try:
store = SQLiteStore(index_path)
dense_embeddings = store.get_dense_embeddings(chunk_ids)
chunks_data = store.get_chunks_by_ids(chunk_ids)
# Create lookup for chunk content
chunk_content: Dict[int, Dict[str, Any]] = {
c["id"]: c for c in chunks_data
}
for chunk_id in chunk_ids:
dense_bytes = dense_embeddings.get(chunk_id)
chunk_info = chunk_content.get(chunk_id)
if dense_bytes is None or chunk_info is None:
continue
# Compute cosine similarity
dense_vec = np.frombuffer(dense_bytes, dtype=np.float32)
score = self._compute_cosine_similarity(query_dense, dense_vec)
# Create search result
excerpt = chunk_info.get("content", "")[:500]
result = SearchResult(
path=chunk_info.get("file_path", ""),
score=float(score),
excerpt=excerpt,
)
scored_results.append((score, result))
except Exception as exc:
self.logger.debug(
"Dense reranking failed for %s: %s", index_path, exc
)
stats.errors.append(f"Dense reranking failed for {index_path}: {exc}")
# Sort by score descending and deduplicate by path
scored_results.sort(key=lambda x: x[0], reverse=True)
path_to_result: Dict[str, SearchResult] = {}
for score, result in scored_results:
if result.path not in path_to_result:
path_to_result[result.path] = result
final_results = list(path_to_result.values())[:k]
# Optional: grouping of similar results
if options.group_results:
from codexlens.search.ranking import group_similar_results
final_results = group_similar_results(
final_results, score_threshold_abs=options.grouping_threshold
)
stats.files_matched = len(final_results)
stats.time_ms = (time.time() - start_time) * 1000
self.logger.debug(
"Binary cascade search complete: %d results in %.2fms",
len(final_results),
stats.time_ms,
)
return ChainSearchResult(
query=query,
results=final_results,
symbols=[],
stats=stats,
)
def cascade_search(
self,
query: str,
source_path: Path,
k: int = 10,
coarse_k: int = 100,
options: Optional[SearchOptions] = None,
strategy: Literal["binary", "hybrid"] = "binary",
) -> ChainSearchResult:
"""Unified cascade search entry point with strategy selection.
Provides a single interface for cascade search with configurable strategy:
- "binary": Uses binary vector coarse ranking + dense fine ranking (faster)
- "hybrid": Uses FTS+SPLADE+Vector coarse ranking + cross-encoder reranking (original)
The strategy can be configured via:
1. The `strategy` parameter (highest priority)
2. Config `cascade_strategy` setting
3. Default: "binary"
Args:
query: Natural language or keyword query string
source_path: Starting directory path
k: Number of final results to return (default 10)
coarse_k: Number of coarse candidates from first stage (default 100)
options: Search configuration (uses defaults if None)
strategy: Cascade strategy - "binary" or "hybrid" (default "binary")
Returns:
ChainSearchResult with reranked results and statistics
Examples:
>>> engine = ChainSearchEngine(registry, mapper, config=config)
>>> # Use binary cascade (default, faster)
>>> result = engine.cascade_search("auth", Path("D:/project"))
>>> # Use hybrid cascade (original behavior)
>>> result = engine.cascade_search("auth", Path("D:/project"), strategy="hybrid")
"""
# Check config for strategy override
effective_strategy = strategy
if self._config is not None:
config_strategy = getattr(self._config, "cascade_strategy", None)
if config_strategy in ("binary", "hybrid"):
# Only use config if no explicit strategy was passed
# (we can't detect if strategy was explicitly passed vs default)
effective_strategy = config_strategy
if effective_strategy == "binary":
return self.binary_cascade_search(query, source_path, k, coarse_k, options)
else:
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
def _get_or_create_binary_index(self, index_path: Path) -> Optional[Any]:
"""Get or create a BinaryANNIndex for the given index path.
Attempts to load an existing binary index from disk. If not found,
returns None (binary index should be built during indexing).
Args:
index_path: Path to the _index.db file
Returns:
BinaryANNIndex instance or None if not available
"""
try:
from codexlens.semantic.ann_index import BinaryANNIndex
binary_index = BinaryANNIndex(index_path, dim=256)
if binary_index.load():
return binary_index
return None
except Exception as exc:
self.logger.debug("Failed to load binary index for %s: %s", index_path, exc)
return None
def _compute_cosine_similarity(
self,
query_vec: "np.ndarray",
doc_vec: "np.ndarray",
) -> float:
"""Compute cosine similarity between query and document vectors.
Args:
query_vec: Query embedding vector
doc_vec: Document embedding vector
Returns:
Cosine similarity score in range [-1, 1]
"""
if not NUMPY_AVAILABLE:
return 0.0
# Ensure same shape
min_len = min(len(query_vec), len(doc_vec))
q = query_vec[:min_len]
d = doc_vec[:min_len]
# Compute cosine similarity
dot_product = np.dot(q, d)
norm_q = np.linalg.norm(q)
norm_d = np.linalg.norm(d)
if norm_q == 0 or norm_d == 0:
return 0.0
return float(dot_product / (norm_q * norm_d))
def _build_results_from_candidates(
self,
candidates: List[Tuple[int, int, Path]],
index_paths: List[Path],
stats: SearchStats,
query: str,
start_time: float,
) -> ChainSearchResult:
"""Build ChainSearchResult from binary candidates using Hamming distance scores.
Used as fallback when dense embeddings are not available.
Args:
candidates: List of (chunk_id, hamming_distance, index_path) tuples
index_paths: List of all searched index paths
stats: SearchStats to update
query: Original query string
start_time: Search start time for timing
Returns:
ChainSearchResult with results scored by Hamming distance
"""
results: List[SearchResult] = []
# Group by index path
candidates_by_index: Dict[Path, List[Tuple[int, int]]] = {}
for chunk_id, distance, index_path in candidates:
if index_path not in candidates_by_index:
candidates_by_index[index_path] = []
candidates_by_index[index_path].append((chunk_id, distance))
for index_path, chunk_tuples in candidates_by_index.items():
try:
store = SQLiteStore(index_path)
chunk_ids = [c[0] for c in chunk_tuples]
chunks_data = store.get_chunks_by_ids(chunk_ids)
chunk_content: Dict[int, Dict[str, Any]] = {
c["id"]: c for c in chunks_data
}
for chunk_id, distance in chunk_tuples:
chunk_info = chunk_content.get(chunk_id)
if chunk_info is None:
continue
# Convert Hamming distance to score (lower distance = higher score)
# Max Hamming distance for 256-bit is 256
score = 1.0 - (distance / 256.0)
excerpt = chunk_info.get("content", "")[:500]
result = SearchResult(
path=chunk_info.get("file_path", ""),
score=float(score),
excerpt=excerpt,
)
results.append(result)
except Exception as exc:
self.logger.debug(
"Failed to build results from %s: %s", index_path, exc
)
# Deduplicate by path
path_to_result: Dict[str, SearchResult] = {}
for result in results:
if result.path not in path_to_result or result.score > path_to_result[result.path].score:
path_to_result[result.path] = result
final_results = sorted(
path_to_result.values(),
key=lambda r: r.score,
reverse=True,
)
stats.files_matched = len(final_results)
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=final_results,
symbols=[],
stats=stats,
)
def _cross_encoder_rerank(
self,
query: str,
results: List[SearchResult],
top_k: int,
) -> List[SearchResult]:
"""Rerank results using cross-encoder model.
Args:
query: Search query string
results: Candidate results to rerank
top_k: Number of top results to return
Returns:
Reranked results sorted by cross-encoder score
"""
if not results:
return []
# Try to get reranker from config or create new one
reranker = None
try:
from codexlens.semantic.reranker import (
check_reranker_available,
get_reranker,
)
# Determine backend and model from config
backend = "onnx"
model_name = None
use_gpu = True
if self._config is not None:
backend = getattr(self._config, "reranker_backend", "onnx") or "onnx"
model_name = getattr(self._config, "reranker_model", None)
use_gpu = getattr(self._config, "embedding_use_gpu", True)
ok, err = check_reranker_available(backend)
if not ok:
self.logger.debug("Reranker backend unavailable (%s): %s", backend, err)
return results[:top_k]
# Create reranker
kwargs = {}
if backend == "onnx":
kwargs["use_gpu"] = use_gpu
reranker = get_reranker(backend=backend, model_name=model_name, **kwargs)
except ImportError as exc:
self.logger.debug("Reranker not available: %s", exc)
return results[:top_k]
except Exception as exc:
self.logger.debug("Failed to initialize reranker: %s", exc)
return results[:top_k]
# Use cross_encoder_rerank from ranking module
from codexlens.search.ranking import cross_encoder_rerank
return cross_encoder_rerank(
query=query,
results=results,
reranker=reranker,
top_k=top_k,
batch_size=32,
)
def search_files_only(self, query: str,
source_path: Path,
options: Optional[SearchOptions] = None) -> List[str]:

View File

@@ -40,11 +40,20 @@ from codexlens.search.ranking import (
get_rrf_weights,
reciprocal_rank_fusion,
rerank_results,
simple_weighted_fusion,
tag_search_source,
)
from codexlens.storage.dir_index import DirIndexStore
# Three-way fusion weights (FTS + Vector + SPLADE)
THREE_WAY_WEIGHTS = {
"exact": 0.2,
"splade": 0.3,
"vector": 0.5,
}
class HybridSearchEngine:
"""Hybrid search engine with parallel execution and RRF fusion.
@@ -193,9 +202,22 @@ class HybridSearchEngine:
if source in results_map
}
with timer("rrf_fusion", self.logger):
# Determine fusion method from config (default: rrf)
fusion_method = "rrf"
rrf_k = 60
if self._config is not None:
fusion_method = getattr(self._config, "fusion_method", "rrf") or "rrf"
rrf_k = getattr(self._config, "rrf_k", 60) or 60
with timer("fusion", self.logger):
adaptive_weights = get_rrf_weights(query, active_weights)
fused_results = reciprocal_rank_fusion(results_map, adaptive_weights)
if fusion_method == "simple":
fused_results = simple_weighted_fusion(results_map, adaptive_weights)
else:
# Default to RRF
fused_results = reciprocal_rank_fusion(
results_map, adaptive_weights, k=rrf_k
)
# Optional: boost results that include explicit symbol matches
boost_factor = (

View File

@@ -132,6 +132,116 @@ def get_rrf_weights(
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
def simple_weighted_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Dict[str, float] = None,
) -> List[SearchResult]:
"""Combine search results using simple weighted sum of normalized scores.
This is an alternative to RRF that preserves score magnitude information.
Scores are min-max normalized per source before weighted combination.
Formula: score(d) = Σ weight_source * normalized_score_source(d)
Args:
results_map: Dictionary mapping source name to list of SearchResult objects
Sources: 'exact', 'fuzzy', 'vector', 'splade'
weights: Dictionary mapping source name to weight (default: equal weights)
Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6}
Returns:
List of SearchResult objects sorted by fused score (descending)
Examples:
>>> fts_results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
>>> vector_results = [SearchResult(path="b.py", score=0.85, excerpt="...")]
>>> results_map = {'exact': fts_results, 'vector': vector_results}
>>> fused = simple_weighted_fusion(results_map)
"""
if not results_map:
return []
# Default equal weights if not provided
if weights is None:
num_sources = len(results_map)
weights = {source: 1.0 / num_sources for source in results_map}
# Normalize weights to sum to 1.0
weight_sum = sum(weights.values())
if not math.isclose(weight_sum, 1.0, abs_tol=0.01) and weight_sum > 0:
weights = {source: w / weight_sum for source, w in weights.items()}
# Compute min-max normalization parameters per source
source_stats: Dict[str, tuple] = {}
for source_name, results in results_map.items():
if not results:
continue
scores = [r.score for r in results]
min_s, max_s = min(scores), max(scores)
source_stats[source_name] = (min_s, max_s)
def normalize_score(score: float, source: str) -> float:
"""Normalize score to [0, 1] range using min-max scaling."""
if source not in source_stats:
return 0.0
min_s, max_s = source_stats[source]
if max_s == min_s:
return 1.0 if score >= min_s else 0.0
return (score - min_s) / (max_s - min_s)
# Build unified result set with weighted scores
path_to_result: Dict[str, SearchResult] = {}
path_to_fusion_score: Dict[str, float] = {}
path_to_source_scores: Dict[str, Dict[str, float]] = {}
for source_name, results in results_map.items():
weight = weights.get(source_name, 0.0)
if weight == 0:
continue
for result in results:
path = result.path
normalized = normalize_score(result.score, source_name)
contribution = weight * normalized
if path not in path_to_fusion_score:
path_to_fusion_score[path] = 0.0
path_to_result[path] = result
path_to_source_scores[path] = {}
path_to_fusion_score[path] += contribution
path_to_source_scores[path][source_name] = normalized
# Create final results with fusion scores
fused_results = []
for path, base_result in path_to_result.items():
fusion_score = path_to_fusion_score[path]
fused_result = SearchResult(
path=base_result.path,
score=fusion_score,
excerpt=base_result.excerpt,
content=base_result.content,
symbol=base_result.symbol,
chunk=base_result.chunk,
metadata={
**base_result.metadata,
"fusion_method": "simple_weighted",
"fusion_score": fusion_score,
"original_score": base_result.score,
"source_scores": path_to_source_scores[path],
},
start_line=base_result.start_line,
end_line=base_result.end_line,
symbol_name=base_result.symbol_name,
symbol_kind=base_result.symbol_kind,
)
fused_results.append(fused_result)
fused_results.sort(key=lambda r: r.score, reverse=True)
return fused_results
def reciprocal_rank_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Dict[str, float] = None,
@@ -141,11 +251,14 @@ def reciprocal_rank_fusion(
RRF formula: score(d) = Σ weight_source / (k + rank_source(d))
Supports three-way fusion with FTS, Vector, and SPLADE sources.
Args:
results_map: Dictionary mapping source name to list of SearchResult objects
Sources: 'exact', 'fuzzy', 'vector'
Sources: 'exact', 'fuzzy', 'vector', 'splade'
weights: Dictionary mapping source name to weight (default: equal weights)
Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6}
Or: {'splade': 0.4, 'vector': 0.6}
k: Constant to avoid division by zero and control rank influence (default 60)
Returns:
@@ -156,6 +269,14 @@ def reciprocal_rank_fusion(
>>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")]
>>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results}
>>> fused = reciprocal_rank_fusion(results_map)
# Three-way fusion with SPLADE
>>> results_map = {
... 'exact': exact_results,
... 'vector': vector_results,
... 'splade': splade_results
... }
>>> fused = reciprocal_rank_fusion(results_map, k=60)
"""
if not results_map:
return []
@@ -174,6 +295,7 @@ def reciprocal_rank_fusion(
# Build unified result set with RRF scores
path_to_result: Dict[str, SearchResult] = {}
path_to_fusion_score: Dict[str, float] = {}
path_to_source_ranks: Dict[str, Dict[str, int]] = {}
for source_name, results in results_map.items():
weight = weights.get(source_name, 0.0)
@@ -188,8 +310,10 @@ def reciprocal_rank_fusion(
if path not in path_to_fusion_score:
path_to_fusion_score[path] = 0.0
path_to_result[path] = result
path_to_source_ranks[path] = {}
path_to_fusion_score[path] += rrf_contribution
path_to_source_ranks[path][source_name] = rank
# Create final results with fusion scores
fused_results = []
@@ -206,8 +330,11 @@ def reciprocal_rank_fusion(
chunk=base_result.chunk,
metadata={
**base_result.metadata,
"fusion_method": "rrf",
"fusion_score": fusion_score,
"original_score": base_result.score,
"rrf_k": k,
"source_ranks": path_to_source_ranks[path],
},
start_line=base_result.start_line,
end_line=base_result.end_line,

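To make the two fusion formulas concrete, a tiny worked comparison on a two-source ranking, independent of the SearchResult machinery (all numbers are toy):

k = 60
weights = {"exact": 0.4, "vector": 0.6}
ranks = {                                  # 1-based rank of each path per source
    "a.py": {"exact": 1, "vector": 3},
    "b.py": {"exact": 2, "vector": 1},
}

rrf = {
    path: sum(w / (k + r[src]) for src, w in weights.items())
    for path, r in ranks.items()
}
# a.py: 0.4/61 + 0.6/63 ≈ 0.01608;  b.py: 0.4/62 + 0.6/61 ≈ 0.01629
best = max(rrf, key=rrf.get)               # "b.py": top rank in the heavier source wins

simple_weighted_fusion would instead min-max normalize the raw scores per source before the weighted sum, preserving the score magnitude information that RRF deliberately discards.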
View File

@@ -412,3 +412,489 @@ class ANNIndex:
"""
with self._lock:
return self._index is not None and self._current_count > 0
class BinaryANNIndex:
"""Binary vector ANN index using Hamming distance for fast coarse retrieval.
Optimized for binary vectors (256-bit / 32 bytes per vector).
Uses packed binary representation for memory efficiency.
Performance characteristics:
- Storage: 32 bytes per vector (vs ~8KB for dense vectors)
- Distance: Hamming distance via XOR + popcount (CPU-efficient)
- Search: O(N) brute-force with SIMD-accelerated distance computation
Index parameters:
- dim: Binary vector dimension (default: 256)
- packed_dim: Packed bytes size (dim / 8 = 32 for 256-bit)
Usage:
index = BinaryANNIndex(index_path, dim=256)
index.add_vectors([1, 2, 3], packed_vectors) # List of 32-byte packed vectors
ids, distances = index.search(query_packed, top_k=10)
"""
DEFAULT_DIM = 256 # Default binary vector dimension
def __init__(
self,
index_path: Path,
dim: int = 256,
initial_capacity: int = 100000,
auto_save: bool = False,
) -> None:
"""Initialize Binary ANN index.
Args:
index_path: Path to database (index will be saved as _binary_vectors.bin)
dim: Dimension of binary vectors (default: 256)
initial_capacity: Initial capacity hint (default: 100000)
auto_save: Whether to automatically save index after operations
Raises:
ImportError: If required dependencies are not available
ValueError: If dimension is invalid
"""
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
if dim <= 0 or dim % 8 != 0:
raise ValueError(
f"Invalid dimension: {dim}. Must be positive and divisible by 8."
)
self.index_path = Path(index_path)
self.dim = dim
self.packed_dim = dim // 8 # 32 bytes for 256-bit vectors
# Derive binary index path from database path
db_stem = self.index_path.stem
self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin"
# Memory management
self._auto_save = auto_save
self._initial_capacity = initial_capacity
# Thread safety
self._lock = threading.RLock()
# In-memory storage: id -> packed binary vector
self._vectors: dict[int, bytes] = {}
self._id_list: list[int] = [] # Ordered list for efficient iteration
logger.info(
f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}"
)
def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None:
"""Add packed binary vectors to the index.
Args:
ids: List of vector IDs (must be unique)
vectors: List of packed binary vectors (each of size packed_dim bytes)
Raises:
ValueError: If shapes don't match or vectors are invalid
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if len(vectors) != len(ids):
raise ValueError(
f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})"
)
# Validate vector sizes
for i, vec in enumerate(vectors):
if len(vec) != self.packed_dim:
raise ValueError(
f"Vector {i} has size {len(vec)}, expected {self.packed_dim}"
)
with self._lock:
try:
for vec_id, vec in zip(ids, vectors):
if vec_id not in self._vectors:
self._id_list.append(vec_id)
self._vectors[vec_id] = vec
logger.debug(
f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})"
)
if self._auto_save:
self.save()
except Exception as e:
raise StorageError(f"Failed to add vectors to Binary ANN index: {e}")
def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None:
"""Add unpacked binary vectors (0/1 values) to the index.
Convenience method that packs the vectors before adding.
Args:
ids: List of vector IDs (must be unique)
vectors: Numpy array of shape (N, dim) with binary values (0 or 1)
Raises:
ValueError: If shapes don't match
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if vectors.shape[0] != len(ids):
raise ValueError(
f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})"
)
if vectors.shape[1] != self.dim:
raise ValueError(
f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})"
)
# Pack vectors
packed_vectors = []
for i in range(vectors.shape[0]):
packed = np.packbits(vectors[i].astype(np.uint8)).tobytes()
packed_vectors.append(packed)
self.add_vectors(ids, packed_vectors)
def remove_vectors(self, ids: List[int]) -> None:
"""Remove vectors from the index.
Args:
ids: List of vector IDs to remove
Raises:
StorageError: If index operation fails
Note:
Optimized for batch deletion using set operations instead of
O(N) list.remove() calls for each ID.
"""
if len(ids) == 0:
return
with self._lock:
try:
# Use set for O(1) lookup during filtering
ids_to_remove = set(ids)
removed_count = 0
# Remove from dictionary - O(1) per deletion
for vec_id in ids_to_remove:
if vec_id in self._vectors:
del self._vectors[vec_id]
removed_count += 1
# Rebuild ID list efficiently - O(N) once instead of O(N) per removal
if removed_count > 0:
self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove]
logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index")
if self._auto_save and removed_count > 0:
self.save()
except Exception as e:
raise StorageError(
f"Failed to remove vectors from Binary ANN index: {e}"
)
def search(
self, query: bytes, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search for nearest neighbors using Hamming distance.
Args:
query: Packed binary query vector (size: packed_dim bytes)
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances) where:
- ids: List of vector IDs ordered by Hamming distance (ascending)
- distances: List of Hamming distances (lower = more similar)
Raises:
ValueError: If query size is invalid
StorageError: If search operation fails
"""
if len(query) != self.packed_dim:
raise ValueError(
f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})"
)
with self._lock:
try:
if len(self._vectors) == 0:
return [], []
# Compute Hamming distances to all vectors
query_arr = np.frombuffer(query, dtype=np.uint8)
distances = []
for vec_id in self._id_list:
vec = self._vectors[vec_id]
vec_arr = np.frombuffer(vec, dtype=np.uint8)
# XOR and popcount for Hamming distance
xor = np.bitwise_xor(query_arr, vec_arr)
dist = int(np.unpackbits(xor).sum())
distances.append((vec_id, dist))
# Sort by distance (ascending)
distances.sort(key=lambda x: x[1])
# Return top-k
top_results = distances[:top_k]
ids = [r[0] for r in top_results]
dists = [r[1] for r in top_results]
return ids, dists
except Exception as e:
raise StorageError(f"Failed to search Binary ANN index: {e}")
def search_numpy(
self, query: np.ndarray, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search with unpacked binary query vector.
Convenience method that packs the query before searching.
Args:
query: Binary query vector of shape (dim,) with values 0 or 1
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances)
"""
if query.ndim == 2:
query = query.flatten()
if len(query) != self.dim:
raise ValueError(
f"Query dimension ({len(query)}) must match index dimension ({self.dim})"
)
packed_query = np.packbits(query.astype(np.uint8)).tobytes()
return self.search(packed_query, top_k)
def search_batch(
self, queries: List[bytes], top_k: int = 10
) -> List[Tuple[List[int], List[int]]]:
"""Batch search for multiple queries.
Args:
queries: List of packed binary query vectors
top_k: Number of nearest neighbors to return per query
Returns:
List of (ids, distances) tuples, one per query
"""
results = []
for query in queries:
ids, dists = self.search(query, top_k)
results.append((ids, dists))
return results
def save(self) -> None:
"""Save index to disk.
Binary format:
- 4 bytes: magic number (0x42494E56 = "BINV")
- 4 bytes: version (1)
- 4 bytes: dim
- 4 bytes: packed_dim
- 4 bytes: num_vectors
- For each vector:
- 4 bytes: id
- packed_dim bytes: vector data
Raises:
StorageError: If save operation fails
"""
with self._lock:
try:
if len(self._vectors) == 0:
logger.debug("Skipping save: index is empty")
return
# Ensure parent directory exists
self.binary_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.binary_path, "wb") as f:
# Header
f.write(b"BINV") # Magic number
f.write(np.array([1], dtype=np.uint32).tobytes()) # Version
f.write(np.array([self.dim], dtype=np.uint32).tobytes())
f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes())
f.write(
np.array([len(self._vectors)], dtype=np.uint32).tobytes()
)
# Vectors
for vec_id in self._id_list:
f.write(np.array([vec_id], dtype=np.uint32).tobytes())
f.write(self._vectors[vec_id])
logger.debug(
f"Saved binary index to {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
except Exception as e:
raise StorageError(f"Failed to save Binary ANN index: {e}")
def load(self) -> bool:
"""Load index from disk.
Returns:
True if index was loaded successfully, False if index file doesn't exist
Raises:
StorageError: If load operation fails
"""
with self._lock:
try:
if not self.binary_path.exists():
logger.debug(f"Binary index file not found: {self.binary_path}")
return False
with open(self.binary_path, "rb") as f:
# Read header
magic = f.read(4)
if magic != b"BINV":
raise StorageError(
f"Invalid binary index file: bad magic number"
)
version = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if version != 1:
raise StorageError(
f"Unsupported binary index version: {version}"
)
file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if file_dim != self.dim or file_packed_dim != self.packed_dim:
raise StorageError(
f"Dimension mismatch: file has dim={file_dim}, "
f"packed_dim={file_packed_dim}, "
f"expected dim={self.dim}, packed_dim={self.packed_dim}"
)
# Clear existing data
self._vectors.clear()
self._id_list.clear()
# Read vectors
for _ in range(num_vectors):
vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0]
vec_data = f.read(self.packed_dim)
self._vectors[int(vec_id)] = vec_data
self._id_list.append(int(vec_id))
logger.info(
f"Loaded binary index from {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
return True
except StorageError:
raise
except Exception as e:
raise StorageError(f"Failed to load Binary ANN index: {e}")
def count(self) -> int:
"""Get number of vectors in the index.
Returns:
Number of vectors currently in the index
"""
with self._lock:
return len(self._vectors)
@property
def is_loaded(self) -> bool:
"""Check if index has vectors.
Returns:
True if index has vectors, False otherwise
"""
with self._lock:
return len(self._vectors) > 0
def get_vector(self, vec_id: int) -> Optional[bytes]:
"""Get a specific vector by ID.
Args:
vec_id: Vector ID to retrieve
Returns:
Packed binary vector or None if not found
"""
with self._lock:
return self._vectors.get(vec_id)
def clear(self) -> None:
"""Clear all vectors from the index."""
with self._lock:
self._vectors.clear()
self._id_list.clear()
logger.debug("Cleared binary index")
def create_ann_index(
index_path: Path,
index_type: str = "hnsw",
dim: int = 2048,
**kwargs,
) -> ANNIndex | BinaryANNIndex:
"""Factory function to create an ANN index.
Args:
index_path: Path to database file
index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors
dim: Vector dimension (default: 2048 for dense, 256 for binary)
**kwargs: Additional arguments passed to the index constructor
Returns:
ANNIndex for dense vectors or BinaryANNIndex for binary vectors
Raises:
ValueError: If index_type is invalid
Example:
>>> # Dense vector index (HNSW)
>>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048)
>>> dense_index.add_vectors(ids, dense_vectors)
>>>
>>> # Binary vector index (Hamming distance)
>>> binary_index = create_ann_index(path, index_type="binary", dim=256)
>>> binary_index.add_vectors(ids, packed_vectors)
"""
index_type = index_type.lower()
if index_type == "hnsw":
return ANNIndex(index_path=index_path, dim=dim, **kwargs)
elif index_type == "binary":
# Default to 256 for binary if not specified
if dim == 2048: # Default dense dim was used
dim = 256
return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs)
else:
raise ValueError(
f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'."
)
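
# Cascade usage sketch (illustrative only): this module defines construction,
# persistence, and per-ID accessors, so the candidate enumeration below is the
# caller's job, and hamming_distance's exact signature is an assumption.
#
#     from pathlib import Path
#     from codexlens.indexing import hamming_distance
#
#     coarse = create_ann_index(Path("vectors.bin"), index_type="binary", dim=256)
#     coarse.load()
#     scores = {
#         cid: hamming_distance(query_bits, coarse.get_vector(cid))
#         for cid in candidate_ids  # candidate IDs supplied by the caller
#     }
#     top_candidates = sorted(scores, key=scores.get)[:100]
#     # Stage 2 then reranks top_candidates with the 2048-dim dense index.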

View File

@@ -29,10 +29,17 @@ except ImportError:
# Try to import ANN index (optional hnswlib dependency)
try:
from codexlens.semantic.ann_index import ANNIndex, HNSWLIB_AVAILABLE
from codexlens.semantic.ann_index import (
ANNIndex,
BinaryANNIndex,
create_ann_index,
HNSWLIB_AVAILABLE,
)
except ImportError:
HNSWLIB_AVAILABLE = False
ANNIndex = None
BinaryANNIndex = None
create_ann_index = None
logger = logging.getLogger(__name__)
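
# Downstream code can feature-gate on the optional dependency, e.g. (sketch;
# whether BinaryANNIndex itself needs hnswlib is not shown in this diff):
#
#     if HNSWLIB_AVAILABLE and create_ann_index is not None:
#         index = create_ann_index(index_path, index_type="binary", dim=256)
#     else:
#         index = None  # degrade to exact, non-ANN search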

View File

@@ -0,0 +1,162 @@
"""
Migration 010: Add multi-vector storage support for cascade retrieval.
This migration introduces the chunks table with multi-vector support:
- chunks: Stores code chunks with multiple embedding types
- embedding: Original embedding for backward compatibility
- embedding_binary: 256-dim binary vector for coarse ranking (fast)
- embedding_dense: 2048-dim dense vector for fine ranking (precise)
The multi-vector architecture enables cascade retrieval:
1. First stage: Fast binary vector search for candidate retrieval
2. Second stage: Dense vector reranking for precision
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
"""
Adds chunks table with multi-vector embedding columns.
Creates:
- chunks: Table for storing code chunks with multiple embedding types
- idx_chunks_file_path: Index for efficient file-based lookups
Also migrates existing chunks tables by adding new columns if needed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
# Check if chunks table already exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if table_exists:
# Migrate existing table - add new columns if missing
log.info("chunks table exists, checking for missing columns...")
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
if "embedding_binary" not in existing_columns:
log.info("Adding embedding_binary column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
)
if "embedding_dense" not in existing_columns:
log.info("Adding embedding_dense column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
)
else:
# Create new table with all columns
log.info("Creating chunks table with multi-vector support...")
cursor.execute(
"""
CREATE TABLE chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
embedding_binary BLOB,
embedding_dense BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
# Create index for file-based lookups
log.info("Creating index for chunks table...")
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 completed successfully")
def downgrade(db_conn: Connection) -> None:
"""
Removes multi-vector columns from chunks table.
Note: This does not drop the chunks table entirely to preserve data.
Only the new columns added by this migration are removed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Removing multi-vector columns from chunks table...")
    # SQLite supports ALTER TABLE ... DROP COLUMN only from 3.35 onward,
    # so for portability we recreate the table without the new columns
# Check if chunks table exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if not table_exists:
log.info("chunks table does not exist, nothing to downgrade")
return
# Check if the columns exist before trying to remove them
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
needs_migration = (
"embedding_binary" in existing_columns or
"embedding_dense" in existing_columns
)
if not needs_migration:
log.info("Multi-vector columns not present, nothing to remove")
return
# Recreate table without the new columns
log.info("Recreating chunks table without multi-vector columns...")
cursor.execute(
"""
CREATE TABLE chunks_backup (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
cursor.execute(
"""
INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at)
SELECT id, file_path, content, embedding, metadata, created_at FROM chunks
"""
)
cursor.execute("DROP TABLE chunks")
cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks")
# Recreate index
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 downgrade completed successfully")

View File

@@ -539,6 +539,27 @@ class SQLiteStore:
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
# Chunks table for multi-vector storage (cascade retrieval architecture)
# - embedding: Original embedding for backward compatibility
# - embedding_binary: 256-dim binary vector for coarse ranking
# - embedding_dense: 2048-dim dense vector for fine ranking
conn.execute(
"""
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
embedding_binary BLOB,
embedding_dense BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)")
# Run migration for existing databases
self._migrate_chunks_table(conn)
conn.commit()
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to initialize database schema: {exc}") from exc
@@ -650,3 +671,306 @@ class SQLiteStore:
conn.execute("VACUUM")
except sqlite3.DatabaseError:
pass
def _migrate_chunks_table(self, conn: sqlite3.Connection) -> None:
"""Migrate existing chunks table to add multi-vector columns if needed.
This handles upgrading existing databases that may have the chunks table
without the embedding_binary and embedding_dense columns.
"""
# Check if chunks table exists
table_exists = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if not table_exists:
# Table doesn't exist yet, nothing to migrate
return
# Check existing columns
cursor = conn.execute("PRAGMA table_info(chunks)")
columns = {row[1] for row in cursor.fetchall()}
# Add embedding_binary column if missing
if "embedding_binary" not in columns:
logger.info("Migrating chunks table: adding embedding_binary column")
conn.execute(
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
)
# Add embedding_dense column if missing
if "embedding_dense" not in columns:
logger.info("Migrating chunks table: adding embedding_dense column")
conn.execute(
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
)
def add_chunks(
self,
file_path: str,
chunks_data: List[Dict[str, Any]],
*,
embedding: Optional[List[List[float]]] = None,
embedding_binary: Optional[List[bytes]] = None,
embedding_dense: Optional[List[bytes]] = None,
) -> List[int]:
"""Add multiple chunks with multi-vector embeddings support.
This method supports the cascade retrieval architecture with three embedding types:
- embedding: Original dense embedding for backward compatibility
- embedding_binary: 256-dim binary vector for fast coarse ranking
- embedding_dense: 2048-dim dense vector for precise fine ranking
Args:
file_path: Path to the source file for all chunks.
chunks_data: List of dicts with 'content' and optional 'metadata' keys.
embedding: Optional list of dense embeddings (one per chunk).
embedding_binary: Optional list of binary embeddings as bytes (one per chunk).
embedding_dense: Optional list of dense embeddings as bytes (one per chunk).
Returns:
List of inserted chunk IDs.
Raises:
ValueError: If embedding list lengths don't match chunks_data length.
StorageError: If database operation fails.
"""
if not chunks_data:
return []
n_chunks = len(chunks_data)
# Validate embedding lengths
if embedding is not None and len(embedding) != n_chunks:
raise ValueError(
f"embedding length ({len(embedding)}) != chunks_data length ({n_chunks})"
)
if embedding_binary is not None and len(embedding_binary) != n_chunks:
raise ValueError(
f"embedding_binary length ({len(embedding_binary)}) != chunks_data length ({n_chunks})"
)
if embedding_dense is not None and len(embedding_dense) != n_chunks:
raise ValueError(
f"embedding_dense length ({len(embedding_dense)}) != chunks_data length ({n_chunks})"
)
        # Prepare batch data (struct packs the optional float embeddings below)
        import struct

        batch_data = []
for i, chunk in enumerate(chunks_data):
content = chunk.get("content", "")
metadata = chunk.get("metadata")
metadata_json = json.dumps(metadata) if metadata else None
# Convert embeddings to bytes if needed
emb_blob = None
if embedding is not None:
                emb_blob = struct.pack(f"{len(embedding[i])}f", *embedding[i])
emb_binary_blob = embedding_binary[i] if embedding_binary is not None else None
emb_dense_blob = embedding_dense[i] if embedding_dense is not None else None
batch_data.append((
file_path, content, emb_blob, emb_binary_blob, emb_dense_blob, metadata_json
))
with self._lock:
conn = self._get_connection()
try:
                # Compute the starting ID before insert; AUTOINCREMENT plus the
                # held store lock makes the inserted IDs consecutive from here
row = conn.execute("SELECT MAX(id) FROM chunks").fetchone()
start_id = (row[0] or 0) + 1
conn.executemany(
"""
INSERT INTO chunks (
file_path, content, embedding, embedding_binary,
embedding_dense, metadata
)
VALUES (?, ?, ?, ?, ?, ?)
""",
batch_data
)
conn.commit()
# Calculate inserted IDs
return list(range(start_id, start_id + n_chunks))
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to add chunks: {exc}",
db_path=str(self.db_path),
operation="add_chunks",
) from exc
def get_binary_embeddings(
self, chunk_ids: List[int]
) -> Dict[int, Optional[bytes]]:
"""Get binary embeddings for specified chunk IDs.
Used for coarse ranking in cascade retrieval architecture.
Binary embeddings (256-dim) enable fast approximate similarity search.
Args:
chunk_ids: List of chunk IDs to retrieve embeddings for.
Returns:
            Dictionary mapping chunk_id to embedding_binary bytes (None when
            the column is unset); IDs not found in the table are omitted.
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return {}
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"SELECT id, embedding_binary FROM chunks WHERE id IN ({placeholders})",
chunk_ids
).fetchall()
return {row["id"]: row["embedding_binary"] for row in rows}
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get binary embeddings: {exc}",
db_path=str(self.db_path),
operation="get_binary_embeddings",
) from exc
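    # Coarse-ranking sketch over the blobs returned above; hamming_distance
    # comes from codexlens.indexing and its exact signature is assumed here:
    #
    #     blobs = store.get_binary_embeddings(candidate_ids)
    #     coarse = sorted(
    #         (cid for cid, b in blobs.items() if b is not None),
    #         key=lambda cid: hamming_distance(query_bits, blobs[cid]),
    #     )[:coarse_k]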
def get_dense_embeddings(
self, chunk_ids: List[int]
) -> Dict[int, Optional[bytes]]:
"""Get dense embeddings for specified chunk IDs.
Used for fine ranking in cascade retrieval architecture.
Dense embeddings (2048-dim) provide high-precision similarity scoring.
Args:
chunk_ids: List of chunk IDs to retrieve embeddings for.
Returns:
            Dictionary mapping chunk_id to embedding_dense bytes (None when
            the column is unset); IDs not found in the table are omitted.
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return {}
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})",
chunk_ids
).fetchall()
return {row["id"]: row["embedding_dense"] for row in rows}
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get dense embeddings: {exc}",
db_path=str(self.db_path),
operation="get_dense_embeddings",
) from exc
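    # Fine-ranking sketch: dense blobs round-trip through numpy as float32 in
    # native byte order, matching the struct.pack call in add_chunks above:
    #
    #     import numpy as np
    #
    #     dense_blobs = store.get_dense_embeddings(coarse)
    #     def cosine(blob):
    #         v = np.frombuffer(blob, dtype=np.float32)
    #         return float(v @ query_vec) / (
    #             float(np.linalg.norm(v)) * float(np.linalg.norm(query_vec))
    #         )
    #     fine = sorted(
    #         (cid for cid, b in dense_blobs.items() if b is not None),
    #         key=lambda cid: -cosine(dense_blobs[cid]),
    #     )[:fine_k]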
def get_chunks_by_ids(
self, chunk_ids: List[int]
) -> List[Dict[str, Any]]:
"""Get chunk data for specified IDs.
Args:
chunk_ids: List of chunk IDs to retrieve.
Returns:
List of chunk dictionaries with id, file_path, content, metadata.
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return []
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"""
SELECT id, file_path, content, metadata, created_at
FROM chunks
WHERE id IN ({placeholders})
""",
chunk_ids
).fetchall()
results = []
for row in rows:
metadata = None
if row["metadata"]:
try:
metadata = json.loads(row["metadata"])
except json.JSONDecodeError:
pass
results.append({
"id": row["id"],
"file_path": row["file_path"],
"content": row["content"],
"metadata": metadata,
"created_at": row["created_at"],
})
return results
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get chunks: {exc}",
db_path=str(self.db_path),
operation="get_chunks_by_ids",
) from exc
def delete_chunks_by_file(self, file_path: str) -> int:
"""Delete all chunks for a given file path.
Args:
file_path: Path to the source file.
Returns:
Number of deleted chunks.
Raises:
StorageError: If database operation fails.
"""
with self._lock:
conn = self._get_connection()
try:
cursor = conn.execute(
"DELETE FROM chunks WHERE file_path = ?",
(file_path,)
)
conn.commit()
return cursor.rowcount
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to delete chunks: {exc}",
db_path=str(self.db_path),
operation="delete_chunks_by_file",
) from exc
def count_chunks(self) -> int:
"""Count total chunks in store.
Returns:
Total number of chunks.
"""
with self._lock:
conn = self._get_connection()
row = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()
return int(row["c"]) if row else 0