"""Multi-type embedding backends for cascade retrieval.
This module provides embedding backends optimized for cascade retrieval:
1. BinaryEmbeddingBackend - Fast coarse filtering with binary vectors
2. DenseEmbeddingBackend - High-precision dense vectors for reranking
3. CascadeEmbeddingBackend - Combined binary + dense for two-stage retrieval
Cascade retrieval workflow:
1. Binary search (fast, ~32 bytes/vector) -> top-K candidates
2. Dense rerank (precise, ~3KB/vector at the default 768 dim) -> final results
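
Example (a minimal sketch; assumes the default models can be fetched by
fastembed on first use):

    >>> embedder = get_cascade_embedder(use_gpu=False)
    >>> binary, dense = embedder.encode_cascade(["def foo(): pass"])
    >>> binary.shape  # (1, 256), uint8 bits for Hamming search
    >>> dense.shape   # (1, 768), float32 for cosine reranking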
"""
from __future__ import annotations
import logging
from typing import Iterable, List, Optional, Tuple
import numpy as np
from codexlens.semantic.base import BaseEmbedder
logger = logging.getLogger(__name__)
# =============================================================================
# Utility Functions
# =============================================================================
def binarize_embedding(embedding: np.ndarray) -> np.ndarray:
"""Convert float embedding to binary vector.
Applies sign-based quantization: values > 0 become 1, values <= 0 become 0.
Args:
embedding: Float32 embedding of any dimension
Returns:
Binary vector (uint8 with values 0 or 1) of same dimension
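
    Example:
        >>> binarize_embedding(np.array([0.3, -0.1, 0.0, 2.5]))  # -> [1, 0, 0, 1]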
"""
return (embedding > 0).astype(np.uint8)
def pack_binary_embedding(binary_vector: np.ndarray) -> bytes:
"""Pack binary vector into compact bytes format.
Packs 8 binary values into each byte for storage efficiency.
For a 256-dim binary vector, output is 32 bytes.
Args:
binary_vector: Binary vector (uint8 with values 0 or 1)
Returns:
Packed bytes (length = ceil(dim / 8))
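
    Example:
        >>> pack_binary_embedding(np.ones(8, dtype=np.uint8))  # -> b'\xff'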
"""
# Ensure vector length is multiple of 8 by padding if needed
dim = len(binary_vector)
padded_dim = ((dim + 7) // 8) * 8
if padded_dim > dim:
padded = np.zeros(padded_dim, dtype=np.uint8)
padded[:dim] = binary_vector
binary_vector = padded
# Pack 8 bits per byte
packed = np.packbits(binary_vector)
return packed.tobytes()
def unpack_binary_embedding(packed_bytes: bytes, dim: int = 256) -> np.ndarray:
"""Unpack bytes back to binary vector.
Args:
packed_bytes: Packed binary data
dim: Original vector dimension (default: 256)
Returns:
Binary vector (uint8 with values 0 or 1)
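
    Example (round-trips pack_binary_embedding output):
        >>> vec = np.array([1, 0, 1, 1], dtype=np.uint8)
        >>> unpack_binary_embedding(pack_binary_embedding(vec), dim=4)  # -> [1, 0, 1, 1]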
"""
unpacked = np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8))
return unpacked[:dim]
def hamming_distance(a: bytes, b: bytes) -> int:
"""Compute Hamming distance between two packed binary vectors.
Uses XOR and popcount for efficient distance computation.
Args:
a: First packed binary vector
b: Second packed binary vector
Returns:
Hamming distance (number of differing bits)
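
    Example:
        >>> a = pack_binary_embedding(np.array([1, 0, 1, 0], dtype=np.uint8))
        >>> b = pack_binary_embedding(np.array([1, 1, 1, 1], dtype=np.uint8))
        >>> hamming_distance(a, b)  # -> 2 (two differing bits)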
"""
a_arr = np.frombuffer(a, dtype=np.uint8)
b_arr = np.frombuffer(b, dtype=np.uint8)
xor = np.bitwise_xor(a_arr, b_arr)
return int(np.unpackbits(xor).sum())
# =============================================================================
# Binary Embedding Backend
# =============================================================================
class BinaryEmbeddingBackend(BaseEmbedder):
"""Generate 256-dimensional binary embeddings for fast coarse retrieval.
Uses a lightweight embedding model and applies sign-based quantization
to produce compact binary vectors (32 bytes per embedding).
Suitable for:
- First-stage candidate retrieval
- Hamming distance-based similarity search
- Memory-constrained environments
    Default model: BAAI/bge-small-en-v1.5 (384 dim) -> random-projected and sign-quantized to 256 bits
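
    Example (sketch; assumes the model can be fetched by fastembed):
        >>> backend = BinaryEmbeddingBackend(use_gpu=False)
        >>> bits = backend.embed_to_numpy("hello world")
        >>> bits.shape  # (1, 256), uint8 values 0/1
        >>> len(backend.embed_packed("hello world")[0])  # 32 bytes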
"""
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, fast
BINARY_DIM = 256
def __init__(
self,
model_name: Optional[str] = None,
use_gpu: bool = True,
) -> None:
"""Initialize binary embedding backend.
Args:
model_name: Base embedding model name. Defaults to BAAI/bge-small-en-v1.5
use_gpu: Whether to use GPU acceleration
"""
from codexlens.semantic import SEMANTIC_AVAILABLE
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self._model_name = model_name or self.DEFAULT_MODEL
self._use_gpu = use_gpu
self._model = None
# Projection matrix for dimension reduction (lazily initialized)
self._projection_matrix: Optional[np.ndarray] = None
@property
def model_name(self) -> str:
"""Return model name."""
return self._model_name
@property
def embedding_dim(self) -> int:
"""Return binary embedding dimension (256)."""
return self.BINARY_DIM
@property
def packed_bytes(self) -> int:
"""Return packed bytes size (32 bytes for 256 bits)."""
return self.BINARY_DIM // 8
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
from fastembed import TextEmbedding
from codexlens.semantic.gpu_support import get_optimal_providers
providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
try:
self._model = TextEmbedding(
model_name=self._model_name,
providers=providers,
)
except TypeError:
# Fallback for older fastembed versions
self._model = TextEmbedding(model_name=self._model_name)
logger.debug(f"BinaryEmbeddingBackend loaded model: {self._model_name}")
def _get_projection_matrix(self, input_dim: int) -> np.ndarray:
"""Get or create projection matrix for dimension reduction.
Uses random projection with fixed seed for reproducibility.
Args:
input_dim: Input embedding dimension from base model
Returns:
Projection matrix of shape (input_dim, BINARY_DIM)
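
        Example (seed-fixed, so identical across instances and sessions):
            >>> backend = BinaryEmbeddingBackend(use_gpu=False)
            >>> backend._get_projection_matrix(384).shape  # (384, 256)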
"""
if self._projection_matrix is not None:
return self._projection_matrix
# Fixed seed for reproducibility across sessions
rng = np.random.RandomState(42)
# Gaussian random projection
self._projection_matrix = rng.randn(input_dim, self.BINARY_DIM).astype(np.float32)
# Normalize columns for consistent scale
norms = np.linalg.norm(self._projection_matrix, axis=0, keepdims=True)
self._projection_matrix /= (norms + 1e-8)
return self._projection_matrix
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate binary embeddings as numpy array.
Args:
texts: Single text or iterable of texts
Returns:
Binary embeddings of shape (n_texts, 256) with values 0 or 1
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
# Get base float embeddings
float_embeddings = np.array(list(self._model.embed(texts)))
input_dim = float_embeddings.shape[1]
# Project to target dimension if needed
if input_dim != self.BINARY_DIM:
projection = self._get_projection_matrix(input_dim)
float_embeddings = float_embeddings @ projection
# Binarize
return binarize_embedding(float_embeddings)
def embed_packed(self, texts: str | Iterable[str]) -> List[bytes]:
"""Generate packed binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
List of packed bytes (32 bytes each for 256-dim)
"""
binary = self.embed_to_numpy(texts)
return [pack_binary_embedding(vec) for vec in binary]
# =============================================================================
# Dense Embedding Backend
# =============================================================================
class DenseEmbeddingBackend(BaseEmbedder):
"""Generate high-dimensional dense embeddings for precise reranking.
    Uses a configurable embedding model to produce TARGET_DIM-dimensional float32 vectors (768 by default)
for maximum retrieval quality.
Suitable for:
- Second-stage reranking
- High-precision similarity search
- Quality-critical applications
    Default model: BAAI/bge-small-en-v1.5 (384 dim) with optional expansion to TARGET_DIM
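
    Example (sketch; assumes the model can be fetched by fastembed):
        >>> backend = DenseEmbeddingBackend(use_gpu=False)
        >>> vecs = backend.embed_to_numpy(["hello world"])
        >>> vecs.shape  # (1, 768) with the default TARGET_DIM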
"""
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, use small for testing
TARGET_DIM = 768 # Reduced target for faster testing
def __init__(
self,
model_name: Optional[str] = None,
use_gpu: bool = True,
expand_dim: bool = True,
) -> None:
"""Initialize dense embedding backend.
Args:
            model_name: Dense embedding model name. Defaults to BAAI/bge-small-en-v1.5
use_gpu: Whether to use GPU acceleration
            expand_dim: If True, expand embeddings to TARGET_DIM using a fixed semi-orthogonal expansion
"""
from codexlens.semantic import SEMANTIC_AVAILABLE
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self._model_name = model_name or self.DEFAULT_MODEL
self._use_gpu = use_gpu
self._expand_dim = expand_dim
self._model = None
self._native_dim: Optional[int] = None
# Expansion matrix for dimension expansion (lazily initialized)
self._expansion_matrix: Optional[np.ndarray] = None
@property
def model_name(self) -> str:
"""Return model name."""
return self._model_name
    @property
    def embedding_dim(self) -> int:
        """Return embedding dimension.

        Returns TARGET_DIM if expand_dim is True and the native dimension
        does not exceed it (expansion never shrinks vectors); otherwise the
        native model dimension.
        """
        # Prefer the cached native dim (known after the first embed call),
        # falling back to static estimates for common models.
        native = self._native_dim
        if native is None:
            model_dims = {
                "BAAI/bge-large-en-v1.5": 1024,
                "BAAI/bge-base-en-v1.5": 768,
                "BAAI/bge-small-en-v1.5": 384,
                "intfloat/multilingual-e5-large": 1024,
            }
            native = model_dims.get(self._model_name, 1024)
        if self._expand_dim:
            # embed_to_numpy only expands when native < TARGET_DIM, so larger
            # native dimensions pass through unchanged.
            return max(native, self.TARGET_DIM)
        return native
@property
def max_tokens(self) -> int:
"""Return maximum token limit."""
return 512 # Conservative default for large models
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
from fastembed import TextEmbedding
from codexlens.semantic.gpu_support import get_optimal_providers
providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
try:
self._model = TextEmbedding(
model_name=self._model_name,
providers=providers,
)
except TypeError:
self._model = TextEmbedding(model_name=self._model_name)
logger.debug(f"DenseEmbeddingBackend loaded model: {self._model_name}")
def _get_expansion_matrix(self, input_dim: int) -> np.ndarray:
"""Get or create expansion matrix for dimension expansion.
        Uses an identity block plus normalized random columns: original dimensions
        pass through unchanged, and the fixed seed keeps the expansion deterministic
        across sessions.
Args:
input_dim: Input embedding dimension from base model
Returns:
Expansion matrix of shape (input_dim, TARGET_DIM)
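
        Example (original dimensions pass through the identity block unchanged):
            >>> backend = DenseEmbeddingBackend(use_gpu=False)
            >>> m = backend._get_expansion_matrix(384)
            >>> m.shape  # (384, 768)
            >>> np.allclose(m[:, :384], np.eye(384))  # True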
"""
if self._expansion_matrix is not None:
return self._expansion_matrix
# Fixed seed for reproducibility
rng = np.random.RandomState(123)
# Create semi-orthogonal expansion matrix
# First input_dim columns form identity-like structure
self._expansion_matrix = np.zeros((input_dim, self.TARGET_DIM), dtype=np.float32)
# Copy original dimensions
copy_dim = min(input_dim, self.TARGET_DIM)
self._expansion_matrix[:copy_dim, :copy_dim] = np.eye(copy_dim, dtype=np.float32)
# Fill remaining with random projections
if self.TARGET_DIM > input_dim:
random_part = rng.randn(input_dim, self.TARGET_DIM - input_dim).astype(np.float32)
# Normalize
norms = np.linalg.norm(random_part, axis=0, keepdims=True)
random_part /= (norms + 1e-8)
self._expansion_matrix[:, input_dim:] = random_part
return self._expansion_matrix
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate dense embeddings as numpy array.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, TARGET_DIM) as float32
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
# Get base float embeddings
float_embeddings = np.array(list(self._model.embed(texts)), dtype=np.float32)
self._native_dim = float_embeddings.shape[1]
# Expand to target dimension if needed
if self._expand_dim and self._native_dim < self.TARGET_DIM:
expansion = self._get_expansion_matrix(self._native_dim)
float_embeddings = float_embeddings @ expansion
return float_embeddings
# =============================================================================
# Cascade Embedding Backend
# =============================================================================
class CascadeEmbeddingBackend(BaseEmbedder):
"""Combined binary + dense embedding backend for cascade retrieval.
Generates both binary (for fast coarse filtering) and dense (for precise
    reranking) embeddings with a single call, optimized for two-stage retrieval.
Cascade workflow:
1. encode_cascade() returns (binary_embeddings, dense_embeddings)
2. Binary search: Use Hamming distance on binary vectors -> top-K candidates
3. Dense rerank: Use cosine similarity on dense vectors -> final results
Memory efficiency:
- Binary: 32 bytes per vector (256 bits)
    - Dense: 4 bytes per dimension (3072 bytes per vector at the default 768 dim)
    - Total: ~3KB per document at the defaults for full cascade support
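
    Example (two-stage scoring sketch; assumes models load via fastembed):
        >>> backend = CascadeEmbeddingBackend(use_gpu=False)
        >>> bin_docs, dense_docs = backend.encode_cascade(["doc a", "doc b"])
        >>> bin_q, dense_q = backend.encode_cascade("query")
        >>> # Stage 1: Hamming distance on unpacked binary vectors
        >>> order = np.argsort(np.bitwise_xor(bin_docs, bin_q).sum(axis=1))
        >>> # Stage 2: cosine similarity on dense vectors for the candidates
        >>> d = dense_docs[order]
        >>> sims = d @ dense_q[0] / (np.linalg.norm(d, axis=1) * np.linalg.norm(dense_q[0]))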
"""
def __init__(
self,
binary_model: Optional[str] = None,
dense_model: Optional[str] = None,
use_gpu: bool = True,
) -> None:
"""Initialize cascade embedding backend.
Args:
binary_model: Model for binary embeddings. Defaults to BAAI/bge-small-en-v1.5
            dense_model: Model for dense embeddings. Defaults to BAAI/bge-small-en-v1.5
use_gpu: Whether to use GPU acceleration
"""
self._binary_backend = BinaryEmbeddingBackend(
model_name=binary_model,
use_gpu=use_gpu,
)
self._dense_backend = DenseEmbeddingBackend(
model_name=dense_model,
use_gpu=use_gpu,
expand_dim=True,
)
self._use_gpu = use_gpu
@property
def model_name(self) -> str:
"""Return model names for both backends."""
return f"cascade({self._binary_backend.model_name}, {self._dense_backend.model_name})"
@property
def embedding_dim(self) -> int:
"""Return dense embedding dimension (for compatibility)."""
return self._dense_backend.embedding_dim
@property
def binary_dim(self) -> int:
"""Return binary embedding dimension."""
return self._binary_backend.embedding_dim
@property
def dense_dim(self) -> int:
"""Return dense embedding dimension."""
return self._dense_backend.embedding_dim
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate dense embeddings (for BaseEmbedder compatibility).
For cascade embeddings, use encode_cascade() instead.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, dense_dim)
"""
return self._dense_backend.embed_to_numpy(texts)
def encode_cascade(
self,
texts: str | Iterable[str],
batch_size: int = 32,
) -> Tuple[np.ndarray, np.ndarray]:
"""Generate both binary and dense embeddings.
Args:
texts: Single text or iterable of texts
            batch_size: Batch size for processing (currently not forwarded to the underlying backends)
Returns:
Tuple of:
- binary_embeddings: Shape (n_texts, 256), uint8 values 0/1
                - dense_embeddings: Shape (n_texts, dense_dim), float32 (768 by default)
"""
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
binary_embeddings = self._binary_backend.embed_to_numpy(texts)
dense_embeddings = self._dense_backend.embed_to_numpy(texts)
return binary_embeddings, dense_embeddings
def encode_binary(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate only binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
Binary embeddings of shape (n_texts, 256)
"""
return self._binary_backend.embed_to_numpy(texts)
def encode_dense(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate only dense embeddings.
Args:
texts: Single text or iterable of texts
Returns:
            Dense embeddings of shape (n_texts, dense_dim) (768 by default)
"""
return self._dense_backend.embed_to_numpy(texts)
def encode_binary_packed(self, texts: str | Iterable[str]) -> List[bytes]:
"""Generate packed binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
List of packed bytes (32 bytes each)
"""
return self._binary_backend.embed_packed(texts)
# =============================================================================
# Factory Function
# =============================================================================
def get_cascade_embedder(
binary_model: Optional[str] = None,
dense_model: Optional[str] = None,
use_gpu: bool = True,
) -> CascadeEmbeddingBackend:
"""Factory function to create a cascade embedder.
Args:
binary_model: Model for binary embeddings (default: BAAI/bge-small-en-v1.5)
        dense_model: Model for dense embeddings (default: BAAI/bge-small-en-v1.5)
use_gpu: Whether to use GPU acceleration
Returns:
Configured CascadeEmbeddingBackend instance
Example:
>>> embedder = get_cascade_embedder()
>>> binary, dense = embedder.encode_cascade(["hello world"])
>>> binary.shape # (1, 256)
        >>> dense.shape  # (1, 768) with the default TARGET_DIM
"""
return CascadeEmbeddingBackend(
binary_model=binary_model,
dense_model=dense_model,
use_gpu=use_gpu,
)