"""Multi-type embedding backends for cascade retrieval.
|
|
|
|
This module provides embedding backends optimized for cascade retrieval:
|
|
1. BinaryEmbeddingBackend - Fast coarse filtering with binary vectors
|
|
2. DenseEmbeddingBackend - High-precision dense vectors for reranking
|
|
3. CascadeEmbeddingBackend - Combined binary + dense for two-stage retrieval
|
|
|
|
Cascade retrieval workflow:
|
|
1. Binary search (fast, ~32 bytes/vector) -> top-K candidates
|
|
2. Dense rerank (precise, ~8KB/vector) -> final results
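
Example (coarse stage: Hamming distance over packed binary vectors, using
this module's own utilities):

    >>> a = pack_binary_embedding(np.ones(256, dtype=np.uint8))
    >>> b = pack_binary_embedding(np.zeros(256, dtype=np.uint8))
    >>> hamming_distance(a, b)
    256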
"""

from __future__ import annotations

import logging
from typing import Iterable, List, Optional, Tuple

import numpy as np

from codexlens.semantic.base import BaseEmbedder

logger = logging.getLogger(__name__)


# =============================================================================
# Utility Functions
# =============================================================================


def binarize_embedding(embedding: np.ndarray) -> np.ndarray:
    """Convert float embedding to binary vector.

    Applies sign-based quantization: values > 0 become 1, values <= 0 become 0.
    The operation is elementwise, so batched input of shape (n, dim) works too.

    Args:
        embedding: Float32 embedding of any dimension

    Returns:
        Binary vector (uint8 with values 0 or 1) of the same shape
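
    Example (note the strict > 0, so 0.0 maps to 0):
        >>> binarize_embedding(np.array([0.5, -0.2, 0.0, 1.3], dtype=np.float32))
        array([1, 0, 0, 1], dtype=uint8)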
    """
    return (embedding > 0).astype(np.uint8)


def pack_binary_embedding(binary_vector: np.ndarray) -> bytes:
    """Pack binary vector into compact bytes format.

    Packs 8 binary values into each byte for storage efficiency.
    For a 256-dim binary vector, the output is 32 bytes.

    Args:
        binary_vector: Binary vector (uint8 with values 0 or 1)

    Returns:
        Packed bytes (length = ceil(dim / 8))
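
    Example (np.packbits packs the most significant bit first):
        >>> bits = np.array([1, 0, 1, 1, 0, 0, 0, 1], dtype=np.uint8)
        >>> pack_binary_embedding(bits)
        b'\xb1'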
    """
    # Ensure vector length is a multiple of 8 by zero-padding if needed
    # (np.packbits would also zero-pad, but this keeps the intent explicit)
    dim = len(binary_vector)
    padded_dim = ((dim + 7) // 8) * 8
    if padded_dim > dim:
        padded = np.zeros(padded_dim, dtype=np.uint8)
        padded[:dim] = binary_vector
        binary_vector = padded

    # Pack 8 bits per byte
    packed = np.packbits(binary_vector)
    return packed.tobytes()


def unpack_binary_embedding(packed_bytes: bytes, dim: int = 256) -> np.ndarray:
    """Unpack bytes back to binary vector.

    Args:
        packed_bytes: Packed binary data
        dim: Original vector dimension (default: 256)

    Returns:
        Binary vector (uint8 with values 0 or 1)
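
    Example (round-trips pack_binary_embedding):
        >>> unpack_binary_embedding(b'\xb1', dim=8)
        array([1, 0, 1, 1, 0, 0, 0, 1], dtype=uint8)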
    """
    unpacked = np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8))
    return unpacked[:dim]


def hamming_distance(a: bytes, b: bytes) -> int:
    """Compute Hamming distance between two packed binary vectors.

    Uses XOR and popcount for efficient distance computation.

    Args:
        a: First packed binary vector
        b: Second packed binary vector

    Returns:
        Hamming distance (number of differing bits)
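
    Example:
        >>> hamming_distance(b'\x00', b'\x0f')
        4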
    """
    a_arr = np.frombuffer(a, dtype=np.uint8)
    b_arr = np.frombuffer(b, dtype=np.uint8)
    xor = np.bitwise_xor(a_arr, b_arr)
    return int(np.unpackbits(xor).sum())


# =============================================================================
# Binary Embedding Backend
# =============================================================================


class BinaryEmbeddingBackend(BaseEmbedder):
    """Generate 256-dimensional binary embeddings for fast coarse retrieval.

    Uses a lightweight embedding model and applies sign-based quantization
    to produce compact binary vectors (32 bytes per embedding).

    Suitable for:
    - First-stage candidate retrieval
    - Hamming distance-based similarity search
    - Memory-constrained environments

    Default model: BAAI/bge-small-en-v1.5 (384 dim) -> projected to 256 dims,
    then quantized to 256 bits.
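
    Example (sketch; requires the optional semantic dependencies and a
    model download on first use):
        >>> backend = BinaryEmbeddingBackend()
        >>> packed = backend.embed_packed("def foo(): pass")
        >>> len(packed[0])
        32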
    """

    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"  # 384 dim, fast
    BINARY_DIM = 256

    def __init__(
        self,
        model_name: Optional[str] = None,
        use_gpu: bool = True,
    ) -> None:
        """Initialize binary embedding backend.

        Args:
            model_name: Base embedding model name. Defaults to BAAI/bge-small-en-v1.5
            use_gpu: Whether to use GPU acceleration
        """
        from codexlens.semantic import SEMANTIC_AVAILABLE

        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        self._model_name = model_name or self.DEFAULT_MODEL
        self._use_gpu = use_gpu
        self._model = None

        # Projection matrix for dimension reduction (lazily initialized)
        self._projection_matrix: Optional[np.ndarray] = None

    @property
    def model_name(self) -> str:
        """Return model name."""
        return self._model_name

    @property
    def embedding_dim(self) -> int:
        """Return binary embedding dimension (256)."""
        return self.BINARY_DIM

    @property
    def packed_bytes(self) -> int:
        """Return packed bytes size (32 bytes for 256 bits)."""
        return self.BINARY_DIM // 8

    def _load_model(self) -> None:
        """Lazy load the embedding model."""
        if self._model is not None:
            return

        from fastembed import TextEmbedding
        from codexlens.semantic.gpu_support import get_optimal_providers

        providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
        try:
            self._model = TextEmbedding(
                model_name=self._model_name,
                providers=providers,
            )
        except TypeError:
            # Fallback for older fastembed versions
            self._model = TextEmbedding(model_name=self._model_name)

        logger.debug(f"BinaryEmbeddingBackend loaded model: {self._model_name}")

    def _get_projection_matrix(self, input_dim: int) -> np.ndarray:
        """Get or create projection matrix for dimension reduction.

        Uses random projection with a fixed seed for reproducibility.

        Args:
            input_dim: Input embedding dimension from base model

        Returns:
            Projection matrix of shape (input_dim, BINARY_DIM)
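
        Example (the fixed seed makes the matrix identical across instances;
        requires the optional semantic dependencies):
            >>> m1 = BinaryEmbeddingBackend()._get_projection_matrix(384)
            >>> m2 = BinaryEmbeddingBackend()._get_projection_matrix(384)
            >>> m1.shape, bool(np.allclose(m1, m2))
            ((384, 256), True)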
        """
        if self._projection_matrix is not None:
            return self._projection_matrix

        # Fixed seed for reproducibility across sessions
        rng = np.random.RandomState(42)
        # Gaussian random projection
        self._projection_matrix = rng.randn(input_dim, self.BINARY_DIM).astype(np.float32)
        # Normalize columns for consistent scale
        norms = np.linalg.norm(self._projection_matrix, axis=0, keepdims=True)
        self._projection_matrix /= (norms + 1e-8)

        return self._projection_matrix

    def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
        """Generate binary embeddings as numpy array.

        Args:
            texts: Single text or iterable of texts

        Returns:
            Binary embeddings of shape (n_texts, 256) with values 0 or 1
        """
        self._load_model()

        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Get base float embeddings
        float_embeddings = np.array(list(self._model.embed(texts)))
        input_dim = float_embeddings.shape[1]

        # Project to the target dimension if needed
        if input_dim != self.BINARY_DIM:
            projection = self._get_projection_matrix(input_dim)
            float_embeddings = float_embeddings @ projection

        # Binarize via sign quantization
        return binarize_embedding(float_embeddings)

    def embed_packed(self, texts: str | Iterable[str]) -> List[bytes]:
        """Generate packed binary embeddings.

        Args:
            texts: Single text or iterable of texts

        Returns:
            List of packed bytes (32 bytes each for 256-dim)
        """
        binary = self.embed_to_numpy(texts)
        return [pack_binary_embedding(vec) for vec in binary]


# =============================================================================
# Dense Embedding Backend
# =============================================================================


class DenseEmbeddingBackend(BaseEmbedder):
    """Generate higher-dimensional dense embeddings for precise reranking.

    Produces float32 vectors at TARGET_DIM (768 by default) for higher
    retrieval quality than the binary stage.

    Suitable for:
    - Second-stage reranking
    - High-precision similarity search
    - Quality-critical applications

    Default model: BAAI/bge-small-en-v1.5 (384 dim), optionally expanded
    to TARGET_DIM.
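
    Example (sketch; requires the optional semantic dependencies and a
    model download on first use):
        >>> backend = DenseEmbeddingBackend()
        >>> vecs = backend.embed_to_numpy(["hello world"])
        >>> vecs.shape
        (1, 768)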
    """

    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"  # 384 dim, use small for testing
    TARGET_DIM = 768  # Reduced target for faster testing

    def __init__(
        self,
        model_name: Optional[str] = None,
        use_gpu: bool = True,
        expand_dim: bool = True,
    ) -> None:
        """Initialize dense embedding backend.

        Args:
            model_name: Dense embedding model name. Defaults to DEFAULT_MODEL
                (BAAI/bge-small-en-v1.5)
            use_gpu: Whether to use GPU acceleration
            expand_dim: If True, expand embeddings to TARGET_DIM using a fixed
                identity-plus-random expansion (see _get_expansion_matrix)
        """
        from codexlens.semantic import SEMANTIC_AVAILABLE

        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        self._model_name = model_name or self.DEFAULT_MODEL
        self._use_gpu = use_gpu
        self._expand_dim = expand_dim
        self._model = None
        self._native_dim: Optional[int] = None

        # Expansion matrix for dimension expansion (lazily initialized)
        self._expansion_matrix: Optional[np.ndarray] = None

    @property
    def model_name(self) -> str:
        """Return model name."""
        return self._model_name

    @property
    def embedding_dim(self) -> int:
        """Return embedding dimension.

        Returns TARGET_DIM if expand_dim is True, otherwise the native model
        dimension.
        """
        if self._expand_dim:
            return self.TARGET_DIM
        # Return cached native dim or estimate based on model
        if self._native_dim is not None:
            return self._native_dim
        # Model dimension estimates
        model_dims = {
            "BAAI/bge-large-en-v1.5": 1024,
            "BAAI/bge-base-en-v1.5": 768,
            "BAAI/bge-small-en-v1.5": 384,
            "intfloat/multilingual-e5-large": 1024,
        }
        return model_dims.get(self._model_name, 1024)

    @property
    def max_tokens(self) -> int:
        """Return maximum token limit."""
        return 512  # Conservative default for large models

    def _load_model(self) -> None:
        """Lazy load the embedding model."""
        if self._model is not None:
            return

        from fastembed import TextEmbedding
        from codexlens.semantic.gpu_support import get_optimal_providers

        providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
        try:
            self._model = TextEmbedding(
                model_name=self._model_name,
                providers=providers,
            )
        except TypeError:
            # Fallback for older fastembed versions
            self._model = TextEmbedding(model_name=self._model_name)

        logger.debug(f"DenseEmbeddingBackend loaded model: {self._model_name}")

    def _get_expansion_matrix(self, input_dim: int) -> np.ndarray:
        """Get or create expansion matrix for dimension expansion.

        Builds an identity-plus-random expansion: the original coordinates
        pass through unchanged and the extra dimensions are filled with
        normalized random projections, so no information is lost.

        Args:
            input_dim: Input embedding dimension from base model

        Returns:
            Expansion matrix of shape (input_dim, TARGET_DIM)
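
        Example (structure check; requires the optional semantic dependencies):
            >>> m = DenseEmbeddingBackend()._get_expansion_matrix(384)
            >>> m.shape
            (384, 768)
            >>> bool(np.allclose(m[:, :384], np.eye(384)))
            True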
        """
        if self._expansion_matrix is not None:
            return self._expansion_matrix

        # Fixed seed for reproducibility
        rng = np.random.RandomState(123)

        # Start from zeros, then copy the original coordinates through
        self._expansion_matrix = np.zeros((input_dim, self.TARGET_DIM), dtype=np.float32)
        copy_dim = min(input_dim, self.TARGET_DIM)
        self._expansion_matrix[:copy_dim, :copy_dim] = np.eye(copy_dim, dtype=np.float32)

        # Fill the remaining columns with normalized random projections
        if self.TARGET_DIM > input_dim:
            random_part = rng.randn(input_dim, self.TARGET_DIM - input_dim).astype(np.float32)
            norms = np.linalg.norm(random_part, axis=0, keepdims=True)
            random_part /= (norms + 1e-8)
            self._expansion_matrix[:, input_dim:] = random_part

        return self._expansion_matrix

    def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
        """Generate dense embeddings as numpy array.

        Args:
            texts: Single text or iterable of texts

        Returns:
            Dense embeddings as float32, shape (n_texts, TARGET_DIM) when
            expansion applies; if the native dimension already meets or
            exceeds TARGET_DIM, the native shape is returned unchanged.
        """
        self._load_model()

        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Get base float embeddings
        float_embeddings = np.array(list(self._model.embed(texts)), dtype=np.float32)
        self._native_dim = float_embeddings.shape[1]

        # Expand to the target dimension if needed
        if self._expand_dim and self._native_dim < self.TARGET_DIM:
            expansion = self._get_expansion_matrix(self._native_dim)
            float_embeddings = float_embeddings @ expansion

        return float_embeddings


# =============================================================================
# Cascade Embedding Backend
# =============================================================================


class CascadeEmbeddingBackend(BaseEmbedder):
    """Combined binary + dense embedding backend for cascade retrieval.

    Generates both binary (for fast coarse filtering) and dense (for precise
    reranking) embeddings in a single call, optimized for two-stage retrieval.

    Cascade workflow:
    1. encode_cascade() returns (binary_embeddings, dense_embeddings)
    2. Binary search: Use Hamming distance on binary vectors -> top-K candidates
    3. Dense rerank: Use cosine similarity on dense vectors -> final results

    Memory efficiency (at the defaults):
    - Binary: 32 bytes per vector (256 bits)
    - Dense: 3072 bytes per vector (768 x float32)
    - Total: ~3 KB per document for full cascade support
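
    Example (sketch; requires the optional semantic dependencies and
    downloads both models on first use):
        >>> emb = CascadeEmbeddingBackend()
        >>> binary, dense = emb.encode_cascade(["hello world"])
        >>> binary.shape, dense.shape
        ((1, 256), (1, 768))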
    """

    def __init__(
        self,
        binary_model: Optional[str] = None,
        dense_model: Optional[str] = None,
        use_gpu: bool = True,
    ) -> None:
        """Initialize cascade embedding backend.

        Args:
            binary_model: Model for binary embeddings. Defaults to
                BinaryEmbeddingBackend.DEFAULT_MODEL (BAAI/bge-small-en-v1.5)
            dense_model: Model for dense embeddings. Defaults to
                DenseEmbeddingBackend.DEFAULT_MODEL (BAAI/bge-small-en-v1.5)
            use_gpu: Whether to use GPU acceleration
        """
        self._binary_backend = BinaryEmbeddingBackend(
            model_name=binary_model,
            use_gpu=use_gpu,
        )
        self._dense_backend = DenseEmbeddingBackend(
            model_name=dense_model,
            use_gpu=use_gpu,
            expand_dim=True,
        )
        self._use_gpu = use_gpu

    @property
    def model_name(self) -> str:
        """Return model names for both backends."""
        return f"cascade({self._binary_backend.model_name}, {self._dense_backend.model_name})"

    @property
    def embedding_dim(self) -> int:
        """Return dense embedding dimension (for compatibility)."""
        return self._dense_backend.embedding_dim

    @property
    def binary_dim(self) -> int:
        """Return binary embedding dimension."""
        return self._binary_backend.embedding_dim

    @property
    def dense_dim(self) -> int:
        """Return dense embedding dimension."""
        return self._dense_backend.embedding_dim

    def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
        """Generate dense embeddings (for BaseEmbedder compatibility).

        For cascade embeddings, use encode_cascade() instead.

        Args:
            texts: Single text or iterable of texts

        Returns:
            Dense embeddings of shape (n_texts, dense_dim)
        """
        return self._dense_backend.embed_to_numpy(texts)

    def encode_cascade(
        self,
        texts: str | Iterable[str],
        batch_size: int = 32,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Generate both binary and dense embeddings.

        Args:
            texts: Single text or iterable of texts
            batch_size: Batch size for processing (currently unused)

        Returns:
            Tuple of:
            - binary_embeddings: Shape (n_texts, 256), uint8 values 0/1
            - dense_embeddings: Shape (n_texts, dense_dim), float32
        """
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        binary_embeddings = self._binary_backend.embed_to_numpy(texts)
        dense_embeddings = self._dense_backend.embed_to_numpy(texts)

        return binary_embeddings, dense_embeddings

    def encode_binary(self, texts: str | Iterable[str]) -> np.ndarray:
        """Generate only binary embeddings.

        Args:
            texts: Single text or iterable of texts

        Returns:
            Binary embeddings of shape (n_texts, 256)
        """
        return self._binary_backend.embed_to_numpy(texts)

    def encode_dense(self, texts: str | Iterable[str]) -> np.ndarray:
        """Generate only dense embeddings.

        Args:
            texts: Single text or iterable of texts

        Returns:
            Dense embeddings of shape (n_texts, dense_dim)
        """
        return self._dense_backend.embed_to_numpy(texts)

    def encode_binary_packed(self, texts: str | Iterable[str]) -> List[bytes]:
        """Generate packed binary embeddings.

        Args:
            texts: Single text or iterable of texts

        Returns:
            List of packed bytes (32 bytes each)
        """
        return self._binary_backend.embed_packed(texts)


# =============================================================================
# Factory Function
# =============================================================================


def get_cascade_embedder(
    binary_model: Optional[str] = None,
    dense_model: Optional[str] = None,
    use_gpu: bool = True,
) -> CascadeEmbeddingBackend:
    """Factory function to create a cascade embedder.

    Args:
        binary_model: Model for binary embeddings (default: BAAI/bge-small-en-v1.5)
        dense_model: Model for dense embeddings (default: BAAI/bge-small-en-v1.5)
        use_gpu: Whether to use GPU acceleration

    Returns:
        Configured CascadeEmbeddingBackend instance

    Example:
        >>> embedder = get_cascade_embedder()
        >>> binary, dense = embedder.encode_cascade(["hello world"])
        >>> binary.shape  # (1, 256)
        >>> dense.shape   # (1, 768)
    """
    return CascadeEmbeddingBackend(
        binary_model=binary_model,
        dense_model=dense_model,
        use_gpu=use_gpu,
    )