"""Embedder for semantic code search using fastembed.
Supports GPU acceleration via ONNX execution providers (CUDA, TensorRT, DirectML, ROCm, CoreML).
GPU acceleration is automatic when available, with transparent CPU fallback.
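Typical use (illustrative sketch; get_embedder and Embedder are defined below):
embedder = get_embedder(profile="code")      # cached instance, GPU when available
vectors = embedder.embed(["def add(a, b): return a + b"])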
"""
from __future__ import annotations
import gc
import logging
import threading
from typing import Dict, Iterable, List, Optional
import numpy as np
from . import SEMANTIC_AVAILABLE
from .base import BaseEmbedder
from .gpu_support import get_optimal_providers, is_gpu_available, get_gpu_summary, get_selected_device_id
logger = logging.getLogger(__name__)
# Global embedder cache for singleton pattern
_embedder_cache: Dict[str, "Embedder"] = {}
_cache_lock = threading.RLock()
def get_embedder(profile: str = "code", use_gpu: bool = True) -> "Embedder":
"""Get or create a cached Embedder instance (thread-safe singleton).
This function provides a significant performance improvement by reusing
Embedder instances across multiple searches, avoiding the repeated model
loading overhead (~0.8s per load).
Args:
profile: Model profile ("fast", "code", "multilingual", "balanced")
use_gpu: If True, use GPU acceleration when available (default: True)
Returns:
Cached Embedder instance for the given profile and GPU preference
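Example (illustrative; the identity check reflects the caching described above):
e1 = get_embedder(profile="code")
e2 = get_embedder(profile="code")
assert e1 is e2  # same cached instance, model loaded only once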
"""
global _embedder_cache
# Cache key includes GPU preference to support mixed configurations
cache_key = f"{profile}:{'gpu' if use_gpu else 'cpu'}"
# All cache access is protected by _cache_lock to avoid races with
# clear_embedder_cache() during concurrent access.
with _cache_lock:
embedder = _embedder_cache.get(cache_key)
if embedder is not None:
return embedder
# Create new embedder and cache it
embedder = Embedder(profile=profile, use_gpu=use_gpu)
# Pre-load model to ensure it's ready
embedder._load_model()
_embedder_cache[cache_key] = embedder
# Log GPU status on first embedder creation
if use_gpu and is_gpu_available():
logger.info(f"Embedder initialized with GPU: {get_gpu_summary()}")
elif use_gpu:
logger.debug("GPU not available, using CPU for embeddings")
return embedder
def clear_embedder_cache() -> None:
"""Clear the embedder cache and release ONNX resources.
This function ensures proper cleanup of ONNX model resources to prevent
memory leaks when embedders are no longer needed.
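Example (illustrative):
clear_embedder_cache()              # frees cached models and ONNX sessions
embedder = get_embedder("code")     # a later call reloads the model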
"""
global _embedder_cache
with _cache_lock:
# Release ONNX resources before clearing cache
for embedder in _embedder_cache.values():
if embedder._model is not None:
del embedder._model
embedder._model = None
_embedder_cache.clear()
gc.collect()
class Embedder(BaseEmbedder):
"""Generate embeddings for code chunks using fastembed (ONNX-based).
Supported Model Profiles:
- fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized
- code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages
- multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support
- balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose
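Example (illustrative; profiles map to the MODELS table defined below):
embedder = Embedder(profile="code")                        # 768-dim code model
embedder = Embedder(model_name="BAAI/bge-small-en-v1.5")   # explicit name wins over profile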
"""
# Model profiles for different use cases
MODELS = {
"fast": "BAAI/bge-small-en-v1.5", # 384 dim - Fast, lightweight
"code": "jinaai/jina-embeddings-v2-base-code", # 768 dim - Code-optimized
"multilingual": "intfloat/multilingual-e5-large", # 1024 dim - Multilingual
"balanced": "mixedbread-ai/mxbai-embed-large-v1", # 1024 dim - High accuracy
}
# Dimension mapping for each model
MODEL_DIMS = {
"BAAI/bge-small-en-v1.5": 384,
"jinaai/jina-embeddings-v2-base-code": 768,
"intfloat/multilingual-e5-large": 1024,
"mixedbread-ai/mxbai-embed-large-v1": 1024,
}
# Default model (fast profile)
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
DEFAULT_PROFILE = "fast"
def __init__(
self,
model_name: str | None = None,
profile: str | None = None,
use_gpu: bool = True,
providers: List[str] | None = None,
) -> None:
"""Initialize embedder with model or profile.
Args:
model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code")
profile: Model profile shortcut ("fast", "code", "multilingual", "balanced")
If both are provided, model_name takes precedence.
use_gpu: If True, use GPU acceleration when available (default: True)
providers: Explicit ONNX providers list (overrides use_gpu if provided)
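Example (illustrative; provider names are ONNX Runtime names and must be available in your onnxruntime build):
embedder = Embedder(profile="fast", use_gpu=False)   # CPU only
embedder = Embedder(profile="code", providers=["CUDAExecutionProvider", "CPUExecutionProvider"])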
"""
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
# Resolve model name from profile or use explicit name
if model_name:
self._model_name = model_name
elif profile and profile in self.MODELS:
self._model_name = self.MODELS[profile]
else:
self._model_name = self.DEFAULT_MODEL
# Configure ONNX execution providers with device_id options for GPU selection
# Using with_device_options=True ensures DirectML/CUDA device_id is passed correctly
if providers is not None:
self._providers = providers
else:
self._providers = get_optimal_providers(use_gpu=use_gpu, with_device_options=True)
self._use_gpu = use_gpu
self._model = None
@property
def model_name(self) -> str:
"""Get model name."""
return self._model_name
@property
def embedding_dim(self) -> int:
"""Get embedding dimension for current model."""
return self.MODEL_DIMS.get(self._model_name, 768) # Default to 768 if unknown
@property
def max_tokens(self) -> int:
"""Get maximum token limit for current model.
Returns:
int: Maximum number of tokens based on model profile.
- fast: 512 (lightweight, optimized for speed)
- code: 8192 (code-optimized, larger context)
- multilingual: 512 (standard multilingual model)
- balanced: 512 (general purpose)
"""
# Determine profile from model name
profile = None
for prof, model in self.MODELS.items():
if model == self._model_name:
profile = prof
break
# Return token limit based on profile
if profile == "code":
return 8192
elif profile in ("fast", "multilingual", "balanced"):
return 512
else:
# Default for unknown models
return 512
@property
def providers(self) -> List[str]:
"""Get configured ONNX execution providers."""
return self._providers
@property
def is_gpu_enabled(self) -> bool:
"""Check if GPU acceleration is enabled for this embedder."""
gpu_providers = {"CUDAExecutionProvider", "TensorrtExecutionProvider",
"DmlExecutionProvider", "ROCMExecutionProvider", "CoreMLExecutionProvider"}
# Handle both string providers and tuple providers (name, options)
for p in self._providers:
provider_name = p[0] if isinstance(p, tuple) else p
if provider_name in gpu_providers:
return True
return False
def _load_model(self) -> None:
"""Lazy load the embedding model with configured providers."""
if self._model is not None:
return
from fastembed import TextEmbedding
# providers already include device_id options via get_optimal_providers(with_device_options=True)
# DO NOT pass device_ids separately - fastembed ignores it when providers is specified
# See: fastembed/text/onnx_embedding.py - device_ids is only used with cuda=True
try:
self._model = TextEmbedding(
model_name=self.model_name,
providers=self._providers,
)
logger.debug(f"Model loaded with providers: {self._providers}")
except TypeError:
# Fallback for older fastembed versions without providers parameter
logger.warning(
"fastembed version doesn't support 'providers' parameter. "
"Upgrade fastembed for GPU acceleration: pip install --upgrade fastembed"
)
self._model = TextEmbedding(model_name=self.model_name)
def embed(self, texts: str | Iterable[str]) -> List[List[float]]:
"""Generate embeddings for one or more texts.
Args:
texts: Single text or iterable of texts to embed.
Returns:
List of embedding vectors (each is a list of floats).
Note:
This method converts numpy arrays to Python lists for backward compatibility.
For memory-efficient processing, use embed_to_numpy() instead.
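Example (illustrative; embedder is any Embedder instance):
vectors = embedder.embed("def add(a, b): return a + b")
assert len(vectors) == 1 and len(vectors[0]) == embedder.embedding_dim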
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
embeddings = list(self._model.embed(texts))
return [emb.tolist() for emb in embeddings]
def embed_to_numpy(self, texts: str | Iterable[str], batch_size: Optional[int] = None) -> np.ndarray:
"""Generate embeddings for one or more texts (returns numpy arrays).
This method is more memory-efficient than embed() because it returns the
embeddings as a numpy array instead of converting them to Python lists,
which can significantly reduce memory usage during batch processing.
Args:
texts: Single text or iterable of texts to embed.
batch_size: Optional batch size for fastembed processing.
Larger values improve GPU utilization but use more memory.
Returns:
numpy.ndarray of shape (n_texts, embedding_dim) containing embeddings.
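Example (illustrative; chunks is any list of strings, and a larger batch_size trades memory for GPU throughput):
matrix = embedder.embed_to_numpy(chunks, batch_size=512)
assert matrix.shape == (len(chunks), embedder.embedding_dim)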
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
# Pass batch_size to fastembed for optimal GPU utilization
# Default batch_size in fastembed is 256, but larger values can improve throughput
if batch_size is not None:
embeddings = list(self._model.embed(texts, batch_size=batch_size))
else:
embeddings = list(self._model.embed(texts))
return np.array(embeddings)
def embed_single(self, text: str) -> List[float]:
"""Generate embedding for a single text."""
return self.embed(text)[0]
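# Illustrative usage sketch (not part of the module API): run this file directly to
# embed a couple of snippets end to end. Assumes the optional [semantic] extra is
# installed; model weights are downloaded on first use.
if __name__ == "__main__":
    demo_embedder = get_embedder(profile="code")
    demo_vectors = demo_embedder.embed_to_numpy(
        ["def add(a, b): return a + b", "SELECT * FROM users"]
    )
    print(
        f"model={demo_embedder.model_name} "
        f"gpu={demo_embedder.is_gpu_enabled} "
        f"shape={demo_vectors.shape}"
    )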