Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-10 02:24:35 +08:00
feat: Enhance embedding generation and search capabilities
- Added pre-calculation of the estimated chunk count for HNSW capacity in `generate_dense_embeddings_centralized` to optimize indexing performance.
- Implemented binary vector generation with memory-mapped storage for efficient cascade search, including metadata saving.
- Introduced SPLADE sparse index generation with improved handling and metadata storage.
- Updated `ChainSearchEngine` to prefer the centralized binary searcher for improved performance, with a fallback to the legacy binary index.
- Deprecated `BinaryANNIndex` in favor of `BinarySearcher` for better memory management and performance.
- Enhanced `SpladeEncoder` with warmup functionality to reduce latency spikes during first-time inference.
- Improved `SpladeIndex` with cache size adjustments for better query performance.
- Added methods for managing binary vectors in `VectorMetadataStore`, including batch insertion and retrieval.
- Created a new `BinarySearcher` class for efficient binary vector search using Hamming distance, supporting both memory-mapped and database loading modes (see the sketch below).
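The `BinarySearcher` named above does not appear in the hunks below, which only touch `SpladeEncoder`. As a rough, hypothetical sketch of the underlying technique (bit-packed vectors in a memory-mapped file, ranked by Hamming distance): the class name, file layout, and helpers here are assumptions, not the repository's actual API.

import numpy as np

class HammingSearcher:
    """Hypothetical sketch: brute-force Hamming search over bit-packed vectors.

    Assumes each dense vector was binarized by sign and packed with
    np.packbits into a row of dim // 8 bytes, with all rows concatenated
    in one flat file. Not the repository's actual BinarySearcher.
    """

    def __init__(self, path: str, num_vectors: int, dim: int):
        row_bytes = dim // 8
        # Memory-map the packed codes so the OS pages them in on demand
        self.codes = np.memmap(path, dtype=np.uint8, mode="r",
                               shape=(num_vectors, row_bytes))

    @staticmethod
    def binarize(vec: np.ndarray) -> np.ndarray:
        """Pack a float vector into bits: 1 where a component is positive."""
        return np.packbits(vec > 0)

    def search(self, query: np.ndarray, top_k: int = 100):
        """Return (row index, Hamming distance) pairs for the top_k nearest codes."""
        q = self.binarize(query)
        xor = np.bitwise_xor(self.codes, q)                   # differing bits
        distances = np.unpackbits(xor, axis=1).sum(axis=1)    # popcount per row
        order = np.argsort(distances)[: min(top_k, len(distances))]
        return [(int(i), int(distances[i])) for i in order]

In a cascade search, a Hamming ranking like this would typically serve as a cheap first pass, with the surviving candidates re-ranked using full-precision vectors.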
@@ -220,12 +220,16 @@ class SpladeEncoder:
         from transformers import AutoTokenizer
 
         if self.providers is None:
-            from .gpu_support import get_optimal_providers
+            from .gpu_support import get_optimal_providers, get_selected_device_id
 
-            # Include device_id options for DirectML/CUDA selection when available
+            # Get providers as pure string list (cache-friendly)
+            # NOTE: with_device_options=False to avoid tuple-based providers
+            # which break optimum's caching mechanism
             self.providers = get_optimal_providers(
-                use_gpu=self.use_gpu, with_device_options=True
+                use_gpu=self.use_gpu, with_device_options=False
             )
+            # Get device_id separately for provider_options
+            self._device_id = get_selected_device_id() if self.use_gpu else None
 
         # Some Optimum versions accept `providers`, others accept a single `provider`
         # Prefer passing the full providers list, with a conservative fallback
@@ -234,6 +238,15 @@ class SpladeEncoder:
         params = signature(ORTModelForMaskedLM.from_pretrained).parameters
         if "providers" in params:
             model_kwargs["providers"] = self.providers
+            # Pass device_id via provider_options for GPU selection
+            if "provider_options" in params and hasattr(self, '_device_id') and self._device_id is not None:
+                # Build provider_options dict for each GPU provider
+                provider_options = {}
+                for p in self.providers:
+                    if p in ("DmlExecutionProvider", "CUDAExecutionProvider", "ROCMExecutionProvider"):
+                        provider_options[p] = {"device_id": self._device_id}
+                if provider_options:
+                    model_kwargs["provider_options"] = provider_options
         elif "provider" in params:
             provider_name = "CPUExecutionProvider"
             if self.providers:
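The two hunks above rely on `get_optimal_providers` and `get_selected_device_id` from `.gpu_support`, whose bodies are not part of this commit. Below is a speculative sketch of that split (a cache-friendly list of plain provider strings, with the device index exposed separately). Only the function names and keyword arguments come from the diff; the bodies and the environment variable are illustrative placeholders.

import os
from typing import Optional

import onnxruntime as ort

_GPU_PROVIDERS = ("DmlExecutionProvider", "CUDAExecutionProvider", "ROCMExecutionProvider")

def get_optimal_providers(use_gpu: bool = True, with_device_options: bool = False) -> list:
    """Illustrative only: return execution providers, preferring plain strings.

    with_device_options=True would yield (name, {"device_id": ...}) tuples;
    as the diff's comment notes, tuple entries break optimum's caching, so the
    caller now asks for strings and fetches the device id separately.
    """
    available = ort.get_available_providers()
    providers = [p for p in available if p in _GPU_PROVIDERS] if use_gpu else []
    providers.append("CPUExecutionProvider")  # always keep a CPU fallback
    if not with_device_options:
        return providers
    device_id = get_selected_device_id()
    return [(p, {"device_id": device_id}) if p in _GPU_PROVIDERS else p for p in providers]

def get_selected_device_id() -> Optional[int]:
    """Illustrative only: device index chosen via a hypothetical environment variable."""
    raw = os.environ.get("CCW_GPU_DEVICE_ID")
    return int(raw) if raw and raw.isdigit() else None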
@@ -369,6 +382,21 @@ class SpladeEncoder:
 
         return sparse_dict
 
+    def warmup(self, text: str = "warmup query") -> None:
+        """Warmup the encoder by running a dummy inference.
+
+        First-time model inference includes initialization overhead.
+        Call this method once before the first real search to avoid
+        latency spikes.
+
+        Args:
+            text: Dummy text for warmup (default: "warmup query")
+        """
+        logger.info("Warming up SPLADE encoder...")
+        # Trigger model loading and first inference
+        _ = self.encode_text(text)
+        logger.info("SPLADE encoder warmup complete")
+
     def encode_text(self, text: str) -> Dict[int, float]:
         """Encode text to sparse vector {token_id: weight}.
 
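With the warmup hook in place, a caller pays the first-inference cost at startup rather than on the first query. A minimal usage sketch, assuming `SpladeEncoder` can be constructed with defaults (the constructor signature is not shown in this diff):

# Startup path of a search service (constructor arguments assumed)
encoder = SpladeEncoder()
encoder.warmup()                                             # triggers model load plus one dummy inference
sparse = encoder.encode_text("hybrid retrieval with SPLADE") # steady-state latency from here on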