feat: Enhance embedding generation and search capabilities

- Added pre-calculation of the estimated chunk count in `generate_dense_embeddings_centralized` so the HNSW index capacity can be sized up front, improving indexing performance (see the capacity sketch after this list).
- Implemented binary vector generation with memory-mapped storage for efficient cascade search, including metadata saving (see the packing/Hamming sketch after this list).
- Introduced SPLADE sparse index generation with improved handling and metadata storage.
- Updated `ChainSearchEngine` to prefer the centralized binary searcher for improved performance, with a fallback to the legacy binary index (see the fallback sketch after this list).
- Deprecated `BinaryANNIndex` in favor of `BinarySearcher` for better memory management and performance.
- Enhanced `SpladeEncoder` with a `warmup()` method that runs a dummy inference so the first real query does not pay the model-initialization latency spike.
- Improved `SpladeIndex` with cache size adjustments for better query performance.
- Added methods for managing binary vectors in `VectorMetadataStore`, including batch insertion and retrieval (see the store sketch after this list).
- Created a new `BinarySearcher` class for efficient binary vector search using Hamming distance, supporting both memory-mapped and database loading modes.
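
A minimal sketch of the capacity pre-calculation, assuming `hnswlib` as the HNSW backend; the helper name `estimate_total_chunks` and the per-file average are illustrative, not taken from this commit:

```python
import hnswlib  # assumed HNSW backend; the actual library may differ

def estimate_total_chunks(file_paths, avg_chunks_per_file: int = 8) -> int:
    # Rough, illustrative estimate: real chunk counts are only known after
    # chunking, so capacity is pre-calculated from a per-file average.
    return max(len(file_paths) * avg_chunks_per_file, 1)

def build_hnsw_index(file_paths, dim: int) -> "hnswlib.Index":
    estimated = estimate_total_chunks(file_paths)
    index = hnswlib.Index(space="cosine", dim=dim)
    # Sizing max_elements up front avoids repeated resize_index() calls
    # (and their re-allocation cost) while embeddings stream in.
    index.init_index(max_elements=estimated, ef_construction=200, M=16)
    return index
```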
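
The binary path (packed vectors, memory-mapped storage, Hamming-distance search) can be illustrated with the sketch below; `pack_binary_vectors`, `save_binary_memmap`, and `hamming_search` are hypothetical names, not the actual `BinarySearcher` API:

```python
import numpy as np

def pack_binary_vectors(dense: np.ndarray) -> np.ndarray:
    """Binarize dense embeddings by sign and pack 8 bits per byte."""
    bits = (dense > 0).astype(np.uint8)      # (n, dim) of 0/1
    return np.packbits(bits, axis=1)         # (n, ceil(dim / 8)) uint8

def save_binary_memmap(packed: np.ndarray, path: str) -> None:
    """Write packed vectors to a flat file that searches can mmap later."""
    mm = np.memmap(path, dtype=np.uint8, mode="w+", shape=packed.shape)
    mm[:] = packed
    mm.flush()

# Precomputed popcount lookup table for Hamming distance over packed bytes.
_POPCOUNT = np.array([bin(i).count("1") for i in range(256)], dtype=np.uint8)

def hamming_search(query_packed: np.ndarray, packed: np.ndarray, top_k: int = 10):
    """Return (indices, distances) of the top_k nearest rows to query_packed."""
    xor = np.bitwise_xor(packed, query_packed)   # broadcasts over rows
    dists = _POPCOUNT[xor].sum(axis=1)
    order = np.argsort(dists)[:top_k]
    return order, dists[order]
```

At load time the same file can be memory-mapped read-only (`np.memmap(path, dtype=np.uint8, mode="r", shape=...)`) or the bytes can be fetched back from the metadata store, which is roughly the distinction between the two loading modes mentioned above.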
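
The searcher preference in `ChainSearchEngine` presumably reduces to a try-then-fallback pattern along these lines (the function and loader names are illustrative):

```python
import logging

logger = logging.getLogger(__name__)

def select_binary_searcher(load_centralized, load_legacy):
    """Prefer the centralized BinarySearcher; fall back to the legacy index."""
    try:
        return load_centralized()
    except FileNotFoundError:
        # Legacy BinaryANNIndex is deprecated but still usable when the
        # centralized binary index has not been generated yet.
        logger.warning("Centralized binary index missing; falling back to legacy BinaryANNIndex")
        return load_legacy()
```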
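
For the new `VectorMetadataStore` methods, here is a sketch of batch insertion and retrieval, assuming an SQLite-backed store and a hypothetical `binary_vectors(chunk_id, vector)` table; the actual schema and method names may differ:

```python
import sqlite3
from typing import Dict, Iterable, List, Tuple

def insert_binary_vectors_batch(conn: sqlite3.Connection,
                                rows: Iterable[Tuple[str, bytes]]) -> None:
    # Batch insert of (chunk_id, packed_vector) pairs in a single transaction.
    conn.executemany(
        "INSERT OR REPLACE INTO binary_vectors (chunk_id, vector) VALUES (?, ?)",
        list(rows),
    )
    conn.commit()

def get_binary_vectors(conn: sqlite3.Connection,
                       chunk_ids: List[str]) -> Dict[str, bytes]:
    placeholders = ",".join("?" * len(chunk_ids))
    cur = conn.execute(
        f"SELECT chunk_id, vector FROM binary_vectors WHERE chunk_id IN ({placeholders})",
        chunk_ids,
    )
    return {chunk_id: vector for chunk_id, vector in cur.fetchall()}
```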
Author: catlog22
Date: 2026-01-02 23:57:55 +08:00
Parent: 96b44e1482
Commit: 54fd94547c
12 changed files with 945 additions and 167 deletions


@@ -220,12 +220,16 @@ class SpladeEncoder:
         from transformers import AutoTokenizer
 
         if self.providers is None:
-            from .gpu_support import get_optimal_providers
-            # Include device_id options for DirectML/CUDA selection when available
+            from .gpu_support import get_optimal_providers, get_selected_device_id
+            # Get providers as pure string list (cache-friendly)
+            # NOTE: with_device_options=False to avoid tuple-based providers
+            #       which break optimum's caching mechanism
             self.providers = get_optimal_providers(
-                use_gpu=self.use_gpu, with_device_options=True
+                use_gpu=self.use_gpu, with_device_options=False
             )
+            # Get device_id separately for provider_options
+            self._device_id = get_selected_device_id() if self.use_gpu else None
 
         # Some Optimum versions accept `providers`, others accept a single `provider`
         # Prefer passing the full providers list, with a conservative fallback
@@ -234,6 +238,15 @@ class SpladeEncoder:
         params = signature(ORTModelForMaskedLM.from_pretrained).parameters
         if "providers" in params:
             model_kwargs["providers"] = self.providers
+            # Pass device_id via provider_options for GPU selection
+            if "provider_options" in params and hasattr(self, '_device_id') and self._device_id is not None:
+                # Build provider_options dict for each GPU provider
+                provider_options = {}
+                for p in self.providers:
+                    if p in ("DmlExecutionProvider", "CUDAExecutionProvider", "ROCMExecutionProvider"):
+                        provider_options[p] = {"device_id": self._device_id}
+                if provider_options:
+                    model_kwargs["provider_options"] = provider_options
         elif "provider" in params:
             provider_name = "CPUExecutionProvider"
             if self.providers:
@@ -369,6 +382,21 @@ class SpladeEncoder:
         return sparse_dict
 
+    def warmup(self, text: str = "warmup query") -> None:
+        """Warmup the encoder by running a dummy inference.
+
+        First-time model inference includes initialization overhead.
+        Call this method once before the first real search to avoid
+        latency spikes.
+
+        Args:
+            text: Dummy text for warmup (default: "warmup query")
+        """
+        logger.info("Warming up SPLADE encoder...")
+        # Trigger model loading and first inference
+        _ = self.encode_text(text)
+        logger.info("SPLADE encoder warmup complete")
+
     def encode_text(self, text: str) -> Dict[int, float]:
         """Encode text to sparse vector {token_id: weight}.