feat(splade): add cache directory support for ONNX models and improve thread-local database connection handling

catlog22
2026-01-01 22:40:00 +08:00
parent 5bb01755bc
commit 195438d26a
2 changed files with 81 additions and 6 deletions
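
For context, a minimal usage sketch of the new parameter (the import path is an assumption; only get_splade_encoder and cache_dir appear in this diff):

    # Hypothetical import path; the diff does not show the module name.
    from splade_encoder import get_splade_encoder

    # First call downloads the model, exports it to ONNX, and writes it to
    # the cache directory; later calls load directly from the local cache.
    encoder = get_splade_encoder(cache_dir="/data/models/splade-cache")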

View File

@@ -15,6 +15,7 @@ from __future__ import annotations
 import logging
 import threading
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

 logger = logging.getLogger(__name__)
@@ -68,6 +69,7 @@ def get_splade_encoder(
     use_gpu: bool = True,
     max_length: int = 512,
     sparsity_threshold: float = 0.01,
+    cache_dir: Optional[str] = None,
 ) -> "SpladeEncoder":
     """Get or create cached SPLADE encoder (thread-safe singleton).
@@ -80,6 +82,7 @@ def get_splade_encoder(
         use_gpu: If True, use GPU acceleration when available
         max_length: Maximum sequence length for tokenization
         sparsity_threshold: Minimum weight to include in sparse vector
+        cache_dir: Directory to cache ONNX models (default: ~/.cache/codexlens/splade)

     Returns:
         Cached SpladeEncoder instance for the given configuration
@@ -100,6 +103,7 @@ def get_splade_encoder(
             use_gpu=use_gpu,
             max_length=max_length,
             sparsity_threshold=sparsity_threshold,
+            cache_dir=cache_dir,
         )
         # Pre-load model to ensure it's ready
         encoder._load_model()
@@ -151,6 +155,7 @@ class SpladeEncoder:
         max_length: int = 512,
         sparsity_threshold: float = 0.01,
         providers: Optional[List[Any]] = None,
+        cache_dir: Optional[str] = None,
     ) -> None:
         """Initialize SPLADE encoder.
@@ -160,6 +165,7 @@ class SpladeEncoder:
             max_length: Maximum sequence length for tokenization
             sparsity_threshold: Minimum weight to include in sparse vector
             providers: Explicit ONNX providers list (overrides use_gpu)
+            cache_dir: Directory to cache ONNX models (default: ~/.cache/codexlens/splade)
         """
         self.model_name = (model_name or self.DEFAULT_MODEL).strip()
         if not self.model_name:
@@ -170,13 +176,33 @@ class SpladeEncoder:
         self.sparsity_threshold = float(sparsity_threshold)
         self.providers = providers

+        # Setup ONNX cache directory
+        if cache_dir:
+            self._cache_dir = Path(cache_dir)
+        else:
+            self._cache_dir = Path.home() / ".cache" / "codexlens" / "splade"
+
         self._tokenizer: Any | None = None
         self._model: Any | None = None
         self._vocab_size: int | None = None
         self._lock = threading.RLock()

+    def _get_local_cache_path(self) -> Path:
+        """Get local cache path for this model's ONNX files.
+
+        Returns:
+            Path to the local ONNX cache directory for this model
+        """
+        # Replace / with -- for filesystem-safe naming
+        safe_name = self.model_name.replace("/", "--")
+        return self._cache_dir / safe_name
+
     def _load_model(self) -> None:
-        """Lazy load ONNX model and tokenizer."""
+        """Lazy load ONNX model and tokenizer.
+
+        First checks local cache for ONNX model, falling back to
+        HuggingFace download and conversion if not cached.
+        """
         if self._model is not None and self._tokenizer is not None:
             return
@@ -214,18 +240,48 @@ class SpladeEncoder:
                     first = self.providers[0]
                     provider_name = first[0] if isinstance(first, tuple) else str(first)
                     model_kwargs["provider"] = provider_name
-            except Exception:
+            except Exception as e:
+                logger.debug(f"Failed to inspect ORTModel signature: {e}")
                 model_kwargs = {}

+        # Check for local ONNX cache first
+        local_cache = self._get_local_cache_path()
+        onnx_model_path = local_cache / "model.onnx"
+
+        if onnx_model_path.exists():
+            # Load from local cache
+            logger.info(f"Loading SPLADE from local cache: {local_cache}")
+            try:
+                self._model = ORTModelForMaskedLM.from_pretrained(
+                    str(local_cache),
+                    **model_kwargs,
+                )
+                self._tokenizer = AutoTokenizer.from_pretrained(
+                    str(local_cache), use_fast=True
+                )
+                self._vocab_size = len(self._tokenizer)
+                logger.info(
+                    f"SPLADE loaded from cache: {self.model_name}, vocab={self._vocab_size}"
+                )
+                return
+            except Exception as e:
+                logger.warning(f"Failed to load from cache, redownloading: {e}")
+
+        # Download and convert from HuggingFace
+        logger.info(f"Downloading SPLADE model: {self.model_name}")
         try:
             self._model = ORTModelForMaskedLM.from_pretrained(
                 self.model_name,
+                export=True,  # Export to ONNX
                 **model_kwargs,
             )
             logger.debug(f"SPLADE model loaded: {self.model_name}")
         except TypeError:
+            # Fallback for older Optimum versions: retry without provider arguments
-            self._model = ORTModelForMaskedLM.from_pretrained(self.model_name)
+            self._model = ORTModelForMaskedLM.from_pretrained(
+                self.model_name,
+                export=True,
+            )
             logger.warning(
                 "Optimum version doesn't support provider parameters. "
                 "Upgrade optimum for GPU acceleration: pip install --upgrade optimum"
@@ -237,6 +293,15 @@ class SpladeEncoder:
         self._vocab_size = len(self._tokenizer)
         logger.debug(f"SPLADE tokenizer loaded: vocab_size={self._vocab_size}")

+        # Save to local cache for future use
+        try:
+            local_cache.mkdir(parents=True, exist_ok=True)
+            self._model.save_pretrained(str(local_cache))
+            self._tokenizer.save_pretrained(str(local_cache))
+            logger.info(f"SPLADE model cached to: {local_cache}")
+        except Exception as e:
+            logger.warning(f"Failed to cache SPLADE model: {e}")
+
     @staticmethod
     def _splade_activation(logits: Any, attention_mask: Any) -> Any:
         """Apply SPLADE activation function to model outputs.

View File

@@ -40,15 +40,25 @@ class SpladeIndex:
         self._local = threading.local()

     def _get_connection(self) -> sqlite3.Connection:
-        """Get or create a thread-local database connection."""
+        """Get or create a thread-local database connection.
+
+        Each thread gets its own connection to ensure thread safety.
+        Connections are stored in thread-local storage.
+        """
         conn = getattr(self._local, "conn", None)
         if conn is None:
-            conn = sqlite3.connect(self.db_path, check_same_thread=False)
+            # Thread-local connection - each thread has its own
+            conn = sqlite3.connect(
+                self.db_path,
+                timeout=30.0,  # Wait up to 30s for locks
+                check_same_thread=True,  # Enforce thread safety
+            )
             conn.row_factory = sqlite3.Row
             conn.execute("PRAGMA journal_mode=WAL")
             conn.execute("PRAGMA synchronous=NORMAL")
             conn.execute("PRAGMA foreign_keys=ON")
-            conn.execute("PRAGMA mmap_size=30000000000")  # 30GB limit
+            # Limit mmap to 1GB to avoid OOM on smaller systems
+            conn.execute("PRAGMA mmap_size=1073741824")
             self._local.conn = conn
         return conn
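
A minimal standalone sketch of the pattern the second file adopts, using only the standard library: with one connection per thread, check_same_thread=True (SQLite's default) can be enforced safely, since no connection object ever crosses thread boundaries.

    import sqlite3
    import threading

    _local = threading.local()

    def get_conn(db_path: str) -> sqlite3.Connection:
        # Each thread lazily creates and reuses its own connection;
        # check_same_thread=True raises if a connection leaks across threads.
        conn = getattr(_local, "conn", None)
        if conn is None:
            conn = sqlite3.connect(db_path, timeout=30.0, check_same_thread=True)
            _local.conn = conn
        return conn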