Implement SPLADE sparse encoder and associated database migrations

- Added `splade_encoder.py` for ONNX-optimized SPLADE encoding, including methods for encoding text and batch processing. - Created `SPLADE_IMPLEMENTATION.md` to document the SPLADE encoder's functionality, design patterns, and integration points. - Introduced migration script `migration_009_add_splade.py` to add SPLADE metadata and posting list tables to the database. - Developed `splade_index.py` for managing the SPLADE inverted index, supporting efficient sparse vector retrieval. - Added verification script `verify_watcher.py` to test FileWatcher event filtering and debouncing functionality.
2026-02-12 02:37:45 +08:00 · 2026-01-01 17:41:22 +08:00
parent 520f2d26f2
commit 5bb01755bc
16 changed files with 3122 additions and 2792 deletions
--- a/codex-lens/src/codexlens/search/ranking.py
+++ b/codex-lens/src/codexlens/search/ranking.py
@@ -1,7 +1,7 @@
 """Ranking algorithms for hybrid search result fusion.

 Implements Reciprocal Rank Fusion (RRF) and score normalization utilities
-for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search).
+for combining results from heterogeneous search backends (SPLADE, exact FTS, fuzzy FTS, vector search).
 """

 from __future__ import annotations
@@ -14,6 +14,20 @@ from typing import Any, Dict, List
 from codexlens.entities import SearchResult, AdditionalLocation


+# Default RRF weights for SPLADE-based hybrid search
+DEFAULT_WEIGHTS = {
+    "splade": 0.4,  # Replaces exact(0.3) + fuzzy(0.1)
+    "vector": 0.6,
+}
+
+# Legacy weights for FTS fallback mode (when SPLADE unavailable)
+FTS_FALLBACK_WEIGHTS = {
+    "exact": 0.3,
+    "fuzzy": 0.1,
+    "vector": 0.6,
+}
+
+
 class QueryIntent(str, Enum):
    """Query intent for adaptive RRF weights (Python/TypeScript parity)."""

@@ -87,15 +101,24 @@ def adjust_weights_by_intent(
    intent: QueryIntent,
    base_weights: Dict[str, float],
 ) -> Dict[str, float]:
-    """Map intent → weights (kept aligned with TypeScript mapping)."""
+    """Adjust RRF weights based on query intent."""
+    # Check if using SPLADE or FTS mode
+    use_splade = "splade" in base_weights
+    
    if intent == QueryIntent.KEYWORD:
-        target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}
+        if use_splade:
+            target = {"splade": 0.6, "vector": 0.4}
+        else:
+            target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}
    elif intent == QueryIntent.SEMANTIC:
-        target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}
+        if use_splade:
+            target = {"splade": 0.3, "vector": 0.7}
+        else:
+            target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}
    else:
        target = dict(base_weights)
-
-    # Preserve only keys that are present in base_weights (active backends).
+    
+    # Filter to active backends
    keys = list(base_weights.keys())
    filtered = {k: float(target.get(k, 0.0)) for k in keys}
    return normalize_weights(filtered)