feat: Add multi-type embedding backends for cascade retrieval

- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend, which produces high-precision 2048-dimensional dense vectors for reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval.
- Introduced utility functions for embedding conversion and distance computation.

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration.
2026-02-14 02:42:04 +08:00 · 2026-01-02 10:52:43 +08:00
parent 195438d26a
commit e21d801523
13 changed files with 3449 additions and 6 deletions
--- a/codex-lens/src/codexlens/search/hybrid_search.py
+++ b/codex-lens/src/codexlens/search/hybrid_search.py
@@ -40,11 +40,20 @@ from codexlens.search.ranking import (
    get_rrf_weights,
    reciprocal_rank_fusion,
    rerank_results,
+    simple_weighted_fusion,
    tag_search_source,
 )
 from codexlens.storage.dir_index import DirIndexStore


+# Three-way fusion weights (FTS + SPLADE + Vector), listed in key order; values sum to 1.0
+THREE_WAY_WEIGHTS = {
+    "exact": 0.2,
+    "splade": 0.3,
+    "vector": 0.5,
+}
+
+
 class HybridSearchEngine:
    """Hybrid search engine with parallel execution and RRF fusion.

@@ -193,9 +202,22 @@ class HybridSearchEngine:
            if source in results_map
        }

-        with timer("rrf_fusion", self.logger):
+        # Determine fusion method from config (default: rrf)
+        fusion_method = "rrf"
+        rrf_k = 60
+        if self._config is not None:
+            fusion_method = getattr(self._config, "fusion_method", "rrf") or "rrf"
+            rrf_k = getattr(self._config, "rrf_k", 60) or 60
+
+        with timer("fusion", self.logger):
            adaptive_weights = get_rrf_weights(query, active_weights)
-            fused_results = reciprocal_rank_fusion(results_map, adaptive_weights)
+            if fusion_method == "simple":
+                fused_results = simple_weighted_fusion(results_map, adaptive_weights)
+            else:
+                # Default to RRF
+                fused_results = reciprocal_rank_fusion(
+                    results_map, adaptive_weights, k=rrf_k
+                )

        # Optional: boost results that include explicit symbol matches
        boost_factor = (