Add comprehensive tests for query parsing and Reciprocal Rank Fusion

- Implemented tests for the QueryParser class, covering various identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
2026-02-12 02:37:45 +08:00 · 2025-12-16 10:20:19 +08:00
parent 35485bbbb1
commit 3da0ef2adb
39 changed files with 6171 additions and 240 deletions
--- a/codex-lens/src/codexlens/search/chain_search.py
+++ b/codex-lens/src/codexlens/search/chain_search.py
@@ -18,6 +18,7 @@ from codexlens.storage.registry import RegistryStore, DirMapping
 from codexlens.storage.dir_index import DirIndexStore, SubdirLink
 from codexlens.storage.path_mapper import PathMapper
 from codexlens.storage.sqlite_store import SQLiteStore
+from codexlens.search.hybrid_search import HybridSearchEngine


@dataclass
@@ -32,6 +33,9 @@ class SearchOptions:
        include_symbols: Whether to include symbol search results
        files_only: Return only file paths without excerpts
        include_semantic: Whether to include semantic keyword search results
+        hybrid_mode: Enable hybrid search with RRF fusion (default False)
+        enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
+        hybrid_weights: Custom RRF weights for hybrid search (optional)
    """
    depth: int = -1
    max_workers: int = 8
@@ -40,6 +44,9 @@ class SearchOptions:
    include_symbols: bool = False
    files_only: bool = False
    include_semantic: bool = False
+    hybrid_mode: bool = False
+    enable_fuzzy: bool = True
+    hybrid_weights: Optional[Dict[str, float]] = None


@dataclass
@@ -484,7 +491,10 @@ class ChainSearchEngine:
                query,
                options.limit_per_dir,
                options.files_only,
-                options.include_semantic
+                options.include_semantic,
+                options.hybrid_mode,
+                options.enable_fuzzy,
+                options.hybrid_weights
            ): idx_path
            for idx_path in index_paths
        }
@@ -507,7 +517,10 @@ class ChainSearchEngine:
                              query: str,
                              limit: int,
                              files_only: bool = False,
-                              include_semantic: bool = False) -> List[SearchResult]:
+                              include_semantic: bool = False,
+                              hybrid_mode: bool = False,
+                              enable_fuzzy: bool = True,
+                              hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
        """Search a single index database.

        Handles exceptions gracefully, returning empty list on failure.
@@ -518,39 +531,54 @@ class ChainSearchEngine:
            limit: Maximum results from this index
            files_only: If True, skip snippet generation for faster search
            include_semantic: If True, also search semantic keywords and merge results
+            hybrid_mode: If True, use hybrid search with RRF fusion
+            enable_fuzzy: Enable fuzzy FTS in hybrid mode
+            hybrid_weights: Custom RRF weights for hybrid search

        Returns:
            List of SearchResult objects (empty on error)
        """
        try:
-            with DirIndexStore(index_path) as store:
-                # Get FTS results
-                if files_only:
-                    # Fast path: return paths only without snippets
-                    paths = store.search_files_only(query, limit=limit)
-                    fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
-                else:
-                    fts_results = store.search_fts(query, limit=limit)
-                
-                # Optionally add semantic keyword results
-                if include_semantic:
-                    try:
-                        semantic_matches = store.search_semantic_keywords(query)
-                        # Convert semantic matches to SearchResult with 0.8x weight
-                        for file_entry, keywords in semantic_matches:
-                            # Create excerpt from keywords
-                            excerpt = f"Keywords: {', '.join(keywords[:5])}"
-                            # Use a base score of 10.0 for semantic matches, weighted by 0.8
-                            semantic_result = SearchResult(
-                                path=str(file_entry.full_path),
-                                score=10.0 * 0.8,
-                                excerpt=excerpt
-                            )
-                            fts_results.append(semantic_result)
-                    except Exception as sem_exc:
-                        self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
-                
-                return fts_results
+            # Use hybrid search if enabled
+            if hybrid_mode:
+                hybrid_engine = HybridSearchEngine(weights=hybrid_weights)
+                fts_results = hybrid_engine.search(
+                    index_path,
+                    query,
+                    limit=limit,
+                    enable_fuzzy=enable_fuzzy,
+                    enable_vector=False,  # Vector search not yet implemented
+                )
+            else:
+                # Legacy single-FTS search
+                with DirIndexStore(index_path) as store:
+                    # Get FTS results
+                    if files_only:
+                        # Fast path: return paths only without snippets
+                        paths = store.search_files_only(query, limit=limit)
+                        fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
+                    else:
+                        fts_results = store.search_fts(query, limit=limit)
+
+                    # Optionally add semantic keyword results
+                    if include_semantic:
+                        try:
+                            semantic_matches = store.search_semantic_keywords(query)
+                            # Convert semantic matches to SearchResult with 0.8x weight
+                            for file_entry, keywords in semantic_matches:
+                                # Create excerpt from keywords
+                                excerpt = f"Keywords: {', '.join(keywords[:5])}"
+                                # Use a base score of 10.0 for semantic matches, weighted by 0.8
+                                semantic_result = SearchResult(
+                                    path=str(file_entry.full_path),
+                                    score=10.0 * 0.8,
+                                    excerpt=excerpt
+                                )
+                                fts_results.append(semantic_result)
+                        except Exception as sem_exc:
+                            self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
+
+            return fts_results
        except Exception as exc:
            self.logger.debug(f"Search error in {index_path}: {exc}")
            return []