Add comprehensive tests for query parsing and Reciprocal Rank Fusion

- Implemented tests for the QueryParser class, covering various identifier-splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalizing BM25 scores and tagging search results with source metadata.
This commit is contained in:
catlog22
2025-12-16 10:20:19 +08:00
parent 35485bbbb1
commit 3da0ef2adb
39 changed files with 6171 additions and 240 deletions

View File

@@ -18,6 +18,7 @@ from codexlens.storage.registry import RegistryStore, DirMapping
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.sqlite_store import SQLiteStore
from codexlens.search.hybrid_search import HybridSearchEngine
@dataclass
@@ -32,6 +33,9 @@ class SearchOptions:
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results
hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
hybrid_weights: Custom RRF weights for hybrid search (optional)
"""
depth: int = -1
max_workers: int = 8
@@ -40,6 +44,9 @@ class SearchOptions:
include_symbols: bool = False
files_only: bool = False
include_semantic: bool = False
hybrid_mode: bool = False
enable_fuzzy: bool = True
hybrid_weights: Optional[Dict[str, float]] = None
@dataclass
@@ -484,7 +491,10 @@ class ChainSearchEngine:
query,
options.limit_per_dir,
options.files_only,
options.include_semantic
options.include_semantic,
options.hybrid_mode,
options.enable_fuzzy,
options.hybrid_weights
): idx_path
for idx_path in index_paths
}
@@ -507,7 +517,10 @@ class ChainSearchEngine:
query: str,
limit: int,
files_only: bool = False,
include_semantic: bool = False) -> List[SearchResult]:
include_semantic: bool = False,
hybrid_mode: bool = False,
enable_fuzzy: bool = True,
hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
"""Search a single index database.
Handles exceptions gracefully, returning empty list on failure.
@@ -518,39 +531,54 @@ class ChainSearchEngine:
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
include_semantic: If True, also search semantic keywords and merge results
hybrid_mode: If True, use hybrid search with RRF fusion
enable_fuzzy: Enable fuzzy FTS in hybrid mode
hybrid_weights: Custom RRF weights for hybrid search
Returns:
List of SearchResult objects (empty on error)
"""
try:
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
# Use hybrid search if enabled
if hybrid_mode:
hybrid_engine = HybridSearchEngine(weights=hybrid_weights)
fts_results = hybrid_engine.search(
index_path,
query,
limit=limit,
enable_fuzzy=enable_fuzzy,
enable_vector=False, # Vector search not yet implemented
)
else:
# Legacy single-FTS search
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
except Exception as exc:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []