Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-03-19 18:58:47 +08:00).
Commit: Major improvements to smart-search, chain-search cascade, ranking pipeline, reranker factory, CLI history store, codex-lens integration, and uv-manager. Simplify command-generator skill by inlining phases. Add comprehensive tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
636 lines
23 KiB
Python
636 lines
23 KiB
Python
"""Unit tests for HybridSearchEngine - parallel search and RRF fusion.
|
|
|
|
Tests cover:
|
|
- search: exact only, fuzzy enabled, vector enabled, pure vector mode
|
|
- search: RRF fusion, empty query, no results, reranking, category filtering
|
|
- _search_parallel: parallel backend execution
|
|
- _search_lsp_graph: LSP graph expansion with seeds, vector-to-FTS fallback
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
from unittest.mock import MagicMock, Mock, patch, PropertyMock
|
|
|
|
import pytest
|
|
|
|
from codexlens.config import Config
|
|
from codexlens.entities import SearchResult
|
|
from codexlens.search.hybrid_search import HybridSearchEngine
|
|
|
|
|
|
# =============================================================================
|
|
# Test Fixtures
|
|
# =============================================================================
|
|
|
|
|
|
@pytest.fixture
def temp_paths():
    """Yield a temporary directory holding a non-empty mock index database."""
    holder = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    base = Path(holder.name)
    # The index file must be non-empty so the engine's empty-file guard passes.
    (base / "_index.db").write_bytes(b"\x00" * 100)
    yield base
    try:
        holder.cleanup()
    except (PermissionError, OSError):
        # Best-effort cleanup: lingering handles (e.g. on Windows) are tolerated.
        pass
|
|
|
|
|
|
@pytest.fixture
def mock_config():
    """Build a MagicMock Config preloaded with hybrid-search defaults."""
    cfg = MagicMock(spec=Config)
    cfg.embedding_use_gpu = False
    cfg.enable_reranking = False
    cfg.enable_cross_encoder_rerank = False
    cfg.symbol_boost_factor = 1.5
    cfg.fusion_method = "rrf"
    cfg.rrf_k = 60
    cfg.enable_category_filter = True
    return cfg
|
|
|
|
|
|
@pytest.fixture
def sample_results() -> List[SearchResult]:
    """Provide two canned search results: a function hit and a class hit."""
    function_hit = SearchResult(
        path="auth.py",
        score=0.9,
        excerpt="def authenticate(user):",
        symbol_name="authenticate",
        symbol_kind="function",
    )
    class_hit = SearchResult(
        path="login.py",
        score=0.7,
        excerpt="class LoginHandler:",
        symbol_name="LoginHandler",
        symbol_kind="class",
    )
    return [function_hit, class_hit]
|
|
|
|
|
|
# =============================================================================
|
|
# Tests: search with different backends
|
|
# =============================================================================
|
|
|
|
|
|
class TestHybridSearchBackends:
    """Tests for HybridSearchEngine.search() backend configurations."""

    def test_search_exact_only(self, temp_paths, mock_config):
        """Search with only exact FTS backend."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="result")],
            }

            hits = engine.search(
                db_path,
                "test query",
                enable_fuzzy=False,
                enable_vector=False,
            )

            assert len(hits) == 1
            # The backend selection map is passed as the 3rd positional argument.
            requested = fake_parallel.call_args[0][2]
            assert "exact" in requested
            assert "fuzzy" not in requested
            assert "vector" not in requested

    def test_search_fuzzy_enabled(self, temp_paths, mock_config):
        """Search with exact + fuzzy backends."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="exact")],
                "fuzzy": [SearchResult(path="b.py", score=8.0, excerpt="fuzzy")],
            }

            hits = engine.search(
                db_path,
                "test_query",
                enable_fuzzy=True,
                enable_vector=False,
            )

            assert len(hits) >= 1
            requested = fake_parallel.call_args[0][2]
            assert "exact" in requested
            assert "fuzzy" in requested

    def test_search_vector_enabled(self, temp_paths, mock_config):
        """Search with exact + fuzzy + vector backends."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="exact")],
                "vector": [SearchResult(path="c.py", score=0.85, excerpt="vector")],
            }

            engine.search(
                db_path,
                "test_query",
                enable_fuzzy=False,
                enable_vector=True,
            )

            requested = fake_parallel.call_args[0][2]
            assert "exact" in requested
            assert "vector" in requested

    def test_search_lexical_priority_query_skips_vector_backend(self, temp_paths, mock_config):
        """Config/env/factory queries should stay lexical-first in hybrid mode."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [SearchResult(path="config.py", score=10.0, excerpt="exact")],
                "fuzzy": [SearchResult(path="env_config.py", score=8.0, excerpt="fuzzy")],
            }

            hits = engine.search(
                db_path,
                "embedding backend fastembed local litellm api config",
                enable_fuzzy=True,
                enable_vector=True,
            )

            assert len(hits) >= 1
            requested = fake_parallel.call_args[0][2]
            assert "exact" in requested
            assert "fuzzy" in requested
            # Even with enable_vector=True, a lexical-priority query drops vector.
            assert "vector" not in requested

    def test_search_pure_vector(self, temp_paths, mock_config):
        """Pure vector mode should only use vector backend."""
        engine = HybridSearchEngine(config=mock_config)
        mock_config.enable_category_filter = False
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "vector": [SearchResult(path="a.py", score=0.9, excerpt="vec")],
            }

            engine.search(
                db_path,
                "semantic query",
                enable_vector=True,
                pure_vector=True,
            )

            requested = fake_parallel.call_args[0][2]
            assert "vector" in requested
            assert "exact" not in requested
|
|
|
|
|
|
# =============================================================================
|
|
# Tests: search fusion and post-processing
|
|
# =============================================================================
|
|
|
|
|
|
class TestHybridSearchFusion:
    """Tests for RRF fusion, empty query, no results, reranking, filtering."""

    def test_search_rrf_fusion(self, temp_paths, mock_config):
        """Results from multiple backends should be fused via RRF."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [
                    SearchResult(path="a.py", score=10.0, excerpt="exact a"),
                    SearchResult(path="b.py", score=5.0, excerpt="exact b"),
                ],
                "vector": [
                    SearchResult(path="b.py", score=0.9, excerpt="vector b"),
                    SearchResult(path="c.py", score=0.8, excerpt="vector c"),
                ],
            }

            hits = engine.search(
                db_path,
                "test",
                enable_fuzzy=False,
                enable_vector=True,
            )

            # b.py appears in both sources - should have high fusion score
            assert any(hit.path == "b.py" for hit in hits)

    def test_search_empty_query(self, temp_paths, mock_config):
        """Empty query should still execute (handled gracefully)."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {"exact": []}
            assert engine.search(db_path, "", enable_fuzzy=False) == []

    def test_search_no_results(self, temp_paths, mock_config):
        """All backends returning empty should produce empty results."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {"exact": [], "fuzzy": []}
            assert engine.search(db_path, "nonexistent") == []

    def test_search_reranking(self, temp_paths, mock_config):
        """Reranking should be applied when config enables it."""
        mock_config.enable_reranking = True
        mock_config.enable_cross_encoder_rerank = False
        mock_config.reranking_top_k = 50
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        fake_embedder = MagicMock()
        fake_embedder.embed_single.return_value = [0.1] * 128
        fake_embedder.embed.return_value = [[0.1] * 128]
        engine.embedder = fake_embedder

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="code")],
            }

            with patch("codexlens.search.hybrid_search.rerank_results") as fake_rerank:
                fake_rerank.return_value = [
                    SearchResult(path="a.py", score=0.85, excerpt="code"),
                ]
                engine.search(db_path, "query", enable_fuzzy=False)

            fake_rerank.assert_called_once()

    def test_search_lexical_priority_query_skips_expensive_reranking(self, temp_paths, mock_config):
        """Lexical-priority queries should bypass embedder and cross-encoder reranking."""
        mock_config.enable_reranking = True
        mock_config.enable_cross_encoder_rerank = True
        mock_config.reranking_top_k = 50
        mock_config.reranker_top_k = 20
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [SearchResult(path="config.py", score=10.0, excerpt="code")],
                "fuzzy": [SearchResult(path="env_config.py", score=9.0, excerpt="env vars")],
            }

            with patch("codexlens.search.hybrid_search.rerank_results") as fake_rerank:
                with patch("codexlens.search.hybrid_search.cross_encoder_rerank") as fake_cross:
                    with patch.object(engine, "_get_cross_encoder_reranker") as fake_get_reranker:
                        hits = engine.search(
                            db_path,
                            "get_reranker factory onnx backend selection",
                            enable_fuzzy=True,
                            enable_vector=True,
                        )

            assert len(hits) >= 1
            # None of the expensive reranking paths may be touched.
            fake_rerank.assert_not_called()
            fake_cross.assert_not_called()
            fake_get_reranker.assert_not_called()

    def test_search_category_filtering(self, temp_paths, mock_config):
        """Category filtering should separate code/doc results by intent."""
        mock_config.enable_category_filter = True
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_parallel") as fake_parallel:
            fake_parallel.return_value = {
                "exact": [
                    SearchResult(path="auth.py", score=10.0, excerpt="def auth"),
                    SearchResult(path="README.md", score=8.0, excerpt="docs"),
                ],
            }

            # Keyword-like query should filter to code
            hits = engine.search(db_path, "AuthManager", enable_fuzzy=False)

            # Code files should remain, doc files filtered for KEYWORD intent
            assert "auth.py" in [hit.path for hit in hits]
|
|
|
|
|
|
# =============================================================================
|
|
# Tests: _search_parallel
|
|
# =============================================================================
|
|
|
|
|
|
class TestSearchParallel:
    """Tests for _search_parallel() parallel backend execution."""

    def test_search_parallel_backends(self, temp_paths, mock_config):
        """Parallel execution should run all requested backends."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        with patch.object(engine, "_search_exact") as fake_exact:
            with patch.object(engine, "_search_fuzzy") as fake_fuzzy:
                fake_exact.return_value = [
                    SearchResult(path="a.py", score=10.0, excerpt="exact"),
                ]
                fake_fuzzy.return_value = [
                    SearchResult(path="b.py", score=8.0, excerpt="fuzzy"),
                ]

                by_backend = engine._search_parallel(
                    db_path,
                    "query",
                    backends={"exact": True, "fuzzy": True},
                    limit=10,
                )

                # Each requested backend contributes its own result bucket.
                assert "exact" in by_backend
                assert "fuzzy" in by_backend
                fake_exact.assert_called_once()
                fake_fuzzy.assert_called_once()
|
|
|
|
|
|
class TestCentralizedMetadataFetch:
    """Tests for centralized metadata retrieval helpers."""

    def test_fetch_from_vector_meta_store_clamps_negative_scores(self, temp_paths, mock_config, monkeypatch):
        engine = HybridSearchEngine(config=mock_config)

        class StubMetaStore:
            """Context-manager stub that returns a single fixed chunk row."""

            def __init__(self, _path):
                pass

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc, tb):
                return False

            def get_chunks_by_ids(self, _chunk_ids, category=None):
                # The helper must not apply a category filter for this path.
                assert category is None
                return [
                    {
                        "chunk_id": 7,
                        "file_path": "src/app.py",
                        "content": "def app(): pass",
                        "metadata": {},
                        "start_line": 1,
                        "end_line": 1,
                    }
                ]

        import codexlens.storage.vector_meta_store as vector_meta_store

        monkeypatch.setattr(vector_meta_store, "VectorMetadataStore", StubMetaStore)

        fetched = engine._fetch_from_vector_meta_store(
            temp_paths / "_vectors_meta.db",
            [7],
            {7: -0.01},
        )

        assert len(fetched) == 1
        assert fetched[0].path == "src/app.py"
        # A negative similarity score must be clamped up to 0.0.
        assert fetched[0].score == 0.0
|
|
|
|
|
|
class TestCentralizedVectorCaching:
    """Tests for centralized vector search runtime caches."""

    def test_search_vector_centralized_reuses_cached_resources(
        self,
        temp_paths,
        mock_config,
    ):
        # Two searches against the same central HNSW file must reuse the
        # embedder / ANN index / query embedding loaded on the first call,
        # while still running one ANN search and one metadata fetch per call.
        engine = HybridSearchEngine(config=mock_config)
        hnsw_path = temp_paths / "_vectors.hnsw"
        hnsw_path.write_bytes(b"hnsw")

        # Records every path the fake vector store is opened with.
        vector_store_opened: List[Path] = []

        class FakeVectorStore:
            # Context-manager stand-in for the central vector store; only
            # get_model_config() is consulted by the engine in this test.
            def __init__(self, path):
                vector_store_opened.append(Path(path))

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc, tb):
                return False

            def get_model_config(self):
                return {
                    "backend": "fastembed",
                    "model_name": "BAAI/bge-small-en-v1.5",
                    "model_profile": "fast",
                    "embedding_dim": 384,
                }

        class FakeEmbedder:
            # Counts embed_single() calls so the test can prove the query
            # embedding is computed only once and then cached.
            embedding_dim = 384

            def __init__(self):
                self.embed_calls: List[str] = []

            def embed_single(self, query):
                self.embed_calls.append(query)
                return [0.1, 0.2, 0.3]

        class FakeAnnIndex:
            # Tracks load() and search() call counts separately: load should
            # happen once (cached), search once per engine call.
            def __init__(self):
                self.load_calls = 0
                self.search_calls = 0

            def load(self):
                self.load_calls += 1
                return True

            def count(self):
                return 3

            def search(self, _query_vec, top_k):
                self.search_calls += 1
                # top_k is 10 for limit=5 — presumably a 2x over-fetch
                # factor inside the engine; confirm against implementation.
                assert top_k == 10
                return [7], [0.2]

        fake_embedder = FakeEmbedder()
        fake_ann_index = FakeAnnIndex()

        with patch("codexlens.semantic.vector_store.VectorStore", FakeVectorStore), patch(
            "codexlens.semantic.factory.get_embedder",
            return_value=fake_embedder,
        ) as mock_get_embedder, patch(
            "codexlens.semantic.ann_index.ANNIndex.create_central",
            return_value=fake_ann_index,
        ) as mock_create_central, patch.object(
            engine,
            "_fetch_chunks_by_ids_centralized",
            return_value=[SearchResult(path="src/app.py", score=0.8, excerpt="hit")],
        ) as mock_fetch:
            # Two different child index paths but the same central HNSW file.
            first = engine._search_vector_centralized(
                temp_paths / "child-a" / "_index.db",
                hnsw_path,
                "smart search routing",
                limit=5,
            )
            second = engine._search_vector_centralized(
                temp_paths / "child-b" / "_index.db",
                hnsw_path,
                "smart search routing",
                limit=5,
            )

        # Both calls return the (mocked) fetched chunk.
        assert [result.path for result in first] == ["src/app.py"]
        assert [result.path for result in second] == ["src/app.py"]
        # The vector store is opened once, at the central _index.db next to
        # the HNSW file — not at either child index path.
        assert vector_store_opened == [temp_paths / "_index.db"]
        # Embedder construction, ANN creation, ANN load, and the query
        # embedding all happen exactly once (first call), i.e. are cached.
        assert mock_get_embedder.call_count == 1
        assert mock_create_central.call_count == 1
        assert fake_ann_index.load_calls == 1
        assert fake_embedder.embed_calls == ["smart search routing"]
        # The actual ANN search and metadata fetch run once per call.
        assert fake_ann_index.search_calls == 2
        assert mock_fetch.call_count == 2

    def test_search_vector_centralized_respects_embedding_use_gpu(
        self,
        temp_paths,
        mock_config,
    ):
        # The embedder factory must receive the store's backend/profile plus
        # the config's embedding_use_gpu flag (False in mock_config).
        engine = HybridSearchEngine(config=mock_config)
        hnsw_path = temp_paths / "_vectors.hnsw"
        hnsw_path.write_bytes(b"hnsw")

        class FakeVectorStore:
            # Minimal context-manager store exposing only the model config.
            def __init__(self, _path):
                pass

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc, tb):
                return False

            def get_model_config(self):
                return {
                    "backend": "fastembed",
                    "model_name": "BAAI/bge-small-en-v1.5",
                    "model_profile": "code",
                    "embedding_dim": 384,
                }

        class FakeEmbedder:
            embedding_dim = 384

            def embed_single(self, _query):
                return [0.1, 0.2]

        class FakeAnnIndex:
            def load(self):
                return True

            def count(self):
                return 1

            def search(self, _query_vec, top_k):
                # top_k is 6 for limit=3 — consistent with the 2x over-fetch
                # observed in the sibling test; confirm against implementation.
                assert top_k == 6
                return [9], [0.1]

        with patch("codexlens.semantic.vector_store.VectorStore", FakeVectorStore), patch(
            "codexlens.semantic.factory.get_embedder",
            return_value=FakeEmbedder(),
        ) as mock_get_embedder, patch(
            "codexlens.semantic.ann_index.ANNIndex.create_central",
            return_value=FakeAnnIndex(),
        ), patch.object(
            engine,
            "_fetch_chunks_by_ids_centralized",
            return_value=[SearchResult(path="src/app.py", score=0.9, excerpt="hit")],
        ):
            results = engine._search_vector_centralized(
                temp_paths / "_index.db",
                hnsw_path,
                "semantic query",
                limit=3,
            )

        assert len(results) == 1
        assert mock_get_embedder.call_count == 1
        # Factory kwargs: backend/profile come from the store's model config,
        # use_gpu comes from mock_config.embedding_use_gpu (False).
        assert mock_get_embedder.call_args.kwargs == {
            "backend": "fastembed",
            "profile": "code",
            "use_gpu": False,
        }
|
|
|
|
|
|
# =============================================================================
|
|
# Tests: _search_lsp_graph
|
|
# =============================================================================
|
|
|
|
|
|
class TestSearchLspGraph:
    """Tests for _search_lsp_graph() LSP graph expansion."""

    def test_search_lsp_graph(self, temp_paths, mock_config):
        """LSP graph search should use seed results for expansion."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        # When HAS_LSP is False, _search_lsp_graph returns []
        with patch("codexlens.search.hybrid_search.HAS_LSP", False):
            assert engine._search_lsp_graph(db_path, "auth function", limit=5) == []

    def test_lsp_fallback_vector_to_fts(self, temp_paths, mock_config):
        """When vector seeds fail, should fall back to FTS seeds."""
        engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"

        fts_seed = SearchResult(
            path="auth.py",
            score=10.0,
            excerpt="def auth():",
            symbol_name="auth",
            start_line=1,
            end_line=5,
        )

        with patch("codexlens.search.hybrid_search.HAS_LSP", True):
            # Vector search yields no seeds, which must force the FTS fallback.
            with patch.object(engine, "_search_vector", return_value=[]):
                with patch.object(engine, "_search_exact") as fake_exact:
                    fake_exact.return_value = [fts_seed]

                    # The LSP bridge may fail on import or async setup; the
                    # FTS fallback must have been attempted before expansion.
                    try:
                        engine._search_lsp_graph(db_path, "auth", limit=5)
                    except Exception:
                        pass  # LSP deps may not be available, but FTS fallback was attempted

                    # Verify FTS was called as fallback
                    fake_exact.assert_called_once()
|