Add graph expansion and cross-encoder reranking features

- Implemented GraphExpander to enhance search results with related symbols using precomputed neighbors.
- Added CrossEncoderReranker for second-stage search ranking, allowing for improved result scoring.
- Created migrations to establish necessary database tables for relationships and graph neighbors.
- Developed tests for graph expansion functionality, ensuring related results are populated correctly.
- Enhanced performance benchmarks for cross-encoder reranking latency and graph expansion overhead.
- Updated schema cleanup tests to reflect changes in versioning and deprecated fields.
- Added test cases for the Tree-sitter parser to validate relationship extraction with alias resolution.
This commit is contained in:
catlog22
2025-12-31 16:58:59 +08:00
parent 4bde13e83a
commit 31a45f1f30
27 changed files with 2566 additions and 97 deletions

View File

@@ -869,3 +869,47 @@ class TestHybridSearchAdaptiveWeights:
) as rerank_mock:
engine_on.search(Path("dummy.db"), "query", enable_vector=True)
assert rerank_mock.call_count == 1
def test_cross_encoder_reranking_enabled(self, tmp_path):
    """Verify the cross-encoder rerank stage is invoked when config enables it.

    The engine is built with both ``enable_reranking`` and
    ``enable_cross_encoder_rerank`` switched on. The parallel search, the
    first-stage reranker, the cross-encoder factory and the cross-encoder
    rerank function are all patched, and each of the last three is expected
    to be called exactly once by a single ``search`` call.
    """
    from unittest.mock import patch

    class StubEmbedder:
        """Embedder stand-in that yields one fixed 2-element vector per text."""

        def embed(self, texts):
            batch = [texts] if isinstance(texts, str) else texts
            return [[1.0, 0.0] for _ in batch]

    class StubReranker:
        """Cross-encoder stand-in that scores every pair as 0.0."""

        def score_pairs(self, pairs, batch_size=32):
            return [0.0 for _ in pairs]

    def passthrough_rerank(q, r, e, top_k=50):
        # Replacement for rerank_results: hand the results back unchanged.
        return r

    def passthrough_cross_encoder(q, r, ce, top_k=50):
        # Replacement for cross_encoder_rerank: hand the results back unchanged.
        return r

    # Canned hits returned in place of the real parallel search.
    canned_hits = {
        "exact": [SearchResult(path="a.py", score=10.0, excerpt="a")],
        "fuzzy": [SearchResult(path="b.py", score=9.0, excerpt="b")],
        "vector": [SearchResult(path="c.py", score=0.9, excerpt="c")],
    }

    config = Config(
        data_dir=tmp_path / "ce",
        enable_reranking=True,
        enable_cross_encoder_rerank=True,
        reranker_top_k=10,
    )
    engine = HybridSearchEngine(config=config, embedder=StubEmbedder())

    # Build the patchers first; they take effect only once entered below.
    search_patch = patch.object(
        HybridSearchEngine, "_search_parallel", return_value=canned_hits
    )
    rerank_patch = patch(
        "codexlens.search.hybrid_search.rerank_results",
        side_effect=passthrough_rerank,
    )
    get_ce_patch = patch.object(
        HybridSearchEngine,
        "_get_cross_encoder_reranker",
        return_value=StubReranker(),
    )
    ce_patch = patch(
        "codexlens.search.hybrid_search.cross_encoder_rerank",
        side_effect=passthrough_cross_encoder,
    )

    with search_patch, rerank_patch as rerank_mock:
        with get_ce_patch as get_ce_mock, ce_patch as ce_mock:
            engine.search(Path("dummy.db"), "query", enable_vector=True)

            # Each stage must have been hit exactly once by the single search.
            assert rerank_mock.call_count == 1
            assert get_ce_mock.call_count == 1
            assert ce_mock.call_count == 1