Add graph expansion and cross-encoder reranking features

- Implemented GraphExpander to enhance search results with related symbols using precomputed neighbors.
- Added CrossEncoderReranker for second-stage search ranking, allowing for improved result scoring.
- Created migrations to establish necessary database tables for relationships and graph neighbors.
- Developed tests for graph expansion functionality, ensuring related results are populated correctly.
- Enhanced performance benchmarks for cross-encoder reranking latency and graph expansion overhead.
- Updated schema cleanup tests to reflect changes in versioning and deprecated fields.
- Added new test cases for the Tree-sitter parser to validate relationship extraction with alias resolution.
This commit is contained in:
catlog22
2025-12-31 16:58:59 +08:00
parent 4bde13e83a
commit 31a45f1f30
27 changed files with 2566 additions and 97 deletions

View File

@@ -34,6 +34,7 @@ from codexlens.config import Config
from codexlens.entities import SearchResult
from codexlens.search.ranking import (
apply_symbol_boost,
cross_encoder_rerank,
get_rrf_weights,
reciprocal_rank_fusion,
rerank_results,
@@ -77,6 +78,7 @@ class HybridSearchEngine:
self.weights = weights or self.DEFAULT_WEIGHTS.copy()
self._config = config
self.embedder = embedder
self.reranker: Any = None
def search(
self,
@@ -112,6 +114,14 @@ class HybridSearchEngine:
>>> for r in results[:5]:
... print(f"{r.path}: {r.score:.3f}")
"""
# Defensive: avoid creating/locking an index database when callers pass
# an empty placeholder file (common in tests and misconfigured callers).
try:
if index_path.exists() and index_path.stat().st_size == 0:
return []
except OSError:
return []
# Determine which backends to use
backends = {}
@@ -180,9 +190,30 @@ class HybridSearchEngine:
query,
fused_results[:100],
self.embedder,
top_k=self._config.reranking_top_k,
top_k=(
100
if self._config.enable_cross_encoder_rerank
else self._config.reranking_top_k
),
)
# Optional: cross-encoder reranking as a second stage
if (
self._config is not None
and self._config.enable_reranking
and self._config.enable_cross_encoder_rerank
):
with timer("cross_encoder_rerank", self.logger):
if self.reranker is None:
self.reranker = self._get_cross_encoder_reranker()
if self.reranker is not None:
fused_results = cross_encoder_rerank(
query,
fused_results,
self.reranker,
top_k=self._config.reranker_top_k,
)
# Apply final limit
return fused_results[:limit]
@@ -222,6 +253,27 @@ class HybridSearchEngine:
)
return None
def _get_cross_encoder_reranker(self) -> Any:
    """Build the optional cross-encoder reranker, or return None.

    The reranker is treated as a best-effort component: every failure
    path is logged at debug level and reported to the caller as ``None``
    instead of raising.

    Returns:
        A ``CrossEncoderReranker`` constructed with
        ``model_name=self._config.reranker_model``, or ``None`` when

        * no config is attached to the engine,
        * ``codexlens.semantic.reranker`` cannot be imported,
        * ``check_cross_encoder_available()`` reports a falsy status, or
        * constructing the reranker raises.
    """
    config = self._config
    if config is None:
        return None

    # Imported lazily so the engine still works when the reranker module
    # (or anything it imports) is missing or broken.
    try:
        from codexlens.semantic.reranker import (
            CrossEncoderReranker,
            check_cross_encoder_available,
        )
    except Exception as import_error:
        self.logger.debug("Cross-encoder reranker unavailable: %s", import_error)
        return None

    available, reason = check_cross_encoder_available()
    if available:
        try:
            # The attribute lookup stays inside the try so a config without
            # ``reranker_model`` is handled like any other init failure.
            reranker = CrossEncoderReranker(model_name=config.reranker_model)
        except Exception as init_error:
            self.logger.debug("Failed to initialize cross-encoder reranker: %s", init_error)
            reranker = None
        return reranker

    self.logger.debug("Cross-encoder reranker unavailable: %s", reason)
    return None
def _search_parallel(
self,
index_path: Path,