feat: Implement adaptive RRF weights and query intent detection

- Added integration tests for adaptive RRF weights in hybrid search.
- Enhanced query intent detection with new classifications: keyword, semantic, and mixed.
- Introduced symbol boosting in search results based on explicit symbol matches.
- Implemented embedding-based reranking with configurable options.
- Added global symbol index for efficient symbol lookups across projects.
- Improved file deletion handling on Windows to avoid permission errors.
- Updated chunk configuration to increase overlap for better context.
- Modified package.json test script to target specific test files.
- Created comprehensive writing style guidelines for documentation.
- Added TypeScript tests for query intent detection and adaptive weights.
- Established performance benchmarks for global symbol indexing.
This commit is contained in:
catlog22
2025-12-26 15:08:47 +08:00
parent ecd5085e51
commit 4061ae48c4
29 changed files with 2685 additions and 828 deletions

View File

@@ -7,8 +7,12 @@ import pytest
from codexlens.entities import SearchResult
from codexlens.search.ranking import (
apply_symbol_boost,
QueryIntent,
detect_query_intent,
normalize_bm25_score,
reciprocal_rank_fusion,
rerank_results,
tag_search_source,
)
@@ -342,6 +346,62 @@ class TestTagSearchSource:
assert tagged[0].symbol_kind == "function"
class TestSymbolBoost:
    """Checks for apply_symbol_boost on a small pair of results."""

    def test_symbol_boost(self):
        """Only the result that carries a symbol_name is scaled and tagged.

        The boosted entry must expose its pre-boost score under
        ``metadata["original_fusion_score"]``; the other entry keeps its
        score and gets no ``"boosted"`` metadata key.
        """
        with_symbol = SearchResult(
            path="a.py", score=0.2, excerpt="...", symbol_name="foo"
        )
        without_symbol = SearchResult(path="b.py", score=0.21, excerpt="...")

        boosted = apply_symbol_boost([with_symbol, without_symbol], boost_factor=1.5)

        # Entry with a symbol: score multiplied by the factor, original kept.
        first = boosted[0]
        assert first.path == "a.py"
        assert first.score == pytest.approx(0.2 * 1.5)
        assert first.metadata["boosted"] is True
        assert first.metadata["original_fusion_score"] == pytest.approx(0.2)

        # Entry without a symbol: untouched and not marked as boosted.
        second = boosted[1]
        assert second.path == "b.py"
        assert second.score == pytest.approx(0.21)
        assert "boosted" not in second.metadata
class TestEmbeddingReranking:
    """Checks for the embedding-similarity path of rerank_results."""

    def test_rerank_embedding_similarity(self):
        """Reranked scores blend the incoming score with cosine similarity.

        A stub embedder maps the query and "doc1" to the same unit vector and
        "doc2" to an orthogonal one, so the expected similarities are 1.0 and
        0.0. Each final score is asserted to be an equal-weight (0.5 / 0.5)
        mix of the incoming score and that similarity.
        """

        class DummyEmbedder:
            """Stub embedder returning fixed 2-D vectors for known texts."""

            def embed(self, texts):
                # Accept a bare string as well as a list of strings.
                batch = [texts] if isinstance(texts, str) else texts
                vectors = {
                    "query": [1.0, 0.0],
                    "doc1": [1.0, 0.0],
                    "doc2": [0.0, 1.0],
                }
                return [vectors[text] for text in batch]

        candidates = [
            SearchResult(path="a.py", score=0.2, excerpt="doc1"),
            SearchResult(path="b.py", score=0.9, excerpt="doc2"),
        ]

        reranked = rerank_results("query", candidates, DummyEmbedder(), top_k=2)

        # (path, incoming score, cosine similarity) in expected output order.
        expectations = [
            ("a.py", 0.2, 1.0),
            ("b.py", 0.9, 0.0),
        ]
        for position, (path, rrf, cosine) in enumerate(expectations):
            # Index explicitly so a short result list fails instead of passing.
            hit = reranked[position]
            assert hit.path == path
            assert hit.metadata["reranked"] is True
            assert hit.metadata["rrf_score"] == pytest.approx(rrf)
            assert hit.metadata["cosine_similarity"] == pytest.approx(cosine)
            assert hit.score == pytest.approx(0.5 * rrf + 0.5 * cosine)
@pytest.mark.parametrize("k_value", [30, 60, 100])
class TestRRFParameterized:
"""Parameterized tests for RRF with different k values."""
@@ -419,3 +479,41 @@ class TestRRFEdgeCases:
# Should work with normalization
assert len(fused) == 1 # Deduplicated
assert fused[0].score > 0
class TestSymbolBoostAndIntentV1:
    """Covers symbol boosting and query intent detection (v1.0)."""

    def test_symbol_boost_application(self):
        """A result with a symbol_name is multiplied by the given boost factor.

        Called with an explicit ``boost_factor=1.5``; the result without a
        symbol keeps its score and is not tagged as boosted.
        """
        symbol_hit = SearchResult(
            path="a.py", score=0.4, excerpt="...", symbol_name="AuthManager"
        )
        plain_hit = SearchResult(path="b.py", score=0.41, excerpt="...")

        boosted = apply_symbol_boost([symbol_hit, plain_hit], boost_factor=1.5)

        first = boosted[0]
        assert first.score == pytest.approx(0.4 * 1.5)
        assert first.metadata["boosted"] is True
        assert first.metadata["original_fusion_score"] == pytest.approx(0.4)

        second = boosted[1]
        assert second.score == pytest.approx(0.41)
        assert "boosted" not in second.metadata

    @pytest.mark.parametrize(
        ("query", "expected"),
        [
            # Code-like tokens: keywords, identifiers, scope/member operators.
            ("def authenticate", QueryIntent.KEYWORD),
            ("MyClass", QueryIntent.KEYWORD),
            ("user_id", QueryIntent.KEYWORD),
            ("UserService::authenticate", QueryIntent.KEYWORD),
            ("ptr->next", QueryIntent.KEYWORD),
            # Natural-language questions with no code-like token.
            ("how to handle user login", QueryIntent.SEMANTIC),
            ("what is authentication?", QueryIntent.SEMANTIC),
            ("where is this used?", QueryIntent.SEMANTIC),
            # Natural-language phrasing that also contains an identifier.
            ("why does FooBar crash?", QueryIntent.MIXED),
            ("how to use user_id in query", QueryIntent.MIXED),
        ],
    )
    def test_query_intent_detection(self, query, expected):
        """Each representative query maps to its expected QueryIntent.

        The original note marks these cases as Python/TypeScript parity checks.
        """
        detected = detect_query_intent(query)
        assert detected == expected