Add comprehensive tests for query parsing and Reciprocal Rank Fusion

- Implemented tests for the QueryParser class, covering various identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources (a sketch of the scoring rule follows this list).
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
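
For reference, the fused score these tests pin down follows the standard RRF formulation, sketched here in Python (illustrative only; the names below are not the codexlens API):

    # score(path) = sum over sources s of weights[s] / (k + rank_s(path)),
    # where rank_s(path) is the 1-based rank of `path` in source s.
    def rrf_score(ranks: dict, weights: dict, k: int = 60) -> float:
        return sum(weights[s] / (k + rank) for s, rank in ranks.items())
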
Author: catlog22
Date:   2025-12-16 10:20:19 +08:00
Parent: 35485bbbb1
Commit: 3da0ef2adb

39 changed files with 6171 additions and 240 deletions

@@ -0,0 +1,421 @@
"""Tests for Reciprocal Rank Fusion (RRF) algorithm (P2).
Tests RRF fusion logic, score computation, weight handling, and result ranking.
"""
import pytest
from codexlens.entities import SearchResult
from codexlens.search.ranking import (
normalize_bm25_score,
reciprocal_rank_fusion,
tag_search_source,
)
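
# For documentation, a minimal reference sketch of the fusion rule exercised
# below. This is an assumption about behavior pinned by the tests, not the
# production implementation (which lives in codexlens.search.ranking):
# each result contributes weight / (k + rank) per source, weights are
# normalized to sum to 1, and contributions accumulate across sources.
def _reference_rrf_scores(ranked_paths_by_source, weights=None, k=60):
    """Illustrative RRF scoring over {source: [path, ...]} rank lists."""
    sources = list(ranked_paths_by_source)
    weights = weights or {s: 1.0 for s in sources}
    total = sum(weights.get(s, 0.0) for s in sources) or 1.0
    scores = {}
    for source, paths in ranked_paths_by_source.items():
        share = weights.get(source, 0.0) / total  # normalized source weight
        if share <= 0.0:
            continue  # zero-weight sources contribute nothing
        for rank, path in enumerate(paths, start=1):
            scores[path] = scores.get(path, 0.0) + share / (k + rank)
    return scores
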
class TestReciprocalRankFusion:
"""Tests for reciprocal_rank_fusion function."""
def test_single_source_ranking(self):
"""Test RRF with single source returns ranked results."""
results = [
SearchResult(path="a.py", score=10.0, excerpt="..."),
SearchResult(path="b.py", score=8.0, excerpt="..."),
SearchResult(path="c.py", score=6.0, excerpt="..."),
]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map)
assert len(fused) == 3
# Order should be preserved (highest original score first)
assert fused[0].path == "a.py"
assert fused[1].path == "b.py"
assert fused[2].path == "c.py"
def test_two_sources_fusion(self):
"""Test RRF combines rankings from two sources."""
exact_results = [
SearchResult(path="a.py", score=10.0, excerpt="..."),
SearchResult(path="b.py", score=8.0, excerpt="..."),
SearchResult(path="c.py", score=6.0, excerpt="..."),
]
fuzzy_results = [
SearchResult(path="b.py", score=9.0, excerpt="..."),
SearchResult(path="c.py", score=7.0, excerpt="..."),
SearchResult(path="d.py", score=5.0, excerpt="..."),
]
results_map = {"exact": exact_results, "fuzzy": fuzzy_results}
fused = reciprocal_rank_fusion(results_map)
# Should have all unique paths
paths = [r.path for r in fused]
assert set(paths) == {"a.py", "b.py", "c.py", "d.py"}
# Results appearing in both should rank higher
# b.py and c.py appear in both sources
assert fused[0].path in ["b.py", "c.py"], "Items in both sources should rank highest"
def test_rrf_score_calculation(self):
"""Test RRF scores are calculated correctly with default k=60."""
# Simple scenario: single source
results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map, k=60)
# RRF score = weight / (k + rank) = 1.0 / (60 + 1) ≈ 0.0164
expected_score = 1.0 / 61
assert abs(fused[0].score - expected_score) < 0.001
def test_custom_weights(self):
"""Test custom weights affect RRF scores."""
results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_b = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_map = {"exact": results_a, "fuzzy": results_b}
# Higher weight for exact
weights = {"exact": 0.7, "fuzzy": 0.3}
fused = reciprocal_rank_fusion(results_map, weights=weights, k=60)
# Score should be: 0.7/(60+1) + 0.3/(60+1) = 1.0/61 ≈ 0.0164
expected_score = (0.7 + 0.3) / 61
assert abs(fused[0].score - expected_score) < 0.001
def test_weight_normalization(self):
"""Test weights are normalized to sum to 1.0."""
results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_map = {"exact": results}
# Weights not summing to 1.0
weights = {"exact": 2.0} # Will be normalized to 1.0
fused = reciprocal_rank_fusion(results_map, weights=weights)
# Should work without error and produce normalized scores
assert len(fused) == 1
assert fused[0].score > 0
def test_empty_results_map(self):
"""Test RRF with empty results returns empty list."""
fused = reciprocal_rank_fusion({})
assert fused == []
def test_zero_weight_source_ignored(self):
"""Test sources with zero weight are ignored."""
results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_b = [SearchResult(path="b.py", score=10.0, excerpt="...")]
results_map = {"exact": results_a, "fuzzy": results_b}
weights = {"exact": 1.0, "fuzzy": 0.0} # Ignore fuzzy
fused = reciprocal_rank_fusion(results_map, weights=weights)
# Should only have result from exact source
assert len(fused) == 1
assert fused[0].path == "a.py"
def test_fusion_score_in_metadata(self):
"""Test fusion score is stored in result metadata."""
results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map)
# Check metadata
assert "fusion_score" in fused[0].metadata
assert "original_score" in fused[0].metadata
assert fused[0].metadata["original_score"] == 10.0
def test_rank_order_matters(self):
"""Test rank position affects RRF score (lower rank = higher score)."""
results = [
SearchResult(path="a.py", score=10.0, excerpt="..."), # rank 1
SearchResult(path="b.py", score=8.0, excerpt="..."), # rank 2
SearchResult(path="c.py", score=6.0, excerpt="..."), # rank 3
]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map, k=60)
# a.py (rank 1): score = 1/(60+1) ≈ 0.0164
# b.py (rank 2): score = 1/(60+2) ≈ 0.0161
# c.py (rank 3): score = 1/(60+3) ≈ 0.0159
assert fused[0].score > fused[1].score > fused[2].score
class TestRRFSyntheticRankings:
"""Tests with synthetic rankings to verify RRF correctness."""
def test_perfect_agreement(self):
"""Test RRF when all sources rank items identically."""
# All sources rank a > b > c
exact = [
SearchResult(path="a.py", score=10.0, excerpt="..."),
SearchResult(path="b.py", score=8.0, excerpt="..."),
SearchResult(path="c.py", score=6.0, excerpt="..."),
]
fuzzy = [
SearchResult(path="a.py", score=9.0, excerpt="..."),
SearchResult(path="b.py", score=7.0, excerpt="..."),
SearchResult(path="c.py", score=5.0, excerpt="..."),
]
results_map = {"exact": exact, "fuzzy": fuzzy}
fused = reciprocal_rank_fusion(results_map)
# Order should match both sources
assert fused[0].path == "a.py"
assert fused[1].path == "b.py"
assert fused[2].path == "c.py"
def test_complete_disagreement(self):
"""Test RRF when sources have opposite rankings."""
# exact: a > b > c
# fuzzy: c > b > a
exact = [
SearchResult(path="a.py", score=10.0, excerpt="..."),
SearchResult(path="b.py", score=8.0, excerpt="..."),
SearchResult(path="c.py", score=6.0, excerpt="..."),
]
fuzzy = [
SearchResult(path="c.py", score=9.0, excerpt="..."),
SearchResult(path="b.py", score=7.0, excerpt="..."),
SearchResult(path="a.py", score=5.0, excerpt="..."),
]
results_map = {"exact": exact, "fuzzy": fuzzy}
fused = reciprocal_rank_fusion(results_map)
        # With opposite rankings, a.py and c.py get equal RRF scores by symmetry:
        # a.py: 0.5/(60+1) + 0.5/(60+3) ≈ 0.0161332
        # c.py: 0.5/(60+3) + 0.5/(60+1) ≈ 0.0161332 (identical)
        # b.py: 0.5/(60+2) + 0.5/(60+2) ≈ 0.0161290, slightly lower because
        # 1/x is convex, so ranks 1 and 3 average higher than rank 2 twice.
        # The top result should therefore be a.py or c.py (tied).
assert fused[0].path in ["a.py", "c.py"], "Items with symmetric ranks should tie for first"
def test_partial_overlap(self):
"""Test RRF with partial overlap between sources."""
# exact: [A, B, C]
# fuzzy: [B, C, D]
exact = [
SearchResult(path="A", score=10.0, excerpt="..."),
SearchResult(path="B", score=8.0, excerpt="..."),
SearchResult(path="C", score=6.0, excerpt="..."),
]
fuzzy = [
SearchResult(path="B", score=9.0, excerpt="..."),
SearchResult(path="C", score=7.0, excerpt="..."),
SearchResult(path="D", score=5.0, excerpt="..."),
]
results_map = {"exact": exact, "fuzzy": fuzzy}
fused = reciprocal_rank_fusion(results_map)
# B and C appear in both, should rank higher than A and D
paths = [r.path for r in fused]
b_idx = paths.index("B")
c_idx = paths.index("C")
a_idx = paths.index("A")
d_idx = paths.index("D")
assert b_idx < a_idx, "B (in both) should outrank A (in one)"
assert c_idx < d_idx, "C (in both) should outrank D (in one)"
def test_three_sources(self):
"""Test RRF with three sources (exact, fuzzy, vector)."""
exact = [SearchResult(path="a.py", score=10.0, excerpt="...")]
fuzzy = [SearchResult(path="b.py", score=9.0, excerpt="...")]
vector = [SearchResult(path="c.py", score=8.0, excerpt="...")]
results_map = {"exact": exact, "fuzzy": fuzzy, "vector": vector}
weights = {"exact": 0.4, "fuzzy": 0.3, "vector": 0.3}
fused = reciprocal_rank_fusion(results_map, weights=weights)
assert len(fused) == 3
# Each appears in one source only, so scores differ by weights
# a.py: 0.4/61 ≈ 0.0066
# b.py: 0.3/61 ≈ 0.0049
# c.py: 0.3/61 ≈ 0.0049
assert fused[0].path == "a.py", "Exact (higher weight) should rank first"
class TestNormalizeBM25Score:
"""Tests for normalize_bm25_score function."""
def test_negative_bm25_normalization(self):
"""Test BM25 scores (negative) are normalized to 0-1 range."""
# SQLite FTS5 returns negative BM25 scores
scores = [-20.0, -10.0, -5.0, -1.0, 0.0]
for score in scores:
normalized = normalize_bm25_score(score)
assert 0.0 <= normalized <= 1.0, f"Normalized score {normalized} out of range"
def test_better_match_higher_score(self):
"""Test more negative BM25 (better match) gives higher normalized score."""
good_match = -15.0
weak_match = -2.0
norm_good = normalize_bm25_score(good_match)
norm_weak = normalize_bm25_score(weak_match)
assert norm_good > norm_weak, "Better match should have higher normalized score"
def test_zero_score(self):
"""Test zero BM25 score normalization."""
normalized = normalize_bm25_score(0.0)
assert 0.0 <= normalized <= 1.0
def test_positive_score_handling(self):
"""Test positive scores (edge case) are handled."""
normalized = normalize_bm25_score(5.0)
# Should still be in valid range
assert 0.0 <= normalized <= 1.0
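
# The exact curve behind normalize_bm25_score is not asserted above, only its
# properties: outputs stay in [0, 1], and more-negative BM25 (a better FTS5
# match) maps higher. One hypothetical mapping with those properties is a
# sigmoid over the raw score (an illustrative assumption, not the codexlens
# formula):
def _sigmoid_normalize_sketch(score: float) -> float:
    import math
    # exp(score) -> 0 for strongly negative BM25, pushing the result toward 1;
    # score=0 gives 0.5, and positive scores decay toward 0 but stay in range.
    return 1.0 / (1.0 + math.exp(score))
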
class TestTagSearchSource:
"""Tests for tag_search_source function."""
def test_tagging_adds_source_metadata(self):
"""Test tagging adds search_source to metadata."""
results = [
SearchResult(path="a.py", score=10.0, excerpt="..."),
SearchResult(path="b.py", score=8.0, excerpt="..."),
]
tagged = tag_search_source(results, "exact")
for result in tagged:
assert "search_source" in result.metadata
assert result.metadata["search_source"] == "exact"
def test_tagging_preserves_existing_metadata(self):
"""Test tagging preserves existing metadata fields."""
results = [
SearchResult(
path="a.py",
score=10.0,
excerpt="...",
metadata={"custom_field": "value"}
),
]
tagged = tag_search_source(results, "fuzzy")
assert "custom_field" in tagged[0].metadata
assert tagged[0].metadata["custom_field"] == "value"
assert "search_source" in tagged[0].metadata
assert tagged[0].metadata["search_source"] == "fuzzy"
def test_tagging_empty_list(self):
"""Test tagging empty list returns empty list."""
tagged = tag_search_source([], "exact")
assert tagged == []
def test_tagging_preserves_result_fields(self):
"""Test tagging preserves all SearchResult fields."""
results = [
SearchResult(
path="a.py",
score=10.0,
excerpt="test excerpt",
content="full content",
start_line=10,
end_line=20,
symbol_name="test_func",
symbol_kind="function"
),
]
tagged = tag_search_source(results, "exact")
assert tagged[0].path == "a.py"
assert tagged[0].score == 10.0
assert tagged[0].excerpt == "test excerpt"
assert tagged[0].content == "full content"
assert tagged[0].start_line == 10
assert tagged[0].end_line == 20
assert tagged[0].symbol_name == "test_func"
assert tagged[0].symbol_kind == "function"
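
# Behaviorally, the tests above pin tag_search_source down to something like
# this sketch (assuming SearchResult.metadata is a plain dict the function may
# copy and extend; the real codexlens implementation may differ):
def _tag_source_sketch(results, source):
    for result in results:
        result.metadata = {**result.metadata, "search_source": source}
    return results
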
@pytest.mark.parametrize("k_value", [30, 60, 100])
class TestRRFParameterized:
"""Parameterized tests for RRF with different k values."""
def test_k_value_affects_scores(self, k_value):
"""Test k parameter affects RRF score magnitude."""
results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map, k=k_value)
# Score should be 1.0 / (k + 1)
expected = 1.0 / (k_value + 1)
assert abs(fused[0].score - expected) < 0.001
class TestRRFEdgeCases:
"""Edge case tests for RRF."""
def test_duplicate_paths_in_same_source(self):
"""Test handling of duplicate paths in single source."""
results = [
SearchResult(path="a.py", score=10.0, excerpt="..."),
SearchResult(path="a.py", score=8.0, excerpt="..."), # Duplicate
]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map)
# Should deduplicate (first occurrence wins)
assert len(fused) == 1
assert fused[0].path == "a.py"
def test_very_large_result_lists(self):
"""Test RRF handles large result sets efficiently."""
# Create 1000 results
results = [
SearchResult(path=f"file{i}.py", score=1000-i, excerpt="...")
for i in range(1000)
]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map)
assert len(fused) == 1000
# Should maintain ranking
assert fused[0].path == "file0.py"
assert fused[-1].path == "file999.py"
def test_all_same_score(self):
"""Test RRF when all results have same original score."""
results = [
SearchResult(path="a.py", score=10.0, excerpt="..."),
SearchResult(path="b.py", score=10.0, excerpt="..."),
SearchResult(path="c.py", score=10.0, excerpt="..."),
]
results_map = {"exact": results}
fused = reciprocal_rank_fusion(results_map)
# Should still rank by position (rank matters)
assert len(fused) == 3
assert fused[0].score > fused[1].score > fused[2].score
def test_missing_weight_for_source(self):
"""Test missing weight for source uses default."""
results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
results_map = {"exact": results, "fuzzy": results}
# Only provide weight for exact
weights = {"exact": 1.0}
fused = reciprocal_rank_fusion(results_map, weights=weights)
# Should work with normalization
assert len(fused) == 1 # Deduplicated
assert fused[0].score > 0
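
if __name__ == "__main__":
    # Illustrative end-to-end wiring of the three functions under test, using
    # only the signatures the tests above exercise: tag each source's results,
    # then fuse them with source-specific weights.
    exact = tag_search_source(
        [SearchResult(path="a.py", score=10.0, excerpt="...")], "exact"
    )
    fuzzy = tag_search_source(
        [SearchResult(path="a.py", score=9.0, excerpt="...")], "fuzzy"
    )
    fused = reciprocal_rank_fusion(
        {"exact": exact, "fuzzy": fuzzy}, weights={"exact": 0.7, "fuzzy": 0.3}
    )
    for item in fused:
        print(item.path, item.score, item.metadata.get("search_source"))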