- Implemented tests for the QueryParser class, covering various identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
"""Tests for Reciprocal Rank Fusion (RRF) algorithm (P2).

Tests RRF fusion logic, score computation, weight handling, and result ranking.
"""

import pytest

from codexlens.entities import SearchResult
from codexlens.search.ranking import (
    normalize_bm25_score,
    reciprocal_rank_fusion,
    tag_search_source,
)
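

# The tests below assume the standard RRF formula: each source contributes
# weight / (k + rank) for every item it returns, and the contributions are summed.
# A minimal reference sketch, purely illustrative (the implementation under test
# lives in codexlens.search.ranking and may differ in detail):
def _reference_rrf_score(ranks_by_source: dict, weights: dict, k: int = 60) -> float:
    """Hypothetical helper: sum of weight / (k + rank) over the sources returning the item.

    Example: an item ranked 1st by "exact" and 3rd by "fuzzy" with equal (0.5/0.5)
    weights scores 0.5 / 61 + 0.5 / 63 ≈ 0.0161.
    """
    return sum(weights[source] / (k + rank) for source, rank in ranks_by_source.items())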


class TestReciprocalRankFusion:
    """Tests for reciprocal_rank_fusion function."""

    def test_single_source_ranking(self):
        """Test RRF with single source returns ranked results."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map)

        assert len(fused) == 3
        # Order should be preserved (highest original score first)
        assert fused[0].path == "a.py"
        assert fused[1].path == "b.py"
        assert fused[2].path == "c.py"

    def test_two_sources_fusion(self):
        """Test RRF combines rankings from two sources."""
        exact_results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        fuzzy_results = [
            SearchResult(path="b.py", score=9.0, excerpt="..."),
            SearchResult(path="c.py", score=7.0, excerpt="..."),
            SearchResult(path="d.py", score=5.0, excerpt="..."),
        ]
        results_map = {"exact": exact_results, "fuzzy": fuzzy_results}

        fused = reciprocal_rank_fusion(results_map)

        # Should have all unique paths
        paths = [r.path for r in fused]
        assert set(paths) == {"a.py", "b.py", "c.py", "d.py"}

        # Results appearing in both should rank higher
        # b.py and c.py appear in both sources
        assert fused[0].path in ["b.py", "c.py"], "Items in both sources should rank highest"

    def test_rrf_score_calculation(self):
        """Test RRF scores are calculated correctly with default k=60."""
        # Simple scenario: single source
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map, k=60)

        # RRF score = weight / (k + rank) = 1.0 / (60 + 1) ≈ 0.0164
        expected_score = 1.0 / 61
        assert abs(fused[0].score - expected_score) < 0.001

    def test_custom_weights(self):
        """Test custom weights affect RRF scores."""
        results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_b = [SearchResult(path="a.py", score=10.0, excerpt="...")]

        results_map = {"exact": results_a, "fuzzy": results_b}

        # Higher weight for exact
        weights = {"exact": 0.7, "fuzzy": 0.3}
        fused = reciprocal_rank_fusion(results_map, weights=weights, k=60)

        # Score should be: 0.7/(60+1) + 0.3/(60+1) = 1.0/61 ≈ 0.0164
        expected_score = (0.7 + 0.3) / 61
        assert abs(fused[0].score - expected_score) < 0.001

    def test_weight_normalization(self):
        """Test weights are normalized to sum to 1.0."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}

        # Weights not summing to 1.0
        weights = {"exact": 2.0}  # Will be normalized to 1.0
        fused = reciprocal_rank_fusion(results_map, weights=weights)

        # Should work without error and produce normalized scores
        assert len(fused) == 1
        assert fused[0].score > 0

    def test_empty_results_map(self):
        """Test RRF with empty results returns empty list."""
        fused = reciprocal_rank_fusion({})
        assert fused == []

    def test_zero_weight_source_ignored(self):
        """Test sources with zero weight are ignored."""
        results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_b = [SearchResult(path="b.py", score=10.0, excerpt="...")]

        results_map = {"exact": results_a, "fuzzy": results_b}
        weights = {"exact": 1.0, "fuzzy": 0.0}  # Ignore fuzzy

        fused = reciprocal_rank_fusion(results_map, weights=weights)

        # Should only have result from exact source
        assert len(fused) == 1
        assert fused[0].path == "a.py"

    def test_fusion_score_in_metadata(self):
        """Test fusion score is stored in result metadata."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map)

        # Check metadata
        assert "fusion_score" in fused[0].metadata
        assert "original_score" in fused[0].metadata
        assert fused[0].metadata["original_score"] == 10.0

    def test_rank_order_matters(self):
        """Test rank position affects RRF score (lower rank = higher score)."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),  # rank 1
            SearchResult(path="b.py", score=8.0, excerpt="..."),  # rank 2
            SearchResult(path="c.py", score=6.0, excerpt="..."),  # rank 3
        ]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map, k=60)

        # a.py (rank 1): score = 1/(60+1) ≈ 0.0164
        # b.py (rank 2): score = 1/(60+2) ≈ 0.0161
        # c.py (rank 3): score = 1/(60+3) ≈ 0.0159
        assert fused[0].score > fused[1].score > fused[2].score


class TestRRFSyntheticRankings:
    """Tests with synthetic rankings to verify RRF correctness."""

    def test_perfect_agreement(self):
        """Test RRF when all sources rank items identically."""
        # All sources rank a > b > c
        exact = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        fuzzy = [
            SearchResult(path="a.py", score=9.0, excerpt="..."),
            SearchResult(path="b.py", score=7.0, excerpt="..."),
            SearchResult(path="c.py", score=5.0, excerpt="..."),
        ]

        results_map = {"exact": exact, "fuzzy": fuzzy}
        fused = reciprocal_rank_fusion(results_map)

        # Order should match both sources
        assert fused[0].path == "a.py"
        assert fused[1].path == "b.py"
        assert fused[2].path == "c.py"

    def test_complete_disagreement(self):
        """Test RRF when sources have opposite rankings."""
        # exact: a > b > c
        # fuzzy: c > b > a
        exact = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        fuzzy = [
            SearchResult(path="c.py", score=9.0, excerpt="..."),
            SearchResult(path="b.py", score=7.0, excerpt="..."),
            SearchResult(path="a.py", score=5.0, excerpt="..."),
        ]

        results_map = {"exact": exact, "fuzzy": fuzzy}
        fused = reciprocal_rank_fusion(results_map)

        # With opposite rankings, a.py and c.py get equal RRF scores:
        # a.py: 0.5/(60+1) + 0.5/(60+3) ≈ 0.016134
        # c.py: 0.5/(60+3) + 0.5/(60+1) ≈ 0.016134 (same)
        # b.py: 0.5/(60+2) + 0.5/(60+2) ≈ 0.016129, slightly lower because
        # 1/(k + rank) is convex: mixing ranks 1 and 3 beats rank 2 in both sources.
        # So the top result should be a.py or c.py (tied)
        assert fused[0].path in ["a.py", "c.py"], "Items with symmetric ranks should tie for first"

    def test_partial_overlap(self):
        """Test RRF with partial overlap between sources."""
        # exact: [A, B, C]
        # fuzzy: [B, C, D]
        exact = [
            SearchResult(path="A", score=10.0, excerpt="..."),
            SearchResult(path="B", score=8.0, excerpt="..."),
            SearchResult(path="C", score=6.0, excerpt="..."),
        ]
        fuzzy = [
            SearchResult(path="B", score=9.0, excerpt="..."),
            SearchResult(path="C", score=7.0, excerpt="..."),
            SearchResult(path="D", score=5.0, excerpt="..."),
        ]

        results_map = {"exact": exact, "fuzzy": fuzzy}
        fused = reciprocal_rank_fusion(results_map)

        # B and C appear in both, should rank higher than A and D
        paths = [r.path for r in fused]
        b_idx = paths.index("B")
        c_idx = paths.index("C")
        a_idx = paths.index("A")
        d_idx = paths.index("D")

        assert b_idx < a_idx, "B (in both) should outrank A (in one)"
        assert c_idx < d_idx, "C (in both) should outrank D (in one)"

    def test_three_sources(self):
        """Test RRF with three sources (exact, fuzzy, vector)."""
        exact = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        fuzzy = [SearchResult(path="b.py", score=9.0, excerpt="...")]
        vector = [SearchResult(path="c.py", score=8.0, excerpt="...")]

        results_map = {"exact": exact, "fuzzy": fuzzy, "vector": vector}
        weights = {"exact": 0.4, "fuzzy": 0.3, "vector": 0.3}

        fused = reciprocal_rank_fusion(results_map, weights=weights)

        assert len(fused) == 3
        # Each appears in one source only, so scores differ by weights
        # a.py: 0.4/61 ≈ 0.0066
        # b.py: 0.3/61 ≈ 0.0049
        # c.py: 0.3/61 ≈ 0.0049
        assert fused[0].path == "a.py", "Exact (higher weight) should rank first"
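

# NOTE: SQLite FTS5 reports BM25 as a negative number, where more negative means a
# better match. The tests below only pin down the contract (output in [0, 1],
# increasing as the raw score gets more negative). One formula that would satisfy
# that contract, shown purely as an illustrative assumption and not necessarily
# what codexlens.search.ranking implements, is:
#     normalized = min(max(-score / (1.0 - score), 0.0), 1.0)
# e.g. score 0.0 -> 0.0, score -1.0 -> 0.5, score -20.0 -> ~0.95.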
class TestNormalizeBM25Score:
    """Tests for normalize_bm25_score function."""

    def test_negative_bm25_normalization(self):
        """Test BM25 scores (negative) are normalized to 0-1 range."""
        # SQLite FTS5 returns negative BM25 scores
        scores = [-20.0, -10.0, -5.0, -1.0, 0.0]

        for score in scores:
            normalized = normalize_bm25_score(score)
            assert 0.0 <= normalized <= 1.0, f"Normalized score {normalized} out of range"

    def test_better_match_higher_score(self):
        """Test more negative BM25 (better match) gives higher normalized score."""
        good_match = -15.0
        weak_match = -2.0

        norm_good = normalize_bm25_score(good_match)
        norm_weak = normalize_bm25_score(weak_match)

        assert norm_good > norm_weak, "Better match should have higher normalized score"

    def test_zero_score(self):
        """Test zero BM25 score normalization."""
        normalized = normalize_bm25_score(0.0)
        assert 0.0 <= normalized <= 1.0

    def test_positive_score_handling(self):
        """Test positive scores (edge case) are handled."""
        normalized = normalize_bm25_score(5.0)
        # Should still be in valid range
        assert 0.0 <= normalized <= 1.0


class TestTagSearchSource:
    """Tests for tag_search_source function."""

    def test_tagging_adds_source_metadata(self):
        """Test tagging adds search_source to metadata."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
        ]

        tagged = tag_search_source(results, "exact")

        for result in tagged:
            assert "search_source" in result.metadata
            assert result.metadata["search_source"] == "exact"

    def test_tagging_preserves_existing_metadata(self):
        """Test tagging preserves existing metadata fields."""
        results = [
            SearchResult(
                path="a.py",
                score=10.0,
                excerpt="...",
                metadata={"custom_field": "value"},
            ),
        ]

        tagged = tag_search_source(results, "fuzzy")

        assert "custom_field" in tagged[0].metadata
        assert tagged[0].metadata["custom_field"] == "value"
        assert "search_source" in tagged[0].metadata
        assert tagged[0].metadata["search_source"] == "fuzzy"

    def test_tagging_empty_list(self):
        """Test tagging empty list returns empty list."""
        tagged = tag_search_source([], "exact")
        assert tagged == []

    def test_tagging_preserves_result_fields(self):
        """Test tagging preserves all SearchResult fields."""
        results = [
            SearchResult(
                path="a.py",
                score=10.0,
                excerpt="test excerpt",
                content="full content",
                start_line=10,
                end_line=20,
                symbol_name="test_func",
                symbol_kind="function",
            ),
        ]

        tagged = tag_search_source(results, "exact")

        assert tagged[0].path == "a.py"
        assert tagged[0].score == 10.0
        assert tagged[0].excerpt == "test excerpt"
        assert tagged[0].content == "full content"
        assert tagged[0].start_line == 10
        assert tagged[0].end_line == 20
        assert tagged[0].symbol_name == "test_func"
        assert tagged[0].symbol_kind == "function"
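

# The parameterized sweep below varies the RRF constant k. Larger k flattens the
# difference between adjacent ranks (worked arithmetic, assuming a weight of 1.0):
# with k=30 the gap between rank 1 and rank 2 is 1/31 - 1/32 ≈ 0.00101, while
# with k=100 it shrinks to 1/101 - 1/102 ≈ 0.000097.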
@pytest.mark.parametrize("k_value", [30, 60, 100])
class TestRRFParameterized:
    """Parameterized tests for RRF with different k values."""

    def test_k_value_affects_scores(self, k_value):
        """Test k parameter affects RRF score magnitude."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map, k=k_value)

        # Score should be 1.0 / (k + 1)
        expected = 1.0 / (k_value + 1)
        assert abs(fused[0].score - expected) < 0.001


class TestRRFEdgeCases:
    """Edge case tests for RRF."""

    def test_duplicate_paths_in_same_source(self):
        """Test handling of duplicate paths in single source."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="a.py", score=8.0, excerpt="..."),  # Duplicate
        ]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map)

        # Should deduplicate (first occurrence wins)
        assert len(fused) == 1
        assert fused[0].path == "a.py"

    def test_very_large_result_lists(self):
        """Test RRF handles large result sets efficiently."""
        # Create 1000 results
        results = [
            SearchResult(path=f"file{i}.py", score=1000 - i, excerpt="...")
            for i in range(1000)
        ]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map)

        assert len(fused) == 1000
        # Should maintain ranking
        assert fused[0].path == "file0.py"
        assert fused[-1].path == "file999.py"

    def test_all_same_score(self):
        """Test RRF when all results have same original score."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=10.0, excerpt="..."),
            SearchResult(path="c.py", score=10.0, excerpt="..."),
        ]
        results_map = {"exact": results}

        fused = reciprocal_rank_fusion(results_map)

        # Should still rank by position (rank matters)
        assert len(fused) == 3
        assert fused[0].score > fused[1].score > fused[2].score

    def test_missing_weight_for_source(self):
        """Test missing weight for source uses default."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results, "fuzzy": results}

        # Only provide weight for exact
        weights = {"exact": 1.0}

        fused = reciprocal_rank_fusion(results_map, weights=weights)

        # Should work with normalization
        assert len(fused) == 1  # Deduplicated
        assert fused[0].score > 0