Claude-Code-Workflow/codex-lens/tests/test_rrf_fusion.py

"""Tests for Reciprocal Rank Fusion (RRF) algorithm (P2).
Tests RRF fusion logic, score computation, weight handling, and result ranking.
"""
import pytest
from codexlens.entities import SearchResult
from codexlens.search.ranking import (
    normalize_bm25_score,
    reciprocal_rank_fusion,
    tag_search_source,
)
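

# The expectations in this module assume the standard weighted RRF formula
# (Cormack, Clarke & Buettcher, SIGIR 2009):
#
#     fused(d) = sum over sources s of  w_s / (k + rank_s(d))
#
# where rank_s(d) is d's 1-based rank in source s and the weights w_s are
# normalized to sum to 1.0. The helper below is a minimal pure-Python sketch
# of that formula for hand-checking expected values; it is an assumption about
# the behavior under test, not the implementation in codexlens.search.ranking
# (note: unlike the real function, it does not deduplicate repeated paths
# within a single source).
def _reference_rrf_scores(results_map, weights=None, k=60):
    """Return {path: fused_score} under the assumed weighted RRF formula."""
    weights = dict(weights or {})
    for name in results_map:
        weights.setdefault(name, 1.0)  # assumed default for unlisted sources
    total = sum(weights.values()) or 1.0
    scores = {}
    for name, ranked in results_map.items():
        w = weights[name] / total
        if w == 0.0:
            continue  # zero-weight sources contribute nothing
        for rank, result in enumerate(ranked, start=1):
            scores[result.path] = scores.get(result.path, 0.0) + w / (k + rank)
    return scores
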
class TestReciprocalRankFusion:
    """Tests for reciprocal_rank_fusion function."""

    def test_single_source_ranking(self):
        """Test RRF with single source returns ranked results."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map)
        assert len(fused) == 3
        # Order should be preserved (highest original score first)
        assert fused[0].path == "a.py"
        assert fused[1].path == "b.py"
        assert fused[2].path == "c.py"

    def test_two_sources_fusion(self):
        """Test RRF combines rankings from two sources."""
        exact_results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        fuzzy_results = [
            SearchResult(path="b.py", score=9.0, excerpt="..."),
            SearchResult(path="c.py", score=7.0, excerpt="..."),
            SearchResult(path="d.py", score=5.0, excerpt="..."),
        ]
        results_map = {"exact": exact_results, "fuzzy": fuzzy_results}
        fused = reciprocal_rank_fusion(results_map)
        # Should have all unique paths
        paths = [r.path for r in fused]
        assert set(paths) == {"a.py", "b.py", "c.py", "d.py"}
        # Results appearing in both should rank higher
        # (b.py and c.py appear in both sources)
        assert fused[0].path in ["b.py", "c.py"], "Items in both sources should rank highest"

    def test_rrf_score_calculation(self):
        """Test RRF scores are calculated correctly with default k=60."""
        # Simple scenario: single source
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map, k=60)
        # RRF score = weight / (k + rank) = 1.0 / (60 + 1) ≈ 0.0164
        expected_score = 1.0 / 61
        assert abs(fused[0].score - expected_score) < 0.001

    def test_custom_weights(self):
        """Test custom weights affect RRF scores."""
        results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_b = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results_a, "fuzzy": results_b}
        # Higher weight for exact
        weights = {"exact": 0.7, "fuzzy": 0.3}
        fused = reciprocal_rank_fusion(results_map, weights=weights, k=60)
        # Score should be: 0.7/(60+1) + 0.3/(60+1) = 1.0/61 ≈ 0.0164
        expected_score = (0.7 + 0.3) / 61
        assert abs(fused[0].score - expected_score) < 0.001

    def test_weight_normalization(self):
        """Test weights are normalized to sum to 1.0."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}
        # Weights not summing to 1.0
        weights = {"exact": 2.0}  # Will be normalized to 1.0
        fused = reciprocal_rank_fusion(results_map, weights=weights)
        # Should work without error and produce normalized scores
        assert len(fused) == 1
        assert fused[0].score > 0

    def test_empty_results_map(self):
        """Test RRF with empty results returns empty list."""
        fused = reciprocal_rank_fusion({})
        assert fused == []

    def test_zero_weight_source_ignored(self):
        """Test sources with zero weight are ignored."""
        results_a = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_b = [SearchResult(path="b.py", score=10.0, excerpt="...")]
        results_map = {"exact": results_a, "fuzzy": results_b}
        weights = {"exact": 1.0, "fuzzy": 0.0}  # Ignore fuzzy
        fused = reciprocal_rank_fusion(results_map, weights=weights)
        # Should only have the result from the exact source
        assert len(fused) == 1
        assert fused[0].path == "a.py"

    def test_fusion_score_in_metadata(self):
        """Test fusion score is stored in result metadata."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map)
        # Check metadata
        assert "fusion_score" in fused[0].metadata
        assert "original_score" in fused[0].metadata
        assert fused[0].metadata["original_score"] == 10.0

    def test_rank_order_matters(self):
        """Test rank position affects RRF score (lower rank = higher score)."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),  # rank 1
            SearchResult(path="b.py", score=8.0, excerpt="..."),  # rank 2
            SearchResult(path="c.py", score=6.0, excerpt="..."),  # rank 3
        ]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map, k=60)
        # a.py (rank 1): score = 1/(60+1) ≈ 0.0164
        # b.py (rank 2): score = 1/(60+2) ≈ 0.0161
        # c.py (rank 3): score = 1/(60+3) ≈ 0.0159
        assert fused[0].score > fused[1].score > fused[2].score
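

# Worked numbers for test_two_sources_fusion in TestReciprocalRankFusion
# above, under the assumed formula with equal normalized weights (0.5) and
# k = 60:
#     b.py: 0.5/(60+2) + 0.5/(60+1) ≈ 0.01626  (rank 2 exact, rank 1 fuzzy)
#     c.py: 0.5/(60+3) + 0.5/(60+2) ≈ 0.01600  (rank 3 exact, rank 2 fuzzy)
#     a.py: 0.5/(60+1)              ≈ 0.00820  (rank 1, exact only)
#     d.py: 0.5/(60+3)              ≈ 0.00794  (rank 3, fuzzy only)
# which is why items found by both sources are expected to rank highest.
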
class TestRRFSyntheticRankings:
    """Tests with synthetic rankings to verify RRF correctness."""

    def test_perfect_agreement(self):
        """Test RRF when all sources rank items identically."""
        # All sources rank a > b > c
        exact = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        fuzzy = [
            SearchResult(path="a.py", score=9.0, excerpt="..."),
            SearchResult(path="b.py", score=7.0, excerpt="..."),
            SearchResult(path="c.py", score=5.0, excerpt="..."),
        ]
        results_map = {"exact": exact, "fuzzy": fuzzy}
        fused = reciprocal_rank_fusion(results_map)
        # Order should match both sources
        assert fused[0].path == "a.py"
        assert fused[1].path == "b.py"
        assert fused[2].path == "c.py"

    def test_complete_disagreement(self):
        """Test RRF when sources have opposite rankings."""
        # exact: a > b > c
        # fuzzy: c > b > a
        exact = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
            SearchResult(path="c.py", score=6.0, excerpt="..."),
        ]
        fuzzy = [
            SearchResult(path="c.py", score=9.0, excerpt="..."),
            SearchResult(path="b.py", score=7.0, excerpt="..."),
            SearchResult(path="a.py", score=5.0, excerpt="..."),
        ]
        results_map = {"exact": exact, "fuzzy": fuzzy}
        fused = reciprocal_rank_fusion(results_map)
        # With opposite rankings, a.py and c.py get equal RRF scores:
        #   a.py: 0.5/(60+1) + 0.5/(60+3) ≈ 0.016133
        #   c.py: 0.5/(60+3) + 0.5/(60+1) ≈ 0.016133 (same!)
        #   b.py: 0.5/(60+2) + 0.5/(60+2) ≈ 0.016129
        # b.py is genuinely (if barely) lower, not a rounding artifact: by
        # convexity of x -> 1/x, 1/(k+1) + 1/(k+3) > 2/(k+2).
        # So the top result should be a.py or c.py (tied).
        assert fused[0].path in ["a.py", "c.py"], "Items with symmetric ranks should tie for first"

    def test_partial_overlap(self):
        """Test RRF with partial overlap between sources."""
        # exact: [A, B, C]
        # fuzzy: [B, C, D]
        exact = [
            SearchResult(path="A", score=10.0, excerpt="..."),
            SearchResult(path="B", score=8.0, excerpt="..."),
            SearchResult(path="C", score=6.0, excerpt="..."),
        ]
        fuzzy = [
            SearchResult(path="B", score=9.0, excerpt="..."),
            SearchResult(path="C", score=7.0, excerpt="..."),
            SearchResult(path="D", score=5.0, excerpt="..."),
        ]
        results_map = {"exact": exact, "fuzzy": fuzzy}
        fused = reciprocal_rank_fusion(results_map)
        # B and C appear in both, should rank higher than A and D
        paths = [r.path for r in fused]
        b_idx = paths.index("B")
        c_idx = paths.index("C")
        a_idx = paths.index("A")
        d_idx = paths.index("D")
        assert b_idx < a_idx, "B (in both) should outrank A (in one)"
        assert c_idx < d_idx, "C (in both) should outrank D (in one)"

    def test_three_sources(self):
        """Test RRF with three sources (exact, fuzzy, vector)."""
        exact = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        fuzzy = [SearchResult(path="b.py", score=9.0, excerpt="...")]
        vector = [SearchResult(path="c.py", score=8.0, excerpt="...")]
        results_map = {"exact": exact, "fuzzy": fuzzy, "vector": vector}
        weights = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6}
        fused = reciprocal_rank_fusion(results_map, weights=weights)
        assert len(fused) == 3
        # Each appears in one source only, so scores differ by weights:
        #   c.py: 0.6/61 ≈ 0.0098 (vector, highest weight)
        #   a.py: 0.3/61 ≈ 0.0049 (exact)
        #   b.py: 0.1/61 ≈ 0.0016 (fuzzy)
        assert fused[0].path == "c.py", "Vector (higher weight) should rank first"
class TestNormalizeBM25Score:
    """Tests for normalize_bm25_score function."""

    def test_negative_bm25_normalization(self):
        """Test BM25 scores (negative) are normalized to 0-1 range."""
        # SQLite FTS5 returns negative BM25 scores
        scores = [-20.0, -10.0, -5.0, -1.0, 0.0]
        for score in scores:
            normalized = normalize_bm25_score(score)
            assert 0.0 <= normalized <= 1.0, f"Normalized score {normalized} out of range"

    def test_better_match_higher_score(self):
        """Test more negative BM25 (better match) gives higher normalized score."""
        good_match = -15.0
        weak_match = -2.0
        norm_good = normalize_bm25_score(good_match)
        norm_weak = normalize_bm25_score(weak_match)
        assert norm_good > norm_weak, "Better match should have higher normalized score"

    def test_zero_score(self):
        """Test zero BM25 score normalization."""
        normalized = normalize_bm25_score(0.0)
        assert 0.0 <= normalized <= 1.0

    def test_positive_score_handling(self):
        """Test positive scores (edge case) are handled."""
        normalized = normalize_bm25_score(5.0)
        # Should still be in valid range
        assert 0.0 <= normalized <= 1.0
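

# One normalizer that satisfies every property asserted above (bounded to
# [0, 1], monotone decreasing in the raw score, defined at 0.0 and for stray
# positive values) is a logistic squash of the raw FTS5 BM25 score. This is
# an illustrative assumption, not necessarily the formula implemented by
# codexlens.search.ranking.normalize_bm25_score:
def _logistic_bm25_normalize(raw: float) -> float:
    """Sketch: map an FTS5 BM25 score (more negative = better) into (0, 1)."""
    import math

    # exp(-20) ≈ 2e-9, so strong matches approach 1.0; raw = 0.0 maps to 0.5.
    return 1.0 / (1.0 + math.exp(raw))
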
class TestTagSearchSource:
    """Tests for tag_search_source function."""

    def test_tagging_adds_source_metadata(self):
        """Test tagging adds search_source to metadata."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=8.0, excerpt="..."),
        ]
        tagged = tag_search_source(results, "exact")
        for result in tagged:
            assert "search_source" in result.metadata
            assert result.metadata["search_source"] == "exact"

    def test_tagging_preserves_existing_metadata(self):
        """Test tagging preserves existing metadata fields."""
        results = [
            SearchResult(
                path="a.py",
                score=10.0,
                excerpt="...",
                metadata={"custom_field": "value"},
            ),
        ]
        tagged = tag_search_source(results, "fuzzy")
        assert "custom_field" in tagged[0].metadata
        assert tagged[0].metadata["custom_field"] == "value"
        assert "search_source" in tagged[0].metadata
        assert tagged[0].metadata["search_source"] == "fuzzy"

    def test_tagging_empty_list(self):
        """Test tagging empty list returns empty list."""
        tagged = tag_search_source([], "exact")
        assert tagged == []

    def test_tagging_preserves_result_fields(self):
        """Test tagging preserves all SearchResult fields."""
        results = [
            SearchResult(
                path="a.py",
                score=10.0,
                excerpt="test excerpt",
                content="full content",
                start_line=10,
                end_line=20,
                symbol_name="test_func",
                symbol_kind="function",
            ),
        ]
        tagged = tag_search_source(results, "exact")
        assert tagged[0].path == "a.py"
        assert tagged[0].score == 10.0
        assert tagged[0].excerpt == "test excerpt"
        assert tagged[0].content == "full content"
        assert tagged[0].start_line == 10
        assert tagged[0].end_line == 20
        assert tagged[0].symbol_name == "test_func"
        assert tagged[0].symbol_kind == "function"
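

# Sketch of the assumed call pattern in the hybrid search pipeline: tag each
# retriever's results with their source before handing them to RRF fusion.
# (Illustrative usage only; the actual call sites live elsewhere in
# codexlens.search.)
#
#     exact = tag_search_source(exact_results, "exact")
#     fuzzy = tag_search_source(fuzzy_results, "fuzzy")
#     fused = reciprocal_rank_fusion({"exact": exact, "fuzzy": fuzzy})
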
@pytest.mark.parametrize("k_value", [30, 60, 100])
class TestRRFParameterized:
    """Parameterized tests for RRF with different k values."""

    def test_k_value_affects_scores(self, k_value):
        """Test k parameter affects RRF score magnitude."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map, k=k_value)
        # Score should be 1.0 / (k + 1)
        expected = 1.0 / (k_value + 1)
        assert abs(fused[0].score - expected) < 0.001
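

# Why k matters: adjacent ranks differ by a factor of (k + r + 1) / (k + r),
# which approaches 1 as k grows, so a larger k flattens the fused ranking and
# softens the dominance of any single source's top hit. k = 60 is the value
# used in the original RRF paper's evaluation.
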
class TestRRFEdgeCases:
    """Edge case tests for RRF."""

    def test_duplicate_paths_in_same_source(self):
        """Test handling of duplicate paths in single source."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="a.py", score=8.0, excerpt="..."),  # Duplicate
        ]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map)
        # Should deduplicate (first occurrence wins)
        assert len(fused) == 1
        assert fused[0].path == "a.py"

    def test_very_large_result_lists(self):
        """Test RRF handles large result sets efficiently."""
        # Create 1000 results
        results = [
            SearchResult(path=f"file{i}.py", score=1000.0 - i, excerpt="...")
            for i in range(1000)
        ]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map)
        assert len(fused) == 1000
        # Should maintain ranking
        assert fused[0].path == "file0.py"
        assert fused[-1].path == "file999.py"

    def test_all_same_score(self):
        """Test RRF when all results have same original score."""
        results = [
            SearchResult(path="a.py", score=10.0, excerpt="..."),
            SearchResult(path="b.py", score=10.0, excerpt="..."),
            SearchResult(path="c.py", score=10.0, excerpt="..."),
        ]
        results_map = {"exact": results}
        fused = reciprocal_rank_fusion(results_map)
        # Should still rank by position (rank matters)
        assert len(fused) == 3
        assert fused[0].score > fused[1].score > fused[2].score

    def test_missing_weight_for_source(self):
        """Test missing weight for source uses default."""
        results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
        results_map = {"exact": results, "fuzzy": results}
        # Only provide weight for exact
        weights = {"exact": 1.0}
        fused = reciprocal_rank_fusion(results_map, weights=weights)
        # Should work with normalization
        assert len(fused) == 1  # Deduplicated
        assert fused[0].score > 0
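

# Assumed invocation (from the codex-lens package root):
#     pytest tests/test_rrf_fusion.py -v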