feat: Add search result grouping by similarity score

Add functionality to group search results with similar content and scores
into a single representative result with additional locations.

Changes:
- Add AdditionalLocation entity model for storing grouped result locations
- Add additional_locations field to SearchResult for backward compatibility
- Implement group_similar_results() function in ranking.py with:
  - Content-based grouping (by excerpt or content field)
  - Score-based sub-grouping with configurable threshold
  - Metadata preservation with grouped_count tracking
- Add group_results and grouping_threshold options to SearchOptions
- Integrate grouping into ChainSearchEngine.search() after RRF fusion

Test coverage:
- 36 multi-level tests covering unit, boundary, integration, and performance
- Real-world scenario tests for RRF scores and duplicate code detection

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
catlog22
2025-12-19 16:33:44 +08:00
parent 3428642d04
commit 7adde91e9f
4 changed files with 738 additions and 2 deletions

View File

@@ -90,6 +90,21 @@ class CodeRelationship(BaseModel):
return value
class AdditionalLocation(BaseModel):
    """A pointer to another location where a similar result was found.

    Used for grouping search results with similar scores and content,
    where the primary result is stored in SearchResult and secondary
    locations are stored in this model.
    """

    # Filesystem path of the secondary match; min_length=1 rejects "".
    path: str = Field(..., min_length=1)
    # Relevance score of the secondary match; ge=0.0 rejects negatives.
    score: float = Field(..., ge=0.0)
    # Line information is optional: not every search backend supplies it.
    start_line: Optional[int] = Field(default=None, description="Start line of the result (1-based)")
    end_line: Optional[int] = Field(default=None, description="End line of the result (1-based)")
    symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol")
class SearchResult(BaseModel):
"""A unified search result for lexical or semantic search."""
@@ -100,10 +115,16 @@ class SearchResult(BaseModel):
symbol: Optional[Symbol] = None
chunk: Optional[SemanticChunk] = None
metadata: Dict[str, Any] = Field(default_factory=dict)
# Additional context for complete code blocks
start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")
# Field for grouping similar results
additional_locations: List["AdditionalLocation"] = Field(
default_factory=list,
description="Other locations for grouped results with similar scores and content."
)

View File

@@ -38,6 +38,8 @@ class SearchOptions:
enable_vector: Enable vector semantic search (default False)
pure_vector: If True, only use vector search without FTS fallback (default False)
hybrid_weights: Custom RRF weights for hybrid search (optional)
group_results: Enable grouping of similar results (default False)
grouping_threshold: Score threshold for grouping similar results (default 0.01)
"""
depth: int = -1
max_workers: int = 8
@@ -51,6 +53,8 @@ class SearchOptions:
enable_vector: bool = False
pure_vector: bool = False
hybrid_weights: Optional[Dict[str, float]] = None
group_results: bool = False
grouping_threshold: float = 0.01
@dataclass
@@ -210,6 +214,14 @@ class ChainSearchEngine:
# Step 4: Merge and rank
final_results = self._merge_and_rank(results, options.total_limit)
# Step 5: Optional grouping of similar results
if options.group_results:
from codexlens.search.ranking import group_similar_results
final_results = group_similar_results(
final_results, score_threshold_abs=options.grouping_threshold
)
stats.files_matched = len(final_results)
# Optional: Symbol search

View File

@@ -9,7 +9,7 @@ from __future__ import annotations
import math
from typing import Dict, List
from codexlens.entities import SearchResult
from codexlens.entities import SearchResult, AdditionalLocation
def reciprocal_rank_fusion(
@@ -158,3 +158,117 @@ def tag_search_source(results: List[SearchResult], source: str) -> List[SearchRe
tagged_results.append(tagged_result)
return tagged_results
def group_similar_results(
    results: List[SearchResult],
    score_threshold_abs: float = 0.01,
    content_field: str = "excerpt",
) -> List[SearchResult]:
    """Group search results by content and score similarity.

    Groups results that have similar content and similar scores into a single
    representative result, with other locations stored in additional_locations.

    Algorithm:
        1. Group results by content (using excerpt or content field)
        2. Within each content group, create subgroups based on score similarity
        3. Select highest-scoring result as representative for each subgroup
        4. Store other results in subgroup as additional_locations

    Args:
        results: A list of SearchResult objects (typically sorted by score)
        score_threshold_abs: Absolute score difference to consider results similar.
            Results with |score_a - score_b| <= threshold are grouped.
            Default 0.01 is suitable for RRF fusion scores.
        content_field: The field to use for content grouping ('excerpt' or 'content')

    Returns:
        A new list of SearchResult objects where similar items are grouped.
        The list is sorted by score descending. Input objects are not mutated;
        any additional_locations already present on inputs are preserved.

    Examples:
        >>> results = [SearchResult(path="a.py", score=0.5, excerpt="def foo()"),
        ...            SearchResult(path="b.py", score=0.5, excerpt="def foo()")]
        >>> grouped = group_similar_results(results)
        >>> len(grouped)  # Two results merged into one
        1
        >>> len(grouped[0].additional_locations)  # One additional location
        1
    """
    if not results:
        return []

    # Bucket results by their grouping key. Rows without usable content
    # (missing field, None, empty/whitespace-only string) can never be
    # grouped by content and pass through unchanged.
    content_map: Dict[str, List[SearchResult]] = {}
    ungroupable: List[SearchResult] = []
    for result in results:
        key = getattr(result, content_field, None)
        if key and key.strip():
            content_map.setdefault(key, []).append(result)
        else:
            ungroupable.append(result)

    final_results: List[SearchResult] = []

    for content_group in content_map.values():
        # Highest score first so each pass's representative is the best row
        # of its score-similarity subgroup.
        content_group.sort(key=lambda r: r.score, reverse=True)
        while content_group:
            representative = content_group.pop(0)
            # Partition the remainder into "close enough to merge" and
            # "left for the next pass" relative to the representative.
            similar = [
                item
                for item in content_group
                if abs(representative.score - item.score) <= score_threshold_abs
            ]
            content_group = [
                item
                for item in content_group
                if abs(representative.score - item.score) > score_threshold_abs
            ]
            if similar:
                final_results.append(_merge_group(representative, similar))
            else:
                final_results.append(representative)

    final_results.extend(ungroupable)
    # Grouping may interleave buckets; restore best-first ordering.
    final_results.sort(key=lambda r: r.score, reverse=True)
    return final_results


def _merge_group(
    representative: SearchResult, others: List[SearchResult]
) -> SearchResult:
    """Fold *others* into *representative* as additional locations.

    Pre-existing additional_locations on the representative or on any merged
    member are preserved (flattened), so re-grouping an already-grouped list
    does not silently drop locations. model_copy is used instead of rebuilding
    the object field-by-field so any future SearchResult fields carry over.
    """
    locations: List[AdditionalLocation] = list(representative.additional_locations)
    for other in others:
        locations.append(
            AdditionalLocation(
                path=other.path,
                score=other.score,
                start_line=other.start_line,
                end_line=other.end_line,
                symbol_name=other.symbol_name,
            )
        )
        # Keep any locations `other` had already accumulated.
        locations.extend(other.additional_locations)
    return representative.model_copy(
        update={
            "metadata": {
                **representative.metadata,
                # Total number of underlying rows this result represents.
                "grouped_count": len(locations) + 1,
            },
            "additional_locations": locations,
        }
    )

View File

@@ -0,0 +1,589 @@
"""Multi-level tests for search result grouping functionality.
Tests cover:
1. Unit tests for group_similar_results function
2. Boundary condition tests
3. Integration tests with SearchOptions
4. Performance/stress tests
"""
import pytest
from typing import List
from codexlens.entities import SearchResult, AdditionalLocation
from codexlens.search.ranking import group_similar_results
from codexlens.search.chain_search import SearchOptions
# =============================================================================
# Test Fixtures
# =============================================================================
@pytest.fixture
def sample_results() -> List[SearchResult]:
    """Four results: three share the ``foo`` excerpt, one is distinct."""
    specs = [
        ("a.py", 0.5, "def foo(): pass", 10, "foo"),
        ("b.py", 0.5, "def foo(): pass", 20, "foo"),
        ("c.py", 0.49, "def foo(): pass", 30, "foo"),
        ("d.py", 0.3, "def bar(): pass", 40, "bar"),
    ]
    return [
        SearchResult(path=p, score=s, excerpt=e, start_line=line, symbol_name=name)
        for p, s, e, line, name in specs
    ]
@pytest.fixture
def results_with_different_excerpts() -> List[SearchResult]:
    """Three results sharing one score but carrying distinct excerpts."""
    return [
        SearchResult(path=f"{stem}.py", score=0.5, excerpt=f"def {fn}(): pass")
        for stem, fn in [("a", "foo"), ("b", "bar"), ("c", "baz")]
    ]
@pytest.fixture
def results_with_same_excerpt_different_scores() -> List[SearchResult]:
    """One identical excerpt spread across three widely separated scores."""
    scores = {"a.py": 0.9, "b.py": 0.5, "c.py": 0.1}
    return [
        SearchResult(path=path, score=score, excerpt="def foo(): pass")
        for path, score in scores.items()
    ]
# =============================================================================
# Level 1: Unit Tests - Basic Functionality
# =============================================================================
class TestGroupSimilarResultsBasic:
    """Basic unit tests for group_similar_results function."""

    def test_empty_results_returns_empty(self):
        """Empty input should return empty output."""
        result = group_similar_results([])
        assert result == []

    def test_single_result_returns_unchanged(self):
        """Single result should be returned as-is."""
        single = SearchResult(path="test.py", score=0.5, excerpt="code")
        result = group_similar_results([single])
        assert len(result) == 1
        assert result[0].path == "test.py"
        # A lone result has nothing to merge with, so no extra locations.
        assert result[0].additional_locations == []

    def test_groups_identical_excerpt_similar_score(self, sample_results):
        """Results with same excerpt and similar scores should be grouped."""
        # Threshold 0.02 covers the 0.5 vs 0.49 spread in the fixture.
        grouped = group_similar_results(sample_results, score_threshold_abs=0.02)
        # Should have 2 groups: foo group (a, b, c) and bar (d)
        assert len(grouped) == 2
        # First group should have additional locations
        foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass")
        assert len(foo_group.additional_locations) == 2
        # Second group (bar) should have no additional locations
        bar_group = next(r for r in grouped if r.excerpt == "def bar(): pass")
        assert len(bar_group.additional_locations) == 0

    def test_preserves_highest_score_as_representative(self, sample_results):
        """Representative result should have the highest score in group."""
        grouped = group_similar_results(sample_results, score_threshold_abs=0.02)
        foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass")
        # a.py has score 0.5, which is highest
        assert foo_group.path == "a.py"
        assert foo_group.score == 0.5

    def test_additional_locations_contain_correct_info(self, sample_results):
        """Additional locations should contain correct path, score, line info."""
        grouped = group_similar_results(sample_results, score_threshold_abs=0.02)
        foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass")
        locations = foo_group.additional_locations
        paths = {loc.path for loc in locations}
        assert "b.py" in paths
        assert "c.py" in paths
        # Check that start_line is preserved
        for loc in locations:
            if loc.path == "b.py":
                assert loc.start_line == 20
            elif loc.path == "c.py":
                assert loc.start_line == 30
# =============================================================================
# Level 2: Boundary Condition Tests
# =============================================================================
class TestGroupSimilarResultsBoundary:
    """Boundary condition tests for edge cases."""

    def test_threshold_zero_no_grouping(self):
        """With threshold=0, only exactly equal scores should group."""
        results = [
            SearchResult(path="a.py", score=0.5, excerpt="def foo()"),
            SearchResult(path="b.py", score=0.5, excerpt="def foo()"),
            SearchResult(path="c.py", score=0.50001, excerpt="def foo()"),  # Slightly different
        ]
        grouped = group_similar_results(results, score_threshold_abs=0.0)
        # a and b should group (exact same score), c should be separate
        assert len(grouped) == 2
        main_group = next(r for r in grouped if len(r.additional_locations) > 0)
        assert len(main_group.additional_locations) == 1

    def test_threshold_exact_boundary(self):
        """Test behavior at exact threshold boundary.

        Note: Due to floating-point precision, 0.5 - 0.49 = 0.010000000000000009
        which is slightly > 0.01, so they won't group with threshold=0.01.
        Use a slightly larger threshold to account for floating-point precision.
        """
        results = [
            SearchResult(path="a.py", score=0.5, excerpt="def foo()"),
            SearchResult(path="b.py", score=0.49, excerpt="def foo()"),  # 0.01 diff (floating-point)
            SearchResult(path="c.py", score=0.48, excerpt="def foo()"),  # 0.02 diff from a
        ]
        # With threshold 0.011 (slightly above floating-point 0.01), a and b should group
        grouped = group_similar_results(results, score_threshold_abs=0.011)
        # a groups with b, c is separate (0.02 from a, 0.01 from b)
        # After a+b group, c is compared with remaining and forms its own group
        assert len(grouped) == 2
        # Verify a is representative (highest score)
        main_group = next(r for r in grouped if r.score == 0.5)
        assert main_group.path == "a.py"
        assert len(main_group.additional_locations) == 1
        assert main_group.additional_locations[0].path == "b.py"

    def test_large_threshold_groups_all(self):
        """Very large threshold should group all same-content results."""
        results = [
            SearchResult(path="a.py", score=0.9, excerpt="def foo()"),
            SearchResult(path="b.py", score=0.1, excerpt="def foo()"),
        ]
        # 1.0 exceeds the 0.8 score spread, so everything merges.
        grouped = group_similar_results(results, score_threshold_abs=1.0)
        assert len(grouped) == 1
        assert len(grouped[0].additional_locations) == 1

    def test_none_excerpt_not_grouped(self):
        """Results with None excerpt should not be grouped."""
        results = [
            SearchResult(path="a.py", score=0.5, excerpt=None),
            SearchResult(path="b.py", score=0.5, excerpt=None),
        ]
        grouped = group_similar_results(results)
        # None excerpts can't be grouped by content
        assert len(grouped) == 2
        for r in grouped:
            assert len(r.additional_locations) == 0

    def test_empty_excerpt_not_grouped(self):
        """Results with empty string excerpt should not be grouped."""
        results = [
            SearchResult(path="a.py", score=0.5, excerpt=""),
            SearchResult(path="b.py", score=0.5, excerpt=""),
            SearchResult(path="c.py", score=0.5, excerpt="   "),  # Whitespace only
        ]
        grouped = group_similar_results(results)
        # Empty/whitespace excerpts can't be grouped
        assert len(grouped) == 3

    def test_different_excerpts_not_grouped(self, results_with_different_excerpts):
        """Results with different excerpts should not be grouped even with same score."""
        # Huge threshold so only the content check can keep them apart.
        grouped = group_similar_results(results_with_different_excerpts, score_threshold_abs=1.0)
        # Different content = no grouping
        assert len(grouped) == 3
        for r in grouped:
            assert len(r.additional_locations) == 0

    def test_same_excerpt_different_scores_creates_subgroups(self, results_with_same_excerpt_different_scores):
        """Same content but very different scores should create separate subgroups."""
        grouped = group_similar_results(
            results_with_same_excerpt_different_scores,
            score_threshold_abs=0.1
        )
        # Scores 0.9, 0.5, 0.1 with threshold 0.1
        # 0.9 and 0.5 differ by 0.4 > 0.1, so separate
        # 0.5 and 0.1 differ by 0.4 > 0.1, so separate
        assert len(grouped) == 3
# =============================================================================
# Level 3: Content Field Tests
# =============================================================================
class TestGroupSimilarResultsContentField:
    """Tests for the content_field parameter of group_similar_results."""

    def test_group_by_content_field(self):
        """The grouping key can be switched from 'excerpt' to 'content'."""
        shared = "full content here"
        results = [
            SearchResult(path="a.py", score=0.5, excerpt="short", content=shared),
            SearchResult(path="b.py", score=0.5, excerpt="different", content=shared),
        ]
        # Keyed on excerpt the two rows differ, so nothing merges.
        by_excerpt = group_similar_results(results, content_field="excerpt")
        assert len(by_excerpt) == 2
        # Keyed on content they are identical and collapse into one.
        by_content = group_similar_results(results, content_field="content")
        assert len(by_content) == 1
        assert len(by_content[0].additional_locations) == 1

    def test_fallback_when_content_field_missing(self):
        """Rows lacking a value for the configured field stay ungrouped."""
        results = [
            SearchResult(path=name, score=0.5, content=None)
            for name in ("a.py", "b.py")
        ]
        grouped = group_similar_results(results, content_field="content")
        assert len(grouped) == 2
# =============================================================================
# Level 4: Metadata and Ordering Tests
# =============================================================================
class TestGroupSimilarResultsMetadata:
    """Tests covering metadata propagation and output ordering."""

    def test_grouped_count_in_metadata(self, sample_results):
        """A merged result records how many rows it represents."""
        grouped = group_similar_results(sample_results, score_threshold_abs=0.02)
        foo_group = next(r for r in grouped if r.excerpt == "def foo(): pass")
        assert "grouped_count" in foo_group.metadata
        # a.py, b.py and c.py collapse together.
        assert foo_group.metadata["grouped_count"] == 3

    def test_preserves_original_metadata(self):
        """Keys already present on the representative survive grouping."""
        rep = SearchResult(
            path="a.py",
            score=0.5,
            excerpt="def foo()",
            metadata={"original_key": "original_value", "fusion_score": 0.5},
        )
        dup = SearchResult(path="b.py", score=0.5, excerpt="def foo()")
        grouped = group_similar_results([rep, dup], score_threshold_abs=0.1)
        assert grouped[0].metadata["original_key"] == "original_value"
        assert grouped[0].metadata["fusion_score"] == 0.5

    def test_results_sorted_by_score_descending(self):
        """Output is ordered best-first regardless of input order."""
        results = [
            SearchResult(path="low.py", score=0.1, excerpt="low"),
            SearchResult(path="high.py", score=0.9, excerpt="high"),
            SearchResult(path="mid.py", score=0.5, excerpt="mid"),
        ]
        grouped = group_similar_results(results)
        scores = [r.score for r in grouped]
        assert scores == sorted(scores, reverse=True)
        assert scores == [0.9, 0.5, 0.1]
# =============================================================================
# Level 5: Integration Tests with SearchOptions
# =============================================================================
class TestSearchOptionsGrouping:
    """Integration tests for the grouping knobs on SearchOptions."""

    def test_search_options_default_grouping_disabled(self):
        """Grouping is off by default and the threshold defaults to 0.01."""
        opts = SearchOptions()
        assert opts.group_results is False
        assert opts.grouping_threshold == 0.01

    def test_search_options_enable_grouping(self):
        """The feature can be switched on explicitly."""
        assert SearchOptions(group_results=True).group_results is True

    def test_search_options_custom_threshold(self):
        """A caller-supplied threshold is stored verbatim."""
        opts = SearchOptions(group_results=True, grouping_threshold=0.05)
        assert opts.grouping_threshold == 0.05

    def test_search_options_all_parameters(self):
        """Grouping settings coexist with the other search options."""
        opts = SearchOptions(
            depth=3,
            max_workers=4,
            limit_per_dir=20,
            total_limit=200,
            include_symbols=True,
            hybrid_mode=True,
            group_results=True,
            grouping_threshold=0.02,
        )
        assert opts.depth == 3
        assert opts.group_results is True
        assert opts.grouping_threshold == 0.02
# =============================================================================
# Level 6: AdditionalLocation Entity Tests
# =============================================================================
class TestAdditionalLocationEntity:
    """Validation and serialization tests for AdditionalLocation."""

    def test_create_minimal_additional_location(self):
        """Only path and score are mandatory; the rest default to None."""
        loc = AdditionalLocation(path="test.py", score=0.5)
        assert (loc.path, loc.score) == ("test.py", 0.5)
        assert loc.start_line is None
        assert loc.end_line is None
        assert loc.symbol_name is None

    def test_create_full_additional_location(self):
        """Every optional field is stored when provided."""
        loc = AdditionalLocation(
            path="test.py",
            score=0.75,
            start_line=10,
            end_line=20,
            symbol_name="my_function",
        )
        assert loc.path == "test.py"
        assert loc.score == 0.75
        assert loc.start_line == 10
        assert loc.end_line == 20
        assert loc.symbol_name == "my_function"

    def test_additional_location_path_required(self):
        """Omitting path raises a validation error."""
        with pytest.raises(Exception):  # ValidationError
            AdditionalLocation(score=0.5)

    def test_additional_location_score_required(self):
        """Omitting score raises a validation error."""
        with pytest.raises(Exception):  # ValidationError
            AdditionalLocation(path="test.py")

    def test_additional_location_score_non_negative(self):
        """Negative scores are rejected by the ge=0 constraint."""
        with pytest.raises(Exception):  # ValidationError
            AdditionalLocation(path="test.py", score=-0.1)

    def test_additional_location_serialization(self):
        """model_dump round-trips the populated fields."""
        loc = AdditionalLocation(
            path="test.py",
            score=0.5,
            start_line=10,
            symbol_name="func",
        )
        data = loc.model_dump()
        expected = {
            "path": "test.py",
            "score": 0.5,
            "start_line": 10,
            "symbol_name": "func",
        }
        for key, value in expected.items():
            assert data[key] == value
# =============================================================================
# Level 7: SearchResult with AdditionalLocations Tests
# =============================================================================
class TestSearchResultWithAdditionalLocations:
    """Tests for the additional_locations field on SearchResult."""

    def test_search_result_default_empty_locations(self):
        """A freshly built result carries no grouped locations."""
        fresh = SearchResult(path="test.py", score=0.5)
        assert fresh.additional_locations == []

    def test_search_result_with_additional_locations(self):
        """Locations supplied at construction time are retained."""
        extra = AdditionalLocation(path="other.py", score=0.4, start_line=5)
        result = SearchResult(
            path="main.py",
            score=0.5,
            additional_locations=[extra],
        )
        assert len(result.additional_locations) == 1
        assert result.additional_locations[0].path == "other.py"

    def test_search_result_serialization_with_locations(self):
        """Nested locations appear in model_dump output, in order."""
        extras = [
            AdditionalLocation(path="loc1.py", score=0.4),
            AdditionalLocation(path="loc2.py", score=0.3),
        ]
        dumped = SearchResult(
            path="main.py",
            score=0.5,
            excerpt="code",
            additional_locations=extras,
        ).model_dump()
        assert len(dumped["additional_locations"]) == 2
        assert [d["path"] for d in dumped["additional_locations"]] == ["loc1.py", "loc2.py"]
# =============================================================================
# Level 8: Stress/Performance Tests
# =============================================================================
class TestGroupSimilarResultsPerformance:
    """Stress tests exercising larger inputs."""

    def test_handles_large_result_set(self):
        """1000 results over 100 excerpts collapse toward ~100 groups."""
        results = [
            SearchResult(
                path=f"file_{i}.py",
                # Scores cycle through ten nearby values.
                score=0.5 + (i % 10) * 0.01,
                excerpt=f"def func_{i % 100}(): pass",
                start_line=i,
            )
            for i in range(1000)
        ]
        grouped = group_similar_results(results, score_threshold_abs=0.05)
        # Score sub-grouping adds variation around the 100-group ideal,
        # but substantial grouping must still have happened.
        assert 50 <= len(grouped) <= 200

    def test_handles_all_identical_results(self):
        """Fully duplicated input collapses to a single representative."""
        clones = [
            SearchResult(path=f"file_{i}.py", score=0.5, excerpt="same code")
            for i in range(100)
        ]
        grouped = group_similar_results(clones, score_threshold_abs=0.01)
        assert len(grouped) == 1
        assert len(grouped[0].additional_locations) == 99

    def test_handles_all_unique_results(self):
        """Fully distinct excerpts produce no grouping at all."""
        uniques = [
            SearchResult(path=f"file_{i}.py", score=0.5, excerpt=f"unique_{i}")
            for i in range(100)
        ]
        grouped = group_similar_results(uniques, score_threshold_abs=0.01)
        assert len(grouped) == 100
        assert all(len(r.additional_locations) == 0 for r in grouped)
# =============================================================================
# Level 9: Real-world Scenario Tests
# =============================================================================
class TestGroupSimilarResultsRealWorld:
    """Tests simulating real-world usage scenarios."""

    def test_rrf_fusion_scores_grouping(self):
        """Test with typical RRF fusion score ranges (0.001 - 0.02)."""
        results = [
            SearchResult(path="auth/login.py", score=0.0164, excerpt="def authenticate():"),
            SearchResult(path="auth/oauth.py", score=0.0163, excerpt="def authenticate():"),
            SearchResult(path="auth/basic.py", score=0.0162, excerpt="def authenticate():"),
            SearchResult(path="utils/helper.py", score=0.0082, excerpt="def helper():"),
        ]
        # RRF scores are typically very small, use appropriate threshold
        grouped = group_similar_results(results, score_threshold_abs=0.001)
        # The three auth results share excerpt and near-identical scores.
        assert len(grouped) == 2
        auth_group = next(r for r in grouped if "auth" in r.path)
        assert len(auth_group.additional_locations) == 2

    def test_duplicate_code_detection(self):
        """Simulate detecting duplicate code across files."""
        # Same snippet appearing verbatim in three different modules.
        duplicate_code = """
        def calculate_total(items):
            return sum(item.price for item in items)
        """
        results = [
            SearchResult(path="orders/service.py", score=0.5, excerpt=duplicate_code, start_line=45),
            SearchResult(path="cart/calculator.py", score=0.5, excerpt=duplicate_code, start_line=12),
            SearchResult(path="invoices/generator.py", score=0.5, excerpt=duplicate_code, start_line=78),
        ]
        grouped = group_similar_results(results, score_threshold_abs=0.01)
        # All duplicates should be grouped
        assert len(grouped) == 1
        assert len(grouped[0].additional_locations) == 2
        # Can identify all locations
        all_paths = {grouped[0].path} | {loc.path for loc in grouped[0].additional_locations}
        assert all_paths == {"orders/service.py", "cart/calculator.py", "invoices/generator.py"}

    def test_mixed_relevance_results(self):
        """Test with mixed relevance results typical of code search."""
        results = [
            # High relevance group - exact match
            SearchResult(path="core.py", score=0.9, excerpt="def process():"),
            SearchResult(path="core_v2.py", score=0.89, excerpt="def process():"),
            # Medium relevance - partial match
            SearchResult(path="utils.py", score=0.5, excerpt="def process_data():"),
            # Low relevance - tangential
            SearchResult(path="test.py", score=0.2, excerpt="def test_process():"),
        ]
        grouped = group_similar_results(results, score_threshold_abs=0.02)
        # core.py and core_v2.py should group (same excerpt, similar score)
        # Others should remain separate (different excerpts)
        assert len(grouped) == 3
        high_rel = next(r for r in grouped if r.score >= 0.89)
        assert len(high_rel.additional_locations) == 1