feat: Add search result grouping by similarity score

Add functionality to group search results with similar content and scores
into a single representative result with additional locations.

Changes:
- Add AdditionalLocation entity model for storing grouped result locations
- Add additional_locations field to SearchResult (defaults to an empty list, so existing results stay backward compatible)
- Implement group_similar_results() function in ranking.py with:
  - Content-based grouping (by excerpt or content field)
  - Score-based sub-grouping with configurable threshold
  - Metadata preservation with grouped_count tracking
- Add group_results and grouping_threshold options to SearchOptions
- Integrate grouping into ChainSearchEngine.search() after RRF fusion (usage sketch below)
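
A minimal usage sketch of the new options (a sketch only: the import path, the
ChainSearchEngine constructor arguments, and the search() signature are assumptions;
group_results, grouping_threshold, grouped_count, and additional_locations come from
this commit):

    from codexlens.search import ChainSearchEngine, SearchOptions  # import path assumed

    options = SearchOptions(
        group_results=True,       # new: merge near-duplicate hits (default False)
        grouping_threshold=0.01,  # new: max absolute score gap inside a group
    )

    engine = ChainSearchEngine()                          # constructor args omitted (assumed)
    results = engine.search("def read_config", options)   # signature assumed

    for r in results:
        print(r.path, r.score, r.metadata.get("grouped_count", 1))
        for loc in r.additional_locations:  # populated only for grouped results
            print("  also at:", loc.path, loc.start_line)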

Test coverage:
- 36 tests spanning unit, boundary, integration, and performance levels
- Real-world scenario tests for RRF scores and duplicate code detection

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: catlog22
Date:   2025-12-19 16:33:44 +08:00
Parent: 3428642d04
Commit: 7adde91e9f

4 changed files with 738 additions and 2 deletions

File: entities module (codexlens.entities)

@@ -90,6 +90,21 @@ class CodeRelationship(BaseModel):
        return value


class AdditionalLocation(BaseModel):
    """A pointer to another location where a similar result was found.

    Used for grouping search results with similar scores and content,
    where the primary result is stored in SearchResult and secondary
    locations are stored in this model.
    """

    path: str = Field(..., min_length=1)
    score: float = Field(..., ge=0.0)
    start_line: Optional[int] = Field(default=None, description="Start line of the result (1-based)")
    end_line: Optional[int] = Field(default=None, description="End line of the result (1-based)")
    symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol")


class SearchResult(BaseModel):
    """A unified search result for lexical or semantic search."""

@@ -100,10 +115,16 @@ class SearchResult(BaseModel):
    symbol: Optional[Symbol] = None
    chunk: Optional[SemanticChunk] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    # Additional context for complete code blocks
    start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
    end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
    symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
    symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")

    # Field for grouping similar results
    additional_locations: List["AdditionalLocation"] = Field(
        default_factory=list,
        description="Other locations for grouped results with similar scores and content."
    )
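
For illustration, a sketch of what a grouped result looks like once these fields are
populated (values are made up; it assumes path, score, and excerpt are the only required
SearchResult fields, as in the doctest shown later in ranking.py):

    from codexlens.entities import AdditionalLocation, SearchResult

    primary = SearchResult(
        path="src/utils/io.py",
        score=0.0328,
        excerpt="def read_config(path): ...",
        start_line=12,
        end_line=20,
        symbol_name="read_config",
        metadata={"grouped_count": 2},
        additional_locations=[
            AdditionalLocation(
                path="src/legacy/io.py",  # secondary location with a near-identical hit
                score=0.0325,
                start_line=40,
                end_line=48,
                symbol_name="read_config",
            )
        ],
    )

    assert primary.additional_locations[0].path == "src/legacy/io.py"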

File: chain search engine (SearchOptions, ChainSearchEngine)

@@ -38,6 +38,8 @@ class SearchOptions:
        enable_vector: Enable vector semantic search (default False)
        pure_vector: If True, only use vector search without FTS fallback (default False)
        hybrid_weights: Custom RRF weights for hybrid search (optional)
        group_results: Enable grouping of similar results (default False)
        grouping_threshold: Score threshold for grouping similar results (default 0.01)
    """
    depth: int = -1
    max_workers: int = 8

@@ -51,6 +53,8 @@ class SearchOptions:
    enable_vector: bool = False
    pure_vector: bool = False
    hybrid_weights: Optional[Dict[str, float]] = None
    group_results: bool = False
    grouping_threshold: float = 0.01


@dataclass

@@ -210,6 +214,14 @@ class ChainSearchEngine:
        # Step 4: Merge and rank
        final_results = self._merge_and_rank(results, options.total_limit)

        # Step 5: Optional grouping of similar results
        if options.group_results:
            from codexlens.search.ranking import group_similar_results

            final_results = group_similar_results(
                final_results, score_threshold_abs=options.grouping_threshold
            )

        stats.files_matched = len(final_results)

        # Optional: Symbol search
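
A rough sanity check of the 0.01 default against RRF-style scores (a sketch assuming the
common score = sum(1 / (k + rank)) formulation with k = 60; the constants actually used by
reciprocal_rank_fusion() are not shown in this commit):

    k = 60

    gap_adjacent = 1 / (k + 1) - 1 / (k + 2)  # ~0.00026: neighbouring ranks from one source
    top_of_second_source = 1 / (k + 1)        # ~0.0164: boost for also topping a second ranking

    print(f"adjacent-rank gap:   {gap_adjacent:.5f}")
    print(f"second-source boost: {top_of_second_source:.5f}")

    # Near-duplicates landing at nearby ranks fall well within the threshold and get grouped,
    # while a hit reinforced near the top of a second ranking stays separate.
    assert gap_adjacent <= 0.01 < top_of_second_source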

File: search ranking module (codexlens.search.ranking)

@@ -9,7 +9,7 @@ from __future__ import annotations
 import math
 from typing import Dict, List

-from codexlens.entities import SearchResult
+from codexlens.entities import SearchResult, AdditionalLocation

 def reciprocal_rank_fusion(

@@ -158,3 +158,117 @@ def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]:
        tagged_results.append(tagged_result)
    return tagged_results


def group_similar_results(
    results: List[SearchResult],
    score_threshold_abs: float = 0.01,
    content_field: str = "excerpt",
) -> List[SearchResult]:
    """Group search results by content and score similarity.

    Groups results that have similar content and similar scores into a single
    representative result, with other locations stored in additional_locations.

    Algorithm:
        1. Group results by content (using excerpt or content field)
        2. Within each content group, create subgroups based on score similarity
        3. Select highest-scoring result as representative for each subgroup
        4. Store other results in subgroup as additional_locations

    Args:
        results: A list of SearchResult objects (typically sorted by score)
        score_threshold_abs: Absolute score difference to consider results similar.
            Results with |score_a - score_b| <= threshold are grouped.
            Default 0.01 is suitable for RRF fusion scores.
        content_field: The field to use for content grouping ('excerpt' or 'content')

    Returns:
        A new list of SearchResult objects where similar items are grouped.
        The list is sorted by score descending.

    Examples:
        >>> results = [SearchResult(path="a.py", score=0.5, excerpt="def foo()"),
        ...            SearchResult(path="b.py", score=0.5, excerpt="def foo()")]
        >>> grouped = group_similar_results(results)
        >>> len(grouped)  # Two results merged into one
        1
        >>> len(grouped[0].additional_locations)  # One additional location
        1
    """
    if not results:
        return []

    # Group results by content
    content_map: Dict[str, List[SearchResult]] = {}
    unidentifiable_results: List[SearchResult] = []
    for r in results:
        key = getattr(r, content_field, None)
        if key and key.strip():
            content_map.setdefault(key, []).append(r)
        else:
            # Results without content can't be grouped by content
            unidentifiable_results.append(r)

    final_results: List[SearchResult] = []

    # Process each content group
    for content_group in content_map.values():
        # Sort by score descending within group
        content_group.sort(key=lambda r: r.score, reverse=True)

        while content_group:
            # Take highest scoring as representative
            representative = content_group.pop(0)
            others_in_group = []
            remaining_for_next_pass = []

            # Find results with similar scores
            for item in content_group:
                if abs(representative.score - item.score) <= score_threshold_abs:
                    others_in_group.append(item)
                else:
                    remaining_for_next_pass.append(item)

            # Create grouped result with additional locations
            if others_in_group:
                # Build new result with additional_locations populated
                grouped_result = SearchResult(
                    path=representative.path,
                    score=representative.score,
                    excerpt=representative.excerpt,
                    content=representative.content,
                    symbol=representative.symbol,
                    chunk=representative.chunk,
                    metadata={
                        **representative.metadata,
                        "grouped_count": len(others_in_group) + 1,
                    },
                    start_line=representative.start_line,
                    end_line=representative.end_line,
                    symbol_name=representative.symbol_name,
                    symbol_kind=representative.symbol_kind,
                    additional_locations=[
                        AdditionalLocation(
                            path=other.path,
                            score=other.score,
                            start_line=other.start_line,
                            end_line=other.end_line,
                            symbol_name=other.symbol_name,
                        )
                        for other in others_in_group
                    ],
                )
                final_results.append(grouped_result)
            else:
                final_results.append(representative)

            content_group = remaining_for_next_pass

    # Add ungroupable results
    final_results.extend(unidentifiable_results)

    # Sort final results by score descending
    final_results.sort(key=lambda r: r.score, reverse=True)
    return final_results
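
A behavioural sketch that also exercises the score sub-grouping, which the doctest above
does not cover (illustrative scores; assumes SearchResult can be built from path, score,
and excerpt alone, as in the doctest):

    from codexlens.entities import SearchResult
    from codexlens.search.ranking import group_similar_results

    results = [
        SearchResult(path="a.py", score=0.0330, excerpt="def foo()"),
        SearchResult(path="b.py", score=0.0328, excerpt="def foo()"),  # within 0.01 of a.py
        SearchResult(path="c.py", score=0.0150, excerpt="def foo()"),  # same content, score too far apart
        SearchResult(path="d.py", score=0.0400, excerpt="def bar()"),  # different content
    ]

    grouped = group_similar_results(results, score_threshold_abs=0.01)

    # a.py absorbs b.py; c.py and d.py stay separate; output is sorted by score descending.
    assert [r.path for r in grouped] == ["d.py", "a.py", "c.py"]
    assert grouped[1].metadata["grouped_count"] == 2
    assert grouped[1].additional_locations[0].path == "b.py"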