feat: Add search result grouping by similarity score

Add functionality to group search results with similar content and scores
into a single representative result with additional locations.

Changes:
- Add AdditionalLocation entity model for storing grouped result locations
- Add additional_locations field to SearchResult (defaults to an empty list, so existing results stay backward compatible)
- Implement group_similar_results() function in ranking.py with:
  - Content-based grouping (by excerpt or content field)
  - Score-based sub-grouping with configurable threshold
  - Metadata preservation with grouped_count tracking
- Add group_results and grouping_threshold options to SearchOptions
- Integrate grouping into ChainSearchEngine.search() after RRF fusion (usage sketch below)
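
A minimal usage sketch of the new options (a sketch only: the import path, the
ChainSearchEngine constructor arguments, and the search() signature are assumptions;
group_results, grouping_threshold, grouped_count, and additional_locations come from
this commit):

    from codexlens.search import ChainSearchEngine, SearchOptions  # import path assumed

    options = SearchOptions(
        group_results=True,       # new: merge near-duplicate hits (default False)
        grouping_threshold=0.01,  # new: max absolute score gap inside a group
    )

    engine = ChainSearchEngine()                          # constructor args omitted (assumed)
    results = engine.search("def read_config", options)   # signature assumed

    for r in results:
        print(r.path, r.score, r.metadata.get("grouped_count", 1))
        for loc in r.additional_locations:  # populated only for grouped results
            print("  also at:", loc.path, loc.start_line)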

Test coverage:
- 36 tests spanning unit, boundary, integration, and performance levels
- Real-world scenario tests for RRF scores and duplicate code detection

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: catlog22
Date:   2025-12-19 16:33:44 +08:00
Parent: 3428642d04
Commit: 7adde91e9f

4 changed files with 738 additions and 2 deletions

File: entities module (codexlens.entities)

@@ -90,6 +90,21 @@ class CodeRelationship(BaseModel):
        return value


class AdditionalLocation(BaseModel):
    """A pointer to another location where a similar result was found.

    Used for grouping search results with similar scores and content,
    where the primary result is stored in SearchResult and secondary
    locations are stored in this model.
    """

    path: str = Field(..., min_length=1)
    score: float = Field(..., ge=0.0)
    start_line: Optional[int] = Field(default=None, description="Start line of the result (1-based)")
    end_line: Optional[int] = Field(default=None, description="End line of the result (1-based)")
    symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol")


class SearchResult(BaseModel):
    """A unified search result for lexical or semantic search."""

@@ -100,10 +115,16 @@ class SearchResult(BaseModel):
    symbol: Optional[Symbol] = None
    chunk: Optional[SemanticChunk] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    # Additional context for complete code blocks
    start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
    end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
    symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
    symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")

    # Field for grouping similar results
    additional_locations: List["AdditionalLocation"] = Field(
        default_factory=list,
        description="Other locations for grouped results with similar scores and content."
    )
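
For illustration, a sketch of what a grouped result looks like once these fields are
populated (values are made up; it assumes path, score, and excerpt are the only required
SearchResult fields, as in the doctest shown later in ranking.py):

    from codexlens.entities import AdditionalLocation, SearchResult

    primary = SearchResult(
        path="src/utils/io.py",
        score=0.0328,
        excerpt="def read_config(path): ...",
        start_line=12,
        end_line=20,
        symbol_name="read_config",
        metadata={"grouped_count": 2},
        additional_locations=[
            AdditionalLocation(
                path="src/legacy/io.py",  # secondary location with a near-identical hit
                score=0.0325,
                start_line=40,
                end_line=48,
                symbol_name="read_config",
            )
        ],
    )

    assert primary.additional_locations[0].path == "src/legacy/io.py"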

File: chain search engine (SearchOptions, ChainSearchEngine)

@@ -38,6 +38,8 @@ class SearchOptions:
        enable_vector: Enable vector semantic search (default False)
        pure_vector: If True, only use vector search without FTS fallback (default False)
        hybrid_weights: Custom RRF weights for hybrid search (optional)
        group_results: Enable grouping of similar results (default False)
        grouping_threshold: Score threshold for grouping similar results (default 0.01)
    """
    depth: int = -1
    max_workers: int = 8

@@ -51,6 +53,8 @@ class SearchOptions:
    enable_vector: bool = False
    pure_vector: bool = False
    hybrid_weights: Optional[Dict[str, float]] = None
    group_results: bool = False
    grouping_threshold: float = 0.01


@dataclass

@@ -210,6 +214,14 @@ class ChainSearchEngine:
        # Step 4: Merge and rank
        final_results = self._merge_and_rank(results, options.total_limit)

        # Step 5: Optional grouping of similar results
        if options.group_results:
            from codexlens.search.ranking import group_similar_results

            final_results = group_similar_results(
                final_results, score_threshold_abs=options.grouping_threshold
            )

        stats.files_matched = len(final_results)

        # Optional: Symbol search
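
A rough sanity check of the 0.01 default against RRF-style scores (a sketch assuming the
common score = sum(1 / (k + rank)) formulation with k = 60; the constants actually used by
reciprocal_rank_fusion() are not shown in this commit):

    k = 60

    gap_adjacent = 1 / (k + 1) - 1 / (k + 2)  # ~0.00026: neighbouring ranks from one source
    top_of_second_source = 1 / (k + 1)        # ~0.0164: boost for also topping a second ranking

    print(f"adjacent-rank gap:   {gap_adjacent:.5f}")
    print(f"second-source boost: {top_of_second_source:.5f}")

    # Near-duplicates landing at nearby ranks fall well within the threshold and get grouped,
    # while a hit reinforced near the top of a second ranking stays separate.
    assert gap_adjacent <= 0.01 < top_of_second_source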

File: search ranking module (codexlens.search.ranking)

@@ -9,7 +9,7 @@ from __future__ import annotations
 import math
 from typing import Dict, List

-from codexlens.entities import SearchResult
+from codexlens.entities import SearchResult, AdditionalLocation

 def reciprocal_rank_fusion(

@@ -158,3 +158,117 @@ def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]:
        tagged_results.append(tagged_result)
    return tagged_results


def group_similar_results(
    results: List[SearchResult],
    score_threshold_abs: float = 0.01,
    content_field: str = "excerpt",
) -> List[SearchResult]:
    """Group search results by content and score similarity.

    Groups results that have similar content and similar scores into a single
    representative result, with other locations stored in additional_locations.

    Algorithm:
        1. Group results by content (using excerpt or content field)
        2. Within each content group, create subgroups based on score similarity
        3. Select highest-scoring result as representative for each subgroup
        4. Store other results in subgroup as additional_locations

    Args:
        results: A list of SearchResult objects (typically sorted by score)
        score_threshold_abs: Absolute score difference to consider results similar.
            Results with |score_a - score_b| <= threshold are grouped.
            Default 0.01 is suitable for RRF fusion scores.
        content_field: The field to use for content grouping ('excerpt' or 'content')

    Returns:
        A new list of SearchResult objects where similar items are grouped.
        The list is sorted by score descending.

    Examples:
        >>> results = [SearchResult(path="a.py", score=0.5, excerpt="def foo()"),
        ...            SearchResult(path="b.py", score=0.5, excerpt="def foo()")]
        >>> grouped = group_similar_results(results)
        >>> len(grouped)  # Two results merged into one
        1
        >>> len(grouped[0].additional_locations)  # One additional location
        1
    """
    if not results:
        return []

    # Group results by content
    content_map: Dict[str, List[SearchResult]] = {}
    unidentifiable_results: List[SearchResult] = []
    for r in results:
        key = getattr(r, content_field, None)
        if key and key.strip():
            content_map.setdefault(key, []).append(r)
        else:
            # Results without content can't be grouped by content
            unidentifiable_results.append(r)

    final_results: List[SearchResult] = []

    # Process each content group
    for content_group in content_map.values():
        # Sort by score descending within group
        content_group.sort(key=lambda r: r.score, reverse=True)

        while content_group:
            # Take highest scoring as representative
            representative = content_group.pop(0)
            others_in_group = []
            remaining_for_next_pass = []

            # Find results with similar scores
            for item in content_group:
                if abs(representative.score - item.score) <= score_threshold_abs:
                    others_in_group.append(item)
                else:
                    remaining_for_next_pass.append(item)

            # Create grouped result with additional locations
            if others_in_group:
                # Build new result with additional_locations populated
                grouped_result = SearchResult(
                    path=representative.path,
                    score=representative.score,
                    excerpt=representative.excerpt,
                    content=representative.content,
                    symbol=representative.symbol,
                    chunk=representative.chunk,
                    metadata={
                        **representative.metadata,
                        "grouped_count": len(others_in_group) + 1,
                    },
                    start_line=representative.start_line,
                    end_line=representative.end_line,
                    symbol_name=representative.symbol_name,
                    symbol_kind=representative.symbol_kind,
                    additional_locations=[
                        AdditionalLocation(
                            path=other.path,
                            score=other.score,
                            start_line=other.start_line,
                            end_line=other.end_line,
                            symbol_name=other.symbol_name,
                        )
                        for other in others_in_group
                    ],
                )
                final_results.append(grouped_result)
            else:
                final_results.append(representative)

            content_group = remaining_for_next_pass

    # Add ungroupable results
    final_results.extend(unidentifiable_results)

    # Sort final results by score descending
    final_results.sort(key=lambda r: r.score, reverse=True)
    return final_results
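
A behavioural sketch that also exercises the score sub-grouping, which the doctest above
does not cover (illustrative scores; assumes SearchResult can be built from path, score,
and excerpt alone, as in the doctest):

    from codexlens.entities import SearchResult
    from codexlens.search.ranking import group_similar_results

    results = [
        SearchResult(path="a.py", score=0.0330, excerpt="def foo()"),
        SearchResult(path="b.py", score=0.0328, excerpt="def foo()"),  # within 0.01 of a.py
        SearchResult(path="c.py", score=0.0150, excerpt="def foo()"),  # same content, score too far apart
        SearchResult(path="d.py", score=0.0400, excerpt="def bar()"),  # different content
    ]

    grouped = group_similar_results(results, score_threshold_abs=0.01)

    # a.py absorbs b.py; c.py and d.py stay separate; output is sorted by score descending.
    assert [r.path for r in grouped] == ["d.py", "a.py", "c.py"]
    assert grouped[1].metadata["grouped_count"] == 2
    assert grouped[1].additional_locations[0].path == "b.py"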