mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-09 02:24:11 +08:00
feat: Add search result grouping by similarity score
Add functionality to group search results with similar content and scores into a single representative result that carries the additional locations.

Changes:
- Add AdditionalLocation entity model for storing grouped result locations
- Add additional_locations field to SearchResult (defaults to an empty list, for backward compatibility)
- Implement group_similar_results() function in ranking.py with:
  - Content-based grouping (by excerpt or content field)
  - Score-based sub-grouping with configurable threshold
  - Metadata preservation with grouped_count tracking
- Add group_results and grouping_threshold options to SearchOptions
- Integrate grouping into ChainSearchEngine.search() after RRF fusion

Test coverage:
- 36 multi-level tests covering unit, boundary, integration, and performance
- Real-world scenario tests for RRF scores and duplicate code detection

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -90,6 +90,21 @@ class CodeRelationship(BaseModel):
|
||||
return value
|
||||
|
||||
|
||||
class AdditionalLocation(BaseModel):
    """Secondary location of a search hit that was merged into another result.

    When similar results are grouped, the primary hit stays a full
    ``SearchResult``; every other member of the group is reduced to one of
    these lightweight pointers.
    """

    # File path of the secondary hit; must be a non-empty string.
    path: str = Field(
        ...,
        min_length=1,
    )
    # Score of the secondary hit; must be non-negative.
    score: float = Field(
        ...,
        ge=0.0,
    )
    # Optional line span of the secondary hit.
    start_line: Optional[int] = Field(
        default=None,
        description="Start line of the result (1-based)",
    )
    end_line: Optional[int] = Field(
        default=None,
        description="End line of the result (1-based)",
    )
    # Optional name of the symbol matched at this location.
    symbol_name: Optional[str] = Field(
        default=None,
        description="Name of matched symbol",
    )
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
|
||||
"""A unified search result for lexical or semantic search."""
|
||||
|
||||
@@ -100,10 +115,16 @@ class SearchResult(BaseModel):
|
||||
symbol: Optional[Symbol] = None
|
||||
chunk: Optional[SemanticChunk] = None
|
||||
metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# Additional context for complete code blocks
|
||||
start_line: Optional[int] = Field(default=None, description="Start line of code block (1-based)")
|
||||
end_line: Optional[int] = Field(default=None, description="End line of code block (1-based)")
|
||||
symbol_name: Optional[str] = Field(default=None, description="Name of matched symbol/function/class")
|
||||
symbol_kind: Optional[str] = Field(default=None, description="Kind of symbol (function/class/method)")
|
||||
|
||||
# Field for grouping similar results
|
||||
additional_locations: List["AdditionalLocation"] = Field(
|
||||
default_factory=list,
|
||||
description="Other locations for grouped results with similar scores and content."
|
||||
)
|
||||
|
||||
|
||||
@@ -38,6 +38,8 @@ class SearchOptions:
|
||||
enable_vector: Enable vector semantic search (default False)
|
||||
pure_vector: If True, only use vector search without FTS fallback (default False)
|
||||
hybrid_weights: Custom RRF weights for hybrid search (optional)
|
||||
group_results: Enable grouping of similar results (default False)
|
||||
grouping_threshold: Score threshold for grouping similar results (default 0.01)
|
||||
"""
|
||||
depth: int = -1
|
||||
max_workers: int = 8
|
||||
@@ -51,6 +53,8 @@ class SearchOptions:
|
||||
enable_vector: bool = False
|
||||
pure_vector: bool = False
|
||||
hybrid_weights: Optional[Dict[str, float]] = None
|
||||
group_results: bool = False
|
||||
grouping_threshold: float = 0.01
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -210,6 +214,14 @@ class ChainSearchEngine:
|
||||
|
||||
# Step 4: Merge and rank
|
||||
final_results = self._merge_and_rank(results, options.total_limit)
|
||||
|
||||
# Step 5: Optional grouping of similar results
|
||||
if options.group_results:
|
||||
from codexlens.search.ranking import group_similar_results
|
||||
final_results = group_similar_results(
|
||||
final_results, score_threshold_abs=options.grouping_threshold
|
||||
)
|
||||
|
||||
stats.files_matched = len(final_results)
|
||||
|
||||
# Optional: Symbol search
|
||||
|
||||
@@ -9,7 +9,7 @@ from __future__ import annotations
|
||||
import math
|
||||
from typing import Dict, List
|
||||
|
||||
from codexlens.entities import SearchResult
|
||||
from codexlens.entities import SearchResult, AdditionalLocation
|
||||
|
||||
|
||||
def reciprocal_rank_fusion(
|
||||
@@ -158,3 +158,117 @@ def tag_search_source(results: List[SearchResult], source: str) -> List[SearchRe
|
||||
tagged_results.append(tagged_result)
|
||||
|
||||
return tagged_results
|
||||
|
||||
|
||||
def _merge_score_cluster(cluster: List[SearchResult]) -> SearchResult:
    """Collapse one score cluster into a single representative result.

    Args:
        cluster: Non-empty list of results sharing the same content, ordered
            by score descending. The first element becomes the representative.

    Returns:
        The representative itself (same object) when the cluster has a single
        member; otherwise a new SearchResult copying the representative's
        fields, with every other member recorded in ``additional_locations``
        and ``metadata["grouped_count"]`` set to the total number of
        locations represented.
    """
    leader = cluster[0]
    followers = cluster[1:]
    if not followers:
        return leader

    # Start from locations the leader already carries so that re-grouping
    # previously grouped results does not silently drop them.
    locations: List[AdditionalLocation] = list(leader.additional_locations)
    for other in followers:
        locations.append(
            AdditionalLocation(
                path=other.path,
                score=other.score,
                start_line=other.start_line,
                end_line=other.end_line,
                symbol_name=other.symbol_name,
            )
        )
        # A follower may itself be an already-grouped result.
        locations.extend(other.additional_locations)

    return SearchResult(
        path=leader.path,
        score=leader.score,
        excerpt=leader.excerpt,
        content=leader.content,
        symbol=leader.symbol,
        chunk=leader.chunk,
        metadata={
            **leader.metadata,
            # Representative plus every additional location.
            "grouped_count": len(locations) + 1,
        },
        start_line=leader.start_line,
        end_line=leader.end_line,
        symbol_name=leader.symbol_name,
        symbol_kind=leader.symbol_kind,
        additional_locations=locations,
    )


def group_similar_results(
    results: List[SearchResult],
    score_threshold_abs: float = 0.01,
    content_field: str = "excerpt"
) -> List[SearchResult]:
    """Group search results by content and score similarity.

    Results that share identical content and have similar scores are folded
    into a single representative result; the other members are stored in the
    representative's ``additional_locations``.

    Algorithm:
        1. Bucket results by the exact value of ``content_field``.
        2. Sort each bucket by score descending and split it into clusters:
           a result joins the current cluster while its score is within
           ``score_threshold_abs`` of the cluster's highest score.
        3. The highest-scoring result of each cluster is the representative.
        4. The remaining cluster members (and any locations they or the
           representative already carried) become ``additional_locations``.

    Args:
        results: A list of SearchResult objects (typically sorted by score).
            The list and its items are not modified.
        score_threshold_abs: Absolute score difference to consider results
            similar. Results with |score_a - score_b| <= threshold are
            grouped, where score_a is the cluster's highest score.
        content_field: The attribute used for content grouping
            ('excerpt' or 'content'). Results whose value is missing, not a
            string, or blank cannot be grouped and are passed through as-is.

    Returns:
        A new list of SearchResult objects where similar items are grouped.
        The list is sorted by score descending.

    Examples:
        >>> results = [SearchResult(path="a.py", score=0.5, excerpt="def foo()"),
        ...            SearchResult(path="b.py", score=0.5, excerpt="def foo()")]
        >>> grouped = group_similar_results(results)
        >>> len(grouped)  # Two results merged into one
        1
        >>> len(grouped[0].additional_locations)  # One additional location
        1
    """
    if not results:
        return []

    # Bucket results by content; dict insertion order keeps first-seen order.
    content_map: Dict[str, List[SearchResult]] = {}
    ungroupable: List[SearchResult] = []
    for result in results:
        key = getattr(result, content_field, None)
        # Only non-blank strings identify content; anything else (None, "",
        # whitespace, non-string values) is passed through ungrouped.
        if isinstance(key, str) and key.strip():
            content_map.setdefault(key, []).append(result)
        else:
            ungroupable.append(result)

    grouped: List[SearchResult] = []
    for bucket in content_map.values():
        # sorted() is stable, so equal scores keep their input order.
        ranked = sorted(bucket, key=lambda r: r.score, reverse=True)

        # With scores in descending order, every cluster is a contiguous run,
        # so one linear pass replaces repeated pop(0)/re-partitioning.
        cluster: List[SearchResult] = [ranked[0]]
        for item in ranked[1:]:
            if abs(cluster[0].score - item.score) <= score_threshold_abs:
                cluster.append(item)
            else:
                grouped.append(_merge_score_cluster(cluster))
                cluster = [item]
        grouped.append(_merge_score_cluster(cluster))

    # Results without usable content are returned untouched.
    grouped.extend(ungroupable)

    # Final ordering: best score first.
    grouped.sort(key=lambda r: r.score, reverse=True)
    return grouped
|
||||
|
||||
Reference in New Issue
Block a user