Refactor code structure and remove redundant changes

catlog22
2026-01-24 14:47:47 +08:00
parent cf5fecd66d
commit f2b0a5bbc9
113 changed files with 43217 additions and 235 deletions

View File: codexlens/search/clustering/__init__.py

@@ -0,0 +1,124 @@
"""Clustering strategies for the staged hybrid search pipeline.
This module provides extensible clustering infrastructure for grouping
similar search results and selecting representative results.
Install with: pip install codexlens[clustering]
Example:
>>> from codexlens.search.clustering import (
... CLUSTERING_AVAILABLE,
... ClusteringConfig,
... get_strategy,
... )
>>> config = ClusteringConfig(min_cluster_size=3)
>>> # Auto-select best available strategy with fallback
>>> strategy = get_strategy("auto", config)
>>> representatives = strategy.fit_predict(embeddings, results)
>>>
>>> # Or explicitly use a specific strategy
>>> if CLUSTERING_AVAILABLE:
... from codexlens.search.clustering import HDBSCANStrategy
... strategy = HDBSCANStrategy(config)
... representatives = strategy.fit_predict(embeddings, results)
"""
from __future__ import annotations
# Always export base classes and factory (no heavy dependencies)
from .base import BaseClusteringStrategy, ClusteringConfig
from .factory import (
ClusteringStrategyFactory,
check_clustering_strategy_available,
get_strategy,
)
from .noop_strategy import NoOpStrategy
from .frequency_strategy import FrequencyStrategy, FrequencyConfig
# Feature flag for clustering availability (hdbscan + sklearn)
CLUSTERING_AVAILABLE = False
HDBSCAN_AVAILABLE = False
DBSCAN_AVAILABLE = False
_import_error: str | None = None
def _detect_clustering_available() -> tuple[bool, bool, bool, str | None]:
"""Detect if clustering dependencies are available.
Returns:
Tuple of (all_available, hdbscan_available, dbscan_available, error_message).
"""
hdbscan_ok = False
dbscan_ok = False
try:
import hdbscan # noqa: F401
hdbscan_ok = True
except ImportError:
pass
try:
from sklearn.cluster import DBSCAN # noqa: F401
dbscan_ok = True
except ImportError:
pass
all_ok = hdbscan_ok and dbscan_ok
error = None
if not all_ok:
missing = []
if not hdbscan_ok:
missing.append("hdbscan")
if not dbscan_ok:
missing.append("scikit-learn")
error = f"{', '.join(missing)} not available. Install with: pip install codexlens[clustering]"
return all_ok, hdbscan_ok, dbscan_ok, error
# Initialize on module load
CLUSTERING_AVAILABLE, HDBSCAN_AVAILABLE, DBSCAN_AVAILABLE, _import_error = (
_detect_clustering_available()
)
def check_clustering_available() -> tuple[bool, str | None]:
"""Check if all clustering dependencies are available.
Returns:
Tuple of (is_available, error_message).
error_message is None if available, otherwise contains install instructions.
"""
return CLUSTERING_AVAILABLE, _import_error
# Conditionally export strategy implementations
__all__ = [
# Feature flags
"CLUSTERING_AVAILABLE",
"HDBSCAN_AVAILABLE",
"DBSCAN_AVAILABLE",
"check_clustering_available",
# Base classes
"BaseClusteringStrategy",
"ClusteringConfig",
# Factory
"ClusteringStrategyFactory",
"get_strategy",
"check_clustering_strategy_available",
# Always-available strategies
"NoOpStrategy",
"FrequencyStrategy",
"FrequencyConfig",
]
# Conditionally add strategy classes to __all__ and module namespace
if HDBSCAN_AVAILABLE:
from .hdbscan_strategy import HDBSCANStrategy
__all__.append("HDBSCANStrategy")
if DBSCAN_AVAILABLE:
from .dbscan_strategy import DBSCANStrategy
__all__.append("DBSCANStrategy")
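
The flags above are evaluated once at import time, so callers can branch on them cheaply. A minimal usage sketch follows, assuming codexlens is installed (with or without the optional clustering extra); the concrete strategy returned by "auto" depends on which backends are present:

from codexlens.search.clustering import (
    ClusteringConfig,
    check_clustering_available,
    get_strategy,
)

available, err = check_clustering_available()
if not available:
    # err carries the install hint from _detect_clustering_available()
    print(f"Density clustering unavailable: {err}")

# "auto" resolves to the best installed backend (hdbscan -> dbscan -> noop).
strategy = get_strategy("auto", ClusteringConfig(min_cluster_size=3))
print(type(strategy).__name__)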

View File: codexlens/search/clustering/base.py

@@ -0,0 +1,153 @@
"""Base classes for clustering strategies in the hybrid search pipeline.
This module defines the abstract base class for clustering strategies used
in the staged hybrid search pipeline. Strategies cluster search results
based on their embeddings and select representative results from each cluster.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
@dataclass
class ClusteringConfig:
"""Configuration parameters for clustering strategies.
Attributes:
min_cluster_size: Minimum number of results to form a cluster.
HDBSCAN default is 5, but for search results 2-3 is often better.
min_samples: Number of samples in a neighborhood for a point to be
considered a core point. Lower values allow more clusters.
metric: Distance metric for clustering. Common options:
- 'euclidean': Standard L2 distance
- 'cosine': Cosine distance (1 - cosine_similarity)
- 'manhattan': L1 distance
cluster_selection_epsilon: Distance threshold for cluster selection.
Results within this distance may be merged into the same cluster.
allow_single_cluster: If True, allow all results to form one cluster.
Useful when results are very similar.
prediction_data: If True, generate prediction data for new points.
"""
min_cluster_size: int = 3
min_samples: int = 2
metric: str = "cosine"
cluster_selection_epsilon: float = 0.0
allow_single_cluster: bool = True
prediction_data: bool = False
def __post_init__(self) -> None:
"""Validate configuration parameters."""
if self.min_cluster_size < 2:
raise ValueError("min_cluster_size must be >= 2")
if self.min_samples < 1:
raise ValueError("min_samples must be >= 1")
if self.metric not in ("euclidean", "cosine", "manhattan"):
raise ValueError(f"metric must be one of: euclidean, cosine, manhattan; got {self.metric}")
if self.cluster_selection_epsilon < 0:
raise ValueError("cluster_selection_epsilon must be >= 0")
class BaseClusteringStrategy(ABC):
"""Abstract base class for clustering strategies.
Clustering strategies are used in the staged hybrid search pipeline to
group similar search results and select representative results from each
cluster, reducing redundancy while maintaining diversity.
Subclasses must implement:
- cluster(): Group results into clusters based on embeddings
- select_representatives(): Choose best result(s) from each cluster
"""
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
"""Initialize the clustering strategy.
Args:
config: Clustering configuration. Uses defaults if not provided.
"""
self.config = config or ClusteringConfig()
@abstractmethod
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Cluster search results based on their embeddings.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim)
containing the embedding vectors for each result.
results: List of SearchResult objects corresponding to embeddings.
Used for additional metadata during clustering.
Returns:
List of clusters, where each cluster is a list of indices
into the results list. Results not assigned to any cluster
(noise points) should be returned as single-element clusters.
Example:
>>> strategy = HDBSCANStrategy()
>>> clusters = strategy.cluster(embeddings, results)
>>> # clusters = [[0, 2, 5], [1, 3], [4], [6, 7, 8]]
>>> # Result indices 0, 2, 5 are in cluster 0
>>> # Result indices 1, 3 are in cluster 1
>>> # Result index 4 is a noise point (singleton cluster)
>>> # Result indices 6, 7, 8 are in cluster 2
"""
...
@abstractmethod
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results from each cluster.
This method chooses the best result(s) from each cluster to include
in the final search results. The selection can be based on:
- Highest score within cluster
- Closest to cluster centroid
- Custom selection logic
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings array for centroid-based selection.
Returns:
List of representative SearchResult objects, one or more per cluster,
ordered by relevance (highest score first).
Example:
>>> representatives = strategy.select_representatives(clusters, results)
>>> # Returns best result from each cluster
"""
...
def fit_predict(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List["SearchResult"]:
"""Convenience method to cluster and select representatives in one call.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim).
results: List of SearchResult objects.
Returns:
List of representative SearchResult objects.
"""
clusters = self.cluster(embeddings, results)
return self.select_representatives(clusters, results, embeddings)
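
To make the contract concrete, here is a hypothetical subclass (ScoreOnlyStrategy is illustrative, not part of this commit) that satisfies both abstract methods with the simplest possible behavior:

from codexlens.search.clustering import BaseClusteringStrategy

class ScoreOnlyStrategy(BaseClusteringStrategy):
    """Illustrative only: no real grouping, rank purely by score."""

    def cluster(self, embeddings, results):
        # Treat every result as its own singleton cluster.
        return [[i] for i in range(len(results))]

    def select_representatives(self, clusters, results, embeddings=None):
        # With singleton clusters this reduces to a plain score sort.
        return sorted(results, key=lambda r: r.score, reverse=True)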

View File: codexlens/search/clustering/dbscan_strategy.py

@@ -0,0 +1,197 @@
"""DBSCAN-based clustering strategy for search results.
DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
is the fallback clustering strategy when HDBSCAN is not available.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
class DBSCANStrategy(BaseClusteringStrategy):
"""DBSCAN-based clustering strategy.
Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.
DBSCAN requires an explicit eps parameter, which is auto-computed from the
distance distribution if not provided.
Example:
>>> from codexlens.search.clustering import DBSCANStrategy, ClusteringConfig
>>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
>>> strategy = DBSCANStrategy(config)
>>> clusters = strategy.cluster(embeddings, results)
>>> representatives = strategy.select_representatives(clusters, results)
"""
# Default eps percentile for auto-computation
DEFAULT_EPS_PERCENTILE: float = 15.0
def __init__(
self,
config: Optional[ClusteringConfig] = None,
eps: Optional[float] = None,
eps_percentile: float = DEFAULT_EPS_PERCENTILE,
) -> None:
"""Initialize DBSCAN clustering strategy.
Args:
config: Clustering configuration. Uses defaults if not provided.
eps: Explicit eps parameter for DBSCAN. If None, auto-computed
from the distance distribution.
eps_percentile: Percentile of pairwise distances to use for
auto-computing eps. Default is 15th percentile.
Raises:
ImportError: If sklearn is not installed.
"""
super().__init__(config)
self.eps = eps
self.eps_percentile = eps_percentile
# Validate sklearn is available
try:
from sklearn.cluster import DBSCAN # noqa: F401
except ImportError as exc:
raise ImportError(
"scikit-learn package is required for DBSCANStrategy. "
"Install with: pip install codexlens[clustering]"
) from exc
def _compute_eps(self, embeddings: "np.ndarray") -> float:
"""Auto-compute eps from pairwise distance distribution.
Uses the specified percentile of pairwise distances as eps,
which typically captures local density well.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim).
Returns:
Computed eps value.
"""
import numpy as np
from sklearn.metrics import pairwise_distances
# Compute pairwise distances
distances = pairwise_distances(embeddings, metric=self.config.metric)
# Get upper triangle (excluding diagonal)
upper_tri = distances[np.triu_indices_from(distances, k=1)]
if len(upper_tri) == 0:
# Only one point, return a default small eps
return 0.1
# Use percentile of distances as eps
eps = float(np.percentile(upper_tri, self.eps_percentile))
# Ensure eps is positive
return max(eps, 1e-6)
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Cluster search results using DBSCAN algorithm.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim)
containing the embedding vectors for each result.
results: List of SearchResult objects corresponding to embeddings.
Returns:
List of clusters, where each cluster is a list of indices
into the results list. Noise points are returned as singleton clusters.
"""
from sklearn.cluster import DBSCAN
import numpy as np
n_results = len(results)
if n_results == 0:
return []
# Handle edge case: single result
if n_results == 1:
return [[0]]
# Determine eps value
eps = self.eps if self.eps is not None else self._compute_eps(embeddings)
# Configure DBSCAN clusterer
        # Note: DBSCAN has no min_cluster_size; cluster density is controlled via min_samples
clusterer = DBSCAN(
eps=eps,
min_samples=self.config.min_samples,
metric=self.config.metric,
)
# Fit and get cluster labels
# Labels: -1 = noise, 0+ = cluster index
labels = clusterer.fit_predict(embeddings)
# Group indices by cluster label
cluster_map: dict[int, list[int]] = {}
for idx, label in enumerate(labels):
if label not in cluster_map:
cluster_map[label] = []
cluster_map[label].append(idx)
# Build result: non-noise clusters first, then noise as singletons
clusters: List[List[int]] = []
# Add proper clusters (label >= 0)
for label in sorted(cluster_map.keys()):
if label >= 0:
clusters.append(cluster_map[label])
# Add noise points as singleton clusters (label == -1)
if -1 in cluster_map:
for idx in cluster_map[-1]:
clusters.append([idx])
return clusters
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results from each cluster.
Selects the result with the highest score from each cluster.
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (not used in score-based selection).
Returns:
List of representative SearchResult objects, one per cluster,
ordered by score (highest first).
"""
if not clusters or not results:
return []
representatives: List["SearchResult"] = []
for cluster_indices in clusters:
if not cluster_indices:
continue
# Find the result with the highest score in this cluster
best_idx = max(cluster_indices, key=lambda i: results[i].score)
representatives.append(results[best_idx])
# Sort by score descending
representatives.sort(key=lambda r: r.score, reverse=True)
return representatives
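
A small end-to-end sketch of the DBSCAN path (requires scikit-learn; FakeResult is a stand-in for codexlens.entities.SearchResult, and the random embeddings are fabricated):

from dataclasses import dataclass

import numpy as np

from codexlens.search.clustering import ClusteringConfig, DBSCANStrategy

@dataclass
class FakeResult:
    # Stand-in for codexlens.entities.SearchResult
    path: str
    score: float

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(8, 16))
results = [FakeResult(f"src/mod_{i}.py", 1.0 - 0.1 * i) for i in range(8)]

# eps is auto-computed from the 15th percentile of pairwise distances.
strategy = DBSCANStrategy(ClusteringConfig(metric="euclidean"))
representatives = strategy.fit_predict(embeddings, results)
print([r.path for r in representatives])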

View File: codexlens/search/clustering/factory.py

@@ -0,0 +1,202 @@
"""Factory for creating clustering strategies.
Provides a unified interface for instantiating different clustering backends
with automatic fallback chain: hdbscan -> dbscan -> noop.
"""
from __future__ import annotations
from typing import Any, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
from .noop_strategy import NoOpStrategy
def check_clustering_strategy_available(strategy: str) -> tuple[bool, str | None]:
"""Check whether a specific clustering strategy can be used.
Args:
strategy: Strategy name to check. Options:
- "hdbscan": HDBSCAN clustering (requires hdbscan package)
- "dbscan": DBSCAN clustering (requires sklearn)
- "frequency": Frequency-based clustering (always available)
- "noop": No-op strategy (always available)
Returns:
Tuple of (is_available, error_message).
error_message is None if available, otherwise contains install instructions.
"""
strategy = (strategy or "").strip().lower()
if strategy == "hdbscan":
try:
import hdbscan # noqa: F401
except ImportError:
return False, (
"hdbscan package not available. "
"Install with: pip install codexlens[clustering]"
)
return True, None
if strategy == "dbscan":
try:
from sklearn.cluster import DBSCAN # noqa: F401
except ImportError:
return False, (
"scikit-learn package not available. "
"Install with: pip install codexlens[clustering]"
)
return True, None
if strategy == "frequency":
# Frequency strategy is always available (no external deps)
return True, None
if strategy == "noop":
return True, None
return False, (
f"Invalid clustering strategy: {strategy}. "
"Must be 'hdbscan', 'dbscan', 'frequency', or 'noop'."
)
def get_strategy(
strategy: str = "hdbscan",
config: Optional[ClusteringConfig] = None,
*,
fallback: bool = True,
**kwargs: Any,
) -> BaseClusteringStrategy:
"""Factory function to create clustering strategy with fallback chain.
    The fallback chain is: hdbscan -> dbscan -> noop. The frequency strategy
    is only used when requested explicitly.
Args:
strategy: Clustering strategy to use. Options:
- "hdbscan": HDBSCAN clustering (default, recommended)
- "dbscan": DBSCAN clustering (fallback)
- "frequency": Frequency-based clustering (groups by symbol occurrence)
- "noop": No-op strategy (returns all results ungrouped)
- "auto": Try hdbscan, then dbscan, then noop
config: Clustering configuration. Uses defaults if not provided.
For frequency strategy, pass FrequencyConfig for full control.
fallback: If True (default), automatically fall back to next strategy
in the chain when primary is unavailable. If False, raise ImportError
when requested strategy is unavailable.
**kwargs: Additional strategy-specific arguments.
For DBSCANStrategy: eps, eps_percentile
For FrequencyStrategy: group_by, min_frequency, etc.
Returns:
BaseClusteringStrategy: Configured clustering strategy instance.
Raises:
ValueError: If strategy is not recognized.
ImportError: If required dependencies are not installed and fallback=False.
Example:
>>> from codexlens.search.clustering import get_strategy, ClusteringConfig
>>> config = ClusteringConfig(min_cluster_size=3)
>>> # Auto-select best available strategy
>>> strategy = get_strategy("auto", config)
>>> # Explicitly use HDBSCAN (will fall back if unavailable)
>>> strategy = get_strategy("hdbscan", config)
>>> # Use frequency-based strategy
>>> from codexlens.search.clustering import FrequencyConfig
>>> freq_config = FrequencyConfig(min_frequency=2, group_by="symbol")
>>> strategy = get_strategy("frequency", freq_config)
"""
strategy = (strategy or "").strip().lower()
# Handle "auto" - try strategies in order
if strategy == "auto":
return _get_best_available_strategy(config, **kwargs)
if strategy == "hdbscan":
ok, err = check_clustering_strategy_available("hdbscan")
if ok:
from .hdbscan_strategy import HDBSCANStrategy
return HDBSCANStrategy(config)
if fallback:
# Try dbscan fallback
ok_dbscan, _ = check_clustering_strategy_available("dbscan")
if ok_dbscan:
from .dbscan_strategy import DBSCANStrategy
return DBSCANStrategy(config, **kwargs)
# Final fallback to noop
return NoOpStrategy(config)
raise ImportError(err)
if strategy == "dbscan":
ok, err = check_clustering_strategy_available("dbscan")
if ok:
from .dbscan_strategy import DBSCANStrategy
return DBSCANStrategy(config, **kwargs)
if fallback:
# Fallback to noop
return NoOpStrategy(config)
raise ImportError(err)
if strategy == "frequency":
from .frequency_strategy import FrequencyStrategy, FrequencyConfig
# If config is ClusteringConfig but not FrequencyConfig, create default FrequencyConfig
if config is None or not isinstance(config, FrequencyConfig):
freq_config = FrequencyConfig(**kwargs) if kwargs else FrequencyConfig()
else:
freq_config = config
return FrequencyStrategy(freq_config)
if strategy == "noop":
return NoOpStrategy(config)
raise ValueError(
f"Unknown clustering strategy: {strategy}. "
"Supported strategies: 'hdbscan', 'dbscan', 'frequency', 'noop', 'auto'"
)
def _get_best_available_strategy(
config: Optional[ClusteringConfig] = None,
**kwargs: Any,
) -> BaseClusteringStrategy:
"""Get the best available clustering strategy.
Tries strategies in order: hdbscan -> dbscan -> noop
Args:
config: Clustering configuration.
**kwargs: Additional strategy-specific arguments.
Returns:
Best available clustering strategy instance.
"""
# Try HDBSCAN first
ok, _ = check_clustering_strategy_available("hdbscan")
if ok:
from .hdbscan_strategy import HDBSCANStrategy
return HDBSCANStrategy(config)
# Try DBSCAN second
ok, _ = check_clustering_strategy_available("dbscan")
if ok:
from .dbscan_strategy import DBSCANStrategy
return DBSCANStrategy(config, **kwargs)
# Fallback to NoOp
return NoOpStrategy(config)
# Backward-compatible class facade over the module-level factory functions
class ClusteringStrategyFactory:
    """Kept for backward compatibility; prefer the module-level functions."""

    get_strategy = staticmethod(get_strategy)
    check_available = staticmethod(check_clustering_strategy_available)
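
A sketch of both fallback modes, assuming codexlens is installed; which class comes back depends on the optional backends present:

from codexlens.search.clustering import ClusteringConfig, get_strategy

config = ClusteringConfig(min_cluster_size=3)

# fallback=True (default): always returns a usable strategy.
strategy = get_strategy("hdbscan", config)
print(type(strategy).__name__)  # HDBSCANStrategy, DBSCANStrategy, or NoOpStrategy

# fallback=False: surface the missing dependency instead of degrading silently.
try:
    get_strategy("hdbscan", config, fallback=False)
except ImportError as exc:
    print(exc)  # includes the pip install hint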

View File: codexlens/search/clustering/frequency_strategy.py

@@ -0,0 +1,263 @@
"""Frequency-based clustering strategy for search result deduplication.
This strategy groups search results by symbol/method name and prunes based on
occurrence frequency. High-frequency symbols (frequently referenced methods)
are considered more important and retained, while low-frequency results
(potentially noise) can be filtered out.
Use cases:
- Prioritize commonly called methods/functions
- Filter out one-off results that may be less relevant
- Deduplicate results pointing to the same symbol from different locations
"""
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Optional, Literal
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
@dataclass
class FrequencyConfig(ClusteringConfig):
"""Configuration for frequency-based clustering strategy.
Attributes:
group_by: Field to group results by for frequency counting.
- 'symbol': Group by symbol_name (default, for method/function dedup)
- 'file': Group by file path
- 'symbol_kind': Group by symbol type (function, class, etc.)
min_frequency: Minimum occurrence count to keep a result.
Results appearing less than this are considered noise and pruned.
max_representatives_per_group: Maximum results to keep per symbol group.
frequency_weight: How much to boost score based on frequency.
            Final score = original_score * (1 + frequency_weight * log(frequency + 1))
keep_mode: How to handle low-frequency results.
- 'filter': Remove results below min_frequency
- 'demote': Keep but lower their score ranking
"""
group_by: Literal["symbol", "file", "symbol_kind"] = "symbol"
min_frequency: int = 1 # 1 means keep all, 2+ filters singletons
max_representatives_per_group: int = 3
frequency_weight: float = 0.1 # Boost factor for frequency
keep_mode: Literal["filter", "demote"] = "demote"
def __post_init__(self) -> None:
"""Validate configuration parameters."""
# Skip parent validation since we don't use HDBSCAN params
if self.min_frequency < 1:
raise ValueError("min_frequency must be >= 1")
if self.max_representatives_per_group < 1:
raise ValueError("max_representatives_per_group must be >= 1")
if self.frequency_weight < 0:
raise ValueError("frequency_weight must be >= 0")
if self.group_by not in ("symbol", "file", "symbol_kind"):
raise ValueError(f"group_by must be one of: symbol, file, symbol_kind; got {self.group_by}")
if self.keep_mode not in ("filter", "demote"):
raise ValueError(f"keep_mode must be one of: filter, demote; got {self.keep_mode}")
class FrequencyStrategy(BaseClusteringStrategy):
"""Frequency-based clustering strategy for search result deduplication.
This strategy groups search results by symbol name (or file/kind) and:
1. Counts how many times each symbol appears in results
2. Higher frequency = more important (frequently referenced method)
3. Filters or demotes low-frequency results
4. Selects top representatives from each frequency group
Unlike embedding-based strategies (HDBSCAN, DBSCAN), this strategy:
- Does NOT require embeddings (works with metadata only)
- Is very fast (O(n) complexity)
- Is deterministic (no random initialization)
- Works well for symbol-level deduplication
Example:
>>> config = FrequencyConfig(min_frequency=2, group_by="symbol")
>>> strategy = FrequencyStrategy(config)
>>> # Results with symbol "authenticate" appearing 5 times
>>> # will be prioritized over "helper_func" appearing once
>>> representatives = strategy.fit_predict(embeddings, results)
"""
def __init__(self, config: Optional[FrequencyConfig] = None) -> None:
"""Initialize the frequency strategy.
Args:
config: Frequency configuration. Uses defaults if not provided.
"""
self.config: FrequencyConfig = config or FrequencyConfig()
def _get_group_key(self, result: "SearchResult") -> str:
"""Extract grouping key from a search result.
Args:
result: SearchResult to extract key from.
Returns:
String key for grouping (symbol name, file path, or kind).
"""
if self.config.group_by == "symbol":
# Use symbol_name if available, otherwise fall back to file:line
symbol = getattr(result, "symbol_name", None)
if symbol:
return str(symbol)
# Fallback: use file path + start_line as pseudo-symbol
start_line = getattr(result, "start_line", 0) or 0
return f"{result.path}:{start_line}"
elif self.config.group_by == "file":
return str(result.path)
elif self.config.group_by == "symbol_kind":
kind = getattr(result, "symbol_kind", None)
return str(kind) if kind else "unknown"
return str(result.path) # Default fallback
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Group search results by frequency of occurrence.
Note: This method ignores embeddings and groups by metadata only.
The embeddings parameter is kept for interface compatibility.
Args:
embeddings: Ignored (kept for interface compatibility).
results: List of SearchResult objects to cluster.
Returns:
List of clusters (groups), where each cluster contains indices
of results with the same grouping key. Clusters are ordered by
frequency (highest frequency first).
"""
if not results:
return []
# Group results by key
groups: Dict[str, List[int]] = defaultdict(list)
for idx, result in enumerate(results):
key = self._get_group_key(result)
groups[key].append(idx)
# Sort groups by frequency (descending) then by key (for stability)
sorted_groups = sorted(
groups.items(),
key=lambda x: (-len(x[1]), x[0]) # -frequency, then alphabetical
)
# Convert to list of clusters
clusters = [indices for _, indices in sorted_groups]
return clusters
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results based on frequency and score.
For each frequency group:
1. If frequency < min_frequency: filter or demote based on keep_mode
2. Sort by score within group
3. Apply frequency boost to scores
4. Select top N representatives
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (used for tie-breaking if provided).
Returns:
List of representative SearchResult objects, ordered by
frequency-adjusted score (highest first).
"""
import math
if not clusters or not results:
return []
representatives: List["SearchResult"] = []
demoted: List["SearchResult"] = []
for cluster_indices in clusters:
if not cluster_indices:
continue
frequency = len(cluster_indices)
# Get results in this cluster, sorted by score
cluster_results = [results[i] for i in cluster_indices]
cluster_results.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True)
# Check frequency threshold
if frequency < self.config.min_frequency:
if self.config.keep_mode == "filter":
# Skip low-frequency results entirely
continue
else: # demote mode
# Keep but add to demoted list (lower priority)
for result in cluster_results[: self.config.max_representatives_per_group]:
demoted.append(result)
continue
# Apply frequency boost and select top representatives
for result in cluster_results[: self.config.max_representatives_per_group]:
# Calculate frequency-boosted score
original_score = getattr(result, "score", 0.0)
# log(frequency + 1) to handle frequency=1 case smoothly
frequency_boost = 1.0 + self.config.frequency_weight * math.log(frequency + 1)
boosted_score = original_score * frequency_boost
# Create new result with boosted score and frequency metadata
# Note: SearchResult might be immutable, so we preserve original
# and track boosted score in metadata
if hasattr(result, "metadata") and isinstance(result.metadata, dict):
result.metadata["frequency"] = frequency
result.metadata["frequency_boosted_score"] = boosted_score
representatives.append(result)
# Sort representatives by boosted score (or original score as fallback)
def get_sort_score(r: "SearchResult") -> float:
if hasattr(r, "metadata") and isinstance(r.metadata, dict):
return r.metadata.get("frequency_boosted_score", getattr(r, "score", 0.0))
return getattr(r, "score", 0.0)
representatives.sort(key=get_sort_score, reverse=True)
# Add demoted results at the end
if demoted:
demoted.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True)
representatives.extend(demoted)
return representatives
def fit_predict(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List["SearchResult"]:
"""Convenience method to cluster and select representatives in one call.
Args:
embeddings: NumPy array (may be ignored for frequency-based clustering).
results: List of SearchResult objects.
Returns:
List of representative SearchResult objects.
"""
clusters = self.cluster(embeddings, results)
return self.select_representatives(clusters, results, embeddings)
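
A self-contained sketch of the demote path (FakeResult is a stand-in for SearchResult; the symbol names are invented): the thrice-repeated symbol is frequency-boosted and kept, while the singleton drops to the end rather than being filtered.

from dataclasses import dataclass, field

import numpy as np

from codexlens.search.clustering import FrequencyConfig, FrequencyStrategy

@dataclass
class FakeResult:
    # Stand-in for codexlens.entities.SearchResult
    path: str
    symbol_name: str
    score: float
    metadata: dict = field(default_factory=dict)

results = [
    FakeResult("auth/core.py", "authenticate", 0.90),
    FakeResult("auth/api.py", "authenticate", 0.80),
    FakeResult("auth/cli.py", "authenticate", 0.70),
    FakeResult("util/misc.py", "helper_func", 0.95),
]

strategy = FrequencyStrategy(FrequencyConfig(min_frequency=2, keep_mode="demote"))
ranked = strategy.fit_predict(np.empty((len(results), 0)), results)  # embeddings ignored
for r in ranked:
    print(r.symbol_name, r.metadata.get("frequency"))
# authenticate (x3, frequency=3), then helper_func (demoted singleton, no boost metadata)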

View File: codexlens/search/clustering/hdbscan_strategy.py

@@ -0,0 +1,153 @@
"""HDBSCAN-based clustering strategy for search results.
HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
is the primary clustering strategy for grouping similar search results.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
class HDBSCANStrategy(BaseClusteringStrategy):
"""HDBSCAN-based clustering strategy.
Uses HDBSCAN algorithm to cluster search results based on embedding similarity.
HDBSCAN is preferred over DBSCAN because it:
- Automatically determines the number of clusters
- Handles varying density clusters well
- Identifies noise points (outliers) effectively
Example:
>>> from codexlens.search.clustering import HDBSCANStrategy, ClusteringConfig
>>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
>>> strategy = HDBSCANStrategy(config)
>>> clusters = strategy.cluster(embeddings, results)
>>> representatives = strategy.select_representatives(clusters, results)
"""
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
"""Initialize HDBSCAN clustering strategy.
Args:
config: Clustering configuration. Uses defaults if not provided.
Raises:
ImportError: If hdbscan package is not installed.
"""
super().__init__(config)
# Validate hdbscan is available
try:
import hdbscan # noqa: F401
except ImportError as exc:
raise ImportError(
"hdbscan package is required for HDBSCANStrategy. "
"Install with: pip install codexlens[clustering]"
) from exc
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Cluster search results using HDBSCAN algorithm.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim)
containing the embedding vectors for each result.
results: List of SearchResult objects corresponding to embeddings.
Returns:
List of clusters, where each cluster is a list of indices
into the results list. Noise points are returned as singleton clusters.
"""
import hdbscan
import numpy as np
n_results = len(results)
if n_results == 0:
return []
# Handle edge case: fewer results than min_cluster_size
if n_results < self.config.min_cluster_size:
# Return each result as its own singleton cluster
return [[i] for i in range(n_results)]
# Configure HDBSCAN clusterer
clusterer = hdbscan.HDBSCAN(
min_cluster_size=self.config.min_cluster_size,
min_samples=self.config.min_samples,
metric=self.config.metric,
cluster_selection_epsilon=self.config.cluster_selection_epsilon,
allow_single_cluster=self.config.allow_single_cluster,
prediction_data=self.config.prediction_data,
)
# Fit and get cluster labels
# Labels: -1 = noise, 0+ = cluster index
labels = clusterer.fit_predict(embeddings)
# Group indices by cluster label
cluster_map: dict[int, list[int]] = {}
for idx, label in enumerate(labels):
if label not in cluster_map:
cluster_map[label] = []
cluster_map[label].append(idx)
# Build result: non-noise clusters first, then noise as singletons
clusters: List[List[int]] = []
# Add proper clusters (label >= 0)
for label in sorted(cluster_map.keys()):
if label >= 0:
clusters.append(cluster_map[label])
# Add noise points as singleton clusters (label == -1)
if -1 in cluster_map:
for idx in cluster_map[-1]:
clusters.append([idx])
return clusters
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results from each cluster.
Selects the result with the highest score from each cluster.
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (not used in score-based selection).
Returns:
List of representative SearchResult objects, one per cluster,
ordered by score (highest first).
"""
if not clusters or not results:
return []
representatives: List["SearchResult"] = []
for cluster_indices in clusters:
if not cluster_indices:
continue
# Find the result with the highest score in this cluster
best_idx = max(cluster_indices, key=lambda i: results[i].score)
representatives.append(results[best_idx])
# Sort by score descending
representatives.sort(key=lambda r: r.score, reverse=True)
return representatives
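
A sketch with synthetic embeddings (requires the optional hdbscan dependency; the two tight groups and the outlier are fabricated to make the clustering visible):

import numpy as np

from codexlens.search.clustering import ClusteringConfig, HDBSCANStrategy

rng = np.random.default_rng(42)
group_a = rng.normal(loc=0.0, scale=0.05, size=(4, 8))
group_b = rng.normal(loc=1.0, scale=0.05, size=(4, 8))
outlier = rng.normal(loc=5.0, scale=0.05, size=(1, 8))
embeddings = np.vstack([group_a, group_b, outlier])

strategy = HDBSCANStrategy(ClusteringConfig(min_cluster_size=3, metric="euclidean"))
results = [object()] * len(embeddings)  # cluster() only needs len(results)
clusters = strategy.cluster(embeddings, results)
print(clusters)  # e.g. [[0, 1, 2, 3], [4, 5, 6, 7], [8]] with the outlier as a singleton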

View File: codexlens/search/clustering/noop_strategy.py

@@ -0,0 +1,83 @@
"""No-op clustering strategy for search results.
NoOpStrategy returns all results ungrouped when clustering dependencies
are not available or clustering is disabled.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
class NoOpStrategy(BaseClusteringStrategy):
"""No-op clustering strategy that returns all results ungrouped.
This strategy is used as a final fallback when no clustering dependencies
are available, or when clustering is explicitly disabled. Each result
is treated as its own singleton cluster.
Example:
>>> from codexlens.search.clustering import NoOpStrategy
>>> strategy = NoOpStrategy()
>>> clusters = strategy.cluster(embeddings, results)
>>> # Returns [[0], [1], [2], ...] - each result in its own cluster
>>> representatives = strategy.select_representatives(clusters, results)
>>> # Returns all results sorted by score
"""
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
"""Initialize NoOp clustering strategy.
Args:
config: Clustering configuration. Ignored for NoOpStrategy
but accepted for interface compatibility.
"""
super().__init__(config)
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Return each result as its own singleton cluster.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim).
Not used but accepted for interface compatibility.
results: List of SearchResult objects.
Returns:
List of singleton clusters, one per result.
"""
return [[i] for i in range(len(results))]
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Return all results sorted by score.
Since each cluster is a singleton, this effectively returns all
results sorted by score descending.
Args:
clusters: List of singleton clusters.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (not used).
Returns:
All SearchResult objects sorted by score (highest first).
"""
if not results:
return []
# Return all results sorted by score
return sorted(results, key=lambda r: r.score, reverse=True)
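
Since NoOpStrategy needs neither a clustering backend nor meaningful embeddings, it can be exercised end to end with trivial data (FakeResult is a stand-in for SearchResult):

from dataclasses import dataclass

import numpy as np

from codexlens.search.clustering import NoOpStrategy

@dataclass
class FakeResult:
    # Stand-in for codexlens.entities.SearchResult
    path: str
    score: float

results = [FakeResult("a.py", 0.2), FakeResult("b.py", 0.9)]
strategy = NoOpStrategy()
clusters = strategy.cluster(np.zeros((2, 3)), results)  # [[0], [1]]
ranked = strategy.select_representatives(clusters, results)
print([r.path for r in ranked])  # ['b.py', 'a.py'] - sorted by score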