mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-13 02:41:50 +08:00
Refactor code structure and remove redundant changes
This commit is contained in:
124
codex-lens/build/lib/codexlens/search/clustering/__init__.py
Normal file
124
codex-lens/build/lib/codexlens/search/clustering/__init__.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""Clustering strategies for the staged hybrid search pipeline.
|
||||
|
||||
This module provides extensible clustering infrastructure for grouping
|
||||
similar search results and selecting representative results.
|
||||
|
||||
Install with: pip install codexlens[clustering]
|
||||
|
||||
Example:
|
||||
>>> from codexlens.search.clustering import (
|
||||
... CLUSTERING_AVAILABLE,
|
||||
... ClusteringConfig,
|
||||
... get_strategy,
|
||||
... )
|
||||
>>> config = ClusteringConfig(min_cluster_size=3)
|
||||
>>> # Auto-select best available strategy with fallback
|
||||
>>> strategy = get_strategy("auto", config)
|
||||
>>> representatives = strategy.fit_predict(embeddings, results)
|
||||
>>>
|
||||
>>> # Or explicitly use a specific strategy
|
||||
>>> if CLUSTERING_AVAILABLE:
|
||||
... from codexlens.search.clustering import HDBSCANStrategy
|
||||
... strategy = HDBSCANStrategy(config)
|
||||
... representatives = strategy.fit_predict(embeddings, results)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# Always export base classes and factory (no heavy dependencies)
|
||||
from .base import BaseClusteringStrategy, ClusteringConfig
|
||||
from .factory import (
|
||||
ClusteringStrategyFactory,
|
||||
check_clustering_strategy_available,
|
||||
get_strategy,
|
||||
)
|
||||
from .noop_strategy import NoOpStrategy
|
||||
from .frequency_strategy import FrequencyStrategy, FrequencyConfig
|
||||
|
||||
# Feature flag for clustering availability (hdbscan + sklearn)
|
||||
CLUSTERING_AVAILABLE = False
|
||||
HDBSCAN_AVAILABLE = False
|
||||
DBSCAN_AVAILABLE = False
|
||||
_import_error: str | None = None
|
||||
|
||||
|
||||
def _detect_clustering_available() -> tuple[bool, bool, bool, str | None]:
|
||||
"""Detect if clustering dependencies are available.
|
||||
|
||||
Returns:
|
||||
Tuple of (all_available, hdbscan_available, dbscan_available, error_message).
|
||||
"""
|
||||
hdbscan_ok = False
|
||||
dbscan_ok = False
|
||||
|
||||
try:
|
||||
import hdbscan # noqa: F401
|
||||
hdbscan_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from sklearn.cluster import DBSCAN # noqa: F401
|
||||
dbscan_ok = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
all_ok = hdbscan_ok and dbscan_ok
|
||||
error = None
|
||||
if not all_ok:
|
||||
missing = []
|
||||
if not hdbscan_ok:
|
||||
missing.append("hdbscan")
|
||||
if not dbscan_ok:
|
||||
missing.append("scikit-learn")
|
||||
error = f"{', '.join(missing)} not available. Install with: pip install codexlens[clustering]"
|
||||
|
||||
return all_ok, hdbscan_ok, dbscan_ok, error
|
||||
|
||||
|
||||
# Initialize on module load
|
||||
CLUSTERING_AVAILABLE, HDBSCAN_AVAILABLE, DBSCAN_AVAILABLE, _import_error = (
|
||||
_detect_clustering_available()
|
||||
)
|
||||
|
||||
|
||||
def check_clustering_available() -> tuple[bool, str | None]:
    """Check if all clustering dependencies are available.

    Reads the module-level flags computed once at import time; it does not
    re-probe the imports on each call.

    Returns:
        Tuple of (is_available, error_message).
        error_message is None if available, otherwise contains install instructions.
    """
    return CLUSTERING_AVAILABLE, _import_error
|
||||
|
||||
|
||||
# Conditionally export strategy implementations
|
||||
__all__ = [
|
||||
# Feature flags
|
||||
"CLUSTERING_AVAILABLE",
|
||||
"HDBSCAN_AVAILABLE",
|
||||
"DBSCAN_AVAILABLE",
|
||||
"check_clustering_available",
|
||||
# Base classes
|
||||
"BaseClusteringStrategy",
|
||||
"ClusteringConfig",
|
||||
# Factory
|
||||
"ClusteringStrategyFactory",
|
||||
"get_strategy",
|
||||
"check_clustering_strategy_available",
|
||||
# Always-available strategies
|
||||
"NoOpStrategy",
|
||||
"FrequencyStrategy",
|
||||
"FrequencyConfig",
|
||||
]
|
||||
|
||||
# Conditionally add strategy classes to __all__ and module namespace
|
||||
if HDBSCAN_AVAILABLE:
|
||||
from .hdbscan_strategy import HDBSCANStrategy
|
||||
|
||||
__all__.append("HDBSCANStrategy")
|
||||
|
||||
if DBSCAN_AVAILABLE:
|
||||
from .dbscan_strategy import DBSCANStrategy
|
||||
|
||||
__all__.append("DBSCANStrategy")
|
||||
153
codex-lens/build/lib/codexlens/search/clustering/base.py
Normal file
153
codex-lens/build/lib/codexlens/search/clustering/base.py
Normal file
@@ -0,0 +1,153 @@
|
||||
"""Base classes for clustering strategies in the hybrid search pipeline.
|
||||
|
||||
This module defines the abstract base class for clustering strategies used
|
||||
in the staged hybrid search pipeline. Strategies cluster search results
|
||||
based on their embeddings and select representative results from each cluster.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
@dataclass
class ClusteringConfig:
    """Tunable parameters shared by the clustering strategies.

    Attributes:
        min_cluster_size: Smallest group of results that counts as a cluster
            (HDBSCAN's own default is 5; 2-3 tends to suit search results).
        min_samples: Neighborhood size required for a point to be treated as
            a core point; smaller values allow more clusters.
        metric: Distance metric: 'euclidean' (L2), 'cosine'
            (1 - cosine_similarity), or 'manhattan' (L1).
        cluster_selection_epsilon: Distance threshold under which nearby
            clusters may be merged.
        allow_single_cluster: Permit every result to collapse into a single
            cluster (useful when results are highly similar).
        prediction_data: Generate prediction data for scoring new points.
    """

    min_cluster_size: int = 3
    min_samples: int = 2
    metric: str = "cosine"
    cluster_selection_epsilon: float = 0.0
    allow_single_cluster: bool = True
    prediction_data: bool = False

    def __post_init__(self) -> None:
        """Reject out-of-range or unknown parameter values at construction."""
        if self.min_cluster_size < 2:
            raise ValueError("min_cluster_size must be >= 2")
        if self.min_samples < 1:
            raise ValueError("min_samples must be >= 1")
        supported_metrics = ("euclidean", "cosine", "manhattan")
        if self.metric not in supported_metrics:
            raise ValueError(f"metric must be one of: euclidean, cosine, manhattan; got {self.metric}")
        if self.cluster_selection_epsilon < 0:
            raise ValueError("cluster_selection_epsilon must be >= 0")
|
||||
|
||||
|
||||
class BaseClusteringStrategy(ABC):
    """Common interface for search-result clustering strategies.

    A strategy partitions similar search results into groups (typically
    using their embedding vectors) and then picks representative entries
    from each group, trimming redundancy while preserving diversity in
    the staged hybrid search pipeline.

    Subclasses must implement:
        - cluster(): partition results into groups of indices
        - select_representatives(): choose the best result(s) per group
    """

    def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
        """Store the configuration, falling back to defaults.

        Args:
            config: Clustering configuration; a default ``ClusteringConfig``
                is created when omitted.
        """
        self.config = config or ClusteringConfig()

    @abstractmethod
    def cluster(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List[List[int]]:
        """Partition search results into clusters of indices.

        Args:
            embeddings: Array of shape (n_results, embedding_dim) holding
                one embedding vector per result.
            results: SearchResult objects aligned with *embeddings*; may
                provide extra metadata to the clustering algorithm.

        Returns:
            A list of clusters, each a list of indices into *results*.
            Results not assigned to any cluster (noise points) must be
            emitted as single-element clusters.

        Example:
            >>> clusters = strategy.cluster(embeddings, results)
            >>> # [[0, 2, 5], [1, 3], [4], [6, 7, 8]]
            >>> # index 4 is a noise point returned as a singleton
        """
        ...

    @abstractmethod
    def select_representatives(
        self,
        clusters: List[List[int]],
        results: List["SearchResult"],
        embeddings: Optional["np.ndarray"] = None,
    ) -> List["SearchResult"]:
        """Choose the representative result(s) from every cluster.

        Selection may be score-based, centroid-based, or any custom rule
        the concrete strategy defines.

        Args:
            clusters: Index groups produced by :meth:`cluster`.
            results: The original SearchResult objects.
            embeddings: Optional embedding array for centroid-style picks.

        Returns:
            Representative SearchResult objects (one or more per cluster)
            ordered by relevance, highest score first.
        """
        ...

    def fit_predict(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List["SearchResult"]:
        """Cluster *results* and return their representatives in one step.

        Args:
            embeddings: Array of shape (n_results, embedding_dim).
            results: SearchResult objects aligned with *embeddings*.

        Returns:
            Representative SearchResult objects.
        """
        grouped = self.cluster(embeddings, results)
        return self.select_representatives(grouped, results, embeddings)
|
||||
@@ -0,0 +1,197 @@
|
||||
"""DBSCAN-based clustering strategy for search results.
|
||||
|
||||
DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
|
||||
is the fallback clustering strategy when HDBSCAN is not available.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
from .base import BaseClusteringStrategy, ClusteringConfig
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
class DBSCANStrategy(BaseClusteringStrategy):
    """Clustering strategy backed by sklearn's DBSCAN.

    Acts as the fallback backend when HDBSCAN is not installed. DBSCAN
    needs an explicit ``eps`` radius; when none is supplied it is derived
    from the pairwise-distance distribution of the embeddings.

    Example:
        >>> from codexlens.search.clustering import DBSCANStrategy, ClusteringConfig
        >>> strategy = DBSCANStrategy(ClusteringConfig(min_cluster_size=3, metric='cosine'))
        >>> clusters = strategy.cluster(embeddings, results)
        >>> representatives = strategy.select_representatives(clusters, results)
    """

    # Percentile of pairwise distances used when eps is auto-derived.
    DEFAULT_EPS_PERCENTILE: float = 15.0

    def __init__(
        self,
        config: Optional[ClusteringConfig] = None,
        eps: Optional[float] = None,
        eps_percentile: float = DEFAULT_EPS_PERCENTILE,
    ) -> None:
        """Set up the strategy and verify scikit-learn is importable.

        Args:
            config: Clustering configuration; defaults when omitted.
            eps: Fixed DBSCAN eps. Auto-derived from the distance
                distribution when None.
            eps_percentile: Percentile of pairwise distances used for the
                auto-derived eps (default 15th).

        Raises:
            ImportError: If scikit-learn is not installed.
        """
        super().__init__(config)
        self.eps = eps
        self.eps_percentile = eps_percentile

        # Fail fast with install instructions instead of at cluster() time.
        try:
            from sklearn.cluster import DBSCAN  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "scikit-learn package is required for DBSCANStrategy. "
                "Install with: pip install codexlens[clustering]"
            ) from exc

    def _compute_eps(self, embeddings: "np.ndarray") -> float:
        """Derive eps from the distribution of pairwise distances.

        Using a low percentile of all pairwise distances typically captures
        the local density of the embedding cloud well.

        Args:
            embeddings: Array of shape (n_results, embedding_dim).

        Returns:
            A strictly positive eps value.
        """
        import numpy as np
        from sklearn.metrics import pairwise_distances

        dist_matrix = pairwise_distances(embeddings, metric=self.config.metric)
        # Keep each unordered pair exactly once: strict upper triangle.
        pair_dists = dist_matrix[np.triu_indices_from(dist_matrix, k=1)]

        if len(pair_dists) == 0:
            # A lone point has no pairs; any small default suffices.
            return 0.1

        chosen = float(np.percentile(pair_dists, self.eps_percentile))
        # Never hand DBSCAN a zero (or negative) radius.
        return max(chosen, 1e-6)

    def cluster(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List[List[int]]:
        """Partition results with DBSCAN; noise points become singletons.

        Args:
            embeddings: Array of shape (n_results, embedding_dim) with one
                embedding vector per result.
            results: SearchResult objects aligned with *embeddings*.

        Returns:
            Clusters as lists of indices into *results*: proper clusters
            first (by label order), then each noise point as its own
            singleton cluster.
        """
        from sklearn.cluster import DBSCAN

        total = len(results)
        if total == 0:
            return []
        if total == 1:
            # Nothing to cluster: the only result is its own cluster.
            return [[0]]

        eps_value = self.eps if self.eps is not None else self._compute_eps(embeddings)

        # DBSCAN's min_samples plays the role of min_cluster_size here.
        # fit_predict labels: -1 = noise, 0+ = cluster id.
        labels = DBSCAN(
            eps=eps_value,
            min_samples=self.config.min_samples,
            metric=self.config.metric,
        ).fit_predict(embeddings)

        by_label: dict[int, list[int]] = {}
        for position, label in enumerate(labels):
            by_label.setdefault(label, []).append(position)

        # Proper clusters first, in ascending label order.
        ordered: List[List[int]] = [
            by_label[label] for label in sorted(by_label) if label >= 0
        ]
        # Expose every noise point (label -1) as a singleton cluster.
        ordered.extend([position] for position in by_label.get(-1, []))
        return ordered

    def select_representatives(
        self,
        clusters: List[List[int]],
        results: List["SearchResult"],
        embeddings: Optional["np.ndarray"] = None,
    ) -> List["SearchResult"]:
        """Pick the highest-scoring result of every cluster.

        Args:
            clusters: Index groups produced by :meth:`cluster`.
            results: The original SearchResult objects.
            embeddings: Unused; kept for interface compatibility.

        Returns:
            One SearchResult per non-empty cluster, sorted by score
            descending.
        """
        if not clusters or not results:
            return []

        chosen = [
            max((results[i] for i in group), key=lambda r: r.score)
            for group in clusters
            if group
        ]
        chosen.sort(key=lambda r: r.score, reverse=True)
        return chosen
|
||||
202
codex-lens/build/lib/codexlens/search/clustering/factory.py
Normal file
202
codex-lens/build/lib/codexlens/search/clustering/factory.py
Normal file
@@ -0,0 +1,202 @@
|
||||
"""Factory for creating clustering strategies.
|
||||
|
||||
Provides a unified interface for instantiating different clustering backends
|
||||
with automatic fallback chain: hdbscan -> dbscan -> noop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from .base import BaseClusteringStrategy, ClusteringConfig
|
||||
from .noop_strategy import NoOpStrategy
|
||||
|
||||
|
||||
def check_clustering_strategy_available(strategy: str) -> tuple[bool, str | None]:
    """Report whether a named clustering strategy can be instantiated.

    Args:
        strategy: Strategy name (case-insensitive, surrounding whitespace
            ignored). Recognized values:
            - "hdbscan": needs the hdbscan package
            - "dbscan": needs scikit-learn
            - "frequency": pure Python, always available
            - "noop": pure Python, always available

    Returns:
        Tuple of (is_available, error_message). error_message is None when
        the strategy is usable, otherwise it carries install instructions
        or names the invalid strategy.
    """
    name = (strategy or "").strip().lower()

    if name in ("frequency", "noop"):
        # Pure-Python strategies carry no optional dependencies.
        return True, None

    if name == "hdbscan":
        try:
            import hdbscan  # noqa: F401
        except ImportError:
            return False, (
                "hdbscan package not available. "
                "Install with: pip install codexlens[clustering]"
            )
        return True, None

    if name == "dbscan":
        try:
            from sklearn.cluster import DBSCAN  # noqa: F401
        except ImportError:
            return False, (
                "scikit-learn package not available. "
                "Install with: pip install codexlens[clustering]"
            )
        return True, None

    return False, (
        f"Invalid clustering strategy: {name}. "
        "Must be 'hdbscan', 'dbscan', 'frequency', or 'noop'."
    )
|
||||
|
||||
|
||||
def get_strategy(
    strategy: str = "hdbscan",
    config: Optional[ClusteringConfig] = None,
    *,
    fallback: bool = True,
    **kwargs: Any,
) -> BaseClusteringStrategy:
    """Factory function to create clustering strategy with fallback chain.

    The fallback chain is: hdbscan -> dbscan -> noop. The "frequency"
    strategy is never selected by fallback; it must be requested
    explicitly.

    Args:
        strategy: Clustering strategy to use. Options:
            - "hdbscan": HDBSCAN clustering (default, recommended)
            - "dbscan": DBSCAN clustering (fallback)
            - "frequency": Frequency-based clustering (groups by symbol occurrence)
            - "noop": No-op strategy (returns all results ungrouped)
            - "auto": Try hdbscan, then dbscan, then noop
        config: Clustering configuration. Uses defaults if not provided.
            For frequency strategy, pass FrequencyConfig for full control;
            a plain ClusteringConfig is ignored there and replaced by a
            FrequencyConfig built from **kwargs.
        fallback: If True (default), automatically fall back to next strategy
            in the chain when primary is unavailable. If False, raise ImportError
            when requested strategy is unavailable.
        **kwargs: Additional strategy-specific arguments.
            For DBSCANStrategy: eps, eps_percentile
            For FrequencyStrategy: group_by, min_frequency, etc.
            NOTE(review): when "hdbscan" falls back to DBSCAN, **kwargs is
            forwarded to DBSCANStrategy unchanged — confirm callers only
            pass DBSCAN-compatible kwargs in that case.

    Returns:
        BaseClusteringStrategy: Configured clustering strategy instance.

    Raises:
        ValueError: If strategy is not recognized.
        ImportError: If required dependencies are not installed and fallback=False.

    Example:
        >>> from codexlens.search.clustering import get_strategy, ClusteringConfig
        >>> config = ClusteringConfig(min_cluster_size=3)
        >>> # Auto-select best available strategy
        >>> strategy = get_strategy("auto", config)
        >>> # Explicitly use HDBSCAN (will fall back if unavailable)
        >>> strategy = get_strategy("hdbscan", config)
        >>> # Use frequency-based strategy
        >>> from codexlens.search.clustering import FrequencyConfig
        >>> freq_config = FrequencyConfig(min_frequency=2, group_by="symbol")
        >>> strategy = get_strategy("frequency", freq_config)
    """
    # Normalize so lookups are case/whitespace-insensitive.
    strategy = (strategy or "").strip().lower()

    # Handle "auto" - try strategies in order
    if strategy == "auto":
        return _get_best_available_strategy(config, **kwargs)

    if strategy == "hdbscan":
        ok, err = check_clustering_strategy_available("hdbscan")
        if ok:
            from .hdbscan_strategy import HDBSCANStrategy
            return HDBSCANStrategy(config)

        if fallback:
            # Try dbscan fallback
            ok_dbscan, _ = check_clustering_strategy_available("dbscan")
            if ok_dbscan:
                from .dbscan_strategy import DBSCANStrategy
                return DBSCANStrategy(config, **kwargs)
            # Final fallback to noop
            return NoOpStrategy(config)

        raise ImportError(err)

    if strategy == "dbscan":
        ok, err = check_clustering_strategy_available("dbscan")
        if ok:
            from .dbscan_strategy import DBSCANStrategy
            return DBSCANStrategy(config, **kwargs)

        if fallback:
            # Fallback to noop
            return NoOpStrategy(config)

        raise ImportError(err)

    if strategy == "frequency":
        from .frequency_strategy import FrequencyStrategy, FrequencyConfig
        # If config is ClusteringConfig but not FrequencyConfig, create default FrequencyConfig
        if config is None or not isinstance(config, FrequencyConfig):
            freq_config = FrequencyConfig(**kwargs) if kwargs else FrequencyConfig()
        else:
            freq_config = config
        return FrequencyStrategy(freq_config)

    if strategy == "noop":
        return NoOpStrategy(config)

    raise ValueError(
        f"Unknown clustering strategy: {strategy}. "
        "Supported strategies: 'hdbscan', 'dbscan', 'frequency', 'noop', 'auto'"
    )
|
||||
|
||||
|
||||
def _get_best_available_strategy(
    config: Optional[ClusteringConfig] = None,
    **kwargs: Any,
) -> BaseClusteringStrategy:
    """Instantiate the most capable clustering backend that can be imported.

    Preference order: hdbscan, then dbscan, then the no-op strategy.

    Args:
        config: Clustering configuration forwarded to the chosen strategy.
        **kwargs: Extra constructor arguments (consumed by DBSCANStrategy only).

    Returns:
        A ready-to-use clustering strategy instance.
    """
    hdbscan_ok, _ = check_clustering_strategy_available("hdbscan")
    if hdbscan_ok:
        from .hdbscan_strategy import HDBSCANStrategy
        return HDBSCANStrategy(config)

    dbscan_ok, _ = check_clustering_strategy_available("dbscan")
    if dbscan_ok:
        from .dbscan_strategy import DBSCANStrategy
        return DBSCANStrategy(config, **kwargs)

    # Neither optional backend is installed: degrade to the pass-through strategy.
    return NoOpStrategy(config)
|
||||
|
||||
|
||||
# Alias for backward compatibility
|
||||
ClusteringStrategyFactory = type(
|
||||
"ClusteringStrategyFactory",
|
||||
(),
|
||||
{
|
||||
"get_strategy": staticmethod(get_strategy),
|
||||
"check_available": staticmethod(check_clustering_strategy_available),
|
||||
},
|
||||
)
|
||||
@@ -0,0 +1,263 @@
|
||||
"""Frequency-based clustering strategy for search result deduplication.
|
||||
|
||||
This strategy groups search results by symbol/method name and prunes based on
|
||||
occurrence frequency. High-frequency symbols (frequently referenced methods)
|
||||
are considered more important and retained, while low-frequency results
|
||||
(potentially noise) can be filtered out.
|
||||
|
||||
Use cases:
|
||||
- Prioritize commonly called methods/functions
|
||||
- Filter out one-off results that may be less relevant
|
||||
- Deduplicate results pointing to the same symbol from different locations
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Literal
|
||||
|
||||
from .base import BaseClusteringStrategy, ClusteringConfig
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
@dataclass
class FrequencyConfig(ClusteringConfig):
    """Settings for the frequency-based deduplication strategy.

    Attributes:
        group_by: Metadata field used for grouping results:
            'symbol' (symbol_name, the default), 'file' (path), or
            'symbol_kind' (function, class, ...).
        min_frequency: Groups smaller than this count as noise; 1 keeps
            everything, 2+ prunes singletons.
        max_representatives_per_group: Cap on results kept per group.
        frequency_weight: Score boost factor; the boosted score is
            original_score * (1 + frequency_weight * log(frequency)).
        keep_mode: Handling of low-frequency groups: 'filter' drops them,
            'demote' keeps them at a lower ranking.
    """

    group_by: Literal["symbol", "file", "symbol_kind"] = "symbol"
    min_frequency: int = 1  # 1 keeps all results; 2+ filters singletons
    max_representatives_per_group: int = 3
    frequency_weight: float = 0.1  # boost factor applied per log(frequency)
    keep_mode: Literal["filter", "demote"] = "demote"

    def __post_init__(self) -> None:
        """Validate the frequency parameters.

        Intentionally replaces (and does not call) the parent's
        ``__post_init__``: the inherited HDBSCAN-oriented fields are
        unused by this strategy, so they are not re-checked here.
        """
        if self.min_frequency < 1:
            raise ValueError("min_frequency must be >= 1")
        if self.max_representatives_per_group < 1:
            raise ValueError("max_representatives_per_group must be >= 1")
        if self.frequency_weight < 0:
            raise ValueError("frequency_weight must be >= 0")
        if self.group_by not in ("symbol", "file", "symbol_kind"):
            raise ValueError(f"group_by must be one of: symbol, file, symbol_kind; got {self.group_by}")
        if self.keep_mode not in ("filter", "demote"):
            raise ValueError(f"keep_mode must be one of: filter, demote; got {self.keep_mode}")
|
||||
|
||||
|
||||
class FrequencyStrategy(BaseClusteringStrategy):
|
||||
"""Frequency-based clustering strategy for search result deduplication.
|
||||
|
||||
This strategy groups search results by symbol name (or file/kind) and:
|
||||
1. Counts how many times each symbol appears in results
|
||||
2. Higher frequency = more important (frequently referenced method)
|
||||
3. Filters or demotes low-frequency results
|
||||
4. Selects top representatives from each frequency group
|
||||
|
||||
Unlike embedding-based strategies (HDBSCAN, DBSCAN), this strategy:
|
||||
- Does NOT require embeddings (works with metadata only)
|
||||
- Is very fast (O(n) complexity)
|
||||
- Is deterministic (no random initialization)
|
||||
- Works well for symbol-level deduplication
|
||||
|
||||
Example:
|
||||
>>> config = FrequencyConfig(min_frequency=2, group_by="symbol")
|
||||
>>> strategy = FrequencyStrategy(config)
|
||||
>>> # Results with symbol "authenticate" appearing 5 times
|
||||
>>> # will be prioritized over "helper_func" appearing once
|
||||
>>> representatives = strategy.fit_predict(embeddings, results)
|
||||
"""
|
||||
|
||||
    def __init__(self, config: Optional[FrequencyConfig] = None) -> None:
        """Initialize the frequency strategy.

        Sets ``self.config`` directly (rather than via the base-class
        initializer) so the attribute can be annotated as the narrower
        ``FrequencyConfig`` type.

        Args:
            config: Frequency configuration. Uses defaults if not provided.
        """
        self.config: FrequencyConfig = config or FrequencyConfig()
|
||||
|
||||
def _get_group_key(self, result: "SearchResult") -> str:
|
||||
"""Extract grouping key from a search result.
|
||||
|
||||
Args:
|
||||
result: SearchResult to extract key from.
|
||||
|
||||
Returns:
|
||||
String key for grouping (symbol name, file path, or kind).
|
||||
"""
|
||||
if self.config.group_by == "symbol":
|
||||
# Use symbol_name if available, otherwise fall back to file:line
|
||||
symbol = getattr(result, "symbol_name", None)
|
||||
if symbol:
|
||||
return str(symbol)
|
||||
# Fallback: use file path + start_line as pseudo-symbol
|
||||
start_line = getattr(result, "start_line", 0) or 0
|
||||
return f"{result.path}:{start_line}"
|
||||
|
||||
elif self.config.group_by == "file":
|
||||
return str(result.path)
|
||||
|
||||
elif self.config.group_by == "symbol_kind":
|
||||
kind = getattr(result, "symbol_kind", None)
|
||||
return str(kind) if kind else "unknown"
|
||||
|
||||
return str(result.path) # Default fallback
|
||||
|
||||
def cluster(
|
||||
self,
|
||||
embeddings: "np.ndarray",
|
||||
results: List["SearchResult"],
|
||||
) -> List[List[int]]:
|
||||
"""Group search results by frequency of occurrence.
|
||||
|
||||
Note: This method ignores embeddings and groups by metadata only.
|
||||
The embeddings parameter is kept for interface compatibility.
|
||||
|
||||
Args:
|
||||
embeddings: Ignored (kept for interface compatibility).
|
||||
results: List of SearchResult objects to cluster.
|
||||
|
||||
Returns:
|
||||
List of clusters (groups), where each cluster contains indices
|
||||
of results with the same grouping key. Clusters are ordered by
|
||||
frequency (highest frequency first).
|
||||
"""
|
||||
if not results:
|
||||
return []
|
||||
|
||||
# Group results by key
|
||||
groups: Dict[str, List[int]] = defaultdict(list)
|
||||
for idx, result in enumerate(results):
|
||||
key = self._get_group_key(result)
|
||||
groups[key].append(idx)
|
||||
|
||||
# Sort groups by frequency (descending) then by key (for stability)
|
||||
sorted_groups = sorted(
|
||||
groups.items(),
|
||||
key=lambda x: (-len(x[1]), x[0]) # -frequency, then alphabetical
|
||||
)
|
||||
|
||||
# Convert to list of clusters
|
||||
clusters = [indices for _, indices in sorted_groups]
|
||||
|
||||
return clusters
|
||||
|
||||
def select_representatives(
    self,
    clusters: List[List[int]],
    results: List["SearchResult"],
    embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
    """Pick representative results, ranked by frequency-adjusted score.

    For every frequency group:
    1. Groups below ``min_frequency`` are dropped (``keep_mode ==
       "filter"``) or pushed to the end of the output ("demote").
    2. Members are ordered by score within the group.
    3. Scores of kept members receive a logarithmic frequency boost.
    4. At most ``max_representatives_per_group`` members survive.

    Args:
        clusters: List of clusters from cluster() method.
        results: Original list of SearchResult objects.
        embeddings: Optional embeddings (used for tie-breaking if provided).

    Returns:
        Representative SearchResult objects ordered by frequency-adjusted
        score, highest first, with demoted results appended at the end.
    """
    import math

    if not clusters or not results:
        return []

    def _score(item: "SearchResult") -> float:
        return getattr(item, "score", 0.0)

    kept: List["SearchResult"] = []
    low_priority: List["SearchResult"] = []
    cap = self.config.max_representatives_per_group

    for group in clusters:
        if not group:
            continue

        freq = len(group)
        members = sorted((results[i] for i in group), key=_score, reverse=True)
        top = members[:cap]

        if freq < self.config.min_frequency:
            if self.config.keep_mode == "filter":
                # Drop low-frequency groups entirely.
                continue
            # "demote" mode: keep the group but rank it after the rest.
            low_priority.extend(top)
            continue

        # log(freq + 1) keeps the boost smooth for freq == 1.
        multiplier = 1.0 + self.config.frequency_weight * math.log(freq + 1)
        for item in top:
            adjusted = _score(item) * multiplier
            # SearchResult may be immutable, so the boosted score is
            # tracked in metadata instead of overwriting `score`.
            if hasattr(item, "metadata") and isinstance(item.metadata, dict):
                item.metadata["frequency"] = freq
                item.metadata["frequency_boosted_score"] = adjusted
            kept.append(item)

    def _rank(item: "SearchResult") -> float:
        # Prefer the boosted score recorded above; fall back to the raw
        # score for results without a metadata dict.
        if hasattr(item, "metadata") and isinstance(item.metadata, dict):
            return item.metadata.get("frequency_boosted_score", _score(item))
        return _score(item)

    kept.sort(key=_rank, reverse=True)

    if low_priority:
        low_priority.sort(key=_score, reverse=True)
        kept.extend(low_priority)

    return kept
|
||||
|
||||
def fit_predict(
    self,
    embeddings: "np.ndarray",
    results: List["SearchResult"],
) -> List["SearchResult"]:
    """Cluster and select representatives in a single call.

    Args:
        embeddings: NumPy array (may be ignored for frequency-based clustering).
        results: List of SearchResult objects.

    Returns:
        List of representative SearchResult objects.
    """
    return self.select_representatives(
        self.cluster(embeddings, results), results, embeddings
    )
|
||||
@@ -0,0 +1,153 @@
|
||||
"""HDBSCAN-based clustering strategy for search results.
|
||||
|
||||
HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
|
||||
is the primary clustering strategy for grouping similar search results.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
from .base import BaseClusteringStrategy, ClusteringConfig
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
class HDBSCANStrategy(BaseClusteringStrategy):
    """HDBSCAN-based clustering strategy.

    Uses HDBSCAN algorithm to cluster search results based on embedding similarity.
    HDBSCAN is preferred over DBSCAN because it:
    - Automatically determines the number of clusters
    - Handles varying density clusters well
    - Identifies noise points (outliers) effectively

    Note:
        The hdbscan library does not accept ``metric='cosine'``; use
        euclidean distance (optionally on L2-normalized embeddings, which
        is rank-equivalent to cosine similarity).

    Example:
        >>> from codexlens.search.clustering import HDBSCANStrategy, ClusteringConfig
        >>> config = ClusteringConfig(min_cluster_size=3, metric='euclidean')
        >>> strategy = HDBSCANStrategy(config)
        >>> clusters = strategy.cluster(embeddings, results)
        >>> representatives = strategy.select_representatives(clusters, results)
    """

    def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
        """Initialize HDBSCAN clustering strategy.

        Args:
            config: Clustering configuration. Uses defaults if not provided.

        Raises:
            ImportError: If hdbscan package is not installed.
        """
        super().__init__(config)
        # Fail fast at construction time if the optional dependency is absent.
        try:
            import hdbscan  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "hdbscan package is required for HDBSCANStrategy. "
                "Install with: pip install codexlens[clustering]"
            ) from exc

    def cluster(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List[List[int]]:
        """Cluster search results using HDBSCAN algorithm.

        Args:
            embeddings: NumPy array of shape (n_results, embedding_dim)
                containing the embedding vectors for each result.
            results: List of SearchResult objects corresponding to embeddings.

        Returns:
            List of clusters, where each cluster is a list of indices
            into the results list. Proper clusters come first (ascending
            HDBSCAN label), followed by noise points as singleton clusters.
        """
        import hdbscan

        n_results = len(results)
        if n_results == 0:
            return []

        # Edge case: with fewer results than min_cluster_size HDBSCAN
        # cannot form any cluster, so treat each result as a singleton.
        if n_results < self.config.min_cluster_size:
            return [[i] for i in range(n_results)]

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=self.config.min_cluster_size,
            min_samples=self.config.min_samples,
            metric=self.config.metric,
            cluster_selection_epsilon=self.config.cluster_selection_epsilon,
            allow_single_cluster=self.config.allow_single_cluster,
            prediction_data=self.config.prediction_data,
        )

        # Labels: -1 = noise, 0+ = cluster index.
        labels = clusterer.fit_predict(embeddings)

        # Group indices by cluster label. Coerce numpy integer labels to
        # plain ints so the mapping keys behave predictably.
        cluster_map: dict[int, list[int]] = {}
        for idx, label in enumerate(labels):
            cluster_map.setdefault(int(label), []).append(idx)

        # Proper clusters first (label >= 0, ascending), then each noise
        # point (label == -1) as its own singleton cluster.
        clusters: List[List[int]] = [
            cluster_map[label] for label in sorted(cluster_map) if label >= 0
        ]
        for idx in cluster_map.get(-1, []):
            clusters.append([idx])

        return clusters

    def select_representatives(
        self,
        clusters: List[List[int]],
        results: List["SearchResult"],
        embeddings: Optional["np.ndarray"] = None,
    ) -> List["SearchResult"]:
        """Select representative results from each cluster.

        Selects the result with the highest score from each cluster.

        Args:
            clusters: List of clusters from cluster() method.
            results: Original list of SearchResult objects.
            embeddings: Optional embeddings (not used in score-based selection).

        Returns:
            List of representative SearchResult objects, one per cluster,
            ordered by score (highest first).
        """
        if not clusters or not results:
            return []

        representatives: List["SearchResult"] = []

        for cluster_indices in clusters:
            if not cluster_indices:
                continue
            # Keep only the highest-scoring member of each cluster.
            best_idx = max(cluster_indices, key=lambda i: results[i].score)
            representatives.append(results[best_idx])

        # Present the strongest representatives first.
        representatives.sort(key=lambda r: r.score, reverse=True)

        return representatives
|
||||
@@ -0,0 +1,83 @@
|
||||
"""No-op clustering strategy for search results.
|
||||
|
||||
NoOpStrategy returns all results ungrouped when clustering dependencies
|
||||
are not available or clustering is disabled.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
from .base import BaseClusteringStrategy, ClusteringConfig
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
from codexlens.entities import SearchResult
|
||||
|
||||
|
||||
class NoOpStrategy(BaseClusteringStrategy):
    """Fallback strategy that performs no real clustering.

    Used when clustering dependencies are missing or clustering is
    explicitly disabled: every result becomes its own singleton cluster,
    and representative selection simply returns all results sorted by
    score.

    Example:
        >>> from codexlens.search.clustering import NoOpStrategy
        >>> strategy = NoOpStrategy()
        >>> clusters = strategy.cluster(embeddings, results)
        >>> # [[0], [1], [2], ...] - each result in its own cluster
        >>> representatives = strategy.select_representatives(clusters, results)
        >>> # all results, sorted by score
    """

    def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
        """Initialize the no-op strategy.

        Args:
            config: Clustering configuration. Accepted only for interface
                compatibility; NoOpStrategy never consults it.
        """
        super().__init__(config)

    def cluster(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List[List[int]]:
        """Place every result in its own singleton cluster.

        Args:
            embeddings: NumPy array of shape (n_results, embedding_dim).
                Unused; accepted for interface compatibility.
            results: List of SearchResult objects.

        Returns:
            One singleton cluster per result, in input order.
        """
        return [[index] for index, _ in enumerate(results)]

    def select_representatives(
        self,
        clusters: List[List[int]],
        results: List["SearchResult"],
        embeddings: Optional["np.ndarray"] = None,
    ) -> List["SearchResult"]:
        """Return every result, ordered by score descending.

        Because each cluster is a singleton, nothing is deduplicated:
        all results survive, merely re-sorted by score.

        Args:
            clusters: List of singleton clusters (unused beyond interface
                compatibility).
            results: Original list of SearchResult objects.
            embeddings: Optional embeddings (not used).

        Returns:
            All SearchResult objects sorted by score (highest first).
        """
        if not results:
            return []

        ordered = list(results)
        ordered.sort(key=lambda item: item.score, reverse=True)
        return ordered
|
||||
Reference in New Issue
Block a user