mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-10 02:24:35 +08:00
198 lines
6.5 KiB
Python
198 lines
6.5 KiB
Python
"""DBSCAN-based clustering strategy for search results.

DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
is the fallback clustering strategy when HDBSCAN is not available.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING, List, Optional
|
|
|
|
from .base import BaseClusteringStrategy, ClusteringConfig
|
|
|
|
if TYPE_CHECKING:
|
|
import numpy as np
|
|
from codexlens.entities import SearchResult
|
|
|
|
|
|
class DBSCANStrategy(BaseClusteringStrategy):
    """DBSCAN-based clustering strategy.

    Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.
    DBSCAN requires an explicit eps parameter, which is auto-computed from the
    distance distribution if not provided.

    Example:
        >>> from codexlens.search.clustering import DBSCANStrategy, ClusteringConfig
        >>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
        >>> strategy = DBSCANStrategy(config)
        >>> clusters = strategy.cluster(embeddings, results)
        >>> representatives = strategy.select_representatives(clusters, results)
    """

    # Default eps percentile for auto-computation
    DEFAULT_EPS_PERCENTILE: float = 15.0

    def __init__(
        self,
        config: Optional[ClusteringConfig] = None,
        eps: Optional[float] = None,
        eps_percentile: float = DEFAULT_EPS_PERCENTILE,
    ) -> None:
        """Initialize DBSCAN clustering strategy.

        Args:
            config: Clustering configuration, forwarded unchanged to the base
                class initializer.
            eps: Explicit eps parameter for DBSCAN. If None, eps is
                auto-computed from the pairwise distance distribution each
                time ``cluster()`` is called.
            eps_percentile: Percentile of pairwise distances to use for
                auto-computing eps. Defaults to ``DEFAULT_EPS_PERCENTILE``.

        Raises:
            ImportError: If sklearn is not installed.
        """
        super().__init__(config)
        self.eps = eps
        self.eps_percentile = eps_percentile

        # Fail fast at construction time instead of on the first cluster()
        # call; the imported name itself is not needed here.
        try:
            from sklearn.cluster import DBSCAN  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "scikit-learn package is required for DBSCANStrategy. "
                "Install with: pip install codexlens[clustering]"
            ) from exc

    def _compute_eps(self, embeddings: "np.ndarray") -> float:
        """Auto-compute eps from the pairwise distance distribution.

        Takes the ``self.eps_percentile``-th percentile of all pairwise
        distances (measured with ``self.config.metric``) as eps.

        Args:
            embeddings: NumPy array of shape (n_results, embedding_dim).

        Returns:
            Computed eps value: 0.1 when there are no point pairs, otherwise
            the percentile value clamped to be at least 1e-6.
        """
        import numpy as np
        from sklearn.metrics import pairwise_distances

        distances = pairwise_distances(embeddings, metric=self.config.metric)

        # Strict upper triangle (k=1): each unordered pair once, and the
        # diagonal of self-distances is excluded.
        pair_distances = distances[np.triu_indices_from(distances, k=1)]

        if len(pair_distances) == 0:
            # Fewer than two points means no pairs; fall back to a small eps.
            return 0.1

        eps = float(np.percentile(pair_distances, self.eps_percentile))

        # The percentile can be 0 (e.g. duplicate embeddings); keep eps
        # strictly positive.
        return max(eps, 1e-6)

    def cluster(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List[List[int]]:
        """Cluster search results using the DBSCAN algorithm.

        Args:
            embeddings: NumPy array of shape (n_results, embedding_dim)
                containing the embedding vectors for each result.
            results: List of SearchResult objects corresponding to embeddings.
                Only its length is inspected here.

        Returns:
            List of clusters, where each cluster is a list of indices into
            the results list. Proper clusters come first in ascending label
            order; noise points follow as singleton clusters.
        """
        from sklearn.cluster import DBSCAN

        n_results = len(results)
        if n_results == 0:
            return []

        # A single result is trivially its own cluster; skip sklearn.
        if n_results == 1:
            return [[0]]

        # An explicit eps wins; otherwise derive it from this batch.
        eps = self.eps if self.eps is not None else self._compute_eps(embeddings)

        # DBSCAN has no min_cluster_size parameter: the density threshold
        # passed here is config.min_samples.
        clusterer = DBSCAN(
            eps=eps,
            min_samples=self.config.min_samples,
            metric=self.config.metric,
        )

        # Labels: -1 = noise, 0+ = cluster index
        labels = clusterer.fit_predict(embeddings)

        # Group row indices by label, using plain-int keys rather than
        # NumPy scalar keys.
        cluster_map: dict[int, list[int]] = {}
        for idx, label in enumerate(labels):
            cluster_map.setdefault(int(label), []).append(idx)

        # Proper clusters (label >= 0) first, in ascending label order.
        clusters: List[List[int]] = [
            cluster_map[label] for label in sorted(cluster_map) if label >= 0
        ]

        # Each noise point (label == -1) becomes its own singleton cluster
        # so that no result is dropped.
        clusters.extend([idx] for idx in cluster_map.get(-1, ()))

        return clusters

    def select_representatives(
        self,
        clusters: List[List[int]],
        results: List["SearchResult"],
        embeddings: Optional["np.ndarray"] = None,
    ) -> List["SearchResult"]:
        """Select representative results from each cluster.

        Selects the result with the highest score from each cluster.

        Args:
            clusters: List of clusters from cluster() method.
            results: Original list of SearchResult objects.
            embeddings: Optional embeddings (not used in score-based selection).

        Returns:
            List of representative SearchResult objects, one per non-empty
            cluster, ordered by score (highest first).
        """
        if not clusters or not results:
            return []

        # Highest-scoring member of every non-empty cluster; on ties max()
        # keeps the first index listed in the cluster.
        representatives: List["SearchResult"] = [
            results[max(cluster_indices, key=lambda i: results[i].score)]
            for cluster_indices in clusters
            if cluster_indices
        ]

        # Stable sort, highest score first.
        return sorted(representatives, key=lambda r: r.score, reverse=True)
|