"""HDBSCAN-based clustering strategy for search results.
|
|
|
|
HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
|
|
is the primary clustering strategy for grouping similar search results.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import TYPE_CHECKING, List, Optional
|
|
|
|
from .base import BaseClusteringStrategy, ClusteringConfig
|
|
|
|
if TYPE_CHECKING:
|
|
import numpy as np
|
|
from codexlens.entities import SearchResult
|
|
|
|
|
|
class HDBSCANStrategy(BaseClusteringStrategy):
|
|
"""HDBSCAN-based clustering strategy.
|
|
|
|
Uses HDBSCAN algorithm to cluster search results based on embedding similarity.
|
|
HDBSCAN is preferred over DBSCAN because it:
|
|
- Automatically determines the number of clusters
|
|
- Handles varying density clusters well
|
|
- Identifies noise points (outliers) effectively
|
|
|
|
Example:
|
|
>>> from codexlens.search.clustering import HDBSCANStrategy, ClusteringConfig
|
|
>>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
|
|
>>> strategy = HDBSCANStrategy(config)
|
|
>>> clusters = strategy.cluster(embeddings, results)
|
|
>>> representatives = strategy.select_representatives(clusters, results)
|
|
"""
|
|
|
|
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
|
|
"""Initialize HDBSCAN clustering strategy.
|
|
|
|
Args:
|
|
config: Clustering configuration. Uses defaults if not provided.
|
|
|
|
Raises:
|
|
ImportError: If hdbscan package is not installed.
|
|
"""
|
|
super().__init__(config)
|
|
# Validate hdbscan is available
|
|
try:
|
|
import hdbscan # noqa: F401
|
|
except ImportError as exc:
|
|
raise ImportError(
|
|
"hdbscan package is required for HDBSCANStrategy. "
|
|
"Install with: pip install codexlens[clustering]"
|
|
) from exc
|
|
|
|
def cluster(
|
|
self,
|
|
embeddings: "np.ndarray",
|
|
results: List["SearchResult"],
|
|
) -> List[List[int]]:
|
|
"""Cluster search results using HDBSCAN algorithm.
|
|
|
|
Args:
|
|
embeddings: NumPy array of shape (n_results, embedding_dim)
|
|
containing the embedding vectors for each result.
|
|
results: List of SearchResult objects corresponding to embeddings.
|
|
|
|
Returns:
|
|
List of clusters, where each cluster is a list of indices
|
|
into the results list. Noise points are returned as singleton clusters.
|
|
"""
|
|
import hdbscan
|
|
import numpy as np
|
|
|
|
n_results = len(results)
|
|
if n_results == 0:
|
|
return []
|
|
|
|
# Handle edge case: fewer results than min_cluster_size
|
|
if n_results < self.config.min_cluster_size:
|
|
# Return each result as its own singleton cluster
|
|
return [[i] for i in range(n_results)]
|
|
|
|
# Configure HDBSCAN clusterer
|
|
clusterer = hdbscan.HDBSCAN(
|
|
min_cluster_size=self.config.min_cluster_size,
|
|
min_samples=self.config.min_samples,
|
|
metric=self.config.metric,
|
|
cluster_selection_epsilon=self.config.cluster_selection_epsilon,
|
|
allow_single_cluster=self.config.allow_single_cluster,
|
|
prediction_data=self.config.prediction_data,
|
|
)
|
|
|
|
# Fit and get cluster labels
|
|
# Labels: -1 = noise, 0+ = cluster index
|
|
labels = clusterer.fit_predict(embeddings)
|
|
|
|
# Group indices by cluster label
|
|
cluster_map: dict[int, list[int]] = {}
|
|
for idx, label in enumerate(labels):
|
|
if label not in cluster_map:
|
|
cluster_map[label] = []
|
|
cluster_map[label].append(idx)
|
|
|
|
# Build result: non-noise clusters first, then noise as singletons
|
|
clusters: List[List[int]] = []
|
|
|
|
# Add proper clusters (label >= 0)
|
|
for label in sorted(cluster_map.keys()):
|
|
if label >= 0:
|
|
clusters.append(cluster_map[label])
|
|
|
|
# Add noise points as singleton clusters (label == -1)
|
|
if -1 in cluster_map:
|
|
for idx in cluster_map[-1]:
|
|
clusters.append([idx])
|
|
|
|
return clusters
|
|
|
|
def select_representatives(
|
|
self,
|
|
clusters: List[List[int]],
|
|
results: List["SearchResult"],
|
|
embeddings: Optional["np.ndarray"] = None,
|
|
) -> List["SearchResult"]:
|
|
"""Select representative results from each cluster.
|
|
|
|
Selects the result with the highest score from each cluster.
|
|
|
|
Args:
|
|
clusters: List of clusters from cluster() method.
|
|
results: Original list of SearchResult objects.
|
|
embeddings: Optional embeddings (not used in score-based selection).
|
|
|
|
Returns:
|
|
List of representative SearchResult objects, one per cluster,
|
|
ordered by score (highest first).
|
|
"""
|
|
if not clusters or not results:
|
|
return []
|
|
|
|
representatives: List["SearchResult"] = []
|
|
|
|
for cluster_indices in clusters:
|
|
if not cluster_indices:
|
|
continue
|
|
|
|
# Find the result with the highest score in this cluster
|
|
best_idx = max(cluster_indices, key=lambda i: results[i].score)
|
|
representatives.append(results[best_idx])
|
|
|
|
# Sort by score descending
|
|
representatives.sort(key=lambda r: r.score, reverse=True)
|
|
|
|
return representatives
|
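

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; not part of the module's original API).
# It assumes only what the docstrings above state: result objects expose a
# ``score`` attribute and ``embeddings`` is an (n_results, embedding_dim)
# NumPy array. ``_DemoResult`` is a hypothetical stand-in for the real
# codexlens SearchResult, used purely so the sketch is self-contained.
# Run it via ``python -m`` from the package root so the relative import of
# ``.base`` above resolves.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from dataclasses import dataclass

    import numpy as np

    @dataclass
    class _DemoResult:
        score: float

    demo_results = [_DemoResult(score=s) for s in (0.95, 0.90, 0.85, 0.40, 0.35)]
    demo_embeddings = np.random.default_rng(0).random((len(demo_results), 8))

    # min_cluster_size=2 keeps the demo tiny; real callers tune ClusteringConfig.
    strategy = HDBSCANStrategy(ClusteringConfig(min_cluster_size=2))
    demo_clusters = strategy.cluster(demo_embeddings, demo_results)
    reps = strategy.select_representatives(demo_clusters, demo_results)
    print(f"{len(demo_clusters)} clusters -> {len(reps)} representatives")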