Claude-Code-Workflow/codex-lens/build/lib/codexlens/search/clustering/hdbscan_strategy.py

"""HDBSCAN-based clustering strategy for search results.

HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
is the primary clustering strategy for grouping similar search results.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, List, Optional

from .base import BaseClusteringStrategy, ClusteringConfig

if TYPE_CHECKING:
    import numpy as np
    from codexlens.entities import SearchResult


class HDBSCANStrategy(BaseClusteringStrategy):
    """HDBSCAN-based clustering strategy.

    Uses the HDBSCAN algorithm to cluster search results based on embedding
    similarity. HDBSCAN is preferred over DBSCAN because it:
    - Automatically determines the number of clusters
    - Handles varying density clusters well
    - Identifies noise points (outliers) effectively

    The ``hdbscan`` package is imported lazily (inside the methods that need
    it) so that importing this module does not require the optional
    dependency; constructing the strategy does.

    Example:
        >>> from codexlens.search.clustering import HDBSCANStrategy, ClusteringConfig
        >>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
        >>> strategy = HDBSCANStrategy(config)
        >>> clusters = strategy.cluster(embeddings, results)
        >>> representatives = strategy.select_representatives(clusters, results)
    """

    def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
        """Initialize HDBSCAN clustering strategy.

        Args:
            config: Clustering configuration, forwarded unchanged to the
                base class initializer (which is expected to fall back to
                defaults when ``None`` is given).

        Raises:
            ImportError: If hdbscan package is not installed.
        """
        super().__init__(config)
        # Fail fast at construction time rather than on the first cluster()
        # call, so a missing optional dependency is reported early.
        try:
            import hdbscan  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "hdbscan package is required for HDBSCANStrategy. "
                "Install with: pip install codexlens[clustering]"
            ) from exc

    def cluster(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List[List[int]]:
        """Cluster search results using HDBSCAN algorithm.

        Args:
            embeddings: NumPy array of shape (n_results, embedding_dim)
                containing the embedding vectors for each result.
            results: List of SearchResult objects corresponding to embeddings.

        Returns:
            List of clusters, where each cluster is a list of indices
            into the results list. Proper clusters come first, ordered by
            ascending HDBSCAN label; noise points follow as singleton
            clusters in their original order. When there are fewer results
            than ``config.min_cluster_size``, every result is returned as
            its own singleton cluster without running HDBSCAN.

        Raises:
            ValueError: If HDBSCAN is about to be run and the number of
                embedding rows differs from the number of results.
        """
        import hdbscan

        n_results = len(results)
        if n_results == 0:
            return []

        # Too few points for HDBSCAN to form even one cluster: skip fitting
        # and return each result as its own singleton cluster.
        if n_results < self.config.min_cluster_size:
            return [[i] for i in range(n_results)]

        # The returned indices are positions in `embeddings` but are used to
        # index `results`; a length mismatch would silently drop results or
        # produce out-of-range indices, so reject it up front.
        n_embeddings = len(embeddings)
        if n_embeddings != n_results:
            raise ValueError(
                f"embeddings has {n_embeddings} rows but results has "
                f"{n_results} items; they must correspond one-to-one"
            )

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=self.config.min_cluster_size,
            min_samples=self.config.min_samples,
            metric=self.config.metric,
            cluster_selection_epsilon=self.config.cluster_selection_epsilon,
            allow_single_cluster=self.config.allow_single_cluster,
            prediction_data=self.config.prediction_data,
        )

        # Labels: -1 = noise, 0+ = cluster index
        labels = clusterer.fit_predict(embeddings)

        members_by_label: dict[int, list[int]] = {}
        noise_indices: list[int] = []
        for idx, raw_label in enumerate(labels):
            # Normalise NumPy integer scalars to plain ints so the keys match
            # the declared mapping type.
            label = int(raw_label)
            if label < 0:
                # Any negative label is treated as noise so that no result
                # index is ever dropped from the output.
                noise_indices.append(idx)
            else:
                members_by_label.setdefault(label, []).append(idx)

        # Proper clusters first (ascending label), then noise as singletons.
        clusters: List[List[int]] = [
            members_by_label[label] for label in sorted(members_by_label)
        ]
        clusters.extend([idx] for idx in noise_indices)

        return clusters

    def select_representatives(
        self,
        clusters: List[List[int]],
        results: List["SearchResult"],
        embeddings: Optional["np.ndarray"] = None,
    ) -> List["SearchResult"]:
        """Select representative results from each cluster.

        Selects the result with the highest score from each cluster. When
        several members of a cluster share the highest score, the one whose
        index appears first in the cluster is chosen.

        Args:
            clusters: List of clusters from cluster() method.
            results: Original list of SearchResult objects.
            embeddings: Optional embeddings (not used in score-based selection).

        Returns:
            List of representative SearchResult objects, one per non-empty
            cluster, ordered by score (highest first). Empty if either
            ``clusters`` or ``results`` is empty.
        """
        if not clusters or not results:
            return []

        # Empty clusters contribute no representative and are skipped.
        representatives: List["SearchResult"] = [
            results[max(indices, key=lambda i: results[i].score)]
            for indices in clusters
            if indices
        ]

        # Stable sort: representatives with equal scores keep cluster order.
        representatives.sort(key=lambda r: r.score, reverse=True)

        return representatives