mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-14 02:42:04 +08:00
Refactor code structure and remove redundant changes
@@ -0,0 +1,197 @@
"""DBSCAN-based clustering strategy for search results.

DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
is the fallback clustering strategy when HDBSCAN is not available.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, List, Optional

from .base import BaseClusteringStrategy, ClusteringConfig

if TYPE_CHECKING:
    import numpy as np

    from codexlens.entities import SearchResult


class DBSCANStrategy(BaseClusteringStrategy):
    """DBSCAN-based clustering strategy.

    Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.
    DBSCAN requires an explicit eps parameter, which is auto-computed from the
    distance distribution if not provided.

    Example:
        >>> from codexlens.search.clustering import DBSCANStrategy, ClusteringConfig
        >>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
        >>> strategy = DBSCANStrategy(config)
        >>> clusters = strategy.cluster(embeddings, results)
        >>> representatives = strategy.select_representatives(clusters, results)
    """

    # Default eps percentile for auto-computation
    DEFAULT_EPS_PERCENTILE: float = 15.0

    def __init__(
        self,
        config: Optional[ClusteringConfig] = None,
        eps: Optional[float] = None,
        eps_percentile: float = DEFAULT_EPS_PERCENTILE,
    ) -> None:
        """Initialize the DBSCAN clustering strategy.

        Args:
            config: Clustering configuration. Uses defaults if not provided.
            eps: Explicit eps parameter for DBSCAN. If None, auto-computed
                from the distance distribution.
            eps_percentile: Percentile of pairwise distances to use for
                auto-computing eps. Defaults to the 15th percentile.

        Raises:
            ImportError: If scikit-learn is not installed.
        """
        super().__init__(config)
        self.eps = eps
        self.eps_percentile = eps_percentile

        # Validate that sklearn is available
        try:
            from sklearn.cluster import DBSCAN  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "scikit-learn package is required for DBSCANStrategy. "
                "Install with: pip install codexlens[clustering]"
            ) from exc

    def _compute_eps(self, embeddings: "np.ndarray") -> float:
        """Auto-compute eps from the pairwise distance distribution.

        Uses the specified percentile of pairwise distances as eps,
        which typically captures local density well.

        Args:
            embeddings: NumPy array of shape (n_results, embedding_dim).

        Returns:
            Computed eps value.
        """
        import numpy as np
        from sklearn.metrics import pairwise_distances

        # Compute the full pairwise distance matrix
        distances = pairwise_distances(embeddings, metric=self.config.metric)

        # Take the upper triangle (excluding the diagonal) so each pair
        # is counted once and self-distances are ignored
        upper_tri = distances[np.triu_indices_from(distances, k=1)]

        if len(upper_tri) == 0:
            # Only one point: return a small default eps
            return 0.1

        # Use the configured percentile of distances as eps
        eps = float(np.percentile(upper_tri, self.eps_percentile))

        # Ensure eps is strictly positive
        return max(eps, 1e-6)
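    # A quick worked example of the percentile heuristic (hypothetical
    # numbers, not taken from real data): for pairwise distances
    # [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    # np.percentile(..., 15.0) interpolates to 0.235, so eps admits
    # roughly the closest 15% of point pairs as directly reachable.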
    def cluster(
        self,
        embeddings: "np.ndarray",
        results: List["SearchResult"],
    ) -> List[List[int]]:
        """Cluster search results using the DBSCAN algorithm.

        Args:
            embeddings: NumPy array of shape (n_results, embedding_dim)
                containing the embedding vector for each result.
            results: List of SearchResult objects corresponding to embeddings.

        Returns:
            List of clusters, where each cluster is a list of indices
            into the results list. Noise points are returned as singleton
            clusters.
        """
        from sklearn.cluster import DBSCAN

        n_results = len(results)
        if n_results == 0:
            return []

        # Handle edge case: a single result forms its own cluster
        if n_results == 1:
            return [[0]]

        # Use the explicit eps if given, otherwise auto-compute it
        eps = self.eps if self.eps is not None else self._compute_eps(embeddings)

        # Configure the DBSCAN clusterer
        # Note: DBSCAN's min_samples plays the role of min_cluster_size
        clusterer = DBSCAN(
            eps=eps,
            min_samples=self.config.min_samples,
            metric=self.config.metric,
        )

        # Fit and get cluster labels (-1 = noise, 0+ = cluster index)
        labels = clusterer.fit_predict(embeddings)

        # Group result indices by cluster label
        cluster_map: dict[int, list[int]] = {}
        for idx, label in enumerate(labels):
            if label not in cluster_map:
                cluster_map[label] = []
            cluster_map[label].append(idx)

        # Build the result: non-noise clusters first, then noise as singletons
        clusters: List[List[int]] = []

        # Add proper clusters (label >= 0) in label order
        for label in sorted(cluster_map.keys()):
            if label >= 0:
                clusters.append(cluster_map[label])

        # Add noise points (label == -1) as singleton clusters
        if -1 in cluster_map:
            for idx in cluster_map[-1]:
                clusters.append([idx])

        return clusters
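    # For example (hypothetical labels): fit_predict returning
    # [0, 0, 1, -1, 1] produces clusters [[0, 1], [2, 4], [3]]:
    # two proper clusters followed by the noise point as a singleton.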
    def select_representatives(
        self,
        clusters: List[List[int]],
        results: List["SearchResult"],
        embeddings: Optional["np.ndarray"] = None,
    ) -> List["SearchResult"]:
        """Select a representative result from each cluster.

        Picks the result with the highest score in each cluster.

        Args:
            clusters: List of clusters from the cluster() method.
            results: Original list of SearchResult objects.
            embeddings: Optional embeddings (unused in score-based selection).

        Returns:
            List of representative SearchResult objects, one per cluster,
            ordered by score (highest first).
        """
        if not clusters or not results:
            return []

        representatives: List["SearchResult"] = []

        for cluster_indices in clusters:
            if not cluster_indices:
                continue

            # Pick the highest-scoring result in this cluster
            best_idx = max(cluster_indices, key=lambda i: results[i].score)
            representatives.append(results[best_idx])

        # Sort representatives by score, descending
        representatives.sort(key=lambda r: r.score, reverse=True)

        return representatives
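For reference, a minimal end-to-end sketch of the strategy on synthetic embeddings. It assumes the import path codexlens.search.clustering and the ClusteringConfig(min_cluster_size=3, metric='cosine') signature shown in the class docstring, and that the config's min_samples resolves to a value no larger than 5; FakeResult is a hypothetical stand-in for codexlens.entities.SearchResult, which the strategy only reads through its score attribute. An explicit eps=0.5 is passed to keep the demo deterministic:

from dataclasses import dataclass

import numpy as np

from codexlens.search.clustering import ClusteringConfig, DBSCANStrategy


@dataclass
class FakeResult:
    # Hypothetical stand-in: the strategy only needs a .score attribute
    text: str
    score: float


rng = np.random.default_rng(42)

# Two tight directional groups plus one orthogonal outlier, so cosine
# distances are near 0 within a group and near 1 across groups
base_a = np.eye(8)[0]
base_b = np.eye(8)[1]
embeddings = np.vstack([
    base_a + rng.normal(0.0, 0.02, size=(5, 8)),
    base_b + rng.normal(0.0, 0.02, size=(5, 8)),
    np.eye(8)[2][None, :],
])
results = [FakeResult(f"doc-{i}", float(rng.random())) for i in range(len(embeddings))]

strategy = DBSCANStrategy(
    ClusteringConfig(min_cluster_size=3, metric="cosine"),
    eps=0.5,
)
clusters = strategy.cluster(embeddings, results)
representatives = strategy.select_representatives(clusters, results)

print(clusters)  # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10]] if min_samples <= 5
print([r.text for r in representatives])  # one highest-score doc per cluster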