Refactor code structure and remove redundant changes

catlog22
2026-01-24 14:47:47 +08:00
parent cf5fecd66d
commit f2b0a5bbc9
113 changed files with 43217 additions and 235 deletions

View File: codexlens/search/clustering/__init__.py

@@ -0,0 +1,124 @@
"""Clustering strategies for the staged hybrid search pipeline.
This module provides extensible clustering infrastructure for grouping
similar search results and selecting representative results.
Install with: pip install codexlens[clustering]
Example:
>>> from codexlens.search.clustering import (
... CLUSTERING_AVAILABLE,
... ClusteringConfig,
... get_strategy,
... )
>>> config = ClusteringConfig(min_cluster_size=3)
>>> # Auto-select best available strategy with fallback
>>> strategy = get_strategy("auto", config)
>>> representatives = strategy.fit_predict(embeddings, results)
>>>
>>> # Or explicitly use a specific strategy
>>> if CLUSTERING_AVAILABLE:
... from codexlens.search.clustering import HDBSCANStrategy
... strategy = HDBSCANStrategy(config)
... representatives = strategy.fit_predict(embeddings, results)
"""
from __future__ import annotations
# Always export base classes and factory (no heavy dependencies)
from .base import BaseClusteringStrategy, ClusteringConfig
from .factory import (
ClusteringStrategyFactory,
check_clustering_strategy_available,
get_strategy,
)
from .noop_strategy import NoOpStrategy
from .frequency_strategy import FrequencyStrategy, FrequencyConfig
# Feature flag for clustering availability (hdbscan + sklearn)
CLUSTERING_AVAILABLE = False
HDBSCAN_AVAILABLE = False
DBSCAN_AVAILABLE = False
_import_error: str | None = None
def _detect_clustering_available() -> tuple[bool, bool, bool, str | None]:
"""Detect if clustering dependencies are available.
Returns:
Tuple of (all_available, hdbscan_available, dbscan_available, error_message).
"""
hdbscan_ok = False
dbscan_ok = False
try:
import hdbscan # noqa: F401
hdbscan_ok = True
except ImportError:
pass
try:
from sklearn.cluster import DBSCAN # noqa: F401
dbscan_ok = True
except ImportError:
pass
all_ok = hdbscan_ok and dbscan_ok
error = None
if not all_ok:
missing = []
if not hdbscan_ok:
missing.append("hdbscan")
if not dbscan_ok:
missing.append("scikit-learn")
error = f"{', '.join(missing)} not available. Install with: pip install codexlens[clustering]"
return all_ok, hdbscan_ok, dbscan_ok, error
# Initialize on module load
CLUSTERING_AVAILABLE, HDBSCAN_AVAILABLE, DBSCAN_AVAILABLE, _import_error = (
_detect_clustering_available()
)
def check_clustering_available() -> tuple[bool, str | None]:
"""Check if all clustering dependencies are available.
Returns:
Tuple of (is_available, error_message).
error_message is None if available, otherwise contains install instructions.
"""
return CLUSTERING_AVAILABLE, _import_error
# Conditionally export strategy implementations
__all__ = [
# Feature flags
"CLUSTERING_AVAILABLE",
"HDBSCAN_AVAILABLE",
"DBSCAN_AVAILABLE",
"check_clustering_available",
# Base classes
"BaseClusteringStrategy",
"ClusteringConfig",
# Factory
"ClusteringStrategyFactory",
"get_strategy",
"check_clustering_strategy_available",
# Always-available strategies
"NoOpStrategy",
"FrequencyStrategy",
"FrequencyConfig",
]
# Conditionally add strategy classes to __all__ and module namespace
if HDBSCAN_AVAILABLE:
from .hdbscan_strategy import HDBSCANStrategy
__all__.append("HDBSCANStrategy")
if DBSCAN_AVAILABLE:
from .dbscan_strategy import DBSCANStrategy
__all__.append("DBSCANStrategy")
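
The flags above are evaluated once at import time, so callers can branch on them cheaply. A minimal usage sketch follows, assuming codexlens is installed (with or without the optional clustering extra); the concrete strategy returned by "auto" depends on which backends are present:

from codexlens.search.clustering import (
    ClusteringConfig,
    check_clustering_available,
    get_strategy,
)

available, err = check_clustering_available()
if not available:
    # err carries the install hint from _detect_clustering_available()
    print(f"Density clustering unavailable: {err}")

# "auto" resolves to the best installed backend (hdbscan -> dbscan -> noop).
strategy = get_strategy("auto", ClusteringConfig(min_cluster_size=3))
print(type(strategy).__name__)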

View File: codexlens/search/clustering/base.py

@@ -0,0 +1,153 @@
"""Base classes for clustering strategies in the hybrid search pipeline.
This module defines the abstract base class for clustering strategies used
in the staged hybrid search pipeline. Strategies cluster search results
based on their embeddings and select representative results from each cluster.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
@dataclass
class ClusteringConfig:
"""Configuration parameters for clustering strategies.
Attributes:
min_cluster_size: Minimum number of results to form a cluster.
HDBSCAN default is 5, but for search results 2-3 is often better.
min_samples: Number of samples in a neighborhood for a point to be
considered a core point. Lower values allow more clusters.
metric: Distance metric for clustering. Common options:
- 'euclidean': Standard L2 distance
- 'cosine': Cosine distance (1 - cosine_similarity)
- 'manhattan': L1 distance
cluster_selection_epsilon: Distance threshold for cluster selection.
Results within this distance may be merged into the same cluster.
allow_single_cluster: If True, allow all results to form one cluster.
Useful when results are very similar.
prediction_data: If True, generate prediction data for new points.
"""
min_cluster_size: int = 3
min_samples: int = 2
metric: str = "cosine"
cluster_selection_epsilon: float = 0.0
allow_single_cluster: bool = True
prediction_data: bool = False
def __post_init__(self) -> None:
"""Validate configuration parameters."""
if self.min_cluster_size < 2:
raise ValueError("min_cluster_size must be >= 2")
if self.min_samples < 1:
raise ValueError("min_samples must be >= 1")
if self.metric not in ("euclidean", "cosine", "manhattan"):
raise ValueError(f"metric must be one of: euclidean, cosine, manhattan; got {self.metric}")
if self.cluster_selection_epsilon < 0:
raise ValueError("cluster_selection_epsilon must be >= 0")
class BaseClusteringStrategy(ABC):
"""Abstract base class for clustering strategies.
Clustering strategies are used in the staged hybrid search pipeline to
group similar search results and select representative results from each
cluster, reducing redundancy while maintaining diversity.
Subclasses must implement:
- cluster(): Group results into clusters based on embeddings
- select_representatives(): Choose best result(s) from each cluster
"""
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
"""Initialize the clustering strategy.
Args:
config: Clustering configuration. Uses defaults if not provided.
"""
self.config = config or ClusteringConfig()
@abstractmethod
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Cluster search results based on their embeddings.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim)
containing the embedding vectors for each result.
results: List of SearchResult objects corresponding to embeddings.
Used for additional metadata during clustering.
Returns:
List of clusters, where each cluster is a list of indices
into the results list. Results not assigned to any cluster
(noise points) should be returned as single-element clusters.
Example:
>>> strategy = HDBSCANStrategy()
>>> clusters = strategy.cluster(embeddings, results)
>>> # clusters = [[0, 2, 5], [1, 3], [4], [6, 7, 8]]
>>> # Result indices 0, 2, 5 are in cluster 0
>>> # Result indices 1, 3 are in cluster 1
>>> # Result index 4 is a noise point (singleton cluster)
>>> # Result indices 6, 7, 8 are in cluster 2
"""
...
@abstractmethod
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results from each cluster.
This method chooses the best result(s) from each cluster to include
in the final search results. The selection can be based on:
- Highest score within cluster
- Closest to cluster centroid
- Custom selection logic
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings array for centroid-based selection.
Returns:
List of representative SearchResult objects, one or more per cluster,
ordered by relevance (highest score first).
Example:
>>> representatives = strategy.select_representatives(clusters, results)
>>> # Returns best result from each cluster
"""
...
def fit_predict(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List["SearchResult"]:
"""Convenience method to cluster and select representatives in one call.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim).
results: List of SearchResult objects.
Returns:
List of representative SearchResult objects.
"""
clusters = self.cluster(embeddings, results)
return self.select_representatives(clusters, results, embeddings)
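
To make the contract concrete, here is a hypothetical subclass (ScoreOnlyStrategy is illustrative, not part of this commit) that satisfies both abstract methods with the simplest possible behavior:

from codexlens.search.clustering import BaseClusteringStrategy

class ScoreOnlyStrategy(BaseClusteringStrategy):
    """Illustrative only: no real grouping, rank purely by score."""

    def cluster(self, embeddings, results):
        # Treat every result as its own singleton cluster.
        return [[i] for i in range(len(results))]

    def select_representatives(self, clusters, results, embeddings=None):
        # With singleton clusters this reduces to a plain score sort.
        return sorted(results, key=lambda r: r.score, reverse=True)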

View File: codexlens/search/clustering/dbscan_strategy.py

@@ -0,0 +1,197 @@
"""DBSCAN-based clustering strategy for search results.
DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
is the fallback clustering strategy when HDBSCAN is not available.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
class DBSCANStrategy(BaseClusteringStrategy):
"""DBSCAN-based clustering strategy.
Uses sklearn's DBSCAN algorithm as a fallback when HDBSCAN is not available.
DBSCAN requires an explicit eps parameter, which is auto-computed from the
distance distribution if not provided.
Example:
>>> from codexlens.search.clustering import DBSCANStrategy, ClusteringConfig
>>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
>>> strategy = DBSCANStrategy(config)
>>> clusters = strategy.cluster(embeddings, results)
>>> representatives = strategy.select_representatives(clusters, results)
"""
# Default eps percentile for auto-computation
DEFAULT_EPS_PERCENTILE: float = 15.0
def __init__(
self,
config: Optional[ClusteringConfig] = None,
eps: Optional[float] = None,
eps_percentile: float = DEFAULT_EPS_PERCENTILE,
) -> None:
"""Initialize DBSCAN clustering strategy.
Args:
config: Clustering configuration. Uses defaults if not provided.
eps: Explicit eps parameter for DBSCAN. If None, auto-computed
from the distance distribution.
eps_percentile: Percentile of pairwise distances to use for
auto-computing eps. Default is 15th percentile.
Raises:
ImportError: If sklearn is not installed.
"""
super().__init__(config)
self.eps = eps
self.eps_percentile = eps_percentile
# Validate sklearn is available
try:
from sklearn.cluster import DBSCAN # noqa: F401
except ImportError as exc:
raise ImportError(
"scikit-learn package is required for DBSCANStrategy. "
"Install with: pip install codexlens[clustering]"
) from exc
def _compute_eps(self, embeddings: "np.ndarray") -> float:
"""Auto-compute eps from pairwise distance distribution.
Uses the specified percentile of pairwise distances as eps,
which typically captures local density well.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim).
Returns:
Computed eps value.
"""
import numpy as np
from sklearn.metrics import pairwise_distances
# Compute pairwise distances
distances = pairwise_distances(embeddings, metric=self.config.metric)
# Get upper triangle (excluding diagonal)
upper_tri = distances[np.triu_indices_from(distances, k=1)]
if len(upper_tri) == 0:
# Only one point, return a default small eps
return 0.1
# Use percentile of distances as eps
eps = float(np.percentile(upper_tri, self.eps_percentile))
# Ensure eps is positive
return max(eps, 1e-6)
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Cluster search results using DBSCAN algorithm.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim)
containing the embedding vectors for each result.
results: List of SearchResult objects corresponding to embeddings.
Returns:
List of clusters, where each cluster is a list of indices
into the results list. Noise points are returned as singleton clusters.
"""
from sklearn.cluster import DBSCAN
import numpy as np
n_results = len(results)
if n_results == 0:
return []
# Handle edge case: single result
if n_results == 1:
return [[0]]
# Determine eps value
eps = self.eps if self.eps is not None else self._compute_eps(embeddings)
# Configure DBSCAN clusterer
        # Note: DBSCAN has no min_cluster_size; cluster density is controlled via min_samples
clusterer = DBSCAN(
eps=eps,
min_samples=self.config.min_samples,
metric=self.config.metric,
)
# Fit and get cluster labels
# Labels: -1 = noise, 0+ = cluster index
labels = clusterer.fit_predict(embeddings)
# Group indices by cluster label
cluster_map: dict[int, list[int]] = {}
for idx, label in enumerate(labels):
if label not in cluster_map:
cluster_map[label] = []
cluster_map[label].append(idx)
# Build result: non-noise clusters first, then noise as singletons
clusters: List[List[int]] = []
# Add proper clusters (label >= 0)
for label in sorted(cluster_map.keys()):
if label >= 0:
clusters.append(cluster_map[label])
# Add noise points as singleton clusters (label == -1)
if -1 in cluster_map:
for idx in cluster_map[-1]:
clusters.append([idx])
return clusters
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results from each cluster.
Selects the result with the highest score from each cluster.
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (not used in score-based selection).
Returns:
List of representative SearchResult objects, one per cluster,
ordered by score (highest first).
"""
if not clusters or not results:
return []
representatives: List["SearchResult"] = []
for cluster_indices in clusters:
if not cluster_indices:
continue
# Find the result with the highest score in this cluster
best_idx = max(cluster_indices, key=lambda i: results[i].score)
representatives.append(results[best_idx])
# Sort by score descending
representatives.sort(key=lambda r: r.score, reverse=True)
return representatives
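
A small end-to-end sketch of the DBSCAN path (requires scikit-learn; FakeResult is a stand-in for codexlens.entities.SearchResult, and the random embeddings are fabricated):

from dataclasses import dataclass

import numpy as np

from codexlens.search.clustering import ClusteringConfig, DBSCANStrategy

@dataclass
class FakeResult:
    # Stand-in for codexlens.entities.SearchResult
    path: str
    score: float

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(8, 16))
results = [FakeResult(f"src/mod_{i}.py", 1.0 - 0.1 * i) for i in range(8)]

# eps is auto-computed from the 15th percentile of pairwise distances.
strategy = DBSCANStrategy(ClusteringConfig(metric="euclidean"))
representatives = strategy.fit_predict(embeddings, results)
print([r.path for r in representatives])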

View File: codexlens/search/clustering/factory.py

@@ -0,0 +1,202 @@
"""Factory for creating clustering strategies.
Provides a unified interface for instantiating different clustering backends
with automatic fallback chain: hdbscan -> dbscan -> noop.
"""
from __future__ import annotations
from typing import Any, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
from .noop_strategy import NoOpStrategy
def check_clustering_strategy_available(strategy: str) -> tuple[bool, str | None]:
"""Check whether a specific clustering strategy can be used.
Args:
strategy: Strategy name to check. Options:
- "hdbscan": HDBSCAN clustering (requires hdbscan package)
- "dbscan": DBSCAN clustering (requires sklearn)
- "frequency": Frequency-based clustering (always available)
- "noop": No-op strategy (always available)
Returns:
Tuple of (is_available, error_message).
error_message is None if available, otherwise contains install instructions.
"""
strategy = (strategy or "").strip().lower()
if strategy == "hdbscan":
try:
import hdbscan # noqa: F401
except ImportError:
return False, (
"hdbscan package not available. "
"Install with: pip install codexlens[clustering]"
)
return True, None
if strategy == "dbscan":
try:
from sklearn.cluster import DBSCAN # noqa: F401
except ImportError:
return False, (
"scikit-learn package not available. "
"Install with: pip install codexlens[clustering]"
)
return True, None
if strategy == "frequency":
# Frequency strategy is always available (no external deps)
return True, None
if strategy == "noop":
return True, None
return False, (
f"Invalid clustering strategy: {strategy}. "
"Must be 'hdbscan', 'dbscan', 'frequency', or 'noop'."
)
def get_strategy(
strategy: str = "hdbscan",
config: Optional[ClusteringConfig] = None,
*,
fallback: bool = True,
**kwargs: Any,
) -> BaseClusteringStrategy:
"""Factory function to create clustering strategy with fallback chain.
    The fallback chain is: hdbscan -> dbscan -> noop. The frequency strategy
    is only used when requested explicitly.
Args:
strategy: Clustering strategy to use. Options:
- "hdbscan": HDBSCAN clustering (default, recommended)
- "dbscan": DBSCAN clustering (fallback)
- "frequency": Frequency-based clustering (groups by symbol occurrence)
- "noop": No-op strategy (returns all results ungrouped)
- "auto": Try hdbscan, then dbscan, then noop
config: Clustering configuration. Uses defaults if not provided.
For frequency strategy, pass FrequencyConfig for full control.
fallback: If True (default), automatically fall back to next strategy
in the chain when primary is unavailable. If False, raise ImportError
when requested strategy is unavailable.
**kwargs: Additional strategy-specific arguments.
For DBSCANStrategy: eps, eps_percentile
For FrequencyStrategy: group_by, min_frequency, etc.
Returns:
BaseClusteringStrategy: Configured clustering strategy instance.
Raises:
ValueError: If strategy is not recognized.
ImportError: If required dependencies are not installed and fallback=False.
Example:
>>> from codexlens.search.clustering import get_strategy, ClusteringConfig
>>> config = ClusteringConfig(min_cluster_size=3)
>>> # Auto-select best available strategy
>>> strategy = get_strategy("auto", config)
>>> # Explicitly use HDBSCAN (will fall back if unavailable)
>>> strategy = get_strategy("hdbscan", config)
>>> # Use frequency-based strategy
>>> from codexlens.search.clustering import FrequencyConfig
>>> freq_config = FrequencyConfig(min_frequency=2, group_by="symbol")
>>> strategy = get_strategy("frequency", freq_config)
"""
strategy = (strategy or "").strip().lower()
# Handle "auto" - try strategies in order
if strategy == "auto":
return _get_best_available_strategy(config, **kwargs)
if strategy == "hdbscan":
ok, err = check_clustering_strategy_available("hdbscan")
if ok:
from .hdbscan_strategy import HDBSCANStrategy
return HDBSCANStrategy(config)
if fallback:
# Try dbscan fallback
ok_dbscan, _ = check_clustering_strategy_available("dbscan")
if ok_dbscan:
from .dbscan_strategy import DBSCANStrategy
return DBSCANStrategy(config, **kwargs)
# Final fallback to noop
return NoOpStrategy(config)
raise ImportError(err)
if strategy == "dbscan":
ok, err = check_clustering_strategy_available("dbscan")
if ok:
from .dbscan_strategy import DBSCANStrategy
return DBSCANStrategy(config, **kwargs)
if fallback:
# Fallback to noop
return NoOpStrategy(config)
raise ImportError(err)
if strategy == "frequency":
from .frequency_strategy import FrequencyStrategy, FrequencyConfig
# If config is ClusteringConfig but not FrequencyConfig, create default FrequencyConfig
if config is None or not isinstance(config, FrequencyConfig):
freq_config = FrequencyConfig(**kwargs) if kwargs else FrequencyConfig()
else:
freq_config = config
return FrequencyStrategy(freq_config)
if strategy == "noop":
return NoOpStrategy(config)
raise ValueError(
f"Unknown clustering strategy: {strategy}. "
"Supported strategies: 'hdbscan', 'dbscan', 'frequency', 'noop', 'auto'"
)
def _get_best_available_strategy(
config: Optional[ClusteringConfig] = None,
**kwargs: Any,
) -> BaseClusteringStrategy:
"""Get the best available clustering strategy.
Tries strategies in order: hdbscan -> dbscan -> noop
Args:
config: Clustering configuration.
**kwargs: Additional strategy-specific arguments.
Returns:
Best available clustering strategy instance.
"""
# Try HDBSCAN first
ok, _ = check_clustering_strategy_available("hdbscan")
if ok:
from .hdbscan_strategy import HDBSCANStrategy
return HDBSCANStrategy(config)
# Try DBSCAN second
ok, _ = check_clustering_strategy_available("dbscan")
if ok:
from .dbscan_strategy import DBSCANStrategy
return DBSCANStrategy(config, **kwargs)
# Fallback to NoOp
return NoOpStrategy(config)
# Backward-compatible class facade over the module-level factory functions
class ClusteringStrategyFactory:
    """Kept for backward compatibility; prefer the module-level functions."""

    get_strategy = staticmethod(get_strategy)
    check_available = staticmethod(check_clustering_strategy_available)
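
A sketch of both fallback modes, assuming codexlens is installed; which class comes back depends on the optional backends present:

from codexlens.search.clustering import ClusteringConfig, get_strategy

config = ClusteringConfig(min_cluster_size=3)

# fallback=True (default): always returns a usable strategy.
strategy = get_strategy("hdbscan", config)
print(type(strategy).__name__)  # HDBSCANStrategy, DBSCANStrategy, or NoOpStrategy

# fallback=False: surface the missing dependency instead of degrading silently.
try:
    get_strategy("hdbscan", config, fallback=False)
except ImportError as exc:
    print(exc)  # includes the pip install hint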

View File: codexlens/search/clustering/frequency_strategy.py

@@ -0,0 +1,263 @@
"""Frequency-based clustering strategy for search result deduplication.
This strategy groups search results by symbol/method name and prunes based on
occurrence frequency. High-frequency symbols (frequently referenced methods)
are considered more important and retained, while low-frequency results
(potentially noise) can be filtered out.
Use cases:
- Prioritize commonly called methods/functions
- Filter out one-off results that may be less relevant
- Deduplicate results pointing to the same symbol from different locations
"""
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Optional, Literal
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
@dataclass
class FrequencyConfig(ClusteringConfig):
"""Configuration for frequency-based clustering strategy.
Attributes:
group_by: Field to group results by for frequency counting.
- 'symbol': Group by symbol_name (default, for method/function dedup)
- 'file': Group by file path
- 'symbol_kind': Group by symbol type (function, class, etc.)
min_frequency: Minimum occurrence count to keep a result.
Results appearing less than this are considered noise and pruned.
max_representatives_per_group: Maximum results to keep per symbol group.
frequency_weight: How much to boost score based on frequency.
            Final score = original_score * (1 + frequency_weight * log(frequency + 1))
keep_mode: How to handle low-frequency results.
- 'filter': Remove results below min_frequency
- 'demote': Keep but lower their score ranking
"""
group_by: Literal["symbol", "file", "symbol_kind"] = "symbol"
min_frequency: int = 1 # 1 means keep all, 2+ filters singletons
max_representatives_per_group: int = 3
frequency_weight: float = 0.1 # Boost factor for frequency
keep_mode: Literal["filter", "demote"] = "demote"
def __post_init__(self) -> None:
"""Validate configuration parameters."""
# Skip parent validation since we don't use HDBSCAN params
if self.min_frequency < 1:
raise ValueError("min_frequency must be >= 1")
if self.max_representatives_per_group < 1:
raise ValueError("max_representatives_per_group must be >= 1")
if self.frequency_weight < 0:
raise ValueError("frequency_weight must be >= 0")
if self.group_by not in ("symbol", "file", "symbol_kind"):
raise ValueError(f"group_by must be one of: symbol, file, symbol_kind; got {self.group_by}")
if self.keep_mode not in ("filter", "demote"):
raise ValueError(f"keep_mode must be one of: filter, demote; got {self.keep_mode}")
class FrequencyStrategy(BaseClusteringStrategy):
"""Frequency-based clustering strategy for search result deduplication.
This strategy groups search results by symbol name (or file/kind) and:
1. Counts how many times each symbol appears in results
2. Higher frequency = more important (frequently referenced method)
3. Filters or demotes low-frequency results
4. Selects top representatives from each frequency group
Unlike embedding-based strategies (HDBSCAN, DBSCAN), this strategy:
- Does NOT require embeddings (works with metadata only)
- Is very fast (O(n) complexity)
- Is deterministic (no random initialization)
- Works well for symbol-level deduplication
Example:
>>> config = FrequencyConfig(min_frequency=2, group_by="symbol")
>>> strategy = FrequencyStrategy(config)
>>> # Results with symbol "authenticate" appearing 5 times
>>> # will be prioritized over "helper_func" appearing once
>>> representatives = strategy.fit_predict(embeddings, results)
"""
def __init__(self, config: Optional[FrequencyConfig] = None) -> None:
"""Initialize the frequency strategy.
Args:
config: Frequency configuration. Uses defaults if not provided.
"""
self.config: FrequencyConfig = config or FrequencyConfig()
def _get_group_key(self, result: "SearchResult") -> str:
"""Extract grouping key from a search result.
Args:
result: SearchResult to extract key from.
Returns:
String key for grouping (symbol name, file path, or kind).
"""
if self.config.group_by == "symbol":
# Use symbol_name if available, otherwise fall back to file:line
symbol = getattr(result, "symbol_name", None)
if symbol:
return str(symbol)
# Fallback: use file path + start_line as pseudo-symbol
start_line = getattr(result, "start_line", 0) or 0
return f"{result.path}:{start_line}"
elif self.config.group_by == "file":
return str(result.path)
elif self.config.group_by == "symbol_kind":
kind = getattr(result, "symbol_kind", None)
return str(kind) if kind else "unknown"
return str(result.path) # Default fallback
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Group search results by frequency of occurrence.
Note: This method ignores embeddings and groups by metadata only.
The embeddings parameter is kept for interface compatibility.
Args:
embeddings: Ignored (kept for interface compatibility).
results: List of SearchResult objects to cluster.
Returns:
List of clusters (groups), where each cluster contains indices
of results with the same grouping key. Clusters are ordered by
frequency (highest frequency first).
"""
if not results:
return []
# Group results by key
groups: Dict[str, List[int]] = defaultdict(list)
for idx, result in enumerate(results):
key = self._get_group_key(result)
groups[key].append(idx)
# Sort groups by frequency (descending) then by key (for stability)
sorted_groups = sorted(
groups.items(),
key=lambda x: (-len(x[1]), x[0]) # -frequency, then alphabetical
)
# Convert to list of clusters
clusters = [indices for _, indices in sorted_groups]
return clusters
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results based on frequency and score.
For each frequency group:
1. If frequency < min_frequency: filter or demote based on keep_mode
2. Sort by score within group
3. Apply frequency boost to scores
4. Select top N representatives
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (used for tie-breaking if provided).
Returns:
List of representative SearchResult objects, ordered by
frequency-adjusted score (highest first).
"""
import math
if not clusters or not results:
return []
representatives: List["SearchResult"] = []
demoted: List["SearchResult"] = []
for cluster_indices in clusters:
if not cluster_indices:
continue
frequency = len(cluster_indices)
# Get results in this cluster, sorted by score
cluster_results = [results[i] for i in cluster_indices]
cluster_results.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True)
# Check frequency threshold
if frequency < self.config.min_frequency:
if self.config.keep_mode == "filter":
# Skip low-frequency results entirely
continue
else: # demote mode
# Keep but add to demoted list (lower priority)
for result in cluster_results[: self.config.max_representatives_per_group]:
demoted.append(result)
continue
# Apply frequency boost and select top representatives
for result in cluster_results[: self.config.max_representatives_per_group]:
# Calculate frequency-boosted score
original_score = getattr(result, "score", 0.0)
# log(frequency + 1) to handle frequency=1 case smoothly
frequency_boost = 1.0 + self.config.frequency_weight * math.log(frequency + 1)
boosted_score = original_score * frequency_boost
# Create new result with boosted score and frequency metadata
# Note: SearchResult might be immutable, so we preserve original
# and track boosted score in metadata
if hasattr(result, "metadata") and isinstance(result.metadata, dict):
result.metadata["frequency"] = frequency
result.metadata["frequency_boosted_score"] = boosted_score
representatives.append(result)
# Sort representatives by boosted score (or original score as fallback)
def get_sort_score(r: "SearchResult") -> float:
if hasattr(r, "metadata") and isinstance(r.metadata, dict):
return r.metadata.get("frequency_boosted_score", getattr(r, "score", 0.0))
return getattr(r, "score", 0.0)
representatives.sort(key=get_sort_score, reverse=True)
# Add demoted results at the end
if demoted:
demoted.sort(key=lambda r: getattr(r, "score", 0.0), reverse=True)
representatives.extend(demoted)
return representatives
def fit_predict(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List["SearchResult"]:
"""Convenience method to cluster and select representatives in one call.
Args:
embeddings: NumPy array (may be ignored for frequency-based clustering).
results: List of SearchResult objects.
Returns:
List of representative SearchResult objects.
"""
clusters = self.cluster(embeddings, results)
return self.select_representatives(clusters, results, embeddings)
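
A self-contained sketch of the demote path (FakeResult is a stand-in for SearchResult; the symbol names are invented): the thrice-repeated symbol is frequency-boosted and kept, while the singleton drops to the end rather than being filtered.

from dataclasses import dataclass, field

import numpy as np

from codexlens.search.clustering import FrequencyConfig, FrequencyStrategy

@dataclass
class FakeResult:
    # Stand-in for codexlens.entities.SearchResult
    path: str
    symbol_name: str
    score: float
    metadata: dict = field(default_factory=dict)

results = [
    FakeResult("auth/core.py", "authenticate", 0.90),
    FakeResult("auth/api.py", "authenticate", 0.80),
    FakeResult("auth/cli.py", "authenticate", 0.70),
    FakeResult("util/misc.py", "helper_func", 0.95),
]

strategy = FrequencyStrategy(FrequencyConfig(min_frequency=2, keep_mode="demote"))
ranked = strategy.fit_predict(np.empty((len(results), 0)), results)  # embeddings ignored
for r in ranked:
    print(r.symbol_name, r.metadata.get("frequency"))
# authenticate (x3, frequency=3), then helper_func (demoted singleton, no boost metadata)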

View File: codexlens/search/clustering/hdbscan_strategy.py

@@ -0,0 +1,153 @@
"""HDBSCAN-based clustering strategy for search results.
HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
is the primary clustering strategy for grouping similar search results.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
class HDBSCANStrategy(BaseClusteringStrategy):
"""HDBSCAN-based clustering strategy.
Uses HDBSCAN algorithm to cluster search results based on embedding similarity.
HDBSCAN is preferred over DBSCAN because it:
- Automatically determines the number of clusters
- Handles varying density clusters well
- Identifies noise points (outliers) effectively
Example:
>>> from codexlens.search.clustering import HDBSCANStrategy, ClusteringConfig
>>> config = ClusteringConfig(min_cluster_size=3, metric='cosine')
>>> strategy = HDBSCANStrategy(config)
>>> clusters = strategy.cluster(embeddings, results)
>>> representatives = strategy.select_representatives(clusters, results)
"""
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
"""Initialize HDBSCAN clustering strategy.
Args:
config: Clustering configuration. Uses defaults if not provided.
Raises:
ImportError: If hdbscan package is not installed.
"""
super().__init__(config)
# Validate hdbscan is available
try:
import hdbscan # noqa: F401
except ImportError as exc:
raise ImportError(
"hdbscan package is required for HDBSCANStrategy. "
"Install with: pip install codexlens[clustering]"
) from exc
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Cluster search results using HDBSCAN algorithm.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim)
containing the embedding vectors for each result.
results: List of SearchResult objects corresponding to embeddings.
Returns:
List of clusters, where each cluster is a list of indices
into the results list. Noise points are returned as singleton clusters.
"""
import hdbscan
import numpy as np
n_results = len(results)
if n_results == 0:
return []
# Handle edge case: fewer results than min_cluster_size
if n_results < self.config.min_cluster_size:
# Return each result as its own singleton cluster
return [[i] for i in range(n_results)]
# Configure HDBSCAN clusterer
clusterer = hdbscan.HDBSCAN(
min_cluster_size=self.config.min_cluster_size,
min_samples=self.config.min_samples,
metric=self.config.metric,
cluster_selection_epsilon=self.config.cluster_selection_epsilon,
allow_single_cluster=self.config.allow_single_cluster,
prediction_data=self.config.prediction_data,
)
# Fit and get cluster labels
# Labels: -1 = noise, 0+ = cluster index
labels = clusterer.fit_predict(embeddings)
# Group indices by cluster label
cluster_map: dict[int, list[int]] = {}
for idx, label in enumerate(labels):
if label not in cluster_map:
cluster_map[label] = []
cluster_map[label].append(idx)
# Build result: non-noise clusters first, then noise as singletons
clusters: List[List[int]] = []
# Add proper clusters (label >= 0)
for label in sorted(cluster_map.keys()):
if label >= 0:
clusters.append(cluster_map[label])
# Add noise points as singleton clusters (label == -1)
if -1 in cluster_map:
for idx in cluster_map[-1]:
clusters.append([idx])
return clusters
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Select representative results from each cluster.
Selects the result with the highest score from each cluster.
Args:
clusters: List of clusters from cluster() method.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (not used in score-based selection).
Returns:
List of representative SearchResult objects, one per cluster,
ordered by score (highest first).
"""
if not clusters or not results:
return []
representatives: List["SearchResult"] = []
for cluster_indices in clusters:
if not cluster_indices:
continue
# Find the result with the highest score in this cluster
best_idx = max(cluster_indices, key=lambda i: results[i].score)
representatives.append(results[best_idx])
# Sort by score descending
representatives.sort(key=lambda r: r.score, reverse=True)
return representatives
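
A sketch with synthetic embeddings (requires the optional hdbscan dependency; the two tight groups and the outlier are fabricated to make the clustering visible):

import numpy as np

from codexlens.search.clustering import ClusteringConfig, HDBSCANStrategy

rng = np.random.default_rng(42)
group_a = rng.normal(loc=0.0, scale=0.05, size=(4, 8))
group_b = rng.normal(loc=1.0, scale=0.05, size=(4, 8))
outlier = rng.normal(loc=5.0, scale=0.05, size=(1, 8))
embeddings = np.vstack([group_a, group_b, outlier])

strategy = HDBSCANStrategy(ClusteringConfig(min_cluster_size=3, metric="euclidean"))
results = [object()] * len(embeddings)  # cluster() only needs len(results)
clusters = strategy.cluster(embeddings, results)
print(clusters)  # e.g. [[0, 1, 2, 3], [4, 5, 6, 7], [8]] with the outlier as a singleton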

View File: codexlens/search/clustering/noop_strategy.py

@@ -0,0 +1,83 @@
"""No-op clustering strategy for search results.
NoOpStrategy returns all results ungrouped when clustering dependencies
are not available or clustering is disabled.
"""
from __future__ import annotations
from typing import TYPE_CHECKING, List, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
if TYPE_CHECKING:
import numpy as np
from codexlens.entities import SearchResult
class NoOpStrategy(BaseClusteringStrategy):
"""No-op clustering strategy that returns all results ungrouped.
This strategy is used as a final fallback when no clustering dependencies
are available, or when clustering is explicitly disabled. Each result
is treated as its own singleton cluster.
Example:
>>> from codexlens.search.clustering import NoOpStrategy
>>> strategy = NoOpStrategy()
>>> clusters = strategy.cluster(embeddings, results)
>>> # Returns [[0], [1], [2], ...] - each result in its own cluster
>>> representatives = strategy.select_representatives(clusters, results)
>>> # Returns all results sorted by score
"""
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
"""Initialize NoOp clustering strategy.
Args:
config: Clustering configuration. Ignored for NoOpStrategy
but accepted for interface compatibility.
"""
super().__init__(config)
def cluster(
self,
embeddings: "np.ndarray",
results: List["SearchResult"],
) -> List[List[int]]:
"""Return each result as its own singleton cluster.
Args:
embeddings: NumPy array of shape (n_results, embedding_dim).
Not used but accepted for interface compatibility.
results: List of SearchResult objects.
Returns:
List of singleton clusters, one per result.
"""
return [[i] for i in range(len(results))]
def select_representatives(
self,
clusters: List[List[int]],
results: List["SearchResult"],
embeddings: Optional["np.ndarray"] = None,
) -> List["SearchResult"]:
"""Return all results sorted by score.
Since each cluster is a singleton, this effectively returns all
results sorted by score descending.
Args:
clusters: List of singleton clusters.
results: Original list of SearchResult objects.
embeddings: Optional embeddings (not used).
Returns:
All SearchResult objects sorted by score (highest first).
"""
if not results:
return []
# Return all results sorted by score
return sorted(results, key=lambda r: r.score, reverse=True)
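
Since NoOpStrategy needs neither a clustering backend nor meaningful embeddings, it can be exercised end to end with trivial data (FakeResult is a stand-in for SearchResult):

from dataclasses import dataclass

import numpy as np

from codexlens.search.clustering import NoOpStrategy

@dataclass
class FakeResult:
    # Stand-in for codexlens.entities.SearchResult
    path: str
    score: float

results = [FakeResult("a.py", 0.2), FakeResult("b.py", 0.9)]
strategy = NoOpStrategy()
clusters = strategy.cluster(np.zeros((2, 3)), results)  # [[0], [1]]
ranked = strategy.select_representatives(clusters, results)
print([r.path for r in ranked])  # ['b.py', 'a.py'] - sorted by score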