Files

203 lines
6.9 KiB
Python

"""Factory for creating clustering strategies.
Provides a unified interface for instantiating different clustering backends
with automatic fallback chain: hdbscan -> dbscan -> noop.
"""
from __future__ import annotations
from typing import Any, Optional
from .base import BaseClusteringStrategy, ClusteringConfig
from .noop_strategy import NoOpStrategy
def check_clustering_strategy_available(strategy: str) -> tuple[bool, str | None]:
    """Report whether the named clustering backend is usable right now.

    The name is matched case-insensitively after surrounding whitespace is
    stripped; ``None`` or an empty string is treated as an unrecognized name.

    Args:
        strategy: Name of the strategy to probe. Recognized names:
            - "hdbscan": requires the ``hdbscan`` package to be importable.
            - "dbscan": requires ``sklearn.cluster.DBSCAN`` to be importable.
            - "frequency": no import is attempted; always available.
            - "noop": no import is attempted; always available.

    Returns:
        A ``(is_available, error_message)`` pair. ``error_message`` is ``None``
        when the strategy is available. Otherwise it holds an install hint
        (dependency could not be imported) or the list of accepted names
        (unrecognized strategy).
    """
    name = (strategy or "").strip().lower()

    # These two never trigger an import, so they can be answered immediately.
    if name in ("frequency", "noop"):
        return True, None

    install_hint = "Install with: pip install codexlens[clustering]"

    if name == "hdbscan":
        try:
            import hdbscan  # noqa: F401
        except ImportError:
            return False, "hdbscan package not available. " + install_hint
        else:
            return True, None

    if name == "dbscan":
        try:
            from sklearn.cluster import DBSCAN  # noqa: F401
        except ImportError:
            return False, "scikit-learn package not available. " + install_hint
        else:
            return True, None

    # Every other name (including "auto") is rejected by this probe.
    unknown = f"Invalid clustering strategy: {name}. "
    return False, unknown + "Must be 'hdbscan', 'dbscan', 'frequency', or 'noop'."
def get_strategy(
    strategy: str = "hdbscan",
    config: Optional[ClusteringConfig] = None,
    *,
    fallback: bool = True,
    **kwargs: Any,
) -> BaseClusteringStrategy:
    """Create a clustering strategy instance, with an optional fallback chain.

    The automatic fallback chain is ``hdbscan -> dbscan -> noop``. The
    "frequency" strategy is never selected as a fallback; it is only used when
    requested by name.

    Args:
        strategy: Clustering strategy to use (case-insensitive, surrounding
            whitespace ignored). Options:
            - "hdbscan": HDBSCAN clustering (default, recommended)
            - "dbscan": DBSCAN clustering (fallback)
            - "frequency": Frequency-based clustering (groups by symbol occurrence)
            - "noop": No-op strategy (returns all results ungrouped)
            - "auto": Try hdbscan, then dbscan, then noop
        config: Clustering configuration, handed to the selected strategy.
            For the frequency strategy it is used only when it is a
            FrequencyConfig instance; any other value (including None) is
            replaced by a FrequencyConfig built from ``**kwargs``.
        fallback: If True (default), automatically fall back to the next
            strategy in the chain when the requested one is unavailable.
            If False, raise ImportError instead. Not consulted for "auto",
            "frequency" or "noop".
        **kwargs: Additional strategy-specific arguments.
            For DBSCANStrategy: eps, eps_percentile
            For FrequencyStrategy: group_by, min_frequency, etc. (ignored
            when ``config`` is already a FrequencyConfig)
            They are not forwarded to HDBSCANStrategy or NoOpStrategy.

    Returns:
        BaseClusteringStrategy: Configured clustering strategy instance.

    Raises:
        ValueError: If strategy is not recognized.
        ImportError: If required dependencies are not installed and fallback=False.

    Example:
        >>> from codexlens.search.clustering import get_strategy, ClusteringConfig
        >>> config = ClusteringConfig(min_cluster_size=3)
        >>> # Auto-select best available strategy
        >>> strategy = get_strategy("auto", config)
        >>> # Explicitly use HDBSCAN (will fall back if unavailable)
        >>> strategy = get_strategy("hdbscan", config)
        >>> # Use frequency-based strategy
        >>> from codexlens.search.clustering import FrequencyConfig
        >>> freq_config = FrequencyConfig(min_frequency=2, group_by="symbol")
        >>> strategy = get_strategy("frequency", freq_config)
    """
    name = (strategy or "").strip().lower()

    # "auto" and "hdbscan with fallback" walk the very same
    # hdbscan -> dbscan -> noop chain, so both use the single implementation
    # in _get_best_available_strategy instead of repeating it here.
    if name == "auto" or (name == "hdbscan" and fallback):
        return _get_best_available_strategy(config, **kwargs)

    if name == "hdbscan":
        # Only reached with fallback=False: either hdbscan works or we raise.
        ok, err = check_clustering_strategy_available("hdbscan")
        if not ok:
            raise ImportError(err)
        from .hdbscan_strategy import HDBSCANStrategy

        return HDBSCANStrategy(config)

    if name == "dbscan":
        ok, err = check_clustering_strategy_available("dbscan")
        if ok:
            from .dbscan_strategy import DBSCANStrategy

            return DBSCANStrategy(config, **kwargs)
        if fallback:
            # Nothing sits between dbscan and noop in the chain.
            return NoOpStrategy(config)
        raise ImportError(err)

    if name == "frequency":
        from .frequency_strategy import FrequencyConfig, FrequencyStrategy

        # A ready-made FrequencyConfig wins; otherwise build one from kwargs
        # (an empty kwargs dict yields the default FrequencyConfig).
        if isinstance(config, FrequencyConfig):
            freq_config = config
        else:
            freq_config = FrequencyConfig(**kwargs)
        return FrequencyStrategy(freq_config)

    if name == "noop":
        return NoOpStrategy(config)

    raise ValueError(
        f"Unknown clustering strategy: {name}. "
        "Supported strategies: 'hdbscan', 'dbscan', 'frequency', 'noop', 'auto'"
    )
def _get_best_available_strategy(
    config: Optional[ClusteringConfig] = None,
    **kwargs: Any,
) -> BaseClusteringStrategy:
    """Pick the first usable backend in the order hdbscan -> dbscan -> noop.

    Each backend is probed only if every earlier one was unavailable.

    Args:
        config: Clustering configuration passed to whichever strategy is built.
        **kwargs: Extra arguments; forwarded to DBSCANStrategy only.

    Returns:
        An HDBSCANStrategy if hdbscan is available, else a DBSCANStrategy if
        dbscan is available, else a NoOpStrategy.
    """
    hdbscan_ready, _ = check_clustering_strategy_available("hdbscan")
    if not hdbscan_ready:
        dbscan_ready, _ = check_clustering_strategy_available("dbscan")
        if not dbscan_ready:
            # Neither clustering backend can be used: last resort.
            return NoOpStrategy(config)

        from .dbscan_strategy import DBSCANStrategy

        return DBSCANStrategy(config, **kwargs)

    from .hdbscan_strategy import HDBSCANStrategy

    return HDBSCANStrategy(config)
# Backward-compatibility shim: older callers use a factory object instead of
# the module-level functions. Both attributes are plain static wrappers around
# those functions, so no instance is needed to call them.
class ClusteringStrategyFactory:
    get_strategy = staticmethod(get_strategy)
    check_available = staticmethod(check_clustering_strategy_available)