mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
154 lines
5.8 KiB
Python
154 lines
5.8 KiB
Python
"""Base classes for clustering strategies in the hybrid search pipeline.
|
|
|
|
This module defines the abstract base class for clustering strategies used
|
|
in the staged hybrid search pipeline. Strategies cluster search results
|
|
based on their embeddings and select representative results from each cluster.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from typing import TYPE_CHECKING, List, Optional
|
|
|
|
if TYPE_CHECKING:
|
|
import numpy as np
|
|
from codexlens.entities import SearchResult
|
|
|
|
|
|
@dataclass
|
|
class ClusteringConfig:
|
|
"""Configuration parameters for clustering strategies.
|
|
|
|
Attributes:
|
|
min_cluster_size: Minimum number of results to form a cluster.
|
|
HDBSCAN default is 5, but for search results 2-3 is often better.
|
|
min_samples: Number of samples in a neighborhood for a point to be
|
|
considered a core point. Lower values allow more clusters.
|
|
metric: Distance metric for clustering. Common options:
|
|
- 'euclidean': Standard L2 distance
|
|
- 'cosine': Cosine distance (1 - cosine_similarity)
|
|
- 'manhattan': L1 distance
|
|
cluster_selection_epsilon: Distance threshold for cluster selection.
|
|
Results within this distance may be merged into the same cluster.
|
|
allow_single_cluster: If True, allow all results to form one cluster.
|
|
Useful when results are very similar.
|
|
prediction_data: If True, generate prediction data for new points.
|
|
"""
|
|
|
|
min_cluster_size: int = 3
|
|
min_samples: int = 2
|
|
metric: str = "cosine"
|
|
cluster_selection_epsilon: float = 0.0
|
|
allow_single_cluster: bool = True
|
|
prediction_data: bool = False
|
|
|
|
def __post_init__(self) -> None:
|
|
"""Validate configuration parameters."""
|
|
if self.min_cluster_size < 2:
|
|
raise ValueError("min_cluster_size must be >= 2")
|
|
if self.min_samples < 1:
|
|
raise ValueError("min_samples must be >= 1")
|
|
if self.metric not in ("euclidean", "cosine", "manhattan"):
|
|
raise ValueError(f"metric must be one of: euclidean, cosine, manhattan; got {self.metric}")
|
|
if self.cluster_selection_epsilon < 0:
|
|
raise ValueError("cluster_selection_epsilon must be >= 0")
|
|
|
|
|
|
class BaseClusteringStrategy(ABC):
|
|
"""Abstract base class for clustering strategies.
|
|
|
|
Clustering strategies are used in the staged hybrid search pipeline to
|
|
group similar search results and select representative results from each
|
|
cluster, reducing redundancy while maintaining diversity.
|
|
|
|
Subclasses must implement:
|
|
- cluster(): Group results into clusters based on embeddings
|
|
- select_representatives(): Choose best result(s) from each cluster
|
|
"""
|
|
|
|
def __init__(self, config: Optional[ClusteringConfig] = None) -> None:
|
|
"""Initialize the clustering strategy.
|
|
|
|
Args:
|
|
config: Clustering configuration. Uses defaults if not provided.
|
|
"""
|
|
self.config = config or ClusteringConfig()
|
|
|
|
@abstractmethod
|
|
def cluster(
|
|
self,
|
|
embeddings: "np.ndarray",
|
|
results: List["SearchResult"],
|
|
) -> List[List[int]]:
|
|
"""Cluster search results based on their embeddings.
|
|
|
|
Args:
|
|
embeddings: NumPy array of shape (n_results, embedding_dim)
|
|
containing the embedding vectors for each result.
|
|
results: List of SearchResult objects corresponding to embeddings.
|
|
Used for additional metadata during clustering.
|
|
|
|
Returns:
|
|
List of clusters, where each cluster is a list of indices
|
|
into the results list. Results not assigned to any cluster
|
|
(noise points) should be returned as single-element clusters.
|
|
|
|
Example:
|
|
>>> strategy = HDBSCANStrategy()
|
|
>>> clusters = strategy.cluster(embeddings, results)
|
|
>>> # clusters = [[0, 2, 5], [1, 3], [4], [6, 7, 8]]
|
|
>>> # Result indices 0, 2, 5 are in cluster 0
|
|
>>> # Result indices 1, 3 are in cluster 1
|
|
>>> # Result index 4 is a noise point (singleton cluster)
|
|
>>> # Result indices 6, 7, 8 are in cluster 2
|
|
"""
|
|
...
|
|
|
|
@abstractmethod
|
|
def select_representatives(
|
|
self,
|
|
clusters: List[List[int]],
|
|
results: List["SearchResult"],
|
|
embeddings: Optional["np.ndarray"] = None,
|
|
) -> List["SearchResult"]:
|
|
"""Select representative results from each cluster.
|
|
|
|
This method chooses the best result(s) from each cluster to include
|
|
in the final search results. The selection can be based on:
|
|
- Highest score within cluster
|
|
- Closest to cluster centroid
|
|
- Custom selection logic
|
|
|
|
Args:
|
|
clusters: List of clusters from cluster() method.
|
|
results: Original list of SearchResult objects.
|
|
embeddings: Optional embeddings array for centroid-based selection.
|
|
|
|
Returns:
|
|
List of representative SearchResult objects, one or more per cluster,
|
|
ordered by relevance (highest score first).
|
|
|
|
Example:
|
|
>>> representatives = strategy.select_representatives(clusters, results)
|
|
>>> # Returns best result from each cluster
|
|
"""
|
|
...
|
|
|
|
def fit_predict(
|
|
self,
|
|
embeddings: "np.ndarray",
|
|
results: List["SearchResult"],
|
|
) -> List["SearchResult"]:
|
|
"""Convenience method to cluster and select representatives in one call.
|
|
|
|
Args:
|
|
embeddings: NumPy array of shape (n_results, embedding_dim).
|
|
results: List of SearchResult objects.
|
|
|
|
Returns:
|
|
List of representative SearchResult objects.
|
|
"""
|
|
clusters = self.cluster(embeddings, results)
|
|
return self.select_representatives(clusters, results, embeddings)
|