mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-12 02:37:45 +08:00
feat: Implement centralized storage for SPLADE and vector embeddings
- Added centralized SPLADE database and vector storage configuration in config.py. - Updated embedding_manager.py to support centralized SPLADE database path. - Enhanced generate_embeddings and generate_embeddings_recursive functions for centralized storage. - Introduced centralized ANN index creation in ann_index.py. - Modified hybrid_search.py to utilize centralized vector index for searches. - Implemented methods to discover and manage centralized SPLADE and HNSW files.
This commit is contained in:
@@ -9,6 +9,7 @@ Key features:
|
||||
- Incremental vector addition and deletion
|
||||
- Thread-safe operations
|
||||
- Cosine similarity metric
|
||||
- Support for centralized storage mode (single index at project root)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -19,6 +20,7 @@ from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from codexlens.errors import StorageError
|
||||
from codexlens.config import VECTORS_HNSW_NAME
|
||||
|
||||
from . import SEMANTIC_AVAILABLE
|
||||
|
||||
@@ -127,6 +129,94 @@ class ANNIndex:
|
||||
f"auto_save={auto_save}, expansion_threshold={expansion_threshold}"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def create_central(
|
||||
cls,
|
||||
index_root: Path,
|
||||
dim: int,
|
||||
initial_capacity: int = 50000,
|
||||
auto_save: bool = False,
|
||||
expansion_threshold: float = 0.8,
|
||||
) -> "ANNIndex":
|
||||
"""Create a centralized ANN index at the project index root.
|
||||
|
||||
This method creates a single shared HNSW index file at the project root,
|
||||
rather than per-directory indexes. Use this for projects that want all
|
||||
dense vectors stored in one central location.
|
||||
|
||||
Args:
|
||||
index_root: Root directory for the index (e.g., .codexlens/<project_hash>/)
|
||||
dim: Dimension of embedding vectors
|
||||
initial_capacity: Initial maximum elements capacity (default: 50000)
|
||||
auto_save: Whether to automatically save index after operations (default: False)
|
||||
expansion_threshold: Capacity threshold to trigger auto-expansion (default: 0.8)
|
||||
|
||||
Returns:
|
||||
ANNIndex instance configured for centralized storage
|
||||
|
||||
Example:
|
||||
>>> index = ANNIndex.create_central(Path(".codexlens/abc123"), dim=768)
|
||||
>>> index.hnsw_path # Returns: .codexlens/abc123/_vectors.hnsw
|
||||
"""
|
||||
# Create a dummy index_path that will result in the central hnsw_path
|
||||
# The index_path is used to derive hnsw_path, so we create a virtual path
|
||||
# such that self.hnsw_path = index_root / VECTORS_HNSW_NAME
|
||||
instance = cls.__new__(cls)
|
||||
|
||||
if not SEMANTIC_AVAILABLE:
|
||||
raise ImportError(
|
||||
"Semantic search dependencies not available. "
|
||||
"Install with: pip install codexlens[semantic]"
|
||||
)
|
||||
|
||||
if not HNSWLIB_AVAILABLE:
|
||||
raise ImportError(
|
||||
"hnswlib is required for ANN index. "
|
||||
"Install with: pip install hnswlib"
|
||||
)
|
||||
|
||||
if dim <= 0:
|
||||
raise ValueError(f"Invalid dimension: {dim}")
|
||||
|
||||
if initial_capacity <= 0:
|
||||
raise ValueError(f"Invalid initial capacity: {initial_capacity}")
|
||||
|
||||
if not 0.0 < expansion_threshold < 1.0:
|
||||
raise ValueError(
|
||||
f"Invalid expansion threshold: {expansion_threshold}. Must be between 0 and 1."
|
||||
)
|
||||
|
||||
instance.index_path = index_root
|
||||
instance.dim = dim
|
||||
|
||||
# Centralized mode: use VECTORS_HNSW_NAME directly at index_root
|
||||
instance.hnsw_path = index_root / VECTORS_HNSW_NAME
|
||||
|
||||
# HNSW parameters
|
||||
instance.space = "cosine"
|
||||
instance.M = 16
|
||||
instance.ef_construction = 200
|
||||
instance.ef = 50
|
||||
|
||||
# Memory management parameters
|
||||
instance._auto_save = auto_save
|
||||
instance._expansion_threshold = expansion_threshold
|
||||
|
||||
# Thread safety
|
||||
instance._lock = threading.RLock()
|
||||
|
||||
# HNSW index instance
|
||||
instance._index: Optional[hnswlib.Index] = None
|
||||
instance._max_elements = initial_capacity
|
||||
instance._current_count = 0
|
||||
|
||||
logger.info(
|
||||
f"Initialized centralized ANNIndex at {instance.hnsw_path} with "
|
||||
f"capacity={initial_capacity}, auto_save={auto_save}"
|
||||
)
|
||||
|
||||
return instance
|
||||
|
||||
def _ensure_index(self) -> None:
|
||||
"""Ensure HNSW index is initialized (lazy initialization)."""
|
||||
if self._index is None:
|
||||
|
||||
Reference in New Issue
Block a user