feat: Implement centralized storage for SPLADE and vector embeddings

- Added centralized SPLADE database and vector storage configuration in config.py.
- Updated embedding_manager.py to support centralized SPLADE database path.
- Enhanced generate_embeddings and generate_embeddings_recursive functions for centralized storage.
- Introduced centralized ANN index creation in ann_index.py.
- Modified hybrid_search.py to utilize centralized vector index for searches.
- Implemented methods to discover and manage centralized SPLADE and HNSW files.
catlog22 committed 2026-01-02 16:53:39 +08:00
commit 9157c5c78b (parent 54fb7afdb2)
5 changed files with 1051 additions and 9 deletions


@@ -310,6 +310,7 @@ def generate_embeddings(
endpoints: Optional[List] = None,
strategy: Optional[str] = None,
cooldown: Optional[float] = None,
splade_db_path: Optional[Path] = None,
) -> Dict[str, Any]:
"""Generate embeddings for an index using memory-efficient batch processing.
@@ -339,6 +340,9 @@ def generate_embeddings(
Each dict has keys: model, api_key, api_base, weight.
strategy: Selection strategy for multi-endpoint mode (round_robin, latency_aware).
cooldown: Default cooldown seconds for rate-limited endpoints.
splade_db_path: Optional path to centralized SPLADE database. If None, SPLADE
is written to index_path (legacy behavior). Use index_root / SPLADE_DB_NAME
for centralized storage.
Returns:
Result dictionary with generation statistics
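For callers the new parameter is opt-in. A minimal sketch of both modes; the `.codexlens` layout here is hypothetical, while `index_path`, `splade_db_path`, and `SPLADE_DB_NAME` are taken from the diff:

from pathlib import Path
from codexlens.config import SPLADE_DB_NAME

index_root = Path(".codexlens")  # hypothetical project index root

# Centralized: sparse vectors for this index go into one shared database.
generate_embeddings(
    index_path=index_root / "src" / "_index.db",
    splade_db_path=index_root / SPLADE_DB_NAME,
)

# Legacy: omit splade_db_path and SPLADE data is written to index_path.
generate_embeddings(index_path=index_root / "src" / "_index.db")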
@@ -723,7 +727,7 @@ def generate_embeddings(
splade_error = None
try:
-from codexlens.config import Config
+from codexlens.config import Config, SPLADE_DB_NAME
config = Config.load()
if config.enable_splade:
@@ -737,8 +741,9 @@ def generate_embeddings(
# Initialize SPLADE encoder and index
splade_encoder = get_splade_encoder(use_gpu=use_gpu)
-# Use main index database for SPLADE (not separate _splade.db)
-splade_index = SpladeIndex(index_path)
+# Use centralized SPLADE database if provided, otherwise fallback to index_path
+effective_splade_path = splade_db_path if splade_db_path else index_path
+splade_index = SpladeIndex(effective_splade_path)
splade_index.create_tables()
# Retrieve all chunks from database for SPLADE encoding
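In recursive mode every sub-index then writes its sparse vectors through the same database file. A minimal sketch of what that shared handle looks like; the SpladeIndex import path and `.codexlens` root are assumptions, and the idempotency of create_tables is inferred from the recursive flow, which calls it once per sub-index against the same file:

from pathlib import Path
from codexlens.config import SPLADE_DB_NAME
from codexlens.semantic.splade_index import SpladeIndex  # assumed import path

central = SpladeIndex(Path(".codexlens") / SPLADE_DB_NAME)
central.create_tables()  # presumably CREATE TABLE IF NOT EXISTS semantics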
@@ -953,6 +958,10 @@ def generate_embeddings_recursive(
if progress_callback:
progress_callback(f"Found {len(index_files)} index databases to process")
# Calculate centralized SPLADE database path
from codexlens.config import SPLADE_DB_NAME
splade_db_path = index_root / SPLADE_DB_NAME
# Process each index database
all_results = []
total_chunks = 0
@@ -982,6 +991,7 @@ def generate_embeddings_recursive(
endpoints=endpoints,
strategy=strategy,
cooldown=cooldown,
splade_db_path=splade_db_path, # Use centralized SPLADE storage
)
all_results.append({
@@ -1023,6 +1033,279 @@ def generate_embeddings_recursive(
}
def generate_dense_embeddings_centralized(
index_root: Path,
embedding_backend: Optional[str] = None,
model_profile: Optional[str] = None,
force: bool = False,
chunk_size: int = 2000,
overlap: int = 200,
progress_callback: Optional[callable] = None,
use_gpu: Optional[bool] = None,
max_tokens_per_batch: Optional[int] = None,
max_workers: Optional[int] = None,
endpoints: Optional[List] = None,
strategy: Optional[str] = None,
cooldown: Optional[float] = None,
) -> Dict[str, Any]:
"""Generate dense embeddings with centralized vector storage.
This function creates a single HNSW index at the project root instead of
per-directory indexes. All chunks from all _index.db files are combined
into one central _vectors.hnsw file.
Target architecture:
<index_root>/
|-- _vectors.hnsw # Centralized dense vector ANN index
|-- _splade.db # Centralized sparse vector index
|-- src/
|-- _index.db # No longer contains .hnsw file
Args:
index_root: Root index directory containing _index.db files
embedding_backend: Embedding backend (fastembed or litellm)
model_profile: Model profile or name
force: If True, regenerate even if embeddings exist
chunk_size: Maximum chunk size in characters
overlap: Overlap size in characters
progress_callback: Optional callback for progress updates
use_gpu: Whether to use GPU acceleration
max_tokens_per_batch: Maximum tokens per batch
max_workers: Maximum concurrent workers
endpoints: Multi-endpoint configurations
strategy: Endpoint selection strategy
cooldown: Rate-limit cooldown seconds
Returns:
Result dictionary with generation statistics
"""
from codexlens.config import VECTORS_HNSW_NAME, SPLADE_DB_NAME
# Get defaults from config if not specified
(default_backend, default_model, default_gpu,
default_endpoints, default_strategy, default_cooldown) = _get_embedding_defaults()
if embedding_backend is None:
embedding_backend = default_backend
if model_profile is None:
model_profile = default_model
if use_gpu is None:
use_gpu = default_gpu
if endpoints is None:
endpoints = default_endpoints
if strategy is None:
strategy = default_strategy
if cooldown is None:
cooldown = default_cooldown
# Calculate endpoint count for worker scaling
endpoint_count = len(endpoints) if endpoints else 1
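# Default pool size: two workers per remote endpoint (or four for a single
# litellm endpoint); local fastembed stays single-threaded.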
if max_workers is None:
if embedding_backend == "litellm":
if endpoint_count > 1:
max_workers = endpoint_count * 2
else:
max_workers = 4
else:
max_workers = 1
backend_available, backend_error = is_embedding_backend_available(embedding_backend)
if not backend_available:
return {"success": False, "error": backend_error or "Embedding backend not available"}
# Discover all _index.db files
index_files = discover_all_index_dbs(index_root)
if not index_files:
return {
"success": False,
"error": f"No index databases found in {index_root}",
}
if progress_callback:
progress_callback(f"Found {len(index_files)} index databases for centralized embedding")
# Check for existing centralized index
central_hnsw_path = index_root / VECTORS_HNSW_NAME
if central_hnsw_path.exists() and not force:
return {
"success": False,
"error": f"Centralized vector index already exists at {central_hnsw_path}. Use --force to regenerate.",
}
# Initialize embedder
try:
from codexlens.semantic.factory import get_embedder as get_embedder_factory
from codexlens.semantic.chunker import Chunker, ChunkConfig
from codexlens.semantic.ann_index import ANNIndex
if embedding_backend == "fastembed":
embedder = get_embedder_factory(backend="fastembed", profile=model_profile, use_gpu=use_gpu)
elif embedding_backend == "litellm":
embedder = get_embedder_factory(
backend="litellm",
model=model_profile,
endpoints=endpoints if endpoints else None,
strategy=strategy,
cooldown=cooldown,
)
else:
return {
"success": False,
"error": f"Invalid embedding backend: {embedding_backend}",
}
chunker = Chunker(config=ChunkConfig(
max_chunk_size=chunk_size,
overlap=overlap,
skip_token_count=True
))
if progress_callback:
if endpoint_count > 1:
progress_callback(f"Using {endpoint_count} API endpoints with {strategy} strategy")
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
except Exception as e:
return {
"success": False,
"error": f"Failed to initialize components: {str(e)}",
}
# Create centralized ANN index
central_ann_index = ANNIndex.create_central(
index_root=index_root,
dim=embedder.embedding_dim,
initial_capacity=100000, # Larger capacity for centralized index
auto_save=False,
)
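# auto_save=False defers disk writes; the index is persisted once via save()
# after vectors from every sub-index have been added.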
# Process all index databases
start_time = time.time()
failed_files = []
total_chunks_created = 0
total_files_processed = 0
all_chunk_ids = []
all_embeddings = []
# Track chunk ID to file_path mapping for metadata
chunk_id_to_info: Dict[int, Dict[str, Any]] = {}
next_chunk_id = 1
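# A single global counter keeps chunk IDs unique across sub-indexes, so each
# vector in the central HNSW index maps back to exactly one chunk.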
for idx, index_path in enumerate(index_files, 1):
if progress_callback:
try:
rel_path = index_path.relative_to(index_root)
except ValueError:
rel_path = index_path
progress_callback(f"Processing {idx}/{len(index_files)}: {rel_path}")
try:
with sqlite3.connect(index_path) as conn:
conn.row_factory = sqlite3.Row
path_column = _get_path_column(conn)
# Get files from this index
cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
file_rows = cursor.fetchall()
for file_row in file_rows:
file_path = file_row[path_column]
content = file_row["content"]
language = file_row["language"] or "python"
try:
chunks = chunker.chunk_sliding_window(
content,
file_path=file_path,
language=language
)
if not chunks:
continue
total_files_processed += 1
# Generate embeddings for this file's chunks
batch_contents = [chunk.content for chunk in chunks]
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
# Assign chunk IDs and store embeddings
for i, chunk in enumerate(chunks):
chunk_id = next_chunk_id
next_chunk_id += 1
all_chunk_ids.append(chunk_id)
all_embeddings.append(embeddings_numpy[i])
# Store metadata for later retrieval
chunk_id_to_info[chunk_id] = {
"file_path": file_path,
"content": chunk.content,
"metadata": chunk.metadata,
"category": get_file_category(file_path) or "code",
}
total_chunks_created += 1
except Exception as e:
logger.error(f"Failed to process {file_path}: {e}")
failed_files.append((file_path, str(e)))
except Exception as e:
logger.error(f"Failed to read index {index_path}: {e}")
failed_files.append((str(index_path), str(e)))
# Add all embeddings to centralized ANN index
if all_embeddings:
if progress_callback:
progress_callback(f"Building centralized ANN index with {len(all_embeddings)} vectors...")
try:
import numpy as np
embeddings_matrix = np.vstack(all_embeddings)
central_ann_index.add_vectors(all_chunk_ids, embeddings_matrix)
central_ann_index.save()
if progress_callback:
progress_callback(f"Saved centralized index to {central_hnsw_path}")
except Exception as e:
return {
"success": False,
"error": f"Failed to build centralized ANN index: {str(e)}",
}
# Chunk metadata is not consolidated here: it already lives in each
# per-index _index.db semantic_chunks table, which is used for metadata lookup.
# A future enhancement could consolidate metadata into _vectors_meta.db.
elapsed_time = time.time() - start_time
# Cleanup
try:
_cleanup_fastembed_resources()
gc.collect()
except Exception:
pass
return {
"success": True,
"result": {
"chunks_created": total_chunks_created,
"files_processed": total_files_processed,
"files_failed": len(failed_files),
"elapsed_time": elapsed_time,
"model_profile": model_profile,
"model_name": embedder.model_name,
"central_index_path": str(central_hnsw_path),
"failed_files": failed_files[:5],
},
}
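End to end, the new entry point can be driven directly. A minimal usage sketch; the `.codexlens` root is hypothetical, while the keywords and return shape follow the signature above:

from pathlib import Path

result = generate_dense_embeddings_centralized(
    index_root=Path(".codexlens"),  # hypothetical index root
    embedding_backend="fastembed",
    force=True,  # rebuild even if _vectors.hnsw already exists
    progress_callback=print,
)
if result["success"]:
    stats = result["result"]
    print(f"{stats['chunks_created']} chunks -> {stats['central_index_path']}")
else:
    print(f"error: {result['error']}")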
def get_embeddings_status(index_root: Path) -> Dict[str, Any]:
"""Get comprehensive embeddings coverage status for all indexes.