feat: Add centralized vector storage and metadata management for embeddings

catlog22
2026-01-02 17:18:23 +08:00
parent 9157c5c78b
commit 0b6e9db8e4
5 changed files with 534 additions and 11 deletions

View File

@@ -2005,6 +2005,12 @@ def embeddings_generate(
     ),
     json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
+    centralized: bool = typer.Option(
+        False,
+        "--centralized",
+        "-c",
+        help="Use centralized vector storage (single HNSW index at project root).",
+    ),
 ) -> None:
     """Generate semantic embeddings for code search.
@@ -2012,6 +2018,10 @@ def embeddings_generate(
     semantic search capabilities. Embeddings are stored in the same
     database as the FTS index.
 
+    Storage Modes:
+    - Default: Per-directory HNSW indexes alongside _index.db files
+    - Centralized: Single HNSW index at project root (_vectors.hnsw)
+
     Embedding Backend Options:
     - fastembed: Local ONNX-based embeddings (default, no API calls)
     - litellm: Remote API embeddings via ccw-litellm (requires API keys)
@@ -2033,12 +2043,14 @@ def embeddings_generate(
         codexlens embeddings-generate ~/.codexlens/indexes/project/_index.db  # Specific index
         codexlens embeddings-generate ~/projects/my-app --backend litellm --model text-embedding-3-small  # Use LiteLLM
         codexlens embeddings-generate ~/projects/my-app --model fast --force  # Regenerate with fast profile
+        codexlens embeddings-generate ~/projects/my-app --centralized  # Centralized vector storage
     """
     _configure_logging(verbose, json_mode)
 
     from codexlens.cli.embedding_manager import (
         generate_embeddings,
         generate_embeddings_recursive,
+        generate_dense_embeddings_centralized,
         scan_for_model_conflicts,
         check_global_model_lock,
         set_locked_model_config,
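
Aside: the two storage modes in the docstring above differ only in where the HNSW files end up. A minimal sketch of the resulting lookup, assuming a hypothetical per-directory file name (only the root-level _vectors.hnsw is named in this diff):

from pathlib import Path

def hnsw_index_files(index_root: Path, centralized: bool) -> list[Path]:
    """Illustrative only: which vector index files each mode would consult."""
    if centralized:
        # Single index at the project root, per the --centralized help text.
        return [index_root / "_vectors.hnsw"]
    # Default mode: one index alongside each per-directory _index.db.
    # The per-directory file name is an assumption, not shown in this diff.
    return [db.with_name("_vectors.hnsw") for db in sorted(index_root.rglob("_index.db"))]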
@@ -2099,7 +2111,11 @@ def embeddings_generate(
console.print(f" {msg}")
console.print(f"[bold]Generating embeddings[/bold]")
if use_recursive:
if centralized:
effective_root = index_root if index_root else (index_path.parent if index_path else target_path)
console.print(f"Index root: [dim]{effective_root}[/dim]")
console.print(f"Mode: [green]Centralized[/green]")
elif use_recursive:
console.print(f"Index root: [dim]{index_root}[/dim]")
console.print(f"Mode: [yellow]Recursive[/yellow]")
else:
@@ -2179,7 +2195,20 @@ def embeddings_generate(
console.print("[yellow]Cancelled.[/yellow] Use --force to skip this prompt.")
raise typer.Exit(code=0)
if use_recursive:
if centralized:
# Centralized mode: single HNSW index at project root
if not index_root:
index_root = index_path.parent if index_path else target_path
result = generate_dense_embeddings_centralized(
index_root,
embedding_backend=backend,
model_profile=model,
force=force,
chunk_size=chunk_size,
progress_callback=progress_update,
max_workers=max_workers,
)
elif use_recursive:
result = generate_embeddings_recursive(
index_root,
embedding_backend=backend,
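
The call site above pins down the keyword interface of the new entry point. A hypothetical signature consistent with it (annotations and defaults are assumptions; the function body appears in embedding_manager.py below, but its parameter list is not shown):

from pathlib import Path
from typing import Callable, Optional

def generate_dense_embeddings_centralized(
    index_root: Path,
    embedding_backend: str = "fastembed",
    model_profile: Optional[str] = None,
    force: bool = False,
    chunk_size: Optional[int] = None,
    progress_callback: Optional[Callable[[str], None]] = None,
    max_workers: Optional[int] = None,
):
    ...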
@@ -2225,7 +2254,18 @@ def embeddings_generate(
     # This prevents using different models for future indexes
     set_locked_model_config(backend, model)
 
-    if use_recursive:
+    if centralized:
+        # Centralized mode output
+        elapsed = data.get("elapsed_time", 0)
+        console.print(f"[green]✓[/green] Centralized embeddings generated successfully!")
+        console.print(f"  Model: {data.get('model_name', model)}")
+        console.print(f"  Chunks created: {data['chunks_created']:,}")
+        console.print(f"  Files processed: {data['files_processed']}")
+        if data.get("files_failed", 0) > 0:
+            console.print(f"  [yellow]Files failed: {data['files_failed']}[/yellow]")
+        console.print(f"  Central index: {data.get('central_index_path', 'N/A')}")
+        console.print(f"  Time: {elapsed:.1f}s")
+    elif use_recursive:
         # Recursive mode output
         console.print(f"[green]✓[/green] Recursive embeddings generation complete!")
         console.print(f"  Indexes processed: {data['indexes_processed']}")

View File

@@ -17,6 +17,11 @@ except ImportError:
     def is_embedding_backend_available(_backend: str):  # type: ignore[no-redef]
         return False, "codexlens.semantic not available"
 
+try:
+    from codexlens.config import VECTORS_META_DB_NAME
+except ImportError:
+    VECTORS_META_DB_NAME = "_vectors_meta.db"
+
 try:
     from codexlens.search.ranking import get_file_category
 except ImportError:
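
The guarded import keeps embedding_manager working against an older codexlens.config; presumably (not shown in this diff) the config module defines the matching constant:

# codexlens/config.py (assumed; the except-branch fallback above mirrors it)
VECTORS_META_DB_NAME = "_vectors_meta.db"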
@@ -1277,10 +1282,38 @@ def generate_dense_embeddings_centralized(
     }
 
     # Store chunk metadata in a centralized metadata database
-    vectors_meta_path = index_root / "VECTORS_META_DB_NAME"
-    # Note: The metadata is already stored in individual _index.db semantic_chunks tables
-    # For now, we rely on the existing per-index storage for metadata lookup
-    # A future enhancement could consolidate metadata into _vectors_meta.db
+    vectors_meta_path = index_root / VECTORS_META_DB_NAME
+    if chunk_id_to_info:
+        if progress_callback:
+            progress_callback(f"Storing {len(chunk_id_to_info)} chunk metadata records...")
+        try:
+            from codexlens.storage.vector_meta_store import VectorMetadataStore
+
+            with VectorMetadataStore(vectors_meta_path) as meta_store:
+                # Convert chunk_id_to_info dict to list of dicts for batch insert
+                chunks_to_store = []
+                for cid, info in chunk_id_to_info.items():
+                    metadata = info.get("metadata", {})
+                    chunks_to_store.append({
+                        "chunk_id": cid,
+                        "file_path": info["file_path"],
+                        "content": info["content"],
+                        "start_line": metadata.get("start_line"),
+                        "end_line": metadata.get("end_line"),
+                        "category": info.get("category"),
+                        "metadata": metadata,
+                        "source_index_db": None,  # Not tracked per-chunk currently
+                    })
+                meta_store.add_chunks(chunks_to_store)
+            if progress_callback:
+                progress_callback(f"Saved metadata to {vectors_meta_path}")
+        except Exception as e:
+            logger.warning("Failed to store vector metadata: %s", e)
+            # Non-fatal: continue without centralized metadata
 
     elapsed_time = time.time() - start_time
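
VectorMetadataStore itself is not part of this commit's visible hunks. From the call site, all that is pinned down is: it is constructed from a path, usable as a context manager, and add_chunks takes a list of dicts with the keys built above. A minimal SQLite-backed sketch satisfying that contract, with everything beyond it assumed:

import json
import sqlite3
from pathlib import Path

class VectorMetadataStore:
    """Sketch only: chunk metadata lookup for the centralized HNSW index."""

    def __init__(self, db_path: Path) -> None:
        self.conn = sqlite3.connect(db_path)
        self.conn.execute(
            """CREATE TABLE IF NOT EXISTS chunks (
                   chunk_id INTEGER PRIMARY KEY,
                   file_path TEXT NOT NULL,
                   content TEXT NOT NULL,
                   start_line INTEGER,
                   end_line INTEGER,
                   category TEXT,
                   metadata TEXT,
                   source_index_db TEXT
               )"""
        )

    def add_chunks(self, chunks: list[dict]) -> None:
        # Batch insert; nested metadata dicts are serialized to JSON text.
        self.conn.executemany(
            "INSERT OR REPLACE INTO chunks VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            [
                (
                    c["chunk_id"],
                    c["file_path"],
                    c["content"],
                    c.get("start_line"),
                    c.get("end_line"),
                    c.get("category"),
                    json.dumps(c.get("metadata") or {}),
                    c.get("source_index_db"),
                )
                for c in chunks
            ],
        )

    def __enter__(self) -> "VectorMetadataStore":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        # Commit only on a clean exit, then always close the connection.
        if exc_type is None:
            self.conn.commit()
        self.conn.close()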