Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-11 02:33:51 +08:00
feat: Add centralized vector storage and metadata management for embeddings
@@ -2005,6 +2005,12 @@ def embeddings_generate(
|
|||||||
),
|
),
|
||||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||||
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
|
||||||
|
centralized: bool = typer.Option(
|
||||||
|
False,
|
||||||
|
"--centralized",
|
||||||
|
"-c",
|
||||||
|
help="Use centralized vector storage (single HNSW index at project root).",
|
||||||
|
),
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Generate semantic embeddings for code search.
|
"""Generate semantic embeddings for code search.
|
||||||
|
|
||||||
@@ -2012,6 +2018,10 @@ def embeddings_generate(
     semantic search capabilities. Embeddings are stored in the same
     database as the FTS index.
 
+    Storage Modes:
+    - Default: Per-directory HNSW indexes alongside _index.db files
+    - Centralized: Single HNSW index at project root (_vectors.hnsw)
+
     Embedding Backend Options:
     - fastembed: Local ONNX-based embeddings (default, no API calls)
     - litellm: Remote API embeddings via ccw-litellm (requires API keys)
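
Reviewer sketch (not part of the diff): the two layouts described above can be told apart by what sits at the index root. The file names _vectors.hnsw and _index.db come from the docstring; the helper below is purely illustrative.

    from pathlib import Path

    def detect_storage_mode(index_root: Path) -> str:
        """Illustrative only: guess which embedding layout an index root uses."""
        # Centralized mode keeps a single HNSW index at the project root.
        if (index_root / "_vectors.hnsw").exists():
            return "centralized"
        # Default mode keeps per-directory HNSW indexes next to each _index.db.
        if any(index_root.rglob("_index.db")):
            return "per-directory"
        return "unknown"
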
@@ -2033,12 +2043,14 @@ def embeddings_generate(
         codexlens embeddings-generate ~/.codexlens/indexes/project/_index.db  # Specific index
         codexlens embeddings-generate ~/projects/my-app --backend litellm --model text-embedding-3-small  # Use LiteLLM
         codexlens embeddings-generate ~/projects/my-app --model fast --force  # Regenerate with fast profile
+        codexlens embeddings-generate ~/projects/my-app --centralized  # Centralized vector storage
     """
     _configure_logging(verbose, json_mode)
 
     from codexlens.cli.embedding_manager import (
         generate_embeddings,
         generate_embeddings_recursive,
+        generate_dense_embeddings_centralized,
         scan_for_model_conflicts,
         check_global_model_lock,
         set_locked_model_config,
@@ -2099,7 +2111,11 @@ def embeddings_generate(
         console.print(f" {msg}")
 
     console.print(f"[bold]Generating embeddings[/bold]")
-    if use_recursive:
+    if centralized:
+        effective_root = index_root if index_root else (index_path.parent if index_path else target_path)
+        console.print(f"Index root: [dim]{effective_root}[/dim]")
+        console.print(f"Mode: [green]Centralized[/green]")
+    elif use_recursive:
         console.print(f"Index root: [dim]{index_root}[/dim]")
         console.print(f"Mode: [yellow]Recursive[/yellow]")
     else:
@@ -2179,7 +2195,20 @@ def embeddings_generate(
         console.print("[yellow]Cancelled.[/yellow] Use --force to skip this prompt.")
         raise typer.Exit(code=0)
 
-    if use_recursive:
+    if centralized:
+        # Centralized mode: single HNSW index at project root
+        if not index_root:
+            index_root = index_path.parent if index_path else target_path
+        result = generate_dense_embeddings_centralized(
+            index_root,
+            embedding_backend=backend,
+            model_profile=model,
+            force=force,
+            chunk_size=chunk_size,
+            progress_callback=progress_update,
+            max_workers=max_workers,
+        )
+    elif use_recursive:
         result = generate_embeddings_recursive(
             index_root,
             embedding_backend=backend,
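
Reviewer sketch (not part of the diff): the centralized branch above can also be driven without the CLI. The function name and keyword arguments are exactly those used in this hunk; the argument values, the progress callback, and the handling of the return value are illustrative assumptions.

    from pathlib import Path

    from codexlens.cli.embedding_manager import generate_dense_embeddings_centralized

    def progress_update(msg: str) -> None:
        # Stand-in for the CLI's progress reporting.
        print(msg)

    result = generate_dense_embeddings_centralized(
        Path.home() / "projects" / "my-app",    # index root (illustrative path)
        embedding_backend="fastembed",          # default backend per the docstring
        model_profile="fast",                   # profile name taken from the CLI examples
        force=False,
        chunk_size=512,                         # assumed value; the CLI forwards its own option
        progress_callback=progress_update,
        max_workers=4,                          # assumed value; the CLI forwards its own option
    )
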
@@ -2225,7 +2254,18 @@ def embeddings_generate(
     # This prevents using different models for future indexes
     set_locked_model_config(backend, model)
 
-    if use_recursive:
+    if centralized:
+        # Centralized mode output
+        elapsed = data.get("elapsed_time", 0)
+        console.print(f"[green]✓[/green] Centralized embeddings generated successfully!")
+        console.print(f" Model: {data.get('model_name', model)}")
+        console.print(f" Chunks created: {data['chunks_created']:,}")
+        console.print(f" Files processed: {data['files_processed']}")
+        if data.get("files_failed", 0) > 0:
+            console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]")
+        console.print(f" Central index: {data.get('central_index_path', 'N/A')}")
+        console.print(f" Time: {elapsed:.1f}s")
+    elif use_recursive:
         # Recursive mode output
         console.print(f"[green]✓[/green] Recursive embeddings generation complete!")
         console.print(f" Indexes processed: {data['indexes_processed']}")
@@ -17,6 +17,11 @@ except ImportError:
     def is_embedding_backend_available(_backend: str):  # type: ignore[no-redef]
         return False, "codexlens.semantic not available"
 
+try:
+    from codexlens.config import VECTORS_META_DB_NAME
+except ImportError:
+    VECTORS_META_DB_NAME = "_vectors_meta.db"
+
 try:
     from codexlens.search.ranking import get_file_category
 except ImportError:
@@ -1277,10 +1282,38 @@ def generate_dense_embeddings_centralized(
     }
 
     # Store chunk metadata in a centralized metadata database
-    vectors_meta_path = index_root / "VECTORS_META_DB_NAME"
-    # Note: The metadata is already stored in individual _index.db semantic_chunks tables
-    # For now, we rely on the existing per-index storage for metadata lookup
-    # A future enhancement could consolidate metadata into _vectors_meta.db
+    vectors_meta_path = index_root / VECTORS_META_DB_NAME
+    if chunk_id_to_info:
+        if progress_callback:
+            progress_callback(f"Storing {len(chunk_id_to_info)} chunk metadata records...")
+
+        try:
+            from codexlens.storage.vector_meta_store import VectorMetadataStore
+
+            with VectorMetadataStore(vectors_meta_path) as meta_store:
+                # Convert chunk_id_to_info dict to list of dicts for batch insert
+                chunks_to_store = []
+                for cid, info in chunk_id_to_info.items():
+                    metadata = info.get("metadata", {})
+                    chunks_to_store.append({
+                        "chunk_id": cid,
+                        "file_path": info["file_path"],
+                        "content": info["content"],
+                        "start_line": metadata.get("start_line"),
+                        "end_line": metadata.get("end_line"),
+                        "category": info.get("category"),
+                        "metadata": metadata,
+                        "source_index_db": None,  # Not tracked per-chunk currently
+                    })
+
+                meta_store.add_chunks(chunks_to_store)
+
+                if progress_callback:
+                    progress_callback(f"Saved metadata to {vectors_meta_path}")
+
+        except Exception as e:
+            logger.warning("Failed to store vector metadata: %s", e)
+            # Non-fatal: continue without centralized metadata
 
     elapsed_time = time.time() - start_time
 
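
For reference (reviewer sketch, not part of the diff): the batch insert above assumes chunk_id_to_info entries of roughly the shape below. How that dict is built earlier in generate_dense_embeddings_centralized is not shown in this hunk, so the field values here are illustrative.

    chunk_id_to_info = {
        0: {
            "file_path": "src/app/main.py",
            "content": "def main() -> None: ...",
            "category": "code",
            "metadata": {"start_line": 10, "end_line": 42,
                         "symbol_name": "main", "symbol_kind": "function"},
        },
    }

    # The loop converts this into the dict rows consumed by VectorMetadataStore.add_chunks:
    from codexlens.storage.vector_meta_store import VectorMetadataStore

    with VectorMetadataStore("/tmp/project/_vectors_meta.db") as meta_store:
        meta_store.add_chunks([
            {
                "chunk_id": cid,
                "file_path": info["file_path"],
                "content": info["content"],
                "start_line": info["metadata"].get("start_line"),
                "end_line": info["metadata"].get("end_line"),
                "category": info.get("category"),
                "metadata": info["metadata"],
                "source_index_db": None,  # not tracked per-chunk, as in the diff
            }
            for cid, info in chunk_id_to_info.items()
        ])
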
@@ -664,10 +664,15 @@ class HybridSearchEngine:
         scores: List[float],
         category: Optional[str] = None,
     ) -> List[SearchResult]:
-        """Fetch chunk metadata from all _index.db files for centralized search.
+        """Fetch chunk metadata from centralized _vectors_meta.db for fast lookup.
 
+        This method uses the centralized VectorMetadataStore for O(1) lookup
+        instead of traversing all _index.db files (O(n) where n = number of indexes).
+
+        Falls back to the legacy per-index lookup if centralized metadata is unavailable.
+
         Args:
-            index_root: Root directory containing _index.db files
+            index_root: Root directory containing _vectors_meta.db
             chunk_ids: List of chunk IDs from ANN search
             scores: Corresponding similarity scores
             category: Optional category filter
@@ -675,12 +680,123 @@ class HybridSearchEngine:
         Returns:
             List of SearchResult objects
         """
-        import sqlite3
-        import json
+        from codexlens.config import VECTORS_META_DB_NAME
 
         # Build score map
         score_map = {cid: score for cid, score in zip(chunk_ids, scores)}
 
+        # Try centralized metadata store first (fast path)
+        vectors_meta_path = index_root / VECTORS_META_DB_NAME
+        if vectors_meta_path.exists():
+            try:
+                return self._fetch_from_vector_meta_store(
+                    vectors_meta_path, chunk_ids, score_map, category
+                )
+            except Exception as e:
+                self.logger.debug(
+                    "Centralized metadata lookup failed, falling back: %s", e
+                )
+
+        # Fallback: traverse _index.db files (legacy path)
+        return self._fetch_chunks_by_ids_legacy(
+            index_root, chunk_ids, score_map, category
+        )
+
+    def _fetch_from_vector_meta_store(
+        self,
+        meta_db_path: Path,
+        chunk_ids: List[int],
+        score_map: Dict[int, float],
+        category: Optional[str] = None,
+    ) -> List[SearchResult]:
+        """Fetch chunks from centralized VectorMetadataStore.
+
+        Args:
+            meta_db_path: Path to _vectors_meta.db
+            chunk_ids: List of chunk IDs to fetch
+            score_map: Mapping of chunk_id to score
+            category: Optional category filter
+
+        Returns:
+            List of SearchResult objects
+        """
+        from codexlens.storage.vector_meta_store import VectorMetadataStore
+
+        results = []
+
+        with VectorMetadataStore(meta_db_path) as meta_store:
+            rows = meta_store.get_chunks_by_ids(chunk_ids, category=category)
+
+            for row in rows:
+                chunk_id = row["chunk_id"]
+                file_path = row["file_path"]
+                content = row["content"] or ""
+                metadata = row.get("metadata") or {}
+                start_line = row.get("start_line")
+                end_line = row.get("end_line")
+
+                score = score_map.get(chunk_id, 0.0)
+
+                # Build excerpt
+                excerpt = content[:200] + "..." if len(content) > 200 else content
+
+                # Extract symbol information
+                symbol_name = metadata.get("symbol_name")
+                symbol_kind = metadata.get("symbol_kind")
+
+                # Build Symbol object if available
+                symbol = None
+                if symbol_name and symbol_kind and start_line and end_line:
+                    try:
+                        from codexlens.entities import Symbol
+                        symbol = Symbol(
+                            name=symbol_name,
+                            kind=symbol_kind,
+                            range=(start_line, end_line)
+                        )
+                    except Exception:
+                        pass
+
+                results.append(SearchResult(
+                    path=file_path,
+                    score=score,
+                    excerpt=excerpt,
+                    content=content,
+                    symbol=symbol,
+                    metadata=metadata,
+                    start_line=start_line,
+                    end_line=end_line,
+                    symbol_name=symbol_name,
+                    symbol_kind=symbol_kind,
+                ))
+
+        # Sort by score descending
+        results.sort(key=lambda r: r.score, reverse=True)
+        return results
+
+    def _fetch_chunks_by_ids_legacy(
+        self,
+        index_root: Path,
+        chunk_ids: List[int],
+        score_map: Dict[int, float],
+        category: Optional[str] = None,
+    ) -> List[SearchResult]:
+        """Legacy fallback: fetch chunk metadata by traversing all _index.db files.
+
+        This is the O(n) fallback path used when centralized metadata is unavailable.
+
+        Args:
+            index_root: Root directory containing _index.db files
+            chunk_ids: List of chunk IDs from ANN search
+            score_map: Mapping of chunk_id to score
+            category: Optional category filter
+
+        Returns:
+            List of SearchResult objects
+        """
+        import sqlite3
+        import json
+
         # Find all _index.db files
         index_files = list(index_root.rglob("_index.db"))
 
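
Reviewer sketch (not part of the diff): stripped of the SearchResult construction, the fast path added above is one metadata lookup plus score re-attachment. A condensed standalone version, assuming the chunk IDs and scores come from an earlier ANN query and that VECTORS_META_DB_NAME resolves to "_vectors_meta.db" (the fallback value used in embedding_manager):

    from pathlib import Path
    from typing import Dict, List, Optional

    from codexlens.storage.vector_meta_store import VectorMetadataStore

    VECTORS_META_DB_NAME = "_vectors_meta.db"

    def lookup_chunks(index_root: Path, chunk_ids: List[int], scores: List[float],
                      category: Optional[str] = None) -> List[Dict]:
        # Pair each ANN hit with its similarity score.
        score_map = dict(zip(chunk_ids, scores))
        meta_path = index_root / VECTORS_META_DB_NAME
        if not meta_path.exists():
            return []  # the real method falls back to scanning every _index.db here
        with VectorMetadataStore(meta_path) as store:
            rows = store.get_chunks_by_ids(list(chunk_ids), category=category)
        # Highest-scoring chunks first, as in _fetch_from_vector_meta_store.
        return sorted(rows, key=lambda r: score_map.get(r["chunk_id"], 0.0), reverse=True)
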
@@ -7,6 +7,7 @@ from .path_mapper import PathMapper
 from .registry import RegistryStore, ProjectInfo, DirMapping
 from .dir_index import DirIndexStore, SubdirLink, FileEntry
 from .index_tree import IndexTreeBuilder, BuildResult, DirBuildResult
+from .vector_meta_store import VectorMetadataStore
 
 __all__ = [
     # Legacy (workspace-local)
@@ -25,5 +26,7 @@ __all__ = [
     "IndexTreeBuilder",
     "BuildResult",
     "DirBuildResult",
+    # Vector metadata
+    "VectorMetadataStore",
 ]
 
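
With the export above, the store is importable from the package root; a quick check (illustrative paths):

    from codexlens.storage import VectorMetadataStore

    with VectorMetadataStore("/tmp/demo/_vectors_meta.db") as store:
        print(store.get_chunk_count())  # 0 on a fresh database
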
codex-lens/src/codexlens/storage/vector_meta_store.py (new file, 331 lines)
@@ -0,0 +1,331 @@
"""Central storage for vector metadata.

This module provides a centralized SQLite database for storing chunk metadata
associated with centralized vector indexes. Instead of traversing all _index.db
files to fetch chunk metadata, this provides O(1) lookup by chunk ID.
"""

from __future__ import annotations

import json
import logging
import sqlite3
import threading
from pathlib import Path
from typing import Any, Dict, List, Optional

from codexlens.errors import StorageError

logger = logging.getLogger(__name__)


class VectorMetadataStore:
    """Store and retrieve chunk metadata for centralized vector search.

    This class provides efficient storage and retrieval of chunk metadata
    for the centralized vector index architecture. All chunk metadata is
    stored in a single _vectors_meta.db file at the project root, enabling
    fast lookups without traversing multiple _index.db files.

    Schema:
        chunk_metadata:
            - chunk_id: INTEGER PRIMARY KEY - Global chunk ID
            - file_path: TEXT NOT NULL - Path to source file
            - content: TEXT - Chunk text content
            - start_line: INTEGER - Start line in source file
            - end_line: INTEGER - End line in source file
            - category: TEXT - Content category (code/doc)
            - metadata: TEXT - JSON-encoded additional metadata
            - source_index_db: TEXT - Path to source _index.db file
    """

    def __init__(self, db_path: Path | str) -> None:
        """Initialize VectorMetadataStore.

        Args:
            db_path: Path to SQLite database file.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        # Thread-safe connection management
        self._lock = threading.RLock()
        self._local = threading.local()

    def _get_connection(self) -> sqlite3.Connection:
        """Get or create a thread-local database connection.

        Each thread gets its own connection to ensure thread safety.
        """
        conn = getattr(self._local, "conn", None)
        if conn is None:
            conn = sqlite3.connect(
                str(self.db_path),
                timeout=30.0,
                check_same_thread=True,
            )
            conn.row_factory = sqlite3.Row
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
            conn.execute("PRAGMA mmap_size=1073741824")  # 1GB mmap
            self._local.conn = conn
        return conn

    def _ensure_schema(self) -> None:
        """Create tables if they don't exist."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS chunk_metadata (
                        chunk_id INTEGER PRIMARY KEY,
                        file_path TEXT NOT NULL,
                        content TEXT,
                        start_line INTEGER,
                        end_line INTEGER,
                        category TEXT,
                        metadata TEXT,
                        source_index_db TEXT
                    )
                ''')
                conn.execute(
                    'CREATE INDEX IF NOT EXISTS idx_chunk_file_path '
                    'ON chunk_metadata(file_path)'
                )
                conn.execute(
                    'CREATE INDEX IF NOT EXISTS idx_chunk_category '
                    'ON chunk_metadata(category)'
                )
                conn.commit()
                logger.debug("VectorMetadataStore schema created/verified")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to create schema: {e}",
                    db_path=str(self.db_path),
                    operation="_ensure_schema"
                ) from e

    def add_chunk(
        self,
        chunk_id: int,
        file_path: str,
        content: str,
        start_line: Optional[int] = None,
        end_line: Optional[int] = None,
        category: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        source_index_db: Optional[str] = None,
    ) -> None:
        """Add a single chunk's metadata.

        Args:
            chunk_id: Global unique chunk ID.
            file_path: Path to source file.
            content: Chunk text content.
            start_line: Start line in source file.
            end_line: End line in source file.
            category: Content category (code/doc).
            metadata: Additional metadata dictionary.
            source_index_db: Path to source _index.db file.
        """
        with self._lock:
            conn = self._get_connection()
            try:
                metadata_json = json.dumps(metadata) if metadata else None
                conn.execute(
                    '''
                    INSERT OR REPLACE INTO chunk_metadata
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata, source_index_db)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''',
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata_json, source_index_db)
                )
                conn.commit()
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to add chunk {chunk_id}: {e}",
                    db_path=str(self.db_path),
                    operation="add_chunk"
                ) from e

    def add_chunks(self, chunks: List[Dict[str, Any]]) -> None:
        """Batch insert chunk metadata.

        Args:
            chunks: List of dictionaries with keys:
                - chunk_id (required): Global unique chunk ID
                - file_path (required): Path to source file
                - content: Chunk text content
                - start_line: Start line in source file
                - end_line: End line in source file
                - category: Content category (code/doc)
                - metadata: Additional metadata dictionary
                - source_index_db: Path to source _index.db file
        """
        if not chunks:
            return

        with self._lock:
            conn = self._get_connection()
            try:
                batch_data = []
                for chunk in chunks:
                    metadata = chunk.get("metadata")
                    metadata_json = json.dumps(metadata) if metadata else None
                    batch_data.append((
                        chunk["chunk_id"],
                        chunk["file_path"],
                        chunk.get("content"),
                        chunk.get("start_line"),
                        chunk.get("end_line"),
                        chunk.get("category"),
                        metadata_json,
                        chunk.get("source_index_db"),
                    ))

                conn.executemany(
                    '''
                    INSERT OR REPLACE INTO chunk_metadata
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata, source_index_db)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''',
                    batch_data
                )
                conn.commit()
                logger.debug("Batch inserted %d chunk metadata records", len(chunks))
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to batch insert chunks: {e}",
                    db_path=str(self.db_path),
                    operation="add_chunks"
                ) from e

    def get_chunks_by_ids(
        self,
        chunk_ids: List[int],
        category: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Retrieve chunks by their IDs - the key optimization.

        This is the primary method that replaces traversing all _index.db files.
        Provides O(1) lookup by chunk ID instead of O(n) where n is the number
        of index databases.

        Args:
            chunk_ids: List of chunk IDs to retrieve.
            category: Optional category filter ('code' or 'doc').

        Returns:
            List of dictionaries with chunk metadata:
            - chunk_id: Global chunk ID
            - file_path: Path to source file
            - content: Chunk text content
            - start_line: Start line in source file
            - end_line: End line in source file
            - category: Content category
            - metadata: Parsed metadata dictionary
            - source_index_db: Source _index.db path
        """
        if not chunk_ids:
            return []

        with self._lock:
            conn = self._get_connection()
            try:
                placeholders = ",".join("?" * len(chunk_ids))

                if category:
                    query = f'''
                        SELECT chunk_id, file_path, content, start_line, end_line,
                               category, metadata, source_index_db
                        FROM chunk_metadata
                        WHERE chunk_id IN ({placeholders}) AND category = ?
                    '''
                    params = list(chunk_ids) + [category]
                else:
                    query = f'''
                        SELECT chunk_id, file_path, content, start_line, end_line,
                               category, metadata, source_index_db
                        FROM chunk_metadata
                        WHERE chunk_id IN ({placeholders})
                    '''
                    params = list(chunk_ids)

                rows = conn.execute(query, params).fetchall()

                results = []
                for row in rows:
                    metadata = None
                    if row["metadata"]:
                        try:
                            metadata = json.loads(row["metadata"])
                        except json.JSONDecodeError:
                            metadata = {}

                    results.append({
                        "chunk_id": row["chunk_id"],
                        "file_path": row["file_path"],
                        "content": row["content"],
                        "start_line": row["start_line"],
                        "end_line": row["end_line"],
                        "category": row["category"],
                        "metadata": metadata or {},
                        "source_index_db": row["source_index_db"],
                    })

                return results

            except sqlite3.Error as e:
                logger.error("Failed to get chunks by IDs: %s", e)
                return []

    def get_chunk_count(self) -> int:
        """Get total number of chunks in store.

        Returns:
            Total chunk count.
        """
        with self._lock:
            conn = self._get_connection()
            try:
                row = conn.execute(
                    "SELECT COUNT(*) FROM chunk_metadata"
                ).fetchone()
                return row[0] if row else 0
            except sqlite3.Error:
                return 0

    def clear(self) -> None:
        """Clear all metadata."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute("DELETE FROM chunk_metadata")
                conn.commit()
                logger.info("Cleared all chunk metadata")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to clear metadata: {e}",
                    db_path=str(self.db_path),
                    operation="clear"
                ) from e

    def close(self) -> None:
        """Close database connection."""
        with self._lock:
            conn = getattr(self._local, "conn", None)
            if conn is not None:
                conn.close()
                self._local.conn = None

    def __enter__(self) -> "VectorMetadataStore":
        """Context manager entry."""
        self._ensure_schema()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit."""
        self.close()