feat: Add centralized vector storage and metadata management for embeddings

catlog22
2026-01-02 17:18:23 +08:00
parent 9157c5c78b
commit 0b6e9db8e4
5 changed files with 534 additions and 11 deletions


@@ -7,6 +7,7 @@ from .path_mapper import PathMapper
from .registry import RegistryStore, ProjectInfo, DirMapping
from .dir_index import DirIndexStore, SubdirLink, FileEntry
from .index_tree import IndexTreeBuilder, BuildResult, DirBuildResult
from .vector_meta_store import VectorMetadataStore

__all__ = [
    # Legacy (workspace-local)
@@ -25,5 +26,7 @@ __all__ = [
    "IndexTreeBuilder",
    "BuildResult",
    "DirBuildResult",
    # Vector metadata
    "VectorMetadataStore",
]
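
The re-export above lets callers import the store from the package root instead of the private module path. A minimal sketch (the package name codexlens.storage is an assumption; only the relative module .vector_meta_store is visible in this diff):

    # Hypothetical import path; adjust to the actual package layout.
    from codexlens.storage import VectorMetadataStore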


@@ -0,0 +1,331 @@
"""Central storage for vector metadata.
This module provides a centralized SQLite database for storing chunk metadata
associated with centralized vector indexes. Instead of traversing all _index.db
files to fetch chunk metadata, this provides O(1) lookup by chunk ID.
"""
from __future__ import annotations
import json
import logging
import sqlite3
import threading
from pathlib import Path
from typing import Any, Dict, List, Optional
from codexlens.errors import StorageError
logger = logging.getLogger(__name__)


class VectorMetadataStore:
    """Store and retrieve chunk metadata for centralized vector search.

    This class provides efficient storage and retrieval of chunk metadata
    for the centralized vector index architecture. All chunk metadata is
    stored in a single _vectors_meta.db file at the project root, enabling
    fast lookups without traversing multiple _index.db files.

    Schema:
        chunk_metadata:
            - chunk_id: INTEGER PRIMARY KEY - Global chunk ID
            - file_path: TEXT NOT NULL - Path to source file
            - content: TEXT - Chunk text content
            - start_line: INTEGER - Start line in source file
            - end_line: INTEGER - End line in source file
            - category: TEXT - Content category (code/doc)
            - metadata: TEXT - JSON-encoded additional metadata
            - source_index_db: TEXT - Path to source _index.db file
    """

    def __init__(self, db_path: Path | str) -> None:
        """Initialize VectorMetadataStore.

        Args:
            db_path: Path to SQLite database file.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        # Thread-safe connection management
        self._lock = threading.RLock()
        self._local = threading.local()

    def _get_connection(self) -> sqlite3.Connection:
        """Get or create a thread-local database connection.

        Each thread gets its own connection to ensure thread safety.
        """
        conn = getattr(self._local, "conn", None)
        if conn is None:
            conn = sqlite3.connect(
                str(self.db_path),
                timeout=30.0,
                check_same_thread=True,
            )
            conn.row_factory = sqlite3.Row
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
            conn.execute("PRAGMA mmap_size=1073741824")  # 1GB mmap
            self._local.conn = conn
        return conn

    def _ensure_schema(self) -> None:
        """Create tables if they don't exist."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS chunk_metadata (
                        chunk_id INTEGER PRIMARY KEY,
                        file_path TEXT NOT NULL,
                        content TEXT,
                        start_line INTEGER,
                        end_line INTEGER,
                        category TEXT,
                        metadata TEXT,
                        source_index_db TEXT
                    )
                ''')
                conn.execute(
                    'CREATE INDEX IF NOT EXISTS idx_chunk_file_path '
                    'ON chunk_metadata(file_path)'
                )
                conn.execute(
                    'CREATE INDEX IF NOT EXISTS idx_chunk_category '
                    'ON chunk_metadata(category)'
                )
                conn.commit()
                logger.debug("VectorMetadataStore schema created/verified")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to create schema: {e}",
                    db_path=str(self.db_path),
                    operation="_ensure_schema"
                ) from e

    def add_chunk(
        self,
        chunk_id: int,
        file_path: str,
        content: str,
        start_line: Optional[int] = None,
        end_line: Optional[int] = None,
        category: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        source_index_db: Optional[str] = None,
    ) -> None:
        """Add a single chunk's metadata.

        Args:
            chunk_id: Global unique chunk ID.
            file_path: Path to source file.
            content: Chunk text content.
            start_line: Start line in source file.
            end_line: End line in source file.
            category: Content category (code/doc).
            metadata: Additional metadata dictionary.
            source_index_db: Path to source _index.db file.
        """
        with self._lock:
            conn = self._get_connection()
            try:
                metadata_json = json.dumps(metadata) if metadata else None
                conn.execute(
                    '''
                    INSERT OR REPLACE INTO chunk_metadata
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata, source_index_db)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''',
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata_json, source_index_db)
                )
                conn.commit()
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to add chunk {chunk_id}: {e}",
                    db_path=str(self.db_path),
                    operation="add_chunk"
                ) from e

    def add_chunks(self, chunks: List[Dict[str, Any]]) -> None:
        """Batch insert chunk metadata.

        Args:
            chunks: List of dictionaries with keys:
                - chunk_id (required): Global unique chunk ID
                - file_path (required): Path to source file
                - content: Chunk text content
                - start_line: Start line in source file
                - end_line: End line in source file
                - category: Content category (code/doc)
                - metadata: Additional metadata dictionary
                - source_index_db: Path to source _index.db file
        """
        if not chunks:
            return

        with self._lock:
            conn = self._get_connection()
            try:
                batch_data = []
                for chunk in chunks:
                    metadata = chunk.get("metadata")
                    metadata_json = json.dumps(metadata) if metadata else None
                    batch_data.append((
                        chunk["chunk_id"],
                        chunk["file_path"],
                        chunk.get("content"),
                        chunk.get("start_line"),
                        chunk.get("end_line"),
                        chunk.get("category"),
                        metadata_json,
                        chunk.get("source_index_db"),
                    ))
                conn.executemany(
                    '''
                    INSERT OR REPLACE INTO chunk_metadata
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata, source_index_db)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''',
                    batch_data
                )
                conn.commit()
                logger.debug("Batch inserted %d chunk metadata records", len(chunks))
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to batch insert chunks: {e}",
                    db_path=str(self.db_path),
                    operation="add_chunks"
                ) from e

    def get_chunks_by_ids(
        self,
        chunk_ids: List[int],
        category: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Retrieve chunks by their IDs - the key optimization.

        This is the primary method that replaces traversing all _index.db
        files: every requested chunk ID is resolved with one indexed query
        against a single database, rather than one lookup per index database.

        Args:
            chunk_ids: List of chunk IDs to retrieve.
            category: Optional category filter ('code' or 'doc').

        Returns:
            List of dictionaries with chunk metadata:
                - chunk_id: Global chunk ID
                - file_path: Path to source file
                - content: Chunk text content
                - start_line: Start line in source file
                - end_line: End line in source file
                - category: Content category
                - metadata: Parsed metadata dictionary
                - source_index_db: Source _index.db path
        """
        if not chunk_ids:
            return []

        with self._lock:
            conn = self._get_connection()
            try:
                placeholders = ",".join("?" * len(chunk_ids))
                if category:
                    query = f'''
                        SELECT chunk_id, file_path, content, start_line, end_line,
                               category, metadata, source_index_db
                        FROM chunk_metadata
                        WHERE chunk_id IN ({placeholders}) AND category = ?
                    '''
                    params = list(chunk_ids) + [category]
                else:
                    query = f'''
                        SELECT chunk_id, file_path, content, start_line, end_line,
                               category, metadata, source_index_db
                        FROM chunk_metadata
                        WHERE chunk_id IN ({placeholders})
                    '''
                    params = list(chunk_ids)

                rows = conn.execute(query, params).fetchall()
                results = []
                for row in rows:
                    metadata = None
                    if row["metadata"]:
                        try:
                            metadata = json.loads(row["metadata"])
                        except json.JSONDecodeError:
                            metadata = {}
                    results.append({
                        "chunk_id": row["chunk_id"],
                        "file_path": row["file_path"],
                        "content": row["content"],
                        "start_line": row["start_line"],
                        "end_line": row["end_line"],
                        "category": row["category"],
                        "metadata": metadata or {},
                        "source_index_db": row["source_index_db"],
                    })
                return results
            except sqlite3.Error as e:
                logger.error("Failed to get chunks by IDs: %s", e)
                return []

    def get_chunk_count(self) -> int:
        """Get total number of chunks in store.

        Returns:
            Total chunk count.
        """
        with self._lock:
            conn = self._get_connection()
            try:
                row = conn.execute(
                    "SELECT COUNT(*) FROM chunk_metadata"
                ).fetchone()
                return row[0] if row else 0
            except sqlite3.Error:
                return 0

    def clear(self) -> None:
        """Clear all metadata."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute("DELETE FROM chunk_metadata")
                conn.commit()
                logger.info("Cleared all chunk metadata")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to clear metadata: {e}",
                    db_path=str(self.db_path),
                    operation="clear"
                ) from e

    def close(self) -> None:
        """Close the current thread's database connection."""
        with self._lock:
            conn = getattr(self._local, "conn", None)
            if conn is not None:
                conn.close()
                self._local.conn = None
def __enter__(self) -> "VectorMetadataStore":
"""Context manager entry."""
self._ensure_schema()
return self
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
"""Context manager exit."""
self.close()
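
For reference, a minimal usage sketch of the new store, pieced together from the methods above. The paths, chunk IDs, and contents are hypothetical; only the VectorMetadataStore API comes from this commit. Note that the schema is created in __enter__, so the store is used as a context manager here.

    from pathlib import Path

    # Hypothetical location for the centralized metadata database.
    store_path = Path("/tmp/project/_vectors_meta.db")

    with VectorMetadataStore(store_path) as store:  # __enter__ runs _ensure_schema()
        store.add_chunks([
            {
                "chunk_id": 1,
                "file_path": "src/app.py",
                "content": "def main(): ...",
                "start_line": 10,
                "end_line": 42,
                "category": "code",
                "metadata": {"symbol": "main"},
                "source_index_db": "src/_index.db",
            },
        ])

        # After a vector search returns chunk IDs, resolve their metadata
        # with one query instead of visiting each _index.db file.
        hits = store.get_chunks_by_ids([1], category="code")
        for chunk in hits:
            print(chunk["file_path"], chunk["start_line"], chunk["end_line"])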