Refactor code structure and remove redundant changes

catlog22
2026-01-24 14:47:47 +08:00
parent cf5fecd66d
commit f2b0a5bbc9
113 changed files with 43217 additions and 235 deletions


@@ -0,0 +1,32 @@
"""Storage backends for CodexLens."""
from __future__ import annotations
from .sqlite_store import SQLiteStore
from .path_mapper import PathMapper
from .registry import RegistryStore, ProjectInfo, DirMapping
from .dir_index import DirIndexStore, SubdirLink, FileEntry
from .index_tree import IndexTreeBuilder, BuildResult, DirBuildResult
from .vector_meta_store import VectorMetadataStore
__all__ = [
# Legacy (workspace-local)
"SQLiteStore",
# Path mapping
"PathMapper",
# Global registry
"RegistryStore",
"ProjectInfo",
"DirMapping",
# Directory index
"DirIndexStore",
"SubdirLink",
"FileEntry",
# Tree builder
"IndexTreeBuilder",
"BuildResult",
"DirBuildResult",
# Vector metadata
"VectorMetadataStore",
]
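
A minimal consumption sketch for the exports above; the import path follows the package layout, while the project directory and registry location are illustrative:

from pathlib import Path

from codexlens.storage import PathMapper, RegistryStore

mapper = PathMapper()                                        # source path <-> index path mapping
index_db = mapper.source_to_index_db(Path("/tmp/project"))   # hypothetical project directory

with RegistryStore() as registry:                            # global registry (~/.codexlens/registry.db)
    ...                                                      # register projects / look up mappings here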

File diff suppressed because it is too large.


@@ -0,0 +1,32 @@
"""Simple filesystem cache helpers."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@dataclass
class FileCache:
"""Caches file mtimes for incremental indexing."""
cache_path: Path
def load_mtime(self, path: Path) -> Optional[float]:
try:
key = self._key_for(path)
record = (self.cache_path / key).read_text(encoding="utf-8")
return float(record)
except Exception:
return None
def store_mtime(self, path: Path, mtime: float) -> None:
self.cache_path.mkdir(parents=True, exist_ok=True)
key = self._key_for(path)
(self.cache_path / key).write_text(str(mtime), encoding="utf-8")
def _key_for(self, path: Path) -> str:
safe = str(path).replace(":", "_").replace("\\", "_").replace("/", "_")
return f"{safe}.mtime"
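
A usage sketch for the cache above, with illustrative paths: compare the stored mtime against the file's current mtime to decide whether re-indexing is needed.

from pathlib import Path

cache = FileCache(cache_path=Path.home() / ".codexlens" / "cache")
source = Path("src/app.py")

current_mtime = source.stat().st_mtime
if cache.load_mtime(source) != current_mtime:
    # ... re-index the file here, then record the new mtime ...
    cache.store_mtime(source, current_mtime)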


@@ -0,0 +1,398 @@
"""Global cross-directory symbol index for fast lookups.
Stores symbols for an entire project in a single SQLite database so symbol search
does not require traversing every directory _index.db.
This index is updated incrementally during file indexing (delete+insert per file)
to avoid expensive batch rebuilds.
"""
from __future__ import annotations
import logging
import sqlite3
import threading
from pathlib import Path
from typing import List, Optional, Tuple
from codexlens.entities import Symbol
from codexlens.errors import StorageError
class GlobalSymbolIndex:
"""Project-wide symbol index with incremental updates."""
SCHEMA_VERSION = 1
DEFAULT_DB_NAME = "_global_symbols.db"
def __init__(self, db_path: str | Path, project_id: int) -> None:
self.db_path = Path(db_path).resolve()
self.project_id = int(project_id)
self._lock = threading.RLock()
self._conn: Optional[sqlite3.Connection] = None
self.logger = logging.getLogger(__name__)
def initialize(self) -> None:
"""Create database and schema if not exists."""
with self._lock:
self.db_path.parent.mkdir(parents=True, exist_ok=True)
conn = self._get_connection()
current_version = self._get_schema_version(conn)
if current_version > self.SCHEMA_VERSION:
raise StorageError(
f"Database schema version {current_version} is newer than "
f"supported version {self.SCHEMA_VERSION}. "
f"Please update the application or use a compatible database.",
db_path=str(self.db_path),
operation="initialize",
details={
"current_version": current_version,
"supported_version": self.SCHEMA_VERSION,
},
)
if current_version == 0:
self._create_schema(conn)
self._set_schema_version(conn, self.SCHEMA_VERSION)
elif current_version < self.SCHEMA_VERSION:
self._apply_migrations(conn, current_version)
self._set_schema_version(conn, self.SCHEMA_VERSION)
conn.commit()
def close(self) -> None:
"""Close database connection."""
with self._lock:
if self._conn is not None:
try:
self._conn.close()
except Exception:
pass
finally:
self._conn = None
def __enter__(self) -> "GlobalSymbolIndex":
self.initialize()
return self
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
self.close()
def add_symbol(self, symbol: Symbol, file_path: str | Path, index_path: str | Path) -> None:
"""Insert a single symbol (idempotent) for incremental updates."""
file_path_str = str(Path(file_path).resolve())
index_path_str = str(Path(index_path).resolve())
with self._lock:
conn = self._get_connection()
try:
conn.execute(
"""
INSERT INTO global_symbols(
project_id, symbol_name, symbol_kind,
file_path, start_line, end_line, index_path
)
VALUES(?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(
project_id, symbol_name, symbol_kind,
file_path, start_line, end_line
)
DO UPDATE SET
index_path=excluded.index_path
""",
(
self.project_id,
symbol.name,
symbol.kind,
file_path_str,
symbol.range[0],
symbol.range[1],
index_path_str,
),
)
conn.commit()
except sqlite3.DatabaseError as exc:
conn.rollback()
raise StorageError(
f"Failed to add symbol {symbol.name}: {exc}",
db_path=str(self.db_path),
operation="add_symbol",
) from exc
def update_file_symbols(
self,
file_path: str | Path,
symbols: List[Symbol],
index_path: str | Path | None = None,
) -> None:
"""Replace all symbols for a file atomically (delete + insert)."""
file_path_str = str(Path(file_path).resolve())
index_path_str: Optional[str]
if index_path is not None:
index_path_str = str(Path(index_path).resolve())
else:
index_path_str = self._get_existing_index_path(file_path_str)
with self._lock:
conn = self._get_connection()
try:
conn.execute("BEGIN")
conn.execute(
"DELETE FROM global_symbols WHERE project_id=? AND file_path=?",
(self.project_id, file_path_str),
)
if symbols:
if not index_path_str:
raise StorageError(
"index_path is required when inserting symbols for a new file",
db_path=str(self.db_path),
operation="update_file_symbols",
details={"file_path": file_path_str},
)
rows = [
(
self.project_id,
s.name,
s.kind,
file_path_str,
s.range[0],
s.range[1],
index_path_str,
)
for s in symbols
]
conn.executemany(
"""
INSERT INTO global_symbols(
project_id, symbol_name, symbol_kind,
file_path, start_line, end_line, index_path
)
VALUES(?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(
project_id, symbol_name, symbol_kind,
file_path, start_line, end_line
)
DO UPDATE SET
index_path=excluded.index_path
""",
rows,
)
conn.commit()
except sqlite3.DatabaseError as exc:
conn.rollback()
raise StorageError(
f"Failed to update symbols for {file_path_str}: {exc}",
db_path=str(self.db_path),
operation="update_file_symbols",
) from exc
def delete_file_symbols(self, file_path: str | Path) -> int:
"""Remove all symbols for a file. Returns number of rows deleted."""
file_path_str = str(Path(file_path).resolve())
with self._lock:
conn = self._get_connection()
try:
cur = conn.execute(
"DELETE FROM global_symbols WHERE project_id=? AND file_path=?",
(self.project_id, file_path_str),
)
conn.commit()
return int(cur.rowcount or 0)
except sqlite3.DatabaseError as exc:
conn.rollback()
raise StorageError(
f"Failed to delete symbols for {file_path_str}: {exc}",
db_path=str(self.db_path),
operation="delete_file_symbols",
) from exc
def search(
self,
name: str,
kind: Optional[str] = None,
limit: int = 50,
prefix_mode: bool = True,
) -> List[Symbol]:
"""Search symbols and return full Symbol objects."""
if prefix_mode:
pattern = f"{name}%"
else:
pattern = f"%{name}%"
with self._lock:
conn = self._get_connection()
if kind:
rows = conn.execute(
"""
SELECT symbol_name, symbol_kind, file_path, start_line, end_line
FROM global_symbols
WHERE project_id=? AND symbol_name LIKE ? AND symbol_kind=?
ORDER BY symbol_name
LIMIT ?
""",
(self.project_id, pattern, kind, limit),
).fetchall()
else:
rows = conn.execute(
"""
SELECT symbol_name, symbol_kind, file_path, start_line, end_line
FROM global_symbols
WHERE project_id=? AND symbol_name LIKE ?
ORDER BY symbol_name
LIMIT ?
""",
(self.project_id, pattern, limit),
).fetchall()
return [
Symbol(
name=row["symbol_name"],
kind=row["symbol_kind"],
range=(row["start_line"], row["end_line"]),
file=row["file_path"],
)
for row in rows
]
def search_symbols(
self,
name: str,
kind: Optional[str] = None,
limit: int = 50,
prefix_mode: bool = True,
) -> List[Tuple[str, Tuple[int, int]]]:
"""Search symbols and return only (file_path, (start_line, end_line))."""
symbols = self.search(name=name, kind=kind, limit=limit, prefix_mode=prefix_mode)
return [(s.file or "", s.range) for s in symbols]
def get_file_symbols(self, file_path: str | Path) -> List[Symbol]:
"""Get all symbols in a specific file, sorted by start_line.
Args:
file_path: Full path to the file
Returns:
List of Symbol objects sorted by start_line
"""
file_path_str = str(Path(file_path).resolve())
with self._lock:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT symbol_name, symbol_kind, file_path, start_line, end_line
FROM global_symbols
WHERE project_id=? AND file_path=?
ORDER BY start_line
""",
(self.project_id, file_path_str),
).fetchall()
return [
Symbol(
name=row["symbol_name"],
kind=row["symbol_kind"],
range=(row["start_line"], row["end_line"]),
file=row["file_path"],
)
for row in rows
]
def _get_existing_index_path(self, file_path_str: str) -> Optional[str]:
with self._lock:
conn = self._get_connection()
row = conn.execute(
"""
SELECT index_path
FROM global_symbols
WHERE project_id=? AND file_path=?
LIMIT 1
""",
(self.project_id, file_path_str),
).fetchone()
return str(row["index_path"]) if row else None
def _get_schema_version(self, conn: sqlite3.Connection) -> int:
try:
row = conn.execute("PRAGMA user_version").fetchone()
return int(row[0]) if row else 0
except Exception:
return 0
def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None:
conn.execute(f"PRAGMA user_version = {int(version)}")
def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None:
# No migrations yet (v1).
_ = (conn, from_version)
return
def _get_connection(self) -> sqlite3.Connection:
if self._conn is None:
self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
self._conn.row_factory = sqlite3.Row
self._conn.execute("PRAGMA journal_mode=WAL")
self._conn.execute("PRAGMA synchronous=NORMAL")
self._conn.execute("PRAGMA foreign_keys=ON")
self._conn.execute("PRAGMA mmap_size=30000000000")
return self._conn
def _create_schema(self, conn: sqlite3.Connection) -> None:
try:
conn.execute(
"""
CREATE TABLE IF NOT EXISTS global_symbols (
id INTEGER PRIMARY KEY,
project_id INTEGER NOT NULL,
symbol_name TEXT NOT NULL,
symbol_kind TEXT NOT NULL,
file_path TEXT NOT NULL,
start_line INTEGER,
end_line INTEGER,
index_path TEXT NOT NULL,
UNIQUE(
project_id, symbol_name, symbol_kind,
file_path, start_line, end_line
)
)
"""
)
# Required by optimization spec.
conn.execute(
"""
CREATE INDEX IF NOT EXISTS idx_global_symbols_name_kind
ON global_symbols(symbol_name, symbol_kind)
"""
)
# Used by common queries (project-scoped name lookups).
conn.execute(
"""
CREATE INDEX IF NOT EXISTS idx_global_symbols_project_name_kind
ON global_symbols(project_id, symbol_name, symbol_kind)
"""
)
conn.execute(
"""
CREATE INDEX IF NOT EXISTS idx_global_symbols_project_file
ON global_symbols(project_id, file_path)
"""
)
conn.execute(
"""
CREATE INDEX IF NOT EXISTS idx_global_symbols_project_index_path
ON global_symbols(project_id, index_path)
"""
)
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to initialize global symbol schema: {exc}",
db_path=str(self.db_path),
operation="_create_schema",
) from exc
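
A usage sketch for the index above. The Symbol constructor fields (name, kind, range, file) are inferred from how this module reads them; the paths and project id are illustrative.

from pathlib import Path

from codexlens.entities import Symbol

index_root = Path.home() / ".codexlens" / "indexes" / "proj"   # illustrative layout
symbols = [Symbol(name="parse_config", kind="function", range=(10, 42), file="src/config.py")]

with GlobalSymbolIndex(index_root / "_global_symbols.db", project_id=1) as gsi:
    gsi.update_file_symbols("src/config.py", symbols, index_path=index_root / "src" / "_index.db")
    hits = gsi.search("parse_", kind="function", limit=10)     # prefix match by default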

File diff suppressed because it is too large.


@@ -0,0 +1,136 @@
"""Merkle tree utilities for change detection.
This module provides a generic, file-system based Merkle tree implementation
that can be used to efficiently diff directory states.
"""
from __future__ import annotations
import hashlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Iterable, List, Optional
def sha256_bytes(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
def sha256_text(text: str) -> str:
return sha256_bytes(text.encode("utf-8", errors="ignore"))
@dataclass
class MerkleNode:
"""A Merkle node representing either a file (leaf) or directory (internal)."""
name: str
rel_path: str
hash: str
is_dir: bool
children: Dict[str, "MerkleNode"] = field(default_factory=dict)
def iter_files(self) -> Iterable["MerkleNode"]:
if not self.is_dir:
yield self
return
for child in self.children.values():
yield from child.iter_files()
@dataclass
class MerkleTree:
"""Merkle tree for a directory snapshot."""
root: MerkleNode
@classmethod
def build_from_directory(cls, root_dir: Path) -> "MerkleTree":
root_dir = Path(root_dir).resolve()
node = cls._build_node(root_dir, base=root_dir)
return cls(root=node)
@classmethod
def _build_node(cls, path: Path, *, base: Path) -> MerkleNode:
if path.is_file():
rel = str(path.relative_to(base)).replace("\\", "/")
return MerkleNode(
name=path.name,
rel_path=rel,
hash=sha256_bytes(path.read_bytes()),
is_dir=False,
)
if not path.is_dir():
rel = str(path.relative_to(base)).replace("\\", "/")
return MerkleNode(name=path.name, rel_path=rel, hash="", is_dir=False)
children: Dict[str, MerkleNode] = {}
for child in sorted(path.iterdir(), key=lambda p: p.name):
child_node = cls._build_node(child, base=base)
children[child_node.name] = child_node
items = [
f"{'d' if n.is_dir else 'f'}:{name}:{n.hash}"
for name, n in sorted(children.items(), key=lambda kv: kv[0])
]
dir_hash = sha256_text("\n".join(items))
rel_path = "." if path == base else str(path.relative_to(base)).replace("\\", "/")
return MerkleNode(
name="." if path == base else path.name,
rel_path=rel_path,
hash=dir_hash,
is_dir=True,
children=children,
)
@staticmethod
def find_changed_files(old: Optional["MerkleTree"], new: Optional["MerkleTree"]) -> List[str]:
"""Find changed/added/removed files between two trees.
Returns:
List of relative file paths (POSIX-style separators).
"""
if old is None and new is None:
return []
if old is None:
return sorted({n.rel_path for n in new.root.iter_files()}) # type: ignore[union-attr]
if new is None:
return sorted({n.rel_path for n in old.root.iter_files()})
changed: set[str] = set()
def walk(old_node: Optional[MerkleNode], new_node: Optional[MerkleNode]) -> None:
if old_node is None and new_node is None:
return
if old_node is None and new_node is not None:
changed.update(n.rel_path for n in new_node.iter_files())
return
if new_node is None and old_node is not None:
changed.update(n.rel_path for n in old_node.iter_files())
return
assert old_node is not None and new_node is not None
if old_node.hash == new_node.hash:
return
if not old_node.is_dir and not new_node.is_dir:
changed.add(new_node.rel_path)
return
if old_node.is_dir != new_node.is_dir:
changed.update(n.rel_path for n in old_node.iter_files())
changed.update(n.rel_path for n in new_node.iter_files())
return
names = set(old_node.children.keys()) | set(new_node.children.keys())
for name in names:
walk(old_node.children.get(name), new_node.children.get(name))
walk(old.root, new.root)
return sorted(changed)
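
A usage sketch for the tree above: snapshot a directory before and after edits, then diff the snapshots to get the relative paths that need re-indexing (the directory path is illustrative).

from pathlib import Path

project = Path("src")

before = MerkleTree.build_from_directory(project)
# ... files are edited, added, or removed here ...
after = MerkleTree.build_from_directory(project)

changed = MerkleTree.find_changed_files(before, after)   # relative POSIX-style paths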


@@ -0,0 +1,154 @@
"""
Manages database schema migrations.
This module provides a framework for applying versioned migrations to the SQLite
database. Migrations are discovered from the `codexlens.storage.migrations`
package and applied sequentially. The database schema version is tracked using
the `user_version` pragma.
"""
import importlib
import logging
import pkgutil
from pathlib import Path
from sqlite3 import Connection
from typing import Callable, List, NamedTuple
log = logging.getLogger(__name__)
class Migration(NamedTuple):
"""Represents a single database migration."""
version: int
name: str
upgrade: Callable[[Connection], None]
def discover_migrations() -> List[Migration]:
"""
Discovers and returns a sorted list of database migrations.
Migrations are expected to be in the `codexlens.storage.migrations` package,
with filenames in the format `migration_XXX_description.py`, where XXX is
the version number. Each migration module must contain an `upgrade` function
that takes a `sqlite3.Connection` object as its argument.
Returns:
A list of Migration objects, sorted by version.
"""
import codexlens.storage.migrations
migrations = []
package_path = Path(codexlens.storage.migrations.__file__).parent
for _, name, _ in pkgutil.iter_modules([str(package_path)]):
if name.startswith("migration_"):
try:
version = int(name.split("_")[1])
module = importlib.import_module(f"codexlens.storage.migrations.{name}")
if hasattr(module, "upgrade"):
migrations.append(
Migration(version=version, name=name, upgrade=module.upgrade)
)
else:
log.warning(f"Migration {name} is missing 'upgrade' function.")
except (ValueError, IndexError) as e:
log.warning(f"Could not parse migration name {name}: {e}")
except ImportError as e:
log.warning(f"Could not import migration {name}: {e}")
migrations.sort(key=lambda m: m.version)
return migrations
class MigrationManager:
"""
Manages the application of migrations to a database.
"""
def __init__(self, db_conn: Connection):
"""
Initializes the MigrationManager.
Args:
db_conn: The SQLite database connection.
"""
self.db_conn = db_conn
self.migrations = discover_migrations()
def get_current_version(self) -> int:
"""
Gets the current version of the database schema.
Returns:
The current schema version number.
"""
return self.db_conn.execute("PRAGMA user_version").fetchone()[0]
def set_version(self, version: int):
"""
Sets the database schema version.
Args:
version: The version number to set.
"""
self.db_conn.execute(f"PRAGMA user_version = {version}")
log.info(f"Database schema version set to {version}")
def apply_migrations(self):
"""
Applies all pending migrations to the database.
This method checks the current database version and applies all
subsequent migrations in order. Each migration is applied within
a transaction, unless the migration manages its own transactions.
"""
current_version = self.get_current_version()
log.info(f"Current database schema version: {current_version}")
for migration in self.migrations:
if migration.version > current_version:
log.info(f"Applying migration {migration.version}: {migration.name}...")
try:
# Check if a transaction is already in progress
in_transaction = self.db_conn.in_transaction
# Only start transaction if not already in one
if not in_transaction:
self.db_conn.execute("BEGIN")
migration.upgrade(self.db_conn)
self.set_version(migration.version)
# Only commit if we started the transaction and it's still active
if not in_transaction and self.db_conn.in_transaction:
self.db_conn.execute("COMMIT")
log.info(
f"Successfully applied migration {migration.version}: {migration.name}"
)
except Exception as e:
log.error(
f"Failed to apply migration {migration.version}: {migration.name}. Error: {e}",
exc_info=True,
)
# Try to rollback if transaction is active
try:
if self.db_conn.in_transaction:
self.db_conn.execute("ROLLBACK")
except Exception:
pass # Ignore rollback errors
raise
latest_migration_version = self.migrations[-1].version if self.migrations else 0
if current_version < latest_migration_version:
# Sanity check: after the loop, the recorded schema version should match the
# latest known migration. If it does not, the loop was interrupted before
# set_version ran for the final migration, so warn rather than guess.
final_version = self.get_current_version()
if final_version != latest_migration_version:
log.warning(f"Database version ({final_version}) is not the latest migration version ({latest_migration_version}). This may indicate a problem.")
log.info("All pending migrations applied successfully.")


@@ -0,0 +1 @@
# This file makes the 'migrations' directory a Python package.


@@ -0,0 +1,123 @@
"""
Migration 001: Normalize keywords into separate tables.
This migration introduces two new tables, `keywords` and `file_keywords`, to
store semantic keywords in a normalized fashion. It then migrates the existing
keywords from the `semantic_data` JSON blob in the `files` table into these
new tables. This is intended to speed up keyword-based searches significantly.
"""
import json
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to normalize keywords.
- Creates `keywords` and `file_keywords` tables.
- Creates indexes for efficient querying.
- Migrates data from `files.semantic_data` to the new tables.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Creating 'keywords' and 'file_keywords' tables...")
# Create a table to store unique keywords
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS keywords (
id INTEGER PRIMARY KEY,
keyword TEXT NOT NULL UNIQUE
)
"""
)
# Create a join table to link files and keywords (many-to-many)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS file_keywords (
file_id INTEGER NOT NULL,
keyword_id INTEGER NOT NULL,
PRIMARY KEY (file_id, keyword_id),
FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
)
"""
)
log.info("Creating indexes for new keyword tables...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)")
log.info("Migrating existing keywords from 'semantic_metadata' table...")
# Check if semantic_metadata table exists before querying
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'")
if not cursor.fetchone():
log.info("No 'semantic_metadata' table found, skipping data migration.")
return
# Check if 'keywords' column exists in semantic_metadata table
# (current schema may already use normalized tables without this column)
cursor.execute("PRAGMA table_info(semantic_metadata)")
columns = {row[1] for row in cursor.fetchall()}
if "keywords" not in columns:
log.info("No 'keywords' column in semantic_metadata table, skipping data migration.")
return
cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''")
files_to_migrate = cursor.fetchall()
if not files_to_migrate:
log.info("No existing files with semantic metadata to migrate.")
return
log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.")
for file_id, keywords_json in files_to_migrate:
if not keywords_json:
continue
try:
keywords = json.loads(keywords_json)
if not isinstance(keywords, list):
log.warning(f"Keywords for file_id {file_id} is not a list, skipping.")
continue
for keyword in keywords:
if not isinstance(keyword, str):
log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.")
continue
keyword = keyword.strip()
if not keyword:
continue
# Get or create keyword_id
cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,))
cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,))
keyword_id_result = cursor.fetchone()
if keyword_id_result:
keyword_id = keyword_id_result[0]
# Link file to keyword
cursor.execute(
"INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)",
(file_id, keyword_id),
)
else:
log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}")
except json.JSONDecodeError as e:
log.warning(f"Could not parse keywords for file_id {file_id}: {e}")
except Exception as e:
log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True)
log.info("Finished migrating keywords.")


@@ -0,0 +1,48 @@
"""
Migration 002: Add token_count and symbol_type to symbols table.
This migration adds token counting metadata to symbols for accurate chunk
splitting and performance optimization. It also adds symbol_type for better
filtering in searches.
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to add token metadata to symbols.
- Adds token_count column to symbols table
- Adds symbol_type column to symbols table (for future use)
- Creates index on symbol_type for efficient filtering
- Leaves token_count NULL for existing symbols (to be calculated lazily)
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Adding token_count column to symbols table...")
try:
cursor.execute("ALTER TABLE symbols ADD COLUMN token_count INTEGER")
log.info("Successfully added token_count column.")
except Exception as e:
# Column might already exist
log.warning(f"Could not add token_count column (might already exist): {e}")
log.info("Adding symbol_type column to symbols table...")
try:
cursor.execute("ALTER TABLE symbols ADD COLUMN symbol_type TEXT")
log.info("Successfully added symbol_type column.")
except Exception as e:
# Column might already exist
log.warning(f"Could not add symbol_type column (might already exist): {e}")
log.info("Creating index on symbol_type for efficient filtering...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
log.info("Migration 002 completed successfully.")


@@ -0,0 +1,232 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.
This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching
Both tables are synchronized with the files table via triggers for automatic updates.
"""
import logging
from sqlite3 import Connection
from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to add dual FTS tables.
- Drops old files_fts table and triggers
- Creates files_fts_exact with unicode61 tokenizer
- Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
- Creates synchronized triggers for both tables
- Rebuilds FTS indexes from files table
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
try:
# Check trigram support
has_trigram = check_trigram_support(db_conn)
version = get_sqlite_version(db_conn)
log.info(f"SQLite version: {'.'.join(map(str, version))}")
if has_trigram:
log.info("Trigram tokenizer available, using for fuzzy FTS table")
fuzzy_tokenizer = "trigram"
else:
log.warning(
f"Trigram tokenizer not available (requires SQLite >= 3.34), "
f"using extended unicode61 tokenizer for fuzzy matching"
)
fuzzy_tokenizer = "unicode61 tokenchars '_-.'"
# Start transaction
cursor.execute("BEGIN TRANSACTION")
# Check if files table has 'name' column (v2 schema doesn't have it)
cursor.execute("PRAGMA table_info(files)")
columns = {row[1] for row in cursor.fetchall()}
if 'name' not in columns:
log.info("Adding 'name' column to files table (v2 schema upgrade)...")
# Add name column
cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
# Populate name from path (extract filename from last '/')
# Use Python to do the extraction since SQLite doesn't have reverse()
cursor.execute("SELECT rowid, path FROM files")
rows = cursor.fetchall()
for rowid, path in rows:
# Extract filename from path
name = path.split('/')[-1] if '/' in path else path
cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))
# Rename 'path' column to 'full_path' if needed
if 'path' in columns and 'full_path' not in columns:
log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
# Check if indexed_at column exists in v2 schema
has_indexed_at = 'indexed_at' in columns
has_mtime = 'mtime' in columns
# SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
cursor.execute("""
CREATE TABLE files_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
full_path TEXT NOT NULL UNIQUE,
content TEXT,
language TEXT,
mtime REAL,
indexed_at TEXT
)
""")
# Build INSERT statement based on available columns
# Note: the v2 schema keyed files by path, so let AUTOINCREMENT assign new ids here
if has_indexed_at and has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
SELECT name, path, content, language, mtime, indexed_at FROM files
""")
elif has_indexed_at:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, indexed_at)
SELECT name, path, content, language, indexed_at FROM files
""")
elif has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime)
SELECT name, path, content, language, mtime FROM files
""")
else:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language)
SELECT name, path, content, language FROM files
""")
cursor.execute("DROP TABLE files")
cursor.execute("ALTER TABLE files_new RENAME TO files")
log.info("Dropping old FTS triggers and table...")
# Drop old triggers
cursor.execute("DROP TRIGGER IF EXISTS files_ai")
cursor.execute("DROP TRIGGER IF EXISTS files_ad")
cursor.execute("DROP TRIGGER IF EXISTS files_au")
# Drop old FTS table
cursor.execute("DROP TABLE IF EXISTS files_fts")
# Create exact FTS table (unicode61 with underscores/hyphens/dots as token chars)
# Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
log.info("Creating files_fts_exact table with unicode61 tokenizer...")
cursor.execute(
"""
CREATE VIRTUAL TABLE files_fts_exact USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="unicode61 tokenchars '_-.'"
)
"""
)
# Create fuzzy FTS table (trigram or extended unicode61)
log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
cursor.execute(
f"""
CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="{fuzzy_tokenizer}"
)
"""
)
# Create synchronized triggers for files_fts_exact
log.info("Creating triggers for files_fts_exact...")
cursor.execute(
"""
CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Create synchronized triggers for files_fts_fuzzy
log.info("Creating triggers for files_fts_fuzzy...")
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Rebuild FTS indexes from files table
log.info("Rebuilding FTS indexes from files table...")
cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")
# Commit transaction
cursor.execute("COMMIT")
log.info("Migration 004 completed successfully")
# Vacuum to reclaim space (outside transaction)
try:
log.info("Running VACUUM to reclaim space...")
cursor.execute("VACUUM")
except Exception as e:
log.warning(f"VACUUM failed (non-critical): {e}")
except Exception as e:
log.error(f"Migration 004 failed: {e}")
try:
cursor.execute("ROLLBACK")
except Exception:
pass
raise
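
With both tables in place, callers can pick exact or substring-style matching per query. A minimal sketch against the columns created above (the search-term handling is an assumption, not code from this commit):

import sqlite3

def dual_fts_search(conn: sqlite3.Connection, term: str, limit: int = 10):
    # Exact token lookup: unicode61 with tokenchars '_-.' keeps qualified names whole.
    exact = conn.execute(
        "SELECT full_path FROM files_fts_exact WHERE files_fts_exact MATCH ? LIMIT ?",
        (f'"{term}"', limit),
    ).fetchall()
    # Substring-style lookup against the trigram-backed (or extended unicode61) table.
    fuzzy = conn.execute(
        "SELECT full_path FROM files_fts_fuzzy WHERE files_fts_fuzzy MATCH ? LIMIT ?",
        (f'"{term}"', limit),
    ).fetchall()
    return exact, fuzzy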


@@ -0,0 +1,196 @@
"""
Migration 005: Remove unused and redundant database fields.
This migration removes four problematic fields identified by Gemini analysis:
1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
- Data: Migrated to normalized file_keywords table in migration 001
- Impact: Column now redundant, remove to prevent sync issues
2. **symbols.token_count** (unused - always NULL)
- Data: Never populated, always NULL
- Impact: No data loss, just removes unused column
3. **symbols.symbol_type** (redundant - duplicates kind)
- Data: Redundant with symbols.kind field
- Impact: No data loss, kind field contains same information
4. **subdirs.direct_files** (unused - never displayed)
- Data: Never used in queries or display logic
- Impact: No data loss, just removes unused column
Schema changes use table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""Remove unused and redundant fields from schema.
Note: Transaction management is handled by MigrationManager.
This migration should NOT start its own transaction.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
# Step 1: Remove semantic_metadata.keywords (if column exists)
log.info("Checking semantic_metadata.keywords column...")
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'"
)
if cursor.fetchone():
# Check if keywords column exists
cursor.execute("PRAGMA table_info(semantic_metadata)")
columns = {row[1] for row in cursor.fetchall()}
if "keywords" in columns:
log.info("Removing semantic_metadata.keywords column...")
cursor.execute("""
CREATE TABLE semantic_metadata_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL UNIQUE,
summary TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at)
SELECT id, file_id, summary, purpose, llm_tool, generated_at
FROM semantic_metadata
""")
cursor.execute("DROP TABLE semantic_metadata")
cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata")
# Recreate index
cursor.execute(
"CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)"
)
log.info("Removed semantic_metadata.keywords column")
else:
log.info("semantic_metadata.keywords column does not exist, skipping")
else:
log.info("semantic_metadata table does not exist, skipping")
# Step 2: Remove symbols.token_count and symbols.symbol_type (if columns exist)
log.info("Checking symbols.token_count and symbols.symbol_type columns...")
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'"
)
if cursor.fetchone():
# Check if token_count or symbol_type columns exist
cursor.execute("PRAGMA table_info(symbols)")
columns = {row[1] for row in cursor.fetchall()}
if "token_count" in columns or "symbol_type" in columns:
log.info("Removing symbols.token_count and symbols.symbol_type columns...")
cursor.execute("""
CREATE TABLE symbols_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_id INTEGER NOT NULL,
name TEXT NOT NULL,
kind TEXT,
start_line INTEGER,
end_line INTEGER,
FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
)
""")
cursor.execute("""
INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line)
SELECT id, file_id, name, kind, start_line, end_line
FROM symbols
""")
cursor.execute("DROP TABLE symbols")
cursor.execute("ALTER TABLE symbols_new RENAME TO symbols")
# Recreate indexes
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
log.info("Removed symbols.token_count and symbols.symbol_type columns")
else:
log.info("symbols.token_count/symbol_type columns do not exist, skipping")
else:
log.info("symbols table does not exist, skipping")
# Step 3: Remove subdirs.direct_files (if column exists)
log.info("Checking subdirs.direct_files column...")
cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'"
)
if cursor.fetchone():
# Check if direct_files column exists
cursor.execute("PRAGMA table_info(subdirs)")
columns = {row[1] for row in cursor.fetchall()}
if "direct_files" in columns:
log.info("Removing subdirs.direct_files column...")
cursor.execute("""
CREATE TABLE subdirs_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
last_updated REAL
)
""")
cursor.execute("""
INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated)
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
""")
cursor.execute("DROP TABLE subdirs")
cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs")
# Recreate index
cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
log.info("Removed subdirs.direct_files column")
else:
log.info("subdirs.direct_files column does not exist, skipping")
else:
log.info("subdirs table does not exist, skipping")
log.info("Migration 005 completed successfully")
# Vacuum to reclaim space (outside transaction, optional)
# Note: VACUUM cannot run inside a transaction, so we skip it here
# The caller can run VACUUM separately if desired
def downgrade(db_conn: Connection):
"""Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files).
This is a placeholder - true downgrade is not feasible as data is lost.
The migration is designed to be one-way since removed fields are unused/redundant.
Args:
db_conn: The SQLite database connection.
"""
log.warning(
"Migration 005 downgrade not supported - removed fields are unused/redundant. "
"Data cannot be restored."
)
raise NotImplementedError(
"Migration 005 downgrade not supported - this is a one-way migration"
)


@@ -0,0 +1,37 @@
"""
Migration 006: Ensure relationship tables and indexes exist.
This migration is intentionally idempotent. It creates the `code_relationships`
table (used for graph visualization) and its indexes if missing.
"""
from __future__ import annotations
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Ensuring code_relationships table exists...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS code_relationships (
id INTEGER PRIMARY KEY,
source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE,
target_qualified_name TEXT NOT NULL,
relationship_type TEXT NOT NULL,
source_line INTEGER NOT NULL,
target_file TEXT
)
"""
)
log.info("Ensuring relationship indexes exist...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")


@@ -0,0 +1,47 @@
"""
Migration 007: Add precomputed graph neighbor table for search expansion.
Adds:
- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids)
This table is derived data (a cache) and is safe to rebuild at any time.
The migration is intentionally idempotent.
"""
from __future__ import annotations
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Creating graph_neighbors table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS graph_neighbors (
source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
relationship_depth INTEGER NOT NULL,
PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
)
"""
)
log.info("Creating indexes for graph_neighbors...")
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth
ON graph_neighbors(source_symbol_id, relationship_depth)
"""
)
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor
ON graph_neighbors(neighbor_symbol_id)
"""
)
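
A lookup sketch over the cache table above: fetch the precomputed neighbors of a symbol up to a maximum hop depth.

import sqlite3

def neighbors_within(conn: sqlite3.Connection, symbol_id: int, max_depth: int = 2):
    # Served entirely from the precomputed cache; rebuild the table if it is stale.
    return conn.execute(
        """
        SELECT neighbor_symbol_id, relationship_depth
        FROM graph_neighbors
        WHERE source_symbol_id = ? AND relationship_depth <= ?
        ORDER BY relationship_depth
        """,
        (symbol_id, max_depth),
    ).fetchall()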


@@ -0,0 +1,81 @@
"""
Migration 008: Add Merkle hash tables for content-based incremental indexing.
Adds:
- merkle_hashes: per-file SHA-256 hashes (keyed by file_id)
- merkle_state: directory-level root hash (single row, id=1)
Backfills merkle_hashes using the existing `files.content` column when available.
"""
from __future__ import annotations
import hashlib
import logging
import time
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Creating merkle_hashes table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_hashes (
file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
sha256 TEXT NOT NULL,
updated_at REAL
)
"""
)
log.info("Creating merkle_state table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_state (
id INTEGER PRIMARY KEY CHECK (id = 1),
root_hash TEXT,
updated_at REAL
)
"""
)
# Backfill file hashes from stored content (best-effort).
try:
rows = cursor.execute("SELECT id, content FROM files").fetchall()
except Exception as exc:
log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc)
return
now = time.time()
inserts: list[tuple[int, str, float]] = []
for row in rows:
file_id = int(row[0])
content = row[1]
if content is None:
continue
try:
digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest()
inserts.append((file_id, digest, now))
except Exception:
continue
if not inserts:
return
log.info("Backfilling %d file hashes...", len(inserts))
cursor.executemany(
"""
INSERT INTO merkle_hashes(file_id, sha256, updated_at)
VALUES(?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
sha256=excluded.sha256,
updated_at=excluded.updated_at
""",
inserts,
)
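
A sketch of how the backfilled hashes can drive change detection later: hash the current content the same way as the backfill above and compare against the stored digest.

import hashlib
import sqlite3

def content_changed(conn: sqlite3.Connection, file_id: int, current_content: str) -> bool:
    digest = hashlib.sha256(current_content.encode("utf-8", errors="ignore")).hexdigest()
    row = conn.execute(
        "SELECT sha256 FROM merkle_hashes WHERE file_id = ?", (file_id,)
    ).fetchone()
    return row is None or row[0] != digest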


@@ -0,0 +1,103 @@
"""
Migration 009: Add SPLADE sparse retrieval tables.
This migration introduces SPLADE (Sparse Lexical AnD Expansion) support:
- splade_metadata: Model configuration (model name, vocab size, ONNX path)
- splade_posting_list: Inverted index mapping token_id -> (chunk_id, weight)
The SPLADE tables are designed for efficient sparse vector retrieval:
- Token-based lookup for query expansion
- Chunk-based deletion for index maintenance
- Maintains backward compatibility with existing FTS tables
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
"""
Adds SPLADE tables for sparse retrieval.
Creates:
- splade_metadata: Stores model configuration and ONNX path
- splade_posting_list: Inverted index with token_id -> (chunk_id, weight) mappings
- Indexes for efficient token-based and chunk-based lookups
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Creating splade_metadata table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS splade_metadata (
id INTEGER PRIMARY KEY DEFAULT 1,
model_name TEXT NOT NULL,
vocab_size INTEGER NOT NULL,
onnx_path TEXT,
created_at REAL
)
"""
)
log.info("Creating splade_posting_list table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS splade_posting_list (
token_id INTEGER NOT NULL,
chunk_id INTEGER NOT NULL,
weight REAL NOT NULL,
PRIMARY KEY (token_id, chunk_id),
FOREIGN KEY (chunk_id) REFERENCES semantic_chunks(id) ON DELETE CASCADE
)
"""
)
log.info("Creating indexes for splade_posting_list...")
# Index for efficient chunk-based lookups (deletion, updates)
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_splade_by_chunk
ON splade_posting_list(chunk_id)
"""
)
# Index for efficient term-based retrieval
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_splade_by_token
ON splade_posting_list(token_id)
"""
)
log.info("Migration 009 completed successfully")
def downgrade(db_conn: Connection) -> None:
"""
Removes SPLADE tables.
Drops:
- splade_posting_list (and associated indexes)
- splade_metadata
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Dropping SPLADE indexes...")
cursor.execute("DROP INDEX IF EXISTS idx_splade_by_chunk")
cursor.execute("DROP INDEX IF EXISTS idx_splade_by_token")
log.info("Dropping splade_posting_list table...")
cursor.execute("DROP TABLE IF EXISTS splade_posting_list")
log.info("Dropping splade_metadata table...")
cursor.execute("DROP TABLE IF EXISTS splade_metadata")
log.info("Migration 009 downgrade completed successfully")


@@ -0,0 +1,162 @@
"""
Migration 010: Add multi-vector storage support for cascade retrieval.
This migration introduces the chunks table with multi-vector support:
- chunks: Stores code chunks with multiple embedding types
- embedding: Original embedding for backward compatibility
- embedding_binary: 256-dim binary vector for coarse ranking (fast)
- embedding_dense: 2048-dim dense vector for fine ranking (precise)
The multi-vector architecture enables cascade retrieval:
1. First stage: Fast binary vector search for candidate retrieval
2. Second stage: Dense vector reranking for precision
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
"""
Adds chunks table with multi-vector embedding columns.
Creates:
- chunks: Table for storing code chunks with multiple embedding types
- idx_chunks_file_path: Index for efficient file-based lookups
Also migrates existing chunks tables by adding new columns if needed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
# Check if chunks table already exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if table_exists:
# Migrate existing table - add new columns if missing
log.info("chunks table exists, checking for missing columns...")
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
if "embedding_binary" not in existing_columns:
log.info("Adding embedding_binary column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
)
if "embedding_dense" not in existing_columns:
log.info("Adding embedding_dense column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
)
else:
# Create new table with all columns
log.info("Creating chunks table with multi-vector support...")
cursor.execute(
"""
CREATE TABLE chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
embedding_binary BLOB,
embedding_dense BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
# Create index for file-based lookups
log.info("Creating index for chunks table...")
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 completed successfully")
def downgrade(db_conn: Connection) -> None:
"""
Removes multi-vector columns from chunks table.
Note: This does not drop the chunks table entirely to preserve data.
Only the new columns added by this migration are removed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Removing multi-vector columns from chunks table...")
# SQLite doesn't support DROP COLUMN directly in older versions
# We need to recreate the table without the columns
# Check if chunks table exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if not table_exists:
log.info("chunks table does not exist, nothing to downgrade")
return
# Check if the columns exist before trying to remove them
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
needs_migration = (
"embedding_binary" in existing_columns or
"embedding_dense" in existing_columns
)
if not needs_migration:
log.info("Multi-vector columns not present, nothing to remove")
return
# Recreate table without the new columns
log.info("Recreating chunks table without multi-vector columns...")
cursor.execute(
"""
CREATE TABLE chunks_backup (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
cursor.execute(
"""
INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at)
SELECT id, file_path, content, embedding, metadata, created_at FROM chunks
"""
)
cursor.execute("DROP TABLE chunks")
cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks")
# Recreate index
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 downgrade completed successfully")


@@ -0,0 +1,300 @@
"""Path mapping utilities for source paths and index paths.
This module provides bidirectional mapping between source code directories
and their corresponding index storage locations.
Storage Structure:
~/.codexlens/
├── registry.db # Global mapping table
└── indexes/
└── D/
└── Claude_dms3/
├── _index.db # Root directory index
└── src/
└── _index.db # src/ directory index
"""
import json
import os
import platform
from pathlib import Path
from typing import Optional
def _get_configured_index_root() -> Path:
"""Get the index root from environment or config file.
Priority order:
1. CODEXLENS_INDEX_DIR environment variable
2. index_dir from ~/.codexlens/config.json
3. Default: ~/.codexlens/indexes
"""
env_override = os.getenv("CODEXLENS_INDEX_DIR")
if env_override:
return Path(env_override).expanduser().resolve()
config_file = Path.home() / ".codexlens" / "config.json"
if config_file.exists():
try:
cfg = json.loads(config_file.read_text(encoding="utf-8"))
if "index_dir" in cfg:
return Path(cfg["index_dir"]).expanduser().resolve()
except (json.JSONDecodeError, OSError):
pass
return Path.home() / ".codexlens" / "indexes"
class PathMapper:
"""Bidirectional mapping tool for source paths ↔ index paths.
Handles cross-platform path normalization and conversion between
source code directories and their index storage locations.
Attributes:
DEFAULT_INDEX_ROOT: Default root directory for all indexes
INDEX_DB_NAME: Standard name for index database files
index_root: Configured index root directory
"""
DEFAULT_INDEX_ROOT = _get_configured_index_root()
INDEX_DB_NAME = "_index.db"
def __init__(self, index_root: Optional[Path] = None):
"""Initialize PathMapper with optional custom index root.
Args:
index_root: Custom index root directory. If None, uses DEFAULT_INDEX_ROOT.
"""
self.index_root = (index_root or self.DEFAULT_INDEX_ROOT).resolve()
def source_to_index_dir(self, source_path: Path) -> Path:
"""Convert source directory to its index directory path.
Maps a source code directory to where its index data should be stored.
The mapping preserves the directory structure but normalizes paths
for cross-platform compatibility.
Args:
source_path: Source directory path to map
Returns:
Index directory path under index_root
Examples:
>>> mapper = PathMapper()
>>> mapper.source_to_index_dir(Path("D:/Claude_dms3/src"))
PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src')
>>> mapper.source_to_index_dir(Path("/home/user/project"))
PosixPath('/home/user/.codexlens/indexes/home/user/project')
"""
source_path = source_path.resolve()
normalized = self.normalize_path(source_path)
return self.index_root / normalized
def source_to_index_db(self, source_path: Path) -> Path:
"""Convert source directory to its index database file path.
Maps a source directory to the full path of its index database file,
including the standard INDEX_DB_NAME.
Args:
source_path: Source directory path to map
Returns:
Full path to the index database file
Examples:
>>> mapper = PathMapper()
>>> mapper.source_to_index_db(Path("D:/Claude_dms3/src"))
PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src/_index.db')
"""
index_dir = self.source_to_index_dir(source_path)
return index_dir / self.INDEX_DB_NAME
def index_to_source(self, index_path: Path) -> Path:
"""Convert index path back to original source path.
Performs reverse mapping from an index storage location to the
original source directory. Handles both directory paths and
database file paths.
Args:
index_path: Index directory or database file path
Returns:
Original source directory path
Raises:
ValueError: If index_path is not under index_root
Examples:
>>> mapper = PathMapper()
>>> mapper.index_to_source(
... Path("~/.codexlens/indexes/D/Claude_dms3/src/_index.db")
... )
WindowsPath('D:/Claude_dms3/src')
>>> mapper.index_to_source(
... Path("~/.codexlens/indexes/D/Claude_dms3/src")
... )
WindowsPath('D:/Claude_dms3/src')
"""
index_path = index_path.resolve()
# Remove _index.db if present
if index_path.name == self.INDEX_DB_NAME:
index_path = index_path.parent
# Verify path is under index_root
try:
relative = index_path.relative_to(self.index_root)
except ValueError:
raise ValueError(
f"Index path {index_path} is not under index root {self.index_root}"
)
# Convert normalized path back to source path
normalized_str = str(relative).replace("\\", "/")
return self.denormalize_path(normalized_str)
def get_project_root(self, source_path: Path) -> Path:
"""Find the project root directory (topmost indexed directory).
Walks up the directory tree to find the highest-level directory
that has an index database.
Args:
source_path: Source directory to start from
Returns:
Project root directory path. Returns source_path itself if
no parent index is found.
Examples:
>>> mapper = PathMapper()
>>> mapper.get_project_root(Path("D:/Claude_dms3/src/codexlens"))
WindowsPath('D:/Claude_dms3')
"""
source_path = source_path.resolve()
current = source_path
project_root = source_path
# Walk up the tree
while current.parent != current: # Stop at filesystem root
parent_index_db = self.source_to_index_db(current.parent)
if parent_index_db.exists():
project_root = current.parent
current = current.parent
else:
break
return project_root
def get_relative_depth(self, source_path: Path, project_root: Path) -> int:
"""Calculate directory depth relative to project root.
Args:
source_path: Target directory path
project_root: Project root directory path
Returns:
Number of directory levels from project_root to source_path
Raises:
ValueError: If source_path is not under project_root
Examples:
>>> mapper = PathMapper()
>>> mapper.get_relative_depth(
... Path("D:/Claude_dms3/src/codexlens"),
... Path("D:/Claude_dms3")
... )
2
"""
source_path = source_path.resolve()
project_root = project_root.resolve()
try:
relative = source_path.relative_to(project_root)
# Count path components
return len(relative.parts)
except ValueError:
raise ValueError(
f"Source path {source_path} is not under project root {project_root}"
)
def normalize_path(self, path: Path) -> str:
"""Normalize path to cross-platform storage format.
Converts OS-specific paths to a standardized format for storage:
- Windows: Removes drive colons (D: → D)
- Unix: Removes leading slash
- Uses forward slashes throughout
Args:
path: Path to normalize
Returns:
Normalized path string
Examples:
>>> mapper = PathMapper()
>>> mapper.normalize_path(Path("D:/path/to/dir"))
'D/path/to/dir'
>>> mapper.normalize_path(Path("/home/user/path"))
'home/user/path'
"""
path = path.resolve()
path_str = str(path)
# Handle Windows paths with drive letters
if platform.system() == "Windows" and len(path.parts) > 0:
# Convert D:\path\to\dir → D/path/to/dir
drive = path.parts[0].replace(":", "") # D: → D
rest = Path(*path.parts[1:]) if len(path.parts) > 1 else Path()
normalized = f"{drive}/{rest}".replace("\\", "/")
return normalized.rstrip("/")
# Handle Unix paths
# /home/user/path → home/user/path
return path_str.lstrip("/").replace("\\", "/")
def denormalize_path(self, normalized: str) -> Path:
"""Convert normalized path back to OS-specific path.
Reverses the normalization process to restore OS-native path format:
- Windows: Adds drive colons (D → D:)
- Unix: Adds leading slash
Args:
normalized: Normalized path string
Returns:
OS-specific Path object
Examples:
>>> mapper = PathMapper()
>>> mapper.denormalize_path("D/path/to/dir") # On Windows
WindowsPath('D:/path/to/dir')
>>> mapper.denormalize_path("home/user/path") # On Unix
PosixPath('/home/user/path')
"""
parts = normalized.split("/")
# Handle Windows paths
if platform.system() == "Windows" and len(parts) > 0:
# Check if first part is a drive letter
if len(parts[0]) == 1 and parts[0].isalpha():
# D/path/to/dir → D:/path/to/dir
drive = f"{parts[0]}:"
if len(parts) > 1:
return Path(drive) / Path(*parts[1:])
return Path(drive)
# Handle Unix paths or relative paths
# home/user/path → /home/user/path
return Path("/") / Path(*parts)
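
# Usage sketch (illustrative only; assumes PathMapper() can be constructed with
# defaults, as in the doctests above).
if __name__ == "__main__":
    mapper = PathMapper()
    src = Path.cwd()
    normalized = mapper.normalize_path(src)             # e.g. 'D/Claude_dms3' or 'home/user/project'
    restored = mapper.denormalize_path(normalized)      # round-trips to an OS-native Path
    depth = mapper.get_relative_depth(src, src.parent)  # one level below its parent -> 1
    print(normalized, restored, depth)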

View File

@@ -0,0 +1,683 @@
"""Global project registry for CodexLens - SQLite storage."""
from __future__ import annotations
import platform
import sqlite3
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from codexlens.errors import StorageError
@dataclass
class ProjectInfo:
"""Registered project information."""
id: int
source_root: Path
index_root: Path
created_at: float
last_indexed: float
total_files: int
total_dirs: int
status: str
@dataclass
class DirMapping:
"""Directory to index path mapping."""
id: int
project_id: int
source_path: Path
index_path: Path
depth: int
files_count: int
last_updated: float
class RegistryStore:
"""Global project registry - SQLite storage.
Manages indexed projects and directory-to-index path mappings.
Thread-safe with connection pooling.
"""
DEFAULT_DB_PATH = Path.home() / ".codexlens" / "registry.db"
def __init__(self, db_path: Path | None = None) -> None:
self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve()
self._lock = threading.RLock()
self._local = threading.local()
self._pool_lock = threading.Lock()
self._pool: Dict[int, sqlite3.Connection] = {}
self._pool_generation = 0
def _get_connection(self) -> sqlite3.Connection:
"""Get or create a thread-local database connection."""
thread_id = threading.get_ident()
if getattr(self._local, "generation", None) == self._pool_generation:
conn = getattr(self._local, "conn", None)
if conn is not None:
return conn
with self._pool_lock:
conn = self._pool.get(thread_id)
if conn is None:
conn = sqlite3.connect(self.db_path, check_same_thread=False)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
conn.execute("PRAGMA foreign_keys=ON")
self._pool[thread_id] = conn
self._local.conn = conn
self._local.generation = self._pool_generation
return conn
def close(self) -> None:
"""Close all pooled connections."""
with self._lock:
with self._pool_lock:
for conn in self._pool.values():
conn.close()
self._pool.clear()
self._pool_generation += 1
if hasattr(self._local, "conn"):
self._local.conn = None
if hasattr(self._local, "generation"):
self._local.generation = self._pool_generation
def __enter__(self) -> RegistryStore:
self.initialize()
return self
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
self.close()
def initialize(self) -> None:
"""Create database and schema."""
with self._lock:
self.db_path.parent.mkdir(parents=True, exist_ok=True)
conn = self._get_connection()
self._create_schema(conn)
def _create_schema(self, conn: sqlite3.Connection) -> None:
"""Create database schema."""
try:
conn.execute(
"""
CREATE TABLE IF NOT EXISTS projects (
id INTEGER PRIMARY KEY,
source_root TEXT UNIQUE NOT NULL,
index_root TEXT NOT NULL,
created_at REAL,
last_indexed REAL,
total_files INTEGER DEFAULT 0,
total_dirs INTEGER DEFAULT 0,
status TEXT DEFAULT 'active'
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS dir_mapping (
id INTEGER PRIMARY KEY,
project_id INTEGER REFERENCES projects(id) ON DELETE CASCADE,
source_path TEXT NOT NULL,
index_path TEXT NOT NULL,
depth INTEGER,
files_count INTEGER DEFAULT 0,
last_updated REAL,
UNIQUE(source_path)
)
"""
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_dir_source ON dir_mapping(source_path)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_dir_project ON dir_mapping(project_id)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_project_source ON projects(source_root)"
)
conn.commit()
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to initialize registry schema: {exc}") from exc
def _normalize_path_for_comparison(self, path: Path) -> str:
"""Normalize paths for comparisons and storage.
Windows paths are treated as case-insensitive, so normalize to lowercase.
Unix platforms preserve case sensitivity.
"""
path_str = str(path)
if platform.system() == "Windows":
return path_str.lower()
return path_str
# === Project Operations ===
def register_project(self, source_root: Path, index_root: Path) -> ProjectInfo:
"""Register a new project or update existing one.
Args:
source_root: Source code root directory
index_root: Index storage root directory
Returns:
ProjectInfo for the registered project
"""
with self._lock:
conn = self._get_connection()
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
index_root_str = str(index_root.resolve())
now = time.time()
conn.execute(
"""
INSERT INTO projects(source_root, index_root, created_at, last_indexed)
VALUES(?, ?, ?, ?)
ON CONFLICT(source_root) DO UPDATE SET
index_root=excluded.index_root,
last_indexed=excluded.last_indexed,
status='active'
""",
(source_root_str, index_root_str, now, now),
)
row = conn.execute(
"SELECT * FROM projects WHERE source_root=?", (source_root_str,)
).fetchone()
conn.commit()
if not row:
raise StorageError(f"Failed to register project: {source_root}")
return self._row_to_project_info(row)
def unregister_project(self, source_root: Path) -> bool:
"""Remove a project registration (cascades to directory mappings).
Args:
source_root: Source code root directory
Returns:
True if project was removed, False if not found
"""
with self._lock:
conn = self._get_connection()
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
row = conn.execute(
"SELECT id FROM projects WHERE source_root=?", (source_root_str,)
).fetchone()
if not row:
return False
conn.execute("DELETE FROM projects WHERE source_root=?", (source_root_str,))
conn.commit()
return True
def get_project(self, source_root: Path) -> Optional[ProjectInfo]:
"""Get project information by source root.
Args:
source_root: Source code root directory
Returns:
ProjectInfo if found, None otherwise
"""
with self._lock:
conn = self._get_connection()
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
row = conn.execute(
"SELECT * FROM projects WHERE source_root=?", (source_root_str,)
).fetchone()
return self._row_to_project_info(row) if row else None
def get_project_by_id(self, project_id: int) -> Optional[ProjectInfo]:
"""Get project information by ID.
Args:
project_id: Project database ID
Returns:
ProjectInfo if found, None otherwise
"""
with self._lock:
conn = self._get_connection()
row = conn.execute(
"SELECT * FROM projects WHERE id=?", (project_id,)
).fetchone()
return self._row_to_project_info(row) if row else None
def list_projects(self, status: Optional[str] = None) -> List[ProjectInfo]:
"""List all registered projects.
Args:
status: Optional status filter ('active', 'stale', 'removed')
Returns:
List of ProjectInfo objects
"""
with self._lock:
conn = self._get_connection()
if status:
rows = conn.execute(
"SELECT * FROM projects WHERE status=? ORDER BY created_at DESC",
(status,),
).fetchall()
else:
rows = conn.execute(
"SELECT * FROM projects ORDER BY created_at DESC"
).fetchall()
return [self._row_to_project_info(row) for row in rows]
def update_project_stats(
self, source_root: Path, total_files: int, total_dirs: int
) -> None:
"""Update project statistics.
Args:
source_root: Source code root directory
total_files: Total number of indexed files
total_dirs: Total number of indexed directories
"""
with self._lock:
conn = self._get_connection()
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
conn.execute(
"""
UPDATE projects
SET total_files=?, total_dirs=?, last_indexed=?
WHERE source_root=?
""",
(total_files, total_dirs, time.time(), source_root_str),
)
conn.commit()
def set_project_status(self, source_root: Path, status: str) -> None:
"""Set project status.
Args:
source_root: Source code root directory
status: Status string ('active', 'stale', 'removed')
"""
with self._lock:
conn = self._get_connection()
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
conn.execute(
"UPDATE projects SET status=? WHERE source_root=?",
(status, source_root_str),
)
conn.commit()
# === Directory Mapping Operations ===
def register_dir(
self,
project_id: int,
source_path: Path,
index_path: Path,
depth: int,
files_count: int = 0,
) -> DirMapping:
"""Register a directory mapping.
Args:
project_id: Project database ID
source_path: Source directory path
index_path: Index database path
depth: Directory depth relative to project root
files_count: Number of files in directory
Returns:
DirMapping for the registered directory
"""
with self._lock:
conn = self._get_connection()
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
index_path_str = str(index_path.resolve())
now = time.time()
conn.execute(
"""
INSERT INTO dir_mapping(
project_id, source_path, index_path, depth, files_count, last_updated
)
VALUES(?, ?, ?, ?, ?, ?)
ON CONFLICT(source_path) DO UPDATE SET
index_path=excluded.index_path,
depth=excluded.depth,
files_count=excluded.files_count,
last_updated=excluded.last_updated
""",
(project_id, source_path_str, index_path_str, depth, files_count, now),
)
row = conn.execute(
"SELECT * FROM dir_mapping WHERE source_path=?", (source_path_str,)
).fetchone()
conn.commit()
if not row:
raise StorageError(f"Failed to register directory: {source_path}")
return self._row_to_dir_mapping(row)
def unregister_dir(self, source_path: Path) -> bool:
"""Remove a directory mapping.
Args:
source_path: Source directory path
Returns:
True if directory was removed, False if not found
"""
with self._lock:
conn = self._get_connection()
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
row = conn.execute(
"SELECT id FROM dir_mapping WHERE source_path=?", (source_path_str,)
).fetchone()
if not row:
return False
conn.execute("DELETE FROM dir_mapping WHERE source_path=?", (source_path_str,))
conn.commit()
return True
def find_index_path(self, source_path: Path) -> Optional[Path]:
"""Find index path for a source directory (exact match).
Args:
source_path: Source directory path
Returns:
Index path if found, None otherwise
"""
with self._lock:
conn = self._get_connection()
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
row = conn.execute(
"SELECT index_path FROM dir_mapping WHERE source_path=?",
(source_path_str,),
).fetchone()
return Path(row["index_path"]) if row else None
def find_nearest_index(self, source_path: Path) -> Optional[DirMapping]:
"""Find nearest indexed ancestor directory.
Searches for the closest parent directory that has an index.
Useful for supporting subdirectory searches.
Optimized to use single database query instead of iterating through
each parent directory level.
Args:
source_path: Source directory or file path
Returns:
DirMapping for nearest ancestor, None if not found
"""
with self._lock:
conn = self._get_connection()
source_path_resolved = source_path.resolve()
# Build list of all parent paths from deepest to shallowest
paths_to_check = []
current = source_path_resolved
while True:
paths_to_check.append(self._normalize_path_for_comparison(current))
parent = current.parent
if parent == current: # Reached filesystem root
break
current = parent
if not paths_to_check:
return None
# Single query with WHERE IN, ordered by path length (longest = nearest)
placeholders = ','.join('?' * len(paths_to_check))
query = f"""
SELECT * FROM dir_mapping
WHERE source_path IN ({placeholders})
ORDER BY LENGTH(source_path) DESC
LIMIT 1
"""
row = conn.execute(query, paths_to_check).fetchone()
return self._row_to_dir_mapping(row) if row else None
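
    # Worked example of the single-query lookup above: for
    #   source_path = /repo/src/pkg/module.py
    # paths_to_check is built deepest-first:
    #   /repo/src/pkg/module.py, /repo/src/pkg, /repo/src, /repo, /
    # and ORDER BY LENGTH(source_path) DESC returns the deepest registered
    # ancestor (e.g. the mapping for /repo/src when only /repo and /repo/src
    # are indexed).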
def find_by_source_path(self, source_path: str) -> Optional[Dict[str, str]]:
"""Find project by source path (exact or nearest match).
Searches for a project whose source_root matches or contains
the given source_path.
Args:
source_path: Source directory path as string
Returns:
Dict with project info including 'index_root', or None if not found
"""
with self._lock:
conn = self._get_connection()
resolved_path = Path(source_path).resolve()
source_path_resolved = self._normalize_path_for_comparison(resolved_path)
# First try exact match on projects table
row = conn.execute(
"SELECT * FROM projects WHERE source_root=?", (source_path_resolved,)
).fetchone()
if row:
return {
"id": str(row["id"]),
"source_root": row["source_root"],
"index_root": row["index_root"],
"status": row["status"] or "active",
}
# Try finding project that contains this path
# Build list of all parent paths
paths_to_check = []
current = resolved_path
while True:
paths_to_check.append(self._normalize_path_for_comparison(current))
parent = current.parent
if parent == current:
break
current = parent
if paths_to_check:
placeholders = ','.join('?' * len(paths_to_check))
query = f"""
SELECT * FROM projects
WHERE source_root IN ({placeholders})
ORDER BY LENGTH(source_root) DESC
LIMIT 1
"""
row = conn.execute(query, paths_to_check).fetchone()
if row:
return {
"id": str(row["id"]),
"source_root": row["source_root"],
"index_root": row["index_root"],
"status": row["status"] or "active",
}
return None
def get_project_dirs(self, project_id: int) -> List[DirMapping]:
"""Get all directory mappings for a project.
Args:
project_id: Project database ID
Returns:
List of DirMapping objects
"""
with self._lock:
conn = self._get_connection()
rows = conn.execute(
"SELECT * FROM dir_mapping WHERE project_id=? ORDER BY depth, source_path",
(project_id,),
).fetchall()
return [self._row_to_dir_mapping(row) for row in rows]
def get_subdirs(self, source_path: Path) -> List[DirMapping]:
"""Get direct subdirectory mappings.
Args:
source_path: Parent directory path
Returns:
List of DirMapping objects for direct children
"""
with self._lock:
conn = self._get_connection()
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
# First get the parent's depth
parent_row = conn.execute(
"SELECT depth, project_id FROM dir_mapping WHERE source_path=?",
(source_path_str,),
).fetchone()
if not parent_row:
return []
parent_depth = int(parent_row["depth"])
project_id = int(parent_row["project_id"])
            # Get all subdirs with depth = parent_depth + 1 and matching path prefix.
            # Append the path separator so directories that merely share a name
            # prefix (e.g. "src" vs "src2") are not matched.
            sep = "\\" if platform.system() == "Windows" else "/"
            rows = conn.execute(
                """
                SELECT * FROM dir_mapping
                WHERE project_id=? AND depth=? AND source_path LIKE ?
                ORDER BY source_path
                """,
                (project_id, parent_depth + 1, f"{source_path_str}{sep}%"),
).fetchall()
return [self._row_to_dir_mapping(row) for row in rows]
def update_dir_stats(self, source_path: Path, files_count: int) -> None:
"""Update directory statistics.
Args:
source_path: Source directory path
files_count: Number of files in directory
"""
with self._lock:
conn = self._get_connection()
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
conn.execute(
"""
UPDATE dir_mapping
SET files_count=?, last_updated=?
WHERE source_path=?
""",
(files_count, time.time(), source_path_str),
)
conn.commit()
def update_index_paths(self, old_root: Path, new_root: Path) -> int:
"""Update all index paths after migration.
Replaces old_root prefix with new_root in all stored index paths.
Args:
old_root: Old index root directory
new_root: New index root directory
Returns:
Number of paths updated
"""
with self._lock:
conn = self._get_connection()
old_root_str = str(old_root.resolve())
new_root_str = str(new_root.resolve())
updated = 0
            # Update projects (cursor.rowcount reports per-statement changes;
            # conn.total_changes is cumulative for the connection and would overcount)
            cursor = conn.execute(
                """
                UPDATE projects
                SET index_root = REPLACE(index_root, ?, ?)
                WHERE index_root LIKE ?
                """,
                (old_root_str, new_root_str, f"{old_root_str}%"),
            )
            updated += cursor.rowcount
            # Update dir_mapping
            cursor = conn.execute(
                """
                UPDATE dir_mapping
                SET index_path = REPLACE(index_path, ?, ?)
                WHERE index_path LIKE ?
                """,
                (old_root_str, new_root_str, f"{old_root_str}%"),
            )
            updated += cursor.rowcount
conn.commit()
return updated
# === Internal Methods ===
def _row_to_project_info(self, row: sqlite3.Row) -> ProjectInfo:
"""Convert database row to ProjectInfo."""
return ProjectInfo(
id=int(row["id"]),
source_root=Path(row["source_root"]),
index_root=Path(row["index_root"]),
created_at=float(row["created_at"]) if row["created_at"] else 0.0,
last_indexed=float(row["last_indexed"]) if row["last_indexed"] else 0.0,
total_files=int(row["total_files"]) if row["total_files"] else 0,
total_dirs=int(row["total_dirs"]) if row["total_dirs"] else 0,
status=str(row["status"]) if row["status"] else "active",
)
def _row_to_dir_mapping(self, row: sqlite3.Row) -> DirMapping:
"""Convert database row to DirMapping."""
return DirMapping(
id=int(row["id"]),
project_id=int(row["project_id"]),
source_path=Path(row["source_path"]),
index_path=Path(row["index_path"]),
depth=int(row["depth"]) if row["depth"] is not None else 0,
files_count=int(row["files_count"]) if row["files_count"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
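
# Usage sketch (illustrative only; the registry and project paths below are
# hypothetical examples).
if __name__ == "__main__":
    with RegistryStore(Path("/tmp/codexlens-registry.db")) as registry:
        project = registry.register_project(
            source_root=Path("/repo"),
            index_root=Path("/repo/.codexlens"),
        )
        registry.register_dir(
            project_id=project.id,
            source_path=Path("/repo/src"),
            index_path=Path("/repo/.codexlens/src/_index.db"),
            depth=1,
            files_count=12,
        )
        nearest = registry.find_nearest_index(Path("/repo/src/pkg/module.py"))
        print(nearest.index_path if nearest else "no index found")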

View File

@@ -0,0 +1,578 @@
"""SPLADE inverted index storage for sparse vector retrieval.
This module implements SQLite-based inverted index for SPLADE sparse vectors,
enabling efficient sparse retrieval using dot-product scoring.
"""
from __future__ import annotations
import logging
import sqlite3
import threading
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from codexlens.entities import SearchResult
from codexlens.errors import StorageError
logger = logging.getLogger(__name__)
class SpladeIndex:
"""SQLite-based inverted index for SPLADE sparse vectors.
Stores sparse vectors as posting lists mapping token_id -> (chunk_id, weight).
Supports efficient dot-product retrieval using SQL joins.
"""
def __init__(self, db_path: Path | str) -> None:
"""Initialize SPLADE index.
Args:
db_path: Path to SQLite database file.
"""
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
# Thread-safe connection management
self._lock = threading.RLock()
self._local = threading.local()
def _get_connection(self) -> sqlite3.Connection:
"""Get or create a thread-local database connection.
Each thread gets its own connection to ensure thread safety.
Connections are stored in thread-local storage.
"""
conn = getattr(self._local, "conn", None)
if conn is None:
# Thread-local connection - each thread has its own
conn = sqlite3.connect(
self.db_path,
timeout=30.0, # Wait up to 30s for locks
check_same_thread=True, # Enforce thread safety
)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
conn.execute("PRAGMA foreign_keys=ON")
# Limit mmap to 1GB to avoid OOM on smaller systems
conn.execute("PRAGMA mmap_size=1073741824")
# Increase cache size for better query performance (20MB = -20000 pages)
conn.execute("PRAGMA cache_size=-20000")
self._local.conn = conn
return conn
def close(self) -> None:
"""Close thread-local database connection."""
with self._lock:
conn = getattr(self._local, "conn", None)
if conn is not None:
conn.close()
self._local.conn = None
def __enter__(self) -> SpladeIndex:
"""Context manager entry."""
self.create_tables()
return self
def __exit__(self, exc_type, exc, tb) -> None:
"""Context manager exit."""
self.close()
def has_index(self) -> bool:
"""Check if SPLADE tables exist in database.
Returns:
True if tables exist, False otherwise.
"""
with self._lock:
conn = self._get_connection()
try:
cursor = conn.execute(
"""
SELECT name FROM sqlite_master
WHERE type='table' AND name='splade_posting_list'
"""
)
return cursor.fetchone() is not None
except sqlite3.Error as e:
logger.error("Failed to check index existence: %s", e)
return False
def create_tables(self) -> None:
"""Create SPLADE schema if not exists.
Note: When used with distributed indexes (multiple _index.db files),
the SPLADE database stores chunk IDs from multiple sources. In this case,
foreign key constraints are not enforced to allow cross-database references.
"""
with self._lock:
conn = self._get_connection()
try:
# Inverted index for sparse vectors
# Note: No FOREIGN KEY constraint to support distributed index architecture
# where chunks may come from multiple _index.db files
conn.execute("""
CREATE TABLE IF NOT EXISTS splade_posting_list (
token_id INTEGER NOT NULL,
chunk_id INTEGER NOT NULL,
weight REAL NOT NULL,
PRIMARY KEY (token_id, chunk_id)
)
""")
# Indexes for efficient lookups
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_splade_by_chunk
ON splade_posting_list(chunk_id)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_splade_by_token
ON splade_posting_list(token_id)
""")
# Model metadata
conn.execute("""
CREATE TABLE IF NOT EXISTS splade_metadata (
id INTEGER PRIMARY KEY DEFAULT 1,
model_name TEXT NOT NULL,
vocab_size INTEGER NOT NULL,
onnx_path TEXT,
created_at REAL
)
""")
# Chunk metadata for self-contained search results
# Stores all chunk info needed to build SearchResult without querying _index.db
conn.execute("""
CREATE TABLE IF NOT EXISTS splade_chunks (
id INTEGER PRIMARY KEY,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
metadata TEXT,
source_db TEXT
)
""")
conn.commit()
logger.debug("SPLADE schema created successfully")
except sqlite3.Error as e:
raise StorageError(
f"Failed to create SPLADE schema: {e}",
db_path=str(self.db_path),
operation="create_tables"
) from e
def add_posting(self, chunk_id: int, sparse_vec: Dict[int, float]) -> None:
"""Add a single document to inverted index.
Args:
chunk_id: Chunk ID (foreign key to semantic_chunks.id).
sparse_vec: Sparse vector as {token_id: weight} mapping.
"""
if not sparse_vec:
logger.warning("Empty sparse vector for chunk_id=%d, skipping", chunk_id)
return
with self._lock:
conn = self._get_connection()
try:
# Insert all non-zero weights for this chunk
postings = [
(token_id, chunk_id, weight)
for token_id, weight in sparse_vec.items()
if weight > 0 # Only store non-zero weights
]
if postings:
conn.executemany(
"""
INSERT OR REPLACE INTO splade_posting_list
(token_id, chunk_id, weight)
VALUES (?, ?, ?)
""",
postings
)
conn.commit()
logger.debug(
"Added %d postings for chunk_id=%d", len(postings), chunk_id
)
except sqlite3.Error as e:
raise StorageError(
f"Failed to add posting for chunk_id={chunk_id}: {e}",
db_path=str(self.db_path),
operation="add_posting"
) from e
def add_postings_batch(
self, postings: List[Tuple[int, Dict[int, float]]]
) -> None:
"""Batch insert postings for multiple chunks.
Args:
postings: List of (chunk_id, sparse_vec) tuples.
"""
if not postings:
return
with self._lock:
conn = self._get_connection()
try:
# Flatten all postings into single batch
batch_data = []
for chunk_id, sparse_vec in postings:
for token_id, weight in sparse_vec.items():
if weight > 0: # Only store non-zero weights
batch_data.append((token_id, chunk_id, weight))
if batch_data:
conn.executemany(
"""
INSERT OR REPLACE INTO splade_posting_list
(token_id, chunk_id, weight)
VALUES (?, ?, ?)
""",
batch_data
)
conn.commit()
logger.debug(
"Batch inserted %d postings for %d chunks",
len(batch_data),
len(postings)
)
except sqlite3.Error as e:
raise StorageError(
f"Failed to batch insert postings: {e}",
db_path=str(self.db_path),
operation="add_postings_batch"
) from e
def add_chunk_metadata(
self,
chunk_id: int,
file_path: str,
content: str,
metadata: Optional[str] = None,
source_db: Optional[str] = None
) -> None:
"""Store chunk metadata for self-contained search results.
Args:
chunk_id: Global chunk ID.
file_path: Path to source file.
content: Chunk text content.
metadata: JSON metadata string.
source_db: Path to source _index.db.
"""
with self._lock:
conn = self._get_connection()
try:
conn.execute(
"""
INSERT OR REPLACE INTO splade_chunks
(id, file_path, content, metadata, source_db)
VALUES (?, ?, ?, ?, ?)
""",
(chunk_id, file_path, content, metadata, source_db)
)
conn.commit()
except sqlite3.Error as e:
raise StorageError(
f"Failed to add chunk metadata for chunk_id={chunk_id}: {e}",
db_path=str(self.db_path),
operation="add_chunk_metadata"
) from e
def add_chunks_metadata_batch(
self,
chunks: List[Tuple[int, str, str, Optional[str], Optional[str]]]
) -> None:
"""Batch insert chunk metadata.
Args:
chunks: List of (chunk_id, file_path, content, metadata, source_db) tuples.
"""
if not chunks:
return
with self._lock:
conn = self._get_connection()
try:
conn.executemany(
"""
INSERT OR REPLACE INTO splade_chunks
(id, file_path, content, metadata, source_db)
VALUES (?, ?, ?, ?, ?)
""",
chunks
)
conn.commit()
logger.debug("Batch inserted %d chunk metadata records", len(chunks))
except sqlite3.Error as e:
raise StorageError(
f"Failed to batch insert chunk metadata: {e}",
db_path=str(self.db_path),
operation="add_chunks_metadata_batch"
) from e
def get_chunks_by_ids(self, chunk_ids: List[int]) -> List[Dict]:
"""Get chunk metadata by IDs.
Args:
chunk_ids: List of chunk IDs to retrieve.
Returns:
List of dicts with id, file_path, content, metadata, source_db.
"""
if not chunk_ids:
return []
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"""
SELECT id, file_path, content, metadata, source_db
FROM splade_chunks
WHERE id IN ({placeholders})
""",
chunk_ids
).fetchall()
return [
{
"id": row["id"],
"file_path": row["file_path"],
"content": row["content"],
"metadata": row["metadata"],
"source_db": row["source_db"]
}
for row in rows
]
except sqlite3.Error as e:
logger.error("Failed to get chunks by IDs: %s", e)
return []
def remove_chunk(self, chunk_id: int) -> int:
"""Remove all postings for a chunk.
Args:
chunk_id: Chunk ID to remove.
Returns:
Number of deleted postings.
"""
with self._lock:
conn = self._get_connection()
try:
cursor = conn.execute(
"DELETE FROM splade_posting_list WHERE chunk_id = ?",
(chunk_id,)
)
conn.commit()
deleted = cursor.rowcount
logger.debug("Removed %d postings for chunk_id=%d", deleted, chunk_id)
return deleted
except sqlite3.Error as e:
raise StorageError(
f"Failed to remove chunk_id={chunk_id}: {e}",
db_path=str(self.db_path),
operation="remove_chunk"
) from e
def search(
self,
query_sparse: Dict[int, float],
limit: int = 50,
min_score: float = 0.0,
max_query_terms: int = 64
) -> List[Tuple[int, float]]:
"""Search for similar chunks using dot-product scoring.
Implements efficient sparse dot-product via SQL JOIN:
score(q, d) = sum(q[t] * d[t]) for all tokens t
Args:
query_sparse: Query sparse vector as {token_id: weight}.
limit: Maximum number of results.
min_score: Minimum score threshold.
max_query_terms: Maximum query terms to use (default: 64).
Pruning to top-K terms reduces search time with minimal impact on quality.
Set to 0 or negative to disable pruning (use all terms).
Returns:
List of (chunk_id, score) tuples, ordered by score descending.
"""
if not query_sparse:
logger.warning("Empty query sparse vector")
return []
with self._lock:
conn = self._get_connection()
try:
# Build VALUES clause for query terms
# Each term: (token_id, weight)
query_terms = [
(token_id, weight)
for token_id, weight in query_sparse.items()
if weight > 0
]
if not query_terms:
logger.warning("No non-zero query terms")
return []
# Query pruning: keep only top-K terms by weight
# max_query_terms <= 0 means no limit (use all terms)
if max_query_terms > 0 and len(query_terms) > max_query_terms:
query_terms = sorted(query_terms, key=lambda x: x[1], reverse=True)[:max_query_terms]
logger.debug(
"Query pruned from %d to %d terms",
len(query_sparse),
len(query_terms)
)
# Create CTE for query terms using parameterized VALUES
# Build placeholders and params to prevent SQL injection
params = []
placeholders = []
for token_id, weight in query_terms:
placeholders.append("(?, ?)")
params.extend([token_id, weight])
values_placeholders = ", ".join(placeholders)
sql = f"""
WITH query_terms(token_id, weight) AS (
VALUES {values_placeholders}
)
SELECT
p.chunk_id,
SUM(p.weight * q.weight) as score
FROM splade_posting_list p
INNER JOIN query_terms q ON p.token_id = q.token_id
GROUP BY p.chunk_id
HAVING score >= ?
ORDER BY score DESC
LIMIT ?
"""
# Append min_score and limit to params
params.extend([min_score, limit])
rows = conn.execute(sql, params).fetchall()
results = [(row["chunk_id"], float(row["score"])) for row in rows]
logger.debug(
"SPLADE search: %d query terms, %d results",
len(query_terms),
len(results)
)
return results
except sqlite3.Error as e:
raise StorageError(
f"SPLADE search failed: {e}",
db_path=str(self.db_path),
operation="search"
) from e
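
    # Worked example of the dot-product scoring above (hypothetical weights):
    #   query q = {101: 0.8, 205: 0.5}
    #   chunk d = {101: 0.6, 205: 0.2, 999: 0.4}
    #   score(q, d) = 0.8*0.6 + 0.5*0.2 = 0.58
    # (token 999 contributes nothing because it never joins against a query term)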
def get_metadata(self) -> Optional[Dict]:
"""Get SPLADE model metadata.
Returns:
Dictionary with model_name, vocab_size, onnx_path, created_at,
or None if not set.
"""
with self._lock:
conn = self._get_connection()
try:
row = conn.execute(
"""
SELECT model_name, vocab_size, onnx_path, created_at
FROM splade_metadata
WHERE id = 1
"""
).fetchone()
if row:
return {
"model_name": row["model_name"],
"vocab_size": row["vocab_size"],
"onnx_path": row["onnx_path"],
"created_at": row["created_at"]
}
return None
except sqlite3.Error as e:
logger.error("Failed to get metadata: %s", e)
return None
def set_metadata(
self,
model_name: str,
vocab_size: int,
onnx_path: Optional[str] = None
) -> None:
"""Set SPLADE model metadata.
Args:
model_name: SPLADE model name.
vocab_size: Vocabulary size (typically ~30k for BERT vocab).
onnx_path: Optional path to ONNX model file.
"""
with self._lock:
conn = self._get_connection()
try:
current_time = time.time()
conn.execute(
"""
INSERT OR REPLACE INTO splade_metadata
(id, model_name, vocab_size, onnx_path, created_at)
VALUES (1, ?, ?, ?, ?)
""",
(model_name, vocab_size, onnx_path, current_time)
)
conn.commit()
logger.info(
"Set SPLADE metadata: model=%s, vocab_size=%d",
model_name,
vocab_size
)
except sqlite3.Error as e:
raise StorageError(
f"Failed to set metadata: {e}",
db_path=str(self.db_path),
operation="set_metadata"
) from e
def get_stats(self) -> Dict:
"""Get index statistics.
Returns:
Dictionary with total_postings, unique_tokens, unique_chunks.
"""
with self._lock:
conn = self._get_connection()
try:
row = conn.execute("""
SELECT
COUNT(*) as total_postings,
COUNT(DISTINCT token_id) as unique_tokens,
COUNT(DISTINCT chunk_id) as unique_chunks
FROM splade_posting_list
""").fetchone()
return {
"total_postings": row["total_postings"],
"unique_tokens": row["unique_tokens"],
"unique_chunks": row["unique_chunks"]
}
except sqlite3.Error as e:
logger.error("Failed to get stats: %s", e)
return {
"total_postings": 0,
"unique_tokens": 0,
"unique_chunks": 0
}
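
# Usage sketch (illustrative only; token ids and weights are hypothetical values
# that a real SPLADE encoder would normally produce).
if __name__ == "__main__":
    with SpladeIndex(Path("/tmp/splade_index.db")) as index:
        index.set_metadata(model_name="example-splade", vocab_size=30522)
        index.add_postings_batch([
            (1, {101: 0.6, 205: 0.2}),
            (2, {101: 0.1, 999: 0.9}),
        ])
        index.add_chunks_metadata_batch([
            (1, "src/app.py", "def handle_request(): ...", None, None),
            (2, "docs/usage.md", "How to configure the client", None, None),
        ])
        hits = index.search({101: 0.8, 205: 0.5}, limit=10)  # chunk 1 scores 0.58, chunk 2 scores 0.08
        chunks = index.get_chunks_by_ids([chunk_id for chunk_id, _ in hits])
        print(hits, [c["file_path"] for c in chunks])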

View File

@@ -0,0 +1,976 @@
"""SQLite storage for CodexLens indexing and search."""
from __future__ import annotations
import json
import logging
import sqlite3
import threading
import time
from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
from codexlens.entities import IndexedFile, SearchResult, Symbol
from codexlens.errors import StorageError
logger = logging.getLogger(__name__)
class SQLiteStore:
"""SQLiteStore providing FTS5 search and symbol lookup.
Implements thread-local connection pooling for improved performance.
"""
# Maximum number of connections to keep in pool to prevent memory leaks
MAX_POOL_SIZE = 32
# Idle timeout in seconds (10 minutes)
IDLE_TIMEOUT = 600
# Periodic cleanup interval in seconds (5 minutes)
CLEANUP_INTERVAL = 300
def __init__(self, db_path: str | Path) -> None:
self.db_path = Path(db_path)
self._lock = threading.RLock()
self._local = threading.local()
self._pool_lock = threading.Lock()
# Pool stores (connection, last_access_time) tuples
self._pool: Dict[int, Tuple[sqlite3.Connection, float]] = {}
self._pool_generation = 0
self._cleanup_timer: threading.Timer | None = None
self._cleanup_stop_event = threading.Event()
self._start_cleanup_timer()
def _get_connection(self) -> sqlite3.Connection:
"""Get or create a thread-local database connection."""
thread_id = threading.get_ident()
current_time = time.time()
if getattr(self._local, "generation", None) == self._pool_generation:
conn = getattr(self._local, "conn", None)
if conn is not None:
with self._pool_lock:
pool_entry = self._pool.get(thread_id)
if pool_entry is not None:
pooled_conn, _ = pool_entry
self._pool[thread_id] = (pooled_conn, current_time)
self._local.conn = pooled_conn
return pooled_conn
# Thread-local connection is stale (e.g., cleaned up by timer).
self._local.conn = None
with self._pool_lock:
pool_entry = self._pool.get(thread_id)
if pool_entry is not None:
conn, _ = pool_entry
# Update last access time
self._pool[thread_id] = (conn, current_time)
else:
# Clean up stale and idle connections if pool is too large
if len(self._pool) >= self.MAX_POOL_SIZE:
self._cleanup_stale_connections()
conn = sqlite3.connect(self.db_path, check_same_thread=False)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
conn.execute("PRAGMA foreign_keys=ON")
# Memory-mapped I/O for faster reads (30GB limit)
conn.execute("PRAGMA mmap_size=30000000000")
self._pool[thread_id] = (conn, current_time)
self._local.conn = conn
self._local.generation = self._pool_generation
return conn
def _cleanup_stale_connections(self) -> None:
"""Remove connections for threads that no longer exist or have been idle too long."""
current_time = time.time()
# Get list of active thread IDs
active_threads = {t.ident for t in threading.enumerate() if t.ident is not None}
# Find connections to remove: dead threads or idle timeout exceeded
stale_ids: list[tuple[int, str]] = []
for tid, (conn, last_access) in list(self._pool.items()):
try:
is_dead_thread = tid not in active_threads
is_idle = (current_time - last_access) > self.IDLE_TIMEOUT
is_invalid_connection = False
if not is_dead_thread and not is_idle:
try:
conn.execute("SELECT 1").fetchone()
except sqlite3.ProgrammingError:
is_invalid_connection = True
except sqlite3.Error:
is_invalid_connection = True
if is_invalid_connection:
stale_ids.append((tid, "invalid_connection"))
elif is_dead_thread:
stale_ids.append((tid, "dead_thread"))
elif is_idle:
stale_ids.append((tid, "idle_timeout"))
except Exception:
# Never break cleanup for a single bad entry.
continue
# Close and remove stale connections
for tid, reason in stale_ids:
try:
conn, _ = self._pool[tid]
conn.close()
except Exception:
pass
del self._pool[tid]
logger.debug("Cleaned SQLiteStore connection for thread_id=%s (%s)", tid, reason)
def _start_cleanup_timer(self) -> None:
if self.CLEANUP_INTERVAL <= 0:
return
self._cleanup_stop_event.clear()
def tick() -> None:
if self._cleanup_stop_event.is_set():
return
try:
with self._pool_lock:
self._cleanup_stale_connections()
finally:
with self._pool_lock:
if self._cleanup_stop_event.is_set():
self._cleanup_timer = None
return
self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick)
self._cleanup_timer.daemon = True
self._cleanup_timer.start()
self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick)
self._cleanup_timer.daemon = True
self._cleanup_timer.start()
def _stop_cleanup_timer(self) -> None:
self._cleanup_stop_event.set()
with self._pool_lock:
if self._cleanup_timer is not None:
self._cleanup_timer.cancel()
self._cleanup_timer = None
def close(self) -> None:
"""Close all pooled connections."""
with self._lock:
self._stop_cleanup_timer()
with self._pool_lock:
for conn, _ in self._pool.values():
conn.close()
self._pool.clear()
self._pool_generation += 1
if hasattr(self._local, "conn"):
self._local.conn = None
if hasattr(self._local, "generation"):
self._local.generation = self._pool_generation
def __enter__(self) -> SQLiteStore:
self.initialize()
return self
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
self.close()
def execute_query(
self,
sql: str,
params: tuple = (),
allow_writes: bool = False
) -> List[Dict[str, Any]]:
"""Execute a raw SQL query and return results as dictionaries.
This is the public API for executing custom queries without bypassing
encapsulation via _get_connection().
By default, only SELECT queries are allowed. Use allow_writes=True
for trusted internal code that needs to execute other statements.
Args:
sql: SQL query string with ? placeholders for parameters
params: Tuple of parameter values to bind
allow_writes: If True, allow non-SELECT statements (default False)
Returns:
List of result rows as dictionaries
Raises:
StorageError: If query execution fails or validation fails
"""
# Validate query type for security
sql_stripped = sql.strip().upper()
if not allow_writes:
# Only allow SELECT and WITH (for CTEs) statements
if not (sql_stripped.startswith("SELECT") or sql_stripped.startswith("WITH")):
raise StorageError(
"Only SELECT queries are allowed. "
"Use allow_writes=True for trusted internal operations.",
db_path=str(self.db_path),
operation="execute_query",
details={"query_type": sql_stripped.split()[0] if sql_stripped else "EMPTY"}
)
try:
conn = self._get_connection()
rows = conn.execute(sql, params).fetchall()
return [dict(row) for row in rows]
except sqlite3.Error as e:
raise StorageError(
f"Query execution failed: {e}",
db_path=str(self.db_path),
operation="execute_query",
details={"error_type": type(e).__name__}
) from e
def initialize(self) -> None:
with self._lock:
self.db_path.parent.mkdir(parents=True, exist_ok=True)
conn = self._get_connection()
self._create_schema(conn)
self._ensure_fts_external_content(conn)
def add_file(self, indexed_file: IndexedFile, content: str) -> None:
with self._lock:
conn = self._get_connection()
path = str(Path(indexed_file.path).resolve())
language = indexed_file.language
mtime = Path(path).stat().st_mtime if Path(path).exists() else None
line_count = content.count(chr(10)) + 1
conn.execute(
"""
INSERT INTO files(path, language, content, mtime, line_count)
VALUES(?, ?, ?, ?, ?)
ON CONFLICT(path) DO UPDATE SET
language=excluded.language,
content=excluded.content,
mtime=excluded.mtime,
line_count=excluded.line_count
""",
(path, language, content, mtime, line_count),
)
row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone()
if not row:
raise StorageError(f"Failed to read file id for {path}")
file_id = int(row["id"])
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if indexed_file.symbols:
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
[
(file_id, s.name, s.kind, s.range[0], s.range[1])
for s in indexed_file.symbols
],
)
conn.commit()
def add_files(self, files_data: List[tuple[IndexedFile, str]]) -> None:
"""Add multiple files in a single transaction for better performance.
Args:
files_data: List of (indexed_file, content) tuples
"""
with self._lock:
conn = self._get_connection()
try:
conn.execute("BEGIN")
for indexed_file, content in files_data:
path = str(Path(indexed_file.path).resolve())
language = indexed_file.language
mtime = Path(path).stat().st_mtime if Path(path).exists() else None
line_count = content.count(chr(10)) + 1
conn.execute(
"""
INSERT INTO files(path, language, content, mtime, line_count)
VALUES(?, ?, ?, ?, ?)
ON CONFLICT(path) DO UPDATE SET
language=excluded.language,
content=excluded.content,
mtime=excluded.mtime,
line_count=excluded.line_count
""",
(path, language, content, mtime, line_count),
)
row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone()
if not row:
raise StorageError(f"Failed to read file id for {path}")
file_id = int(row["id"])
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if indexed_file.symbols:
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
[
(file_id, s.name, s.kind, s.range[0], s.range[1])
for s in indexed_file.symbols
],
)
conn.commit()
except Exception as exc:
try:
conn.rollback()
except Exception as rollback_exc:
logger.error(
"Rollback failed after add_files() error (%s): %s", exc, rollback_exc
)
raise exc.with_traceback(exc.__traceback__) from rollback_exc
raise
def remove_file(self, path: str | Path) -> bool:
"""Remove a file from the index."""
with self._lock:
conn = self._get_connection()
resolved_path = str(Path(path).resolve())
row = conn.execute(
"SELECT id FROM files WHERE path=?", (resolved_path,)
).fetchone()
if not row:
return False
file_id = int(row["id"])
conn.execute("DELETE FROM files WHERE id=?", (file_id,))
conn.commit()
return True
def file_exists(self, path: str | Path) -> bool:
"""Check if a file exists in the index."""
with self._lock:
conn = self._get_connection()
resolved_path = str(Path(path).resolve())
row = conn.execute(
"SELECT 1 FROM files WHERE path=?", (resolved_path,)
).fetchone()
return row is not None
def get_file_mtime(self, path: str | Path) -> float | None:
"""Get the stored mtime for a file."""
with self._lock:
conn = self._get_connection()
resolved_path = str(Path(path).resolve())
row = conn.execute(
"SELECT mtime FROM files WHERE path=?", (resolved_path,)
).fetchone()
return float(row["mtime"]) if row and row["mtime"] else None
def search_fts(self, query: str, *, limit: int = 20, offset: int = 0) -> List[SearchResult]:
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, path, bm25(files_fts) AS rank,
                           snippet(files_fts, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts
WHERE files_fts MATCH ?
ORDER BY rank
LIMIT ? OFFSET ?
""",
(query, limit, offset),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS search failed: {exc}") from exc
results: List[SearchResult] = []
for row in rows:
rank = float(row["rank"]) if row["rank"] is not None else 0.0
score = abs(rank) if rank < 0 else 0.0
results.append(
SearchResult(
path=row["path"],
score=score,
excerpt=row["excerpt"],
)
)
return results
def search_files_only(
self, query: str, *, limit: int = 20, offset: int = 0
) -> List[str]:
"""Search indexed file contents and return only file paths."""
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT path
FROM files_fts
WHERE files_fts MATCH ?
ORDER BY bm25(files_fts)
LIMIT ? OFFSET ?
""",
(query, limit, offset),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS search failed: {exc}") from exc
return [row["path"] for row in rows]
def search_symbols(
self, name: str, *, kind: Optional[str] = None, limit: int = 50
) -> List[Symbol]:
pattern = f"%{name}%"
with self._lock:
conn = self._get_connection()
if kind:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ? AND kind=?
ORDER BY name
LIMIT ?
""",
(pattern, kind, limit),
).fetchall()
else:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ?
ORDER BY name
LIMIT ?
""",
(pattern, limit),
).fetchall()
return [
Symbol(name=row["name"], kind=row["kind"], range=(row["start_line"], row["end_line"]))
for row in rows
]
def stats(self) -> Dict[str, Any]:
with self._lock:
conn = self._get_connection()
file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"]
symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"]
lang_rows = conn.execute(
"SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC"
).fetchall()
languages = {row["language"]: row["c"] for row in lang_rows}
# Include relationship count if table exists
relationship_count = 0
try:
rel_row = conn.execute("SELECT COUNT(*) AS c FROM code_relationships").fetchone()
relationship_count = int(rel_row["c"]) if rel_row else 0
except sqlite3.DatabaseError:
pass
return {
"files": int(file_count),
"symbols": int(symbol_count),
"relationships": relationship_count,
"languages": languages,
"db_path": str(self.db_path),
}
def _connect(self) -> sqlite3.Connection:
"""Legacy method for backward compatibility."""
return self._get_connection()
def _create_schema(self, conn: sqlite3.Connection) -> None:
try:
conn.execute(
"""
CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY,
path TEXT UNIQUE NOT NULL,
language TEXT NOT NULL,
content TEXT NOT NULL,
mtime REAL,
line_count INTEGER
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS symbols (
id INTEGER PRIMARY KEY,
file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
name TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind)")
conn.execute(
"""
CREATE TABLE IF NOT EXISTS code_relationships (
id INTEGER PRIMARY KEY,
source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
target_qualified_name TEXT NOT NULL,
relationship_type TEXT NOT NULL,
source_line INTEGER NOT NULL,
target_file TEXT
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
# Chunks table for multi-vector storage (cascade retrieval architecture)
# - embedding: Original embedding for backward compatibility
# - embedding_binary: 256-dim binary vector for coarse ranking
# - embedding_dense: 2048-dim dense vector for fine ranking
conn.execute(
"""
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
embedding_binary BLOB,
embedding_dense BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)")
# Run migration for existing databases
self._migrate_chunks_table(conn)
conn.commit()
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to initialize database schema: {exc}") from exc
def _ensure_fts_external_content(self, conn: sqlite3.Connection) -> None:
"""Ensure files_fts is an FTS5 external-content table (no content duplication)."""
try:
sql_row = conn.execute(
"SELECT sql FROM sqlite_master WHERE type='table' AND name='files_fts'"
).fetchone()
sql = str(sql_row["sql"]) if sql_row and sql_row["sql"] else None
if sql is None:
self._create_external_fts(conn)
conn.commit()
return
if (
"content='files'" in sql
or 'content="files"' in sql
or "content=files" in sql
):
self._create_fts_triggers(conn)
conn.commit()
return
self._migrate_fts_to_external(conn)
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to ensure FTS schema: {exc}") from exc
def _create_external_fts(self, conn: sqlite3.Connection) -> None:
conn.execute(
"""
CREATE VIRTUAL TABLE files_fts USING fts5(
path UNINDEXED,
language UNINDEXED,
content,
content='files',
content_rowid='id',
tokenize="unicode61 tokenchars '_'"
)
"""
)
self._create_fts_triggers(conn)
def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts(rowid, path, language, content)
VALUES(new.id, new.path, new.language, new.content);
END
"""
)
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts(files_fts, rowid, path, language, content)
VALUES('delete', old.id, old.path, old.language, old.content);
END
"""
)
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts(files_fts, rowid, path, language, content)
VALUES('delete', old.id, old.path, old.language, old.content);
INSERT INTO files_fts(rowid, path, language, content)
VALUES(new.id, new.path, new.language, new.content);
END
"""
)
def _migrate_fts_to_external(self, conn: sqlite3.Connection) -> None:
"""Migrate legacy files_fts (with duplicated content) to external content."""
try:
conn.execute("BEGIN")
conn.execute("DROP TRIGGER IF EXISTS files_ai")
conn.execute("DROP TRIGGER IF EXISTS files_ad")
conn.execute("DROP TRIGGER IF EXISTS files_au")
conn.execute("ALTER TABLE files_fts RENAME TO files_fts_legacy")
self._create_external_fts(conn)
conn.execute("INSERT INTO files_fts(files_fts) VALUES('rebuild')")
conn.execute("DROP TABLE files_fts_legacy")
conn.commit()
except sqlite3.DatabaseError as exc:
try:
conn.rollback()
except Exception as rollback_exc:
logger.error(
"Rollback failed during FTS schema migration (%s): %s", exc, rollback_exc
)
raise exc.with_traceback(exc.__traceback__) from rollback_exc
try:
conn.execute("DROP TABLE IF EXISTS files_fts")
except Exception:
pass
try:
conn.execute("ALTER TABLE files_fts_legacy RENAME TO files_fts")
conn.commit()
except Exception:
pass
raise
try:
conn.execute("VACUUM")
except sqlite3.DatabaseError:
pass
def _migrate_chunks_table(self, conn: sqlite3.Connection) -> None:
"""Migrate existing chunks table to add multi-vector columns if needed.
This handles upgrading existing databases that may have the chunks table
without the embedding_binary and embedding_dense columns.
"""
# Check if chunks table exists
table_exists = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if not table_exists:
# Table doesn't exist yet, nothing to migrate
return
# Check existing columns
cursor = conn.execute("PRAGMA table_info(chunks)")
columns = {row[1] for row in cursor.fetchall()}
# Add embedding_binary column if missing
if "embedding_binary" not in columns:
logger.info("Migrating chunks table: adding embedding_binary column")
conn.execute(
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
)
# Add embedding_dense column if missing
if "embedding_dense" not in columns:
logger.info("Migrating chunks table: adding embedding_dense column")
conn.execute(
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
)
def add_chunks(
self,
file_path: str,
chunks_data: List[Dict[str, Any]],
*,
embedding: Optional[List[List[float]]] = None,
embedding_binary: Optional[List[bytes]] = None,
embedding_dense: Optional[List[bytes]] = None,
) -> List[int]:
"""Add multiple chunks with multi-vector embeddings support.
This method supports the cascade retrieval architecture with three embedding types:
- embedding: Original dense embedding for backward compatibility
- embedding_binary: 256-dim binary vector for fast coarse ranking
- embedding_dense: 2048-dim dense vector for precise fine ranking
Args:
file_path: Path to the source file for all chunks.
chunks_data: List of dicts with 'content' and optional 'metadata' keys.
embedding: Optional list of dense embeddings (one per chunk).
embedding_binary: Optional list of binary embeddings as bytes (one per chunk).
embedding_dense: Optional list of dense embeddings as bytes (one per chunk).
Returns:
List of inserted chunk IDs.
Raises:
ValueError: If embedding list lengths don't match chunks_data length.
StorageError: If database operation fails.
"""
if not chunks_data:
return []
n_chunks = len(chunks_data)
# Validate embedding lengths
if embedding is not None and len(embedding) != n_chunks:
raise ValueError(
f"embedding length ({len(embedding)}) != chunks_data length ({n_chunks})"
)
if embedding_binary is not None and len(embedding_binary) != n_chunks:
raise ValueError(
f"embedding_binary length ({len(embedding_binary)}) != chunks_data length ({n_chunks})"
)
if embedding_dense is not None and len(embedding_dense) != n_chunks:
raise ValueError(
f"embedding_dense length ({len(embedding_dense)}) != chunks_data length ({n_chunks})"
)
# Prepare batch data
batch_data = []
for i, chunk in enumerate(chunks_data):
content = chunk.get("content", "")
metadata = chunk.get("metadata")
metadata_json = json.dumps(metadata) if metadata else None
# Convert embeddings to bytes if needed
emb_blob = None
if embedding is not None:
import struct
emb_blob = struct.pack(f"{len(embedding[i])}f", *embedding[i])
emb_binary_blob = embedding_binary[i] if embedding_binary is not None else None
emb_dense_blob = embedding_dense[i] if embedding_dense is not None else None
batch_data.append((
file_path, content, emb_blob, emb_binary_blob, emb_dense_blob, metadata_json
))
with self._lock:
conn = self._get_connection()
try:
# Get starting ID before insert
row = conn.execute("SELECT MAX(id) FROM chunks").fetchone()
start_id = (row[0] or 0) + 1
conn.executemany(
"""
INSERT INTO chunks (
file_path, content, embedding, embedding_binary,
embedding_dense, metadata
)
VALUES (?, ?, ?, ?, ?, ?)
""",
batch_data
)
conn.commit()
# Calculate inserted IDs
return list(range(start_id, start_id + n_chunks))
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to add chunks: {exc}",
db_path=str(self.db_path),
operation="add_chunks",
) from exc
def get_binary_embeddings(
self, chunk_ids: List[int]
) -> Dict[int, Optional[bytes]]:
"""Get binary embeddings for specified chunk IDs.
Used for coarse ranking in cascade retrieval architecture.
Binary embeddings (256-dim) enable fast approximate similarity search.
Args:
chunk_ids: List of chunk IDs to retrieve embeddings for.
Returns:
Dictionary mapping chunk_id to embedding_binary bytes (or None if not set).
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return {}
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"SELECT id, embedding_binary FROM chunks WHERE id IN ({placeholders})",
chunk_ids
).fetchall()
return {row["id"]: row["embedding_binary"] for row in rows}
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get binary embeddings: {exc}",
db_path=str(self.db_path),
operation="get_binary_embeddings",
) from exc
def get_dense_embeddings(
self, chunk_ids: List[int]
) -> Dict[int, Optional[bytes]]:
"""Get dense embeddings for specified chunk IDs.
Used for fine ranking in cascade retrieval architecture.
Dense embeddings (2048-dim) provide high-precision similarity scoring.
Args:
chunk_ids: List of chunk IDs to retrieve embeddings for.
Returns:
Dictionary mapping chunk_id to embedding_dense bytes (or None if not set).
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return {}
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})",
chunk_ids
).fetchall()
return {row["id"]: row["embedding_dense"] for row in rows}
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get dense embeddings: {exc}",
db_path=str(self.db_path),
operation="get_dense_embeddings",
) from exc
def get_chunks_by_ids(
self, chunk_ids: List[int]
) -> List[Dict[str, Any]]:
"""Get chunk data for specified IDs.
Args:
chunk_ids: List of chunk IDs to retrieve.
Returns:
List of chunk dictionaries with id, file_path, content, metadata.
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return []
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"""
SELECT id, file_path, content, metadata, created_at
FROM chunks
WHERE id IN ({placeholders})
""",
chunk_ids
).fetchall()
results = []
for row in rows:
metadata = None
if row["metadata"]:
try:
metadata = json.loads(row["metadata"])
except json.JSONDecodeError:
pass
results.append({
"id": row["id"],
"file_path": row["file_path"],
"content": row["content"],
"metadata": metadata,
"created_at": row["created_at"],
})
return results
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get chunks: {exc}",
db_path=str(self.db_path),
operation="get_chunks_by_ids",
) from exc
def delete_chunks_by_file(self, file_path: str) -> int:
"""Delete all chunks for a given file path.
Args:
file_path: Path to the source file.
Returns:
Number of deleted chunks.
Raises:
StorageError: If database operation fails.
"""
with self._lock:
conn = self._get_connection()
try:
cursor = conn.execute(
"DELETE FROM chunks WHERE file_path = ?",
(file_path,)
)
conn.commit()
return cursor.rowcount
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to delete chunks: {exc}",
db_path=str(self.db_path),
operation="delete_chunks_by_file",
) from exc
def count_chunks(self) -> int:
"""Count total chunks in store.
Returns:
Total number of chunks.
"""
with self._lock:
conn = self._get_connection()
row = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()
return int(row["c"]) if row else 0
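
# Usage sketch (illustrative only). Assumes IndexedFile is constructible from the
# path/language/symbols fields that add_file() reads; the file path and embedding
# bytes below are hypothetical placeholders.
if __name__ == "__main__":
    source = "def greet(name):\n    return f'hello {name}'\n"
    indexed = IndexedFile(
        path="/repo/src/greet.py",
        language="python",
        symbols=[Symbol(name="greet", kind="function", range=(1, 2))],
    )
    with SQLiteStore(Path("/tmp/codexlens_index.db")) as store:
        store.add_file(indexed, source)
        print(store.search_fts("greet", limit=5))            # FTS5 hits with excerpts
        print(store.search_symbols("gre", kind="function"))  # Symbol(name='greet', ...)
        chunk_ids = store.add_chunks(
            "/repo/src/greet.py",
            [{"content": source, "metadata": {"category": "code"}}],
            embedding_binary=[bytes(32)],  # 256-bit coarse vector packed into 32 bytes
        )
        print(store.get_binary_embeddings(chunk_ids))
        print(store.stats())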

View File

@@ -0,0 +1,64 @@
"""SQLite utility functions for CodexLens storage layer."""
from __future__ import annotations
import logging
import sqlite3
log = logging.getLogger(__name__)
def check_trigram_support(conn: sqlite3.Connection) -> bool:
"""Check if SQLite supports trigram tokenizer for FTS5.
Trigram tokenizer requires SQLite >= 3.34.0.
Args:
conn: Database connection to test
Returns:
True if trigram tokenizer is available, False otherwise
"""
try:
# Test by creating a temporary virtual table with trigram tokenizer
conn.execute(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check
USING fts5(test_content, tokenize='trigram')
"""
)
# Clean up test table
conn.execute("DROP TABLE IF EXISTS test_trigram_check")
conn.commit()
return True
except sqlite3.OperationalError as e:
# Trigram tokenizer not available
if "unrecognized tokenizer" in str(e).lower():
log.debug("Trigram tokenizer not available in this SQLite version")
return False
# Other operational errors should be re-raised
raise
except Exception:
# Any other exception means trigram is not supported
return False
def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]:
"""Get SQLite version as (major, minor, patch) tuple.
Args:
conn: Database connection
Returns:
Version tuple, e.g., (3, 34, 1)
"""
row = conn.execute("SELECT sqlite_version()").fetchone()
version_str = row[0] if row else "0.0.0"
parts = version_str.split('.')
try:
major = int(parts[0]) if len(parts) > 0 else 0
minor = int(parts[1]) if len(parts) > 1 else 0
patch = int(parts[2]) if len(parts) > 2 else 0
return (major, minor, patch)
except (ValueError, IndexError):
return (0, 0, 0)
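# --- Hypothetical usage sketch (illustration only, not part of this commit) ---
# Shows how these helpers are meant to be combined: probe the running SQLite build
# and pick an FTS5 tokenizer accordingly. The table name `code_fts` and the column
# `content` are made up for the example; only check_trigram_support and
# get_sqlite_version come from this module.
def create_fts_table(conn: sqlite3.Connection) -> None:
    """Create an FTS5 table, preferring the trigram tokenizer when available."""
    major, minor, patch = get_sqlite_version(conn)
    log.debug("SQLite version: %d.%d.%d", major, minor, patch)
    # Fall back to the default unicode61 tokenizer on builds older than 3.34.0.
    tokenize = "trigram" if check_trigram_support(conn) else "unicode61"
    conn.execute(
        f"""
        CREATE VIRTUAL TABLE IF NOT EXISTS code_fts
        USING fts5(content, tokenize='{tokenize}')
        """
    )
    conn.commit()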

View File

@@ -0,0 +1,415 @@
"""Central storage for vector metadata.
This module provides a centralized SQLite database for storing chunk metadata
associated with centralized vector indexes. Instead of traversing all _index.db
files to fetch chunk metadata, this provides O(1) lookup by chunk ID.
"""
from __future__ import annotations
import json
import logging
import sqlite3
import threading
from pathlib import Path
from typing import Any, Dict, List, Optional
from codexlens.errors import StorageError
logger = logging.getLogger(__name__)
class VectorMetadataStore:
"""Store and retrieve chunk metadata for centralized vector search.
This class provides efficient storage and retrieval of chunk metadata
for the centralized vector index architecture. All chunk metadata is
stored in a single _vectors_meta.db file at the project root, enabling
fast lookups without traversing multiple _index.db files.
Schema:
chunk_metadata:
- chunk_id: INTEGER PRIMARY KEY - Global chunk ID
- file_path: TEXT NOT NULL - Path to source file
- content: TEXT - Chunk text content
- start_line: INTEGER - Start line in source file
- end_line: INTEGER - End line in source file
- category: TEXT - Content category (code/doc)
- metadata: TEXT - JSON-encoded additional metadata
- source_index_db: TEXT - Path to source _index.db file
"""
def __init__(self, db_path: Path | str) -> None:
"""Initialize VectorMetadataStore.
Args:
db_path: Path to SQLite database file.
"""
self.db_path = Path(db_path)
self.db_path.parent.mkdir(parents=True, exist_ok=True)
# Thread-safe connection management
self._lock = threading.RLock()
self._local = threading.local()
def _get_connection(self) -> sqlite3.Connection:
"""Get or create a thread-local database connection.
Each thread gets its own connection to ensure thread safety.
"""
conn = getattr(self._local, "conn", None)
if conn is None:
conn = sqlite3.connect(
str(self.db_path),
timeout=30.0,
check_same_thread=True,
)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
conn.execute("PRAGMA mmap_size=1073741824") # 1GB mmap
self._local.conn = conn
return conn
def _ensure_schema(self) -> None:
"""Create tables if they don't exist."""
with self._lock:
conn = self._get_connection()
try:
conn.execute('''
CREATE TABLE IF NOT EXISTS chunk_metadata (
chunk_id INTEGER PRIMARY KEY,
file_path TEXT NOT NULL,
content TEXT,
start_line INTEGER,
end_line INTEGER,
category TEXT,
metadata TEXT,
source_index_db TEXT
)
''')
conn.execute(
'CREATE INDEX IF NOT EXISTS idx_chunk_file_path '
'ON chunk_metadata(file_path)'
)
conn.execute(
'CREATE INDEX IF NOT EXISTS idx_chunk_category '
'ON chunk_metadata(category)'
)
# Binary vectors table for cascade search
conn.execute('''
CREATE TABLE IF NOT EXISTS binary_vectors (
chunk_id INTEGER PRIMARY KEY,
vector BLOB NOT NULL
)
''')
conn.commit()
logger.debug("VectorMetadataStore schema created/verified")
except sqlite3.Error as e:
raise StorageError(
f"Failed to create schema: {e}",
db_path=str(self.db_path),
operation="_ensure_schema"
) from e
def add_chunk(
self,
chunk_id: int,
file_path: str,
content: str,
start_line: Optional[int] = None,
end_line: Optional[int] = None,
category: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
source_index_db: Optional[str] = None,
) -> None:
"""Add a single chunk's metadata.
Args:
chunk_id: Global unique chunk ID.
file_path: Path to source file.
content: Chunk text content.
start_line: Start line in source file.
end_line: End line in source file.
category: Content category (code/doc).
metadata: Additional metadata dictionary.
source_index_db: Path to source _index.db file.
"""
with self._lock:
conn = self._get_connection()
try:
metadata_json = json.dumps(metadata) if metadata else None
conn.execute(
'''
INSERT OR REPLACE INTO chunk_metadata
(chunk_id, file_path, content, start_line, end_line,
category, metadata, source_index_db)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
''',
(chunk_id, file_path, content, start_line, end_line,
category, metadata_json, source_index_db)
)
conn.commit()
except sqlite3.Error as e:
raise StorageError(
f"Failed to add chunk {chunk_id}: {e}",
db_path=str(self.db_path),
operation="add_chunk"
) from e
def add_chunks(self, chunks: List[Dict[str, Any]]) -> None:
"""Batch insert chunk metadata.
Args:
chunks: List of dictionaries with keys:
- chunk_id (required): Global unique chunk ID
- file_path (required): Path to source file
- content: Chunk text content
- start_line: Start line in source file
- end_line: End line in source file
- category: Content category (code/doc)
- metadata: Additional metadata dictionary
- source_index_db: Path to source _index.db file
"""
if not chunks:
return
with self._lock:
conn = self._get_connection()
try:
batch_data = []
for chunk in chunks:
metadata = chunk.get("metadata")
metadata_json = json.dumps(metadata) if metadata else None
batch_data.append((
chunk["chunk_id"],
chunk["file_path"],
chunk.get("content"),
chunk.get("start_line"),
chunk.get("end_line"),
chunk.get("category"),
metadata_json,
chunk.get("source_index_db"),
))
conn.executemany(
'''
INSERT OR REPLACE INTO chunk_metadata
(chunk_id, file_path, content, start_line, end_line,
category, metadata, source_index_db)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
''',
batch_data
)
conn.commit()
logger.debug("Batch inserted %d chunk metadata records", len(chunks))
except sqlite3.Error as e:
raise StorageError(
f"Failed to batch insert chunks: {e}",
db_path=str(self.db_path),
operation="add_chunks"
) from e
def get_chunks_by_ids(
self,
chunk_ids: List[int],
category: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""Retrieve chunks by their IDs - the key optimization.
This is the primary method that replaces traversing all _index.db files.
Provides O(1) lookup by chunk ID instead of O(n) where n is the number
of index databases.
Args:
chunk_ids: List of chunk IDs to retrieve.
category: Optional category filter ('code' or 'doc').
Returns:
List of dictionaries with chunk metadata:
- chunk_id: Global chunk ID
- file_path: Path to source file
- content: Chunk text content
- start_line: Start line in source file
- end_line: End line in source file
- category: Content category
- metadata: Parsed metadata dictionary
- source_index_db: Source _index.db path
"""
if not chunk_ids:
return []
# No lock needed for reads: WAL mode + thread-local connections ensure safety
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
if category:
query = f'''
SELECT chunk_id, file_path, content, start_line, end_line,
category, metadata, source_index_db
FROM chunk_metadata
WHERE chunk_id IN ({placeholders}) AND category = ?
'''
params = list(chunk_ids) + [category]
else:
query = f'''
SELECT chunk_id, file_path, content, start_line, end_line,
category, metadata, source_index_db
FROM chunk_metadata
WHERE chunk_id IN ({placeholders})
'''
params = list(chunk_ids)
rows = conn.execute(query, params).fetchall()
results = []
for row in rows:
metadata = None
if row["metadata"]:
try:
metadata = json.loads(row["metadata"])
except json.JSONDecodeError:
metadata = {}
results.append({
"chunk_id": row["chunk_id"],
"file_path": row["file_path"],
"content": row["content"],
"start_line": row["start_line"],
"end_line": row["end_line"],
"category": row["category"],
"metadata": metadata or {},
"source_index_db": row["source_index_db"],
})
return results
except sqlite3.Error as e:
logger.error("Failed to get chunks by IDs: %s", e)
return []
def get_chunk_count(self) -> int:
"""Get total number of chunks in store.
Returns:
Total chunk count.
"""
# No lock needed for reads: WAL mode + thread-local connections ensure safety
conn = self._get_connection()
try:
row = conn.execute(
"SELECT COUNT(*) FROM chunk_metadata"
).fetchone()
return row[0] if row else 0
except sqlite3.Error:
return 0
def clear(self) -> None:
"""Clear all metadata."""
with self._lock:
conn = self._get_connection()
try:
conn.execute("DELETE FROM chunk_metadata")
conn.commit()
logger.info("Cleared all chunk metadata")
except sqlite3.Error as e:
raise StorageError(
f"Failed to clear metadata: {e}",
db_path=str(self.db_path),
operation="clear"
) from e
def close(self) -> None:
"""Close database connection."""
with self._lock:
conn = getattr(self._local, "conn", None)
if conn is not None:
conn.close()
self._local.conn = None
def __enter__(self) -> "VectorMetadataStore":
"""Context manager entry."""
self._ensure_schema()
return self
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
"""Context manager exit."""
self.close()
# ============= Binary Vector Methods for Cascade Search =============
def add_binary_vectors(
self, chunk_ids: List[int], binary_vectors: List[bytes]
) -> None:
"""Batch insert binary vectors for cascade search.
Args:
chunk_ids: List of chunk IDs.
binary_vectors: List of packed binary vectors (as bytes).
"""
if not chunk_ids or len(chunk_ids) != len(binary_vectors):
return
with self._lock:
conn = self._get_connection()
try:
data = list(zip(chunk_ids, binary_vectors))
conn.executemany(
"INSERT OR REPLACE INTO binary_vectors (chunk_id, vector) VALUES (?, ?)",
data
)
conn.commit()
logger.debug("Added %d binary vectors", len(chunk_ids))
except sqlite3.Error as e:
raise StorageError(
f"Failed to add binary vectors: {e}",
db_path=str(self.db_path),
operation="add_binary_vectors"
) from e
def get_all_binary_vectors(self) -> List[tuple]:
"""Get all binary vectors for cascade search.
Returns:
List of (chunk_id, vector_bytes) tuples.
"""
conn = self._get_connection()
try:
rows = conn.execute(
"SELECT chunk_id, vector FROM binary_vectors"
).fetchall()
return [(row[0], row[1]) for row in rows]
except sqlite3.Error as e:
logger.error("Failed to get binary vectors: %s", e)
return []
def get_binary_vector_count(self) -> int:
"""Get total number of binary vectors.
Returns:
Binary vector count.
"""
conn = self._get_connection()
try:
row = conn.execute(
"SELECT COUNT(*) FROM binary_vectors"
).fetchone()
return row[0] if row else 0
except sqlite3.Error:
return 0
def clear_binary_vectors(self) -> None:
"""Clear all binary vectors."""
with self._lock:
conn = self._get_connection()
try:
conn.execute("DELETE FROM binary_vectors")
conn.commit()
logger.info("Cleared all binary vectors")
except sqlite3.Error as e:
raise StorageError(
f"Failed to clear binary vectors: {e}",
db_path=str(self.db_path),
operation="clear_binary_vectors"
) from e
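# --- Hypothetical usage sketch (illustration only, not part of this commit) ---
# Shows the intended lifecycle: the context manager creates the schema via
# _ensure_schema(), chunks are batch-inserted, and candidates are later fetched by
# global chunk ID. All paths, IDs, and the 32-byte binary vector below are made-up
# example values.
def _demo(project_root: Path) -> None:
    meta_db = project_root / "_vectors_meta.db"
    with VectorMetadataStore(meta_db) as store:
        store.add_chunks([
            {
                "chunk_id": 1,
                "file_path": "src/app.py",
                "content": "def main() -> None: ...",
                "start_line": 1,
                "end_line": 3,
                "category": "code",
                "metadata": {"language": "python"},
                "source_index_db": "src/_index.db",
            },
        ])
        # Coarse-stage binary vectors live in the same database; how the bits are
        # packed into bytes is up to the caller.
        store.add_binary_vectors([1], [b"\x0f" * 32])
        for chunk in store.get_chunks_by_ids([1], category="code"):
            print(chunk["file_path"], chunk["start_line"], chunk["end_line"])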