Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-13 02:41:50 +08:00)
Refactor code structure and remove redundant changes
32  codex-lens/build/lib/codexlens/storage/__init__.py  Normal file
@@ -0,0 +1,32 @@
"""Storage backends for CodexLens."""

from __future__ import annotations

from .sqlite_store import SQLiteStore
from .path_mapper import PathMapper
from .registry import RegistryStore, ProjectInfo, DirMapping
from .dir_index import DirIndexStore, SubdirLink, FileEntry
from .index_tree import IndexTreeBuilder, BuildResult, DirBuildResult
from .vector_meta_store import VectorMetadataStore

__all__ = [
    # Legacy (workspace-local)
    "SQLiteStore",
    # Path mapping
    "PathMapper",
    # Global registry
    "RegistryStore",
    "ProjectInfo",
    "DirMapping",
    # Directory index
    "DirIndexStore",
    "SubdirLink",
    "FileEntry",
    # Tree builder
    "IndexTreeBuilder",
    "BuildResult",
    "DirBuildResult",
    # Vector metadata
    "VectorMetadataStore",
]
2358  codex-lens/build/lib/codexlens/storage/dir_index.py  Normal file
File diff suppressed because it is too large
32  codex-lens/build/lib/codexlens/storage/file_cache.py  Normal file
@@ -0,0 +1,32 @@
"""Simple filesystem cache helpers."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class FileCache:
    """Caches file mtimes for incremental indexing."""

    cache_path: Path

    def load_mtime(self, path: Path) -> Optional[float]:
        try:
            key = self._key_for(path)
            record = (self.cache_path / key).read_text(encoding="utf-8")
            return float(record)
        except Exception:
            return None

    def store_mtime(self, path: Path, mtime: float) -> None:
        self.cache_path.mkdir(parents=True, exist_ok=True)
        key = self._key_for(path)
        (self.cache_path / key).write_text(str(mtime), encoding="utf-8")

    def _key_for(self, path: Path) -> str:
        safe = str(path).replace(":", "_").replace("\\", "_").replace("/", "_")
        return f"{safe}.mtime"
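A minimal usage sketch for FileCache (not part of this commit; the cache directory and source file names are hypothetical): skip re-indexing a file whose mtime matches the cached value.

from pathlib import Path

from codexlens.storage.file_cache import FileCache

cache = FileCache(cache_path=Path(".codexlens_cache"))  # hypothetical cache directory
source = Path("src/app.py")                             # hypothetical source file
current = source.stat().st_mtime
if cache.load_mtime(source) != current:
    # ... re-index the file here ...
    cache.store_mtime(source, current)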
398  codex-lens/build/lib/codexlens/storage/global_index.py  Normal file
@@ -0,0 +1,398 @@
"""Global cross-directory symbol index for fast lookups.

Stores symbols for an entire project in a single SQLite database so symbol search
does not require traversing every directory _index.db.

This index is updated incrementally during file indexing (delete+insert per file)
to avoid expensive batch rebuilds.
"""

from __future__ import annotations

import logging
import sqlite3
import threading
from pathlib import Path
from typing import List, Optional, Tuple

from codexlens.entities import Symbol
from codexlens.errors import StorageError


class GlobalSymbolIndex:
    """Project-wide symbol index with incremental updates."""

    SCHEMA_VERSION = 1
    DEFAULT_DB_NAME = "_global_symbols.db"

    def __init__(self, db_path: str | Path, project_id: int) -> None:
        self.db_path = Path(db_path).resolve()
        self.project_id = int(project_id)
        self._lock = threading.RLock()
        self._conn: Optional[sqlite3.Connection] = None
        self.logger = logging.getLogger(__name__)

    def initialize(self) -> None:
        """Create database and schema if not exists."""
        with self._lock:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            conn = self._get_connection()

            current_version = self._get_schema_version(conn)
            if current_version > self.SCHEMA_VERSION:
                raise StorageError(
                    f"Database schema version {current_version} is newer than "
                    f"supported version {self.SCHEMA_VERSION}. "
                    f"Please update the application or use a compatible database.",
                    db_path=str(self.db_path),
                    operation="initialize",
                    details={
                        "current_version": current_version,
                        "supported_version": self.SCHEMA_VERSION,
                    },
                )

            if current_version == 0:
                self._create_schema(conn)
                self._set_schema_version(conn, self.SCHEMA_VERSION)
            elif current_version < self.SCHEMA_VERSION:
                self._apply_migrations(conn, current_version)
                self._set_schema_version(conn, self.SCHEMA_VERSION)

            conn.commit()

    def close(self) -> None:
        """Close database connection."""
        with self._lock:
            if self._conn is not None:
                try:
                    self._conn.close()
                except Exception:
                    pass
                finally:
                    self._conn = None

    def __enter__(self) -> "GlobalSymbolIndex":
        self.initialize()
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        self.close()

    def add_symbol(self, symbol: Symbol, file_path: str | Path, index_path: str | Path) -> None:
        """Insert a single symbol (idempotent) for incremental updates."""
        file_path_str = str(Path(file_path).resolve())
        index_path_str = str(Path(index_path).resolve())

        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute(
                    """
                    INSERT INTO global_symbols(
                        project_id, symbol_name, symbol_kind,
                        file_path, start_line, end_line, index_path
                    )
                    VALUES(?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(
                        project_id, symbol_name, symbol_kind,
                        file_path, start_line, end_line
                    )
                    DO UPDATE SET
                        index_path=excluded.index_path
                    """,
                    (
                        self.project_id,
                        symbol.name,
                        symbol.kind,
                        file_path_str,
                        symbol.range[0],
                        symbol.range[1],
                        index_path_str,
                    ),
                )
                conn.commit()
            except sqlite3.DatabaseError as exc:
                conn.rollback()
                raise StorageError(
                    f"Failed to add symbol {symbol.name}: {exc}",
                    db_path=str(self.db_path),
                    operation="add_symbol",
                ) from exc

    def update_file_symbols(
        self,
        file_path: str | Path,
        symbols: List[Symbol],
        index_path: str | Path | None = None,
    ) -> None:
        """Replace all symbols for a file atomically (delete + insert)."""
        file_path_str = str(Path(file_path).resolve())

        index_path_str: Optional[str]
        if index_path is not None:
            index_path_str = str(Path(index_path).resolve())
        else:
            index_path_str = self._get_existing_index_path(file_path_str)

        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute("BEGIN")
                conn.execute(
                    "DELETE FROM global_symbols WHERE project_id=? AND file_path=?",
                    (self.project_id, file_path_str),
                )

                if symbols:
                    if not index_path_str:
                        raise StorageError(
                            "index_path is required when inserting symbols for a new file",
                            db_path=str(self.db_path),
                            operation="update_file_symbols",
                            details={"file_path": file_path_str},
                        )

                    rows = [
                        (
                            self.project_id,
                            s.name,
                            s.kind,
                            file_path_str,
                            s.range[0],
                            s.range[1],
                            index_path_str,
                        )
                        for s in symbols
                    ]
                    conn.executemany(
                        """
                        INSERT INTO global_symbols(
                            project_id, symbol_name, symbol_kind,
                            file_path, start_line, end_line, index_path
                        )
                        VALUES(?, ?, ?, ?, ?, ?, ?)
                        ON CONFLICT(
                            project_id, symbol_name, symbol_kind,
                            file_path, start_line, end_line
                        )
                        DO UPDATE SET
                            index_path=excluded.index_path
                        """,
                        rows,
                    )

                conn.commit()
            except sqlite3.DatabaseError as exc:
                conn.rollback()
                raise StorageError(
                    f"Failed to update symbols for {file_path_str}: {exc}",
                    db_path=str(self.db_path),
                    operation="update_file_symbols",
                ) from exc

    def delete_file_symbols(self, file_path: str | Path) -> int:
        """Remove all symbols for a file. Returns number of rows deleted."""
        file_path_str = str(Path(file_path).resolve())
        with self._lock:
            conn = self._get_connection()
            try:
                cur = conn.execute(
                    "DELETE FROM global_symbols WHERE project_id=? AND file_path=?",
                    (self.project_id, file_path_str),
                )
                conn.commit()
                return int(cur.rowcount or 0)
            except sqlite3.DatabaseError as exc:
                conn.rollback()
                raise StorageError(
                    f"Failed to delete symbols for {file_path_str}: {exc}",
                    db_path=str(self.db_path),
                    operation="delete_file_symbols",
                ) from exc

    def search(
        self,
        name: str,
        kind: Optional[str] = None,
        limit: int = 50,
        prefix_mode: bool = True,
    ) -> List[Symbol]:
        """Search symbols and return full Symbol objects."""
        if prefix_mode:
            pattern = f"{name}%"
        else:
            pattern = f"%{name}%"

        with self._lock:
            conn = self._get_connection()
            if kind:
                rows = conn.execute(
                    """
                    SELECT symbol_name, symbol_kind, file_path, start_line, end_line
                    FROM global_symbols
                    WHERE project_id=? AND symbol_name LIKE ? AND symbol_kind=?
                    ORDER BY symbol_name
                    LIMIT ?
                    """,
                    (self.project_id, pattern, kind, limit),
                ).fetchall()
            else:
                rows = conn.execute(
                    """
                    SELECT symbol_name, symbol_kind, file_path, start_line, end_line
                    FROM global_symbols
                    WHERE project_id=? AND symbol_name LIKE ?
                    ORDER BY symbol_name
                    LIMIT ?
                    """,
                    (self.project_id, pattern, limit),
                ).fetchall()

            return [
                Symbol(
                    name=row["symbol_name"],
                    kind=row["symbol_kind"],
                    range=(row["start_line"], row["end_line"]),
                    file=row["file_path"],
                )
                for row in rows
            ]

    def search_symbols(
        self,
        name: str,
        kind: Optional[str] = None,
        limit: int = 50,
        prefix_mode: bool = True,
    ) -> List[Tuple[str, Tuple[int, int]]]:
        """Search symbols and return only (file_path, (start_line, end_line))."""
        symbols = self.search(name=name, kind=kind, limit=limit, prefix_mode=prefix_mode)
        return [(s.file or "", s.range) for s in symbols]

    def get_file_symbols(self, file_path: str | Path) -> List[Symbol]:
        """Get all symbols in a specific file, sorted by start_line.

        Args:
            file_path: Full path to the file

        Returns:
            List of Symbol objects sorted by start_line
        """
        file_path_str = str(Path(file_path).resolve())

        with self._lock:
            conn = self._get_connection()
            rows = conn.execute(
                """
                SELECT symbol_name, symbol_kind, file_path, start_line, end_line
                FROM global_symbols
                WHERE project_id=? AND file_path=?
                ORDER BY start_line
                """,
                (self.project_id, file_path_str),
            ).fetchall()

            return [
                Symbol(
                    name=row["symbol_name"],
                    kind=row["symbol_kind"],
                    range=(row["start_line"], row["end_line"]),
                    file=row["file_path"],
                )
                for row in rows
            ]

    def _get_existing_index_path(self, file_path_str: str) -> Optional[str]:
        with self._lock:
            conn = self._get_connection()
            row = conn.execute(
                """
                SELECT index_path
                FROM global_symbols
                WHERE project_id=? AND file_path=?
                LIMIT 1
                """,
                (self.project_id, file_path_str),
            ).fetchone()
            return str(row["index_path"]) if row else None

    def _get_schema_version(self, conn: sqlite3.Connection) -> int:
        try:
            row = conn.execute("PRAGMA user_version").fetchone()
            return int(row[0]) if row else 0
        except Exception:
            return 0

    def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None:
        conn.execute(f"PRAGMA user_version = {int(version)}")

    def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None:
        # No migrations yet (v1).
        _ = (conn, from_version)
        return

    def _get_connection(self) -> sqlite3.Connection:
        if self._conn is None:
            self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA synchronous=NORMAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            self._conn.execute("PRAGMA mmap_size=30000000000")
        return self._conn

    def _create_schema(self, conn: sqlite3.Connection) -> None:
        try:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS global_symbols (
                    id INTEGER PRIMARY KEY,
                    project_id INTEGER NOT NULL,
                    symbol_name TEXT NOT NULL,
                    symbol_kind TEXT NOT NULL,
                    file_path TEXT NOT NULL,
                    start_line INTEGER,
                    end_line INTEGER,
                    index_path TEXT NOT NULL,
                    UNIQUE(
                        project_id, symbol_name, symbol_kind,
                        file_path, start_line, end_line
                    )
                )
                """
            )

            # Required by optimization spec.
            conn.execute(
                """
                CREATE INDEX IF NOT EXISTS idx_global_symbols_name_kind
                ON global_symbols(symbol_name, symbol_kind)
                """
            )
            # Used by common queries (project-scoped name lookups).
            conn.execute(
                """
                CREATE INDEX IF NOT EXISTS idx_global_symbols_project_name_kind
                ON global_symbols(project_id, symbol_name, symbol_kind)
                """
            )
            conn.execute(
                """
                CREATE INDEX IF NOT EXISTS idx_global_symbols_project_file
                ON global_symbols(project_id, file_path)
                """
            )
            conn.execute(
                """
                CREATE INDEX IF NOT EXISTS idx_global_symbols_project_index_path
                ON global_symbols(project_id, index_path)
                """
            )
        except sqlite3.DatabaseError as exc:
            raise StorageError(
                f"Failed to initialize global symbol schema: {exc}",
                db_path=str(self.db_path),
                operation="_create_schema",
            ) from exc
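A minimal usage sketch for GlobalSymbolIndex (not part of this commit; the database path, project id, file paths, and the Symbol keyword fields are assumptions based on how this module itself constructs Symbol objects): replace a file's symbols and then run a prefix search.

from pathlib import Path

from codexlens.entities import Symbol
from codexlens.storage.global_index import GlobalSymbolIndex

with GlobalSymbolIndex(Path("/tmp/_global_symbols.db"), project_id=1) as index:
    # Hypothetical symbol; only name, kind, and range are read when inserting.
    sym = Symbol(name="load_mtime", kind="function", range=(16, 22), file="src/file_cache.py")
    index.update_file_symbols("src/file_cache.py", [sym], index_path="src/_index.db")
    hits = index.search("load_", prefix_mode=True)
    for hit in hits:
        print(hit.name, hit.file, hit.range)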
1064  codex-lens/build/lib/codexlens/storage/index_tree.py  Normal file
File diff suppressed because it is too large
136  codex-lens/build/lib/codexlens/storage/merkle_tree.py  Normal file
@@ -0,0 +1,136 @@
"""Merkle tree utilities for change detection.

This module provides a generic, file-system based Merkle tree implementation
that can be used to efficiently diff directory states.
"""

from __future__ import annotations

import hashlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Iterable, List, Optional


def sha256_bytes(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()


def sha256_text(text: str) -> str:
    return sha256_bytes(text.encode("utf-8", errors="ignore"))


@dataclass
class MerkleNode:
    """A Merkle node representing either a file (leaf) or directory (internal)."""

    name: str
    rel_path: str
    hash: str
    is_dir: bool
    children: Dict[str, "MerkleNode"] = field(default_factory=dict)

    def iter_files(self) -> Iterable["MerkleNode"]:
        if not self.is_dir:
            yield self
            return
        for child in self.children.values():
            yield from child.iter_files()


@dataclass
class MerkleTree:
    """Merkle tree for a directory snapshot."""

    root: MerkleNode

    @classmethod
    def build_from_directory(cls, root_dir: Path) -> "MerkleTree":
        root_dir = Path(root_dir).resolve()
        node = cls._build_node(root_dir, base=root_dir)
        return cls(root=node)

    @classmethod
    def _build_node(cls, path: Path, *, base: Path) -> MerkleNode:
        if path.is_file():
            rel = str(path.relative_to(base)).replace("\\", "/")
            return MerkleNode(
                name=path.name,
                rel_path=rel,
                hash=sha256_bytes(path.read_bytes()),
                is_dir=False,
            )

        if not path.is_dir():
            rel = str(path.relative_to(base)).replace("\\", "/")
            return MerkleNode(name=path.name, rel_path=rel, hash="", is_dir=False)

        children: Dict[str, MerkleNode] = {}
        for child in sorted(path.iterdir(), key=lambda p: p.name):
            child_node = cls._build_node(child, base=base)
            children[child_node.name] = child_node

        items = [
            f"{'d' if n.is_dir else 'f'}:{name}:{n.hash}"
            for name, n in sorted(children.items(), key=lambda kv: kv[0])
        ]
        dir_hash = sha256_text("\n".join(items))

        rel_path = "." if path == base else str(path.relative_to(base)).replace("\\", "/")
        return MerkleNode(
            name="." if path == base else path.name,
            rel_path=rel_path,
            hash=dir_hash,
            is_dir=True,
            children=children,
        )

    @staticmethod
    def find_changed_files(old: Optional["MerkleTree"], new: Optional["MerkleTree"]) -> List[str]:
        """Find changed/added/removed files between two trees.

        Returns:
            List of relative file paths (POSIX-style separators).
        """
        if old is None and new is None:
            return []
        if old is None:
            return sorted({n.rel_path for n in new.root.iter_files()})  # type: ignore[union-attr]
        if new is None:
            return sorted({n.rel_path for n in old.root.iter_files()})

        changed: set[str] = set()

        def walk(old_node: Optional[MerkleNode], new_node: Optional[MerkleNode]) -> None:
            if old_node is None and new_node is None:
                return

            if old_node is None and new_node is not None:
                changed.update(n.rel_path for n in new_node.iter_files())
                return

            if new_node is None and old_node is not None:
                changed.update(n.rel_path for n in old_node.iter_files())
                return

            assert old_node is not None and new_node is not None

            if old_node.hash == new_node.hash:
                return

            if not old_node.is_dir and not new_node.is_dir:
                changed.add(new_node.rel_path)
                return

            if old_node.is_dir != new_node.is_dir:
                changed.update(n.rel_path for n in old_node.iter_files())
                changed.update(n.rel_path for n in new_node.iter_files())
                return

            names = set(old_node.children.keys()) | set(new_node.children.keys())
            for name in names:
                walk(old_node.children.get(name), new_node.children.get(name))

        walk(old.root, new.root)
        return sorted(changed)
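A minimal usage sketch for MerkleTree (not part of this commit; the snapshot directories are hypothetical): diff two directory snapshots to get the relative paths that need re-indexing.

from pathlib import Path

from codexlens.storage.merkle_tree import MerkleTree

old_tree = MerkleTree.build_from_directory(Path("snapshots/before"))  # hypothetical dir
new_tree = MerkleTree.build_from_directory(Path("snapshots/after"))   # hypothetical dir
for rel_path in MerkleTree.find_changed_files(old_tree, new_tree):
    print("changed:", rel_path)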
154  codex-lens/build/lib/codexlens/storage/migration_manager.py  Normal file
@@ -0,0 +1,154 @@
"""
Manages database schema migrations.

This module provides a framework for applying versioned migrations to the SQLite
database. Migrations are discovered from the `codexlens.storage.migrations`
package and applied sequentially. The database schema version is tracked using
the `user_version` pragma.
"""

import importlib
import logging
import pkgutil
from pathlib import Path
from sqlite3 import Connection
from typing import List, NamedTuple

log = logging.getLogger(__name__)


class Migration(NamedTuple):
    """Represents a single database migration."""

    version: int
    name: str
    upgrade: callable


def discover_migrations() -> List[Migration]:
    """
    Discovers and returns a sorted list of database migrations.

    Migrations are expected to be in the `codexlens.storage.migrations` package,
    with filenames in the format `migration_XXX_description.py`, where XXX is
    the version number. Each migration module must contain an `upgrade` function
    that takes a `sqlite3.Connection` object as its argument.

    Returns:
        A list of Migration objects, sorted by version.
    """
    import codexlens.storage.migrations

    migrations = []
    package_path = Path(codexlens.storage.migrations.__file__).parent

    for _, name, _ in pkgutil.iter_modules([str(package_path)]):
        if name.startswith("migration_"):
            try:
                version = int(name.split("_")[1])
                module = importlib.import_module(f"codexlens.storage.migrations.{name}")
                if hasattr(module, "upgrade"):
                    migrations.append(
                        Migration(version=version, name=name, upgrade=module.upgrade)
                    )
                else:
                    log.warning(f"Migration {name} is missing 'upgrade' function.")
            except (ValueError, IndexError) as e:
                log.warning(f"Could not parse migration name {name}: {e}")
            except ImportError as e:
                log.warning(f"Could not import migration {name}: {e}")

    migrations.sort(key=lambda m: m.version)
    return migrations


class MigrationManager:
    """
    Manages the application of migrations to a database.
    """

    def __init__(self, db_conn: Connection):
        """
        Initializes the MigrationManager.

        Args:
            db_conn: The SQLite database connection.
        """
        self.db_conn = db_conn
        self.migrations = discover_migrations()

    def get_current_version(self) -> int:
        """
        Gets the current version of the database schema.

        Returns:
            The current schema version number.
        """
        return self.db_conn.execute("PRAGMA user_version").fetchone()[0]

    def set_version(self, version: int):
        """
        Sets the database schema version.

        Args:
            version: The version number to set.
        """
        self.db_conn.execute(f"PRAGMA user_version = {version}")
        log.info(f"Database schema version set to {version}")

    def apply_migrations(self):
        """
        Applies all pending migrations to the database.

        This method checks the current database version and applies all
        subsequent migrations in order. Each migration is applied within
        a transaction, unless the migration manages its own transactions.
        """
        current_version = self.get_current_version()
        log.info(f"Current database schema version: {current_version}")

        for migration in self.migrations:
            if migration.version > current_version:
                log.info(f"Applying migration {migration.version}: {migration.name}...")
                try:
                    # Check if a transaction is already in progress
                    in_transaction = self.db_conn.in_transaction

                    # Only start transaction if not already in one
                    if not in_transaction:
                        self.db_conn.execute("BEGIN")

                    migration.upgrade(self.db_conn)
                    self.set_version(migration.version)

                    # Only commit if we started the transaction and it's still active
                    if not in_transaction and self.db_conn.in_transaction:
                        self.db_conn.execute("COMMIT")

                    log.info(
                        f"Successfully applied migration {migration.version}: {migration.name}"
                    )
                except Exception as e:
                    log.error(
                        f"Failed to apply migration {migration.version}: {migration.name}. Error: {e}",
                        exc_info=True,
                    )
                    # Try to rollback if transaction is active
                    try:
                        if self.db_conn.in_transaction:
                            self.db_conn.execute("ROLLBACK")
                    except Exception:
                        pass  # Ignore rollback errors
                    raise

        latest_migration_version = self.migrations[-1].version if self.migrations else 0
        if current_version < latest_migration_version:
            # This case can be hit if migrations were applied but the loop was exited
            # and set_version was not called for the last one for some reason.
            # To be safe, we explicitly set the version to the latest known migration.
            final_version = self.get_current_version()
            if final_version != latest_migration_version:
                log.warning(
                    f"Database version ({final_version}) is not the latest migration version "
                    f"({latest_migration_version}). This may indicate a problem."
                )

        log.info("All pending migrations applied successfully.")
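A minimal sketch of driving MigrationManager (not part of this commit; the database path is hypothetical): open a connection, report the schema version, and apply any pending migrations discovered in codexlens.storage.migrations.

import sqlite3

from codexlens.storage.migration_manager import MigrationManager

conn = sqlite3.connect("_index.db")  # hypothetical index database
manager = MigrationManager(conn)
print("schema version before:", manager.get_current_version())
manager.apply_migrations()
print("schema version after:", manager.get_current_version())
conn.close()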
@@ -0,0 +1 @@
# This file makes the 'migrations' directory a Python package.
@@ -0,0 +1,123 @@
"""
Migration 001: Normalize keywords into separate tables.

This migration introduces two new tables, `keywords` and `file_keywords`, to
store semantic keywords in a normalized fashion. It then migrates the existing
keywords from the `semantic_data` JSON blob in the `files` table into these
new tables. This is intended to speed up keyword-based searches significantly.
"""

import json
import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to normalize keywords.

    - Creates `keywords` and `file_keywords` tables.
    - Creates indexes for efficient querying.
    - Migrates data from `files.semantic_data` to the new tables.

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    log.info("Creating 'keywords' and 'file_keywords' tables...")
    # Create a table to store unique keywords
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS keywords (
            id INTEGER PRIMARY KEY,
            keyword TEXT NOT NULL UNIQUE
        )
        """
    )

    # Create a join table to link files and keywords (many-to-many)
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS file_keywords (
            file_id INTEGER NOT NULL,
            keyword_id INTEGER NOT NULL,
            PRIMARY KEY (file_id, keyword_id),
            FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
            FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
        )
        """
    )

    log.info("Creating indexes for new keyword tables...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords (keyword)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords (file_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords (keyword_id)")

    log.info("Migrating existing keywords from 'semantic_metadata' table...")

    # Check if semantic_metadata table exists before querying
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'")
    if not cursor.fetchone():
        log.info("No 'semantic_metadata' table found, skipping data migration.")
        return

    # Check if 'keywords' column exists in semantic_metadata table
    # (current schema may already use normalized tables without this column)
    cursor.execute("PRAGMA table_info(semantic_metadata)")
    columns = {row[1] for row in cursor.fetchall()}
    if "keywords" not in columns:
        log.info("No 'keywords' column in semantic_metadata table, skipping data migration.")
        return

    cursor.execute("SELECT file_id, keywords FROM semantic_metadata WHERE keywords IS NOT NULL AND keywords != ''")

    files_to_migrate = cursor.fetchall()
    if not files_to_migrate:
        log.info("No existing files with semantic metadata to migrate.")
        return

    log.info(f"Found {len(files_to_migrate)} files with semantic metadata to migrate.")

    for file_id, keywords_json in files_to_migrate:
        if not keywords_json:
            continue
        try:
            keywords = json.loads(keywords_json)

            if not isinstance(keywords, list):
                log.warning(f"Keywords for file_id {file_id} is not a list, skipping.")
                continue

            for keyword in keywords:
                if not isinstance(keyword, str):
                    log.warning(f"Non-string keyword '{keyword}' found for file_id {file_id}, skipping.")
                    continue

                keyword = keyword.strip()
                if not keyword:
                    continue

                # Get or create keyword_id
                cursor.execute("INSERT OR IGNORE INTO keywords (keyword) VALUES (?)", (keyword,))
                cursor.execute("SELECT id FROM keywords WHERE keyword = ?", (keyword,))
                keyword_id_result = cursor.fetchone()

                if keyword_id_result:
                    keyword_id = keyword_id_result[0]
                    # Link file to keyword
                    cursor.execute(
                        "INSERT OR IGNORE INTO file_keywords (file_id, keyword_id) VALUES (?, ?)",
                        (file_id, keyword_id),
                    )
                else:
                    log.error(f"Failed to retrieve or create keyword_id for keyword: {keyword}")

        except json.JSONDecodeError as e:
            log.warning(f"Could not parse keywords for file_id {file_id}: {e}")
        except Exception as e:
            log.error(f"An unexpected error occurred during migration for file_id {file_id}: {e}", exc_info=True)

    log.info("Finished migrating keywords.")
@@ -0,0 +1,48 @@
"""
Migration 002: Add token_count and symbol_type to symbols table.

This migration adds token counting metadata to symbols for accurate chunk
splitting and performance optimization. It also adds symbol_type for better
filtering in searches.
"""

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to add token metadata to symbols.

    - Adds token_count column to symbols table
    - Adds symbol_type column to symbols table (for future use)
    - Creates index on symbol_type for efficient filtering
    - Backfills existing symbols with NULL token_count (to be calculated lazily)

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    log.info("Adding token_count column to symbols table...")
    try:
        cursor.execute("ALTER TABLE symbols ADD COLUMN token_count INTEGER")
        log.info("Successfully added token_count column.")
    except Exception as e:
        # Column might already exist
        log.warning(f"Could not add token_count column (might already exist): {e}")

    log.info("Adding symbol_type column to symbols table...")
    try:
        cursor.execute("ALTER TABLE symbols ADD COLUMN symbol_type TEXT")
        log.info("Successfully added symbol_type column.")
    except Exception as e:
        # Column might already exist
        log.warning(f"Could not add symbol_type column (might already exist): {e}")

    log.info("Creating index on symbol_type for efficient filtering...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")

    log.info("Migration 002 completed successfully.")
@@ -0,0 +1,232 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.

This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching

Both tables are synchronized with the files table via triggers for automatic updates.
"""

import logging
from sqlite3 import Connection

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to add dual FTS tables.

    - Drops old files_fts table and triggers
    - Creates files_fts_exact with unicode61 tokenizer
    - Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
    - Creates synchronized triggers for both tables
    - Rebuilds FTS indexes from files table

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    try:
        # Check trigram support
        has_trigram = check_trigram_support(db_conn)
        version = get_sqlite_version(db_conn)
        log.info(f"SQLite version: {'.'.join(map(str, version))}")

        if has_trigram:
            log.info("Trigram tokenizer available, using for fuzzy FTS table")
            fuzzy_tokenizer = "trigram"
        else:
            log.warning(
                f"Trigram tokenizer not available (requires SQLite >= 3.34), "
                f"using extended unicode61 tokenizer for fuzzy matching"
            )
            fuzzy_tokenizer = "unicode61 tokenchars '_-.'"

        # Start transaction
        cursor.execute("BEGIN TRANSACTION")

        # Check if files table has 'name' column (v2 schema doesn't have it)
        cursor.execute("PRAGMA table_info(files)")
        columns = {row[1] for row in cursor.fetchall()}

        if 'name' not in columns:
            log.info("Adding 'name' column to files table (v2 schema upgrade)...")
            # Add name column
            cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
            # Populate name from path (extract filename from last '/')
            # Use Python to do the extraction since SQLite doesn't have reverse()
            cursor.execute("SELECT rowid, path FROM files")
            rows = cursor.fetchall()
            for rowid, path in rows:
                # Extract filename from path
                name = path.split('/')[-1] if '/' in path else path
                cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))

        # Rename 'path' column to 'full_path' if needed
        if 'path' in columns and 'full_path' not in columns:
            log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
            # Check if indexed_at column exists in v2 schema
            has_indexed_at = 'indexed_at' in columns
            has_mtime = 'mtime' in columns

            # SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
            cursor.execute("""
                CREATE TABLE files_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    full_path TEXT NOT NULL UNIQUE,
                    content TEXT,
                    language TEXT,
                    mtime REAL,
                    indexed_at TEXT
                )
            """)

            # Build INSERT statement based on available columns
            # Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT
            if has_indexed_at and has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
                    SELECT name, path, content, language, mtime, indexed_at FROM files
                """)
            elif has_indexed_at:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, indexed_at)
                    SELECT name, path, content, language, indexed_at FROM files
                """)
            elif has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime)
                    SELECT name, path, content, language, mtime FROM files
                """)
            else:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language)
                    SELECT name, path, content, language FROM files
                """)

            cursor.execute("DROP TABLE files")
            cursor.execute("ALTER TABLE files_new RENAME TO files")

        log.info("Dropping old FTS triggers and table...")
        # Drop old triggers
        cursor.execute("DROP TRIGGER IF EXISTS files_ai")
        cursor.execute("DROP TRIGGER IF EXISTS files_ad")
        cursor.execute("DROP TRIGGER IF EXISTS files_au")

        # Drop old FTS table
        cursor.execute("DROP TABLE IF EXISTS files_fts")

        # Create exact FTS table (unicode61 with underscores/hyphens/dots as token chars)
        # Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
        log.info("Creating files_fts_exact table with unicode61 tokenizer...")
        cursor.execute(
            """
            CREATE VIRTUAL TABLE files_fts_exact USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_-.'"
            )
            """
        )

        # Create fuzzy FTS table (trigram or extended unicode61)
        log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
        cursor.execute(
            f"""
            CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="{fuzzy_tokenizer}"
            )
            """
        )

        # Create synchronized triggers for files_fts_exact
        log.info("Creating triggers for files_fts_exact...")
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Create synchronized triggers for files_fts_fuzzy
        log.info("Creating triggers for files_fts_fuzzy...")
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Rebuild FTS indexes from files table
        log.info("Rebuilding FTS indexes from files table...")
        cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
        cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")

        # Commit transaction
        cursor.execute("COMMIT")
        log.info("Migration 004 completed successfully")

        # Vacuum to reclaim space (outside transaction)
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
        except Exception as e:
            log.warning(f"VACUUM failed (non-critical): {e}")

    except Exception as e:
        log.error(f"Migration 004 failed: {e}")
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            pass
        raise
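A minimal query sketch against the two FTS tables created above (not part of this commit; the database path and search strings are hypothetical): the exact table matches whole tokens such as qualified names, while the fuzzy (trigram) table can match substrings of three or more characters.

import sqlite3

conn = sqlite3.connect("_index.db")  # hypothetical index database
exact_hits = conn.execute(
    "SELECT full_path FROM files_fts_exact WHERE files_fts_exact MATCH ?",
    ('"PortRole.FLOW"',),
).fetchall()
fuzzy_hits = conn.execute(
    "SELECT full_path FROM files_fts_fuzzy WHERE files_fts_fuzzy MATCH ?",
    ('"ortRol"',),
).fetchall()
conn.close()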
@@ -0,0 +1,196 @@
"""
Migration 005: Remove unused and redundant database fields.

This migration removes four problematic fields identified by Gemini analysis:

1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
   - Data: Migrated to normalized file_keywords table in migration 001
   - Impact: Column now redundant, remove to prevent sync issues

2. **symbols.token_count** (unused - always NULL)
   - Data: Never populated, always NULL
   - Impact: No data loss, just removes unused column

3. **symbols.symbol_type** (redundant - duplicates kind)
   - Data: Redundant with symbols.kind field
   - Impact: No data loss, kind field contains same information

4. **subdirs.direct_files** (unused - never displayed)
   - Data: Never used in queries or display logic
   - Impact: No data loss, just removes unused column

Schema changes use table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """Remove unused and redundant fields from schema.

    Note: Transaction management is handled by MigrationManager.
    This migration should NOT start its own transaction.

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    # Step 1: Remove semantic_metadata.keywords (if column exists)
    log.info("Checking semantic_metadata.keywords column...")

    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'"
    )
    if cursor.fetchone():
        # Check if keywords column exists
        cursor.execute("PRAGMA table_info(semantic_metadata)")
        columns = {row[1] for row in cursor.fetchall()}

        if "keywords" in columns:
            log.info("Removing semantic_metadata.keywords column...")
            cursor.execute("""
                CREATE TABLE semantic_metadata_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL UNIQUE,
                    summary TEXT,
                    purpose TEXT,
                    llm_tool TEXT,
                    generated_at REAL,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
            """)

            cursor.execute("""
                INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at)
                SELECT id, file_id, summary, purpose, llm_tool, generated_at
                FROM semantic_metadata
            """)

            cursor.execute("DROP TABLE semantic_metadata")
            cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata")

            # Recreate index
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)"
            )
            log.info("Removed semantic_metadata.keywords column")
        else:
            log.info("semantic_metadata.keywords column does not exist, skipping")
    else:
        log.info("semantic_metadata table does not exist, skipping")

    # Step 2: Remove symbols.token_count and symbols.symbol_type (if columns exist)
    log.info("Checking symbols.token_count and symbols.symbol_type columns...")

    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'"
    )
    if cursor.fetchone():
        # Check if token_count or symbol_type columns exist
        cursor.execute("PRAGMA table_info(symbols)")
        columns = {row[1] for row in cursor.fetchall()}

        if "token_count" in columns or "symbol_type" in columns:
            log.info("Removing symbols.token_count and symbols.symbol_type columns...")
            cursor.execute("""
                CREATE TABLE symbols_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL,
                    name TEXT NOT NULL,
                    kind TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
            """)

            cursor.execute("""
                INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line)
                SELECT id, file_id, name, kind, start_line, end_line
                FROM symbols
            """)

            cursor.execute("DROP TABLE symbols")
            cursor.execute("ALTER TABLE symbols_new RENAME TO symbols")

            # Recreate indexes
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            log.info("Removed symbols.token_count and symbols.symbol_type columns")
        else:
            log.info("symbols.token_count/symbol_type columns do not exist, skipping")
    else:
        log.info("symbols table does not exist, skipping")

    # Step 3: Remove subdirs.direct_files (if column exists)
    log.info("Checking subdirs.direct_files column...")

    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'"
    )
    if cursor.fetchone():
        # Check if direct_files column exists
        cursor.execute("PRAGMA table_info(subdirs)")
        columns = {row[1] for row in cursor.fetchall()}

        if "direct_files" in columns:
            log.info("Removing subdirs.direct_files column...")
            cursor.execute("""
                CREATE TABLE subdirs_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL UNIQUE,
                    index_path TEXT NOT NULL,
                    files_count INTEGER DEFAULT 0,
                    last_updated REAL
                )
            """)

            cursor.execute("""
                INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated)
                SELECT id, name, index_path, files_count, last_updated
                FROM subdirs
            """)

            cursor.execute("DROP TABLE subdirs")
            cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs")

            # Recreate index
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
            log.info("Removed subdirs.direct_files column")
        else:
            log.info("subdirs.direct_files column does not exist, skipping")
    else:
        log.info("subdirs table does not exist, skipping")

    log.info("Migration 005 completed successfully")

    # Vacuum to reclaim space (outside transaction, optional)
    # Note: VACUUM cannot run inside a transaction, so we skip it here
    # The caller can run VACUUM separately if desired


def downgrade(db_conn: Connection):
    """Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files).

    This is a placeholder - true downgrade is not feasible as data is lost.
    The migration is designed to be one-way since removed fields are unused/redundant.

    Args:
        db_conn: The SQLite database connection.
    """
    log.warning(
        "Migration 005 downgrade not supported - removed fields are unused/redundant. "
        "Data cannot be restored."
    )
    raise NotImplementedError(
        "Migration 005 downgrade not supported - this is a one-way migration"
    )
@@ -0,0 +1,37 @@
"""
Migration 006: Ensure relationship tables and indexes exist.

This migration is intentionally idempotent. It creates the `code_relationships`
table (used for graph visualization) and its indexes if missing.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Ensuring code_relationships table exists...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS code_relationships (
            id INTEGER PRIMARY KEY,
            source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE,
            target_qualified_name TEXT NOT NULL,
            relationship_type TEXT NOT NULL,
            source_line INTEGER NOT NULL,
            target_file TEXT
        )
        """
    )

    log.info("Ensuring relationship indexes exist...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
@@ -0,0 +1,47 @@
"""
Migration 007: Add precomputed graph neighbor table for search expansion.

Adds:
- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids)

This table is derived data (a cache) and is safe to rebuild at any time.
The migration is intentionally idempotent.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating graph_neighbors table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS graph_neighbors (
            source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            relationship_depth INTEGER NOT NULL,
            PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
        )
        """
    )

    log.info("Creating indexes for graph_neighbors...")
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth
        ON graph_neighbors(source_symbol_id, relationship_depth)
        """
    )
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor
        ON graph_neighbors(neighbor_symbol_id)
        """
    )
@@ -0,0 +1,81 @@
"""
Migration 008: Add Merkle hash tables for content-based incremental indexing.

Adds:
- merkle_hashes: per-file SHA-256 hashes (keyed by file_id)
- merkle_state: directory-level root hash (single row, id=1)

Backfills merkle_hashes using the existing `files.content` column when available.
"""

from __future__ import annotations

import hashlib
import logging
import time
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating merkle_hashes table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_hashes (
            file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
            sha256 TEXT NOT NULL,
            updated_at REAL
        )
        """
    )

    log.info("Creating merkle_state table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_state (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            root_hash TEXT,
            updated_at REAL
        )
        """
    )

    # Backfill file hashes from stored content (best-effort).
    try:
        rows = cursor.execute("SELECT id, content FROM files").fetchall()
    except Exception as exc:
        log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc)
        return

    now = time.time()
    inserts: list[tuple[int, str, float]] = []

    for row in rows:
        file_id = int(row[0])
        content = row[1]
        if content is None:
            continue
        try:
            digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest()
            inserts.append((file_id, digest, now))
        except Exception:
            continue

    if not inserts:
        return

    log.info("Backfilling %d file hashes...", len(inserts))
    cursor.executemany(
        """
        INSERT INTO merkle_hashes(file_id, sha256, updated_at)
        VALUES(?, ?, ?)
        ON CONFLICT(file_id) DO UPDATE SET
            sha256=excluded.sha256,
            updated_at=excluded.updated_at
        """,
        inserts,
    )
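A minimal sketch of maintaining the directory-level root hash in merkle_state (not part of this commit; the database path and the "file_id:sha256" line format are assumptions, not the format used by MerkleTree): combine the per-file hashes into one digest and upsert the single row.

import hashlib
import sqlite3
import time

conn = sqlite3.connect("_index.db")  # hypothetical index database
rows = conn.execute("SELECT file_id, sha256 FROM merkle_hashes ORDER BY file_id").fetchall()
root = hashlib.sha256("\n".join(f"{fid}:{h}" for fid, h in rows).encode("utf-8")).hexdigest()
conn.execute(
    "INSERT INTO merkle_state(id, root_hash, updated_at) VALUES(1, ?, ?) "
    "ON CONFLICT(id) DO UPDATE SET root_hash=excluded.root_hash, updated_at=excluded.updated_at",
    (root, time.time()),
)
conn.commit()
conn.close()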
@@ -0,0 +1,103 @@
|
||||
"""
|
||||
Migration 009: Add SPLADE sparse retrieval tables.
|
||||
|
||||
This migration introduces SPLADE (Sparse Lexical AnD Expansion) support:
|
||||
- splade_metadata: Model configuration (model name, vocab size, ONNX path)
|
||||
- splade_posting_list: Inverted index mapping token_id -> (chunk_id, weight)
|
||||
|
||||
The SPLADE tables are designed for efficient sparse vector retrieval:
|
||||
- Token-based lookup for query expansion
|
||||
- Chunk-based deletion for index maintenance
|
||||
- Maintains backward compatibility with existing FTS tables
|
||||
"""
|
||||
|
||||
import logging
|
||||
from sqlite3 import Connection
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def upgrade(db_conn: Connection) -> None:
|
||||
"""
|
||||
Adds SPLADE tables for sparse retrieval.
|
||||
|
||||
Creates:
|
||||
- splade_metadata: Stores model configuration and ONNX path
|
||||
- splade_posting_list: Inverted index with token_id -> (chunk_id, weight) mappings
|
||||
- Indexes for efficient token-based and chunk-based lookups
|
||||
|
||||
Args:
|
||||
db_conn: The SQLite database connection.
|
||||
"""
|
||||
cursor = db_conn.cursor()
|
||||
|
||||
log.info("Creating splade_metadata table...")
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS splade_metadata (
|
||||
id INTEGER PRIMARY KEY DEFAULT 1,
|
||||
model_name TEXT NOT NULL,
|
||||
vocab_size INTEGER NOT NULL,
|
||||
onnx_path TEXT,
|
||||
created_at REAL
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
log.info("Creating splade_posting_list table...")
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS splade_posting_list (
|
||||
token_id INTEGER NOT NULL,
|
||||
chunk_id INTEGER NOT NULL,
|
||||
weight REAL NOT NULL,
|
||||
PRIMARY KEY (token_id, chunk_id),
|
||||
FOREIGN KEY (chunk_id) REFERENCES semantic_chunks(id) ON DELETE CASCADE
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
log.info("Creating indexes for splade_posting_list...")
|
||||
# Index for efficient chunk-based lookups (deletion, updates)
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_splade_by_chunk
|
||||
ON splade_posting_list(chunk_id)
|
||||
"""
|
||||
)
|
||||
|
||||
# Index for efficient term-based retrieval
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_splade_by_token
|
||||
ON splade_posting_list(token_id)
|
||||
"""
|
||||
)
|
||||
|
||||
log.info("Migration 009 completed successfully")
|
||||
|
||||
|
||||
def downgrade(db_conn: Connection) -> None:
|
||||
"""
|
||||
Removes SPLADE tables.
|
||||
|
||||
Drops:
|
||||
- splade_posting_list (and associated indexes)
|
||||
- splade_metadata
|
||||
|
||||
Args:
|
||||
db_conn: The SQLite database connection.
|
||||
"""
|
||||
cursor = db_conn.cursor()
|
||||
|
||||
log.info("Dropping SPLADE indexes...")
|
||||
cursor.execute("DROP INDEX IF EXISTS idx_splade_by_chunk")
|
||||
cursor.execute("DROP INDEX IF EXISTS idx_splade_by_token")
|
||||
|
||||
log.info("Dropping splade_posting_list table...")
|
||||
cursor.execute("DROP TABLE IF EXISTS splade_posting_list")
|
||||
|
||||
log.info("Dropping splade_metadata table...")
|
||||
cursor.execute("DROP TABLE IF EXISTS splade_metadata")
|
||||
|
||||
log.info("Migration 009 downgrade completed successfully")
|
||||
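Migration modules like the one above expose upgrade(db_conn) and downgrade(db_conn), each taking a plain sqlite3 connection. A hedged sketch of how such modules could be applied in order is shown below; the module-path convention and the commented usage line are assumptions for illustration, not the project's actual migration runner.

import importlib
import sqlite3
from typing import Iterable

def apply_migrations(db_path: str, module_names: Iterable[str]) -> None:
    """Import each migration module and run its upgrade() inside a transaction."""
    conn = sqlite3.connect(db_path)
    try:
        for name in module_names:
            module = importlib.import_module(name)
            with conn:  # commits on success, rolls back if upgrade() raises
                module.upgrade(conn)
    finally:
        conn.close()

# Usage (module paths are hypothetical; the real package layout may differ):
# apply_migrations("_index.db", ["migrations.m009_splade", "migrations.m010_multi_vector"])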
@@ -0,0 +1,162 @@
|
||||
"""
|
||||
Migration 010: Add multi-vector storage support for cascade retrieval.
|
||||
|
||||
This migration introduces the chunks table with multi-vector support:
|
||||
- chunks: Stores code chunks with multiple embedding types
|
||||
- embedding: Original embedding for backward compatibility
|
||||
- embedding_binary: 256-dim binary vector for coarse ranking (fast)
|
||||
- embedding_dense: 2048-dim dense vector for fine ranking (precise)
|
||||
|
||||
The multi-vector architecture enables cascade retrieval:
|
||||
1. First stage: Fast binary vector search for candidate retrieval
|
||||
2. Second stage: Dense vector reranking for precision
|
||||
"""
|
||||
|
||||
import logging
|
||||
from sqlite3 import Connection
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def upgrade(db_conn: Connection) -> None:
|
||||
"""
|
||||
Adds chunks table with multi-vector embedding columns.
|
||||
|
||||
Creates:
|
||||
- chunks: Table for storing code chunks with multiple embedding types
|
||||
- idx_chunks_file_path: Index for efficient file-based lookups
|
||||
|
||||
Also migrates existing chunks tables by adding new columns if needed.
|
||||
|
||||
Args:
|
||||
db_conn: The SQLite database connection.
|
||||
"""
|
||||
cursor = db_conn.cursor()
|
||||
|
||||
# Check if chunks table already exists
|
||||
table_exists = cursor.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
|
||||
).fetchone()
|
||||
|
||||
if table_exists:
|
||||
# Migrate existing table - add new columns if missing
|
||||
log.info("chunks table exists, checking for missing columns...")
|
||||
|
||||
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
|
||||
existing_columns = {row[1] for row in col_info}
|
||||
|
||||
if "embedding_binary" not in existing_columns:
|
||||
log.info("Adding embedding_binary column to chunks table...")
|
||||
cursor.execute(
|
||||
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
|
||||
)
|
||||
|
||||
if "embedding_dense" not in existing_columns:
|
||||
log.info("Adding embedding_dense column to chunks table...")
|
||||
cursor.execute(
|
||||
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
|
||||
)
|
||||
else:
|
||||
# Create new table with all columns
|
||||
log.info("Creating chunks table with multi-vector support...")
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE chunks (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file_path TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
embedding BLOB,
|
||||
embedding_binary BLOB,
|
||||
embedding_dense BLOB,
|
||||
metadata TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
# Create index for file-based lookups
|
||||
log.info("Creating index for chunks table...")
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
|
||||
ON chunks(file_path)
|
||||
"""
|
||||
)
|
||||
|
||||
log.info("Migration 010 completed successfully")
|
||||
|
||||
|
||||
def downgrade(db_conn: Connection) -> None:
|
||||
"""
|
||||
Removes multi-vector columns from chunks table.
|
||||
|
||||
Note: This does not drop the chunks table entirely to preserve data.
|
||||
Only the new columns added by this migration are removed.
|
||||
|
||||
Args:
|
||||
db_conn: The SQLite database connection.
|
||||
"""
|
||||
cursor = db_conn.cursor()
|
||||
|
||||
log.info("Removing multi-vector columns from chunks table...")
|
||||
|
||||
# SQLite doesn't support DROP COLUMN directly in older versions
|
||||
# We need to recreate the table without the columns
|
||||
|
||||
# Check if chunks table exists
|
||||
table_exists = cursor.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
|
||||
).fetchone()
|
||||
|
||||
if not table_exists:
|
||||
log.info("chunks table does not exist, nothing to downgrade")
|
||||
return
|
||||
|
||||
# Check if the columns exist before trying to remove them
|
||||
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
|
||||
existing_columns = {row[1] for row in col_info}
|
||||
|
||||
needs_migration = (
|
||||
"embedding_binary" in existing_columns or
|
||||
"embedding_dense" in existing_columns
|
||||
)
|
||||
|
||||
if not needs_migration:
|
||||
log.info("Multi-vector columns not present, nothing to remove")
|
||||
return
|
||||
|
||||
# Recreate table without the new columns
|
||||
log.info("Recreating chunks table without multi-vector columns...")
|
||||
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE TABLE chunks_backup (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file_path TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
embedding BLOB,
|
||||
metadata TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at)
|
||||
SELECT id, file_path, content, embedding, metadata, created_at FROM chunks
|
||||
"""
|
||||
)
|
||||
|
||||
cursor.execute("DROP TABLE chunks")
|
||||
cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks")
|
||||
|
||||
# Recreate index
|
||||
cursor.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
|
||||
ON chunks(file_path)
|
||||
"""
|
||||
)
|
||||
|
||||
log.info("Migration 010 downgrade completed successfully")
|
||||
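The chunks table above stores embedding_binary (coarse, 256-dim) and embedding_dense (fine, 2048-dim) as BLOBs. A rough sketch of the two-stage cascade the docstring describes follows: Hamming-distance ranking over the binary vectors to pick candidates, then cosine reranking with the dense vectors. The BLOB layouts assumed here (bit-packed uint8 for binary, little-endian float32 for dense) are illustrative; the project's actual packing is not shown in this file.

import numpy as np

def unpack_binary(blob: bytes, dim: int = 256) -> np.ndarray:
    # Assumed layout: bit-packed uint8, most-significant bit first.
    return np.unpackbits(np.frombuffer(blob, dtype=np.uint8))[:dim]

def unpack_dense(blob: bytes, dim: int = 2048) -> np.ndarray:
    # Assumed layout: little-endian float32.
    return np.frombuffer(blob, dtype=np.float32)[:dim]

def cascade_search(query_bin, query_dense, rows, coarse_k=100, final_k=10):
    """rows: iterable of (chunk_id, binary_blob, dense_blob) from the chunks table."""
    # Stage 1: Hamming distance on binary vectors (cheap coarse ranking).
    scored = []
    for chunk_id, bin_blob, dense_blob in rows:
        dist = int(np.count_nonzero(unpack_binary(bin_blob) != query_bin))
        scored.append((dist, chunk_id, dense_blob))
    candidates = sorted(scored)[:coarse_k]

    # Stage 2: cosine similarity on dense vectors (precise reranking).
    reranked = []
    for _, chunk_id, dense_blob in candidates:
        vec = unpack_dense(dense_blob)
        denom = np.linalg.norm(vec) * np.linalg.norm(query_dense) + 1e-9
        reranked.append((float(np.dot(vec, query_dense) / denom), chunk_id))
    return sorted(reranked, reverse=True)[:final_k]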
300
codex-lens/build/lib/codexlens/storage/path_mapper.py
Normal file
@@ -0,0 +1,300 @@
|
||||
"""Path mapping utilities for source paths and index paths.
|
||||
|
||||
This module provides bidirectional mapping between source code directories
|
||||
and their corresponding index storage locations.
|
||||
|
||||
Storage Structure:
|
||||
~/.codexlens/
|
||||
├── registry.db # Global mapping table
|
||||
└── indexes/
|
||||
└── D/
|
||||
└── Claude_dms3/
|
||||
├── _index.db # Root directory index
|
||||
└── src/
|
||||
└── _index.db # src/ directory index
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def _get_configured_index_root() -> Path:
|
||||
"""Get the index root from environment or config file.
|
||||
|
||||
Priority order:
|
||||
1. CODEXLENS_INDEX_DIR environment variable
|
||||
2. index_dir from ~/.codexlens/config.json
|
||||
3. Default: ~/.codexlens/indexes
|
||||
"""
|
||||
env_override = os.getenv("CODEXLENS_INDEX_DIR")
|
||||
if env_override:
|
||||
return Path(env_override).expanduser().resolve()
|
||||
|
||||
config_file = Path.home() / ".codexlens" / "config.json"
|
||||
if config_file.exists():
|
||||
try:
|
||||
cfg = json.loads(config_file.read_text(encoding="utf-8"))
|
||||
if "index_dir" in cfg:
|
||||
return Path(cfg["index_dir"]).expanduser().resolve()
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
|
||||
return Path.home() / ".codexlens" / "indexes"
|
||||
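For reference, the two override mechanisms read by _get_configured_index_root can be exercised as below. The "index_dir" value and the /tmp path are illustrative, and the snippet writes to the real ~/.codexlens/config.json, so treat it as a sketch rather than setup code.

import json
import os
from pathlib import Path

# Option 1: config file (used when no environment override is set).
config_file = Path.home() / ".codexlens" / "config.json"
config_file.parent.mkdir(parents=True, exist_ok=True)
config_file.write_text(
    json.dumps({"index_dir": "~/codexlens-indexes"}),  # value is illustrative
    encoding="utf-8",
)

# Option 2: environment variable, which takes precedence over the config file.
os.environ["CODEXLENS_INDEX_DIR"] = "/tmp/codexlens-indexes"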
|
||||
|
||||
class PathMapper:
|
||||
"""Bidirectional mapping tool for source paths ↔ index paths.
|
||||
|
||||
Handles cross-platform path normalization and conversion between
|
||||
source code directories and their index storage locations.
|
||||
|
||||
Attributes:
|
||||
DEFAULT_INDEX_ROOT: Default root directory for all indexes
|
||||
INDEX_DB_NAME: Standard name for index database files
|
||||
index_root: Configured index root directory
|
||||
"""
|
||||
|
||||
DEFAULT_INDEX_ROOT = _get_configured_index_root()
|
||||
INDEX_DB_NAME = "_index.db"
|
||||
|
||||
def __init__(self, index_root: Optional[Path] = None):
|
||||
"""Initialize PathMapper with optional custom index root.
|
||||
|
||||
Args:
|
||||
index_root: Custom index root directory. If None, uses DEFAULT_INDEX_ROOT.
|
||||
"""
|
||||
self.index_root = (index_root or self.DEFAULT_INDEX_ROOT).resolve()
|
||||
|
||||
def source_to_index_dir(self, source_path: Path) -> Path:
|
||||
"""Convert source directory to its index directory path.
|
||||
|
||||
Maps a source code directory to where its index data should be stored.
|
||||
The mapping preserves the directory structure but normalizes paths
|
||||
for cross-platform compatibility.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path to map
|
||||
|
||||
Returns:
|
||||
Index directory path under index_root
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.source_to_index_dir(Path("D:/Claude_dms3/src"))
|
||||
PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src')
|
||||
|
||||
>>> mapper.source_to_index_dir(Path("/home/user/project"))
|
||||
PosixPath('/home/user/.codexlens/indexes/home/user/project')
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
normalized = self.normalize_path(source_path)
|
||||
return self.index_root / normalized
|
||||
|
||||
def source_to_index_db(self, source_path: Path) -> Path:
|
||||
"""Convert source directory to its index database file path.
|
||||
|
||||
Maps a source directory to the full path of its index database file,
|
||||
including the standard INDEX_DB_NAME.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path to map
|
||||
|
||||
Returns:
|
||||
Full path to the index database file
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.source_to_index_db(Path("D:/Claude_dms3/src"))
|
||||
PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src/_index.db')
|
||||
"""
|
||||
index_dir = self.source_to_index_dir(source_path)
|
||||
return index_dir / self.INDEX_DB_NAME
|
||||
|
||||
def index_to_source(self, index_path: Path) -> Path:
|
||||
"""Convert index path back to original source path.
|
||||
|
||||
Performs reverse mapping from an index storage location to the
|
||||
original source directory. Handles both directory paths and
|
||||
database file paths.
|
||||
|
||||
Args:
|
||||
index_path: Index directory or database file path
|
||||
|
||||
Returns:
|
||||
Original source directory path
|
||||
|
||||
Raises:
|
||||
ValueError: If index_path is not under index_root
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.index_to_source(
|
||||
... Path("~/.codexlens/indexes/D/Claude_dms3/src/_index.db")
|
||||
... )
|
||||
WindowsPath('D:/Claude_dms3/src')
|
||||
|
||||
>>> mapper.index_to_source(
|
||||
... Path("~/.codexlens/indexes/D/Claude_dms3/src")
|
||||
... )
|
||||
WindowsPath('D:/Claude_dms3/src')
|
||||
"""
|
||||
index_path = index_path.resolve()
|
||||
|
||||
# Remove _index.db if present
|
||||
if index_path.name == self.INDEX_DB_NAME:
|
||||
index_path = index_path.parent
|
||||
|
||||
# Verify path is under index_root
|
||||
try:
|
||||
relative = index_path.relative_to(self.index_root)
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Index path {index_path} is not under index root {self.index_root}"
|
||||
)
|
||||
|
||||
# Convert normalized path back to source path
|
||||
normalized_str = str(relative).replace("\\", "/")
|
||||
return self.denormalize_path(normalized_str)
|
||||
|
||||
def get_project_root(self, source_path: Path) -> Path:
|
||||
"""Find the project root directory (topmost indexed directory).
|
||||
|
||||
Walks up the directory tree to find the highest-level directory
|
||||
that has an index database.
|
||||
|
||||
Args:
|
||||
source_path: Source directory to start from
|
||||
|
||||
Returns:
|
||||
Project root directory path. Returns source_path itself if
|
||||
no parent index is found.
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.get_project_root(Path("D:/Claude_dms3/src/codexlens"))
|
||||
WindowsPath('D:/Claude_dms3')
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
current = source_path
|
||||
project_root = source_path
|
||||
|
||||
# Walk up the tree
|
||||
while current.parent != current: # Stop at filesystem root
|
||||
parent_index_db = self.source_to_index_db(current.parent)
|
||||
if parent_index_db.exists():
|
||||
project_root = current.parent
|
||||
current = current.parent
|
||||
else:
|
||||
break
|
||||
|
||||
return project_root
|
||||
|
||||
def get_relative_depth(self, source_path: Path, project_root: Path) -> int:
|
||||
"""Calculate directory depth relative to project root.
|
||||
|
||||
Args:
|
||||
source_path: Target directory path
|
||||
project_root: Project root directory path
|
||||
|
||||
Returns:
|
||||
Number of directory levels from project_root to source_path
|
||||
|
||||
Raises:
|
||||
ValueError: If source_path is not under project_root
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.get_relative_depth(
|
||||
... Path("D:/Claude_dms3/src/codexlens"),
|
||||
... Path("D:/Claude_dms3")
|
||||
... )
|
||||
2
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
project_root = project_root.resolve()
|
||||
|
||||
try:
|
||||
relative = source_path.relative_to(project_root)
|
||||
# Count path components
|
||||
return len(relative.parts)
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Source path {source_path} is not under project root {project_root}"
|
||||
)
|
||||
|
||||
def normalize_path(self, path: Path) -> str:
|
||||
"""Normalize path to cross-platform storage format.
|
||||
|
||||
Converts OS-specific paths to a standardized format for storage:
|
||||
- Windows: Removes drive colons (D: → D)
|
||||
- Unix: Removes leading slash
|
||||
- Uses forward slashes throughout
|
||||
|
||||
Args:
|
||||
path: Path to normalize
|
||||
|
||||
Returns:
|
||||
Normalized path string
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.normalize_path(Path("D:/path/to/dir"))
|
||||
'D/path/to/dir'
|
||||
|
||||
>>> mapper.normalize_path(Path("/home/user/path"))
|
||||
'home/user/path'
|
||||
"""
|
||||
path = path.resolve()
|
||||
path_str = str(path)
|
||||
|
||||
# Handle Windows paths with drive letters
|
||||
if platform.system() == "Windows" and len(path.parts) > 0:
|
||||
# Convert D:\path\to\dir → D/path/to/dir
|
||||
            drive = path.parts[0].rstrip("\\/").replace(":", "")  # "D:\" → "D" (strip the separator so we don't emit "D//...")
|
||||
rest = Path(*path.parts[1:]) if len(path.parts) > 1 else Path()
|
||||
normalized = f"{drive}/{rest}".replace("\\", "/")
|
||||
return normalized.rstrip("/")
|
||||
|
||||
# Handle Unix paths
|
||||
# /home/user/path → home/user/path
|
||||
return path_str.lstrip("/").replace("\\", "/")
|
||||
|
||||
def denormalize_path(self, normalized: str) -> Path:
|
||||
"""Convert normalized path back to OS-specific path.
|
||||
|
||||
Reverses the normalization process to restore OS-native path format:
|
||||
- Windows: Adds drive colons (D → D:)
|
||||
- Unix: Adds leading slash
|
||||
|
||||
Args:
|
||||
normalized: Normalized path string
|
||||
|
||||
Returns:
|
||||
OS-specific Path object
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.denormalize_path("D/path/to/dir") # On Windows
|
||||
WindowsPath('D:/path/to/dir')
|
||||
|
||||
>>> mapper.denormalize_path("home/user/path") # On Unix
|
||||
PosixPath('/home/user/path')
|
||||
"""
|
||||
parts = normalized.split("/")
|
||||
|
||||
# Handle Windows paths
|
||||
if platform.system() == "Windows" and len(parts) > 0:
|
||||
# Check if first part is a drive letter
|
||||
if len(parts[0]) == 1 and parts[0].isalpha():
|
||||
# D/path/to/dir → D:/path/to/dir
|
||||
drive = f"{parts[0]}:"
|
||||
if len(parts) > 1:
|
||||
return Path(drive) / Path(*parts[1:])
|
||||
return Path(drive)
|
||||
|
||||
# Handle Unix paths or relative paths
|
||||
# home/user/path → /home/user/path
|
||||
return Path("/") / Path(*parts)
|
||||
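A short usage sketch of the mapper above, exercising the round trip between source paths and index paths. It assumes PathMapper from this module is importable; paths and output are illustrative and differ per platform, home directory, and configured index root.

from pathlib import Path

mapper = PathMapper()  # uses the configured index root

src = Path.home() / "projects" / "demo" / "src"
db_path = mapper.source_to_index_db(src)
print(db_path)  # <index_root>/.../projects/demo/src/_index.db

# index_to_source() reverses the mapping, with or without the trailing _index.db
# (assuming no symlinks change the resolved form of src).
print(mapper.index_to_source(db_path))

# Depth relative to a project root: .../projects -> .../projects/demo/src is 2 levels.
print(mapper.get_relative_depth(src, src.parent.parent))  # -> 2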
683
codex-lens/build/lib/codexlens/storage/registry.py
Normal file
@@ -0,0 +1,683 @@
|
||||
"""Global project registry for CodexLens - SQLite storage."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import platform
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from codexlens.errors import StorageError
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProjectInfo:
|
||||
"""Registered project information."""
|
||||
|
||||
id: int
|
||||
source_root: Path
|
||||
index_root: Path
|
||||
created_at: float
|
||||
last_indexed: float
|
||||
total_files: int
|
||||
total_dirs: int
|
||||
status: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class DirMapping:
|
||||
"""Directory to index path mapping."""
|
||||
|
||||
id: int
|
||||
project_id: int
|
||||
source_path: Path
|
||||
index_path: Path
|
||||
depth: int
|
||||
files_count: int
|
||||
last_updated: float
|
||||
|
||||
|
||||
class RegistryStore:
|
||||
"""Global project registry - SQLite storage.
|
||||
|
||||
Manages indexed projects and directory-to-index path mappings.
|
||||
Thread-safe with connection pooling.
|
||||
"""
|
||||
|
||||
DEFAULT_DB_PATH = Path.home() / ".codexlens" / "registry.db"
|
||||
|
||||
def __init__(self, db_path: Path | None = None) -> None:
|
||||
self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve()
|
||||
self._lock = threading.RLock()
|
||||
self._local = threading.local()
|
||||
self._pool_lock = threading.Lock()
|
||||
self._pool: Dict[int, sqlite3.Connection] = {}
|
||||
self._pool_generation = 0
|
||||
|
||||
def _get_connection(self) -> sqlite3.Connection:
|
||||
"""Get or create a thread-local database connection."""
|
||||
thread_id = threading.get_ident()
|
||||
if getattr(self._local, "generation", None) == self._pool_generation:
|
||||
conn = getattr(self._local, "conn", None)
|
||||
if conn is not None:
|
||||
return conn
|
||||
|
||||
with self._pool_lock:
|
||||
conn = self._pool.get(thread_id)
|
||||
if conn is None:
|
||||
conn = sqlite3.connect(self.db_path, check_same_thread=False)
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.execute("PRAGMA synchronous=NORMAL")
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
self._pool[thread_id] = conn
|
||||
|
||||
self._local.conn = conn
|
||||
self._local.generation = self._pool_generation
|
||||
return conn
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close all pooled connections."""
|
||||
with self._lock:
|
||||
with self._pool_lock:
|
||||
for conn in self._pool.values():
|
||||
conn.close()
|
||||
self._pool.clear()
|
||||
self._pool_generation += 1
|
||||
|
||||
if hasattr(self._local, "conn"):
|
||||
self._local.conn = None
|
||||
if hasattr(self._local, "generation"):
|
||||
self._local.generation = self._pool_generation
|
||||
|
||||
def __enter__(self) -> RegistryStore:
|
||||
self.initialize()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
|
||||
self.close()
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Create database and schema."""
|
||||
with self._lock:
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
conn = self._get_connection()
|
||||
self._create_schema(conn)
|
||||
|
||||
def _create_schema(self, conn: sqlite3.Connection) -> None:
|
||||
"""Create database schema."""
|
||||
try:
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS projects (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_root TEXT UNIQUE NOT NULL,
|
||||
index_root TEXT NOT NULL,
|
||||
created_at REAL,
|
||||
last_indexed REAL,
|
||||
total_files INTEGER DEFAULT 0,
|
||||
total_dirs INTEGER DEFAULT 0,
|
||||
status TEXT DEFAULT 'active'
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS dir_mapping (
|
||||
id INTEGER PRIMARY KEY,
|
||||
project_id INTEGER REFERENCES projects(id) ON DELETE CASCADE,
|
||||
source_path TEXT NOT NULL,
|
||||
index_path TEXT NOT NULL,
|
||||
depth INTEGER,
|
||||
files_count INTEGER DEFAULT 0,
|
||||
last_updated REAL,
|
||||
UNIQUE(source_path)
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_dir_source ON dir_mapping(source_path)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_dir_project ON dir_mapping(project_id)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_project_source ON projects(source_root)"
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
except sqlite3.DatabaseError as exc:
|
||||
raise StorageError(f"Failed to initialize registry schema: {exc}") from exc
|
||||
|
||||
def _normalize_path_for_comparison(self, path: Path) -> str:
|
||||
"""Normalize paths for comparisons and storage.
|
||||
|
||||
Windows paths are treated as case-insensitive, so normalize to lowercase.
|
||||
Unix platforms preserve case sensitivity.
|
||||
"""
|
||||
path_str = str(path)
|
||||
if platform.system() == "Windows":
|
||||
return path_str.lower()
|
||||
return path_str
|
||||
|
||||
# === Project Operations ===
|
||||
|
||||
def register_project(self, source_root: Path, index_root: Path) -> ProjectInfo:
|
||||
"""Register a new project or update existing one.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
index_root: Index storage root directory
|
||||
|
||||
Returns:
|
||||
ProjectInfo for the registered project
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
|
||||
index_root_str = str(index_root.resolve())
|
||||
now = time.time()
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO projects(source_root, index_root, created_at, last_indexed)
|
||||
VALUES(?, ?, ?, ?)
|
||||
ON CONFLICT(source_root) DO UPDATE SET
|
||||
index_root=excluded.index_root,
|
||||
last_indexed=excluded.last_indexed,
|
||||
status='active'
|
||||
""",
|
||||
(source_root_str, index_root_str, now, now),
|
||||
)
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM projects WHERE source_root=?", (source_root_str,)
|
||||
).fetchone()
|
||||
|
||||
conn.commit()
|
||||
|
||||
if not row:
|
||||
raise StorageError(f"Failed to register project: {source_root}")
|
||||
|
||||
return self._row_to_project_info(row)
|
||||
|
||||
def unregister_project(self, source_root: Path) -> bool:
|
||||
"""Remove a project registration (cascades to directory mappings).
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
|
||||
Returns:
|
||||
True if project was removed, False if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT id FROM projects WHERE source_root=?", (source_root_str,)
|
||||
).fetchone()
|
||||
|
||||
if not row:
|
||||
return False
|
||||
|
||||
conn.execute("DELETE FROM projects WHERE source_root=?", (source_root_str,))
|
||||
conn.commit()
|
||||
return True
|
||||
|
||||
def get_project(self, source_root: Path) -> Optional[ProjectInfo]:
|
||||
"""Get project information by source root.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
|
||||
Returns:
|
||||
ProjectInfo if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM projects WHERE source_root=?", (source_root_str,)
|
||||
).fetchone()
|
||||
|
||||
return self._row_to_project_info(row) if row else None
|
||||
|
||||
def get_project_by_id(self, project_id: int) -> Optional[ProjectInfo]:
|
||||
"""Get project information by ID.
|
||||
|
||||
Args:
|
||||
project_id: Project database ID
|
||||
|
||||
Returns:
|
||||
ProjectInfo if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM projects WHERE id=?", (project_id,)
|
||||
).fetchone()
|
||||
|
||||
return self._row_to_project_info(row) if row else None
|
||||
|
||||
def list_projects(self, status: Optional[str] = None) -> List[ProjectInfo]:
|
||||
"""List all registered projects.
|
||||
|
||||
Args:
|
||||
status: Optional status filter ('active', 'stale', 'removed')
|
||||
|
||||
Returns:
|
||||
List of ProjectInfo objects
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
|
||||
if status:
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM projects WHERE status=? ORDER BY created_at DESC",
|
||||
(status,),
|
||||
).fetchall()
|
||||
else:
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM projects ORDER BY created_at DESC"
|
||||
).fetchall()
|
||||
|
||||
return [self._row_to_project_info(row) for row in rows]
|
||||
|
||||
def update_project_stats(
|
||||
self, source_root: Path, total_files: int, total_dirs: int
|
||||
) -> None:
|
||||
"""Update project statistics.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
total_files: Total number of indexed files
|
||||
total_dirs: Total number of indexed directories
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE projects
|
||||
SET total_files=?, total_dirs=?, last_indexed=?
|
||||
WHERE source_root=?
|
||||
""",
|
||||
(total_files, total_dirs, time.time(), source_root_str),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def set_project_status(self, source_root: Path, status: str) -> None:
|
||||
"""Set project status.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
status: Status string ('active', 'stale', 'removed')
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = self._normalize_path_for_comparison(source_root.resolve())
|
||||
|
||||
conn.execute(
|
||||
"UPDATE projects SET status=? WHERE source_root=?",
|
||||
(status, source_root_str),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# === Directory Mapping Operations ===
|
||||
|
||||
def register_dir(
|
||||
self,
|
||||
project_id: int,
|
||||
source_path: Path,
|
||||
index_path: Path,
|
||||
depth: int,
|
||||
files_count: int = 0,
|
||||
) -> DirMapping:
|
||||
"""Register a directory mapping.
|
||||
|
||||
Args:
|
||||
project_id: Project database ID
|
||||
source_path: Source directory path
|
||||
index_path: Index database path
|
||||
depth: Directory depth relative to project root
|
||||
files_count: Number of files in directory
|
||||
|
||||
Returns:
|
||||
DirMapping for the registered directory
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
|
||||
index_path_str = str(index_path.resolve())
|
||||
now = time.time()
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO dir_mapping(
|
||||
project_id, source_path, index_path, depth, files_count, last_updated
|
||||
)
|
||||
VALUES(?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(source_path) DO UPDATE SET
|
||||
index_path=excluded.index_path,
|
||||
depth=excluded.depth,
|
||||
files_count=excluded.files_count,
|
||||
last_updated=excluded.last_updated
|
||||
""",
|
||||
(project_id, source_path_str, index_path_str, depth, files_count, now),
|
||||
)
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM dir_mapping WHERE source_path=?", (source_path_str,)
|
||||
).fetchone()
|
||||
|
||||
conn.commit()
|
||||
|
||||
if not row:
|
||||
raise StorageError(f"Failed to register directory: {source_path}")
|
||||
|
||||
return self._row_to_dir_mapping(row)
|
||||
|
||||
def unregister_dir(self, source_path: Path) -> bool:
|
||||
"""Remove a directory mapping.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path
|
||||
|
||||
Returns:
|
||||
True if directory was removed, False if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT id FROM dir_mapping WHERE source_path=?", (source_path_str,)
|
||||
).fetchone()
|
||||
|
||||
if not row:
|
||||
return False
|
||||
|
||||
conn.execute("DELETE FROM dir_mapping WHERE source_path=?", (source_path_str,))
|
||||
conn.commit()
|
||||
return True
|
||||
|
||||
def find_index_path(self, source_path: Path) -> Optional[Path]:
|
||||
"""Find index path for a source directory (exact match).
|
||||
|
||||
Args:
|
||||
source_path: Source directory path
|
||||
|
||||
Returns:
|
||||
Index path if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT index_path FROM dir_mapping WHERE source_path=?",
|
||||
(source_path_str,),
|
||||
).fetchone()
|
||||
|
||||
return Path(row["index_path"]) if row else None
|
||||
|
||||
def find_nearest_index(self, source_path: Path) -> Optional[DirMapping]:
|
||||
"""Find nearest indexed ancestor directory.
|
||||
|
||||
Searches for the closest parent directory that has an index.
|
||||
Useful for supporting subdirectory searches.
|
||||
|
||||
Optimized to use single database query instead of iterating through
|
||||
each parent directory level.
|
||||
|
||||
Args:
|
||||
source_path: Source directory or file path
|
||||
|
||||
Returns:
|
||||
DirMapping for nearest ancestor, None if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_resolved = source_path.resolve()
|
||||
|
||||
# Build list of all parent paths from deepest to shallowest
|
||||
paths_to_check = []
|
||||
current = source_path_resolved
|
||||
while True:
|
||||
paths_to_check.append(self._normalize_path_for_comparison(current))
|
||||
parent = current.parent
|
||||
if parent == current: # Reached filesystem root
|
||||
break
|
||||
current = parent
|
||||
|
||||
if not paths_to_check:
|
||||
return None
|
||||
|
||||
# Single query with WHERE IN, ordered by path length (longest = nearest)
|
||||
placeholders = ','.join('?' * len(paths_to_check))
|
||||
query = f"""
|
||||
SELECT * FROM dir_mapping
|
||||
WHERE source_path IN ({placeholders})
|
||||
ORDER BY LENGTH(source_path) DESC
|
||||
LIMIT 1
|
||||
"""
|
||||
|
||||
row = conn.execute(query, paths_to_check).fetchone()
|
||||
return self._row_to_dir_mapping(row) if row else None
|
||||
|
||||
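The nearest-index lookup above collects the query path and every ancestor, then lets a single WHERE source_path IN (...) ORDER BY LENGTH(source_path) DESC query pick the deepest registered directory. The ancestor enumeration itself is just a pathlib walk, shown in isolation here as a small reference:

from pathlib import Path
from typing import List

def ancestor_paths(path: Path) -> List[str]:
    """Return the path and all of its ancestors, deepest first."""
    current = path.resolve()
    out = [str(current)]
    while current.parent != current:  # stop at the filesystem root
        current = current.parent
        out.append(str(current))
    return out

print(ancestor_paths(Path("/home/user/project/src/pkg")))
# ['/home/user/project/src/pkg', '/home/user/project/src', ..., '/']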
def find_by_source_path(self, source_path: str) -> Optional[Dict[str, str]]:
|
||||
"""Find project by source path (exact or nearest match).
|
||||
|
||||
Searches for a project whose source_root matches or contains
|
||||
the given source_path.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path as string
|
||||
|
||||
Returns:
|
||||
Dict with project info including 'index_root', or None if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
resolved_path = Path(source_path).resolve()
|
||||
source_path_resolved = self._normalize_path_for_comparison(resolved_path)
|
||||
|
||||
# First try exact match on projects table
|
||||
row = conn.execute(
|
||||
"SELECT * FROM projects WHERE source_root=?", (source_path_resolved,)
|
||||
).fetchone()
|
||||
|
||||
if row:
|
||||
return {
|
||||
"id": str(row["id"]),
|
||||
"source_root": row["source_root"],
|
||||
"index_root": row["index_root"],
|
||||
"status": row["status"] or "active",
|
||||
}
|
||||
|
||||
# Try finding project that contains this path
|
||||
# Build list of all parent paths
|
||||
paths_to_check = []
|
||||
current = resolved_path
|
||||
while True:
|
||||
paths_to_check.append(self._normalize_path_for_comparison(current))
|
||||
parent = current.parent
|
||||
if parent == current:
|
||||
break
|
||||
current = parent
|
||||
|
||||
if paths_to_check:
|
||||
placeholders = ','.join('?' * len(paths_to_check))
|
||||
query = f"""
|
||||
SELECT * FROM projects
|
||||
WHERE source_root IN ({placeholders})
|
||||
ORDER BY LENGTH(source_root) DESC
|
||||
LIMIT 1
|
||||
"""
|
||||
row = conn.execute(query, paths_to_check).fetchone()
|
||||
|
||||
if row:
|
||||
return {
|
||||
"id": str(row["id"]),
|
||||
"source_root": row["source_root"],
|
||||
"index_root": row["index_root"],
|
||||
"status": row["status"] or "active",
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
def get_project_dirs(self, project_id: int) -> List[DirMapping]:
|
||||
"""Get all directory mappings for a project.
|
||||
|
||||
Args:
|
||||
project_id: Project database ID
|
||||
|
||||
Returns:
|
||||
List of DirMapping objects
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM dir_mapping WHERE project_id=? ORDER BY depth, source_path",
|
||||
(project_id,),
|
||||
).fetchall()
|
||||
|
||||
return [self._row_to_dir_mapping(row) for row in rows]
|
||||
|
||||
def get_subdirs(self, source_path: Path) -> List[DirMapping]:
|
||||
"""Get direct subdirectory mappings.
|
||||
|
||||
Args:
|
||||
source_path: Parent directory path
|
||||
|
||||
Returns:
|
||||
List of DirMapping objects for direct children
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
|
||||
|
||||
# First get the parent's depth
|
||||
parent_row = conn.execute(
|
||||
"SELECT depth, project_id FROM dir_mapping WHERE source_path=?",
|
||||
(source_path_str,),
|
||||
).fetchone()
|
||||
|
||||
if not parent_row:
|
||||
return []
|
||||
|
||||
parent_depth = int(parent_row["depth"])
|
||||
project_id = int(parent_row["project_id"])
|
||||
|
||||
# Get all subdirs with depth = parent_depth + 1 and matching path prefix
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT * FROM dir_mapping
|
||||
WHERE project_id=? AND depth=? AND source_path LIKE ?
|
||||
ORDER BY source_path
|
||||
""",
|
||||
(project_id, parent_depth + 1, f"{source_path_str}%"),
|
||||
).fetchall()
|
||||
|
||||
return [self._row_to_dir_mapping(row) for row in rows]
|
||||
|
||||
def update_dir_stats(self, source_path: Path, files_count: int) -> None:
|
||||
"""Update directory statistics.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path
|
||||
files_count: Number of files in directory
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = self._normalize_path_for_comparison(source_path.resolve())
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE dir_mapping
|
||||
SET files_count=?, last_updated=?
|
||||
WHERE source_path=?
|
||||
""",
|
||||
(files_count, time.time(), source_path_str),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def update_index_paths(self, old_root: Path, new_root: Path) -> int:
|
||||
"""Update all index paths after migration.
|
||||
|
||||
Replaces old_root prefix with new_root in all stored index paths.
|
||||
|
||||
Args:
|
||||
old_root: Old index root directory
|
||||
new_root: New index root directory
|
||||
|
||||
Returns:
|
||||
Number of paths updated
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
old_root_str = str(old_root.resolve())
|
||||
new_root_str = str(new_root.resolve())
|
||||
updated = 0
|
||||
|
||||
# Update projects
|
||||
            cursor = conn.execute(
                """
                UPDATE projects
                SET index_root = REPLACE(index_root, ?, ?)
                WHERE index_root LIKE ?
                """,
                (old_root_str, new_root_str, f"{old_root_str}%"),
            )
            # cursor.rowcount counts only this statement's updates;
            # conn.total_changes is cumulative for the connection and would over-count.
            updated += cursor.rowcount
|
||||
|
||||
# Update dir_mapping
|
||||
            cursor = conn.execute(
                """
                UPDATE dir_mapping
                SET index_path = REPLACE(index_path, ?, ?)
                WHERE index_path LIKE ?
                """,
                (old_root_str, new_root_str, f"{old_root_str}%"),
            )
            updated += cursor.rowcount
|
||||
|
||||
conn.commit()
|
||||
return updated
|
||||
|
||||
# === Internal Methods ===
|
||||
|
||||
def _row_to_project_info(self, row: sqlite3.Row) -> ProjectInfo:
|
||||
"""Convert database row to ProjectInfo."""
|
||||
return ProjectInfo(
|
||||
id=int(row["id"]),
|
||||
source_root=Path(row["source_root"]),
|
||||
index_root=Path(row["index_root"]),
|
||||
created_at=float(row["created_at"]) if row["created_at"] else 0.0,
|
||||
last_indexed=float(row["last_indexed"]) if row["last_indexed"] else 0.0,
|
||||
total_files=int(row["total_files"]) if row["total_files"] else 0,
|
||||
total_dirs=int(row["total_dirs"]) if row["total_dirs"] else 0,
|
||||
status=str(row["status"]) if row["status"] else "active",
|
||||
)
|
||||
|
||||
def _row_to_dir_mapping(self, row: sqlite3.Row) -> DirMapping:
|
||||
"""Convert database row to DirMapping."""
|
||||
return DirMapping(
|
||||
id=int(row["id"]),
|
||||
project_id=int(row["project_id"]),
|
||||
source_path=Path(row["source_path"]),
|
||||
index_path=Path(row["index_path"]),
|
||||
depth=int(row["depth"]) if row["depth"] is not None else 0,
|
||||
files_count=int(row["files_count"]) if row["files_count"] else 0,
|
||||
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
|
||||
)
|
||||
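A hedged end-to-end sketch of the registry API above: register a project, record one directory mapping, then resolve a nested path back to its nearest index. It assumes RegistryStore is importable from this package; all paths are illustrative and live in a temporary directory.

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    registry = RegistryStore(db_path=Path(tmp) / "registry.db")
    with registry:  # __enter__ calls initialize() and creates the schema
        source_root = Path(tmp) / "project"
        index_root = Path(tmp) / "indexes" / "project"

        project = registry.register_project(source_root, index_root)
        registry.register_dir(
            project.id,
            source_path=source_root,
            index_path=index_root / "_index.db",
            depth=0,
        )

        # Nearest-ancestor lookup also works for nested, unregistered paths.
        mapping = registry.find_nearest_index(source_root / "src" / "deep")
        print(mapping.index_path if mapping else "no index found")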
578
codex-lens/build/lib/codexlens/storage/splade_index.py
Normal file
@@ -0,0 +1,578 @@
|
||||
"""SPLADE inverted index storage for sparse vector retrieval.
|
||||
|
||||
This module implements SQLite-based inverted index for SPLADE sparse vectors,
|
||||
enabling efficient sparse retrieval using dot-product scoring.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from codexlens.entities import SearchResult
|
||||
from codexlens.errors import StorageError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SpladeIndex:
|
||||
"""SQLite-based inverted index for SPLADE sparse vectors.
|
||||
|
||||
Stores sparse vectors as posting lists mapping token_id -> (chunk_id, weight).
|
||||
Supports efficient dot-product retrieval using SQL joins.
|
||||
"""
|
||||
|
||||
def __init__(self, db_path: Path | str) -> None:
|
||||
"""Initialize SPLADE index.
|
||||
|
||||
Args:
|
||||
db_path: Path to SQLite database file.
|
||||
"""
|
||||
self.db_path = Path(db_path)
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Thread-safe connection management
|
||||
self._lock = threading.RLock()
|
||||
self._local = threading.local()
|
||||
|
||||
def _get_connection(self) -> sqlite3.Connection:
|
||||
"""Get or create a thread-local database connection.
|
||||
|
||||
Each thread gets its own connection to ensure thread safety.
|
||||
Connections are stored in thread-local storage.
|
||||
"""
|
||||
conn = getattr(self._local, "conn", None)
|
||||
if conn is None:
|
||||
# Thread-local connection - each thread has its own
|
||||
conn = sqlite3.connect(
|
||||
self.db_path,
|
||||
timeout=30.0, # Wait up to 30s for locks
|
||||
check_same_thread=True, # Enforce thread safety
|
||||
)
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.execute("PRAGMA synchronous=NORMAL")
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
# Limit mmap to 1GB to avoid OOM on smaller systems
|
||||
conn.execute("PRAGMA mmap_size=1073741824")
|
||||
            # Increase cache size for better query performance (negative value means KiB: -20000 ≈ 20MB)
|
||||
conn.execute("PRAGMA cache_size=-20000")
|
||||
self._local.conn = conn
|
||||
return conn
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close thread-local database connection."""
|
||||
with self._lock:
|
||||
conn = getattr(self._local, "conn", None)
|
||||
if conn is not None:
|
||||
conn.close()
|
||||
self._local.conn = None
|
||||
|
||||
def __enter__(self) -> SpladeIndex:
|
||||
"""Context manager entry."""
|
||||
self.create_tables()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc, tb) -> None:
|
||||
"""Context manager exit."""
|
||||
self.close()
|
||||
|
||||
def has_index(self) -> bool:
|
||||
"""Check if SPLADE tables exist in database.
|
||||
|
||||
Returns:
|
||||
True if tables exist, False otherwise.
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
cursor = conn.execute(
|
||||
"""
|
||||
SELECT name FROM sqlite_master
|
||||
WHERE type='table' AND name='splade_posting_list'
|
||||
"""
|
||||
)
|
||||
return cursor.fetchone() is not None
|
||||
except sqlite3.Error as e:
|
||||
logger.error("Failed to check index existence: %s", e)
|
||||
return False
|
||||
|
||||
def create_tables(self) -> None:
|
||||
"""Create SPLADE schema if not exists.
|
||||
|
||||
Note: When used with distributed indexes (multiple _index.db files),
|
||||
the SPLADE database stores chunk IDs from multiple sources. In this case,
|
||||
foreign key constraints are not enforced to allow cross-database references.
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
# Inverted index for sparse vectors
|
||||
# Note: No FOREIGN KEY constraint to support distributed index architecture
|
||||
# where chunks may come from multiple _index.db files
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS splade_posting_list (
|
||||
token_id INTEGER NOT NULL,
|
||||
chunk_id INTEGER NOT NULL,
|
||||
weight REAL NOT NULL,
|
||||
PRIMARY KEY (token_id, chunk_id)
|
||||
)
|
||||
""")
|
||||
|
||||
# Indexes for efficient lookups
|
||||
conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_splade_by_chunk
|
||||
ON splade_posting_list(chunk_id)
|
||||
""")
|
||||
conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_splade_by_token
|
||||
ON splade_posting_list(token_id)
|
||||
""")
|
||||
|
||||
# Model metadata
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS splade_metadata (
|
||||
id INTEGER PRIMARY KEY DEFAULT 1,
|
||||
model_name TEXT NOT NULL,
|
||||
vocab_size INTEGER NOT NULL,
|
||||
onnx_path TEXT,
|
||||
created_at REAL
|
||||
)
|
||||
""")
|
||||
|
||||
# Chunk metadata for self-contained search results
|
||||
# Stores all chunk info needed to build SearchResult without querying _index.db
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS splade_chunks (
|
||||
id INTEGER PRIMARY KEY,
|
||||
file_path TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
metadata TEXT,
|
||||
source_db TEXT
|
||||
)
|
||||
""")
|
||||
|
||||
conn.commit()
|
||||
logger.debug("SPLADE schema created successfully")
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"Failed to create SPLADE schema: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="create_tables"
|
||||
) from e
|
||||
|
||||
def add_posting(self, chunk_id: int, sparse_vec: Dict[int, float]) -> None:
|
||||
"""Add a single document to inverted index.
|
||||
|
||||
Args:
|
||||
chunk_id: Chunk ID (foreign key to semantic_chunks.id).
|
||||
sparse_vec: Sparse vector as {token_id: weight} mapping.
|
||||
"""
|
||||
if not sparse_vec:
|
||||
logger.warning("Empty sparse vector for chunk_id=%d, skipping", chunk_id)
|
||||
return
|
||||
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
# Insert all non-zero weights for this chunk
|
||||
postings = [
|
||||
(token_id, chunk_id, weight)
|
||||
for token_id, weight in sparse_vec.items()
|
||||
if weight > 0 # Only store non-zero weights
|
||||
]
|
||||
|
||||
if postings:
|
||||
conn.executemany(
|
||||
"""
|
||||
INSERT OR REPLACE INTO splade_posting_list
|
||||
(token_id, chunk_id, weight)
|
||||
VALUES (?, ?, ?)
|
||||
""",
|
||||
postings
|
||||
)
|
||||
conn.commit()
|
||||
logger.debug(
|
||||
"Added %d postings for chunk_id=%d", len(postings), chunk_id
|
||||
)
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"Failed to add posting for chunk_id={chunk_id}: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="add_posting"
|
||||
) from e
|
||||
|
||||
def add_postings_batch(
|
||||
self, postings: List[Tuple[int, Dict[int, float]]]
|
||||
) -> None:
|
||||
"""Batch insert postings for multiple chunks.
|
||||
|
||||
Args:
|
||||
postings: List of (chunk_id, sparse_vec) tuples.
|
||||
"""
|
||||
if not postings:
|
||||
return
|
||||
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
# Flatten all postings into single batch
|
||||
batch_data = []
|
||||
for chunk_id, sparse_vec in postings:
|
||||
for token_id, weight in sparse_vec.items():
|
||||
if weight > 0: # Only store non-zero weights
|
||||
batch_data.append((token_id, chunk_id, weight))
|
||||
|
||||
if batch_data:
|
||||
conn.executemany(
|
||||
"""
|
||||
INSERT OR REPLACE INTO splade_posting_list
|
||||
(token_id, chunk_id, weight)
|
||||
VALUES (?, ?, ?)
|
||||
""",
|
||||
batch_data
|
||||
)
|
||||
conn.commit()
|
||||
logger.debug(
|
||||
"Batch inserted %d postings for %d chunks",
|
||||
len(batch_data),
|
||||
len(postings)
|
||||
)
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"Failed to batch insert postings: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="add_postings_batch"
|
||||
) from e
|
||||
|
||||
def add_chunk_metadata(
|
||||
self,
|
||||
chunk_id: int,
|
||||
file_path: str,
|
||||
content: str,
|
||||
metadata: Optional[str] = None,
|
||||
source_db: Optional[str] = None
|
||||
) -> None:
|
||||
"""Store chunk metadata for self-contained search results.
|
||||
|
||||
Args:
|
||||
chunk_id: Global chunk ID.
|
||||
file_path: Path to source file.
|
||||
content: Chunk text content.
|
||||
metadata: JSON metadata string.
|
||||
source_db: Path to source _index.db.
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO splade_chunks
|
||||
(id, file_path, content, metadata, source_db)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
""",
|
||||
(chunk_id, file_path, content, metadata, source_db)
|
||||
)
|
||||
conn.commit()
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"Failed to add chunk metadata for chunk_id={chunk_id}: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="add_chunk_metadata"
|
||||
) from e
|
||||
|
||||
def add_chunks_metadata_batch(
|
||||
self,
|
||||
chunks: List[Tuple[int, str, str, Optional[str], Optional[str]]]
|
||||
) -> None:
|
||||
"""Batch insert chunk metadata.
|
||||
|
||||
Args:
|
||||
chunks: List of (chunk_id, file_path, content, metadata, source_db) tuples.
|
||||
"""
|
||||
if not chunks:
|
||||
return
|
||||
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
conn.executemany(
|
||||
"""
|
||||
INSERT OR REPLACE INTO splade_chunks
|
||||
(id, file_path, content, metadata, source_db)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
""",
|
||||
chunks
|
||||
)
|
||||
conn.commit()
|
||||
logger.debug("Batch inserted %d chunk metadata records", len(chunks))
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"Failed to batch insert chunk metadata: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="add_chunks_metadata_batch"
|
||||
) from e
|
||||
|
||||
def get_chunks_by_ids(self, chunk_ids: List[int]) -> List[Dict]:
|
||||
"""Get chunk metadata by IDs.
|
||||
|
||||
Args:
|
||||
chunk_ids: List of chunk IDs to retrieve.
|
||||
|
||||
Returns:
|
||||
List of dicts with id, file_path, content, metadata, source_db.
|
||||
"""
|
||||
if not chunk_ids:
|
||||
return []
|
||||
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
placeholders = ",".join("?" * len(chunk_ids))
|
||||
rows = conn.execute(
|
||||
f"""
|
||||
SELECT id, file_path, content, metadata, source_db
|
||||
FROM splade_chunks
|
||||
WHERE id IN ({placeholders})
|
||||
""",
|
||||
chunk_ids
|
||||
).fetchall()
|
||||
|
||||
return [
|
||||
{
|
||||
"id": row["id"],
|
||||
"file_path": row["file_path"],
|
||||
"content": row["content"],
|
||||
"metadata": row["metadata"],
|
||||
"source_db": row["source_db"]
|
||||
}
|
||||
for row in rows
|
||||
]
|
||||
except sqlite3.Error as e:
|
||||
logger.error("Failed to get chunks by IDs: %s", e)
|
||||
return []
|
||||
|
||||
def remove_chunk(self, chunk_id: int) -> int:
|
||||
"""Remove all postings for a chunk.
|
||||
|
||||
Args:
|
||||
chunk_id: Chunk ID to remove.
|
||||
|
||||
Returns:
|
||||
Number of deleted postings.
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
cursor = conn.execute(
|
||||
"DELETE FROM splade_posting_list WHERE chunk_id = ?",
|
||||
(chunk_id,)
|
||||
)
|
||||
conn.commit()
|
||||
deleted = cursor.rowcount
|
||||
logger.debug("Removed %d postings for chunk_id=%d", deleted, chunk_id)
|
||||
return deleted
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"Failed to remove chunk_id={chunk_id}: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="remove_chunk"
|
||||
) from e
|
||||
|
||||
def search(
|
||||
self,
|
||||
query_sparse: Dict[int, float],
|
||||
limit: int = 50,
|
||||
min_score: float = 0.0,
|
||||
max_query_terms: int = 64
|
||||
) -> List[Tuple[int, float]]:
|
||||
"""Search for similar chunks using dot-product scoring.
|
||||
|
||||
Implements efficient sparse dot-product via SQL JOIN:
|
||||
score(q, d) = sum(q[t] * d[t]) for all tokens t
|
||||
|
||||
Args:
|
||||
query_sparse: Query sparse vector as {token_id: weight}.
|
||||
limit: Maximum number of results.
|
||||
min_score: Minimum score threshold.
|
||||
max_query_terms: Maximum query terms to use (default: 64).
|
||||
Pruning to top-K terms reduces search time with minimal impact on quality.
|
||||
Set to 0 or negative to disable pruning (use all terms).
|
||||
|
||||
Returns:
|
||||
List of (chunk_id, score) tuples, ordered by score descending.
|
||||
"""
|
||||
if not query_sparse:
|
||||
logger.warning("Empty query sparse vector")
|
||||
return []
|
||||
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
# Build VALUES clause for query terms
|
||||
# Each term: (token_id, weight)
|
||||
query_terms = [
|
||||
(token_id, weight)
|
||||
for token_id, weight in query_sparse.items()
|
||||
if weight > 0
|
||||
]
|
||||
|
||||
if not query_terms:
|
||||
logger.warning("No non-zero query terms")
|
||||
return []
|
||||
|
||||
# Query pruning: keep only top-K terms by weight
|
||||
# max_query_terms <= 0 means no limit (use all terms)
|
||||
if max_query_terms > 0 and len(query_terms) > max_query_terms:
|
||||
query_terms = sorted(query_terms, key=lambda x: x[1], reverse=True)[:max_query_terms]
|
||||
logger.debug(
|
||||
"Query pruned from %d to %d terms",
|
||||
len(query_sparse),
|
||||
len(query_terms)
|
||||
)
|
||||
|
||||
# Create CTE for query terms using parameterized VALUES
|
||||
# Build placeholders and params to prevent SQL injection
|
||||
params = []
|
||||
placeholders = []
|
||||
for token_id, weight in query_terms:
|
||||
placeholders.append("(?, ?)")
|
||||
params.extend([token_id, weight])
|
||||
|
||||
values_placeholders = ", ".join(placeholders)
|
||||
|
||||
sql = f"""
|
||||
WITH query_terms(token_id, weight) AS (
|
||||
VALUES {values_placeholders}
|
||||
)
|
||||
SELECT
|
||||
p.chunk_id,
|
||||
SUM(p.weight * q.weight) as score
|
||||
FROM splade_posting_list p
|
||||
INNER JOIN query_terms q ON p.token_id = q.token_id
|
||||
GROUP BY p.chunk_id
|
||||
HAVING score >= ?
|
||||
ORDER BY score DESC
|
||||
LIMIT ?
|
||||
"""
|
||||
|
||||
# Append min_score and limit to params
|
||||
params.extend([min_score, limit])
|
||||
rows = conn.execute(sql, params).fetchall()
|
||||
|
||||
results = [(row["chunk_id"], float(row["score"])) for row in rows]
|
||||
logger.debug(
|
||||
"SPLADE search: %d query terms, %d results",
|
||||
len(query_terms),
|
||||
len(results)
|
||||
)
|
||||
return results
|
||||
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"SPLADE search failed: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="search"
|
||||
) from e
|
||||
|
||||
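For clarity, the score the SQL above computes is a plain sparse dot product between the query vector and each chunk's stored vector. The small reference implementation below does the same scoring over in-memory dicts; it is only a sanity-check aid, not part of the storage layer.

from typing import Dict, List, Tuple

def sparse_dot_scores(
    query: Dict[int, float],
    postings: Dict[int, Dict[int, float]],  # token_id -> {chunk_id: weight}
    limit: int = 50,
    min_score: float = 0.0,
) -> List[Tuple[int, float]]:
    scores: Dict[int, float] = {}
    for token_id, q_weight in query.items():
        for chunk_id, d_weight in postings.get(token_id, {}).items():
            scores[chunk_id] = scores.get(chunk_id, 0.0) + q_weight * d_weight
    ranked = [(cid, s) for cid, s in scores.items() if s >= min_score]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:limit]

# chunk 1 scores 0.5*0.4 + 1.0*0.2 = 0.4; chunk 2 scores 1.0*0.9 = 0.9
print(sparse_dot_scores({7: 0.5, 9: 1.0}, {7: {1: 0.4}, 9: {1: 0.2, 2: 0.9}}))
# -> [(2, 0.9), (1, 0.4)]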
def get_metadata(self) -> Optional[Dict]:
|
||||
"""Get SPLADE model metadata.
|
||||
|
||||
Returns:
|
||||
Dictionary with model_name, vocab_size, onnx_path, created_at,
|
||||
or None if not set.
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
row = conn.execute(
|
||||
"""
|
||||
SELECT model_name, vocab_size, onnx_path, created_at
|
||||
FROM splade_metadata
|
||||
WHERE id = 1
|
||||
"""
|
||||
).fetchone()
|
||||
|
||||
if row:
|
||||
return {
|
||||
"model_name": row["model_name"],
|
||||
"vocab_size": row["vocab_size"],
|
||||
"onnx_path": row["onnx_path"],
|
||||
"created_at": row["created_at"]
|
||||
}
|
||||
return None
|
||||
except sqlite3.Error as e:
|
||||
logger.error("Failed to get metadata: %s", e)
|
||||
return None
|
||||
|
||||
def set_metadata(
|
||||
self,
|
||||
model_name: str,
|
||||
vocab_size: int,
|
||||
onnx_path: Optional[str] = None
|
||||
) -> None:
|
||||
"""Set SPLADE model metadata.
|
||||
|
||||
Args:
|
||||
model_name: SPLADE model name.
|
||||
vocab_size: Vocabulary size (typically ~30k for BERT vocab).
|
||||
onnx_path: Optional path to ONNX model file.
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
current_time = time.time()
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO splade_metadata
|
||||
(id, model_name, vocab_size, onnx_path, created_at)
|
||||
VALUES (1, ?, ?, ?, ?)
|
||||
""",
|
||||
(model_name, vocab_size, onnx_path, current_time)
|
||||
)
|
||||
conn.commit()
|
||||
logger.info(
|
||||
"Set SPLADE metadata: model=%s, vocab_size=%d",
|
||||
model_name,
|
||||
vocab_size
|
||||
)
|
||||
except sqlite3.Error as e:
|
||||
raise StorageError(
|
||||
f"Failed to set metadata: {e}",
|
||||
db_path=str(self.db_path),
|
||||
operation="set_metadata"
|
||||
) from e
|
||||
|
||||
def get_stats(self) -> Dict:
|
||||
"""Get index statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary with total_postings, unique_tokens, unique_chunks.
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
row = conn.execute("""
|
||||
SELECT
|
||||
COUNT(*) as total_postings,
|
||||
COUNT(DISTINCT token_id) as unique_tokens,
|
||||
COUNT(DISTINCT chunk_id) as unique_chunks
|
||||
FROM splade_posting_list
|
||||
""").fetchone()
|
||||
|
||||
return {
|
||||
"total_postings": row["total_postings"],
|
||||
"unique_tokens": row["unique_tokens"],
|
||||
"unique_chunks": row["unique_chunks"]
|
||||
}
|
||||
except sqlite3.Error as e:
|
||||
logger.error("Failed to get stats: %s", e)
|
||||
return {
|
||||
"total_postings": 0,
|
||||
"unique_tokens": 0,
|
||||
"unique_chunks": 0
|
||||
}
|
||||
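A minimal usage sketch of the SpladeIndex class above, assuming it is importable from this module. The model name, token ids, weights, and file paths are all illustrative; sparse vectors would normally come from a SPLADE encoder rather than being written by hand.

from pathlib import Path

with SpladeIndex(Path("splade_index.db")) as index:  # __enter__ creates the tables
    index.set_metadata(model_name="naver/splade-v3", vocab_size=30522)  # name is illustrative

    # Sparse vectors are {token_id: weight} dicts.
    index.add_postings_batch([
        (1, {101: 0.8, 2054: 0.3}),
        (2, {101: 0.1, 3000: 0.9}),
    ])
    index.add_chunks_metadata_batch([
        (1, "src/app.py", "def main(): ...", None, None),
        (2, "src/util.py", "def helper(): ...", None, None),
    ])

    hits = index.search({101: 1.0}, limit=5)  # -> [(1, 0.8), (2, 0.1)]
    chunks = index.get_chunks_by_ids([cid for cid, _ in hits])
    print(hits, [c["file_path"] for c in chunks])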
976
codex-lens/build/lib/codexlens/storage/sqlite_store.py
Normal file
@@ -0,0 +1,976 @@
|
||||
"""SQLite storage for CodexLens indexing and search."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from codexlens.entities import IndexedFile, SearchResult, Symbol
|
||||
from codexlens.errors import StorageError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SQLiteStore:
|
||||
"""SQLiteStore providing FTS5 search and symbol lookup.
|
||||
|
||||
Implements thread-local connection pooling for improved performance.
|
||||
"""
|
||||
|
||||
# Maximum number of connections to keep in pool to prevent memory leaks
|
||||
MAX_POOL_SIZE = 32
|
||||
# Idle timeout in seconds (10 minutes)
|
||||
IDLE_TIMEOUT = 600
|
||||
# Periodic cleanup interval in seconds (5 minutes)
|
||||
CLEANUP_INTERVAL = 300
|
||||
|
||||
def __init__(self, db_path: str | Path) -> None:
|
||||
self.db_path = Path(db_path)
|
||||
self._lock = threading.RLock()
|
||||
self._local = threading.local()
|
||||
self._pool_lock = threading.Lock()
|
||||
# Pool stores (connection, last_access_time) tuples
|
||||
self._pool: Dict[int, Tuple[sqlite3.Connection, float]] = {}
|
||||
self._pool_generation = 0
|
||||
self._cleanup_timer: threading.Timer | None = None
|
||||
self._cleanup_stop_event = threading.Event()
|
||||
self._start_cleanup_timer()
|
||||
|
||||
    def _get_connection(self) -> sqlite3.Connection:
        """Get or create a thread-local database connection."""
        thread_id = threading.get_ident()
        current_time = time.time()

        if getattr(self._local, "generation", None) == self._pool_generation:
            conn = getattr(self._local, "conn", None)
            if conn is not None:
                with self._pool_lock:
                    pool_entry = self._pool.get(thread_id)
                    if pool_entry is not None:
                        pooled_conn, _ = pool_entry
                        self._pool[thread_id] = (pooled_conn, current_time)
                        self._local.conn = pooled_conn
                        return pooled_conn

            # Thread-local connection is stale (e.g., cleaned up by timer).
            self._local.conn = None

        with self._pool_lock:
            pool_entry = self._pool.get(thread_id)
            if pool_entry is not None:
                conn, _ = pool_entry
                # Update last access time
                self._pool[thread_id] = (conn, current_time)
            else:
                # Clean up stale and idle connections if pool is too large
                if len(self._pool) >= self.MAX_POOL_SIZE:
                    self._cleanup_stale_connections()

                conn = sqlite3.connect(self.db_path, check_same_thread=False)
                conn.row_factory = sqlite3.Row
                conn.execute("PRAGMA journal_mode=WAL")
                conn.execute("PRAGMA synchronous=NORMAL")
                conn.execute("PRAGMA foreign_keys=ON")
                # Memory-mapped I/O for faster reads (30GB limit)
                conn.execute("PRAGMA mmap_size=30000000000")
                self._pool[thread_id] = (conn, current_time)

        self._local.conn = conn
        self._local.generation = self._pool_generation
        return conn

    def _cleanup_stale_connections(self) -> None:
        """Remove connections for threads that no longer exist or have been idle too long."""
        current_time = time.time()
        # Get list of active thread IDs
        active_threads = {t.ident for t in threading.enumerate() if t.ident is not None}

        # Find connections to remove: dead threads or idle timeout exceeded
        stale_ids: list[tuple[int, str]] = []
        for tid, (conn, last_access) in list(self._pool.items()):
            try:
                is_dead_thread = tid not in active_threads
                is_idle = (current_time - last_access) > self.IDLE_TIMEOUT

                is_invalid_connection = False
                if not is_dead_thread and not is_idle:
                    try:
                        conn.execute("SELECT 1").fetchone()
                    except sqlite3.ProgrammingError:
                        is_invalid_connection = True
                    except sqlite3.Error:
                        is_invalid_connection = True

                if is_invalid_connection:
                    stale_ids.append((tid, "invalid_connection"))
                elif is_dead_thread:
                    stale_ids.append((tid, "dead_thread"))
                elif is_idle:
                    stale_ids.append((tid, "idle_timeout"))
            except Exception:
                # Never break cleanup for a single bad entry.
                continue

        # Close and remove stale connections
        for tid, reason in stale_ids:
            try:
                conn, _ = self._pool[tid]
                conn.close()
            except Exception:
                pass
            del self._pool[tid]
            logger.debug("Cleaned SQLiteStore connection for thread_id=%s (%s)", tid, reason)

    def _start_cleanup_timer(self) -> None:
        if self.CLEANUP_INTERVAL <= 0:
            return

        self._cleanup_stop_event.clear()

        def tick() -> None:
            if self._cleanup_stop_event.is_set():
                return

            try:
                with self._pool_lock:
                    self._cleanup_stale_connections()
            finally:
                with self._pool_lock:
                    if self._cleanup_stop_event.is_set():
                        self._cleanup_timer = None
                        return

                    self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick)
                    self._cleanup_timer.daemon = True
                    self._cleanup_timer.start()

        self._cleanup_timer = threading.Timer(self.CLEANUP_INTERVAL, tick)
        self._cleanup_timer.daemon = True
        self._cleanup_timer.start()

    def _stop_cleanup_timer(self) -> None:
        self._cleanup_stop_event.set()
        with self._pool_lock:
            if self._cleanup_timer is not None:
                self._cleanup_timer.cancel()
                self._cleanup_timer = None

    def close(self) -> None:
        """Close all pooled connections."""
        with self._lock:
            self._stop_cleanup_timer()
            with self._pool_lock:
                for conn, _ in self._pool.values():
                    conn.close()
                self._pool.clear()
                self._pool_generation += 1

            if hasattr(self._local, "conn"):
                self._local.conn = None
            if hasattr(self._local, "generation"):
                self._local.generation = self._pool_generation

    def __enter__(self) -> SQLiteStore:
        self.initialize()
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        self.close()

    def execute_query(
        self,
        sql: str,
        params: tuple = (),
        allow_writes: bool = False
    ) -> List[Dict[str, Any]]:
        """Execute a raw SQL query and return results as dictionaries.

        This is the public API for executing custom queries without bypassing
        encapsulation via _get_connection().

        By default, only SELECT queries are allowed. Use allow_writes=True
        for trusted internal code that needs to execute other statements.

        Args:
            sql: SQL query string with ? placeholders for parameters
            params: Tuple of parameter values to bind
            allow_writes: If True, allow non-SELECT statements (default False)

        Returns:
            List of result rows as dictionaries

        Raises:
            StorageError: If query execution fails or validation fails
        """
        # Validate query type for security
        sql_stripped = sql.strip().upper()
        if not allow_writes:
            # Only allow SELECT and WITH (for CTEs) statements
            if not (sql_stripped.startswith("SELECT") or sql_stripped.startswith("WITH")):
                raise StorageError(
                    "Only SELECT queries are allowed. "
                    "Use allow_writes=True for trusted internal operations.",
                    db_path=str(self.db_path),
                    operation="execute_query",
                    details={"query_type": sql_stripped.split()[0] if sql_stripped else "EMPTY"}
                )

        try:
            conn = self._get_connection()
            rows = conn.execute(sql, params).fetchall()
            return [dict(row) for row in rows]
        except sqlite3.Error as e:
            raise StorageError(
                f"Query execution failed: {e}",
                db_path=str(self.db_path),
                operation="execute_query",
                details={"error_type": type(e).__name__}
            ) from e

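    # Usage sketch for execute_query (illustrative): read-only queries run as-is,
    # anything else requires allow_writes=True. The db path below is hypothetical.
    #
    #     store = SQLiteStore("index.db")
    #     store.initialize()
    #     rows = store.execute_query(
    #         "SELECT path, line_count FROM files WHERE language = ?",
    #         ("python",),
    #     )
    #     store.execute_query("DELETE FROM symbols")                     # raises StorageError
    #     store.execute_query("DELETE FROM symbols", allow_writes=True)  # permitted
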
    def initialize(self) -> None:
        with self._lock:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            conn = self._get_connection()
            self._create_schema(conn)
            self._ensure_fts_external_content(conn)

    def add_file(self, indexed_file: IndexedFile, content: str) -> None:
        with self._lock:
            conn = self._get_connection()
            path = str(Path(indexed_file.path).resolve())
            language = indexed_file.language
            mtime = Path(path).stat().st_mtime if Path(path).exists() else None
            line_count = content.count(chr(10)) + 1

            conn.execute(
                """
                INSERT INTO files(path, language, content, mtime, line_count)
                VALUES(?, ?, ?, ?, ?)
                ON CONFLICT(path) DO UPDATE SET
                    language=excluded.language,
                    content=excluded.content,
                    mtime=excluded.mtime,
                    line_count=excluded.line_count
                """,
                (path, language, content, mtime, line_count),
            )

            row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone()
            if not row:
                raise StorageError(f"Failed to read file id for {path}")
            file_id = int(row["id"])

            conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
            if indexed_file.symbols:
                conn.executemany(
                    """
                    INSERT INTO symbols(file_id, name, kind, start_line, end_line)
                    VALUES(?, ?, ?, ?, ?)
                    """,
                    [
                        (file_id, s.name, s.kind, s.range[0], s.range[1])
                        for s in indexed_file.symbols
                    ],
                )
            conn.commit()

    def add_files(self, files_data: List[tuple[IndexedFile, str]]) -> None:
        """Add multiple files in a single transaction for better performance.

        Args:
            files_data: List of (indexed_file, content) tuples
        """
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute("BEGIN")

                for indexed_file, content in files_data:
                    path = str(Path(indexed_file.path).resolve())
                    language = indexed_file.language
                    mtime = Path(path).stat().st_mtime if Path(path).exists() else None
                    line_count = content.count(chr(10)) + 1

                    conn.execute(
                        """
                        INSERT INTO files(path, language, content, mtime, line_count)
                        VALUES(?, ?, ?, ?, ?)
                        ON CONFLICT(path) DO UPDATE SET
                            language=excluded.language,
                            content=excluded.content,
                            mtime=excluded.mtime,
                            line_count=excluded.line_count
                        """,
                        (path, language, content, mtime, line_count),
                    )

                    row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone()
                    if not row:
                        raise StorageError(f"Failed to read file id for {path}")
                    file_id = int(row["id"])

                    conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
                    if indexed_file.symbols:
                        conn.executemany(
                            """
                            INSERT INTO symbols(file_id, name, kind, start_line, end_line)
                            VALUES(?, ?, ?, ?, ?)
                            """,
                            [
                                (file_id, s.name, s.kind, s.range[0], s.range[1])
                                for s in indexed_file.symbols
                            ],
                        )

                conn.commit()
            except Exception as exc:
                try:
                    conn.rollback()
                except Exception as rollback_exc:
                    logger.error(
                        "Rollback failed after add_files() error (%s): %s", exc, rollback_exc
                    )
                    raise exc.with_traceback(exc.__traceback__) from rollback_exc
                raise

    def remove_file(self, path: str | Path) -> bool:
        """Remove a file from the index."""
        with self._lock:
            conn = self._get_connection()
            resolved_path = str(Path(path).resolve())

            row = conn.execute(
                "SELECT id FROM files WHERE path=?", (resolved_path,)
            ).fetchone()

            if not row:
                return False

            file_id = int(row["id"])
            conn.execute("DELETE FROM files WHERE id=?", (file_id,))
            conn.commit()
            return True

    def file_exists(self, path: str | Path) -> bool:
        """Check if a file exists in the index."""
        with self._lock:
            conn = self._get_connection()
            resolved_path = str(Path(path).resolve())
            row = conn.execute(
                "SELECT 1 FROM files WHERE path=?", (resolved_path,)
            ).fetchone()
            return row is not None

    def get_file_mtime(self, path: str | Path) -> float | None:
        """Get the stored mtime for a file."""
        with self._lock:
            conn = self._get_connection()
            resolved_path = str(Path(path).resolve())
            row = conn.execute(
                "SELECT mtime FROM files WHERE path=?", (resolved_path,)
            ).fetchone()
            return float(row["mtime"]) if row and row["mtime"] else None

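    # Note on query syntax for the FTS search methods below (illustrative examples):
    # files_fts is created with tokenize="unicode61 tokenchars '_'", so identifiers
    # such as path_mapper remain single tokens. Standard FTS5 MATCH syntax applies:
    #
    #     store.search_fts("path_mapper")          # exact token
    #     store.search_fts("regis*")               # prefix query
    #     store.search_fts("symbol AND index")     # boolean operators
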
    def search_fts(self, query: str, *, limit: int = 20, offset: int = 0) -> List[SearchResult]:
        with self._lock:
            conn = self._get_connection()
            try:
                rows = conn.execute(
                    """
                    SELECT rowid, path, bm25(files_fts) AS rank,
                           snippet(files_fts, 2, '[bold red]', '[/bold red]', "...", 20) AS excerpt
                    FROM files_fts
                    WHERE files_fts MATCH ?
                    ORDER BY rank
                    LIMIT ? OFFSET ?
                    """,
                    (query, limit, offset),
                ).fetchall()
            except sqlite3.DatabaseError as exc:
                raise StorageError(f"FTS search failed: {exc}") from exc

            results: List[SearchResult] = []
            for row in rows:
                rank = float(row["rank"]) if row["rank"] is not None else 0.0
                score = abs(rank) if rank < 0 else 0.0
                results.append(
                    SearchResult(
                        path=row["path"],
                        score=score,
                        excerpt=row["excerpt"],
                    )
                )
            return results

    def search_files_only(
        self, query: str, *, limit: int = 20, offset: int = 0
    ) -> List[str]:
        """Search indexed file contents and return only file paths."""
        with self._lock:
            conn = self._get_connection()
            try:
                rows = conn.execute(
                    """
                    SELECT path
                    FROM files_fts
                    WHERE files_fts MATCH ?
                    ORDER BY bm25(files_fts)
                    LIMIT ? OFFSET ?
                    """,
                    (query, limit, offset),
                ).fetchall()
            except sqlite3.DatabaseError as exc:
                raise StorageError(f"FTS search failed: {exc}") from exc

            return [row["path"] for row in rows]

    def search_symbols(
        self, name: str, *, kind: Optional[str] = None, limit: int = 50
    ) -> List[Symbol]:
        pattern = f"%{name}%"
        with self._lock:
            conn = self._get_connection()
            if kind:
                rows = conn.execute(
                    """
                    SELECT name, kind, start_line, end_line
                    FROM symbols
                    WHERE name LIKE ? AND kind=?
                    ORDER BY name
                    LIMIT ?
                    """,
                    (pattern, kind, limit),
                ).fetchall()
            else:
                rows = conn.execute(
                    """
                    SELECT name, kind, start_line, end_line
                    FROM symbols
                    WHERE name LIKE ?
                    ORDER BY name
                    LIMIT ?
                    """,
                    (pattern, limit),
                ).fetchall()

            return [
                Symbol(name=row["name"], kind=row["kind"], range=(row["start_line"], row["end_line"]))
                for row in rows
            ]

    def stats(self) -> Dict[str, Any]:
        with self._lock:
            conn = self._get_connection()
            file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"]
            symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"]
            lang_rows = conn.execute(
                "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC"
            ).fetchall()
            languages = {row["language"]: row["c"] for row in lang_rows}
            # Include relationship count if table exists
            relationship_count = 0
            try:
                rel_row = conn.execute("SELECT COUNT(*) AS c FROM code_relationships").fetchone()
                relationship_count = int(rel_row["c"]) if rel_row else 0
            except sqlite3.DatabaseError:
                pass

            return {
                "files": int(file_count),
                "symbols": int(symbol_count),
                "relationships": relationship_count,
                "languages": languages,
                "db_path": str(self.db_path),
            }

    def _connect(self) -> sqlite3.Connection:
        """Legacy method for backward compatibility."""
        return self._get_connection()

    def _create_schema(self, conn: sqlite3.Connection) -> None:
        try:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS files (
                    id INTEGER PRIMARY KEY,
                    path TEXT UNIQUE NOT NULL,
                    language TEXT NOT NULL,
                    content TEXT NOT NULL,
                    mtime REAL,
                    line_count INTEGER
                )
                """
            )
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS symbols (
                    id INTEGER PRIMARY KEY,
                    file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
                    name TEXT NOT NULL,
                    kind TEXT NOT NULL,
                    start_line INTEGER NOT NULL,
                    end_line INTEGER NOT NULL
                )
                """
            )
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind)")
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS code_relationships (
                    id INTEGER PRIMARY KEY,
                    source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
                    target_qualified_name TEXT NOT NULL,
                    relationship_type TEXT NOT NULL,
                    source_line INTEGER NOT NULL,
                    target_file TEXT
                )
                """
            )
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
            # Chunks table for multi-vector storage (cascade retrieval architecture)
            # - embedding: Original embedding for backward compatibility
            # - embedding_binary: 256-dim binary vector for coarse ranking
            # - embedding_dense: 2048-dim dense vector for fine ranking
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_path TEXT NOT NULL,
                    content TEXT NOT NULL,
                    embedding BLOB,
                    embedding_binary BLOB,
                    embedding_dense BLOB,
                    metadata TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
                """
            )
            conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)")
            # Run migration for existing databases
            self._migrate_chunks_table(conn)
            conn.commit()
        except sqlite3.DatabaseError as exc:
            raise StorageError(f"Failed to initialize database schema: {exc}") from exc

    def _ensure_fts_external_content(self, conn: sqlite3.Connection) -> None:
        """Ensure files_fts is an FTS5 external-content table (no content duplication)."""
        try:
            sql_row = conn.execute(
                "SELECT sql FROM sqlite_master WHERE type='table' AND name='files_fts'"
            ).fetchone()
            sql = str(sql_row["sql"]) if sql_row and sql_row["sql"] else None

            if sql is None:
                self._create_external_fts(conn)
                conn.commit()
                return

            if (
                "content='files'" in sql
                or 'content="files"' in sql
                or "content=files" in sql
            ):
                self._create_fts_triggers(conn)
                conn.commit()
                return

            self._migrate_fts_to_external(conn)
        except sqlite3.DatabaseError as exc:
            raise StorageError(f"Failed to ensure FTS schema: {exc}") from exc

    def _create_external_fts(self, conn: sqlite3.Connection) -> None:
        conn.execute(
            """
            CREATE VIRTUAL TABLE files_fts USING fts5(
                path UNINDEXED,
                language UNINDEXED,
                content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_'"
            )
            """
        )
        self._create_fts_triggers(conn)

    def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts(rowid, path, language, content)
                VALUES(new.id, new.path, new.language, new.content);
            END
            """
        )
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts(files_fts, rowid, path, language, content)
                VALUES('delete', old.id, old.path, old.language, old.content);
            END
            """
        )
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts(files_fts, rowid, path, language, content)
                VALUES('delete', old.id, old.path, old.language, old.content);
                INSERT INTO files_fts(rowid, path, language, content)
                VALUES(new.id, new.path, new.language, new.content);
            END
            """
        )

    def _migrate_fts_to_external(self, conn: sqlite3.Connection) -> None:
        """Migrate legacy files_fts (with duplicated content) to external content."""
        try:
            conn.execute("BEGIN")
            conn.execute("DROP TRIGGER IF EXISTS files_ai")
            conn.execute("DROP TRIGGER IF EXISTS files_ad")
            conn.execute("DROP TRIGGER IF EXISTS files_au")

            conn.execute("ALTER TABLE files_fts RENAME TO files_fts_legacy")
            self._create_external_fts(conn)
            conn.execute("INSERT INTO files_fts(files_fts) VALUES('rebuild')")
            conn.execute("DROP TABLE files_fts_legacy")
            conn.commit()
        except sqlite3.DatabaseError as exc:
            try:
                conn.rollback()
            except Exception as rollback_exc:
                logger.error(
                    "Rollback failed during FTS schema migration (%s): %s", exc, rollback_exc
                )
                raise exc.with_traceback(exc.__traceback__) from rollback_exc

            try:
                conn.execute("DROP TABLE IF EXISTS files_fts")
            except Exception:
                pass

            try:
                conn.execute("ALTER TABLE files_fts_legacy RENAME TO files_fts")
                conn.commit()
            except Exception:
                pass
            raise

        try:
            conn.execute("VACUUM")
        except sqlite3.DatabaseError:
            pass

    def _migrate_chunks_table(self, conn: sqlite3.Connection) -> None:
        """Migrate existing chunks table to add multi-vector columns if needed.

        This handles upgrading existing databases that may have the chunks table
        without the embedding_binary and embedding_dense columns.
        """
        # Check if chunks table exists
        table_exists = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
        ).fetchone()

        if not table_exists:
            # Table doesn't exist yet, nothing to migrate
            return

        # Check existing columns
        cursor = conn.execute("PRAGMA table_info(chunks)")
        columns = {row[1] for row in cursor.fetchall()}

        # Add embedding_binary column if missing
        if "embedding_binary" not in columns:
            logger.info("Migrating chunks table: adding embedding_binary column")
            conn.execute(
                "ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
            )

        # Add embedding_dense column if missing
        if "embedding_dense" not in columns:
            logger.info("Migrating chunks table: adding embedding_dense column")
            conn.execute(
                "ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
            )

    def add_chunks(
        self,
        file_path: str,
        chunks_data: List[Dict[str, Any]],
        *,
        embedding: Optional[List[List[float]]] = None,
        embedding_binary: Optional[List[bytes]] = None,
        embedding_dense: Optional[List[bytes]] = None,
    ) -> List[int]:
        """Add multiple chunks with multi-vector embeddings support.

        This method supports the cascade retrieval architecture with three embedding types:
        - embedding: Original dense embedding for backward compatibility
        - embedding_binary: 256-dim binary vector for fast coarse ranking
        - embedding_dense: 2048-dim dense vector for precise fine ranking

        Args:
            file_path: Path to the source file for all chunks.
            chunks_data: List of dicts with 'content' and optional 'metadata' keys.
            embedding: Optional list of dense embeddings (one per chunk).
            embedding_binary: Optional list of binary embeddings as bytes (one per chunk).
            embedding_dense: Optional list of dense embeddings as bytes (one per chunk).

        Returns:
            List of inserted chunk IDs.

        Raises:
            ValueError: If embedding list lengths don't match chunks_data length.
            StorageError: If database operation fails.
        """
        if not chunks_data:
            return []

        n_chunks = len(chunks_data)

        # Validate embedding lengths
        if embedding is not None and len(embedding) != n_chunks:
            raise ValueError(
                f"embedding length ({len(embedding)}) != chunks_data length ({n_chunks})"
            )
        if embedding_binary is not None and len(embedding_binary) != n_chunks:
            raise ValueError(
                f"embedding_binary length ({len(embedding_binary)}) != chunks_data length ({n_chunks})"
            )
        if embedding_dense is not None and len(embedding_dense) != n_chunks:
            raise ValueError(
                f"embedding_dense length ({len(embedding_dense)}) != chunks_data length ({n_chunks})"
            )

        # Prepare batch data
        batch_data = []
        for i, chunk in enumerate(chunks_data):
            content = chunk.get("content", "")
            metadata = chunk.get("metadata")
            metadata_json = json.dumps(metadata) if metadata else None

            # Convert embeddings to bytes if needed
            emb_blob = None
            if embedding is not None:
                import struct
                emb_blob = struct.pack(f"{len(embedding[i])}f", *embedding[i])

            emb_binary_blob = embedding_binary[i] if embedding_binary is not None else None
            emb_dense_blob = embedding_dense[i] if embedding_dense is not None else None

            batch_data.append((
                file_path, content, emb_blob, emb_binary_blob, emb_dense_blob, metadata_json
            ))

        with self._lock:
            conn = self._get_connection()
            try:
                # Get starting ID before insert
                row = conn.execute("SELECT MAX(id) FROM chunks").fetchone()
                start_id = (row[0] or 0) + 1

                conn.executemany(
                    """
                    INSERT INTO chunks (
                        file_path, content, embedding, embedding_binary,
                        embedding_dense, metadata
                    )
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                    batch_data
                )
                conn.commit()

                # Calculate inserted IDs
                return list(range(start_id, start_id + n_chunks))

            except sqlite3.DatabaseError as exc:
                raise StorageError(
                    f"Failed to add chunks: {exc}",
                    db_path=str(self.db_path),
                    operation="add_chunks",
                ) from exc

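    # Illustrative note on the embedding encodings used above, matching the
    # struct.pack call in add_chunks:
    #
    #     import struct
    #     vec = [0.1, 0.2, 0.3]
    #     blob = struct.pack(f"{len(vec)}f", *vec)            # float32 list -> BLOB
    #     restored = list(struct.unpack(f"{len(vec)}f", blob))
    #
    # embedding_binary is stored as caller-packed bytes (e.g. 256 bits = 32 bytes)
    # and is typically compared with Hamming distance during coarse ranking; that
    # ranking logic is an assumption here, since it lives outside this module.
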
    def get_binary_embeddings(
        self, chunk_ids: List[int]
    ) -> Dict[int, Optional[bytes]]:
        """Get binary embeddings for specified chunk IDs.

        Used for coarse ranking in cascade retrieval architecture.
        Binary embeddings (256-dim) enable fast approximate similarity search.

        Args:
            chunk_ids: List of chunk IDs to retrieve embeddings for.

        Returns:
            Dictionary mapping chunk_id to embedding_binary bytes (or None if not set).

        Raises:
            StorageError: If database query fails.
        """
        if not chunk_ids:
            return {}

        with self._lock:
            conn = self._get_connection()
            try:
                placeholders = ",".join("?" * len(chunk_ids))
                rows = conn.execute(
                    f"SELECT id, embedding_binary FROM chunks WHERE id IN ({placeholders})",
                    chunk_ids
                ).fetchall()

                return {row["id"]: row["embedding_binary"] for row in rows}

            except sqlite3.DatabaseError as exc:
                raise StorageError(
                    f"Failed to get binary embeddings: {exc}",
                    db_path=str(self.db_path),
                    operation="get_binary_embeddings",
                ) from exc

    def get_dense_embeddings(
        self, chunk_ids: List[int]
    ) -> Dict[int, Optional[bytes]]:
        """Get dense embeddings for specified chunk IDs.

        Used for fine ranking in cascade retrieval architecture.
        Dense embeddings (2048-dim) provide high-precision similarity scoring.

        Args:
            chunk_ids: List of chunk IDs to retrieve embeddings for.

        Returns:
            Dictionary mapping chunk_id to embedding_dense bytes (or None if not set).

        Raises:
            StorageError: If database query fails.
        """
        if not chunk_ids:
            return {}

        with self._lock:
            conn = self._get_connection()
            try:
                placeholders = ",".join("?" * len(chunk_ids))
                rows = conn.execute(
                    f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})",
                    chunk_ids
                ).fetchall()

                return {row["id"]: row["embedding_dense"] for row in rows}

            except sqlite3.DatabaseError as exc:
                raise StorageError(
                    f"Failed to get dense embeddings: {exc}",
                    db_path=str(self.db_path),
                    operation="get_dense_embeddings",
                ) from exc

    def get_chunks_by_ids(
        self, chunk_ids: List[int]
    ) -> List[Dict[str, Any]]:
        """Get chunk data for specified IDs.

        Args:
            chunk_ids: List of chunk IDs to retrieve.

        Returns:
            List of chunk dictionaries with id, file_path, content, metadata.

        Raises:
            StorageError: If database query fails.
        """
        if not chunk_ids:
            return []

        with self._lock:
            conn = self._get_connection()
            try:
                placeholders = ",".join("?" * len(chunk_ids))
                rows = conn.execute(
                    f"""
                    SELECT id, file_path, content, metadata, created_at
                    FROM chunks
                    WHERE id IN ({placeholders})
                    """,
                    chunk_ids
                ).fetchall()

                results = []
                for row in rows:
                    metadata = None
                    if row["metadata"]:
                        try:
                            metadata = json.loads(row["metadata"])
                        except json.JSONDecodeError:
                            pass

                    results.append({
                        "id": row["id"],
                        "file_path": row["file_path"],
                        "content": row["content"],
                        "metadata": metadata,
                        "created_at": row["created_at"],
                    })

                return results

            except sqlite3.DatabaseError as exc:
                raise StorageError(
                    f"Failed to get chunks: {exc}",
                    db_path=str(self.db_path),
                    operation="get_chunks_by_ids",
                ) from exc

    def delete_chunks_by_file(self, file_path: str) -> int:
        """Delete all chunks for a given file path.

        Args:
            file_path: Path to the source file.

        Returns:
            Number of deleted chunks.

        Raises:
            StorageError: If database operation fails.
        """
        with self._lock:
            conn = self._get_connection()
            try:
                cursor = conn.execute(
                    "DELETE FROM chunks WHERE file_path = ?",
                    (file_path,)
                )
                conn.commit()
                return cursor.rowcount

            except sqlite3.DatabaseError as exc:
                raise StorageError(
                    f"Failed to delete chunks: {exc}",
                    db_path=str(self.db_path),
                    operation="delete_chunks_by_file",
                ) from exc

    def count_chunks(self) -> int:
        """Count total chunks in store.

        Returns:
            Total number of chunks.
        """
        with self._lock:
            conn = self._get_connection()
            row = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()
            return int(row["c"]) if row else 0
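

# Usage sketch (illustrative only): end-to-end flow of indexing a file and querying
# the store. IndexedFile and Symbol come from codexlens.entities; their constructor
# signatures are assumed from how add_file() reads them (path, language, symbols,
# and Symbol(name, kind, range)), and all paths below are hypothetical.
def _example_sqlite_store_usage(db_path: str, source_path: str, source_text: str) -> None:
    symbols = [Symbol(name="main", kind="function", range=(1, 10))]
    indexed = IndexedFile(path=source_path, language="python", symbols=symbols)

    with SQLiteStore(db_path) as store:            # __enter__ calls initialize()
        store.add_files([(indexed, source_text)])  # batched, single transaction
        hits = store.search_fts("main", limit=5)   # FTS5 bm25-ranked results
        funcs = store.search_symbols("main", kind="function")
        print(store.stats(), [h.path for h in hits], [s.name for s in funcs])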
64
codex-lens/build/lib/codexlens/storage/sqlite_utils.py
Normal file
@@ -0,0 +1,64 @@
"""SQLite utility functions for CodexLens storage layer."""

from __future__ import annotations

import logging
import sqlite3

log = logging.getLogger(__name__)


def check_trigram_support(conn: sqlite3.Connection) -> bool:
    """Check if SQLite supports trigram tokenizer for FTS5.

    Trigram tokenizer requires SQLite >= 3.34.0.

    Args:
        conn: Database connection to test

    Returns:
        True if trigram tokenizer is available, False otherwise
    """
    try:
        # Test by creating a temporary virtual table with trigram tokenizer
        conn.execute(
            """
            CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check
            USING fts5(test_content, tokenize='trigram')
            """
        )
        # Clean up test table
        conn.execute("DROP TABLE IF EXISTS test_trigram_check")
        conn.commit()
        return True
    except sqlite3.OperationalError as e:
        # Trigram tokenizer not available
        if "unrecognized tokenizer" in str(e).lower():
            log.debug("Trigram tokenizer not available in this SQLite version")
            return False
        # Other operational errors should be re-raised
        raise
    except Exception:
        # Any other exception means trigram is not supported
        return False


def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]:
    """Get SQLite version as (major, minor, patch) tuple.

    Args:
        conn: Database connection

    Returns:
        Version tuple, e.g., (3, 34, 1)
    """
    row = conn.execute("SELECT sqlite_version()").fetchone()
    version_str = row[0] if row else "0.0.0"
    parts = version_str.split('.')
    try:
        major = int(parts[0]) if len(parts) > 0 else 0
        minor = int(parts[1]) if len(parts) > 1 else 0
        patch = int(parts[2]) if len(parts) > 2 else 0
        return (major, minor, patch)
    except (ValueError, IndexError):
        return (0, 0, 0)
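

# Usage sketch (illustrative): probing an in-memory database for trigram support
# before choosing an FTS5 tokenizer.
def _example_capability_probe() -> None:
    conn = sqlite3.connect(":memory:")
    version = get_sqlite_version(conn)
    if version >= (3, 34, 0) and check_trigram_support(conn):
        log.info("trigram tokenizer available (SQLite %s)", ".".join(map(str, version)))
    else:
        log.info("falling back to unicode61 tokenizer")
    conn.close()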
415
codex-lens/build/lib/codexlens/storage/vector_meta_store.py
Normal file
@@ -0,0 +1,415 @@
"""Central storage for vector metadata.

This module provides a centralized SQLite database for storing chunk metadata
associated with centralized vector indexes. Instead of traversing all _index.db
files to fetch chunk metadata, this provides O(1) lookup by chunk ID.
"""

from __future__ import annotations

import json
import logging
import sqlite3
import threading
from pathlib import Path
from typing import Any, Dict, List, Optional

from codexlens.errors import StorageError

logger = logging.getLogger(__name__)


class VectorMetadataStore:
    """Store and retrieve chunk metadata for centralized vector search.

    This class provides efficient storage and retrieval of chunk metadata
    for the centralized vector index architecture. All chunk metadata is
    stored in a single _vectors_meta.db file at the project root, enabling
    fast lookups without traversing multiple _index.db files.

    Schema:
        chunk_metadata:
            - chunk_id: INTEGER PRIMARY KEY - Global chunk ID
            - file_path: TEXT NOT NULL - Path to source file
            - content: TEXT - Chunk text content
            - start_line: INTEGER - Start line in source file
            - end_line: INTEGER - End line in source file
            - category: TEXT - Content category (code/doc)
            - metadata: TEXT - JSON-encoded additional metadata
            - source_index_db: TEXT - Path to source _index.db file
    """

    def __init__(self, db_path: Path | str) -> None:
        """Initialize VectorMetadataStore.

        Args:
            db_path: Path to SQLite database file.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        # Thread-safe connection management
        self._lock = threading.RLock()
        self._local = threading.local()

    def _get_connection(self) -> sqlite3.Connection:
        """Get or create a thread-local database connection.

        Each thread gets its own connection to ensure thread safety.
        """
        conn = getattr(self._local, "conn", None)
        if conn is None:
            conn = sqlite3.connect(
                str(self.db_path),
                timeout=30.0,
                check_same_thread=True,
            )
            conn.row_factory = sqlite3.Row
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA synchronous=NORMAL")
            conn.execute("PRAGMA mmap_size=1073741824")  # 1GB mmap
            self._local.conn = conn
        return conn

    def _ensure_schema(self) -> None:
        """Create tables if they don't exist."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS chunk_metadata (
                        chunk_id INTEGER PRIMARY KEY,
                        file_path TEXT NOT NULL,
                        content TEXT,
                        start_line INTEGER,
                        end_line INTEGER,
                        category TEXT,
                        metadata TEXT,
                        source_index_db TEXT
                    )
                ''')
                conn.execute(
                    'CREATE INDEX IF NOT EXISTS idx_chunk_file_path '
                    'ON chunk_metadata(file_path)'
                )
                conn.execute(
                    'CREATE INDEX IF NOT EXISTS idx_chunk_category '
                    'ON chunk_metadata(category)'
                )
                # Binary vectors table for cascade search
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS binary_vectors (
                        chunk_id INTEGER PRIMARY KEY,
                        vector BLOB NOT NULL
                    )
                ''')
                conn.commit()
                logger.debug("VectorMetadataStore schema created/verified")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to create schema: {e}",
                    db_path=str(self.db_path),
                    operation="_ensure_schema"
                ) from e

    def add_chunk(
        self,
        chunk_id: int,
        file_path: str,
        content: str,
        start_line: Optional[int] = None,
        end_line: Optional[int] = None,
        category: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        source_index_db: Optional[str] = None,
    ) -> None:
        """Add a single chunk's metadata.

        Args:
            chunk_id: Global unique chunk ID.
            file_path: Path to source file.
            content: Chunk text content.
            start_line: Start line in source file.
            end_line: End line in source file.
            category: Content category (code/doc).
            metadata: Additional metadata dictionary.
            source_index_db: Path to source _index.db file.
        """
        with self._lock:
            conn = self._get_connection()
            try:
                metadata_json = json.dumps(metadata) if metadata else None
                conn.execute(
                    '''
                    INSERT OR REPLACE INTO chunk_metadata
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata, source_index_db)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''',
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata_json, source_index_db)
                )
                conn.commit()
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to add chunk {chunk_id}: {e}",
                    db_path=str(self.db_path),
                    operation="add_chunk"
                ) from e

    def add_chunks(self, chunks: List[Dict[str, Any]]) -> None:
        """Batch insert chunk metadata.

        Args:
            chunks: List of dictionaries with keys:
                - chunk_id (required): Global unique chunk ID
                - file_path (required): Path to source file
                - content: Chunk text content
                - start_line: Start line in source file
                - end_line: End line in source file
                - category: Content category (code/doc)
                - metadata: Additional metadata dictionary
                - source_index_db: Path to source _index.db file
        """
        if not chunks:
            return

        with self._lock:
            conn = self._get_connection()
            try:
                batch_data = []
                for chunk in chunks:
                    metadata = chunk.get("metadata")
                    metadata_json = json.dumps(metadata) if metadata else None
                    batch_data.append((
                        chunk["chunk_id"],
                        chunk["file_path"],
                        chunk.get("content"),
                        chunk.get("start_line"),
                        chunk.get("end_line"),
                        chunk.get("category"),
                        metadata_json,
                        chunk.get("source_index_db"),
                    ))

                conn.executemany(
                    '''
                    INSERT OR REPLACE INTO chunk_metadata
                    (chunk_id, file_path, content, start_line, end_line,
                     category, metadata, source_index_db)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''',
                    batch_data
                )
                conn.commit()
                logger.debug("Batch inserted %d chunk metadata records", len(chunks))
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to batch insert chunks: {e}",
                    db_path=str(self.db_path),
                    operation="add_chunks"
                ) from e

    def get_chunks_by_ids(
        self,
        chunk_ids: List[int],
        category: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Retrieve chunks by their IDs - the key optimization.

        This is the primary method that replaces traversing all _index.db files.
        Provides O(1) lookup by chunk ID instead of O(n) where n is the number
        of index databases.

        Args:
            chunk_ids: List of chunk IDs to retrieve.
            category: Optional category filter ('code' or 'doc').

        Returns:
            List of dictionaries with chunk metadata:
                - chunk_id: Global chunk ID
                - file_path: Path to source file
                - content: Chunk text content
                - start_line: Start line in source file
                - end_line: End line in source file
                - category: Content category
                - metadata: Parsed metadata dictionary
                - source_index_db: Source _index.db path
        """
        if not chunk_ids:
            return []

        # No lock needed for reads: WAL mode + thread-local connections ensure safety
        conn = self._get_connection()
        try:
            placeholders = ",".join("?" * len(chunk_ids))

            if category:
                query = f'''
                    SELECT chunk_id, file_path, content, start_line, end_line,
                           category, metadata, source_index_db
                    FROM chunk_metadata
                    WHERE chunk_id IN ({placeholders}) AND category = ?
                '''
                params = list(chunk_ids) + [category]
            else:
                query = f'''
                    SELECT chunk_id, file_path, content, start_line, end_line,
                           category, metadata, source_index_db
                    FROM chunk_metadata
                    WHERE chunk_id IN ({placeholders})
                '''
                params = list(chunk_ids)

            rows = conn.execute(query, params).fetchall()

            results = []
            for row in rows:
                metadata = None
                if row["metadata"]:
                    try:
                        metadata = json.loads(row["metadata"])
                    except json.JSONDecodeError:
                        metadata = {}

                results.append({
                    "chunk_id": row["chunk_id"],
                    "file_path": row["file_path"],
                    "content": row["content"],
                    "start_line": row["start_line"],
                    "end_line": row["end_line"],
                    "category": row["category"],
                    "metadata": metadata or {},
                    "source_index_db": row["source_index_db"],
                })

            return results

        except sqlite3.Error as e:
            logger.error("Failed to get chunks by IDs: %s", e)
            return []

    def get_chunk_count(self) -> int:
        """Get total number of chunks in store.

        Returns:
            Total chunk count.
        """
        # No lock needed for reads: WAL mode + thread-local connections ensure safety
        conn = self._get_connection()
        try:
            row = conn.execute(
                "SELECT COUNT(*) FROM chunk_metadata"
            ).fetchone()
            return row[0] if row else 0
        except sqlite3.Error:
            return 0

    def clear(self) -> None:
        """Clear all metadata."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute("DELETE FROM chunk_metadata")
                conn.commit()
                logger.info("Cleared all chunk metadata")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to clear metadata: {e}",
                    db_path=str(self.db_path),
                    operation="clear"
                ) from e

    def close(self) -> None:
        """Close database connection."""
        with self._lock:
            conn = getattr(self._local, "conn", None)
            if conn is not None:
                conn.close()
                self._local.conn = None

    def __enter__(self) -> "VectorMetadataStore":
        """Context manager entry."""
        self._ensure_schema()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit."""
        self.close()

    # ============= Binary Vector Methods for Cascade Search =============

    def add_binary_vectors(
        self, chunk_ids: List[int], binary_vectors: List[bytes]
    ) -> None:
        """Batch insert binary vectors for cascade search.

        Args:
            chunk_ids: List of chunk IDs.
            binary_vectors: List of packed binary vectors (as bytes).
        """
        if not chunk_ids or len(chunk_ids) != len(binary_vectors):
            return

        with self._lock:
            conn = self._get_connection()
            try:
                data = list(zip(chunk_ids, binary_vectors))
                conn.executemany(
                    "INSERT OR REPLACE INTO binary_vectors (chunk_id, vector) VALUES (?, ?)",
                    data
                )
                conn.commit()
                logger.debug("Added %d binary vectors", len(chunk_ids))
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to add binary vectors: {e}",
                    db_path=str(self.db_path),
                    operation="add_binary_vectors"
                ) from e

    def get_all_binary_vectors(self) -> List[tuple]:
        """Get all binary vectors for cascade search.

        Returns:
            List of (chunk_id, vector_bytes) tuples.
        """
        conn = self._get_connection()
        try:
            rows = conn.execute(
                "SELECT chunk_id, vector FROM binary_vectors"
            ).fetchall()
            return [(row[0], row[1]) for row in rows]
        except sqlite3.Error as e:
            logger.error("Failed to get binary vectors: %s", e)
            return []

    def get_binary_vector_count(self) -> int:
        """Get total number of binary vectors.

        Returns:
            Binary vector count.
        """
        conn = self._get_connection()
        try:
            row = conn.execute(
                "SELECT COUNT(*) FROM binary_vectors"
            ).fetchone()
            return row[0] if row else 0
        except sqlite3.Error:
            return 0

    def clear_binary_vectors(self) -> None:
        """Clear all binary vectors."""
        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute("DELETE FROM binary_vectors")
                conn.commit()
                logger.info("Cleared all binary vectors")
            except sqlite3.Error as e:
                raise StorageError(
                    f"Failed to clear binary vectors: {e}",
                    db_path=str(self.db_path),
                    operation="clear_binary_vectors"
                ) from e
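

# Usage sketch (illustrative): populating the central metadata store and doing a
# coarse Hamming-distance pass over the binary vectors. The Hamming ranking shown
# here is an assumption about how a caller would consume get_all_binary_vectors();
# the actual cascade search lives outside this module, and the paths are hypothetical.
def _example_vector_meta_usage(db_path: str) -> None:
    with VectorMetadataStore(db_path) as meta:    # __enter__ ensures the schema
        meta.add_chunks([
            {"chunk_id": 1, "file_path": "src/app.py", "content": "def main(): ...",
             "category": "code", "metadata": {"symbol": "main"}},
        ])
        meta.add_binary_vectors([1], [bytes([0b10110010]) * 32])  # 256-bit vector

        query_vec = bytes([0b10110000]) * 32
        scored = []
        for chunk_id, vec in meta.get_all_binary_vectors():
            # Hamming distance between packed bit vectors
            dist = sum(bin(a ^ b).count("1") for a, b in zip(query_vec, vec))
            scored.append((dist, chunk_id))
        top_ids = [cid for _, cid in sorted(scored)[:10]]
        for chunk in meta.get_chunks_by_ids(top_ids, category="code"):
            print(chunk["chunk_id"], chunk["file_path"], chunk["metadata"])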