Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-10 02:24:35 +08:00
Add comprehensive tests for query parsing and Reciprocal Rank Fusion
- Implemented tests for the QueryParser class, covering the identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge-case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
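
For context, Reciprocal Rank Fusion scores a document by summing 1/(k + rank) over each source list that returns it. Below is a minimal sketch of the scoring rule these tests exercise; the helper name rrf_scores and the constant k=60 are illustrative assumptions, not this project's actual API:

    from collections import defaultdict
    from typing import Dict, List, Optional

    def rrf_scores(rankings: List[List[str]], k: int = 60,
                   weights: Optional[List[float]] = None) -> Dict[str, float]:
        """Fuse ranked lists: score(d) = sum_i w_i / (k + rank_i(d))."""
        weights = weights or [1.0] * len(rankings)
        scores: Dict[str, float] = defaultdict(float)
        for ranked, weight in zip(rankings, weights):
            for rank, doc in enumerate(ranked, start=1):
                scores[doc] += weight / (k + rank)
        return dict(scores)

    # "b.py" appears in both lists, so it outscores "a.py" and "c.py".
    fused = rrf_scores([["a.py", "b.py"], ["b.py", "c.py"]])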
@@ -57,7 +57,7 @@ class DirIndexStore:
 
     # Schema version for migration tracking
     # Increment this when schema changes require migration
-    SCHEMA_VERSION = 2
+    SCHEMA_VERSION = 4
 
     def __init__(self, db_path: str | Path) -> None:
         """Initialize directory index store.
@@ -93,11 +93,13 @@ class DirIndexStore:
         )
 
-        # Create or migrate schema
-        self._create_schema(conn)
-        self._create_fts_triggers(conn)
-
         # Apply versioned migrations if needed
-        if current_version < self.SCHEMA_VERSION:
+        if current_version == 0:
+            # New database - create schema directly
+            self._create_schema(conn)
+            self._create_fts_triggers(conn)
+            self._set_schema_version(conn, self.SCHEMA_VERSION)
+        elif current_version < self.SCHEMA_VERSION:
+            # Existing database - apply migrations
             self._apply_migrations(conn, current_version)
             self._set_schema_version(conn, self.SCHEMA_VERSION)
@@ -126,6 +128,11 @@ class DirIndexStore:
         if from_version < 2:
             self._migrate_v2_add_name_column(conn)
 
+        # Migration v2 -> v4: Add dual FTS tables (exact + fuzzy)
+        if from_version < 4:
+            from codexlens.storage.migrations.migration_004_dual_fts import upgrade
+            upgrade(conn)
+
     def close(self) -> None:
         """Close database connection."""
         with self._lock:
@@ -465,6 +472,117 @@ class DirIndexStore:
 
         return float(row["mtime"]) if row and row["mtime"] else None
 
+    def needs_reindex(self, full_path: str | Path) -> bool:
+        """Check if a file needs reindexing based on mtime comparison.
+
+        Uses 1ms tolerance to handle filesystem timestamp precision variations.
+
+        Args:
+            full_path: Complete source file path
+
+        Returns:
+            True if file should be reindexed (new, modified, or missing from index)
+        """
+        full_path_obj = Path(full_path).resolve()
+        if not full_path_obj.exists():
+            return False  # File doesn't exist, skip indexing
+
+        # Get current filesystem mtime
+        try:
+            current_mtime = full_path_obj.stat().st_mtime
+        except OSError:
+            return False  # Can't read file stats, skip
+
+        # Get stored mtime from database
+        stored_mtime = self.get_file_mtime(full_path_obj)
+
+        # File not in index, needs indexing
+        if stored_mtime is None:
+            return True
+
+        # Compare with 1ms tolerance for floating point precision
+        MTIME_TOLERANCE = 0.001
+        return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
+
+    def add_file_incremental(
+        self,
+        name: str,
+        full_path: str | Path,
+        content: str,
+        language: str,
+        symbols: Optional[List[Symbol]] = None,
+    ) -> Optional[int]:
+        """Add or update a file only if it has changed (incremental indexing).
+
+        Checks mtime before indexing to skip unchanged files.
+
+        Args:
+            name: Filename without path
+            full_path: Complete source file path
+            content: File content for indexing
+            language: Programming language identifier
+            symbols: List of Symbol objects from the file
+
+        Returns:
+            Database file_id if indexed, None if skipped (unchanged)
+
+        Raises:
+            StorageError: If database operations fail
+        """
+        # Check if reindexing is needed
+        if not self.needs_reindex(full_path):
+            return None  # Skip unchanged file
+
+        # File changed or new, perform full indexing
+        return self.add_file(name, full_path, content, language, symbols)
+
+    def cleanup_deleted_files(self, source_dir: Path) -> int:
+        """Remove indexed files that no longer exist in the source directory.
+
+        Scans the source directory and removes database entries for deleted files.
+
+        Args:
+            source_dir: Source directory to scan
+
+        Returns:
+            Number of deleted file entries removed
+
+        Raises:
+            StorageError: If cleanup operations fail
+        """
+        with self._lock:
+            conn = self._get_connection()
+            source_dir = source_dir.resolve()
+
+            try:
+                # Get all indexed file paths
+                rows = conn.execute("SELECT full_path FROM files").fetchall()
+                indexed_paths = {row["full_path"] for row in rows}
+
+                # Build set of existing files in source directory
+                existing_paths = set()
+                for file_path in source_dir.rglob("*"):
+                    if file_path.is_file():
+                        existing_paths.add(str(file_path.resolve()))
+
+                # Find orphaned entries (indexed but no longer exist)
+                deleted_paths = indexed_paths - existing_paths
+
+                # Remove orphaned entries
+                deleted_count = 0
+                for deleted_path in deleted_paths:
+                    conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,))
+                    deleted_count += 1
+
+                if deleted_count > 0:
+                    conn.commit()
+
+                return deleted_count
+
+            except Exception as exc:
+                conn.rollback()
+                raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc
+
     def list_files(self) -> List[FileEntry]:
         """List all files in current directory.
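
A usage sketch of the incremental path added above; the database location and source file are placeholders, while the method signatures come from this diff:

    from pathlib import Path

    store = DirIndexStore(Path(".codexlens/index.db"))  # hypothetical index location
    src = Path("src/example.py")

    # Returns None when the stored mtime matches within the 1ms tolerance.
    file_id = store.add_file_incremental(
        name=src.name,
        full_path=src,
        content=src.read_text(encoding="utf-8", errors="ignore"),
        language="python",
        symbols=None,
    )

    # Drop index rows for files that vanished from the source tree.
    removed = store.cleanup_deleted_files(Path("src").resolve())
    store.close()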
@@ -985,6 +1103,92 @@ class DirIndexStore:
             )
         return results
 
+    def search_fts_exact(self, query: str, limit: int = 20) -> List[SearchResult]:
+        """Full-text search using exact token matching (unicode61 tokenizer).
+
+        Args:
+            query: FTS5 query string
+            limit: Maximum results to return
+
+        Returns:
+            List of SearchResult objects sorted by relevance
+
+        Raises:
+            StorageError: If FTS search fails
+        """
+        with self._lock:
+            conn = self._get_connection()
+            try:
+                rows = conn.execute(
+                    """
+                    SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
+                           snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
+                    FROM files_fts_exact
+                    WHERE files_fts_exact MATCH ?
+                    ORDER BY rank
+                    LIMIT ?
+                    """,
+                    (query, limit),
+                ).fetchall()
+            except sqlite3.DatabaseError as exc:
+                raise StorageError(f"FTS exact search failed: {exc}") from exc
+
+            results: List[SearchResult] = []
+            for row in rows:
+                rank = float(row["rank"]) if row["rank"] is not None else 0.0
+                score = abs(rank) if rank < 0 else 0.0
+                results.append(
+                    SearchResult(
+                        path=row["full_path"],
+                        score=score,
+                        excerpt=row["excerpt"],
+                    )
+                )
+            return results
+
+    def search_fts_fuzzy(self, query: str, limit: int = 20) -> List[SearchResult]:
+        """Full-text search using fuzzy/substring matching (trigram or extended unicode61 tokenizer).
+
+        Args:
+            query: FTS5 query string
+            limit: Maximum results to return
+
+        Returns:
+            List of SearchResult objects sorted by relevance
+
+        Raises:
+            StorageError: If FTS search fails
+        """
+        with self._lock:
+            conn = self._get_connection()
+            try:
+                rows = conn.execute(
+                    """
+                    SELECT rowid, full_path, bm25(files_fts_fuzzy) AS rank,
+                           snippet(files_fts_fuzzy, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
+                    FROM files_fts_fuzzy
+                    WHERE files_fts_fuzzy MATCH ?
+                    ORDER BY rank
+                    LIMIT ?
+                    """,
+                    (query, limit),
+                ).fetchall()
+            except sqlite3.DatabaseError as exc:
+                raise StorageError(f"FTS fuzzy search failed: {exc}") from exc
+
+            results: List[SearchResult] = []
+            for row in rows:
+                rank = float(row["rank"]) if row["rank"] is not None else 0.0
+                score = abs(rank) if rank < 0 else 0.0
+                results.append(
+                    SearchResult(
+                        path=row["full_path"],
+                        score=score,
+                        excerpt=row["excerpt"],
+                    )
+                )
+            return results
+
     def search_files_only(self, query: str, limit: int = 20) -> List[str]:
         """Fast FTS search returning only file paths (no snippet generation).
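
Tying back to the commit message, the exact and fuzzy result lists are natural inputs for RRF. A sketch reusing the hypothetical rrf_scores helper from above, with an assumed weighting in favour of exact matches:

    exact = store.search_fts_exact("user_id", limit=20)
    fuzzy = store.search_fts_fuzzy("user_id", limit=20)

    # Fuse by path, using each list's order as its ranking.
    fused = rrf_scores(
        [[r.path for r in exact], [r.path for r in fuzzy]],
        weights=[1.0, 0.5],  # weighting is an assumption, not from this commit
    )
    top_paths = sorted(fused, key=fused.get, reverse=True)[:20]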
@@ -1185,16 +1389,34 @@ class DirIndexStore:
             """
         )
 
-        # FTS5 external content table with code-friendly tokenizer
-        # unicode61 tokenchars keeps underscores as part of tokens
-        # so 'user_id' is indexed as one token, not 'user' and 'id'
+        # Dual FTS5 external content tables for exact and fuzzy matching
+        # files_fts_exact: unicode61 tokenizer for exact token matching
+        # files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching
+        from codexlens.storage.sqlite_utils import check_trigram_support
+
+        has_trigram = check_trigram_support(conn)
+        fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-'"
+
+        # Exact FTS table with unicode61 tokenizer
         conn.execute(
             """
-            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
+            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
                 name, full_path UNINDEXED, content,
                 content='files',
                 content_rowid='id',
-                tokenize="unicode61 tokenchars '_'"
+                tokenize="unicode61 tokenchars '_-'"
             )
             """
         )
+
+        # Fuzzy FTS table with trigram or extended unicode61 tokenizer
+        conn.execute(
+            f"""
+            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5(
+                name, full_path UNINDEXED, content,
+                content='files',
+                content_rowid='id',
+                tokenize="{fuzzy_tokenizer}"
+            )
+            """
+        )
@@ -1301,38 +1523,72 @@ class DirIndexStore:
             conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id))
 
     def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
-        """Create FTS5 external content triggers.
+        """Create FTS5 external content triggers for dual FTS tables.
+
+        Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables.
 
         Args:
             conn: Database connection
         """
-        # Insert trigger
+        # Insert triggers for files_fts_exact
         conn.execute(
             """
-            CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
-                INSERT INTO files_fts(rowid, name, full_path, content)
+            CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN
+                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                 VALUES(new.id, new.name, new.full_path, new.content);
             END
             """
         )
 
-        # Delete trigger
+        # Delete trigger for files_fts_exact
        conn.execute(
             """
-            CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
-                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
+            CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN
+                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                 VALUES('delete', old.id, old.name, old.full_path, old.content);
             END
             """
         )
 
-        # Update trigger
+        # Update trigger for files_fts_exact
         conn.execute(
             """
-            CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
-                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
+            CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN
+                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                 VALUES('delete', old.id, old.name, old.full_path, old.content);
-                INSERT INTO files_fts(rowid, name, full_path, content)
+                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                 VALUES(new.id, new.name, new.full_path, new.content);
             END
             """
         )
+
+        # Insert trigger for files_fts_fuzzy
+        conn.execute(
+            """
+            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN
+                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
+                VALUES(new.id, new.name, new.full_path, new.content);
+            END
+            """
+        )
+
+        # Delete trigger for files_fts_fuzzy
+        conn.execute(
+            """
+            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN
+                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
+                VALUES('delete', old.id, old.name, old.full_path, old.content);
+            END
+            """
+        )
+
+        # Update trigger for files_fts_fuzzy
+        conn.execute(
+            """
+            CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN
+                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
+                VALUES('delete', old.id, old.name, old.full_path, old.content);
+                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
+                VALUES(new.id, new.name, new.full_path, new.content);
+            END
+            """
+        )
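
Because both FTS tables are external-content tables (content='files'), they hold no row data of their own; the triggers above keep them synchronized with files. A small sanity check, assuming conn is an open sqlite3 connection on which the schema and triggers have already been created:

    conn.execute(
        "INSERT INTO files(name, full_path, content, language) VALUES(?, ?, ?, ?)",
        ("demo.py", "/tmp/demo.py", "def user_id(): pass", "python"),
    )
    conn.commit()

    # The AFTER INSERT triggers should have populated both shadow indexes.
    hits = conn.execute(
        "SELECT rowid FROM files_fts_exact WHERE files_fts_exact MATCH 'user_id'"
    ).fetchall()
    assert hits, "insert trigger failed to index the new row"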
@@ -77,7 +77,7 @@ class IndexTreeBuilder:
     }
 
     def __init__(
-        self, registry: RegistryStore, mapper: PathMapper, config: Config = None
+        self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True
     ):
         """Initialize the index tree builder.
 
@@ -85,18 +85,21 @@ class IndexTreeBuilder:
             registry: Global registry store for project tracking
             mapper: Path mapper for source to index conversions
             config: CodexLens configuration (uses defaults if None)
+            incremental: Enable incremental indexing (default True)
         """
         self.registry = registry
         self.mapper = mapper
         self.config = config or Config()
         self.parser_factory = ParserFactory(self.config)
         self.logger = logging.getLogger(__name__)
+        self.incremental = incremental
 
     def build(
         self,
         source_root: Path,
         languages: List[str] = None,
         workers: int = 4,
+        force_full: bool = False,
     ) -> BuildResult:
         """Build complete index tree for a project.
 
@@ -106,11 +109,13 @@ class IndexTreeBuilder:
         3. Build indexes bottom-up (deepest first)
         4. Link subdirectories to parents
         5. Update project statistics
+        6. Cleanup deleted files (if incremental mode)
 
         Args:
             source_root: Project root directory to index
             languages: Optional list of language IDs to limit indexing
             workers: Number of parallel worker processes
+            force_full: Force full reindex (override incremental mode)
 
         Returns:
             BuildResult with statistics and errors
@@ -122,7 +127,12 @@ class IndexTreeBuilder:
         if not source_root.exists():
             raise ValueError(f"Source root does not exist: {source_root}")
 
-        self.logger.info("Building index tree for %s", source_root)
+        # Override incremental mode if force_full is True
+        use_incremental = self.incremental and not force_full
+        if force_full:
+            self.logger.info("Building index tree for %s (FULL reindex)", source_root)
+        else:
+            self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental)
 
         # Register project
         index_root = self.mapper.source_to_index_dir(source_root)
@@ -186,6 +196,25 @@ class IndexTreeBuilder:
         # Link children to this directory
         self._link_children_to_parent(result.source_path, all_results)
 
+        # Cleanup deleted files if in incremental mode
+        if use_incremental:
+            self.logger.info("Cleaning up deleted files...")
+            total_deleted = 0
+            for result in all_results:
+                if result.error:
+                    continue
+                try:
+                    with DirIndexStore(result.index_path) as store:
+                        deleted_count = store.cleanup_deleted_files(result.source_path)
+                        total_deleted += deleted_count
+                        if deleted_count > 0:
+                            self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)
+                except Exception as exc:
+                    self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc)
+
+            if total_deleted > 0:
+                self.logger.info("Removed %d deleted files from index", total_deleted)
+
         # Update project statistics
         self.registry.update_project_stats(source_root, total_files, total_dirs)
 
@@ -436,9 +465,15 @@ class IndexTreeBuilder:
 
         files_count = 0
         symbols_count = 0
+        skipped_count = 0
 
        for file_path in source_files:
             try:
+                # Check if file needs reindexing (incremental mode)
+                if self.incremental and not store.needs_reindex(file_path):
+                    skipped_count += 1
+                    continue
+
                 # Read and parse file
                 text = file_path.read_text(encoding="utf-8", errors="ignore")
                 language_id = self.config.language_for_path(file_path)
@@ -491,13 +526,23 @@ class IndexTreeBuilder:
 
         store.close()
 
-        self.logger.debug(
-            "Built %s: %d files, %d symbols, %d subdirs",
-            dir_path,
-            files_count,
-            symbols_count,
-            len(subdirs),
-        )
+        if skipped_count > 0:
+            self.logger.debug(
+                "Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs",
+                dir_path,
+                files_count,
+                skipped_count,
+                symbols_count,
+                len(subdirs),
+            )
+        else:
+            self.logger.debug(
+                "Built %s: %d files, %d symbols, %d subdirs",
+                dir_path,
+                files_count,
+                symbols_count,
+                len(subdirs),
+            )
 
         return DirBuildResult(
             source_path=dir_path,
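
A sketch of driving a build with the new flags; the RegistryStore, PathMapper, and Config wiring is assumed rather than taken from this diff, so their real constructors may differ:

    builder = IndexTreeBuilder(registry, mapper, config=Config(), incremental=True)

    # Normal run: unchanged files are skipped via needs_reindex().
    result = builder.build(Path("/work/project"), workers=4)

    # Escape hatch: reindex everything, ignoring stored mtimes.
    result = builder.build(Path("/work/project"), force_full=True)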
codex-lens/src/codexlens/storage/migrations/migration_004_dual_fts.py (new file)
@@ -0,0 +1,231 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.

This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching

Both tables are synchronized with the files table via triggers for automatic updates.
"""

import logging
from sqlite3 import Connection

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to add dual FTS tables.

    - Drops old files_fts table and triggers
    - Creates files_fts_exact with unicode61 tokenizer
    - Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
    - Creates synchronized triggers for both tables
    - Rebuilds FTS indexes from files table

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    try:
        # Check trigram support
        has_trigram = check_trigram_support(db_conn)
        version = get_sqlite_version(db_conn)
        log.info(f"SQLite version: {'.'.join(map(str, version))}")

        if has_trigram:
            log.info("Trigram tokenizer available, using for fuzzy FTS table")
            fuzzy_tokenizer = "trigram"
        else:
            log.warning(
                "Trigram tokenizer not available (requires SQLite >= 3.34), "
                "using extended unicode61 tokenizer for fuzzy matching"
            )
            fuzzy_tokenizer = "unicode61 tokenchars '_-'"

        # Start transaction
        cursor.execute("BEGIN TRANSACTION")

        # Check if files table has 'name' column (v2 schema doesn't have it)
        cursor.execute("PRAGMA table_info(files)")
        columns = {row[1] for row in cursor.fetchall()}

        if 'name' not in columns:
            log.info("Adding 'name' column to files table (v2 schema upgrade)...")
            # Add name column
            cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
            # Populate name from path (extract filename after the last '/')
            # Use Python to do the extraction since SQLite doesn't have reverse()
            cursor.execute("SELECT rowid, path FROM files")
            rows = cursor.fetchall()
            for rowid, path in rows:
                # Extract filename from path
                name = path.split('/')[-1] if '/' in path else path
                cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))

        # Rename 'path' column to 'full_path' if needed
        if 'path' in columns and 'full_path' not in columns:
            log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
            # Check if indexed_at column exists in v2 schema
            has_indexed_at = 'indexed_at' in columns
            has_mtime = 'mtime' in columns

            # SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
            cursor.execute("""
                CREATE TABLE files_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    full_path TEXT NOT NULL UNIQUE,
                    content TEXT,
                    language TEXT,
                    mtime REAL,
                    indexed_at TEXT
                )
            """)

            # Build INSERT statement based on available columns
            # Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT
            if has_indexed_at and has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
                    SELECT name, path, content, language, mtime, indexed_at FROM files
                """)
            elif has_indexed_at:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, indexed_at)
                    SELECT name, path, content, language, indexed_at FROM files
                """)
            elif has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime)
                    SELECT name, path, content, language, mtime FROM files
                """)
            else:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language)
                    SELECT name, path, content, language FROM files
                """)

            cursor.execute("DROP TABLE files")
            cursor.execute("ALTER TABLE files_new RENAME TO files")

        log.info("Dropping old FTS triggers and table...")
        # Drop old triggers
        cursor.execute("DROP TRIGGER IF EXISTS files_ai")
        cursor.execute("DROP TRIGGER IF EXISTS files_ad")
        cursor.execute("DROP TRIGGER IF EXISTS files_au")

        # Drop old FTS table
        cursor.execute("DROP TABLE IF EXISTS files_fts")

        # Create exact FTS table (unicode61 with underscores/hyphens as token chars)
        log.info("Creating files_fts_exact table with unicode61 tokenizer...")
        cursor.execute(
            """
            CREATE VIRTUAL TABLE files_fts_exact USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_-'"
            )
            """
        )

        # Create fuzzy FTS table (trigram or extended unicode61)
        log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
        cursor.execute(
            f"""
            CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="{fuzzy_tokenizer}"
            )
            """
        )

        # Create synchronized triggers for files_fts_exact
        log.info("Creating triggers for files_fts_exact...")
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Create synchronized triggers for files_fts_fuzzy
        log.info("Creating triggers for files_fts_fuzzy...")
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Rebuild FTS indexes from files table
        log.info("Rebuilding FTS indexes from files table...")
        cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
        cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")

        # Commit transaction
        cursor.execute("COMMIT")
        log.info("Migration 004 completed successfully")

        # Vacuum to reclaim space (outside transaction)
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
        except Exception as e:
            log.warning(f"VACUUM failed (non-critical): {e}")

    except Exception as e:
        log.error(f"Migration 004 failed: {e}")
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            pass
        raise
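
The migration normally runs through DirIndexStore._apply_migrations (the `if from_version < 4` branch above), but it can also be applied by hand; a minimal sketch, with an illustrative database path:

    import sqlite3

    from codexlens.storage.migrations.migration_004_dual_fts import upgrade

    conn = sqlite3.connect("index.db")  # path is illustrative
    upgrade(conn)  # drops files_fts, creates files_fts_exact/_fuzzy, rebuilds both
    conn.close()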
codex-lens/src/codexlens/storage/sqlite_utils.py (new file, 64 lines)
@@ -0,0 +1,64 @@
"""SQLite utility functions for CodexLens storage layer."""

from __future__ import annotations

import logging
import sqlite3

log = logging.getLogger(__name__)


def check_trigram_support(conn: sqlite3.Connection) -> bool:
    """Check if SQLite supports trigram tokenizer for FTS5.

    Trigram tokenizer requires SQLite >= 3.34.0.

    Args:
        conn: Database connection to test

    Returns:
        True if trigram tokenizer is available, False otherwise
    """
    try:
        # Test by creating a temporary virtual table with trigram tokenizer
        conn.execute(
            """
            CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check
            USING fts5(test_content, tokenize='trigram')
            """
        )
        # Clean up test table
        conn.execute("DROP TABLE IF EXISTS test_trigram_check")
        conn.commit()
        return True
    except sqlite3.OperationalError as e:
        # Trigram tokenizer not available
        if "unrecognized tokenizer" in str(e).lower():
            log.debug("Trigram tokenizer not available in this SQLite version")
            return False
        # Other operational errors should be re-raised
        raise
    except Exception:
        # Any other exception means trigram is not supported
        return False


def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]:
    """Get SQLite version as (major, minor, patch) tuple.

    Args:
        conn: Database connection

    Returns:
        Version tuple, e.g., (3, 34, 1)
    """
    row = conn.execute("SELECT sqlite_version()").fetchone()
    version_str = row[0] if row else "0.0.0"
    parts = version_str.split('.')
    try:
        major = int(parts[0]) if len(parts) > 0 else 0
        minor = int(parts[1]) if len(parts) > 1 else 0
        patch = int(parts[2]) if len(parts) > 2 else 0
        return (major, minor, patch)
    except (ValueError, IndexError):
        return (0, 0, 0)
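
A short usage sketch of the two helpers; an in-memory database is enough to probe tokenizer support:

    import sqlite3

    from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

    conn = sqlite3.connect(":memory:")
    major, minor, patch = get_sqlite_version(conn)
    if check_trigram_support(conn):
        print(f"SQLite {major}.{minor}.{patch}: trigram fuzzy matching available")
    else:
        print(f"SQLite {major}.{minor}.{patch}: falling back to extended unicode61")
    conn.close()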