Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-10 02:24:35 +08:00
Add comprehensive tests for query parsing and Reciprocal Rank Fusion
- Implemented tests for the QueryParser class, covering the identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge-case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
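
For context, Reciprocal Rank Fusion scores a document by summing 1/(k + rank) over each source list that returns it. Below is a minimal sketch of the scoring rule these tests exercise; the helper name rrf_scores and the constant k=60 are illustrative assumptions, not this project's actual API:

    from collections import defaultdict
    from typing import Dict, List, Optional

    def rrf_scores(rankings: List[List[str]], k: int = 60,
                   weights: Optional[List[float]] = None) -> Dict[str, float]:
        """Fuse ranked lists: score(d) = sum_i w_i / (k + rank_i(d))."""
        weights = weights or [1.0] * len(rankings)
        scores: Dict[str, float] = defaultdict(float)
        for ranked, weight in zip(rankings, weights):
            for rank, doc in enumerate(ranked, start=1):
                scores[doc] += weight / (k + rank)
        return dict(scores)

    # "b.py" appears in both lists, so it outscores "a.py" and "c.py".
    fused = rrf_scores([["a.py", "b.py"], ["b.py", "c.py"]])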
@@ -57,7 +57,7 @@ class DirIndexStore:
 
     # Schema version for migration tracking
     # Increment this when schema changes require migration
-    SCHEMA_VERSION = 2
+    SCHEMA_VERSION = 4
 
     def __init__(self, db_path: str | Path) -> None:
         """Initialize directory index store.
@@ -93,11 +93,13 @@ class DirIndexStore:
         )
 
-        # Create or migrate schema
-        self._create_schema(conn)
-        self._create_fts_triggers(conn)
-
         # Apply versioned migrations if needed
-        if current_version < self.SCHEMA_VERSION:
+        if current_version == 0:
+            # New database - create schema directly
+            self._create_schema(conn)
+            self._create_fts_triggers(conn)
+            self._set_schema_version(conn, self.SCHEMA_VERSION)
+        elif current_version < self.SCHEMA_VERSION:
+            # Existing database - apply migrations
             self._apply_migrations(conn, current_version)
             self._set_schema_version(conn, self.SCHEMA_VERSION)
@@ -126,6 +128,11 @@ class DirIndexStore:
         if from_version < 2:
             self._migrate_v2_add_name_column(conn)
 
+        # Migration v2 -> v4: Add dual FTS tables (exact + fuzzy)
+        if from_version < 4:
+            from codexlens.storage.migrations.migration_004_dual_fts import upgrade
+            upgrade(conn)
+
     def close(self) -> None:
         """Close database connection."""
         with self._lock:
@@ -465,6 +472,117 @@ class DirIndexStore:
 
         return float(row["mtime"]) if row and row["mtime"] else None
 
+    def needs_reindex(self, full_path: str | Path) -> bool:
+        """Check if a file needs reindexing based on mtime comparison.
+
+        Uses 1ms tolerance to handle filesystem timestamp precision variations.
+
+        Args:
+            full_path: Complete source file path
+
+        Returns:
+            True if file should be reindexed (new, modified, or missing from index)
+        """
+        full_path_obj = Path(full_path).resolve()
+        if not full_path_obj.exists():
+            return False  # File doesn't exist, skip indexing
+
+        # Get current filesystem mtime
+        try:
+            current_mtime = full_path_obj.stat().st_mtime
+        except OSError:
+            return False  # Can't read file stats, skip
+
+        # Get stored mtime from database
+        stored_mtime = self.get_file_mtime(full_path_obj)
+
+        # File not in index, needs indexing
+        if stored_mtime is None:
+            return True
+
+        # Compare with 1ms tolerance for floating point precision
+        MTIME_TOLERANCE = 0.001
+        return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
+
+    def add_file_incremental(
+        self,
+        name: str,
+        full_path: str | Path,
+        content: str,
+        language: str,
+        symbols: Optional[List[Symbol]] = None,
+    ) -> Optional[int]:
+        """Add or update a file only if it has changed (incremental indexing).
+
+        Checks mtime before indexing to skip unchanged files.
+
+        Args:
+            name: Filename without path
+            full_path: Complete source file path
+            content: File content for indexing
+            language: Programming language identifier
+            symbols: List of Symbol objects from the file
+
+        Returns:
+            Database file_id if indexed, None if skipped (unchanged)
+
+        Raises:
+            StorageError: If database operations fail
+        """
+        # Check if reindexing is needed
+        if not self.needs_reindex(full_path):
+            return None  # Skip unchanged file
+
+        # File changed or new, perform full indexing
+        return self.add_file(name, full_path, content, language, symbols)
+
+    def cleanup_deleted_files(self, source_dir: Path) -> int:
+        """Remove indexed files that no longer exist in the source directory.
+
+        Scans the source directory and removes database entries for deleted files.
+
+        Args:
+            source_dir: Source directory to scan
+
+        Returns:
+            Number of deleted file entries removed
+
+        Raises:
+            StorageError: If cleanup operations fail
+        """
+        with self._lock:
+            conn = self._get_connection()
+            source_dir = source_dir.resolve()
+
+            try:
+                # Get all indexed file paths
+                rows = conn.execute("SELECT full_path FROM files").fetchall()
+                indexed_paths = {row["full_path"] for row in rows}
+
+                # Build set of existing files in source directory
+                existing_paths = set()
+                for file_path in source_dir.rglob("*"):
+                    if file_path.is_file():
+                        existing_paths.add(str(file_path.resolve()))
+
+                # Find orphaned entries (indexed but no longer exist)
+                deleted_paths = indexed_paths - existing_paths
+
+                # Remove orphaned entries
+                deleted_count = 0
+                for deleted_path in deleted_paths:
+                    conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,))
+                    deleted_count += 1
+
+                if deleted_count > 0:
+                    conn.commit()
+
+                return deleted_count
+
+            except Exception as exc:
+                conn.rollback()
+                raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc
+
     def list_files(self) -> List[FileEntry]:
         """List all files in current directory.
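
A usage sketch of the incremental path added above; the database location and source file are placeholders, while the method signatures come from this diff:

    from pathlib import Path

    store = DirIndexStore(Path(".codexlens/index.db"))  # hypothetical index location
    src = Path("src/example.py")

    # Returns None when the stored mtime matches within the 1ms tolerance.
    file_id = store.add_file_incremental(
        name=src.name,
        full_path=src,
        content=src.read_text(encoding="utf-8", errors="ignore"),
        language="python",
        symbols=None,
    )

    # Drop index rows for files that vanished from the source tree.
    removed = store.cleanup_deleted_files(Path("src").resolve())
    store.close()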
@@ -985,6 +1103,92 @@ class DirIndexStore:
             )
         return results
 
+    def search_fts_exact(self, query: str, limit: int = 20) -> List[SearchResult]:
+        """Full-text search using exact token matching (unicode61 tokenizer).
+
+        Args:
+            query: FTS5 query string
+            limit: Maximum results to return
+
+        Returns:
+            List of SearchResult objects sorted by relevance
+
+        Raises:
+            StorageError: If FTS search fails
+        """
+        with self._lock:
+            conn = self._get_connection()
+            try:
+                rows = conn.execute(
+                    """
+                    SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
+                           snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
+                    FROM files_fts_exact
+                    WHERE files_fts_exact MATCH ?
+                    ORDER BY rank
+                    LIMIT ?
+                    """,
+                    (query, limit),
+                ).fetchall()
+            except sqlite3.DatabaseError as exc:
+                raise StorageError(f"FTS exact search failed: {exc}") from exc
+
+            results: List[SearchResult] = []
+            for row in rows:
+                rank = float(row["rank"]) if row["rank"] is not None else 0.0
+                score = abs(rank) if rank < 0 else 0.0
+                results.append(
+                    SearchResult(
+                        path=row["full_path"],
+                        score=score,
+                        excerpt=row["excerpt"],
+                    )
+                )
+            return results
+
+    def search_fts_fuzzy(self, query: str, limit: int = 20) -> List[SearchResult]:
+        """Full-text search using fuzzy/substring matching (trigram or extended unicode61 tokenizer).
+
+        Args:
+            query: FTS5 query string
+            limit: Maximum results to return
+
+        Returns:
+            List of SearchResult objects sorted by relevance
+
+        Raises:
+            StorageError: If FTS search fails
+        """
+        with self._lock:
+            conn = self._get_connection()
+            try:
+                rows = conn.execute(
+                    """
+                    SELECT rowid, full_path, bm25(files_fts_fuzzy) AS rank,
+                           snippet(files_fts_fuzzy, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
+                    FROM files_fts_fuzzy
+                    WHERE files_fts_fuzzy MATCH ?
+                    ORDER BY rank
+                    LIMIT ?
+                    """,
+                    (query, limit),
+                ).fetchall()
+            except sqlite3.DatabaseError as exc:
+                raise StorageError(f"FTS fuzzy search failed: {exc}") from exc
+
+            results: List[SearchResult] = []
+            for row in rows:
+                rank = float(row["rank"]) if row["rank"] is not None else 0.0
+                score = abs(rank) if rank < 0 else 0.0
+                results.append(
+                    SearchResult(
+                        path=row["full_path"],
+                        score=score,
+                        excerpt=row["excerpt"],
+                    )
+                )
+            return results
+
     def search_files_only(self, query: str, limit: int = 20) -> List[str]:
         """Fast FTS search returning only file paths (no snippet generation).
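
Tying back to the commit message, the exact and fuzzy result lists are natural inputs for RRF. A sketch reusing the hypothetical rrf_scores helper from above, with an assumed weighting in favour of exact matches:

    exact = store.search_fts_exact("user_id", limit=20)
    fuzzy = store.search_fts_fuzzy("user_id", limit=20)

    # Fuse by path, using each list's order as its ranking.
    fused = rrf_scores(
        [[r.path for r in exact], [r.path for r in fuzzy]],
        weights=[1.0, 0.5],  # weighting is an assumption, not from this commit
    )
    top_paths = sorted(fused, key=fused.get, reverse=True)[:20]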
@@ -1185,16 +1389,34 @@ class DirIndexStore:
             """
         )
 
-        # FTS5 external content table with code-friendly tokenizer
-        # unicode61 tokenchars keeps underscores as part of tokens
-        # so 'user_id' is indexed as one token, not 'user' and 'id'
+        # Dual FTS5 external content tables for exact and fuzzy matching
+        # files_fts_exact: unicode61 tokenizer for exact token matching
+        # files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching
+        from codexlens.storage.sqlite_utils import check_trigram_support
+
+        has_trigram = check_trigram_support(conn)
+        fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-'"
+
+        # Exact FTS table with unicode61 tokenizer
         conn.execute(
             """
-            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
+            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
                 name, full_path UNINDEXED, content,
                 content='files',
                 content_rowid='id',
-                tokenize="unicode61 tokenchars '_'"
+                tokenize="unicode61 tokenchars '_-'"
             )
             """
         )
+
+        # Fuzzy FTS table with trigram or extended unicode61 tokenizer
+        conn.execute(
+            f"""
+            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5(
+                name, full_path UNINDEXED, content,
+                content='files',
+                content_rowid='id',
+                tokenize="{fuzzy_tokenizer}"
+            )
+            """
+        )
@@ -1301,38 +1523,72 @@ class DirIndexStore:
             conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id))
 
     def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
-        """Create FTS5 external content triggers.
+        """Create FTS5 external content triggers for dual FTS tables.
+
+        Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables.
 
         Args:
             conn: Database connection
         """
-        # Insert trigger
+        # Insert triggers for files_fts_exact
         conn.execute(
             """
-            CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
-                INSERT INTO files_fts(rowid, name, full_path, content)
+            CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN
+                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                 VALUES(new.id, new.name, new.full_path, new.content);
             END
             """
         )
 
-        # Delete trigger
+        # Delete trigger for files_fts_exact
        conn.execute(
             """
-            CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
-                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
+            CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN
+                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                 VALUES('delete', old.id, old.name, old.full_path, old.content);
             END
             """
         )
 
-        # Update trigger
+        # Update trigger for files_fts_exact
         conn.execute(
             """
-            CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
-                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
+            CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN
+                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                 VALUES('delete', old.id, old.name, old.full_path, old.content);
-                INSERT INTO files_fts(rowid, name, full_path, content)
+                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                 VALUES(new.id, new.name, new.full_path, new.content);
             END
             """
         )
+
+        # Insert trigger for files_fts_fuzzy
+        conn.execute(
+            """
+            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN
+                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
+                VALUES(new.id, new.name, new.full_path, new.content);
+            END
+            """
+        )
+
+        # Delete trigger for files_fts_fuzzy
+        conn.execute(
+            """
+            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN
+                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
+                VALUES('delete', old.id, old.name, old.full_path, old.content);
+            END
+            """
+        )
+
+        # Update trigger for files_fts_fuzzy
+        conn.execute(
+            """
+            CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN
+                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
+                VALUES('delete', old.id, old.name, old.full_path, old.content);
+                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
+                VALUES(new.id, new.name, new.full_path, new.content);
+            END
+            """
+        )
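
Because both FTS tables are external-content tables (content='files'), they hold no row data of their own; the triggers above keep them synchronized with files. A small sanity check, assuming conn is an open sqlite3 connection on which the schema and triggers have already been created:

    conn.execute(
        "INSERT INTO files(name, full_path, content, language) VALUES(?, ?, ?, ?)",
        ("demo.py", "/tmp/demo.py", "def user_id(): pass", "python"),
    )
    conn.commit()

    # The AFTER INSERT triggers should have populated both shadow indexes.
    hits = conn.execute(
        "SELECT rowid FROM files_fts_exact WHERE files_fts_exact MATCH 'user_id'"
    ).fetchall()
    assert hits, "insert trigger failed to index the new row"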
@@ -77,7 +77,7 @@ class IndexTreeBuilder:
     }
 
     def __init__(
-        self, registry: RegistryStore, mapper: PathMapper, config: Config = None
+        self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True
     ):
         """Initialize the index tree builder.
 
@@ -85,18 +85,21 @@ class IndexTreeBuilder:
             registry: Global registry store for project tracking
             mapper: Path mapper for source to index conversions
             config: CodexLens configuration (uses defaults if None)
+            incremental: Enable incremental indexing (default True)
         """
         self.registry = registry
         self.mapper = mapper
         self.config = config or Config()
         self.parser_factory = ParserFactory(self.config)
         self.logger = logging.getLogger(__name__)
+        self.incremental = incremental
 
     def build(
         self,
         source_root: Path,
         languages: List[str] = None,
         workers: int = 4,
+        force_full: bool = False,
     ) -> BuildResult:
         """Build complete index tree for a project.
 
@@ -106,11 +109,13 @@ class IndexTreeBuilder:
         3. Build indexes bottom-up (deepest first)
         4. Link subdirectories to parents
         5. Update project statistics
+        6. Cleanup deleted files (if incremental mode)
 
         Args:
             source_root: Project root directory to index
             languages: Optional list of language IDs to limit indexing
             workers: Number of parallel worker processes
+            force_full: Force full reindex (override incremental mode)
 
         Returns:
             BuildResult with statistics and errors
@@ -122,7 +127,12 @@ class IndexTreeBuilder:
         if not source_root.exists():
             raise ValueError(f"Source root does not exist: {source_root}")
 
-        self.logger.info("Building index tree for %s", source_root)
+        # Override incremental mode if force_full is True
+        use_incremental = self.incremental and not force_full
+        if force_full:
+            self.logger.info("Building index tree for %s (FULL reindex)", source_root)
+        else:
+            self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental)
 
         # Register project
         index_root = self.mapper.source_to_index_dir(source_root)
@@ -186,6 +196,25 @@ class IndexTreeBuilder:
         # Link children to this directory
         self._link_children_to_parent(result.source_path, all_results)
 
+        # Cleanup deleted files if in incremental mode
+        if use_incremental:
+            self.logger.info("Cleaning up deleted files...")
+            total_deleted = 0
+            for result in all_results:
+                if result.error:
+                    continue
+                try:
+                    with DirIndexStore(result.index_path) as store:
+                        deleted_count = store.cleanup_deleted_files(result.source_path)
+                        total_deleted += deleted_count
+                        if deleted_count > 0:
+                            self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)
+                except Exception as exc:
+                    self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc)
+
+            if total_deleted > 0:
+                self.logger.info("Removed %d deleted files from index", total_deleted)
+
         # Update project statistics
         self.registry.update_project_stats(source_root, total_files, total_dirs)
 
@@ -436,9 +465,15 @@ class IndexTreeBuilder:
 
         files_count = 0
         symbols_count = 0
+        skipped_count = 0
 
        for file_path in source_files:
             try:
+                # Check if file needs reindexing (incremental mode)
+                if self.incremental and not store.needs_reindex(file_path):
+                    skipped_count += 1
+                    continue
+
                 # Read and parse file
                 text = file_path.read_text(encoding="utf-8", errors="ignore")
                 language_id = self.config.language_for_path(file_path)
@@ -491,13 +526,23 @@ class IndexTreeBuilder:
 
         store.close()
 
-        self.logger.debug(
-            "Built %s: %d files, %d symbols, %d subdirs",
-            dir_path,
-            files_count,
-            symbols_count,
-            len(subdirs),
-        )
+        if skipped_count > 0:
+            self.logger.debug(
+                "Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs",
+                dir_path,
+                files_count,
+                skipped_count,
+                symbols_count,
+                len(subdirs),
+            )
+        else:
+            self.logger.debug(
+                "Built %s: %d files, %d symbols, %d subdirs",
+                dir_path,
+                files_count,
+                symbols_count,
+                len(subdirs),
+            )
 
         return DirBuildResult(
             source_path=dir_path,
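
A sketch of driving a build with the new flags; the RegistryStore, PathMapper, and Config wiring is assumed rather than taken from this diff, so their real constructors may differ:

    builder = IndexTreeBuilder(registry, mapper, config=Config(), incremental=True)

    # Normal run: unchanged files are skipped via needs_reindex().
    result = builder.build(Path("/work/project"), workers=4)

    # Escape hatch: reindex everything, ignoring stored mtimes.
    result = builder.build(Path("/work/project"), force_full=True)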
codex-lens/src/codexlens/storage/migrations/migration_004_dual_fts.py (new file)
@@ -0,0 +1,231 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.

This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching

Both tables are synchronized with the files table via triggers for automatic updates.
"""

import logging
from sqlite3 import Connection

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """
    Applies the migration to add dual FTS tables.

    - Drops old files_fts table and triggers
    - Creates files_fts_exact with unicode61 tokenizer
    - Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
    - Creates synchronized triggers for both tables
    - Rebuilds FTS indexes from files table

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    try:
        # Check trigram support
        has_trigram = check_trigram_support(db_conn)
        version = get_sqlite_version(db_conn)
        log.info(f"SQLite version: {'.'.join(map(str, version))}")

        if has_trigram:
            log.info("Trigram tokenizer available, using for fuzzy FTS table")
            fuzzy_tokenizer = "trigram"
        else:
            log.warning(
                "Trigram tokenizer not available (requires SQLite >= 3.34), "
                "using extended unicode61 tokenizer for fuzzy matching"
            )
            fuzzy_tokenizer = "unicode61 tokenchars '_-'"

        # Start transaction
        cursor.execute("BEGIN TRANSACTION")

        # Check if files table has 'name' column (v2 schema doesn't have it)
        cursor.execute("PRAGMA table_info(files)")
        columns = {row[1] for row in cursor.fetchall()}

        if 'name' not in columns:
            log.info("Adding 'name' column to files table (v2 schema upgrade)...")
            # Add name column
            cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
            # Populate name from path (extract filename after the last '/')
            # Use Python to do the extraction since SQLite doesn't have reverse()
            cursor.execute("SELECT rowid, path FROM files")
            rows = cursor.fetchall()
            for rowid, path in rows:
                # Extract filename from path
                name = path.split('/')[-1] if '/' in path else path
                cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))

        # Rename 'path' column to 'full_path' if needed
        if 'path' in columns and 'full_path' not in columns:
            log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
            # Check if indexed_at column exists in v2 schema
            has_indexed_at = 'indexed_at' in columns
            has_mtime = 'mtime' in columns

            # SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
            cursor.execute("""
                CREATE TABLE files_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL,
                    full_path TEXT NOT NULL UNIQUE,
                    content TEXT,
                    language TEXT,
                    mtime REAL,
                    indexed_at TEXT
                )
            """)

            # Build INSERT statement based on available columns
            # Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT
            if has_indexed_at and has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
                    SELECT name, path, content, language, mtime, indexed_at FROM files
                """)
            elif has_indexed_at:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, indexed_at)
                    SELECT name, path, content, language, indexed_at FROM files
                """)
            elif has_mtime:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language, mtime)
                    SELECT name, path, content, language, mtime FROM files
                """)
            else:
                cursor.execute("""
                    INSERT INTO files_new (name, full_path, content, language)
                    SELECT name, path, content, language FROM files
                """)

            cursor.execute("DROP TABLE files")
            cursor.execute("ALTER TABLE files_new RENAME TO files")

        log.info("Dropping old FTS triggers and table...")
        # Drop old triggers
        cursor.execute("DROP TRIGGER IF EXISTS files_ai")
        cursor.execute("DROP TRIGGER IF EXISTS files_ad")
        cursor.execute("DROP TRIGGER IF EXISTS files_au")

        # Drop old FTS table
        cursor.execute("DROP TABLE IF EXISTS files_fts")

        # Create exact FTS table (unicode61 with underscores/hyphens as token chars)
        log.info("Creating files_fts_exact table with unicode61 tokenizer...")
        cursor.execute(
            """
            CREATE VIRTUAL TABLE files_fts_exact USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_-'"
            )
            """
        )

        # Create fuzzy FTS table (trigram or extended unicode61)
        log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
        cursor.execute(
            f"""
            CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="{fuzzy_tokenizer}"
            )
            """
        )

        # Create synchronized triggers for files_fts_exact
        log.info("Creating triggers for files_fts_exact...")
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_exact(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Create synchronized triggers for files_fts_fuzzy
        log.info("Creating triggers for files_fts_fuzzy...")
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )
        cursor.execute(
            """
            CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Rebuild FTS indexes from files table
        log.info("Rebuilding FTS indexes from files table...")
        cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
        cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")

        # Commit transaction
        cursor.execute("COMMIT")
        log.info("Migration 004 completed successfully")

        # Vacuum to reclaim space (outside transaction)
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
        except Exception as e:
            log.warning(f"VACUUM failed (non-critical): {e}")

    except Exception as e:
        log.error(f"Migration 004 failed: {e}")
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            pass
        raise
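
The migration normally runs through DirIndexStore._apply_migrations (the `if from_version < 4` branch above), but it can also be applied by hand; a minimal sketch, with an illustrative database path:

    import sqlite3

    from codexlens.storage.migrations.migration_004_dual_fts import upgrade

    conn = sqlite3.connect("index.db")  # path is illustrative
    upgrade(conn)  # drops files_fts, creates files_fts_exact/_fuzzy, rebuilds both
    conn.close()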
codex-lens/src/codexlens/storage/sqlite_utils.py (new file, 64 lines)
@@ -0,0 +1,64 @@
"""SQLite utility functions for CodexLens storage layer."""

from __future__ import annotations

import logging
import sqlite3

log = logging.getLogger(__name__)


def check_trigram_support(conn: sqlite3.Connection) -> bool:
    """Check if SQLite supports trigram tokenizer for FTS5.

    Trigram tokenizer requires SQLite >= 3.34.0.

    Args:
        conn: Database connection to test

    Returns:
        True if trigram tokenizer is available, False otherwise
    """
    try:
        # Test by creating a temporary virtual table with trigram tokenizer
        conn.execute(
            """
            CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check
            USING fts5(test_content, tokenize='trigram')
            """
        )
        # Clean up test table
        conn.execute("DROP TABLE IF EXISTS test_trigram_check")
        conn.commit()
        return True
    except sqlite3.OperationalError as e:
        # Trigram tokenizer not available
        if "unrecognized tokenizer" in str(e).lower():
            log.debug("Trigram tokenizer not available in this SQLite version")
            return False
        # Other operational errors should be re-raised
        raise
    except Exception:
        # Any other exception means trigram is not supported
        return False


def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]:
    """Get SQLite version as (major, minor, patch) tuple.

    Args:
        conn: Database connection

    Returns:
        Version tuple, e.g., (3, 34, 1)
    """
    row = conn.execute("SELECT sqlite_version()").fetchone()
    version_str = row[0] if row else "0.0.0"
    parts = version_str.split('.')
    try:
        major = int(parts[0]) if len(parts) > 0 else 0
        minor = int(parts[1]) if len(parts) > 1 else 0
        patch = int(parts[2]) if len(parts) > 2 else 0
        return (major, minor, patch)
    except (ValueError, IndexError):
        return (0, 0, 0)
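
A short usage sketch of the two helpers; an in-memory database is enough to probe tokenizer support:

    import sqlite3

    from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

    conn = sqlite3.connect(":memory:")
    major, minor, patch = get_sqlite_version(conn)
    if check_trigram_support(conn):
        print(f"SQLite {major}.{minor}.{patch}: trigram fuzzy matching available")
    else:
        print(f"SQLite {major}.{minor}.{patch}: falling back to extended unicode61")
    conn.close()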