Add comprehensive tests for query parsing and Reciprocal Rank Fusion

- Implemented tests for the QueryParser class, covering various identifier splitting methods (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge case tests to ensure robustness against unusual input scenarios.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
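
A minimal sketch of what the QueryParser tests could look like, assuming the interfaces shown in the diffs below (QueryParser.preprocess_query); the module path and the parametrized cases are illustrative, not copied from the actual test files:

# Hypothetical test sketch; the real suite added by this commit may differ.
import pytest

from codexlens.search.query_parser import QueryParser  # assumed module path


@pytest.mark.parametrize(
    "query, expected_tokens",
    [
        ("UserAuth", {"UserAuth", "User", "Auth"}),                    # CamelCase
        ("get_user_data", {"get_user_data", "get", "user", "data"}),   # snake_case
        ("get-user-data", {"get-user-data", "get", "user", "data"}),   # kebab-case
    ],
)
def test_identifier_expansion(query, expected_tokens):
    expanded = QueryParser().preprocess_query(query)
    assert set(expanded.split(" OR ")) == expected_tokens


def test_fts5_operators_preserved():
    # Queries containing FTS5 operators or quotes are passed through unchanged.
    query = '"user auth" AND token'
    assert QueryParser().preprocess_query(query) == query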
catlog22
2025-12-16 10:20:19 +08:00
parent 35485bbbb1
commit 3da0ef2adb
39 changed files with 6171 additions and 240 deletions

View File

@@ -20,6 +20,7 @@ from codexlens.parsers.factory import ParserFactory
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore, ProjectInfo
from codexlens.storage.index_tree import IndexTreeBuilder
from codexlens.storage.dir_index import DirIndexStore
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from .output import (
@@ -77,6 +78,7 @@ def init(
help="Limit indexing to specific languages (repeat or comma-separated).",
),
workers: int = typer.Option(4, "--workers", "-w", min=1, max=16, help="Parallel worker processes."),
force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
@@ -84,6 +86,9 @@ def init(
Indexes are stored in ~/.codexlens/indexes/ with mirrored directory structure.
Set CODEXLENS_INDEX_DIR to customize the index location.
By default, uses incremental indexing (skip unchanged files).
Use --force to rebuild all files regardless of modification time.
"""
_configure_logging(verbose)
config = Config()
@@ -96,14 +101,18 @@ def init(
registry.initialize()
mapper = PathMapper()
builder = IndexTreeBuilder(registry, mapper, config)
builder = IndexTreeBuilder(registry, mapper, config, incremental=not force)
console.print(f"[bold]Building index for:[/bold] {base_path}")
if force:
console.print(f"[bold]Building index for:[/bold] {base_path} [yellow](FULL reindex)[/yellow]")
else:
console.print(f"[bold]Building index for:[/bold] {base_path} [dim](incremental)[/dim]")
build_result = builder.build(
source_root=base_path,
languages=languages,
workers=workers,
force_full=force,
)
result = {
@@ -172,6 +181,8 @@ def search(
limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector."),
weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
@@ -179,10 +190,51 @@ def search(
Uses chain search across directory indexes.
Use --depth to limit search recursion (0 = current dir only).
Search Modes:
- exact: Exact FTS using unicode61 tokenizer (default)
- fuzzy: Fuzzy FTS using trigram tokenizer
- hybrid: RRF fusion of exact + fuzzy (recommended)
- vector: Semantic vector search (future)
Hybrid Mode:
Default weights: exact=0.4, fuzzy=0.3, vector=0.3
Use --weights to customize (e.g., --weights 0.5,0.3,0.2)
"""
_configure_logging(verbose)
search_path = path.expanduser().resolve()
# Validate mode
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
if mode not in valid_modes:
if json_mode:
print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}")
else:
console.print(f"[red]Invalid mode:[/red] {mode}")
console.print(f"[dim]Valid modes: {', '.join(valid_modes)}[/dim]")
raise typer.Exit(code=1)
# Parse custom weights if provided
hybrid_weights = None
if weights:
try:
weight_parts = [float(w.strip()) for w in weights.split(",")]
if len(weight_parts) == 3:
weight_sum = sum(weight_parts)
if abs(weight_sum - 1.0) > 0.01:
console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
# Normalize weights
weight_parts = [w / weight_sum for w in weight_parts]
hybrid_weights = {
"exact": weight_parts[0],
"fuzzy": weight_parts[1],
"vector": weight_parts[2],
}
else:
console.print("[yellow]Warning: Invalid weights format (need 3 values). Using defaults.[/yellow]")
except ValueError:
console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]")
registry: RegistryStore | None = None
try:
registry = RegistryStore()
@@ -190,10 +242,18 @@ def search(
mapper = PathMapper()
engine = ChainSearchEngine(registry, mapper)
# Map mode to options
hybrid_mode = mode == "hybrid"
enable_fuzzy = mode in ["fuzzy", "hybrid"]
options = SearchOptions(
depth=depth,
total_limit=limit,
files_only=files_only,
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
hybrid_weights=hybrid_weights,
)
if files_only:
@@ -208,8 +268,17 @@ def search(
result = engine.search(query, search_path, options)
payload = {
"query": query,
"mode": mode,
"count": len(result.results),
"results": [{"path": r.path, "score": r.score, "excerpt": r.excerpt} for r in result.results],
"results": [
{
"path": r.path,
"score": r.score,
"excerpt": r.excerpt,
"source": getattr(r, "search_source", None),
}
for r in result.results
],
"stats": {
"dirs_searched": result.stats.dirs_searched,
"files_matched": result.stats.files_matched,
@@ -219,9 +288,8 @@ def search(
if json_mode:
print_json(success=True, result=payload)
else:
render_search_results(result.results)
if verbose:
console.print(f"[dim]Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
render_search_results(result.results, verbose=verbose)
console.print(f"[dim]Mode: {mode} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")
except SearchError as exc:
if json_mode:
@@ -404,6 +472,27 @@ def status(
if f.is_file():
index_size += f.stat().st_size
# Check schema version and enabled features
schema_version = None
has_dual_fts = False
if projects and index_root.exists():
# Check first index database for features
index_files = list(index_root.rglob("_index.db"))
if index_files:
try:
with DirIndexStore(index_files[0]) as store:
with store._lock:
conn = store._get_connection()
schema_version = store._get_schema_version(conn)
# Check if dual FTS tables exist
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name IN ('search_fts_exact', 'search_fts_fuzzy')"
)
fts_tables = [row[0] for row in cursor.fetchall()]
has_dual_fts = len(fts_tables) == 2
except Exception:
pass
stats = {
"index_root": str(index_root),
"registry_path": str(_get_registry_path()),
@@ -412,6 +501,13 @@ def status(
"total_dirs": total_dirs,
"index_size_bytes": index_size,
"index_size_mb": round(index_size / (1024 * 1024), 2),
"schema_version": schema_version,
"features": {
"exact_fts": True, # Always available
"fuzzy_fts": has_dual_fts,
"hybrid_search": has_dual_fts,
"vector_search": False, # Not yet implemented
},
}
if json_mode:
@@ -424,6 +520,17 @@ def status(
console.print(f" Total Files: {stats['total_files']}")
console.print(f" Total Directories: {stats['total_dirs']}")
console.print(f" Index Size: {stats['index_size_mb']} MB")
if schema_version:
console.print(f" Schema Version: {schema_version}")
console.print("\n[bold]Search Backends:[/bold]")
console.print(f" Exact FTS: ✓ (unicode61)")
if has_dual_fts:
console.print(f" Fuzzy FTS: ✓ (trigram)")
console.print(f" Hybrid Search: ✓ (RRF fusion)")
else:
console.print(f" Fuzzy FTS: ✗ (run 'migrate' to enable)")
console.print(f" Hybrid Search: ✗ (run 'migrate' to enable)")
console.print(f" Vector Search: ✗ (future)")
except StorageError as exc:
if json_mode:
@@ -778,6 +885,139 @@ def config(
raise typer.Exit(code=1)
@app.command()
def migrate(
path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to migrate."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""Migrate project indexes to latest schema (Dual-FTS upgrade).
Upgrades all _index.db files in the project to schema version 4, which includes:
- Dual FTS tables (exact + fuzzy)
- Encoding detection support
- Incremental indexing metadata
This is a safe operation that preserves all existing data.
Progress is shown during migration.
"""
_configure_logging(verbose)
base_path = path.expanduser().resolve()
registry: RegistryStore | None = None
try:
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
# Find project
project_info = registry.get_project(base_path)
if not project_info:
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
index_dir = mapper.source_to_index_dir(base_path)
if not index_dir.exists():
raise CodexLensError(f"Index directory not found: {index_dir}")
# Find all _index.db files
index_files = list(index_dir.rglob("_index.db"))
if not index_files:
if json_mode:
print_json(success=True, result={"message": "No indexes to migrate", "migrated": 0})
else:
console.print("[yellow]No indexes found to migrate.[/yellow]")
return
migrated_count = 0
error_count = 0
already_migrated = 0
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
TextColumn("({task.completed}/{task.total})"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task(f"Migrating {len(index_files)} indexes...", total=len(index_files))
for db_path in index_files:
try:
store = DirIndexStore(db_path)
# Check current version
with store._lock:
conn = store._get_connection()
current_version = store._get_schema_version(conn)
if current_version >= DirIndexStore.SCHEMA_VERSION:
already_migrated += 1
if verbose:
progress.console.print(f"[dim]Already migrated: {db_path.parent.name}[/dim]")
elif current_version > 0:
# Apply migrations
store._apply_migrations(conn, current_version)
store._set_schema_version(conn, DirIndexStore.SCHEMA_VERSION)
conn.commit()
migrated_count += 1
if verbose:
progress.console.print(f"[green]Migrated: {db_path.parent.name} (v{current_version} → v{DirIndexStore.SCHEMA_VERSION})[/green]")
else:
# New database, initialize directly
store.initialize()
migrated_count += 1
store.close()
except Exception as e:
error_count += 1
if verbose:
progress.console.print(f"[red]Error migrating {db_path}: {e}[/red]")
progress.update(task, advance=1)
result = {
"path": str(base_path),
"total_indexes": len(index_files),
"migrated": migrated_count,
"already_migrated": already_migrated,
"errors": error_count,
}
if json_mode:
print_json(success=True, result=result)
else:
console.print(f"[green]Migration complete:[/green]")
console.print(f" Total indexes: {len(index_files)}")
console.print(f" Migrated: {migrated_count}")
console.print(f" Already up-to-date: {already_migrated}")
if error_count > 0:
console.print(f" [yellow]Errors: {error_count}[/yellow]")
except StorageError as exc:
if json_mode:
print_json(success=False, error=f"Storage error: {exc}")
else:
console.print(f"[red]Migration failed (storage):[/red] {exc}")
raise typer.Exit(code=1)
except CodexLensError as exc:
if json_mode:
print_json(success=False, error=str(exc))
else:
console.print(f"[red]Migration failed:[/red] {exc}")
raise typer.Exit(code=1)
except Exception as exc:
if json_mode:
print_json(success=False, error=f"Unexpected error: {exc}")
else:
console.print(f"[red]Migration failed (unexpected):[/red] {exc}")
raise typer.Exit(code=1)
finally:
if registry is not None:
registry.close()
@app.command()
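
For reference, the same normalization arithmetic the --weights handler performs above, as a tiny standalone sketch (values are illustrative):

# Standalone sketch of the --weights normalization (illustrative values).
raw = [0.5, 0.3, 0.4]                      # --weights 0.5,0.3,0.4 sums to 1.2
total = sum(raw)
normalized = dict(zip(("exact", "fuzzy", "vector"), (w / total for w in raw)))
print({k: round(v, 3) for k, v in normalized.items()})
# {'exact': 0.417, 'fuzzy': 0.25, 'vector': 0.333}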

View File

@@ -41,15 +41,45 @@ def print_json(*, success: bool, result: Any = None, error: str | None = None) -
console.print_json(json.dumps(payload, ensure_ascii=False))
def render_search_results(results: Sequence[SearchResult], *, title: str = "Search Results") -> None:
def render_search_results(
results: Sequence[SearchResult], *, title: str = "Search Results", verbose: bool = False
) -> None:
"""Render search results with optional source tags in verbose mode.
Args:
results: Search results to display
title: Table title
verbose: If True, show search source tags ([E], [F], [V]) and fusion scores
"""
table = Table(title=title, show_lines=False)
if verbose:
# Verbose mode: show source tags
table.add_column("Source", style="dim", width=6, justify="center")
table.add_column("Path", style="cyan", no_wrap=True)
table.add_column("Score", style="magenta", justify="right")
table.add_column("Excerpt", style="white")
for res in results:
excerpt = res.excerpt or ""
table.add_row(res.path, f"{res.score:.3f}", excerpt)
score_str = f"{res.score:.3f}"
if verbose:
# Extract search source tag if available
source = getattr(res, "search_source", None)
source_tag = ""
if source == "exact":
source_tag = "[E]"
elif source == "fuzzy":
source_tag = "[F]"
elif source == "vector":
source_tag = "[V]"
elif source == "fusion":
source_tag = "[RRF]"
table.add_row(source_tag, res.path, score_str, excerpt)
else:
table.add_row(res.path, score_str, excerpt)
console.print(table)

View File

@@ -0,0 +1,202 @@
"""Optional encoding detection module for CodexLens.
Provides automatic encoding detection with graceful fallback to UTF-8.
Install with: pip install codexlens[encoding]
"""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Tuple, Optional
log = logging.getLogger(__name__)
# Feature flag for encoding detection availability
ENCODING_DETECTION_AVAILABLE = False
_import_error: Optional[str] = None
def _detect_chardet_backend() -> Tuple[bool, Optional[str]]:
"""Detect if chardet or charset-normalizer is available."""
try:
import chardet
return True, None
except ImportError:
pass
try:
from charset_normalizer import from_bytes
return True, None
except ImportError:
pass
return False, "chardet not available. Install with: pip install codexlens[encoding]"
# Initialize on module load
ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend()
def check_encoding_available() -> Tuple[bool, Optional[str]]:
"""Check if encoding detection dependencies are available.
Returns:
Tuple of (available, error_message)
"""
return ENCODING_DETECTION_AVAILABLE, _import_error
def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str:
"""Detect encoding from file content bytes.
Uses chardet or charset-normalizer with configurable confidence threshold.
Falls back to UTF-8 if confidence is too low or detection unavailable.
Args:
content_bytes: Raw file content as bytes
confidence_threshold: Minimum confidence (0.0-1.0) to accept detection
Returns:
Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk')
Returns 'utf-8' as fallback if detection fails or confidence too low
"""
if not ENCODING_DETECTION_AVAILABLE:
log.debug("Encoding detection not available, using UTF-8 fallback")
return "utf-8"
if not content_bytes:
return "utf-8"
try:
# Try chardet first
try:
import chardet
result = chardet.detect(content_bytes)
encoding = result.get("encoding")
confidence = result.get("confidence", 0.0)
if encoding and confidence >= confidence_threshold:
log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
# Normalize encoding name: replace underscores with hyphens
return encoding.lower().replace('_', '-')
else:
log.debug(
f"Low confidence encoding detection: {encoding} "
f"(confidence: {confidence:.2f}), using UTF-8 fallback"
)
return "utf-8"
except ImportError:
pass
# Fallback to charset-normalizer
try:
from charset_normalizer import from_bytes
results = from_bytes(content_bytes)
if results:
best = results.best()
if best and best.encoding:
log.debug(f"Detected encoding via charset-normalizer: {best.encoding}")
# Normalize encoding name: replace underscores with hyphens
return best.encoding.lower().replace('_', '-')
except ImportError:
pass
except Exception as e:
log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback")
return "utf-8"
def read_file_safe(
path: Path | str,
confidence_threshold: float = 0.7,
max_detection_bytes: int = 100_000
) -> Tuple[str, str]:
"""Read file with automatic encoding detection and safe decoding.
Reads file bytes, detects encoding, and decodes with error replacement
to preserve file structure even with encoding issues.
Args:
path: Path to file to read
confidence_threshold: Minimum confidence for encoding detection
max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB)
Returns:
Tuple of (content, detected_encoding)
- content: Decoded file content (with U+FFFD replacement characters for unmappable bytes)
- detected_encoding: Detected encoding name
Raises:
OSError: If file cannot be read
IsADirectoryError: If path is a directory
"""
file_path = Path(path) if isinstance(path, str) else path
# Read file bytes
try:
content_bytes = file_path.read_bytes()
except Exception as e:
log.error(f"Failed to read file {file_path}: {e}")
raise
# Detect encoding from first N bytes for performance
detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes
encoding = detect_encoding(detection_sample, confidence_threshold)
# Decode with error replacement to preserve structure
try:
content = content_bytes.decode(encoding, errors='replace')
log.debug(f"Successfully decoded {file_path} using {encoding}")
return content, encoding
except Exception as e:
# Final fallback to UTF-8 with replacement
log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}")
content = content_bytes.decode('utf-8', errors='replace')
return content, 'utf-8'
def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool:
"""Check if file is likely binary by sampling first bytes.
Uses heuristic: if >30% of sample bytes are null or non-text, consider binary.
Args:
path: Path to file to check
sample_size: Number of bytes to sample (default 8KB)
Returns:
True if file appears to be binary, False otherwise
"""
file_path = Path(path) if isinstance(path, str) else path
try:
with file_path.open('rb') as f:
sample = f.read(sample_size)
if not sample:
return False
# Count null bytes and non-printable characters
null_count = sample.count(b'\x00')
non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d))
# If >30% null bytes or >50% non-text, consider binary
null_ratio = null_count / len(sample)
non_text_ratio = non_text_count / len(sample)
return null_ratio > 0.3 or non_text_ratio > 0.5
except Exception as e:
log.debug(f"Binary check failed for {file_path}: {e}, assuming text")
return False
__all__ = [
"ENCODING_DETECTION_AVAILABLE",
"check_encoding_available",
"detect_encoding",
"read_file_safe",
"is_binary_file",
]
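
A short usage sketch for this module; the import path below is an assumption, since the diff view does not show the new file's location in the package:

# Hypothetical usage; the module path is assumed.
from pathlib import Path

from codexlens.encoding import check_encoding_available, is_binary_file, read_file_safe

available, err = check_encoding_available()
if not available:
    print(f"Encoding detection disabled, UTF-8 fallback in effect: {err}")

path = Path("src/example.py")  # placeholder path
if path.is_file() and not is_binary_file(path):
    content, encoding = read_file_safe(path, confidence_threshold=0.7)
    print(f"Decoded {path} as {encoding} ({len(content)} chars)")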

View File

@@ -18,6 +18,7 @@ from codexlens.storage.registry import RegistryStore, DirMapping
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.sqlite_store import SQLiteStore
from codexlens.search.hybrid_search import HybridSearchEngine
@dataclass
@@ -32,6 +33,9 @@ class SearchOptions:
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results
hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
hybrid_weights: Custom RRF weights for hybrid search (optional)
"""
depth: int = -1
max_workers: int = 8
@@ -40,6 +44,9 @@ class SearchOptions:
include_symbols: bool = False
files_only: bool = False
include_semantic: bool = False
hybrid_mode: bool = False
enable_fuzzy: bool = True
hybrid_weights: Optional[Dict[str, float]] = None
@dataclass
@@ -484,7 +491,10 @@ class ChainSearchEngine:
query,
options.limit_per_dir,
options.files_only,
options.include_semantic
options.include_semantic,
options.hybrid_mode,
options.enable_fuzzy,
options.hybrid_weights
): idx_path
for idx_path in index_paths
}
@@ -507,7 +517,10 @@ class ChainSearchEngine:
query: str,
limit: int,
files_only: bool = False,
include_semantic: bool = False) -> List[SearchResult]:
include_semantic: bool = False,
hybrid_mode: bool = False,
enable_fuzzy: bool = True,
hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
"""Search a single index database.
Handles exceptions gracefully, returning empty list on failure.
@@ -518,39 +531,54 @@ class ChainSearchEngine:
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
include_semantic: If True, also search semantic keywords and merge results
hybrid_mode: If True, use hybrid search with RRF fusion
enable_fuzzy: Enable fuzzy FTS in hybrid mode
hybrid_weights: Custom RRF weights for hybrid search
Returns:
List of SearchResult objects (empty on error)
"""
try:
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
# Use hybrid search if enabled
if hybrid_mode:
hybrid_engine = HybridSearchEngine(weights=hybrid_weights)
fts_results = hybrid_engine.search(
index_path,
query,
limit=limit,
enable_fuzzy=enable_fuzzy,
enable_vector=False, # Vector search not yet implemented
)
else:
# Legacy single-FTS search
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:
try:
semantic_matches = store.search_semantic_keywords(query)
# Convert semantic matches to SearchResult with 0.8x weight
for file_entry, keywords in semantic_matches:
# Create excerpt from keywords
excerpt = f"Keywords: {', '.join(keywords[:5])}"
# Use a base score of 10.0 for semantic matches, weighted by 0.8
semantic_result = SearchResult(
path=str(file_entry.full_path),
score=10.0 * 0.8,
excerpt=excerpt
)
fts_results.append(semantic_result)
except Exception as sem_exc:
self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")
return fts_results
except Exception as exc:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []
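
A sketch of driving hybrid mode programmatically through SearchOptions, mirroring the CLI wiring above (the query and project path are placeholders):

# Sketch: hybrid search via ChainSearchEngine, mirroring the CLI flags.
from pathlib import Path

from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore

registry = RegistryStore()
registry.initialize()
try:
    engine = ChainSearchEngine(registry, PathMapper())
    options = SearchOptions(
        total_limit=20,
        hybrid_mode=True,
        enable_fuzzy=True,
        hybrid_weights={"exact": 0.5, "fuzzy": 0.3, "vector": 0.2},
    )
    result = engine.search("UserAuth", Path(".").resolve(), options)
    for r in result.results[:5]:
        print(f"{r.score:.4f}  {r.path}")
finally:
    registry.close()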

View File

@@ -0,0 +1,211 @@
"""Hybrid search engine orchestrating parallel exact/fuzzy/vector searches with RRF fusion.
Coordinates multiple search backends in parallel using ThreadPoolExecutor and combines
results via Reciprocal Rank Fusion (RRF) algorithm.
"""
from __future__ import annotations
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional
from codexlens.entities import SearchResult
from codexlens.search.ranking import reciprocal_rank_fusion, tag_search_source
from codexlens.storage.dir_index import DirIndexStore
class HybridSearchEngine:
"""Hybrid search engine with parallel execution and RRF fusion.
Orchestrates searches across exact FTS, fuzzy FTS, and optional vector backends,
executing them in parallel and fusing results via Reciprocal Rank Fusion.
Attributes:
logger: Python logger instance
default_weights: Default RRF weights for each source
"""
# Default RRF weights (exact: 40%, fuzzy: 30%, vector: 30%)
DEFAULT_WEIGHTS = {
"exact": 0.4,
"fuzzy": 0.3,
"vector": 0.3,
}
def __init__(self, weights: Optional[Dict[str, float]] = None):
"""Initialize hybrid search engine.
Args:
weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS)
"""
self.logger = logging.getLogger(__name__)
self.weights = weights or self.DEFAULT_WEIGHTS.copy()
def search(
self,
index_path: Path,
query: str,
limit: int = 20,
enable_fuzzy: bool = True,
enable_vector: bool = False,
) -> List[SearchResult]:
"""Execute hybrid search with parallel retrieval and RRF fusion.
Args:
index_path: Path to _index.db file
query: FTS5 query string
limit: Maximum results to return after fusion
enable_fuzzy: Enable fuzzy FTS search (default True)
enable_vector: Enable vector search (default False)
Returns:
List of SearchResult objects sorted by fusion score
Examples:
>>> engine = HybridSearchEngine()
>>> results = engine.search(Path("project/_index.db"), "authentication")
>>> for r in results[:5]:
... print(f"{r.path}: {r.score:.3f}")
"""
# Determine which backends to use
backends = {"exact": True} # Always use exact search
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
# Execute parallel searches
results_map = self._search_parallel(index_path, query, backends, limit)
# Apply RRF fusion
# Filter weights to only active backends
active_weights = {
source: weight
for source, weight in self.weights.items()
if source in results_map
}
fused_results = reciprocal_rank_fusion(results_map, active_weights)
# Apply final limit
return fused_results[:limit]
def _search_parallel(
self,
index_path: Path,
query: str,
backends: Dict[str, bool],
limit: int,
) -> Dict[str, List[SearchResult]]:
"""Execute parallel searches across enabled backends.
Args:
index_path: Path to _index.db file
query: FTS5 query string
backends: Dictionary of backend name to enabled flag
limit: Results limit per backend
Returns:
Dictionary mapping source name to results list
"""
results_map: Dict[str, List[SearchResult]] = {}
# Use ThreadPoolExecutor for parallel I/O-bound searches
with ThreadPoolExecutor(max_workers=len(backends)) as executor:
# Submit search tasks
future_to_source = {}
if backends.get("exact"):
future = executor.submit(
self._search_exact, index_path, query, limit
)
future_to_source[future] = "exact"
if backends.get("fuzzy"):
future = executor.submit(
self._search_fuzzy, index_path, query, limit
)
future_to_source[future] = "fuzzy"
if backends.get("vector"):
future = executor.submit(
self._search_vector, index_path, query, limit
)
future_to_source[future] = "vector"
# Collect results as they complete
for future in as_completed(future_to_source):
source = future_to_source[future]
try:
results = future.result()
# Tag results with source for debugging
tagged_results = tag_search_source(results, source)
results_map[source] = tagged_results
self.logger.debug(
"Got %d results from %s search", len(results), source
)
except Exception as exc:
self.logger.error("Search failed for %s: %s", source, exc)
results_map[source] = []
return results_map
def _search_exact(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute exact FTS search using unicode61 tokenizer.
Args:
index_path: Path to _index.db file
query: FTS5 query string
limit: Maximum results
Returns:
List of SearchResult objects
"""
try:
with DirIndexStore(index_path) as store:
return store.search_fts_exact(query, limit=limit)
except Exception as exc:
self.logger.debug("Exact search error: %s", exc)
return []
def _search_fuzzy(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute fuzzy FTS search using trigram/extended unicode61 tokenizer.
Args:
index_path: Path to _index.db file
query: FTS5 query string
limit: Maximum results
Returns:
List of SearchResult objects
"""
try:
with DirIndexStore(index_path) as store:
return store.search_fts_fuzzy(query, limit=limit)
except Exception as exc:
self.logger.debug("Fuzzy search error: %s", exc)
return []
def _search_vector(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute vector search (placeholder for future implementation).
Args:
index_path: Path to _index.db file
query: Query string
limit: Maximum results
Returns:
List of SearchResult objects (empty for now)
"""
# Placeholder for vector search integration
# Will be implemented when VectorStore is available
self.logger.debug("Vector search not yet implemented")
return []
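
Direct use of the engine against a single directory index, with custom weights (the index path is a placeholder):

# Sketch: fuse exact and fuzzy results for one _index.db with custom weights.
from pathlib import Path

from codexlens.search.hybrid_search import HybridSearchEngine

engine = HybridSearchEngine(weights={"exact": 0.6, "fuzzy": 0.4, "vector": 0.0})
results = engine.search(
    Path("~/.codexlens/indexes/project/_index.db").expanduser(),  # placeholder path
    "getUserData",
    limit=10,
    enable_fuzzy=True,
)
for r in results:
    print(f"{r.score:.4f}  {r.path}")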

View File

@@ -0,0 +1,242 @@
"""Query preprocessing for CodexLens search.
Provides query expansion for better identifier matching:
- CamelCase splitting: UserAuth → User OR Auth
- snake_case splitting: user_auth → user OR auth
- Preserves original query for exact matching
"""
from __future__ import annotations
import logging
import re
from typing import Set, List
log = logging.getLogger(__name__)
class QueryParser:
"""Parser for preprocessing search queries before FTS5 execution.
Expands identifier-style queries (CamelCase, snake_case) into OR queries
to improve recall when searching for code symbols.
Example transformations:
- 'UserAuth' → 'UserAuth OR User OR Auth'
- 'user_auth' → 'user_auth OR user OR auth'
- 'getUserData' → 'getUserData OR get OR User OR Data'
"""
# Patterns for identifier splitting
CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])')
SNAKE_CASE_PATTERN = re.compile(r'_+')
KEBAB_CASE_PATTERN = re.compile(r'-+')
# Minimum token length to include in expansion (avoid noise from single chars)
MIN_TOKEN_LENGTH = 2
# All-caps acronyms pattern (e.g., HTTP, SQL, API)
ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$')
def __init__(self, enable: bool = True, min_token_length: int = 2):
"""Initialize query parser.
Args:
enable: Whether to enable query preprocessing
min_token_length: Minimum token length to include in expansion
"""
self.enable = enable
self.min_token_length = min_token_length
def preprocess_query(self, query: str) -> str:
"""Preprocess query with identifier expansion.
Args:
query: Original search query
Returns:
Expanded query with OR operator connecting original and split tokens
Example:
>>> parser = QueryParser()
>>> parser.preprocess_query('UserAuth')
'UserAuth OR User OR Auth'
>>> parser.preprocess_query('get_user_data')
'get_user_data OR get OR user OR data'
"""
if not self.enable:
return query
query = query.strip()
if not query:
return query
# Extract tokens from query (handle multiple words/terms)
# For simple queries, just process the whole thing
# For complex FTS5 queries with operators, preserve structure
if self._is_simple_query(query):
return self._expand_simple_query(query)
else:
# Complex query with FTS5 operators, don't expand
log.debug(f"Skipping expansion for complex FTS5 query: {query}")
return query
def _is_simple_query(self, query: str) -> bool:
"""Check if query is simple (no FTS5 operators).
Args:
query: Search query
Returns:
True if query is simple (safe to expand), False otherwise
"""
# Check for FTS5 operators that indicate complex query
fts5_operators = ['OR', 'AND', 'NOT', 'NEAR', '*', '^', '"']
return not any(op in query for op in fts5_operators)
def _expand_simple_query(self, query: str) -> str:
"""Expand a simple query with identifier splitting.
Args:
query: Simple search query
Returns:
Expanded query with OR operators
"""
tokens: Set[str] = set()
# Always include original query
tokens.add(query)
# Split on whitespace first
words = query.split()
for word in words:
# Extract tokens from this word
word_tokens = self._extract_tokens(word)
tokens.update(word_tokens)
# Filter out short tokens and duplicates
filtered_tokens = [
t for t in tokens
if len(t) >= self.min_token_length
]
# Remove duplicates while preserving original query first
unique_tokens: List[str] = []
seen: Set[str] = set()
# Always put original query first
if query not in seen and len(query) >= self.min_token_length:
unique_tokens.append(query)
seen.add(query)
# Add other tokens
for token in filtered_tokens:
if token not in seen:
unique_tokens.append(token)
seen.add(token)
# Join with OR operator (only if we have multiple tokens)
if len(unique_tokens) > 1:
expanded = ' OR '.join(unique_tokens)
log.debug(f"Expanded query: '{query}''{expanded}'")
return expanded
else:
return query
def _extract_tokens(self, word: str) -> Set[str]:
"""Extract tokens from a single word using various splitting strategies.
Args:
word: Single word/identifier to split
Returns:
Set of extracted tokens
"""
tokens: Set[str] = set()
# Add original word
tokens.add(word)
# Handle all-caps acronyms (don't split)
if self.ALL_CAPS_PATTERN.match(word):
return tokens
# CamelCase splitting
camel_tokens = self._split_camel_case(word)
tokens.update(camel_tokens)
# snake_case splitting
snake_tokens = self._split_snake_case(word)
tokens.update(snake_tokens)
# kebab-case splitting
kebab_tokens = self._split_kebab_case(word)
tokens.update(kebab_tokens)
return tokens
def _split_camel_case(self, word: str) -> List[str]:
"""Split CamelCase identifier into tokens.
Args:
word: CamelCase identifier (e.g., 'getUserData')
Returns:
List of tokens (e.g., ['get', 'User', 'Data'])
"""
# Insert space before uppercase letters preceded by lowercase
spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word)
# Split on spaces and filter empty
return [t for t in spaced.split() if t]
def _split_snake_case(self, word: str) -> List[str]:
"""Split snake_case identifier into tokens.
Args:
word: snake_case identifier (e.g., 'get_user_data')
Returns:
List of tokens (e.g., ['get', 'user', 'data'])
"""
# Split on underscores
return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t]
def _split_kebab_case(self, word: str) -> List[str]:
"""Split kebab-case identifier into tokens.
Args:
word: kebab-case identifier (e.g., 'get-user-data')
Returns:
List of tokens (e.g., ['get', 'user', 'data'])
"""
# Split on hyphens
return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t]
# Global default parser instance
_default_parser = QueryParser(enable=True)
def preprocess_query(query: str, enable: bool = True) -> str:
"""Convenience function for query preprocessing.
Args:
query: Original search query
enable: Whether to enable preprocessing
Returns:
Preprocessed query with identifier expansion
"""
if not enable:
return query
return _default_parser.preprocess_query(query)
__all__ = [
"QueryParser",
"preprocess_query",
]

View File

@@ -0,0 +1,160 @@
"""Ranking algorithms for hybrid search result fusion.
Implements Reciprocal Rank Fusion (RRF) and score normalization utilities
for combining results from heterogeneous search backends (exact FTS, fuzzy FTS, vector search).
"""
from __future__ import annotations
import math
from typing import Dict, List, Optional
from codexlens.entities import SearchResult
def reciprocal_rank_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Optional[Dict[str, float]] = None,
k: int = 60,
) -> List[SearchResult]:
"""Combine search results from multiple sources using Reciprocal Rank Fusion.
RRF formula: score(d) = Σ weight_source / (k + rank_source(d))
Args:
results_map: Dictionary mapping source name to list of SearchResult objects
Sources: 'exact', 'fuzzy', 'vector'
weights: Dictionary mapping source name to weight (default: equal weights)
Example: {'exact': 0.4, 'fuzzy': 0.3, 'vector': 0.3}
k: Constant to avoid division by zero and control rank influence (default 60)
Returns:
List of SearchResult objects sorted by fused score (descending)
Examples:
>>> exact_results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
>>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")]
>>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results}
>>> fused = reciprocal_rank_fusion(results_map)
"""
if not results_map:
return []
# Default equal weights if not provided
if weights is None:
num_sources = len(results_map)
weights = {source: 1.0 / num_sources for source in results_map}
# Validate weights sum to 1.0
weight_sum = sum(weights.values())
if not math.isclose(weight_sum, 1.0, abs_tol=0.01):
# Normalize weights to sum to 1.0
weights = {source: w / weight_sum for source, w in weights.items()}
# Build unified result set with RRF scores
path_to_result: Dict[str, SearchResult] = {}
path_to_fusion_score: Dict[str, float] = {}
for source_name, results in results_map.items():
weight = weights.get(source_name, 0.0)
if weight == 0:
continue
for rank, result in enumerate(results, start=1):
path = result.path
rrf_contribution = weight / (k + rank)
# Initialize or accumulate fusion score
if path not in path_to_fusion_score:
path_to_fusion_score[path] = 0.0
path_to_result[path] = result
path_to_fusion_score[path] += rrf_contribution
# Create final results with fusion scores
fused_results = []
for path, base_result in path_to_result.items():
fusion_score = path_to_fusion_score[path]
# Create new SearchResult with fusion_score in metadata
fused_result = SearchResult(
path=base_result.path,
score=fusion_score,
excerpt=base_result.excerpt,
content=base_result.content,
symbol=base_result.symbol,
chunk=base_result.chunk,
metadata={
**base_result.metadata,
"fusion_score": fusion_score,
"original_score": base_result.score,
},
start_line=base_result.start_line,
end_line=base_result.end_line,
symbol_name=base_result.symbol_name,
symbol_kind=base_result.symbol_kind,
)
fused_results.append(fused_result)
# Sort by fusion score descending
fused_results.sort(key=lambda r: r.score, reverse=True)
return fused_results
def normalize_bm25_score(score: float) -> float:
"""Normalize BM25 scores from SQLite FTS5 to 0-1 range.
SQLite FTS5 returns negative BM25 scores (more negative = better match).
Uses sigmoid transformation for normalization.
Args:
score: Raw BM25 score from SQLite (typically negative)
Returns:
Normalized score in range [0, 1]
Examples:
>>> round(normalize_bm25_score(-10.5), 2)  # Good match
0.74
>>> round(normalize_bm25_score(-1.2), 2)  # Weak match
0.53
"""
# Take absolute value (BM25 is negative in SQLite)
abs_score = abs(score)
# Sigmoid transformation: 1 / (1 + e^(-x))
# Scale factor of 0.1 maps typical BM25 range (-20 to 0) to (0, 1)
normalized = 1.0 / (1.0 + math.exp(-abs_score * 0.1))
return normalized
def tag_search_source(results: List[SearchResult], source: str) -> List[SearchResult]:
"""Tag search results with their source for RRF tracking.
Args:
results: List of SearchResult objects
source: Source identifier ('exact', 'fuzzy', 'vector')
Returns:
List of SearchResult objects with 'search_source' in metadata
"""
tagged_results = []
for result in results:
tagged_result = SearchResult(
path=result.path,
score=result.score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata={**result.metadata, "search_source": source},
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
)
tagged_results.append(tagged_result)
return tagged_results
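
A worked instance of the RRF formula above with k=60 (weights here are illustrative and sum to 1.0; printed scores are rounded):

# Worked RRF example: a result found by both backends outranks one
# found only by the higher-weighted backend at a better rank.
from codexlens.entities import SearchResult
from codexlens.search.ranking import reciprocal_rank_fusion

exact = [SearchResult(path="auth.py", score=12.0, excerpt=""),
         SearchResult(path="user.py", score=6.0, excerpt="")]
fuzzy = [SearchResult(path="user.py", score=4.0, excerpt="")]

fused = reciprocal_rank_fusion({"exact": exact, "fuzzy": fuzzy},
                               weights={"exact": 0.6, "fuzzy": 0.4}, k=60)
# auth.py: 0.6/(60+1)              ≈ 0.00984
# user.py: 0.6/(60+2) + 0.4/(60+1) ≈ 0.00968 + 0.00656 ≈ 0.01623
for r in fused:
    print(f"{r.path}: {r.score:.5f}")   # user.py ranks first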

View File

@@ -57,7 +57,7 @@ class DirIndexStore:
# Schema version for migration tracking
# Increment this when schema changes require migration
SCHEMA_VERSION = 2
SCHEMA_VERSION = 4
def __init__(self, db_path: str | Path) -> None:
"""Initialize directory index store.
@@ -93,11 +93,13 @@ class DirIndexStore:
)
# Create or migrate schema
self._create_schema(conn)
self._create_fts_triggers(conn)
# Apply versioned migrations if needed
if current_version < self.SCHEMA_VERSION:
if current_version == 0:
# New database - create schema directly
self._create_schema(conn)
self._create_fts_triggers(conn)
self._set_schema_version(conn, self.SCHEMA_VERSION)
elif current_version < self.SCHEMA_VERSION:
# Existing database - apply migrations
self._apply_migrations(conn, current_version)
self._set_schema_version(conn, self.SCHEMA_VERSION)
@@ -126,6 +128,11 @@ class DirIndexStore:
if from_version < 2:
self._migrate_v2_add_name_column(conn)
# Migration v2 -> v4: Add dual FTS tables (exact + fuzzy)
if from_version < 4:
from codexlens.storage.migrations.migration_004_dual_fts import upgrade
upgrade(conn)
def close(self) -> None:
"""Close database connection."""
with self._lock:
@@ -465,6 +472,117 @@ class DirIndexStore:
return float(row["mtime"]) if row and row["mtime"] else None
def needs_reindex(self, full_path: str | Path) -> bool:
"""Check if a file needs reindexing based on mtime comparison.
Uses 1ms tolerance to handle filesystem timestamp precision variations.
Args:
full_path: Complete source file path
Returns:
True if file should be reindexed (new, modified, or missing from index)
"""
full_path_obj = Path(full_path).resolve()
if not full_path_obj.exists():
return False # File doesn't exist, skip indexing
# Get current filesystem mtime
try:
current_mtime = full_path_obj.stat().st_mtime
except OSError:
return False # Can't read file stats, skip
# Get stored mtime from database
stored_mtime = self.get_file_mtime(full_path_obj)
# File not in index, needs indexing
if stored_mtime is None:
return True
# Compare with 1ms tolerance for floating point precision
MTIME_TOLERANCE = 0.001
return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
def add_file_incremental(
self,
name: str,
full_path: str | Path,
content: str,
language: str,
symbols: Optional[List[Symbol]] = None,
) -> Optional[int]:
"""Add or update a file only if it has changed (incremental indexing).
Checks mtime before indexing to skip unchanged files.
Args:
name: Filename without path
full_path: Complete source file path
content: File content for indexing
language: Programming language identifier
symbols: List of Symbol objects from the file
Returns:
Database file_id if indexed, None if skipped (unchanged)
Raises:
StorageError: If database operations fail
"""
# Check if reindexing is needed
if not self.needs_reindex(full_path):
return None # Skip unchanged file
# File changed or new, perform full indexing
return self.add_file(name, full_path, content, language, symbols)
def cleanup_deleted_files(self, source_dir: Path) -> int:
"""Remove indexed files that no longer exist in the source directory.
Scans the source directory and removes database entries for deleted files.
Args:
source_dir: Source directory to scan
Returns:
Number of deleted file entries removed
Raises:
StorageError: If cleanup operations fail
"""
with self._lock:
conn = self._get_connection()
source_dir = source_dir.resolve()
try:
# Get all indexed file paths
rows = conn.execute("SELECT full_path FROM files").fetchall()
indexed_paths = {row["full_path"] for row in rows}
# Build set of existing files in source directory
existing_paths = set()
for file_path in source_dir.rglob("*"):
if file_path.is_file():
existing_paths.add(str(file_path.resolve()))
# Find orphaned entries (indexed but no longer exist)
deleted_paths = indexed_paths - existing_paths
# Remove orphaned entries
deleted_count = 0
for deleted_path in deleted_paths:
conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,))
deleted_count += 1
if deleted_count > 0:
conn.commit()
return deleted_count
except Exception as exc:
conn.rollback()
raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc
def list_files(self) -> List[FileEntry]:
"""List all files in current directory.
@@ -985,6 +1103,92 @@ class DirIndexStore:
)
return results
def search_fts_exact(self, query: str, limit: int = 20) -> List[SearchResult]:
"""Full-text search using exact token matching (unicode61 tokenizer).
Args:
query: FTS5 query string
limit: Maximum results to return
Returns:
List of SearchResult objects sorted by relevance
Raises:
StorageError: If FTS search fails
"""
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_exact
WHERE files_fts_exact MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS exact search failed: {exc}") from exc
results: List[SearchResult] = []
for row in rows:
rank = float(row["rank"]) if row["rank"] is not None else 0.0
score = abs(rank) if rank < 0 else 0.0
results.append(
SearchResult(
path=row["full_path"],
score=score,
excerpt=row["excerpt"],
)
)
return results
def search_fts_fuzzy(self, query: str, limit: int = 20) -> List[SearchResult]:
"""Full-text search using fuzzy/substring matching (trigram or extended unicode61 tokenizer).
Args:
query: FTS5 query string
limit: Maximum results to return
Returns:
List of SearchResult objects sorted by relevance
Raises:
StorageError: If FTS search fails
"""
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts_fuzzy) AS rank,
snippet(files_fts_fuzzy, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_fuzzy
WHERE files_fts_fuzzy MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS fuzzy search failed: {exc}") from exc
results: List[SearchResult] = []
for row in rows:
rank = float(row["rank"]) if row["rank"] is not None else 0.0
score = abs(rank) if rank < 0 else 0.0
results.append(
SearchResult(
path=row["full_path"],
score=score,
excerpt=row["excerpt"],
)
)
return results
def search_files_only(self, query: str, limit: int = 20) -> List[str]:
"""Fast FTS search returning only file paths (no snippet generation).
@@ -1185,16 +1389,34 @@ class DirIndexStore:
"""
)
# FTS5 external content table with code-friendly tokenizer
# unicode61 tokenchars keeps underscores as part of tokens
# so 'user_id' is indexed as one token, not 'user' and 'id'
# Dual FTS5 external content tables for exact and fuzzy matching
# files_fts_exact: unicode61 tokenizer for exact token matching
# files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching
from codexlens.storage.sqlite_utils import check_trigram_support
has_trigram = check_trigram_support(conn)
fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-'"
# Exact FTS table with unicode61 tokenizer
conn.execute(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="unicode61 tokenchars '_'"
tokenize="unicode61 tokenchars '_-'"
)
"""
)
# Fuzzy FTS table with trigram or extended unicode61 tokenizer
conn.execute(
f"""
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="{fuzzy_tokenizer}"
)
"""
)
@@ -1301,38 +1523,72 @@ class DirIndexStore:
conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id))
def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
"""Create FTS5 external content triggers.
"""Create FTS5 external content triggers for dual FTS tables.
Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables.
Args:
conn: Database connection
"""
# Insert trigger
# Insert triggers for files_fts_exact
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts(rowid, name, full_path, content)
CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Delete trigger
# Delete trigger for files_fts_exact
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
# Update trigger
# Update trigger for files_fts_exact
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts(rowid, name, full_path, content)
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Insert trigger for files_fts_fuzzy
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Delete trigger for files_fts_fuzzy
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
# Update trigger for files_fts_fuzzy
conn.execute(
"""
CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""

View File

@@ -77,7 +77,7 @@ class IndexTreeBuilder:
}
def __init__(
self, registry: RegistryStore, mapper: PathMapper, config: Config = None
self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True
):
"""Initialize the index tree builder.
@@ -85,18 +85,21 @@ class IndexTreeBuilder:
registry: Global registry store for project tracking
mapper: Path mapper for source to index conversions
config: CodexLens configuration (uses defaults if None)
incremental: Enable incremental indexing (default True)
"""
self.registry = registry
self.mapper = mapper
self.config = config or Config()
self.parser_factory = ParserFactory(self.config)
self.logger = logging.getLogger(__name__)
self.incremental = incremental
def build(
self,
source_root: Path,
languages: List[str] = None,
workers: int = 4,
force_full: bool = False,
) -> BuildResult:
"""Build complete index tree for a project.
@@ -106,11 +109,13 @@ class IndexTreeBuilder:
3. Build indexes bottom-up (deepest first)
4. Link subdirectories to parents
5. Update project statistics
6. Cleanup deleted files (if incremental mode)
Args:
source_root: Project root directory to index
languages: Optional list of language IDs to limit indexing
workers: Number of parallel worker processes
force_full: Force full reindex (override incremental mode)
Returns:
BuildResult with statistics and errors
@@ -122,7 +127,12 @@ class IndexTreeBuilder:
if not source_root.exists():
raise ValueError(f"Source root does not exist: {source_root}")
self.logger.info("Building index tree for %s", source_root)
# Override incremental mode if force_full is True
use_incremental = self.incremental and not force_full
if force_full:
self.logger.info("Building index tree for %s (FULL reindex)", source_root)
else:
self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental)
# Register project
index_root = self.mapper.source_to_index_dir(source_root)
@@ -186,6 +196,25 @@ class IndexTreeBuilder:
# Link children to this directory
self._link_children_to_parent(result.source_path, all_results)
# Cleanup deleted files if in incremental mode
if use_incremental:
self.logger.info("Cleaning up deleted files...")
total_deleted = 0
for result in all_results:
if result.error:
continue
try:
with DirIndexStore(result.index_path) as store:
deleted_count = store.cleanup_deleted_files(result.source_path)
total_deleted += deleted_count
if deleted_count > 0:
self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)
except Exception as exc:
self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc)
if total_deleted > 0:
self.logger.info("Removed %d deleted files from index", total_deleted)
# Update project statistics
self.registry.update_project_stats(source_root, total_files, total_dirs)
@@ -436,9 +465,15 @@ class IndexTreeBuilder:
files_count = 0
symbols_count = 0
skipped_count = 0
for file_path in source_files:
try:
# Check if file needs reindexing (incremental mode)
if self.incremental and not store.needs_reindex(file_path):
skipped_count += 1
continue
# Read and parse file
text = file_path.read_text(encoding="utf-8", errors="ignore")
language_id = self.config.language_for_path(file_path)
@@ -491,13 +526,23 @@ class IndexTreeBuilder:
store.close()
self.logger.debug(
"Built %s: %d files, %d symbols, %d subdirs",
dir_path,
files_count,
symbols_count,
len(subdirs),
)
if skipped_count > 0:
self.logger.debug(
"Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs",
dir_path,
files_count,
skipped_count,
symbols_count,
len(subdirs),
)
else:
self.logger.debug(
"Built %s: %d files, %d symbols, %d subdirs",
dir_path,
files_count,
symbols_count,
len(subdirs),
)
return DirBuildResult(
source_path=dir_path,
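
How a caller might drive the builder directly, mirroring the init command's incremental and --force behaviour (the project path is a placeholder):

# Sketch: incremental vs forced full build through IndexTreeBuilder.
from pathlib import Path

from codexlens.storage.index_tree import IndexTreeBuilder
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore

registry = RegistryStore()
registry.initialize()
builder = IndexTreeBuilder(registry, PathMapper(), incremental=True)

project = Path("~/projects/demo").expanduser()  # placeholder path
builder.build(source_root=project, workers=4)                   # skips unchanged files
builder.build(source_root=project, workers=4, force_full=True)  # rebuilds everything
registry.close()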

View File

@@ -0,0 +1,231 @@
"""
Migration 004: Add dual FTS tables for exact and fuzzy matching.
This migration introduces two FTS5 tables:
- files_fts_exact: Uses unicode61 tokenizer for exact token matching
- files_fts_fuzzy: Uses trigram tokenizer (or extended unicode61) for substring/fuzzy matching
Both tables are synchronized with the files table via triggers for automatic updates.
"""
import logging
from sqlite3 import Connection
from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection):
"""
Applies the migration to add dual FTS tables.
- Drops old files_fts table and triggers
- Creates files_fts_exact with unicode61 tokenizer
- Creates files_fts_fuzzy with trigram or extended unicode61 tokenizer
- Creates synchronized triggers for both tables
- Rebuilds FTS indexes from files table
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
try:
# Check trigram support
has_trigram = check_trigram_support(db_conn)
version = get_sqlite_version(db_conn)
log.info(f"SQLite version: {'.'.join(map(str, version))}")
if has_trigram:
log.info("Trigram tokenizer available, using for fuzzy FTS table")
fuzzy_tokenizer = "trigram"
else:
log.warning(
f"Trigram tokenizer not available (requires SQLite >= 3.34), "
f"using extended unicode61 tokenizer for fuzzy matching"
)
fuzzy_tokenizer = "unicode61 tokenchars '_-'"
# Start transaction
cursor.execute("BEGIN TRANSACTION")
# Check if files table has 'name' column (v2 schema doesn't have it)
cursor.execute("PRAGMA table_info(files)")
columns = {row[1] for row in cursor.fetchall()}
if 'name' not in columns:
log.info("Adding 'name' column to files table (v2 schema upgrade)...")
# Add name column
cursor.execute("ALTER TABLE files ADD COLUMN name TEXT")
# Populate name from path (extract filename from last '/')
# Use Python to do the extraction since SQLite doesn't have reverse()
cursor.execute("SELECT rowid, path FROM files")
rows = cursor.fetchall()
for rowid, path in rows:
# Extract filename from path
name = path.split('/')[-1] if '/' in path else path
cursor.execute("UPDATE files SET name = ? WHERE rowid = ?", (name, rowid))
# Rename 'path' column to 'full_path' if needed
if 'path' in columns and 'full_path' not in columns:
log.info("Renaming 'path' to 'full_path' (v2 schema upgrade)...")
# Check if indexed_at column exists in v2 schema
has_indexed_at = 'indexed_at' in columns
has_mtime = 'mtime' in columns
# SQLite doesn't support RENAME COLUMN before 3.25, so use table recreation
cursor.execute("""
CREATE TABLE files_new (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
full_path TEXT NOT NULL UNIQUE,
content TEXT,
language TEXT,
mtime REAL,
indexed_at TEXT
)
""")
# Build INSERT statement based on available columns
# Note: v2 schema has no rowid (path is PRIMARY KEY), so use NULL for AUTOINCREMENT
if has_indexed_at and has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime, indexed_at)
SELECT name, path, content, language, mtime, indexed_at FROM files
""")
elif has_indexed_at:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, indexed_at)
SELECT name, path, content, language, indexed_at FROM files
""")
elif has_mtime:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language, mtime)
SELECT name, path, content, language, mtime FROM files
""")
else:
cursor.execute("""
INSERT INTO files_new (name, full_path, content, language)
SELECT name, path, content, language FROM files
""")
cursor.execute("DROP TABLE files")
cursor.execute("ALTER TABLE files_new RENAME TO files")
log.info("Dropping old FTS triggers and table...")
# Drop old triggers
cursor.execute("DROP TRIGGER IF EXISTS files_ai")
cursor.execute("DROP TRIGGER IF EXISTS files_ad")
cursor.execute("DROP TRIGGER IF EXISTS files_au")
# Drop old FTS table
cursor.execute("DROP TABLE IF EXISTS files_fts")
# Create exact FTS table (unicode61 with underscores/hyphens as token chars)
log.info("Creating files_fts_exact table with unicode61 tokenizer...")
cursor.execute(
"""
CREATE VIRTUAL TABLE files_fts_exact USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="unicode61 tokenchars '_-'"
)
"""
)
# Create fuzzy FTS table (trigram or extended unicode61)
log.info(f"Creating files_fts_fuzzy table with {fuzzy_tokenizer} tokenizer...")
cursor.execute(
f"""
CREATE VIRTUAL TABLE files_fts_fuzzy USING fts5(
name, full_path UNINDEXED, content,
content='files',
content_rowid='id',
tokenize="{fuzzy_tokenizer}"
)
"""
)
# Create synchronized triggers for files_fts_exact
log.info("Creating triggers for files_fts_exact...")
cursor.execute(
"""
CREATE TRIGGER files_exact_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_exact_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_exact(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Create synchronized triggers for files_fts_fuzzy
log.info("Creating triggers for files_fts_fuzzy...")
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ai AFTER INSERT ON files BEGIN
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_ad AFTER DELETE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
END
"""
)
cursor.execute(
"""
CREATE TRIGGER files_fuzzy_au AFTER UPDATE ON files BEGIN
INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
VALUES('delete', old.id, old.name, old.full_path, old.content);
INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
VALUES(new.id, new.name, new.full_path, new.content);
END
"""
)
# Rebuild FTS indexes from files table
log.info("Rebuilding FTS indexes from files table...")
cursor.execute("INSERT INTO files_fts_exact(files_fts_exact) VALUES('rebuild')")
cursor.execute("INSERT INTO files_fts_fuzzy(files_fts_fuzzy) VALUES('rebuild')")
# Commit transaction
cursor.execute("COMMIT")
log.info("Migration 004 completed successfully")
# Vacuum to reclaim space (outside transaction)
try:
log.info("Running VACUUM to reclaim space...")
cursor.execute("VACUUM")
except Exception as e:
log.warning(f"VACUUM failed (non-critical): {e}")
except Exception as e:
log.error(f"Migration 004 failed: {e}")
try:
cursor.execute("ROLLBACK")
except Exception:
pass
raise

View File

@@ -0,0 +1,64 @@
"""SQLite utility functions for CodexLens storage layer."""
from __future__ import annotations
import logging
import sqlite3
log = logging.getLogger(__name__)
def check_trigram_support(conn: sqlite3.Connection) -> bool:
"""Check if SQLite supports trigram tokenizer for FTS5.
Trigram tokenizer requires SQLite >= 3.34.0.
Args:
conn: Database connection to test
Returns:
True if trigram tokenizer is available, False otherwise
"""
try:
# Test by creating a temporary virtual table with trigram tokenizer
conn.execute(
"""
CREATE VIRTUAL TABLE IF NOT EXISTS test_trigram_check
USING fts5(test_content, tokenize='trigram')
"""
)
# Clean up test table
conn.execute("DROP TABLE IF EXISTS test_trigram_check")
conn.commit()
return True
except sqlite3.OperationalError as e:
# Trigram tokenizer not available
if "unrecognized tokenizer" in str(e).lower():
log.debug("Trigram tokenizer not available in this SQLite version")
return False
# Other operational errors should be re-raised
raise
except Exception:
# Any other exception means trigram is not supported
return False
def get_sqlite_version(conn: sqlite3.Connection) -> tuple[int, int, int]:
"""Get SQLite version as (major, minor, patch) tuple.
Args:
conn: Database connection
Returns:
Version tuple, e.g., (3, 34, 1)
"""
row = conn.execute("SELECT sqlite_version()").fetchone()
version_str = row[0] if row else "0.0.0"
parts = version_str.split('.')
try:
major = int(parts[0]) if len(parts) > 0 else 0
minor = int(parts[1]) if len(parts) > 1 else 0
patch = int(parts[2]) if len(parts) > 2 else 0
return (major, minor, patch)
except (ValueError, IndexError):
return (0, 0, 0)
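
A quick sketch of how these helpers pick the fuzzy tokenizer at schema-creation time, mirroring the logic in DirIndexStore._create_schema and migration 004:

# Sketch: choose the fuzzy FTS tokenizer based on the running SQLite build.
import sqlite3

from codexlens.storage.sqlite_utils import check_trigram_support, get_sqlite_version

conn = sqlite3.connect(":memory:")
major, minor, patch = get_sqlite_version(conn)
fuzzy_tokenizer = "trigram" if check_trigram_support(conn) else "unicode61 tokenchars '_-'"
print(f"SQLite {major}.{minor}.{patch} -> fuzzy tokenizer: {fuzzy_tokenizer}")
conn.close()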