Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-11 02:33:51 +08:00
Add graph expansion and cross-encoder reranking features
- Implemented GraphExpander to enhance search results with related symbols using precomputed neighbors.
- Added CrossEncoderReranker for second-stage search ranking, allowing for improved result scoring.
- Created migrations to establish necessary database tables for relationships and graph neighbors.
- Developed tests for graph expansion functionality, ensuring related results are populated correctly.
- Enhanced performance benchmarks for cross-encoder reranking latency and graph expansion overhead.
- Updated schema cleanup tests to reflect changes in versioning and deprecated fields.
- Added new test cases for Treesitter parser to validate relationship extraction with alias resolution.
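The commit message names two new search-stage components, GraphExpander and CrossEncoderReranker, whose implementations are not shown in the hunks below. The following is a minimal sketch of how such a two-stage pipeline typically fits together; the function names and the toy scorer are illustrative assumptions, not the actual API added by this commit.

from typing import Callable, Dict, List

def expand_with_neighbors(hits: List[str], neighbors: Dict[str, List[str]], max_extra: int = 5) -> List[str]:
    """Stage 1b: append related symbols (precomputed neighbors) to the first-stage hits."""
    expanded = list(hits)
    for symbol in hits:
        for related in neighbors.get(symbol, [])[:max_extra]:
            if related not in expanded:
                expanded.append(related)
    return expanded

def rerank(query: str, candidates: List[str], score: Callable[[str, str], float]) -> List[str]:
    """Stage 2: score each (query, candidate) pair and sort descending, as a cross-encoder would."""
    return sorted(candidates, key=lambda c: score(query, c), reverse=True)

def overlap_score(query: str, candidate: str) -> float:
    # Toy stand-in for a cross-encoder model score.
    return sum(token in candidate.lower() for token in query.lower().split())

hits = ["DirIndexStore.add_file"]
neighbors = {"DirIndexStore.add_file": ["DirIndexStore._save_relationships"]}
ranked = rerank("save relationships", expand_with_neighbors(hits, neighbors), overlap_score)
print(ranked)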
codex-lens/src/codexlens/storage/dir_index.py

@@ -10,15 +10,17 @@ Each directory maintains its own _index.db with:
from __future__ import annotations

import logging
import hashlib
import re
import sqlite3
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from codexlens.config import Config
from codexlens.entities import SearchResult, Symbol
from codexlens.entities import CodeRelationship, SearchResult, Symbol
from codexlens.errors import StorageError
from codexlens.storage.global_index import GlobalSymbolIndex

@@ -60,7 +62,7 @@ class DirIndexStore:

    # Schema version for migration tracking
    # Increment this when schema changes require migration
    SCHEMA_VERSION = 5
    SCHEMA_VERSION = 8

    def __init__(
        self,

@@ -150,6 +152,21 @@ class DirIndexStore:
            from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
            upgrade(conn)

        # Migration v5 -> v6: Ensure relationship tables/indexes exist
        if from_version < 6:
            from codexlens.storage.migrations.migration_006_enhance_relationships import upgrade
            upgrade(conn)

        # Migration v6 -> v7: Add graph neighbor cache for search expansion
        if from_version < 7:
            from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade
            upgrade(conn)

        # Migration v7 -> v8: Add Merkle hashes for incremental change detection
        if from_version < 8:
            from codexlens.storage.migrations.migration_008_add_merkle_hashes import upgrade
            upgrade(conn)

    def close(self) -> None:
        """Close database connection."""
        with self._lock:
@@ -179,6 +196,7 @@ class DirIndexStore:
        content: str,
        language: str,
        symbols: Optional[List[Symbol]] = None,
        relationships: Optional[List[CodeRelationship]] = None,
    ) -> int:
        """Add or update a file in the current directory index.

@@ -188,6 +206,7 @@ class DirIndexStore:
            content: File content for indexing
            language: Programming language identifier
            symbols: List of Symbol objects from the file
            relationships: Optional list of CodeRelationship edges from this file

        Returns:
            Database file_id

@@ -240,6 +259,8 @@ class DirIndexStore:
                    symbol_rows,
                )

                self._save_merkle_hash(conn, file_id=file_id, content=content)
                self._save_relationships(conn, file_id=file_id, relationships=relationships)
                conn.commit()
                self._maybe_update_global_symbols(full_path_str, symbols or [])
                return file_id

@@ -248,6 +269,96 @@ class DirIndexStore:
                conn.rollback()
                raise StorageError(f"Failed to add file {name}: {exc}") from exc

    def save_relationships(self, file_id: int, relationships: List[CodeRelationship]) -> None:
        """Save relationships for an already-indexed file.

        Args:
            file_id: Database file id
            relationships: Relationship edges to persist
        """
        if not relationships:
            return
        with self._lock:
            conn = self._get_connection()
            self._save_relationships(conn, file_id=file_id, relationships=relationships)
            conn.commit()

    def _save_relationships(
        self,
        conn: sqlite3.Connection,
        file_id: int,
        relationships: Optional[List[CodeRelationship]],
    ) -> None:
        if not relationships:
            return

        rows = conn.execute(
            "SELECT id, name FROM symbols WHERE file_id=? ORDER BY start_line, id",
            (file_id,),
        ).fetchall()

        name_to_id: Dict[str, int] = {}
        for row in rows:
            name = row["name"]
            if name not in name_to_id:
                name_to_id[name] = int(row["id"])

        if not name_to_id:
            return

        rel_rows: List[Tuple[int, str, str, int, Optional[str]]] = []
        seen: set[tuple[int, str, str, int, Optional[str]]] = set()

        for rel in relationships:
            source_id = name_to_id.get(rel.source_symbol)
            if source_id is None:
                continue

            target = (rel.target_symbol or "").strip()
            if not target:
                continue

            rel_type = rel.relationship_type.value
            source_line = int(rel.source_line)
            key = (source_id, target, rel_type, source_line, rel.target_file)
            if key in seen:
                continue
            seen.add(key)

            rel_rows.append((source_id, target, rel_type, source_line, rel.target_file))

        if not rel_rows:
            return

        conn.executemany(
            """
            INSERT INTO code_relationships(
                source_symbol_id, target_qualified_name,
                relationship_type, source_line, target_file
            )
            VALUES(?, ?, ?, ?, ?)
            """,
            rel_rows,
        )

    def _save_merkle_hash(self, conn: sqlite3.Connection, file_id: int, content: str) -> None:
        """Upsert a SHA-256 content hash for the given file_id (best-effort)."""
        try:
            digest = hashlib.sha256(content.encode("utf-8", errors="ignore")).hexdigest()
            now = time.time()
            conn.execute(
                """
                INSERT INTO merkle_hashes(file_id, sha256, updated_at)
                VALUES(?, ?, ?)
                ON CONFLICT(file_id) DO UPDATE SET
                    sha256=excluded.sha256,
                    updated_at=excluded.updated_at
                """,
                (file_id, digest, now),
            )
        except sqlite3.Error:
            return

    def add_files_batch(
        self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]]
    ) -> int:

@@ -312,6 +423,8 @@ class DirIndexStore:
                        symbol_rows,
                    )

                    self._save_merkle_hash(conn, file_id=file_id, content=content)

                conn.commit()
                return count
@@ -395,9 +508,13 @@ class DirIndexStore:
        return float(row["mtime"]) if row and row["mtime"] else None

    def needs_reindex(self, full_path: str | Path) -> bool:
        """Check if a file needs reindexing based on mtime comparison.
        """Check if a file needs reindexing.

        Uses 1ms tolerance to handle filesystem timestamp precision variations.
        Default behavior uses mtime comparison (with 1ms tolerance).

        When `Config.enable_merkle_detection` is enabled and Merkle metadata is
        available, uses SHA-256 content hash comparison (with mtime as a fast
        path to avoid hashing unchanged files).

        Args:
            full_path: Complete source file path

@@ -415,16 +532,154 @@ class DirIndexStore:
        except OSError:
            return False  # Can't read file stats, skip

        # Get stored mtime from database
        stored_mtime = self.get_file_mtime(full_path_obj)
        MTIME_TOLERANCE = 0.001

        # File not in index, needs indexing
        if stored_mtime is None:
        # Fast path: mtime-only mode (default / backward-compatible)
        if self._config is None or not getattr(self._config, "enable_merkle_detection", False):
            stored_mtime = self.get_file_mtime(full_path_obj)
            if stored_mtime is None:
                return True
            return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE

        full_path_str = str(full_path_obj)

        # Hash-based change detection (best-effort, falls back to mtime when metadata missing)
        with self._lock:
            conn = self._get_connection()
            try:
                row = conn.execute(
                    """
                    SELECT f.id AS file_id, f.mtime AS mtime, mh.sha256 AS sha256
                    FROM files f
                    LEFT JOIN merkle_hashes mh ON mh.file_id = f.id
                    WHERE f.full_path=?
                    """,
                    (full_path_str,),
                ).fetchone()
            except sqlite3.Error:
                row = None

        if row is None:
            return True

        # Compare with 1ms tolerance for floating point precision
        MTIME_TOLERANCE = 0.001
        return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
        stored_mtime = float(row["mtime"]) if row["mtime"] else None
        stored_hash = row["sha256"] if row["sha256"] else None
        file_id = int(row["file_id"])

        # Missing Merkle data: fall back to mtime
        if stored_hash is None:
            if stored_mtime is None:
                return True
            return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE

        # If mtime is unchanged within tolerance, assume unchanged without hashing.
        if stored_mtime is not None and abs(current_mtime - stored_mtime) <= MTIME_TOLERANCE:
            return False

        try:
            current_text = full_path_obj.read_text(encoding="utf-8", errors="ignore")
            current_hash = hashlib.sha256(current_text.encode("utf-8", errors="ignore")).hexdigest()
        except OSError:
            return False

        if current_hash == stored_hash:
            # Content unchanged, but mtime drifted: update stored mtime to avoid repeated hashing.
            with self._lock:
                conn = self._get_connection()
                conn.execute("UPDATE files SET mtime=? WHERE id=?", (current_mtime, file_id))
                conn.commit()
            return False

        return True

    def get_merkle_root_hash(self) -> Optional[str]:
        """Return the stored Merkle root hash for this directory index (if present)."""
        with self._lock:
            conn = self._get_connection()
            try:
                row = conn.execute(
                    "SELECT root_hash FROM merkle_state WHERE id=1"
                ).fetchone()
            except sqlite3.Error:
                return None

            return row["root_hash"] if row and row["root_hash"] else None

    def update_merkle_root(self) -> Optional[str]:
        """Compute and persist the Merkle root hash for this directory index.

        The root hash includes:
        - Direct file hashes from `merkle_hashes`
        - Direct subdirectory root hashes (read from child `_index.db` files)
        """
        if self._config is None or not getattr(self._config, "enable_merkle_detection", False):
            return None

        with self._lock:
            conn = self._get_connection()
            try:
                file_rows = conn.execute(
                    """
                    SELECT f.name AS name, mh.sha256 AS sha256
                    FROM files f
                    LEFT JOIN merkle_hashes mh ON mh.file_id = f.id
                    ORDER BY f.name
                    """
                ).fetchall()

                subdir_rows = conn.execute(
                    "SELECT name, index_path FROM subdirs ORDER BY name"
                ).fetchall()
            except sqlite3.Error as exc:
                self.logger.debug("Failed to compute merkle root: %s", exc)
                return None

        items: List[str] = []

        for row in file_rows:
            name = row["name"]
            sha = (row["sha256"] or "").strip()
            items.append(f"f:{name}:{sha}")

        def read_child_root(index_path: str) -> str:
            try:
                with sqlite3.connect(index_path) as child_conn:
                    child_conn.row_factory = sqlite3.Row
                    child_row = child_conn.execute(
                        "SELECT root_hash FROM merkle_state WHERE id=1"
                    ).fetchone()
                    return child_row["root_hash"] if child_row and child_row["root_hash"] else ""
            except Exception:
                return ""

        for row in subdir_rows:
            name = row["name"]
            index_path = row["index_path"]
            child_hash = read_child_root(index_path) if index_path else ""
            items.append(f"d:{name}:{child_hash}")

        root_hash = hashlib.sha256("\n".join(items).encode("utf-8", errors="ignore")).hexdigest()
        now = time.time()

        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute(
                    """
                    INSERT INTO merkle_state(id, root_hash, updated_at)
                    VALUES(1, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                        root_hash=excluded.root_hash,
                        updated_at=excluded.updated_at
                    """,
                    (root_hash, now),
                )
                conn.commit()
            except sqlite3.Error as exc:
                self.logger.debug("Failed to persist merkle root: %s", exc)
                return None

        return root_hash

    def add_file_incremental(
        self,

@@ -433,6 +688,7 @@ class DirIndexStore:
        content: str,
        language: str,
        symbols: Optional[List[Symbol]] = None,
        relationships: Optional[List[CodeRelationship]] = None,
    ) -> Optional[int]:
        """Add or update a file only if it has changed (incremental indexing).

@@ -444,6 +700,7 @@ class DirIndexStore:
            content: File content for indexing
            language: Programming language identifier
            symbols: List of Symbol objects from the file
            relationships: Optional list of CodeRelationship edges from this file

        Returns:
            Database file_id if indexed, None if skipped (unchanged)

@@ -456,7 +713,7 @@ class DirIndexStore:
            return None  # Skip unchanged file

        # File changed or new, perform full indexing
        return self.add_file(name, full_path, content, language, symbols)
        return self.add_file(name, full_path, content, language, symbols, relationships)

    def cleanup_deleted_files(self, source_dir: Path) -> int:
        """Remove indexed files that no longer exist in the source directory.

@@ -1767,6 +2024,39 @@ class DirIndexStore:
                """
            )

            # Precomputed graph neighbors cache for search expansion (v7)
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS graph_neighbors (
                    source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
                    neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
                    relationship_depth INTEGER NOT NULL,
                    PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
                )
                """
            )

            # Merkle hashes for incremental change detection (v8)
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS merkle_hashes (
                    file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
                    sha256 TEXT NOT NULL,
                    updated_at REAL
                )
                """
            )

            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS merkle_state (
                    id INTEGER PRIMARY KEY CHECK (id = 1),
                    root_hash TEXT,
                    updated_at REAL
                )
                """
            )

            # Indexes (v5: removed idx_symbols_type)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")

@@ -1780,6 +2070,14 @@ class DirIndexStore:
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth "
                "ON graph_neighbors(source_symbol_id, relationship_depth)"
            )
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor "
                "ON graph_neighbors(neighbor_symbol_id)"
            )

        except sqlite3.DatabaseError as exc:
            raise StorageError(f"Failed to create schema: {exc}") from exc
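The graph_neighbors table created above caches 1- and 2-hop neighbors per symbol, and idx_graph_neighbors_source_depth is shaped for lookups by source symbol and depth. A sketch of the query a search-time expander could run against this cache follows; the helper name and query shape are assumptions, since the GraphExpander implementation is not part of this hunk.

import sqlite3

def fetch_neighbors(db_path: str, symbol_id: int, max_depth: int = 2) -> list[tuple[int, int]]:
    """Return (neighbor_symbol_id, relationship_depth) pairs from the precomputed cache."""
    with sqlite3.connect(db_path) as conn:
        rows = conn.execute(
            """
            SELECT neighbor_symbol_id, relationship_depth
            FROM graph_neighbors
            WHERE source_symbol_id = ? AND relationship_depth <= ?
            ORDER BY relationship_depth
            """,
            (symbol_id, max_depth),
        ).fetchall()
    return [(int(n), int(d)) for n, d in rows]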
codex-lens/src/codexlens/storage/index_tree.py

@@ -8,11 +8,13 @@ from __future__ import annotations

import logging
import os
import re
import sqlite3
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple

from codexlens.config import Config
from codexlens.parsers.factory import ParserFactory

@@ -247,6 +249,9 @@ class IndexTreeBuilder:
            try:
                with DirIndexStore(result.index_path, config=self.config, global_index=global_index) as store:
                    deleted_count = store.cleanup_deleted_files(result.source_path)
                    if deleted_count > 0:
                        _compute_graph_neighbors(store, logger=self.logger)
                    store.update_merkle_root()
                total_deleted += deleted_count
                if deleted_count > 0:
                    self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)

@@ -575,6 +580,7 @@ class IndexTreeBuilder:
                    content=text,
                    language=language_id,
                    symbols=indexed_file.symbols,
                    relationships=indexed_file.relationships,
                )

                files_count += 1

@@ -584,6 +590,9 @@ class IndexTreeBuilder:
                self.logger.debug("Failed to index %s: %s", file_path, exc)
                continue

        if files_count > 0:
            _compute_graph_neighbors(store, logger=self.logger)

        # Get list of subdirectories
        subdirs = [
            d.name

@@ -593,6 +602,7 @@ class IndexTreeBuilder:
            and not d.name.startswith(".")
        ]

        store.update_merkle_root()
        store.close()
        if global_index is not None:
            global_index.close()

@@ -654,31 +664,29 @@ class IndexTreeBuilder:
        parent_index_db = self.mapper.source_to_index_db(parent_path)

        try:
            store = DirIndexStore(parent_index_db)
            store.initialize()
            with DirIndexStore(parent_index_db, config=self.config) as store:
                for result in all_results:
                    # Only register direct children (parent is one level up)
                    if result.source_path.parent != parent_path:
                        continue

            for result in all_results:
                # Only register direct children (parent is one level up)
                if result.source_path.parent != parent_path:
                    continue
                    if result.error:
                        continue

                if result.error:
                    continue
                    # Register subdirectory link
                    store.register_subdir(
                        name=result.source_path.name,
                        index_path=result.index_path,
                        files_count=result.files_count,
                        direct_files=result.files_count,
                    )
                    self.logger.debug(
                        "Linked %s to parent %s",
                        result.source_path.name,
                        parent_path,
                    )

                # Register subdirectory link
                store.register_subdir(
                    name=result.source_path.name,
                    index_path=result.index_path,
                    files_count=result.files_count,
                    direct_files=result.files_count,
                )
                self.logger.debug(
                    "Linked %s to parent %s",
                    result.source_path.name,
                    parent_path,
                )

            store.close()
                store.update_merkle_root()

        except Exception as exc:
            self.logger.error(

@@ -726,6 +734,164 @@ class IndexTreeBuilder:
    return files


def _normalize_relationship_target(target: str) -> str:
    """Best-effort normalization of a relationship target into a local symbol name."""
    target = (target or "").strip()
    if not target:
        return ""

    # Drop trailing call parentheses when present (e.g., "foo()" -> "foo").
    if target.endswith("()"):
        target = target[:-2]

    # Keep the leaf identifier for common qualified formats.
    for sep in ("::", ".", "#"):
        if sep in target:
            target = target.split(sep)[-1]

    # Strip non-identifier suffix/prefix noise.
    target = re.sub(r"^[^A-Za-z0-9_]+", "", target)
    target = re.sub(r"[^A-Za-z0-9_]+$", "", target)
    return target


def _compute_graph_neighbors(
    store: DirIndexStore,
    *,
    max_depth: int = 2,
    logger: Optional[logging.Logger] = None,
) -> None:
    """Compute and persist N-hop neighbors for all symbols in a directory index."""
    if max_depth <= 0:
        return

    log = logger or logging.getLogger(__name__)

    with store._lock:
        conn = store._get_connection()
        conn.row_factory = sqlite3.Row

        # Ensure schema exists even for older databases pinned to the same user_version.
        try:
            from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade

            upgrade(conn)
        except Exception as exc:
            log.debug("Graph neighbor schema ensure failed: %s", exc)

        cursor = conn.cursor()

        try:
            cursor.execute("DELETE FROM graph_neighbors")
        except sqlite3.Error:
            # Table missing or schema mismatch; skip gracefully.
            return

        try:
            symbol_rows = cursor.execute(
                "SELECT id, file_id, name FROM symbols"
            ).fetchall()
            rel_rows = cursor.execute(
                "SELECT source_symbol_id, target_qualified_name FROM code_relationships"
            ).fetchall()
        except sqlite3.Error:
            return

        if not symbol_rows or not rel_rows:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        symbol_file_by_id: Dict[int, int] = {}
        symbols_by_file_and_name: Dict[Tuple[int, str], List[int]] = {}
        symbols_by_name: Dict[str, List[int]] = {}

        for row in symbol_rows:
            symbol_id = int(row["id"])
            file_id = int(row["file_id"])
            name = str(row["name"])
            symbol_file_by_id[symbol_id] = file_id
            symbols_by_file_and_name.setdefault((file_id, name), []).append(symbol_id)
            symbols_by_name.setdefault(name, []).append(symbol_id)

        adjacency: Dict[int, Set[int]] = {}

        for row in rel_rows:
            source_id = int(row["source_symbol_id"])
            target_raw = str(row["target_qualified_name"] or "")
            target_name = _normalize_relationship_target(target_raw)
            if not target_name:
                continue

            source_file_id = symbol_file_by_id.get(source_id)
            if source_file_id is None:
                continue

            candidate_ids = symbols_by_file_and_name.get((source_file_id, target_name))
            if not candidate_ids:
                global_candidates = symbols_by_name.get(target_name, [])
                # Only resolve cross-file by name when unambiguous.
                candidate_ids = global_candidates if len(global_candidates) == 1 else []

            for target_id in candidate_ids:
                if target_id == source_id:
                    continue
                adjacency.setdefault(source_id, set()).add(target_id)
                adjacency.setdefault(target_id, set()).add(source_id)

        if not adjacency:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        insert_rows: List[Tuple[int, int, int]] = []
        max_depth = min(int(max_depth), 2)

        for source_id, first_hop in adjacency.items():
            if not first_hop:
                continue
            for neighbor_id in first_hop:
                insert_rows.append((source_id, neighbor_id, 1))

            if max_depth < 2:
                continue

            second_hop: Set[int] = set()
            for neighbor_id in first_hop:
                second_hop.update(adjacency.get(neighbor_id, set()))

            second_hop.discard(source_id)
            second_hop.difference_update(first_hop)

            for neighbor_id in second_hop:
                insert_rows.append((source_id, neighbor_id, 2))

        if not insert_rows:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        try:
            cursor.executemany(
                """
                INSERT INTO graph_neighbors(
                    source_symbol_id, neighbor_symbol_id, relationship_depth
                )
                VALUES(?, ?, ?)
                """,
                insert_rows,
            )
            conn.commit()
        except sqlite3.Error:
            return


# === Worker Function for ProcessPoolExecutor ===


@@ -795,6 +961,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
                content=text,
                language=language_id,
                symbols=indexed_file.symbols,
                relationships=indexed_file.relationships,
            )

            files_count += 1

@@ -803,6 +970,9 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
        except Exception:
            continue

    if files_count > 0:
        _compute_graph_neighbors(store)

    # Get subdirectories
    ignore_dirs = {
        ".git",

@@ -821,6 +991,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
        if d.is_dir() and d.name not in ignore_dirs and not d.name.startswith(".")
    ]

    store.update_merkle_root()
    store.close()
    if global_index is not None:
        global_index.close()
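For reference, _normalize_relationship_target above reduces qualified or call-style targets to a bare local identifier. A few expected input/output pairs, derived from the implementation shown (doctest-style sketch):

# Expected behavior of _normalize_relationship_target, per the code above:
assert _normalize_relationship_target("helpers.format_name()") == "format_name"
assert _normalize_relationship_target("pkg::Widget") == "Widget"
assert _normalize_relationship_target("  Config#load ") == "load"
assert _normalize_relationship_target("") == ""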
codex-lens/src/codexlens/storage/merkle_tree.py (new file, 136 lines)

@@ -0,0 +1,136 @@
"""Merkle tree utilities for change detection.

This module provides a generic, file-system based Merkle tree implementation
that can be used to efficiently diff directory states.
"""

from __future__ import annotations

import hashlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Iterable, List, Optional


def sha256_bytes(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()


def sha256_text(text: str) -> str:
    return sha256_bytes(text.encode("utf-8", errors="ignore"))


@dataclass
class MerkleNode:
    """A Merkle node representing either a file (leaf) or directory (internal)."""

    name: str
    rel_path: str
    hash: str
    is_dir: bool
    children: Dict[str, "MerkleNode"] = field(default_factory=dict)

    def iter_files(self) -> Iterable["MerkleNode"]:
        if not self.is_dir:
            yield self
            return
        for child in self.children.values():
            yield from child.iter_files()


@dataclass
class MerkleTree:
    """Merkle tree for a directory snapshot."""

    root: MerkleNode

    @classmethod
    def build_from_directory(cls, root_dir: Path) -> "MerkleTree":
        root_dir = Path(root_dir).resolve()
        node = cls._build_node(root_dir, base=root_dir)
        return cls(root=node)

    @classmethod
    def _build_node(cls, path: Path, *, base: Path) -> MerkleNode:
        if path.is_file():
            rel = str(path.relative_to(base)).replace("\\", "/")
            return MerkleNode(
                name=path.name,
                rel_path=rel,
                hash=sha256_bytes(path.read_bytes()),
                is_dir=False,
            )

        if not path.is_dir():
            rel = str(path.relative_to(base)).replace("\\", "/")
            return MerkleNode(name=path.name, rel_path=rel, hash="", is_dir=False)

        children: Dict[str, MerkleNode] = {}
        for child in sorted(path.iterdir(), key=lambda p: p.name):
            child_node = cls._build_node(child, base=base)
            children[child_node.name] = child_node

        items = [
            f"{'d' if n.is_dir else 'f'}:{name}:{n.hash}"
            for name, n in sorted(children.items(), key=lambda kv: kv[0])
        ]
        dir_hash = sha256_text("\n".join(items))

        rel_path = "." if path == base else str(path.relative_to(base)).replace("\\", "/")
        return MerkleNode(
            name="." if path == base else path.name,
            rel_path=rel_path,
            hash=dir_hash,
            is_dir=True,
            children=children,
        )

    @staticmethod
    def find_changed_files(old: Optional["MerkleTree"], new: Optional["MerkleTree"]) -> List[str]:
        """Find changed/added/removed files between two trees.

        Returns:
            List of relative file paths (POSIX-style separators).
        """
        if old is None and new is None:
            return []
        if old is None:
            return sorted({n.rel_path for n in new.root.iter_files()})  # type: ignore[union-attr]
        if new is None:
            return sorted({n.rel_path for n in old.root.iter_files()})

        changed: set[str] = set()

        def walk(old_node: Optional[MerkleNode], new_node: Optional[MerkleNode]) -> None:
            if old_node is None and new_node is None:
                return

            if old_node is None and new_node is not None:
                changed.update(n.rel_path for n in new_node.iter_files())
                return

            if new_node is None and old_node is not None:
                changed.update(n.rel_path for n in old_node.iter_files())
                return

            assert old_node is not None and new_node is not None

            if old_node.hash == new_node.hash:
                return

            if not old_node.is_dir and not new_node.is_dir:
                changed.add(new_node.rel_path)
                return

            if old_node.is_dir != new_node.is_dir:
                changed.update(n.rel_path for n in old_node.iter_files())
                changed.update(n.rel_path for n in new_node.iter_files())
                return

            names = set(old_node.children.keys()) | set(new_node.children.keys())
            for name in names:
                walk(old_node.children.get(name), new_node.children.get(name))

        walk(old.root, new.root)
        return sorted(changed)
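A minimal usage sketch for the new module, using the API shown above (the project directory path is hypothetical): take a snapshot before and after an edit and ask for the changed relative paths.

from pathlib import Path
from codexlens.storage.merkle_tree import MerkleTree

project = Path("./example_project")  # hypothetical directory

before = MerkleTree.build_from_directory(project)
# ... files are edited, added, or removed here ...
after = MerkleTree.build_from_directory(project)

# Relative POSIX-style paths of every added, removed, or modified file.
changed = MerkleTree.find_changed_files(before, after)
print(changed)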
codex-lens/src/codexlens/storage/migrations/migration_006_enhance_relationships.py (new file)

@@ -0,0 +1,37 @@
"""
Migration 006: Ensure relationship tables and indexes exist.

This migration is intentionally idempotent. It creates the `code_relationships`
table (used for graph visualization) and its indexes if missing.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Ensuring code_relationships table exists...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS code_relationships (
            id INTEGER PRIMARY KEY,
            source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE,
            target_qualified_name TEXT NOT NULL,
            relationship_type TEXT NOT NULL,
            source_line INTEGER NOT NULL,
            target_file TEXT
        )
        """
    )

    log.info("Ensuring relationship indexes exist...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
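For orientation, each edge persisted by DirIndexStore._save_relationships becomes one row in this table. A hand-inserted example row follows; the database path and the concrete values (including the "calls" relationship type) are illustrative only, and the snippet assumes this migration has already been applied.

import sqlite3

conn = sqlite3.connect("_index.db")  # illustrative path
# One edge: symbol id 42 references helpers.format_name at line 120 of its source file.
conn.execute(
    """
    INSERT INTO code_relationships(
        source_symbol_id, target_qualified_name,
        relationship_type, source_line, target_file
    )
    VALUES(?, ?, ?, ?, ?)
    """,
    (42, "helpers.format_name", "calls", 120, None),
)
conn.commit()
conn.close()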
codex-lens/src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py (new file)

@@ -0,0 +1,47 @@
"""
Migration 007: Add precomputed graph neighbor table for search expansion.

Adds:
- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids)

This table is derived data (a cache) and is safe to rebuild at any time.
The migration is intentionally idempotent.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating graph_neighbors table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS graph_neighbors (
            source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            relationship_depth INTEGER NOT NULL,
            PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
        )
        """
    )

    log.info("Creating indexes for graph_neighbors...")
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth
        ON graph_neighbors(source_symbol_id, relationship_depth)
        """
    )
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor
        ON graph_neighbors(neighbor_symbol_id)
        """
    )
codex-lens/src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py (new file)

@@ -0,0 +1,81 @@
"""
Migration 008: Add Merkle hash tables for content-based incremental indexing.

Adds:
- merkle_hashes: per-file SHA-256 hashes (keyed by file_id)
- merkle_state: directory-level root hash (single row, id=1)

Backfills merkle_hashes using the existing `files.content` column when available.
"""

from __future__ import annotations

import hashlib
import logging
import time
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating merkle_hashes table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_hashes (
            file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
            sha256 TEXT NOT NULL,
            updated_at REAL
        )
        """
    )

    log.info("Creating merkle_state table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_state (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            root_hash TEXT,
            updated_at REAL
        )
        """
    )

    # Backfill file hashes from stored content (best-effort).
    try:
        rows = cursor.execute("SELECT id, content FROM files").fetchall()
    except Exception as exc:
        log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc)
        return

    now = time.time()
    inserts: list[tuple[int, str, float]] = []

    for row in rows:
        file_id = int(row[0])
        content = row[1]
        if content is None:
            continue
        try:
            digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest()
            inserts.append((file_id, digest, now))
        except Exception:
            continue

    if not inserts:
        return

    log.info("Backfilling %d file hashes...", len(inserts))
    cursor.executemany(
        """
        INSERT INTO merkle_hashes(file_id, sha256, updated_at)
        VALUES(?, ?, ?)
        ON CONFLICT(file_id) DO UPDATE SET
            sha256=excluded.sha256,
            updated_at=excluded.updated_at
        """,
        inserts,
    )
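A sketch of how these migration modules are applied in sequence, mirroring the user_version dispatch shown in DirIndexStore._migrate; the standalone wiring below (including the final PRAGMA user_version bump) is illustrative and not the store's actual code.

import sqlite3

def migrate(db_path: str, target_version: int = 8) -> None:
    """Apply the v7 and v8 migrations to an _index.db that is below those versions."""
    conn = sqlite3.connect(db_path)
    from_version = conn.execute("PRAGMA user_version").fetchone()[0]

    if from_version < 7:
        from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade
        upgrade(conn)
    if from_version < 8:
        from codexlens.storage.migrations.migration_008_add_merkle_hashes import upgrade
        upgrade(conn)

    conn.execute(f"PRAGMA user_version = {target_version}")
    conn.commit()
    conn.close()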