Add graph expansion and cross-encoder reranking features

- Implemented GraphExpander to enhance search results with related symbols using precomputed graph neighbors.
- Added CrossEncoderReranker for second-stage search ranking to improve result scoring (a usage sketch of both components follows the commit metadata below).
- Created migrations to establish the database tables needed for relationships, graph neighbors, and Merkle hashes.
- Developed tests for graph expansion functionality, ensuring related results are populated correctly.
- Extended performance benchmarks to cover cross-encoder reranking latency and graph expansion overhead.
- Updated schema cleanup tests to reflect changes in versioning and deprecated fields.
- Added new test cases for the Tree-sitter parser to validate relationship extraction with alias resolution.
catlog22
2025-12-31 16:58:59 +08:00
parent 4bde13e83a
commit 31a45f1f30
27 changed files with 2566 additions and 97 deletions
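Neither GraphExpander nor CrossEncoderReranker appears in the hunks shown below, so the following usage sketch is illustrative only: the module paths, constructor signatures, and the search/expand/rerank method names are assumptions, not the committed API.

# Hypothetical wiring of the two new search stages after first-stage retrieval.
from codexlens.search.graph_expander import GraphExpander        # assumed module path
from codexlens.search.reranker import CrossEncoderReranker       # assumed module path

def search_with_expansion(store, query: str, limit: int = 10):
    results = store.search(query, limit=limit)                    # first-stage retrieval (assumed API)
    results = GraphExpander(store).expand(results, max_depth=2)   # add related symbols from graph_neighbors
    return CrossEncoderReranker().rerank(query, results, top_k=limit)  # rescore query/result pairs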

View File

@@ -10,15 +10,17 @@ Each directory maintains its own _index.db with:
from __future__ import annotations
import logging
import hashlib
import re
import sqlite3
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from codexlens.config import Config
from codexlens.entities import SearchResult, Symbol
from codexlens.entities import CodeRelationship, SearchResult, Symbol
from codexlens.errors import StorageError
from codexlens.storage.global_index import GlobalSymbolIndex
@@ -60,7 +62,7 @@ class DirIndexStore:
# Schema version for migration tracking
# Increment this when schema changes require migration
SCHEMA_VERSION = 5
SCHEMA_VERSION = 8
def __init__(
self,
@@ -150,6 +152,21 @@ class DirIndexStore:
from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
upgrade(conn)
# Migration v5 -> v6: Ensure relationship tables/indexes exist
if from_version < 6:
from codexlens.storage.migrations.migration_006_enhance_relationships import upgrade
upgrade(conn)
# Migration v6 -> v7: Add graph neighbor cache for search expansion
if from_version < 7:
from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade
upgrade(conn)
# Migration v7 -> v8: Add Merkle hashes for incremental change detection
if from_version < 8:
from codexlens.storage.migrations.migration_008_add_merkle_hashes import upgrade
upgrade(conn)
def close(self) -> None:
"""Close database connection."""
with self._lock:
@@ -179,6 +196,7 @@ class DirIndexStore:
content: str,
language: str,
symbols: Optional[List[Symbol]] = None,
relationships: Optional[List[CodeRelationship]] = None,
) -> int:
"""Add or update a file in the current directory index.
@@ -188,6 +206,7 @@ class DirIndexStore:
content: File content for indexing
language: Programming language identifier
symbols: List of Symbol objects from the file
relationships: Optional list of CodeRelationship edges from this file
Returns:
Database file_id
@@ -240,6 +259,8 @@ class DirIndexStore:
symbol_rows,
)
self._save_merkle_hash(conn, file_id=file_id, content=content)
self._save_relationships(conn, file_id=file_id, relationships=relationships)
conn.commit()
self._maybe_update_global_symbols(full_path_str, symbols or [])
return file_id
@@ -248,6 +269,96 @@ class DirIndexStore:
conn.rollback()
raise StorageError(f"Failed to add file {name}: {exc}") from exc
def save_relationships(self, file_id: int, relationships: List[CodeRelationship]) -> None:
"""Save relationships for an already-indexed file.
Args:
file_id: Database file id
relationships: Relationship edges to persist
"""
if not relationships:
return
with self._lock:
conn = self._get_connection()
self._save_relationships(conn, file_id=file_id, relationships=relationships)
conn.commit()
def _save_relationships(
self,
conn: sqlite3.Connection,
file_id: int,
relationships: Optional[List[CodeRelationship]],
) -> None:
if not relationships:
return
rows = conn.execute(
"SELECT id, name FROM symbols WHERE file_id=? ORDER BY start_line, id",
(file_id,),
).fetchall()
name_to_id: Dict[str, int] = {}
for row in rows:
name = row["name"]
if name not in name_to_id:
name_to_id[name] = int(row["id"])
if not name_to_id:
return
rel_rows: List[Tuple[int, str, str, int, Optional[str]]] = []
seen: set[tuple[int, str, str, int, Optional[str]]] = set()
for rel in relationships:
source_id = name_to_id.get(rel.source_symbol)
if source_id is None:
continue
target = (rel.target_symbol or "").strip()
if not target:
continue
rel_type = rel.relationship_type.value
source_line = int(rel.source_line)
key = (source_id, target, rel_type, source_line, rel.target_file)
if key in seen:
continue
seen.add(key)
rel_rows.append((source_id, target, rel_type, source_line, rel.target_file))
if not rel_rows:
return
conn.executemany(
"""
INSERT INTO code_relationships(
source_symbol_id, target_qualified_name,
relationship_type, source_line, target_file
)
VALUES(?, ?, ?, ?, ?)
""",
rel_rows,
)
def _save_merkle_hash(self, conn: sqlite3.Connection, file_id: int, content: str) -> None:
"""Upsert a SHA-256 content hash for the given file_id (best-effort)."""
try:
digest = hashlib.sha256(content.encode("utf-8", errors="ignore")).hexdigest()
now = time.time()
conn.execute(
"""
INSERT INTO merkle_hashes(file_id, sha256, updated_at)
VALUES(?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
sha256=excluded.sha256,
updated_at=excluded.updated_at
""",
(file_id, digest, now),
)
except sqlite3.Error:
return
def add_files_batch(
self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]]
) -> int:
@@ -312,6 +423,8 @@ class DirIndexStore:
symbol_rows,
)
self._save_merkle_hash(conn, file_id=file_id, content=content)
conn.commit()
return count
@@ -395,9 +508,13 @@ class DirIndexStore:
return float(row["mtime"]) if row and row["mtime"] else None
def needs_reindex(self, full_path: str | Path) -> bool:
"""Check if a file needs reindexing based on mtime comparison.
"""Check if a file needs reindexing.
Uses 1ms tolerance to handle filesystem timestamp precision variations.
Default behavior uses mtime comparison (with 1ms tolerance).
When `Config.enable_merkle_detection` is enabled and Merkle metadata is
available, uses SHA-256 content hash comparison (with mtime as a fast
path to avoid hashing unchanged files).
Args:
full_path: Complete source file path
@@ -415,16 +532,154 @@ class DirIndexStore:
except OSError:
return False # Can't read file stats, skip
# Get stored mtime from database
stored_mtime = self.get_file_mtime(full_path_obj)
MTIME_TOLERANCE = 0.001
# File not in index, needs indexing
if stored_mtime is None:
# Fast path: mtime-only mode (default / backward-compatible)
if self._config is None or not getattr(self._config, "enable_merkle_detection", False):
stored_mtime = self.get_file_mtime(full_path_obj)
if stored_mtime is None:
return True
return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
full_path_str = str(full_path_obj)
# Hash-based change detection (best-effort, falls back to mtime when metadata missing)
with self._lock:
conn = self._get_connection()
try:
row = conn.execute(
"""
SELECT f.id AS file_id, f.mtime AS mtime, mh.sha256 AS sha256
FROM files f
LEFT JOIN merkle_hashes mh ON mh.file_id = f.id
WHERE f.full_path=?
""",
(full_path_str,),
).fetchone()
except sqlite3.Error:
row = None
if row is None:
return True
# Compare with 1ms tolerance for floating point precision
MTIME_TOLERANCE = 0.001
return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
stored_mtime = float(row["mtime"]) if row["mtime"] else None
stored_hash = row["sha256"] if row["sha256"] else None
file_id = int(row["file_id"])
# Missing Merkle data: fall back to mtime
if stored_hash is None:
if stored_mtime is None:
return True
return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
# If mtime is unchanged within tolerance, assume unchanged without hashing.
if stored_mtime is not None and abs(current_mtime - stored_mtime) <= MTIME_TOLERANCE:
return False
try:
current_text = full_path_obj.read_text(encoding="utf-8", errors="ignore")
current_hash = hashlib.sha256(current_text.encode("utf-8", errors="ignore")).hexdigest()
except OSError:
return False
if current_hash == stored_hash:
# Content unchanged, but mtime drifted: update stored mtime to avoid repeated hashing.
with self._lock:
conn = self._get_connection()
conn.execute("UPDATE files SET mtime=? WHERE id=?", (current_mtime, file_id))
conn.commit()
return False
return True
def get_merkle_root_hash(self) -> Optional[str]:
"""Return the stored Merkle root hash for this directory index (if present)."""
with self._lock:
conn = self._get_connection()
try:
row = conn.execute(
"SELECT root_hash FROM merkle_state WHERE id=1"
).fetchone()
except sqlite3.Error:
return None
return row["root_hash"] if row and row["root_hash"] else None
def update_merkle_root(self) -> Optional[str]:
"""Compute and persist the Merkle root hash for this directory index.
The root hash includes:
- Direct file hashes from `merkle_hashes`
- Direct subdirectory root hashes (read from child `_index.db` files)
"""
if self._config is None or not getattr(self._config, "enable_merkle_detection", False):
return None
with self._lock:
conn = self._get_connection()
try:
file_rows = conn.execute(
"""
SELECT f.name AS name, mh.sha256 AS sha256
FROM files f
LEFT JOIN merkle_hashes mh ON mh.file_id = f.id
ORDER BY f.name
"""
).fetchall()
subdir_rows = conn.execute(
"SELECT name, index_path FROM subdirs ORDER BY name"
).fetchall()
except sqlite3.Error as exc:
self.logger.debug("Failed to compute merkle root: %s", exc)
return None
items: List[str] = []
for row in file_rows:
name = row["name"]
sha = (row["sha256"] or "").strip()
items.append(f"f:{name}:{sha}")
def read_child_root(index_path: str) -> str:
try:
with sqlite3.connect(index_path) as child_conn:
child_conn.row_factory = sqlite3.Row
child_row = child_conn.execute(
"SELECT root_hash FROM merkle_state WHERE id=1"
).fetchone()
return child_row["root_hash"] if child_row and child_row["root_hash"] else ""
except Exception:
return ""
for row in subdir_rows:
name = row["name"]
index_path = row["index_path"]
child_hash = read_child_root(index_path) if index_path else ""
items.append(f"d:{name}:{child_hash}")
root_hash = hashlib.sha256("\n".join(items).encode("utf-8", errors="ignore")).hexdigest()
now = time.time()
with self._lock:
conn = self._get_connection()
try:
conn.execute(
"""
INSERT INTO merkle_state(id, root_hash, updated_at)
VALUES(1, ?, ?)
ON CONFLICT(id) DO UPDATE SET
root_hash=excluded.root_hash,
updated_at=excluded.updated_at
""",
(root_hash, now),
)
conn.commit()
except sqlite3.Error as exc:
self.logger.debug("Failed to persist merkle root: %s", exc)
return None
return root_hash
def add_file_incremental(
self,
@@ -433,6 +688,7 @@ class DirIndexStore:
content: str,
language: str,
symbols: Optional[List[Symbol]] = None,
relationships: Optional[List[CodeRelationship]] = None,
) -> Optional[int]:
"""Add or update a file only if it has changed (incremental indexing).
@@ -444,6 +700,7 @@ class DirIndexStore:
content: File content for indexing
language: Programming language identifier
symbols: List of Symbol objects from the file
relationships: Optional list of CodeRelationship edges from this file
Returns:
Database file_id if indexed, None if skipped (unchanged)
@@ -456,7 +713,7 @@ class DirIndexStore:
return None # Skip unchanged file
# File changed or new, perform full indexing
return self.add_file(name, full_path, content, language, symbols)
return self.add_file(name, full_path, content, language, symbols, relationships)
def cleanup_deleted_files(self, source_dir: Path) -> int:
"""Remove indexed files that no longer exist in the source directory.
@@ -1767,6 +2024,39 @@ class DirIndexStore:
"""
)
# Precomputed graph neighbors cache for search expansion (v7)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS graph_neighbors (
source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
relationship_depth INTEGER NOT NULL,
PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
)
"""
)
# Merkle hashes for incremental change detection (v8)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_hashes (
file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
sha256 TEXT NOT NULL,
updated_at REAL
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_state (
id INTEGER PRIMARY KEY CHECK (id = 1),
root_hash TEXT,
updated_at REAL
)
"""
)
# Indexes (v5: removed idx_symbols_type)
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
@@ -1780,6 +2070,14 @@ class DirIndexStore:
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth "
"ON graph_neighbors(source_symbol_id, relationship_depth)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor "
"ON graph_neighbors(neighbor_symbol_id)"
)
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to create schema: {exc}") from exc

View File

@@ -8,11 +8,13 @@ from __future__ import annotations
import logging
import os
import re
import sqlite3
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple
from codexlens.config import Config
from codexlens.parsers.factory import ParserFactory
@@ -247,6 +249,9 @@ class IndexTreeBuilder:
try:
with DirIndexStore(result.index_path, config=self.config, global_index=global_index) as store:
deleted_count = store.cleanup_deleted_files(result.source_path)
if deleted_count > 0:
_compute_graph_neighbors(store, logger=self.logger)
store.update_merkle_root()
total_deleted += deleted_count
if deleted_count > 0:
self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)
@@ -575,6 +580,7 @@ class IndexTreeBuilder:
content=text,
language=language_id,
symbols=indexed_file.symbols,
relationships=indexed_file.relationships,
)
files_count += 1
@@ -584,6 +590,9 @@ class IndexTreeBuilder:
self.logger.debug("Failed to index %s: %s", file_path, exc)
continue
if files_count > 0:
_compute_graph_neighbors(store, logger=self.logger)
# Get list of subdirectories
subdirs = [
d.name
@@ -593,6 +602,7 @@ class IndexTreeBuilder:
and not d.name.startswith(".")
]
store.update_merkle_root()
store.close()
if global_index is not None:
global_index.close()
@@ -654,31 +664,29 @@ class IndexTreeBuilder:
parent_index_db = self.mapper.source_to_index_db(parent_path)
try:
store = DirIndexStore(parent_index_db)
store.initialize()
with DirIndexStore(parent_index_db, config=self.config) as store:
for result in all_results:
# Only register direct children (parent is one level up)
if result.source_path.parent != parent_path:
continue
for result in all_results:
# Only register direct children (parent is one level up)
if result.source_path.parent != parent_path:
continue
if result.error:
continue
if result.error:
continue
# Register subdirectory link
store.register_subdir(
name=result.source_path.name,
index_path=result.index_path,
files_count=result.files_count,
direct_files=result.files_count,
)
self.logger.debug(
"Linked %s to parent %s",
result.source_path.name,
parent_path,
)
# Register subdirectory link
store.register_subdir(
name=result.source_path.name,
index_path=result.index_path,
files_count=result.files_count,
direct_files=result.files_count,
)
self.logger.debug(
"Linked %s to parent %s",
result.source_path.name,
parent_path,
)
store.close()
store.update_merkle_root()
except Exception as exc:
self.logger.error(
@@ -726,6 +734,164 @@ class IndexTreeBuilder:
return files
def _normalize_relationship_target(target: str) -> str:
"""Best-effort normalization of a relationship target into a local symbol name."""
target = (target or "").strip()
if not target:
return ""
# Drop trailing call parentheses when present (e.g., "foo()" -> "foo").
if target.endswith("()"):
target = target[:-2]
# Keep the leaf identifier for common qualified formats.
for sep in ("::", ".", "#"):
if sep in target:
target = target.split(sep)[-1]
# Strip non-identifier suffix/prefix noise.
target = re.sub(r"^[^A-Za-z0-9_]+", "", target)
target = re.sub(r"[^A-Za-z0-9_]+$", "", target)
return target
def _compute_graph_neighbors(
store: DirIndexStore,
*,
max_depth: int = 2,
logger: Optional[logging.Logger] = None,
) -> None:
"""Compute and persist N-hop neighbors for all symbols in a directory index."""
if max_depth <= 0:
return
log = logger or logging.getLogger(__name__)
with store._lock:
conn = store._get_connection()
conn.row_factory = sqlite3.Row
# Ensure schema exists even for older databases pinned to the same user_version.
try:
from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade
upgrade(conn)
except Exception as exc:
log.debug("Graph neighbor schema ensure failed: %s", exc)
cursor = conn.cursor()
try:
cursor.execute("DELETE FROM graph_neighbors")
except sqlite3.Error:
# Table missing or schema mismatch; skip gracefully.
return
try:
symbol_rows = cursor.execute(
"SELECT id, file_id, name FROM symbols"
).fetchall()
rel_rows = cursor.execute(
"SELECT source_symbol_id, target_qualified_name FROM code_relationships"
).fetchall()
except sqlite3.Error:
return
if not symbol_rows or not rel_rows:
try:
conn.commit()
except sqlite3.Error:
pass
return
symbol_file_by_id: Dict[int, int] = {}
symbols_by_file_and_name: Dict[Tuple[int, str], List[int]] = {}
symbols_by_name: Dict[str, List[int]] = {}
for row in symbol_rows:
symbol_id = int(row["id"])
file_id = int(row["file_id"])
name = str(row["name"])
symbol_file_by_id[symbol_id] = file_id
symbols_by_file_and_name.setdefault((file_id, name), []).append(symbol_id)
symbols_by_name.setdefault(name, []).append(symbol_id)
adjacency: Dict[int, Set[int]] = {}
for row in rel_rows:
source_id = int(row["source_symbol_id"])
target_raw = str(row["target_qualified_name"] or "")
target_name = _normalize_relationship_target(target_raw)
if not target_name:
continue
source_file_id = symbol_file_by_id.get(source_id)
if source_file_id is None:
continue
candidate_ids = symbols_by_file_and_name.get((source_file_id, target_name))
if not candidate_ids:
global_candidates = symbols_by_name.get(target_name, [])
# Only resolve cross-file by name when unambiguous.
candidate_ids = global_candidates if len(global_candidates) == 1 else []
for target_id in candidate_ids:
if target_id == source_id:
continue
adjacency.setdefault(source_id, set()).add(target_id)
adjacency.setdefault(target_id, set()).add(source_id)
if not adjacency:
try:
conn.commit()
except sqlite3.Error:
pass
return
insert_rows: List[Tuple[int, int, int]] = []
max_depth = min(int(max_depth), 2)
for source_id, first_hop in adjacency.items():
if not first_hop:
continue
for neighbor_id in first_hop:
insert_rows.append((source_id, neighbor_id, 1))
if max_depth < 2:
continue
second_hop: Set[int] = set()
for neighbor_id in first_hop:
second_hop.update(adjacency.get(neighbor_id, set()))
second_hop.discard(source_id)
second_hop.difference_update(first_hop)
for neighbor_id in second_hop:
insert_rows.append((source_id, neighbor_id, 2))
if not insert_rows:
try:
conn.commit()
except sqlite3.Error:
pass
return
try:
cursor.executemany(
"""
INSERT INTO graph_neighbors(
source_symbol_id, neighbor_symbol_id, relationship_depth
)
VALUES(?, ?, ?)
""",
insert_rows,
)
conn.commit()
except sqlite3.Error:
return
# === Worker Function for ProcessPoolExecutor ===
@@ -795,6 +961,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
content=text,
language=language_id,
symbols=indexed_file.symbols,
relationships=indexed_file.relationships,
)
files_count += 1
@@ -803,6 +970,9 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
except Exception:
continue
if files_count > 0:
_compute_graph_neighbors(store)
# Get subdirectories
ignore_dirs = {
".git",
@@ -821,6 +991,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
if d.is_dir() and d.name not in ignore_dirs and not d.name.startswith(".")
]
store.update_merkle_root()
store.close()
if global_index is not None:
global_index.close()
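Once `_compute_graph_neighbors` has populated the cache, expanding a search hit reduces to a single indexed lookup. A minimal sketch of the kind of query a consumer such as GraphExpander could run; only the table and index definitions come from this commit, the query shape is an assumption:

import sqlite3
from typing import List

def neighbor_symbol_ids(conn: sqlite3.Connection, symbol_id: int, max_depth: int = 2) -> List[int]:
    # Served by idx_graph_neighbors_source_depth: all cached neighbors of a
    # symbol up to max_depth hops, nearest hops first.
    rows = conn.execute(
        """
        SELECT neighbor_symbol_id
        FROM graph_neighbors
        WHERE source_symbol_id = ? AND relationship_depth <= ?
        ORDER BY relationship_depth
        """,
        (symbol_id, max_depth),
    ).fetchall()
    return [int(row[0]) for row in rows]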

View File

@@ -0,0 +1,136 @@
"""Merkle tree utilities for change detection.
This module provides a generic, filesystem-based Merkle tree implementation
that can be used to efficiently diff directory states.
"""
from __future__ import annotations
import hashlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Iterable, List, Optional
def sha256_bytes(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
def sha256_text(text: str) -> str:
return sha256_bytes(text.encode("utf-8", errors="ignore"))
@dataclass
class MerkleNode:
"""A Merkle node representing either a file (leaf) or directory (internal)."""
name: str
rel_path: str
hash: str
is_dir: bool
children: Dict[str, "MerkleNode"] = field(default_factory=dict)
def iter_files(self) -> Iterable["MerkleNode"]:
if not self.is_dir:
yield self
return
for child in self.children.values():
yield from child.iter_files()
@dataclass
class MerkleTree:
"""Merkle tree for a directory snapshot."""
root: MerkleNode
@classmethod
def build_from_directory(cls, root_dir: Path) -> "MerkleTree":
root_dir = Path(root_dir).resolve()
node = cls._build_node(root_dir, base=root_dir)
return cls(root=node)
@classmethod
def _build_node(cls, path: Path, *, base: Path) -> MerkleNode:
if path.is_file():
rel = str(path.relative_to(base)).replace("\\", "/")
return MerkleNode(
name=path.name,
rel_path=rel,
hash=sha256_bytes(path.read_bytes()),
is_dir=False,
)
if not path.is_dir():
rel = str(path.relative_to(base)).replace("\\", "/")
return MerkleNode(name=path.name, rel_path=rel, hash="", is_dir=False)
children: Dict[str, MerkleNode] = {}
for child in sorted(path.iterdir(), key=lambda p: p.name):
child_node = cls._build_node(child, base=base)
children[child_node.name] = child_node
items = [
f"{'d' if n.is_dir else 'f'}:{name}:{n.hash}"
for name, n in sorted(children.items(), key=lambda kv: kv[0])
]
dir_hash = sha256_text("\n".join(items))
rel_path = "." if path == base else str(path.relative_to(base)).replace("\\", "/")
return MerkleNode(
name="." if path == base else path.name,
rel_path=rel_path,
hash=dir_hash,
is_dir=True,
children=children,
)
@staticmethod
def find_changed_files(old: Optional["MerkleTree"], new: Optional["MerkleTree"]) -> List[str]:
"""Find changed/added/removed files between two trees.
Returns:
List of relative file paths (POSIX-style separators).
"""
if old is None and new is None:
return []
if old is None:
return sorted({n.rel_path for n in new.root.iter_files()}) # type: ignore[union-attr]
if new is None:
return sorted({n.rel_path for n in old.root.iter_files()})
changed: set[str] = set()
def walk(old_node: Optional[MerkleNode], new_node: Optional[MerkleNode]) -> None:
if old_node is None and new_node is None:
return
if old_node is None and new_node is not None:
changed.update(n.rel_path for n in new_node.iter_files())
return
if new_node is None and old_node is not None:
changed.update(n.rel_path for n in old_node.iter_files())
return
assert old_node is not None and new_node is not None
if old_node.hash == new_node.hash:
return
if not old_node.is_dir and not new_node.is_dir:
changed.add(new_node.rel_path)
return
if old_node.is_dir != new_node.is_dir:
changed.update(n.rel_path for n in old_node.iter_files())
changed.update(n.rel_path for n in new_node.iter_files())
return
names = set(old_node.children.keys()) | set(new_node.children.keys())
for name in names:
walk(old_node.children.get(name), new_node.children.get(name))
walk(old.root, new.root)
return sorted(changed)
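A short usage sketch of the utilities above, comparing two snapshots of the same project directory (the import path is an assumption for this new module):

from pathlib import Path

from codexlens.storage.merkle import MerkleTree  # assumed module path

project = Path("/path/to/project")

before = MerkleTree.build_from_directory(project)   # snapshot 1
# ... files are edited, added, or deleted under `project` ...
after = MerkleTree.build_from_directory(project)    # snapshot 2

# Subtrees whose hashes match are skipped entirely; only differing files are reported.
for rel_path in MerkleTree.find_changed_files(before, after):
    print("changed:", rel_path)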

View File

@@ -0,0 +1,37 @@
"""
Migration 006: Ensure relationship tables and indexes exist.
This migration is intentionally idempotent. It creates the `code_relationships`
table (used for graph visualization) and its indexes if missing.
"""
from __future__ import annotations
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Ensuring code_relationships table exists...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS code_relationships (
id INTEGER PRIMARY KEY,
source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE,
target_qualified_name TEXT NOT NULL,
relationship_type TEXT NOT NULL,
source_line INTEGER NOT NULL,
target_file TEXT
)
"""
)
log.info("Ensuring relationship indexes exist...")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")

View File

@@ -0,0 +1,47 @@
"""
Migration 007: Add precomputed graph neighbor table for search expansion.
Adds:
- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids)
This table is derived data (a cache) and is safe to rebuild at any time.
The migration is intentionally idempotent.
"""
from __future__ import annotations
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Creating graph_neighbors table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS graph_neighbors (
source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
relationship_depth INTEGER NOT NULL,
PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
)
"""
)
log.info("Creating indexes for graph_neighbors...")
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth
ON graph_neighbors(source_symbol_id, relationship_depth)
"""
)
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor
ON graph_neighbors(neighbor_symbol_id)
"""
)

View File

@@ -0,0 +1,81 @@
"""
Migration 008: Add Merkle hash tables for content-based incremental indexing.
Adds:
- merkle_hashes: per-file SHA-256 hashes (keyed by file_id)
- merkle_state: directory-level root hash (single row, id=1)
Backfills merkle_hashes using the existing `files.content` column when available.
"""
from __future__ import annotations
import hashlib
import logging
import time
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
cursor = db_conn.cursor()
log.info("Creating merkle_hashes table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_hashes (
file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
sha256 TEXT NOT NULL,
updated_at REAL
)
"""
)
log.info("Creating merkle_state table...")
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS merkle_state (
id INTEGER PRIMARY KEY CHECK (id = 1),
root_hash TEXT,
updated_at REAL
)
"""
)
# Backfill file hashes from stored content (best-effort).
try:
rows = cursor.execute("SELECT id, content FROM files").fetchall()
except Exception as exc:
log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc)
return
now = time.time()
inserts: list[tuple[int, str, float]] = []
for row in rows:
file_id = int(row[0])
content = row[1]
if content is None:
continue
try:
digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest()
inserts.append((file_id, digest, now))
except Exception:
continue
if not inserts:
return
log.info("Backfilling %d file hashes...", len(inserts))
cursor.executemany(
"""
INSERT INTO merkle_hashes(file_id, sha256, updated_at)
VALUES(?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
sha256=excluded.sha256,
updated_at=excluded.updated_at
""",
inserts,
)
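Inside the store these upgrades run automatically through the `from_version` chain shown earlier; for a standalone database, for example in a test fixture, the same module can be applied directly. A minimal sketch, assuming an existing `_index.db` whose `files` table still carries a `content` column:

import sqlite3

from codexlens.storage.migrations.migration_008_add_merkle_hashes import upgrade

conn = sqlite3.connect("_index.db")
try:
    upgrade(conn)   # idempotent: CREATE TABLE IF NOT EXISTS plus best-effort hash backfill
    conn.commit()   # the upgrade body above does not commit on its own
finally:
    conn.close()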