Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-11 02:33:51 +08:00
Add graph expansion and cross-encoder reranking features
- Implemented GraphExpander to enhance search results with related symbols using precomputed neighbors.
- Added CrossEncoderReranker for second-stage search ranking, allowing for improved result scoring.
- Created migrations to establish necessary database tables for relationships and graph neighbors.
- Developed tests for graph expansion functionality, ensuring related results are populated correctly.
- Enhanced performance benchmarks for cross-encoder reranking latency and graph expansion overhead.
- Updated schema cleanup tests to reflect changes in versioning and deprecated fields.
- Added new test cases for Treesitter parser to validate relationship extraction with alias resolution.
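The commit message names two new search-stage components, GraphExpander and CrossEncoderReranker, whose implementations are not shown in the hunks below. The following is a minimal sketch of how such a two-stage pipeline typically fits together; the function names and the toy scorer are illustrative assumptions, not the actual API added by this commit.

from typing import Callable, Dict, List

def expand_with_neighbors(hits: List[str], neighbors: Dict[str, List[str]], max_extra: int = 5) -> List[str]:
    """Stage 1b: append related symbols (precomputed neighbors) to the first-stage hits."""
    expanded = list(hits)
    for symbol in hits:
        for related in neighbors.get(symbol, [])[:max_extra]:
            if related not in expanded:
                expanded.append(related)
    return expanded

def rerank(query: str, candidates: List[str], score: Callable[[str, str], float]) -> List[str]:
    """Stage 2: score each (query, candidate) pair and sort descending, as a cross-encoder would."""
    return sorted(candidates, key=lambda c: score(query, c), reverse=True)

def overlap_score(query: str, candidate: str) -> float:
    # Toy stand-in for a cross-encoder model score.
    return sum(token in candidate.lower() for token in query.lower().split())

hits = ["DirIndexStore.add_file"]
neighbors = {"DirIndexStore.add_file": ["DirIndexStore._save_relationships"]}
ranked = rerank("save relationships", expand_with_neighbors(hits, neighbors), overlap_score)
print(ranked)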
codex-lens/src/codexlens/storage/dir_index.py

@@ -10,15 +10,17 @@ Each directory maintains its own _index.db with:
from __future__ import annotations

import logging
import hashlib
import re
import sqlite3
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from codexlens.config import Config
from codexlens.entities import SearchResult, Symbol
from codexlens.entities import CodeRelationship, SearchResult, Symbol
from codexlens.errors import StorageError
from codexlens.storage.global_index import GlobalSymbolIndex

@@ -60,7 +62,7 @@ class DirIndexStore:

    # Schema version for migration tracking
    # Increment this when schema changes require migration
    SCHEMA_VERSION = 5
    SCHEMA_VERSION = 8

    def __init__(
        self,

@@ -150,6 +152,21 @@ class DirIndexStore:
            from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
            upgrade(conn)

        # Migration v5 -> v6: Ensure relationship tables/indexes exist
        if from_version < 6:
            from codexlens.storage.migrations.migration_006_enhance_relationships import upgrade
            upgrade(conn)

        # Migration v6 -> v7: Add graph neighbor cache for search expansion
        if from_version < 7:
            from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade
            upgrade(conn)

        # Migration v7 -> v8: Add Merkle hashes for incremental change detection
        if from_version < 8:
            from codexlens.storage.migrations.migration_008_add_merkle_hashes import upgrade
            upgrade(conn)

    def close(self) -> None:
        """Close database connection."""
        with self._lock:
@@ -179,6 +196,7 @@ class DirIndexStore:
        content: str,
        language: str,
        symbols: Optional[List[Symbol]] = None,
        relationships: Optional[List[CodeRelationship]] = None,
    ) -> int:
        """Add or update a file in the current directory index.

@@ -188,6 +206,7 @@ class DirIndexStore:
            content: File content for indexing
            language: Programming language identifier
            symbols: List of Symbol objects from the file
            relationships: Optional list of CodeRelationship edges from this file

        Returns:
            Database file_id

@@ -240,6 +259,8 @@ class DirIndexStore:
                    symbol_rows,
                )

                self._save_merkle_hash(conn, file_id=file_id, content=content)
                self._save_relationships(conn, file_id=file_id, relationships=relationships)
                conn.commit()
                self._maybe_update_global_symbols(full_path_str, symbols or [])
                return file_id

@@ -248,6 +269,96 @@ class DirIndexStore:
                conn.rollback()
                raise StorageError(f"Failed to add file {name}: {exc}") from exc

    def save_relationships(self, file_id: int, relationships: List[CodeRelationship]) -> None:
        """Save relationships for an already-indexed file.

        Args:
            file_id: Database file id
            relationships: Relationship edges to persist
        """
        if not relationships:
            return
        with self._lock:
            conn = self._get_connection()
            self._save_relationships(conn, file_id=file_id, relationships=relationships)
            conn.commit()

    def _save_relationships(
        self,
        conn: sqlite3.Connection,
        file_id: int,
        relationships: Optional[List[CodeRelationship]],
    ) -> None:
        if not relationships:
            return

        rows = conn.execute(
            "SELECT id, name FROM symbols WHERE file_id=? ORDER BY start_line, id",
            (file_id,),
        ).fetchall()

        name_to_id: Dict[str, int] = {}
        for row in rows:
            name = row["name"]
            if name not in name_to_id:
                name_to_id[name] = int(row["id"])

        if not name_to_id:
            return

        rel_rows: List[Tuple[int, str, str, int, Optional[str]]] = []
        seen: set[tuple[int, str, str, int, Optional[str]]] = set()

        for rel in relationships:
            source_id = name_to_id.get(rel.source_symbol)
            if source_id is None:
                continue

            target = (rel.target_symbol or "").strip()
            if not target:
                continue

            rel_type = rel.relationship_type.value
            source_line = int(rel.source_line)
            key = (source_id, target, rel_type, source_line, rel.target_file)
            if key in seen:
                continue
            seen.add(key)

            rel_rows.append((source_id, target, rel_type, source_line, rel.target_file))

        if not rel_rows:
            return

        conn.executemany(
            """
            INSERT INTO code_relationships(
                source_symbol_id, target_qualified_name,
                relationship_type, source_line, target_file
            )
            VALUES(?, ?, ?, ?, ?)
            """,
            rel_rows,
        )

    def _save_merkle_hash(self, conn: sqlite3.Connection, file_id: int, content: str) -> None:
        """Upsert a SHA-256 content hash for the given file_id (best-effort)."""
        try:
            digest = hashlib.sha256(content.encode("utf-8", errors="ignore")).hexdigest()
            now = time.time()
            conn.execute(
                """
                INSERT INTO merkle_hashes(file_id, sha256, updated_at)
                VALUES(?, ?, ?)
                ON CONFLICT(file_id) DO UPDATE SET
                    sha256=excluded.sha256,
                    updated_at=excluded.updated_at
                """,
                (file_id, digest, now),
            )
        except sqlite3.Error:
            return

    def add_files_batch(
        self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]]
    ) -> int:

@@ -312,6 +423,8 @@ class DirIndexStore:
                        symbol_rows,
                    )

                    self._save_merkle_hash(conn, file_id=file_id, content=content)

                conn.commit()
                return count
@@ -395,9 +508,13 @@ class DirIndexStore:
        return float(row["mtime"]) if row and row["mtime"] else None

    def needs_reindex(self, full_path: str | Path) -> bool:
        """Check if a file needs reindexing based on mtime comparison.
        """Check if a file needs reindexing.

        Uses 1ms tolerance to handle filesystem timestamp precision variations.
        Default behavior uses mtime comparison (with 1ms tolerance).

        When `Config.enable_merkle_detection` is enabled and Merkle metadata is
        available, uses SHA-256 content hash comparison (with mtime as a fast
        path to avoid hashing unchanged files).

        Args:
            full_path: Complete source file path

@@ -415,16 +532,154 @@ class DirIndexStore:
        except OSError:
            return False  # Can't read file stats, skip

        # Get stored mtime from database
        stored_mtime = self.get_file_mtime(full_path_obj)
        MTIME_TOLERANCE = 0.001

        # File not in index, needs indexing
        if stored_mtime is None:
        # Fast path: mtime-only mode (default / backward-compatible)
        if self._config is None or not getattr(self._config, "enable_merkle_detection", False):
            stored_mtime = self.get_file_mtime(full_path_obj)
            if stored_mtime is None:
                return True
            return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE

        full_path_str = str(full_path_obj)

        # Hash-based change detection (best-effort, falls back to mtime when metadata missing)
        with self._lock:
            conn = self._get_connection()
            try:
                row = conn.execute(
                    """
                    SELECT f.id AS file_id, f.mtime AS mtime, mh.sha256 AS sha256
                    FROM files f
                    LEFT JOIN merkle_hashes mh ON mh.file_id = f.id
                    WHERE f.full_path=?
                    """,
                    (full_path_str,),
                ).fetchone()
            except sqlite3.Error:
                row = None

        if row is None:
            return True

        # Compare with 1ms tolerance for floating point precision
        MTIME_TOLERANCE = 0.001
        return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
        stored_mtime = float(row["mtime"]) if row["mtime"] else None
        stored_hash = row["sha256"] if row["sha256"] else None
        file_id = int(row["file_id"])

        # Missing Merkle data: fall back to mtime
        if stored_hash is None:
            if stored_mtime is None:
                return True
            return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE

        # If mtime is unchanged within tolerance, assume unchanged without hashing.
        if stored_mtime is not None and abs(current_mtime - stored_mtime) <= MTIME_TOLERANCE:
            return False

        try:
            current_text = full_path_obj.read_text(encoding="utf-8", errors="ignore")
            current_hash = hashlib.sha256(current_text.encode("utf-8", errors="ignore")).hexdigest()
        except OSError:
            return False

        if current_hash == stored_hash:
            # Content unchanged, but mtime drifted: update stored mtime to avoid repeated hashing.
            with self._lock:
                conn = self._get_connection()
                conn.execute("UPDATE files SET mtime=? WHERE id=?", (current_mtime, file_id))
                conn.commit()
            return False

        return True

    def get_merkle_root_hash(self) -> Optional[str]:
        """Return the stored Merkle root hash for this directory index (if present)."""
        with self._lock:
            conn = self._get_connection()
            try:
                row = conn.execute(
                    "SELECT root_hash FROM merkle_state WHERE id=1"
                ).fetchone()
            except sqlite3.Error:
                return None

            return row["root_hash"] if row and row["root_hash"] else None

    def update_merkle_root(self) -> Optional[str]:
        """Compute and persist the Merkle root hash for this directory index.

        The root hash includes:
        - Direct file hashes from `merkle_hashes`
        - Direct subdirectory root hashes (read from child `_index.db` files)
        """
        if self._config is None or not getattr(self._config, "enable_merkle_detection", False):
            return None

        with self._lock:
            conn = self._get_connection()
            try:
                file_rows = conn.execute(
                    """
                    SELECT f.name AS name, mh.sha256 AS sha256
                    FROM files f
                    LEFT JOIN merkle_hashes mh ON mh.file_id = f.id
                    ORDER BY f.name
                    """
                ).fetchall()

                subdir_rows = conn.execute(
                    "SELECT name, index_path FROM subdirs ORDER BY name"
                ).fetchall()
            except sqlite3.Error as exc:
                self.logger.debug("Failed to compute merkle root: %s", exc)
                return None

        items: List[str] = []

        for row in file_rows:
            name = row["name"]
            sha = (row["sha256"] or "").strip()
            items.append(f"f:{name}:{sha}")

        def read_child_root(index_path: str) -> str:
            try:
                with sqlite3.connect(index_path) as child_conn:
                    child_conn.row_factory = sqlite3.Row
                    child_row = child_conn.execute(
                        "SELECT root_hash FROM merkle_state WHERE id=1"
                    ).fetchone()
                    return child_row["root_hash"] if child_row and child_row["root_hash"] else ""
            except Exception:
                return ""

        for row in subdir_rows:
            name = row["name"]
            index_path = row["index_path"]
            child_hash = read_child_root(index_path) if index_path else ""
            items.append(f"d:{name}:{child_hash}")

        root_hash = hashlib.sha256("\n".join(items).encode("utf-8", errors="ignore")).hexdigest()
        now = time.time()

        with self._lock:
            conn = self._get_connection()
            try:
                conn.execute(
                    """
                    INSERT INTO merkle_state(id, root_hash, updated_at)
                    VALUES(1, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                        root_hash=excluded.root_hash,
                        updated_at=excluded.updated_at
                    """,
                    (root_hash, now),
                )
                conn.commit()
            except sqlite3.Error as exc:
                self.logger.debug("Failed to persist merkle root: %s", exc)
                return None

        return root_hash

    def add_file_incremental(
        self,

@@ -433,6 +688,7 @@ class DirIndexStore:
        content: str,
        language: str,
        symbols: Optional[List[Symbol]] = None,
        relationships: Optional[List[CodeRelationship]] = None,
    ) -> Optional[int]:
        """Add or update a file only if it has changed (incremental indexing).

@@ -444,6 +700,7 @@ class DirIndexStore:
            content: File content for indexing
            language: Programming language identifier
            symbols: List of Symbol objects from the file
            relationships: Optional list of CodeRelationship edges from this file

        Returns:
            Database file_id if indexed, None if skipped (unchanged)

@@ -456,7 +713,7 @@ class DirIndexStore:
            return None  # Skip unchanged file

        # File changed or new, perform full indexing
        return self.add_file(name, full_path, content, language, symbols)
        return self.add_file(name, full_path, content, language, symbols, relationships)

    def cleanup_deleted_files(self, source_dir: Path) -> int:
        """Remove indexed files that no longer exist in the source directory.

@@ -1767,6 +2024,39 @@ class DirIndexStore:
                """
            )

            # Precomputed graph neighbors cache for search expansion (v7)
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS graph_neighbors (
                    source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
                    neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
                    relationship_depth INTEGER NOT NULL,
                    PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
                )
                """
            )

            # Merkle hashes for incremental change detection (v8)
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS merkle_hashes (
                    file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
                    sha256 TEXT NOT NULL,
                    updated_at REAL
                )
                """
            )

            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS merkle_state (
                    id INTEGER PRIMARY KEY CHECK (id = 1),
                    root_hash TEXT,
                    updated_at REAL
                )
                """
            )

            # Indexes (v5: removed idx_symbols_type)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")

@@ -1780,6 +2070,14 @@ class DirIndexStore:
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth "
                "ON graph_neighbors(source_symbol_id, relationship_depth)"
            )
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor "
                "ON graph_neighbors(neighbor_symbol_id)"
            )

        except sqlite3.DatabaseError as exc:
            raise StorageError(f"Failed to create schema: {exc}") from exc
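The graph_neighbors table created above caches 1- and 2-hop neighbors per symbol, and idx_graph_neighbors_source_depth is shaped for lookups by source symbol and depth. A sketch of the query a search-time expander could run against this cache follows; the helper name and query shape are assumptions, since the GraphExpander implementation is not part of this hunk.

import sqlite3

def fetch_neighbors(db_path: str, symbol_id: int, max_depth: int = 2) -> list[tuple[int, int]]:
    """Return (neighbor_symbol_id, relationship_depth) pairs from the precomputed cache."""
    with sqlite3.connect(db_path) as conn:
        rows = conn.execute(
            """
            SELECT neighbor_symbol_id, relationship_depth
            FROM graph_neighbors
            WHERE source_symbol_id = ? AND relationship_depth <= ?
            ORDER BY relationship_depth
            """,
            (symbol_id, max_depth),
        ).fetchall()
    return [(int(n), int(d)) for n, d in rows]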
codex-lens/src/codexlens/storage/index_tree.py

@@ -8,11 +8,13 @@ from __future__ import annotations

import logging
import os
import re
import sqlite3
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional, Set, Tuple

from codexlens.config import Config
from codexlens.parsers.factory import ParserFactory

@@ -247,6 +249,9 @@ class IndexTreeBuilder:
            try:
                with DirIndexStore(result.index_path, config=self.config, global_index=global_index) as store:
                    deleted_count = store.cleanup_deleted_files(result.source_path)
                    if deleted_count > 0:
                        _compute_graph_neighbors(store, logger=self.logger)
                    store.update_merkle_root()
                total_deleted += deleted_count
                if deleted_count > 0:
                    self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)

@@ -575,6 +580,7 @@ class IndexTreeBuilder:
                    content=text,
                    language=language_id,
                    symbols=indexed_file.symbols,
                    relationships=indexed_file.relationships,
                )

                files_count += 1

@@ -584,6 +590,9 @@ class IndexTreeBuilder:
                self.logger.debug("Failed to index %s: %s", file_path, exc)
                continue

        if files_count > 0:
            _compute_graph_neighbors(store, logger=self.logger)

        # Get list of subdirectories
        subdirs = [
            d.name

@@ -593,6 +602,7 @@ class IndexTreeBuilder:
            and not d.name.startswith(".")
        ]

        store.update_merkle_root()
        store.close()
        if global_index is not None:
            global_index.close()

@@ -654,31 +664,29 @@ class IndexTreeBuilder:
        parent_index_db = self.mapper.source_to_index_db(parent_path)

        try:
            store = DirIndexStore(parent_index_db)
            store.initialize()
            with DirIndexStore(parent_index_db, config=self.config) as store:
                for result in all_results:
                    # Only register direct children (parent is one level up)
                    if result.source_path.parent != parent_path:
                        continue

            for result in all_results:
                # Only register direct children (parent is one level up)
                if result.source_path.parent != parent_path:
                    continue
                    if result.error:
                        continue

                if result.error:
                    continue
                    # Register subdirectory link
                    store.register_subdir(
                        name=result.source_path.name,
                        index_path=result.index_path,
                        files_count=result.files_count,
                        direct_files=result.files_count,
                    )
                    self.logger.debug(
                        "Linked %s to parent %s",
                        result.source_path.name,
                        parent_path,
                    )

                # Register subdirectory link
                store.register_subdir(
                    name=result.source_path.name,
                    index_path=result.index_path,
                    files_count=result.files_count,
                    direct_files=result.files_count,
                )
                self.logger.debug(
                    "Linked %s to parent %s",
                    result.source_path.name,
                    parent_path,
                )

            store.close()
                store.update_merkle_root()

        except Exception as exc:
            self.logger.error(

@@ -726,6 +734,164 @@ class IndexTreeBuilder:
    return files


def _normalize_relationship_target(target: str) -> str:
    """Best-effort normalization of a relationship target into a local symbol name."""
    target = (target or "").strip()
    if not target:
        return ""

    # Drop trailing call parentheses when present (e.g., "foo()" -> "foo").
    if target.endswith("()"):
        target = target[:-2]

    # Keep the leaf identifier for common qualified formats.
    for sep in ("::", ".", "#"):
        if sep in target:
            target = target.split(sep)[-1]

    # Strip non-identifier suffix/prefix noise.
    target = re.sub(r"^[^A-Za-z0-9_]+", "", target)
    target = re.sub(r"[^A-Za-z0-9_]+$", "", target)
    return target


def _compute_graph_neighbors(
    store: DirIndexStore,
    *,
    max_depth: int = 2,
    logger: Optional[logging.Logger] = None,
) -> None:
    """Compute and persist N-hop neighbors for all symbols in a directory index."""
    if max_depth <= 0:
        return

    log = logger or logging.getLogger(__name__)

    with store._lock:
        conn = store._get_connection()
        conn.row_factory = sqlite3.Row

        # Ensure schema exists even for older databases pinned to the same user_version.
        try:
            from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade

            upgrade(conn)
        except Exception as exc:
            log.debug("Graph neighbor schema ensure failed: %s", exc)

        cursor = conn.cursor()

        try:
            cursor.execute("DELETE FROM graph_neighbors")
        except sqlite3.Error:
            # Table missing or schema mismatch; skip gracefully.
            return

        try:
            symbol_rows = cursor.execute(
                "SELECT id, file_id, name FROM symbols"
            ).fetchall()
            rel_rows = cursor.execute(
                "SELECT source_symbol_id, target_qualified_name FROM code_relationships"
            ).fetchall()
        except sqlite3.Error:
            return

        if not symbol_rows or not rel_rows:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        symbol_file_by_id: Dict[int, int] = {}
        symbols_by_file_and_name: Dict[Tuple[int, str], List[int]] = {}
        symbols_by_name: Dict[str, List[int]] = {}

        for row in symbol_rows:
            symbol_id = int(row["id"])
            file_id = int(row["file_id"])
            name = str(row["name"])
            symbol_file_by_id[symbol_id] = file_id
            symbols_by_file_and_name.setdefault((file_id, name), []).append(symbol_id)
            symbols_by_name.setdefault(name, []).append(symbol_id)

        adjacency: Dict[int, Set[int]] = {}

        for row in rel_rows:
            source_id = int(row["source_symbol_id"])
            target_raw = str(row["target_qualified_name"] or "")
            target_name = _normalize_relationship_target(target_raw)
            if not target_name:
                continue

            source_file_id = symbol_file_by_id.get(source_id)
            if source_file_id is None:
                continue

            candidate_ids = symbols_by_file_and_name.get((source_file_id, target_name))
            if not candidate_ids:
                global_candidates = symbols_by_name.get(target_name, [])
                # Only resolve cross-file by name when unambiguous.
                candidate_ids = global_candidates if len(global_candidates) == 1 else []

            for target_id in candidate_ids:
                if target_id == source_id:
                    continue
                adjacency.setdefault(source_id, set()).add(target_id)
                adjacency.setdefault(target_id, set()).add(source_id)

        if not adjacency:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        insert_rows: List[Tuple[int, int, int]] = []
        max_depth = min(int(max_depth), 2)

        for source_id, first_hop in adjacency.items():
            if not first_hop:
                continue
            for neighbor_id in first_hop:
                insert_rows.append((source_id, neighbor_id, 1))

            if max_depth < 2:
                continue

            second_hop: Set[int] = set()
            for neighbor_id in first_hop:
                second_hop.update(adjacency.get(neighbor_id, set()))

            second_hop.discard(source_id)
            second_hop.difference_update(first_hop)

            for neighbor_id in second_hop:
                insert_rows.append((source_id, neighbor_id, 2))

        if not insert_rows:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        try:
            cursor.executemany(
                """
                INSERT INTO graph_neighbors(
                    source_symbol_id, neighbor_symbol_id, relationship_depth
                )
                VALUES(?, ?, ?)
                """,
                insert_rows,
            )
            conn.commit()
        except sqlite3.Error:
            return


# === Worker Function for ProcessPoolExecutor ===


@@ -795,6 +961,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
                content=text,
                language=language_id,
                symbols=indexed_file.symbols,
                relationships=indexed_file.relationships,
            )

            files_count += 1

@@ -803,6 +970,9 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
        except Exception:
            continue

    if files_count > 0:
        _compute_graph_neighbors(store)

    # Get subdirectories
    ignore_dirs = {
        ".git",

@@ -821,6 +991,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
        if d.is_dir() and d.name not in ignore_dirs and not d.name.startswith(".")
    ]

    store.update_merkle_root()
    store.close()
    if global_index is not None:
        global_index.close()
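For reference, _normalize_relationship_target above reduces qualified or call-style targets to a bare local identifier. A few expected input/output pairs, derived from the implementation shown (doctest-style sketch):

# Expected behavior of _normalize_relationship_target, per the code above:
assert _normalize_relationship_target("helpers.format_name()") == "format_name"
assert _normalize_relationship_target("pkg::Widget") == "Widget"
assert _normalize_relationship_target("  Config#load ") == "load"
assert _normalize_relationship_target("") == ""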
codex-lens/src/codexlens/storage/merkle_tree.py (new file, 136 lines)

@@ -0,0 +1,136 @@
"""Merkle tree utilities for change detection.

This module provides a generic, file-system based Merkle tree implementation
that can be used to efficiently diff directory states.
"""

from __future__ import annotations

import hashlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Iterable, List, Optional


def sha256_bytes(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()


def sha256_text(text: str) -> str:
    return sha256_bytes(text.encode("utf-8", errors="ignore"))


@dataclass
class MerkleNode:
    """A Merkle node representing either a file (leaf) or directory (internal)."""

    name: str
    rel_path: str
    hash: str
    is_dir: bool
    children: Dict[str, "MerkleNode"] = field(default_factory=dict)

    def iter_files(self) -> Iterable["MerkleNode"]:
        if not self.is_dir:
            yield self
            return
        for child in self.children.values():
            yield from child.iter_files()


@dataclass
class MerkleTree:
    """Merkle tree for a directory snapshot."""

    root: MerkleNode

    @classmethod
    def build_from_directory(cls, root_dir: Path) -> "MerkleTree":
        root_dir = Path(root_dir).resolve()
        node = cls._build_node(root_dir, base=root_dir)
        return cls(root=node)

    @classmethod
    def _build_node(cls, path: Path, *, base: Path) -> MerkleNode:
        if path.is_file():
            rel = str(path.relative_to(base)).replace("\\", "/")
            return MerkleNode(
                name=path.name,
                rel_path=rel,
                hash=sha256_bytes(path.read_bytes()),
                is_dir=False,
            )

        if not path.is_dir():
            rel = str(path.relative_to(base)).replace("\\", "/")
            return MerkleNode(name=path.name, rel_path=rel, hash="", is_dir=False)

        children: Dict[str, MerkleNode] = {}
        for child in sorted(path.iterdir(), key=lambda p: p.name):
            child_node = cls._build_node(child, base=base)
            children[child_node.name] = child_node

        items = [
            f"{'d' if n.is_dir else 'f'}:{name}:{n.hash}"
            for name, n in sorted(children.items(), key=lambda kv: kv[0])
        ]
        dir_hash = sha256_text("\n".join(items))

        rel_path = "." if path == base else str(path.relative_to(base)).replace("\\", "/")
        return MerkleNode(
            name="." if path == base else path.name,
            rel_path=rel_path,
            hash=dir_hash,
            is_dir=True,
            children=children,
        )

    @staticmethod
    def find_changed_files(old: Optional["MerkleTree"], new: Optional["MerkleTree"]) -> List[str]:
        """Find changed/added/removed files between two trees.

        Returns:
            List of relative file paths (POSIX-style separators).
        """
        if old is None and new is None:
            return []
        if old is None:
            return sorted({n.rel_path for n in new.root.iter_files()})  # type: ignore[union-attr]
        if new is None:
            return sorted({n.rel_path for n in old.root.iter_files()})

        changed: set[str] = set()

        def walk(old_node: Optional[MerkleNode], new_node: Optional[MerkleNode]) -> None:
            if old_node is None and new_node is None:
                return

            if old_node is None and new_node is not None:
                changed.update(n.rel_path for n in new_node.iter_files())
                return

            if new_node is None and old_node is not None:
                changed.update(n.rel_path for n in old_node.iter_files())
                return

            assert old_node is not None and new_node is not None

            if old_node.hash == new_node.hash:
                return

            if not old_node.is_dir and not new_node.is_dir:
                changed.add(new_node.rel_path)
                return

            if old_node.is_dir != new_node.is_dir:
                changed.update(n.rel_path for n in old_node.iter_files())
                changed.update(n.rel_path for n in new_node.iter_files())
                return

            names = set(old_node.children.keys()) | set(new_node.children.keys())
            for name in names:
                walk(old_node.children.get(name), new_node.children.get(name))

        walk(old.root, new.root)
        return sorted(changed)
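A minimal usage sketch for the new module, using the API shown above (the project directory path is hypothetical): take a snapshot before and after an edit and ask for the changed relative paths.

from pathlib import Path
from codexlens.storage.merkle_tree import MerkleTree

project = Path("./example_project")  # hypothetical directory

before = MerkleTree.build_from_directory(project)
# ... files are edited, added, or removed here ...
after = MerkleTree.build_from_directory(project)

# Relative POSIX-style paths of every added, removed, or modified file.
changed = MerkleTree.find_changed_files(before, after)
print(changed)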
codex-lens/src/codexlens/storage/migrations/migration_006_enhance_relationships.py (new file)

@@ -0,0 +1,37 @@
"""
Migration 006: Ensure relationship tables and indexes exist.

This migration is intentionally idempotent. It creates the `code_relationships`
table (used for graph visualization) and its indexes if missing.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Ensuring code_relationships table exists...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS code_relationships (
            id INTEGER PRIMARY KEY,
            source_symbol_id INTEGER NOT NULL REFERENCES symbols (id) ON DELETE CASCADE,
            target_qualified_name TEXT NOT NULL,
            relationship_type TEXT NOT NULL,
            source_line INTEGER NOT NULL,
            target_file TEXT
        )
        """
    )

    log.info("Ensuring relationship indexes exist...")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
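For orientation, each edge persisted by DirIndexStore._save_relationships becomes one row in this table. A hand-inserted example row follows; the database path and the concrete values (including the "calls" relationship type) are illustrative only, and the snippet assumes this migration has already been applied.

import sqlite3

conn = sqlite3.connect("_index.db")  # illustrative path
# One edge: symbol id 42 references helpers.format_name at line 120 of its source file.
conn.execute(
    """
    INSERT INTO code_relationships(
        source_symbol_id, target_qualified_name,
        relationship_type, source_line, target_file
    )
    VALUES(?, ?, ?, ?, ?)
    """,
    (42, "helpers.format_name", "calls", 120, None),
)
conn.commit()
conn.close()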
codex-lens/src/codexlens/storage/migrations/migration_007_add_graph_neighbors.py (new file)

@@ -0,0 +1,47 @@
"""
Migration 007: Add precomputed graph neighbor table for search expansion.

Adds:
- graph_neighbors: cached N-hop neighbors between symbols (keyed by symbol ids)

This table is derived data (a cache) and is safe to rebuild at any time.
The migration is intentionally idempotent.
"""

from __future__ import annotations

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating graph_neighbors table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS graph_neighbors (
            source_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            neighbor_symbol_id INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
            relationship_depth INTEGER NOT NULL,
            PRIMARY KEY (source_symbol_id, neighbor_symbol_id)
        )
        """
    )

    log.info("Creating indexes for graph_neighbors...")
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_source_depth
        ON graph_neighbors(source_symbol_id, relationship_depth)
        """
    )
    cursor.execute(
        """
        CREATE INDEX IF NOT EXISTS idx_graph_neighbors_neighbor
        ON graph_neighbors(neighbor_symbol_id)
        """
    )
codex-lens/src/codexlens/storage/migrations/migration_008_add_merkle_hashes.py (new file)

@@ -0,0 +1,81 @@
"""
Migration 008: Add Merkle hash tables for content-based incremental indexing.

Adds:
- merkle_hashes: per-file SHA-256 hashes (keyed by file_id)
- merkle_state: directory-level root hash (single row, id=1)

Backfills merkle_hashes using the existing `files.content` column when available.
"""

from __future__ import annotations

import hashlib
import logging
import time
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection) -> None:
    cursor = db_conn.cursor()

    log.info("Creating merkle_hashes table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_hashes (
            file_id INTEGER PRIMARY KEY REFERENCES files(id) ON DELETE CASCADE,
            sha256 TEXT NOT NULL,
            updated_at REAL
        )
        """
    )

    log.info("Creating merkle_state table...")
    cursor.execute(
        """
        CREATE TABLE IF NOT EXISTS merkle_state (
            id INTEGER PRIMARY KEY CHECK (id = 1),
            root_hash TEXT,
            updated_at REAL
        )
        """
    )

    # Backfill file hashes from stored content (best-effort).
    try:
        rows = cursor.execute("SELECT id, content FROM files").fetchall()
    except Exception as exc:
        log.warning("Unable to backfill merkle hashes (files table missing?): %s", exc)
        return

    now = time.time()
    inserts: list[tuple[int, str, float]] = []

    for row in rows:
        file_id = int(row[0])
        content = row[1]
        if content is None:
            continue
        try:
            digest = hashlib.sha256(str(content).encode("utf-8", errors="ignore")).hexdigest()
            inserts.append((file_id, digest, now))
        except Exception:
            continue

    if not inserts:
        return

    log.info("Backfilling %d file hashes...", len(inserts))
    cursor.executemany(
        """
        INSERT INTO merkle_hashes(file_id, sha256, updated_at)
        VALUES(?, ?, ?)
        ON CONFLICT(file_id) DO UPDATE SET
            sha256=excluded.sha256,
            updated_at=excluded.updated_at
        """,
        inserts,
    )
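A sketch of how these migration modules are applied in sequence, mirroring the user_version dispatch shown in DirIndexStore._migrate; the standalone wiring below (including the final PRAGMA user_version bump) is illustrative and not the store's actual code.

import sqlite3

def migrate(db_path: str, target_version: int = 8) -> None:
    """Apply the v7 and v8 migrations to an _index.db that is below those versions."""
    conn = sqlite3.connect(db_path)
    from_version = conn.execute("PRAGMA user_version").fetchone()[0]

    if from_version < 7:
        from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade
        upgrade(conn)
    if from_version < 8:
        from codexlens.storage.migrations.migration_008_add_merkle_hashes import upgrade
        upgrade(conn)

    conn.execute(f"PRAGMA user_version = {target_version}")
    conn.commit()
    conn.close()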