"""Hierarchical index tree builder for CodexLens.
|
|
|
|
Constructs a bottom-up directory index tree with parallel processing support.
|
|
Each directory maintains its own _index.db with files and subdirectory links.
|
|
"""

from __future__ import annotations

import logging
import os
import re
import sqlite3
import time
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

from codexlens.config import Config
from codexlens.parsers.factory import ParserFactory
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.global_index import GlobalSymbolIndex
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import ProjectInfo, RegistryStore


@dataclass
class BuildResult:
    """Complete build operation result."""

    project_id: int
    source_root: Path
    index_root: Path
    total_files: int
    total_dirs: int
    errors: List[str]


@dataclass
class DirBuildResult:
    """Single directory build result."""

    source_path: Path
    index_path: Path
    files_count: int
    symbols_count: int
    subdirs: List[str]  # Subdirectory names
    error: Optional[str] = None


class IndexTreeBuilder:
    """Hierarchical index tree builder with parallel processing.

    Builds directory indexes bottom-up to enable proper subdirectory linking.
    Each directory gets its own _index.db containing:
    - Files in that directory
    - Links to child directory indexes
    - Symbols and FTS5 search

    Attributes:
        registry: Global project registry
        mapper: Path mapping between source and index
        config: CodexLens configuration
        parser_factory: Parser factory for symbol extraction
        logger: Logger instance
        IGNORE_DIRS: Set of directory names to skip during indexing
    """

    # Directories to skip during indexing
    IGNORE_DIRS: Set[str] = {
        ".git",
        ".venv",
        "venv",
        "node_modules",
        "__pycache__",
        ".codexlens",
        ".idea",
        ".vscode",
    }

    def __init__(
        self, registry: RegistryStore, mapper: PathMapper, config: Optional[Config] = None, incremental: bool = True
    ):
        """Initialize the index tree builder.

        Args:
            registry: Global registry store for project tracking
            mapper: Path mapper for source to index conversions
            config: CodexLens configuration (uses defaults if None)
            incremental: Enable incremental indexing (default True)
        """
        self.registry = registry
        self.mapper = mapper
        self.config = config or Config()
        self.parser_factory = ParserFactory(self.config)
        self.logger = logging.getLogger(__name__)
        self.incremental = incremental

    def build(
        self,
        source_root: Path,
        languages: Optional[List[str]] = None,
        workers: Optional[int] = None,
        force_full: bool = False,
    ) -> BuildResult:
        """Build complete index tree for a project.

        Process:
            1. Register project in registry
            2. Collect all directories grouped by depth
            3. Build indexes bottom-up (deepest first)
            4. Link subdirectories to parents
            5. Update project statistics
            6. Clean up deleted files (if incremental mode)

        Args:
            source_root: Project root directory to index
            languages: Optional list of language IDs to limit indexing
            workers: Number of parallel worker processes
            force_full: Force full reindex (override incremental mode)

        Returns:
            BuildResult with statistics and errors

        Raises:
            ValueError: If source_root doesn't exist
        """
        source_root = source_root.resolve()
        if not source_root.exists():
            raise ValueError(f"Source root does not exist: {source_root}")

        # Auto-detect optimal worker count if not specified
        if workers is None:
            workers = min(os.cpu_count() or 4, 16)  # Cap at 16 workers
            self.logger.debug("Auto-detected %d workers for parallel indexing", workers)

        # Override incremental mode if force_full is True
        use_incremental = self.incremental and not force_full
        if force_full:
            self.logger.info("Building index tree for %s (FULL reindex)", source_root)
        else:
            self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental)

        # Register project
        index_root = self.mapper.source_to_index_dir(source_root)
        project_info = self.registry.register_project(source_root, index_root)
        global_index_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME

        global_index: GlobalSymbolIndex | None = None
        if self.config.global_symbol_index_enabled:
            global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_info.id)
            global_index.initialize()

        # Report progress: discovering files (5%)
        print("Discovering files...", flush=True)

        # Collect directories by depth
        dirs_by_depth = self._collect_dirs_by_depth(source_root, languages)

        if not dirs_by_depth:
            self.logger.warning("No indexable directories found in %s", source_root)
            if global_index is not None:
                global_index.close()
            return BuildResult(
                project_id=project_info.id,
                source_root=source_root,
                index_root=index_root,
                total_files=0,
                total_dirs=0,
                errors=["No indexable directories found"],
            )

        # Calculate total directories for progress tracking
        total_dirs_to_process = sum(len(dirs) for dirs in dirs_by_depth.values())
        processed_dirs = 0

        # Report progress: building index (10%)
        print("Building index...", flush=True)

        total_files = 0
        total_dirs = 0
        all_errors: List[str] = []
        all_results: List[DirBuildResult] = []  # Store all results for subdir linking

        # Build bottom-up (highest depth first)
        max_depth = max(dirs_by_depth.keys())
        for depth in range(max_depth, -1, -1):
            if depth not in dirs_by_depth:
                continue

            dirs = dirs_by_depth[depth]
            self.logger.info("Building %d directories at depth %d", len(dirs), depth)

            # Build directories at this level in parallel
            results = self._build_level_parallel(
                dirs,
                languages,
                workers,
                project_id=project_info.id,
                global_index_db_path=global_index_db_path,
            )
            all_results.extend(results)

            # Process results
            for result in results:
                if result.error:
                    all_errors.append(f"{result.source_path}: {result.error}")
                    processed_dirs += 1
                    continue

                total_files += result.files_count
                total_dirs += 1
                processed_dirs += 1

                # Report progress for each processed directory (10-80%)
                # Use "Processing file" format for frontend parser compatibility
                print(f"Processing file {processed_dirs}/{total_dirs_to_process}: {result.source_path.name}", flush=True)

                # Register directory in registry
                self.registry.register_dir(
                    project_id=project_info.id,
                    source_path=result.source_path,
                    index_path=result.index_path,
                    depth=self.mapper.get_relative_depth(result.source_path, source_root),
                    files_count=result.files_count,
                )

        # Report progress: linking subdirectories (80%)
        print("Linking subdirectories...", flush=True)

        # After building all directories, link subdirectories to parents.
        # This needs to happen after all indexes exist.
        for result in all_results:
            if result.error:
                continue
            # Link children to this directory
            self._link_children_to_parent(result.source_path, all_results)

        # Cleanup deleted files if in incremental mode
        if use_incremental:
            # Report progress: cleaning up (90%)
            print("Cleaning up deleted files...", flush=True)
            self.logger.info("Cleaning up deleted files...")
            total_deleted = 0
            for result in all_results:
                if result.error:
                    continue
                try:
                    with DirIndexStore(result.index_path, config=self.config, global_index=global_index) as store:
                        deleted_count = store.cleanup_deleted_files(result.source_path)
                        if deleted_count > 0:
                            _compute_graph_neighbors(store, logger=self.logger)
                            store.update_merkle_root()
                    total_deleted += deleted_count
                    if deleted_count > 0:
                        self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path)
                except Exception as exc:
                    self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc)

            if total_deleted > 0:
                self.logger.info("Removed %d deleted files from index", total_deleted)

        # Report progress: finalizing (95%)
        print("Finalizing...", flush=True)

        # Update project statistics
        self.registry.update_project_stats(source_root, total_files, total_dirs)

        # Report completion (100%)
        print(f"Indexed {total_files} files", flush=True)

        self.logger.info(
            "Index build complete: %d files, %d directories, %d errors",
            total_files,
            total_dirs,
            len(all_errors),
        )

        if global_index is not None:
            global_index.close()

        return BuildResult(
            project_id=project_info.id,
            source_root=source_root,
            index_root=index_root,
            total_files=total_files,
            total_dirs=total_dirs,
            errors=all_errors,
        )
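
    # For reference, a successful build() run emits progress lines like the
    # following on stdout (the "Processing file N/M" wording is what the
    # frontend progress parser keys on; the counts below are illustrative):
    #
    #     Discovering files...
    #     Building index...
    #     Processing file 1/12: src
    #     ...
    #     Linking subdirectories...
    #     Cleaning up deleted files...
    #     Finalizing...
    #     Indexed 240 files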

    def update_subtree(
        self,
        source_path: Path,
        languages: Optional[List[str]] = None,
        workers: Optional[int] = None,
    ) -> BuildResult:
        """Incrementally update a subtree.

        Rebuilds indexes for the specified directory and all subdirectories.
        Useful for incremental updates when only part of the tree changed.

        Args:
            source_path: Root of subtree to update
            languages: Optional list of language IDs to limit indexing
            workers: Number of parallel worker processes

        Returns:
            BuildResult for the subtree

        Raises:
            ValueError: If source_path is not indexed
        """
        source_path = source_path.resolve()
        project_root = self.mapper.get_project_root(source_path)

        # Get project info
        project_info = self.registry.get_project(project_root)
        if not project_info:
            raise ValueError(f"Directory not indexed: {source_path}")

        self.logger.info("Updating subtree at %s", source_path)

        # Use build logic but start from source_path
        return self.build(source_path, languages, workers)

    def rebuild_dir(self, source_path: Path) -> DirBuildResult:
        """Rebuild index for a single directory.

        Only rebuilds the specified directory; it does not touch subdirectories.
        Useful for updating a single directory after file changes.

        Args:
            source_path: Directory to rebuild

        Returns:
            DirBuildResult for the directory

        Raises:
            ValueError: If source_path belongs to a project that is not indexed
        """
        source_path = source_path.resolve()
        self.logger.info("Rebuilding directory %s", source_path)
        project_root = self.mapper.get_project_root(source_path)
        project_info = self.registry.get_project(project_root)
        if not project_info:
            raise ValueError(f"Directory not indexed: {source_path}")

        global_index_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME
        return self._build_single_dir(
            source_path,
            languages=None,
            project_id=project_info.id,
            global_index_db_path=global_index_db_path,
        )

    # === Internal Methods ===

    def _collect_dirs_by_depth(
        self, source_root: Path, languages: Optional[List[str]] = None
    ) -> Dict[int, List[Path]]:
        """Collect all indexable directories grouped by depth.

        Walks the directory tree and groups directories by their depth
        relative to source_root. Depth 0 is the root itself.

        Args:
            source_root: Root directory to start from
            languages: Optional language filter

        Returns:
            Dictionary mapping depth to list of directory paths
            Example: {0: [root], 1: [src, tests], 2: [src/api, src/utils]}
        """
        source_root = source_root.resolve()
        dirs_by_depth: Dict[int, List[Path]] = {}

        # Always include the root directory at depth 0 for chain search entry point
        dirs_by_depth[0] = [source_root]

        for root, dirnames, _ in os.walk(source_root):
            # Filter out ignored directories
            dirnames[:] = [
                d
                for d in dirnames
                if d not in self.IGNORE_DIRS and not d.startswith(".")
            ]

            root_path = Path(root)

            # Skip root (already added)
            if root_path == source_root:
                continue

            # Check if this directory should be indexed
            if not self._should_index_dir(root_path, languages):
                continue

            # Calculate depth relative to source_root
            try:
                depth = len(root_path.relative_to(source_root).parts)
            except ValueError:
                continue

            if depth not in dirs_by_depth:
                dirs_by_depth[depth] = []

            dirs_by_depth[depth].append(root_path)

        return dirs_by_depth
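
    # Build-order sketch for the docstring example above: build() walks the
    # returned mapping from the deepest level upward, so {0: [root], 1: [src,
    # tests], 2: [src/api, src/utils]} is processed as depth 2, then 1, then 0;
    # subdirectory links are added in a separate pass once every _index.db
    # exists.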

    def _should_index_dir(self, dir_path: Path, languages: Optional[List[str]] = None) -> bool:
        """Check if directory should be indexed.

        A directory is indexed if:
        1. It's not in IGNORE_DIRS
        2. It doesn't start with '.'
        3. It contains at least one supported language file, OR
        4. It has subdirectories that contain supported files (transitive)

        Args:
            dir_path: Directory to check
            languages: Optional language filter

        Returns:
            True if directory should be indexed
        """
        # Check directory name
        if dir_path.name in self.IGNORE_DIRS or dir_path.name.startswith("."):
            return False

        # Check for supported files in this directory
        source_files = self._iter_source_files(dir_path, languages)
        if len(source_files) > 0:
            return True

        # Check if any subdirectory has indexable files (transitive)
        # This handles cases like 'src' which has no direct files but has 'src/codexlens'
        for item in dir_path.iterdir():
            if not item.is_dir():
                continue
            if item.name in self.IGNORE_DIRS or item.name.startswith("."):
                continue
            # Recursively check subdirectories
            if self._has_indexable_files_recursive(item, languages):
                return True

        return False

    def _has_indexable_files_recursive(self, dir_path: Path, languages: Optional[List[str]] = None) -> bool:
        """Check if directory or any subdirectory has indexable files.

        Args:
            dir_path: Directory to check
            languages: Optional language filter

        Returns:
            True if directory tree contains indexable files
        """
        # Check for supported files in this directory
        source_files = self._iter_source_files(dir_path, languages)
        if len(source_files) > 0:
            return True

        # Check subdirectories
        try:
            for item in dir_path.iterdir():
                if not item.is_dir():
                    continue
                if item.name in self.IGNORE_DIRS or item.name.startswith("."):
                    continue
                if self._has_indexable_files_recursive(item, languages):
                    return True
        except PermissionError:
            pass

        return False

    def _build_level_parallel(
        self,
        dirs: List[Path],
        languages: Optional[List[str]],
        workers: int,
        *,
        project_id: int,
        global_index_db_path: Path,
    ) -> List[DirBuildResult]:
        """Build multiple directories in parallel.

        Uses ProcessPoolExecutor to build directories concurrently.
        All directories at the same level are independent and can be
        processed in parallel.

        Args:
            dirs: List of directories to build
            languages: Language filter
            workers: Number of worker processes
            project_id: Registry id of the project being indexed
            global_index_db_path: Path to the project-wide global symbol index database

        Returns:
            List of DirBuildResult objects
        """
        results: List[DirBuildResult] = []

        if not dirs:
            return results

        # For single directory, avoid overhead of process pool
        if len(dirs) == 1:
            result = self._build_single_dir(
                dirs[0],
                languages,
                project_id=project_id,
                global_index_db_path=global_index_db_path,
            )
            return [result]

        # Prepare arguments for worker processes
        config_dict = {
            "data_dir": str(self.config.data_dir),
            "supported_languages": self.config.supported_languages,
            "parsing_rules": self.config.parsing_rules,
            "global_symbol_index_enabled": self.config.global_symbol_index_enabled,
        }

        worker_args = [
            (
                dir_path,
                self.mapper.source_to_index_db(dir_path),
                languages,
                config_dict,
                int(project_id),
                str(global_index_db_path),
            )
            for dir_path in dirs
        ]

        # Execute in parallel
        with ProcessPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(_build_dir_worker, args): args[0]
                for args in worker_args
            }

            for future in as_completed(futures):
                try:
                    result = future.result()
                    results.append(result)
                except Exception as exc:
                    dir_path = futures[future]
                    self.logger.error("Failed to build %s: %s", dir_path, exc)
                    results.append(
                        DirBuildResult(
                            source_path=dir_path,
                            index_path=self.mapper.source_to_index_db(dir_path),
                            files_count=0,
                            symbols_count=0,
                            subdirs=[],
                            error=str(exc),
                        )
                    )

        return results

    def _build_single_dir(
        self,
        dir_path: Path,
        languages: Optional[List[str]] = None,
        *,
        project_id: int,
        global_index_db_path: Path,
    ) -> DirBuildResult:
        """Build index for a single directory.

        Creates _index.db and indexes all files in the directory.
        Does not recurse into subdirectories.

        Args:
            dir_path: Directory to index
            languages: Optional language filter
            project_id: Registry id of the project being indexed
            global_index_db_path: Path to the project-wide global symbol index database

        Returns:
            DirBuildResult with statistics and subdirectory list
        """
        dir_path = dir_path.resolve()
        index_db_path = self.mapper.source_to_index_db(dir_path)

        global_index: GlobalSymbolIndex | None = None
        try:
            # Ensure index directory exists
            index_db_path.parent.mkdir(parents=True, exist_ok=True)

            # Create directory index
            if self.config.global_symbol_index_enabled:
                global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_id)
                global_index.initialize()

            store = DirIndexStore(index_db_path, config=self.config, global_index=global_index)
            store.initialize()

            # Get source files in this directory only
            source_files = self._iter_source_files(dir_path, languages)

            files_count = 0
            symbols_count = 0
            skipped_count = 0

            for file_path in source_files:
                try:
                    # Check if file needs reindexing (incremental mode)
                    if self.incremental and not store.needs_reindex(file_path):
                        skipped_count += 1
                        continue

                    # Read and parse file
                    text = file_path.read_text(encoding="utf-8", errors="ignore")
                    language_id = self.config.language_for_path(file_path)
                    if not language_id:
                        continue

                    parser = self.parser_factory.get_parser(language_id)
                    indexed_file = parser.parse(text, file_path)

                    # Add to directory index
                    store.add_file(
                        name=file_path.name,
                        full_path=file_path,
                        content=text,
                        language=language_id,
                        symbols=indexed_file.symbols,
                        relationships=indexed_file.relationships,
                    )

                    files_count += 1
                    symbols_count += len(indexed_file.symbols)

                except Exception as exc:
                    self.logger.debug("Failed to index %s: %s", file_path, exc)
                    continue

            if files_count > 0:
                _compute_graph_neighbors(store, logger=self.logger)

            # Get list of subdirectories
            subdirs = [
                d.name
                for d in dir_path.iterdir()
                if d.is_dir()
                and d.name not in self.IGNORE_DIRS
                and not d.name.startswith(".")
            ]

            store.update_merkle_root()
            store.close()
            if global_index is not None:
                global_index.close()

            if skipped_count > 0:
                self.logger.debug(
                    "Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs",
                    dir_path,
                    files_count,
                    skipped_count,
                    symbols_count,
                    len(subdirs),
                )
            else:
                self.logger.debug(
                    "Built %s: %d files, %d symbols, %d subdirs",
                    dir_path,
                    files_count,
                    symbols_count,
                    len(subdirs),
                )

            return DirBuildResult(
                source_path=dir_path,
                index_path=index_db_path,
                files_count=files_count,
                symbols_count=symbols_count,
                subdirs=subdirs,
            )

        except Exception as exc:
            self.logger.error("Failed to build directory %s: %s", dir_path, exc)
            if global_index is not None:
                try:
                    global_index.close()
                except Exception:
                    pass
            return DirBuildResult(
                source_path=dir_path,
                index_path=index_db_path,
                files_count=0,
                symbols_count=0,
                subdirs=[],
                error=str(exc),
            )

    def _link_children_to_parent(
        self, parent_path: Path, all_results: List[DirBuildResult]
    ) -> None:
        """Link child directory indexes to parent's subdirs table.

        Finds all direct children of parent_path in all_results and
        registers them as subdirectories in the parent's index.

        Args:
            parent_path: Parent directory path
            all_results: List of all build results
        """
        parent_index_db = self.mapper.source_to_index_db(parent_path)

        try:
            with DirIndexStore(parent_index_db, config=self.config) as store:
                for result in all_results:
                    # Only register direct children (parent is one level up)
                    if result.source_path.parent != parent_path:
                        continue

                    if result.error:
                        continue

                    # Register subdirectory link
                    store.register_subdir(
                        name=result.source_path.name,
                        index_path=result.index_path,
                        files_count=result.files_count,
                        direct_files=result.files_count,
                    )
                    self.logger.debug(
                        "Linked %s to parent %s",
                        result.source_path.name,
                        parent_path,
                    )

                store.update_merkle_root()

        except Exception as exc:
            self.logger.error(
                "Failed to link children to %s: %s", parent_path, exc
            )

    def _iter_source_files(
        self, dir_path: Path, languages: Optional[List[str]] = None
    ) -> List[Path]:
        """Iterate source files in directory (non-recursive).

        Returns files in the specified directory that match language filters.
        Does not recurse into subdirectories.

        Args:
            dir_path: Directory to scan
            languages: Optional language filter

        Returns:
            List of source file paths
        """
        files: List[Path] = []

        if not dir_path.is_dir():
            return files

        for item in dir_path.iterdir():
            if not item.is_file():
                continue

            if item.name.startswith("."):
                continue

            # Check language support
            language_id = self.config.language_for_path(item)
            if not language_id:
                continue

            # Apply language filter
            if languages and language_id not in languages:
                continue

            files.append(item)

        return files


def _normalize_relationship_target(target: str) -> str:
    """Best-effort normalization of a relationship target into a local symbol name."""
    target = (target or "").strip()
    if not target:
        return ""

    # Drop trailing call parentheses when present (e.g., "foo()" -> "foo").
    if target.endswith("()"):
        target = target[:-2]

    # Keep the leaf identifier for common qualified formats.
    for sep in ("::", ".", "#"):
        if sep in target:
            target = target.split(sep)[-1]

    # Strip non-identifier suffix/prefix noise.
    target = re.sub(r"^[^A-Za-z0-9_]+", "", target)
    target = re.sub(r"[^A-Za-z0-9_]+$", "", target)
    return target
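
# Illustrative normalizations performed above (examples only, not an
# exhaustive contract):
#
#     "pkg.mod.Class.method()"  ->  "method"
#     "Namespace::helper"       ->  "helper"
#     "Class#attr"              ->  "attr"
#     "  foo()  "               ->  "foo"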


def _compute_graph_neighbors(
    store: DirIndexStore,
    *,
    max_depth: int = 2,
    logger: Optional[logging.Logger] = None,
) -> None:
    """Compute and persist N-hop neighbors for all symbols in a directory index."""
    if max_depth <= 0:
        return

    log = logger or logging.getLogger(__name__)

    with store._lock:
        conn = store._get_connection()
        conn.row_factory = sqlite3.Row

        # Ensure schema exists even for older databases pinned to the same user_version.
        try:
            from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade

            upgrade(conn)
        except Exception as exc:
            log.debug("Graph neighbor schema ensure failed: %s", exc)

        cursor = conn.cursor()

        try:
            cursor.execute("DELETE FROM graph_neighbors")
        except sqlite3.Error:
            # Table missing or schema mismatch; skip gracefully.
            return

        try:
            symbol_rows = cursor.execute(
                "SELECT id, file_id, name FROM symbols"
            ).fetchall()
            rel_rows = cursor.execute(
                "SELECT source_symbol_id, target_qualified_name FROM code_relationships"
            ).fetchall()
        except sqlite3.Error:
            return

        if not symbol_rows or not rel_rows:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        symbol_file_by_id: Dict[int, int] = {}
        symbols_by_file_and_name: Dict[Tuple[int, str], List[int]] = {}
        symbols_by_name: Dict[str, List[int]] = {}

        for row in symbol_rows:
            symbol_id = int(row["id"])
            file_id = int(row["file_id"])
            name = str(row["name"])
            symbol_file_by_id[symbol_id] = file_id
            symbols_by_file_and_name.setdefault((file_id, name), []).append(symbol_id)
            symbols_by_name.setdefault(name, []).append(symbol_id)

        adjacency: Dict[int, Set[int]] = {}

        for row in rel_rows:
            source_id = int(row["source_symbol_id"])
            target_raw = str(row["target_qualified_name"] or "")
            target_name = _normalize_relationship_target(target_raw)
            if not target_name:
                continue

            source_file_id = symbol_file_by_id.get(source_id)
            if source_file_id is None:
                continue

            candidate_ids = symbols_by_file_and_name.get((source_file_id, target_name))
            if not candidate_ids:
                global_candidates = symbols_by_name.get(target_name, [])
                # Only resolve cross-file by name when unambiguous.
                candidate_ids = global_candidates if len(global_candidates) == 1 else []

            for target_id in candidate_ids:
                if target_id == source_id:
                    continue
                adjacency.setdefault(source_id, set()).add(target_id)
                adjacency.setdefault(target_id, set()).add(source_id)

        if not adjacency:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        insert_rows: List[Tuple[int, int, int]] = []
        max_depth = min(int(max_depth), 2)

        for source_id, first_hop in adjacency.items():
            if not first_hop:
                continue
            for neighbor_id in first_hop:
                insert_rows.append((source_id, neighbor_id, 1))

            if max_depth < 2:
                continue

            second_hop: Set[int] = set()
            for neighbor_id in first_hop:
                second_hop.update(adjacency.get(neighbor_id, set()))

            second_hop.discard(source_id)
            second_hop.difference_update(first_hop)

            for neighbor_id in second_hop:
                insert_rows.append((source_id, neighbor_id, 2))

        if not insert_rows:
            try:
                conn.commit()
            except sqlite3.Error:
                pass
            return

        try:
            cursor.executemany(
                """
                INSERT INTO graph_neighbors(
                    source_symbol_id, neighbor_symbol_id, relationship_depth
                )
                VALUES(?, ?, ?)
                """,
                insert_rows,
            )
            conn.commit()
        except sqlite3.Error:
            return
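
# Worked example of the neighbor expansion above (symbol ids are illustrative):
# with adjacency {1: {2}, 2: {1, 3}, 3: {2}}, symbol 1 gets (1, 2, depth=1)
# from its direct edge and (1, 3, depth=2) via symbol 2; depth-2 rows never
# repeat the source itself or any depth-1 neighbor.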


# === Worker Function for ProcessPoolExecutor ===


def _build_dir_worker(args: tuple) -> DirBuildResult:
    """Worker function for parallel directory building.

    Must be at module level for ProcessPoolExecutor pickling.
    Reconstructs necessary objects from serializable arguments.

    Args:
        args: Tuple of (dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path)

    Returns:
        DirBuildResult for the directory
    """
    dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path = args

    # Reconstruct config
    config = Config(
        data_dir=Path(config_dict["data_dir"]),
        supported_languages=config_dict["supported_languages"],
        parsing_rules=config_dict["parsing_rules"],
        global_symbol_index_enabled=bool(config_dict.get("global_symbol_index_enabled", True)),
    )

    parser_factory = ParserFactory(config)

    global_index: GlobalSymbolIndex | None = None
    try:
        # Ensure index directory exists
        index_db_path.parent.mkdir(parents=True, exist_ok=True)

        # Create directory index
        if config.global_symbol_index_enabled and global_index_db_path:
            global_index = GlobalSymbolIndex(Path(global_index_db_path), project_id=int(project_id))
            global_index.initialize()

        store = DirIndexStore(index_db_path, config=config, global_index=global_index)
        store.initialize()

        files_count = 0
        symbols_count = 0

        # Index files in this directory
        for item in dir_path.iterdir():
            if not item.is_file():
                continue

            if item.name.startswith("."):
                continue

            language_id = config.language_for_path(item)
            if not language_id:
                continue

            if languages and language_id not in languages:
                continue

            try:
                text = item.read_text(encoding="utf-8", errors="ignore")
                parser = parser_factory.get_parser(language_id)
                indexed_file = parser.parse(text, item)

                store.add_file(
                    name=item.name,
                    full_path=item,
                    content=text,
                    language=language_id,
                    symbols=indexed_file.symbols,
                    relationships=indexed_file.relationships,
                )

                files_count += 1
                symbols_count += len(indexed_file.symbols)

            except Exception:
                continue

        if files_count > 0:
            _compute_graph_neighbors(store)

        # Get subdirectories
        ignore_dirs = {
            ".git",
            ".venv",
            "venv",
            "node_modules",
            "__pycache__",
            ".codexlens",
            ".idea",
            ".vscode",
        }

        subdirs = [
            d.name
            for d in dir_path.iterdir()
            if d.is_dir() and d.name not in ignore_dirs and not d.name.startswith(".")
        ]

        store.update_merkle_root()
        store.close()
        if global_index is not None:
            global_index.close()

        return DirBuildResult(
            source_path=dir_path,
            index_path=index_db_path,
            files_count=files_count,
            symbols_count=symbols_count,
            subdirs=subdirs,
        )

    except Exception as exc:
        if global_index is not None:
            try:
                global_index.close()
            except Exception:
                pass
        return DirBuildResult(
            source_path=dir_path,
            index_path=index_db_path,
            files_count=0,
            symbols_count=0,
            subdirs=[],
            error=str(exc),
        )