"""Hierarchical index tree builder for CodexLens. Constructs a bottom-up directory index tree with parallel processing support. Each directory maintains its own _index.db with files and subdirectory links. """ from __future__ import annotations import logging import os import re import sqlite3 import time from concurrent.futures import ProcessPoolExecutor, as_completed from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional, Set, Tuple from codexlens.config import Config from codexlens.parsers.factory import ParserFactory from codexlens.storage.dir_index import DirIndexStore from codexlens.storage.global_index import GlobalSymbolIndex from codexlens.storage.path_mapper import PathMapper from codexlens.storage.registry import ProjectInfo, RegistryStore @dataclass class BuildResult: """Complete build operation result.""" project_id: int source_root: Path index_root: Path total_files: int total_dirs: int errors: List[str] @dataclass class DirBuildResult: """Single directory build result.""" source_path: Path index_path: Path files_count: int symbols_count: int subdirs: List[str] # Subdirectory names error: Optional[str] = None class IndexTreeBuilder: """Hierarchical index tree builder with parallel processing. Builds directory indexes bottom-up to enable proper subdirectory linking. Each directory gets its own _index.db containing: - Files in that directory - Links to child directory indexes - Symbols and FTS5 search Attributes: registry: Global project registry mapper: Path mapping between source and index config: CodexLens configuration parser_factory: Parser factory for symbol extraction logger: Logger instance IGNORE_DIRS: Set of directory names to skip during indexing """ # Directories to skip during indexing IGNORE_DIRS: Set[str] = { ".git", ".venv", "venv", "node_modules", "__pycache__", ".codexlens", ".idea", ".vscode", } def __init__( self, registry: RegistryStore, mapper: PathMapper, config: Config = None, incremental: bool = True ): """Initialize the index tree builder. Args: registry: Global registry store for project tracking mapper: Path mapper for source to index conversions config: CodexLens configuration (uses defaults if None) incremental: Enable incremental indexing (default True) """ self.registry = registry self.mapper = mapper self.config = config or Config() self.parser_factory = ParserFactory(self.config) self.logger = logging.getLogger(__name__) self.incremental = incremental def build( self, source_root: Path, languages: List[str] = None, workers: int = None, force_full: bool = False, ) -> BuildResult: """Build complete index tree for a project. Process: 1. Register project in registry 2. Collect all directories grouped by depth 3. Build indexes bottom-up (deepest first) 4. Link subdirectories to parents 5. Update project statistics 6. Cleanup deleted files (if incremental mode) Args: source_root: Project root directory to index languages: Optional list of language IDs to limit indexing workers: Number of parallel worker processes force_full: Force full reindex (override incremental mode) Returns: BuildResult with statistics and errors Raises: ValueError: If source_root doesn't exist """ source_root = source_root.resolve() if not source_root.exists(): raise ValueError(f"Source root does not exist: {source_root}") # Auto-detect optimal worker count if not specified if workers is None: workers = min(os.cpu_count() or 4, 16) # Cap at 16 workers self.logger.debug("Auto-detected %d workers for parallel indexing", workers) # Override incremental mode if force_full is True use_incremental = self.incremental and not force_full if force_full: self.logger.info("Building index tree for %s (FULL reindex)", source_root) else: self.logger.info("Building index tree for %s (incremental=%s)", source_root, use_incremental) # Register project index_root = self.mapper.source_to_index_dir(source_root) project_info = self.registry.register_project(source_root, index_root) global_index_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME global_index: GlobalSymbolIndex | None = None if self.config.global_symbol_index_enabled: global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_info.id) global_index.initialize() # Report progress: discovering files (5%) print("Discovering files...", flush=True) # Collect directories by depth dirs_by_depth = self._collect_dirs_by_depth(source_root, languages) if not dirs_by_depth: self.logger.warning("No indexable directories found in %s", source_root) if global_index is not None: global_index.close() return BuildResult( project_id=project_info.id, source_root=source_root, index_root=index_root, total_files=0, total_dirs=0, errors=["No indexable directories found"], ) # Calculate total directories for progress tracking total_dirs_to_process = sum(len(dirs) for dirs in dirs_by_depth.values()) processed_dirs = 0 # Report progress: building index (10%) print("Building index...", flush=True) total_files = 0 total_dirs = 0 all_errors: List[str] = [] all_results: List[DirBuildResult] = [] # Store all results for subdir linking # Build bottom-up (highest depth first) max_depth = max(dirs_by_depth.keys()) for depth in range(max_depth, -1, -1): if depth not in dirs_by_depth: continue dirs = dirs_by_depth[depth] self.logger.info("Building %d directories at depth %d", len(dirs), depth) # Build directories at this level in parallel results = self._build_level_parallel( dirs, languages, workers, project_id=project_info.id, global_index_db_path=global_index_db_path, ) all_results.extend(results) # Process results for result in results: if result.error: all_errors.append(f"{result.source_path}: {result.error}") processed_dirs += 1 continue total_files += result.files_count total_dirs += 1 processed_dirs += 1 # Report progress for each processed directory (10-80%) # Use "Processing file" format for frontend parser compatibility progress_percent = 10 + int((processed_dirs / total_dirs_to_process) * 70) print(f"Processing file {processed_dirs}/{total_dirs_to_process}: {result.source_path.name}", flush=True) # Register directory in registry self.registry.register_dir( project_id=project_info.id, source_path=result.source_path, index_path=result.index_path, depth=self.mapper.get_relative_depth(result.source_path, source_root), files_count=result.files_count, ) # Report progress: linking subdirectories (80%) print("Linking subdirectories...", flush=True) # After building all directories, link subdirectories to parents # This needs to happen after all indexes exist for result in all_results: if result.error: continue # Link children to this directory self._link_children_to_parent(result.source_path, all_results) # Cleanup deleted files if in incremental mode if use_incremental: # Report progress: cleaning up (90%) print("Cleaning up deleted files...", flush=True) self.logger.info("Cleaning up deleted files...") total_deleted = 0 for result in all_results: if result.error: continue try: with DirIndexStore(result.index_path, config=self.config, global_index=global_index) as store: deleted_count = store.cleanup_deleted_files(result.source_path) if deleted_count > 0: _compute_graph_neighbors(store, logger=self.logger) store.update_merkle_root() total_deleted += deleted_count if deleted_count > 0: self.logger.debug("Removed %d deleted files from %s", deleted_count, result.source_path) except Exception as exc: self.logger.warning("Cleanup failed for %s: %s", result.source_path, exc) if total_deleted > 0: self.logger.info("Removed %d deleted files from index", total_deleted) # Report progress: finalizing (95%) print("Finalizing...", flush=True) # Update project statistics self.registry.update_project_stats(source_root, total_files, total_dirs) # Report completion (100%) print(f"Indexed {total_files} files", flush=True) self.logger.info( "Index build complete: %d files, %d directories, %d errors", total_files, total_dirs, len(all_errors), ) if global_index is not None: global_index.close() return BuildResult( project_id=project_info.id, source_root=source_root, index_root=index_root, total_files=total_files, total_dirs=total_dirs, errors=all_errors, ) def update_subtree( self, source_path: Path, languages: List[str] = None, workers: int = None, ) -> BuildResult: """Incrementally update a subtree. Rebuilds indexes for the specified directory and all subdirectories. Useful for incremental updates when only part of the tree changed. Args: source_path: Root of subtree to update languages: Optional list of language IDs to limit indexing workers: Number of parallel worker processes Returns: BuildResult for the subtree Raises: ValueError: If source_path is not indexed """ source_path = source_path.resolve() project_root = self.mapper.get_project_root(source_path) # Get project info project_info = self.registry.get_project(project_root) if not project_info: raise ValueError(f"Directory not indexed: {source_path}") self.logger.info("Updating subtree at %s", source_path) # Use build logic but start from source_path return self.build(source_path, languages, workers) def rebuild_dir(self, source_path: Path) -> DirBuildResult: """Rebuild index for a single directory. Only rebuilds the specified directory, does not touch subdirectories. Useful for updating a single directory after file changes. Args: source_path: Directory to rebuild Returns: DirBuildResult for the directory """ source_path = source_path.resolve() self.logger.info("Rebuilding directory %s", source_path) project_root = self.mapper.get_project_root(source_path) project_info = self.registry.get_project(project_root) if not project_info: raise ValueError(f"Directory not indexed: {source_path}") global_index_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME return self._build_single_dir( source_path, languages=None, project_id=project_info.id, global_index_db_path=global_index_db_path, ) # === Internal Methods === def _collect_dirs_by_depth( self, source_root: Path, languages: List[str] = None ) -> Dict[int, List[Path]]: """Collect all indexable directories grouped by depth. Walks the directory tree and groups directories by their depth relative to source_root. Depth 0 is the root itself. Args: source_root: Root directory to start from languages: Optional language filter Returns: Dictionary mapping depth to list of directory paths Example: {0: [root], 1: [src, tests], 2: [src/api, src/utils]} """ source_root = source_root.resolve() dirs_by_depth: Dict[int, List[Path]] = {} # Always include the root directory at depth 0 for chain search entry point dirs_by_depth[0] = [source_root] for root, dirnames, _ in os.walk(source_root): # Filter out ignored directories dirnames[:] = [ d for d in dirnames if d not in self.IGNORE_DIRS and not d.startswith(".") ] root_path = Path(root) # Skip root (already added) if root_path == source_root: continue # Check if this directory should be indexed if not self._should_index_dir(root_path, languages): continue # Calculate depth relative to source_root try: depth = len(root_path.relative_to(source_root).parts) except ValueError: continue if depth not in dirs_by_depth: dirs_by_depth[depth] = [] dirs_by_depth[depth].append(root_path) return dirs_by_depth def _should_index_dir(self, dir_path: Path, languages: List[str] = None) -> bool: """Check if directory should be indexed. A directory is indexed if: 1. It's not in IGNORE_DIRS 2. It doesn't start with '.' 3. It contains at least one supported language file, OR 4. It has subdirectories that contain supported files (transitive) Args: dir_path: Directory to check languages: Optional language filter Returns: True if directory should be indexed """ # Check directory name if dir_path.name in self.IGNORE_DIRS or dir_path.name.startswith("."): return False # Check for supported files in this directory source_files = self._iter_source_files(dir_path, languages) if len(source_files) > 0: return True # Check if any subdirectory has indexable files (transitive) # This handles cases like 'src' which has no direct files but has 'src/codexlens' for item in dir_path.iterdir(): if not item.is_dir(): continue if item.name in self.IGNORE_DIRS or item.name.startswith("."): continue # Recursively check subdirectories if self._has_indexable_files_recursive(item, languages): return True return False def _has_indexable_files_recursive(self, dir_path: Path, languages: List[str] = None) -> bool: """Check if directory or any subdirectory has indexable files. Args: dir_path: Directory to check languages: Optional language filter Returns: True if directory tree contains indexable files """ # Check for supported files in this directory source_files = self._iter_source_files(dir_path, languages) if len(source_files) > 0: return True # Check subdirectories try: for item in dir_path.iterdir(): if not item.is_dir(): continue if item.name in self.IGNORE_DIRS or item.name.startswith("."): continue if self._has_indexable_files_recursive(item, languages): return True except PermissionError: pass return False def _build_level_parallel( self, dirs: List[Path], languages: List[str], workers: int, *, project_id: int, global_index_db_path: Path, ) -> List[DirBuildResult]: """Build multiple directories in parallel. Uses ProcessPoolExecutor to build directories concurrently. All directories at the same level are independent and can be processed in parallel. Args: dirs: List of directories to build languages: Language filter workers: Number of worker processes Returns: List of DirBuildResult objects """ results: List[DirBuildResult] = [] if not dirs: return results # For single directory, avoid overhead of process pool if len(dirs) == 1: result = self._build_single_dir( dirs[0], languages, project_id=project_id, global_index_db_path=global_index_db_path, ) return [result] # Prepare arguments for worker processes config_dict = { "data_dir": str(self.config.data_dir), "supported_languages": self.config.supported_languages, "parsing_rules": self.config.parsing_rules, "global_symbol_index_enabled": self.config.global_symbol_index_enabled, } worker_args = [ ( dir_path, self.mapper.source_to_index_db(dir_path), languages, config_dict, int(project_id), str(global_index_db_path), ) for dir_path in dirs ] # Execute in parallel with ProcessPoolExecutor(max_workers=workers) as executor: futures = { executor.submit(_build_dir_worker, args): args[0] for args in worker_args } for future in as_completed(futures): try: result = future.result() results.append(result) except Exception as exc: dir_path = futures[future] self.logger.error("Failed to build %s: %s", dir_path, exc) results.append( DirBuildResult( source_path=dir_path, index_path=self.mapper.source_to_index_db(dir_path), files_count=0, symbols_count=0, subdirs=[], error=str(exc), ) ) return results def _build_single_dir( self, dir_path: Path, languages: List[str] = None, *, project_id: int, global_index_db_path: Path, ) -> DirBuildResult: """Build index for a single directory. Creates _index.db and indexes all files in the directory. Does not recurse into subdirectories. Args: dir_path: Directory to index languages: Optional language filter Returns: DirBuildResult with statistics and subdirectory list """ dir_path = dir_path.resolve() index_db_path = self.mapper.source_to_index_db(dir_path) global_index: GlobalSymbolIndex | None = None try: # Ensure index directory exists index_db_path.parent.mkdir(parents=True, exist_ok=True) # Create directory index if self.config.global_symbol_index_enabled: global_index = GlobalSymbolIndex(global_index_db_path, project_id=project_id) global_index.initialize() store = DirIndexStore(index_db_path, config=self.config, global_index=global_index) store.initialize() # Get source files in this directory only source_files = self._iter_source_files(dir_path, languages) files_count = 0 symbols_count = 0 skipped_count = 0 for file_path in source_files: try: # Check if file needs reindexing (incremental mode) if self.incremental and not store.needs_reindex(file_path): skipped_count += 1 continue # Read and parse file text = file_path.read_text(encoding="utf-8", errors="ignore") language_id = self.config.language_for_path(file_path) if not language_id: continue parser = self.parser_factory.get_parser(language_id) indexed_file = parser.parse(text, file_path) # Add to directory index store.add_file( name=file_path.name, full_path=file_path, content=text, language=language_id, symbols=indexed_file.symbols, relationships=indexed_file.relationships, ) files_count += 1 symbols_count += len(indexed_file.symbols) except Exception as exc: self.logger.debug("Failed to index %s: %s", file_path, exc) continue if files_count > 0: _compute_graph_neighbors(store, logger=self.logger) # Get list of subdirectories subdirs = [ d.name for d in dir_path.iterdir() if d.is_dir() and d.name not in self.IGNORE_DIRS and not d.name.startswith(".") ] store.update_merkle_root() store.close() if global_index is not None: global_index.close() if skipped_count > 0: self.logger.debug( "Built %s: %d files indexed, %d skipped (unchanged), %d symbols, %d subdirs", dir_path, files_count, skipped_count, symbols_count, len(subdirs), ) else: self.logger.debug( "Built %s: %d files, %d symbols, %d subdirs", dir_path, files_count, symbols_count, len(subdirs), ) return DirBuildResult( source_path=dir_path, index_path=index_db_path, files_count=files_count, symbols_count=symbols_count, subdirs=subdirs, ) except Exception as exc: self.logger.error("Failed to build directory %s: %s", dir_path, exc) if global_index is not None: try: global_index.close() except Exception: pass return DirBuildResult( source_path=dir_path, index_path=index_db_path, files_count=0, symbols_count=0, subdirs=[], error=str(exc), ) def _link_children_to_parent( self, parent_path: Path, all_results: List[DirBuildResult] ) -> None: """Link child directory indexes to parent's subdirs table. Finds all direct children of parent_path in all_results and registers them as subdirectories in the parent's index. Args: parent_path: Parent directory path all_results: List of all build results """ parent_index_db = self.mapper.source_to_index_db(parent_path) try: with DirIndexStore(parent_index_db, config=self.config) as store: for result in all_results: # Only register direct children (parent is one level up) if result.source_path.parent != parent_path: continue if result.error: continue # Register subdirectory link store.register_subdir( name=result.source_path.name, index_path=result.index_path, files_count=result.files_count, direct_files=result.files_count, ) self.logger.debug( "Linked %s to parent %s", result.source_path.name, parent_path, ) store.update_merkle_root() except Exception as exc: self.logger.error( "Failed to link children to %s: %s", parent_path, exc ) def _iter_source_files( self, dir_path: Path, languages: List[str] = None ) -> List[Path]: """Iterate source files in directory (non-recursive). Returns files in the specified directory that match language filters. Does not recurse into subdirectories. Args: dir_path: Directory to scan languages: Optional language filter Returns: List of source file paths """ files: List[Path] = [] if not dir_path.is_dir(): return files for item in dir_path.iterdir(): if not item.is_file(): continue if item.name.startswith("."): continue # Check language support language_id = self.config.language_for_path(item) if not language_id: continue # Apply language filter if languages and language_id not in languages: continue files.append(item) return files def _normalize_relationship_target(target: str) -> str: """Best-effort normalization of a relationship target into a local symbol name.""" target = (target or "").strip() if not target: return "" # Drop trailing call parentheses when present (e.g., "foo()" -> "foo"). if target.endswith("()"): target = target[:-2] # Keep the leaf identifier for common qualified formats. for sep in ("::", ".", "#"): if sep in target: target = target.split(sep)[-1] # Strip non-identifier suffix/prefix noise. target = re.sub(r"^[^A-Za-z0-9_]+", "", target) target = re.sub(r"[^A-Za-z0-9_]+$", "", target) return target def _compute_graph_neighbors( store: DirIndexStore, *, max_depth: int = 2, logger: Optional[logging.Logger] = None, ) -> None: """Compute and persist N-hop neighbors for all symbols in a directory index.""" if max_depth <= 0: return log = logger or logging.getLogger(__name__) with store._lock: conn = store._get_connection() conn.row_factory = sqlite3.Row # Ensure schema exists even for older databases pinned to the same user_version. try: from codexlens.storage.migrations.migration_007_add_graph_neighbors import upgrade upgrade(conn) except Exception as exc: log.debug("Graph neighbor schema ensure failed: %s", exc) cursor = conn.cursor() try: cursor.execute("DELETE FROM graph_neighbors") except sqlite3.Error: # Table missing or schema mismatch; skip gracefully. return try: symbol_rows = cursor.execute( "SELECT id, file_id, name FROM symbols" ).fetchall() rel_rows = cursor.execute( "SELECT source_symbol_id, target_qualified_name FROM code_relationships" ).fetchall() except sqlite3.Error: return if not symbol_rows or not rel_rows: try: conn.commit() except sqlite3.Error: pass return symbol_file_by_id: Dict[int, int] = {} symbols_by_file_and_name: Dict[Tuple[int, str], List[int]] = {} symbols_by_name: Dict[str, List[int]] = {} for row in symbol_rows: symbol_id = int(row["id"]) file_id = int(row["file_id"]) name = str(row["name"]) symbol_file_by_id[symbol_id] = file_id symbols_by_file_and_name.setdefault((file_id, name), []).append(symbol_id) symbols_by_name.setdefault(name, []).append(symbol_id) adjacency: Dict[int, Set[int]] = {} for row in rel_rows: source_id = int(row["source_symbol_id"]) target_raw = str(row["target_qualified_name"] or "") target_name = _normalize_relationship_target(target_raw) if not target_name: continue source_file_id = symbol_file_by_id.get(source_id) if source_file_id is None: continue candidate_ids = symbols_by_file_and_name.get((source_file_id, target_name)) if not candidate_ids: global_candidates = symbols_by_name.get(target_name, []) # Only resolve cross-file by name when unambiguous. candidate_ids = global_candidates if len(global_candidates) == 1 else [] for target_id in candidate_ids: if target_id == source_id: continue adjacency.setdefault(source_id, set()).add(target_id) adjacency.setdefault(target_id, set()).add(source_id) if not adjacency: try: conn.commit() except sqlite3.Error: pass return insert_rows: List[Tuple[int, int, int]] = [] max_depth = min(int(max_depth), 2) for source_id, first_hop in adjacency.items(): if not first_hop: continue for neighbor_id in first_hop: insert_rows.append((source_id, neighbor_id, 1)) if max_depth < 2: continue second_hop: Set[int] = set() for neighbor_id in first_hop: second_hop.update(adjacency.get(neighbor_id, set())) second_hop.discard(source_id) second_hop.difference_update(first_hop) for neighbor_id in second_hop: insert_rows.append((source_id, neighbor_id, 2)) if not insert_rows: try: conn.commit() except sqlite3.Error: pass return try: cursor.executemany( """ INSERT INTO graph_neighbors( source_symbol_id, neighbor_symbol_id, relationship_depth ) VALUES(?, ?, ?) """, insert_rows, ) conn.commit() except sqlite3.Error: return # === Worker Function for ProcessPoolExecutor === def _build_dir_worker(args: tuple) -> DirBuildResult: """Worker function for parallel directory building. Must be at module level for ProcessPoolExecutor pickling. Reconstructs necessary objects from serializable arguments. Args: args: Tuple of (dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path) Returns: DirBuildResult for the directory """ dir_path, index_db_path, languages, config_dict, project_id, global_index_db_path = args # Reconstruct config config = Config( data_dir=Path(config_dict["data_dir"]), supported_languages=config_dict["supported_languages"], parsing_rules=config_dict["parsing_rules"], global_symbol_index_enabled=bool(config_dict.get("global_symbol_index_enabled", True)), ) parser_factory = ParserFactory(config) global_index: GlobalSymbolIndex | None = None try: # Ensure index directory exists index_db_path.parent.mkdir(parents=True, exist_ok=True) # Create directory index if config.global_symbol_index_enabled and global_index_db_path: global_index = GlobalSymbolIndex(Path(global_index_db_path), project_id=int(project_id)) global_index.initialize() store = DirIndexStore(index_db_path, config=config, global_index=global_index) store.initialize() files_count = 0 symbols_count = 0 # Index files in this directory for item in dir_path.iterdir(): if not item.is_file(): continue if item.name.startswith("."): continue language_id = config.language_for_path(item) if not language_id: continue if languages and language_id not in languages: continue try: text = item.read_text(encoding="utf-8", errors="ignore") parser = parser_factory.get_parser(language_id) indexed_file = parser.parse(text, item) store.add_file( name=item.name, full_path=item, content=text, language=language_id, symbols=indexed_file.symbols, relationships=indexed_file.relationships, ) files_count += 1 symbols_count += len(indexed_file.symbols) except Exception: continue if files_count > 0: _compute_graph_neighbors(store) # Get subdirectories ignore_dirs = { ".git", ".venv", "venv", "node_modules", "__pycache__", ".codexlens", ".idea", ".vscode", } subdirs = [ d.name for d in dir_path.iterdir() if d.is_dir() and d.name not in ignore_dirs and not d.name.startswith(".") ] store.update_merkle_root() store.close() if global_index is not None: global_index.close() return DirBuildResult( source_path=dir_path, index_path=index_db_path, files_count=files_count, symbols_count=symbols_count, subdirs=subdirs, ) except Exception as exc: if global_index is not None: try: global_index.close() except Exception: pass return DirBuildResult( source_path=dir_path, index_path=index_db_path, files_count=0, symbols_count=0, subdirs=[], error=str(exc), )