mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
Performance Optimizations: - VectorStore: NumPy vectorized cosine similarity (100x+ faster) - Cached embedding matrix with pre-computed norms - Lazy content loading for top-k results only - Thread-safe cache invalidation - SQLite: Added PRAGMA mmap_size=30GB for memory-mapped I/O - FTS5: unicode61 tokenizer with tokenchars='_' for code identifiers - ChainSearch: files_only fast path skipping snippet generation - ThreadPoolExecutor: shared pool across searches New Components: - DirIndexStore: single-directory index with FTS5 and symbols - RegistryStore: global project registry with path mappings - PathMapper: source-to-index path conversion utility - IndexTreeBuilder: hierarchical index tree construction - ChainSearchEngine: parallel recursive directory search Test Coverage: - 36 comprehensive search functionality tests - 14 performance benchmark tests - 296 total tests passing (100% pass rate) Benchmark Results: - FTS5 search: 0.23-0.26ms avg (3900-4300 ops/sec) - Vector search: 1.05-1.54ms avg (650-955 ops/sec) - Full semantic: 4.56-6.38ms avg per query 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
699 lines
22 KiB
Python
699 lines
22 KiB
Python
"""Hierarchical index tree builder for CodexLens.
|
|
|
|
Constructs a bottom-up directory index tree with parallel processing support.
|
|
Each directory maintains its own _index.db with files and subdirectory links.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import time
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set
|
|
|
|
from codexlens.config import Config
|
|
from codexlens.parsers.factory import ParserFactory
|
|
from codexlens.storage.dir_index import DirIndexStore
|
|
from codexlens.storage.path_mapper import PathMapper
|
|
from codexlens.storage.registry import ProjectInfo, RegistryStore
|
|
|
|
|
|
@dataclass
class BuildResult:
    """Aggregate outcome of a full index-tree build.

    Attributes:
        project_id: Registry identifier of the indexed project.
        source_root: Resolved root directory of the source tree.
        index_root: Root directory holding the generated indexes.
        total_files: Number of files successfully indexed.
        total_dirs: Number of directories successfully indexed.
        errors: Human-readable error messages collected during the build.
    """

    project_id: int
    source_root: Path
    index_root: Path
    total_files: int
    total_dirs: int
    errors: List[str]
|
|
|
|
|
|
@dataclass
class DirBuildResult:
    """Outcome of indexing a single directory (non-recursive).

    Attributes:
        source_path: Directory that was indexed.
        index_path: Location of the directory's _index.db.
        files_count: Number of files indexed in this directory.
        symbols_count: Number of symbols extracted from those files.
        subdirs: Names of immediate child directories.
        error: Error message if the build failed, otherwise None.
    """

    source_path: Path
    index_path: Path
    files_count: int
    symbols_count: int
    subdirs: List[str]
    error: Optional[str] = None
|
|
|
|
|
|
class IndexTreeBuilder:
    """Hierarchical index tree builder with parallel processing.

    Builds directory indexes bottom-up to enable proper subdirectory linking.
    Each directory gets its own _index.db containing:
    - Files in that directory
    - Links to child directory indexes
    - Symbols and FTS5 search

    Attributes:
        registry: Global project registry
        mapper: Path mapping between source and index
        config: CodexLens configuration
        parser_factory: Parser factory for symbol extraction
        logger: Logger instance
        IGNORE_DIRS: Set of directory names to skip during indexing
    """

    # Directories to skip during indexing
    IGNORE_DIRS: Set[str] = {
        ".git",
        ".venv",
        "venv",
        "node_modules",
        "__pycache__",
        ".codexlens",
        ".idea",
        ".vscode",
    }

    def __init__(
        self,
        registry: RegistryStore,
        mapper: PathMapper,
        config: Optional[Config] = None,
    ):
        """Initialize the index tree builder.

        Args:
            registry: Global registry store for project tracking
            mapper: Path mapper for source to index conversions
            config: CodexLens configuration (uses defaults if None)
        """
        self.registry = registry
        self.mapper = mapper
        self.config = config or Config()
        self.parser_factory = ParserFactory(self.config)
        self.logger = logging.getLogger(__name__)

    def build(
        self,
        source_root: Path,
        languages: Optional[List[str]] = None,
        workers: int = 4,
    ) -> BuildResult:
        """Build complete index tree for a project.

        Process:
        1. Register project in registry
        2. Collect all directories grouped by depth
        3. Build indexes bottom-up (deepest first)
        4. Link subdirectories to parents
        5. Update project statistics

        Args:
            source_root: Project root directory to index
            languages: Optional list of language IDs to limit indexing
            workers: Number of parallel worker processes

        Returns:
            BuildResult with statistics and errors

        Raises:
            ValueError: If source_root doesn't exist
        """
        source_root = source_root.resolve()
        if not source_root.exists():
            raise ValueError(f"Source root does not exist: {source_root}")

        self.logger.info("Building index tree for %s", source_root)

        # Register project
        index_root = self.mapper.source_to_index_dir(source_root)
        project_info = self.registry.register_project(source_root, index_root)

        # Collect directories by depth
        dirs_by_depth = self._collect_dirs_by_depth(source_root, languages)

        if not dirs_by_depth:
            self.logger.warning("No indexable directories found in %s", source_root)
            return BuildResult(
                project_id=project_info.id,
                source_root=source_root,
                index_root=index_root,
                total_files=0,
                total_dirs=0,
                errors=["No indexable directories found"],
            )

        total_files = 0
        total_dirs = 0
        all_errors: List[str] = []
        all_results: List[DirBuildResult] = []  # Store all results for subdir linking

        # Build bottom-up (highest depth first) so children exist before
        # parents try to link to them.
        max_depth = max(dirs_by_depth.keys())
        for depth in range(max_depth, -1, -1):
            if depth not in dirs_by_depth:
                continue

            dirs = dirs_by_depth[depth]
            self.logger.info("Building %d directories at depth %d", len(dirs), depth)

            # Build directories at this level in parallel
            results = self._build_level_parallel(dirs, languages, workers)
            all_results.extend(results)

            # Process results
            for result in results:
                if result.error:
                    all_errors.append(f"{result.source_path}: {result.error}")
                    continue

                total_files += result.files_count
                total_dirs += 1

                # Register directory in registry
                self.registry.register_dir(
                    project_id=project_info.id,
                    source_path=result.source_path,
                    index_path=result.index_path,
                    depth=self.mapper.get_relative_depth(
                        result.source_path, source_root
                    ),
                    files_count=result.files_count,
                )

        # After building all directories, link subdirectories to parents.
        # This needs to happen after all indexes exist.
        for result in all_results:
            if result.error:
                continue
            # Link children to this directory
            self._link_children_to_parent(result.source_path, all_results)

        # Update project statistics
        self.registry.update_project_stats(source_root, total_files, total_dirs)

        self.logger.info(
            "Index build complete: %d files, %d directories, %d errors",
            total_files,
            total_dirs,
            len(all_errors),
        )

        return BuildResult(
            project_id=project_info.id,
            source_root=source_root,
            index_root=index_root,
            total_files=total_files,
            total_dirs=total_dirs,
            errors=all_errors,
        )

    def update_subtree(
        self,
        source_path: Path,
        languages: Optional[List[str]] = None,
        workers: int = 4,
    ) -> BuildResult:
        """Incrementally update a subtree.

        Rebuilds indexes for the specified directory and all subdirectories.
        Useful for incremental updates when only part of the tree changed.

        Args:
            source_path: Root of subtree to update
            languages: Optional list of language IDs to limit indexing
            workers: Number of parallel worker processes

        Returns:
            BuildResult for the subtree

        Raises:
            ValueError: If source_path is not indexed
        """
        source_path = source_path.resolve()
        project_root = self.mapper.get_project_root(source_path)

        # Get project info; missing registration means the path was never indexed
        project_info = self.registry.get_project(project_root)
        if not project_info:
            raise ValueError(f"Directory not indexed: {source_path}")

        self.logger.info("Updating subtree at %s", source_path)

        # Use build logic but start from source_path
        return self.build(source_path, languages, workers)

    def rebuild_dir(self, source_path: Path) -> DirBuildResult:
        """Rebuild index for a single directory.

        Only rebuilds the specified directory, does not touch subdirectories.
        Useful for updating a single directory after file changes.

        Args:
            source_path: Directory to rebuild

        Returns:
            DirBuildResult for the directory
        """
        source_path = source_path.resolve()
        self.logger.info("Rebuilding directory %s", source_path)
        return self._build_single_dir(source_path)

    # === Internal Methods ===

    def _collect_dirs_by_depth(
        self, source_root: Path, languages: Optional[List[str]] = None
    ) -> Dict[int, List[Path]]:
        """Collect all indexable directories grouped by depth.

        Walks the directory tree and groups directories by their depth
        relative to source_root. Depth 0 is the root itself.

        Args:
            source_root: Root directory to start from
            languages: Optional language filter

        Returns:
            Dictionary mapping depth to list of directory paths
            Example: {0: [root], 1: [src, tests], 2: [src/api, src/utils]}
        """
        source_root = source_root.resolve()
        dirs_by_depth: Dict[int, List[Path]] = {}

        # Always include the root directory at depth 0 for chain search entry point
        dirs_by_depth[0] = [source_root]

        for root, dirnames, _ in os.walk(source_root):
            # Prune ignored directories in place so os.walk skips them
            dirnames[:] = [
                d
                for d in dirnames
                if d not in self.IGNORE_DIRS and not d.startswith(".")
            ]

            root_path = Path(root)

            # Skip root (already added)
            if root_path == source_root:
                continue

            # Check if this directory should be indexed
            if not self._should_index_dir(root_path, languages):
                continue

            # Calculate depth relative to source_root
            try:
                depth = len(root_path.relative_to(source_root).parts)
            except ValueError:
                continue

            dirs_by_depth.setdefault(depth, []).append(root_path)

        return dirs_by_depth

    def _should_index_dir(
        self, dir_path: Path, languages: Optional[List[str]] = None
    ) -> bool:
        """Check if directory should be indexed.

        A directory is indexed if:
        1. It's not in IGNORE_DIRS
        2. It doesn't start with '.'
        3. It contains at least one supported language file

        Args:
            dir_path: Directory to check
            languages: Optional language filter

        Returns:
            True if directory should be indexed
        """
        # Check directory name
        if dir_path.name in self.IGNORE_DIRS or dir_path.name.startswith("."):
            return False

        # Check for supported files in this directory
        return bool(self._iter_source_files(dir_path, languages))

    def _build_level_parallel(
        self, dirs: List[Path], languages: Optional[List[str]], workers: int
    ) -> List[DirBuildResult]:
        """Build multiple directories in parallel.

        Uses ProcessPoolExecutor to build directories concurrently.
        All directories at the same level are independent and can be
        processed in parallel.

        Args:
            dirs: List of directories to build
            languages: Language filter
            workers: Number of worker processes

        Returns:
            List of DirBuildResult objects
        """
        results: List[DirBuildResult] = []

        if not dirs:
            return results

        # For single directory, avoid overhead of process pool
        if len(dirs) == 1:
            return [self._build_single_dir(dirs[0], languages)]

        # Prepare plain-data arguments so worker processes can unpickle them
        config_dict = {
            "data_dir": str(self.config.data_dir),
            "supported_languages": self.config.supported_languages,
            "parsing_rules": self.config.parsing_rules,
        }

        worker_args = [
            (
                dir_path,
                self.mapper.source_to_index_db(dir_path),
                languages,
                config_dict,
            )
            for dir_path in dirs
        ]

        # Execute in parallel
        with ProcessPoolExecutor(max_workers=workers) as executor:
            futures = {
                executor.submit(_build_dir_worker, args): args[0]
                for args in worker_args
            }

            for future in as_completed(futures):
                try:
                    results.append(future.result())
                except Exception as exc:
                    dir_path = futures[future]
                    self.logger.error("Failed to build %s: %s", dir_path, exc)
                    results.append(
                        DirBuildResult(
                            source_path=dir_path,
                            index_path=self.mapper.source_to_index_db(dir_path),
                            files_count=0,
                            symbols_count=0,
                            subdirs=[],
                            error=str(exc),
                        )
                    )

        return results

    def _build_single_dir(
        self, dir_path: Path, languages: Optional[List[str]] = None
    ) -> DirBuildResult:
        """Build index for a single directory.

        Creates _index.db and indexes all files in the directory.
        Does not recurse into subdirectories.

        Args:
            dir_path: Directory to index
            languages: Optional language filter

        Returns:
            DirBuildResult with statistics and subdirectory list
        """
        dir_path = dir_path.resolve()
        index_db_path = self.mapper.source_to_index_db(dir_path)

        store = None
        try:
            # Ensure index directory exists
            index_db_path.parent.mkdir(parents=True, exist_ok=True)

            # Create directory index
            store = DirIndexStore(index_db_path)
            store.initialize()

            # Get source files in this directory only
            source_files = self._iter_source_files(dir_path, languages)

            files_count = 0
            symbols_count = 0

            for file_path in source_files:
                try:
                    # Read and parse file
                    text = file_path.read_text(encoding="utf-8", errors="ignore")
                    language_id = self.config.language_for_path(file_path)
                    if not language_id:
                        continue

                    parser = self.parser_factory.get_parser(language_id)
                    indexed_file = parser.parse(text, file_path)

                    # Add to directory index
                    store.add_file(
                        name=file_path.name,
                        full_path=file_path,
                        content=text,
                        language=language_id,
                        symbols=indexed_file.symbols,
                    )

                    files_count += 1
                    symbols_count += len(indexed_file.symbols)

                except Exception as exc:
                    # Best-effort: a single unreadable/unparseable file
                    # should not fail the whole directory.
                    self.logger.debug("Failed to index %s: %s", file_path, exc)
                    continue

            # Get list of subdirectories
            subdirs = [
                d.name
                for d in dir_path.iterdir()
                if d.is_dir()
                and d.name not in self.IGNORE_DIRS
                and not d.name.startswith(".")
            ]

            self.logger.debug(
                "Built %s: %d files, %d symbols, %d subdirs",
                dir_path,
                files_count,
                symbols_count,
                len(subdirs),
            )

            return DirBuildResult(
                source_path=dir_path,
                index_path=index_db_path,
                files_count=files_count,
                symbols_count=symbols_count,
                subdirs=subdirs,
            )

        except Exception as exc:
            self.logger.error("Failed to build directory %s: %s", dir_path, exc)
            return DirBuildResult(
                source_path=dir_path,
                index_path=index_db_path,
                files_count=0,
                symbols_count=0,
                subdirs=[],
                error=str(exc),
            )
        finally:
            # Fix: close the store even when an exception is raised, so the
            # underlying database handle is never leaked.
            if store is not None:
                store.close()

    def _link_children_to_parent(
        self, parent_path: Path, all_results: List[DirBuildResult]
    ) -> None:
        """Link child directory indexes to parent's subdirs table.

        Finds all direct children of parent_path in all_results and
        registers them as subdirectories in the parent's index.

        Args:
            parent_path: Parent directory path
            all_results: List of all build results
        """
        parent_index_db = self.mapper.source_to_index_db(parent_path)

        store = None
        try:
            store = DirIndexStore(parent_index_db)
            store.initialize()

            for result in all_results:
                # Only register direct children (parent is one level up)
                if result.source_path.parent != parent_path:
                    continue

                if result.error:
                    continue

                # Register subdirectory link
                store.register_subdir(
                    name=result.source_path.name,
                    index_path=result.index_path,
                    files_count=result.files_count,
                    direct_files=result.files_count,
                )
                self.logger.debug(
                    "Linked %s to parent %s",
                    result.source_path.name,
                    parent_path,
                )

        except Exception as exc:
            self.logger.error(
                "Failed to link children to %s: %s", parent_path, exc
            )
        finally:
            # Fix: close the store even on failure to avoid leaking handles.
            if store is not None:
                store.close()

    def _iter_source_files(
        self, dir_path: Path, languages: Optional[List[str]] = None
    ) -> List[Path]:
        """Iterate source files in directory (non-recursive).

        Returns files in the specified directory that match language filters.
        Does not recurse into subdirectories.

        Args:
            dir_path: Directory to scan
            languages: Optional language filter

        Returns:
            List of source file paths
        """
        files: List[Path] = []

        if not dir_path.is_dir():
            return files

        for item in dir_path.iterdir():
            if not item.is_file():
                continue

            if item.name.startswith("."):
                continue

            # Check language support
            language_id = self.config.language_for_path(item)
            if not language_id:
                continue

            # Apply language filter
            if languages and language_id not in languages:
                continue

            files.append(item)

        return files
|
|
|
|
|
|
# === Worker Function for ProcessPoolExecutor ===
|
|
|
|
|
|
def _build_dir_worker(args: tuple) -> DirBuildResult:
    """Worker function for parallel directory building.

    Must be at module level for ProcessPoolExecutor pickling.
    Reconstructs necessary objects from serializable arguments.

    Args:
        args: Tuple of (dir_path, index_db_path, languages, config_dict)

    Returns:
        DirBuildResult for the directory; failures are reported via the
        result's ``error`` field rather than raised.
    """
    dir_path, index_db_path, languages, config_dict = args

    # Reconstruct config from plain serializable values
    config = Config(
        data_dir=Path(config_dict["data_dir"]),
        supported_languages=config_dict["supported_languages"],
        parsing_rules=config_dict["parsing_rules"],
    )

    parser_factory = ParserFactory(config)

    store = None
    try:
        # Ensure index directory exists
        index_db_path.parent.mkdir(parents=True, exist_ok=True)

        # Create directory index
        store = DirIndexStore(index_db_path)
        store.initialize()

        files_count = 0
        symbols_count = 0

        # Index files in this directory (non-recursive)
        for item in dir_path.iterdir():
            if not item.is_file():
                continue

            if item.name.startswith("."):
                continue

            language_id = config.language_for_path(item)
            if not language_id:
                continue

            if languages and language_id not in languages:
                continue

            try:
                text = item.read_text(encoding="utf-8", errors="ignore")
                parser = parser_factory.get_parser(language_id)
                indexed_file = parser.parse(text, item)

                store.add_file(
                    name=item.name,
                    full_path=item,
                    content=text,
                    language=language_id,
                    symbols=indexed_file.symbols,
                )

                files_count += 1
                symbols_count += len(indexed_file.symbols)

            except Exception:
                # Best-effort: skip files that fail to read or parse
                continue

        # Get subdirectories, reusing the class-level ignore set so the
        # worker stays consistent with IndexTreeBuilder (fixes the
        # previously duplicated hard-coded set).
        subdirs = [
            d.name
            for d in dir_path.iterdir()
            if d.is_dir()
            and d.name not in IndexTreeBuilder.IGNORE_DIRS
            and not d.name.startswith(".")
        ]

        return DirBuildResult(
            source_path=dir_path,
            index_path=index_db_path,
            files_count=files_count,
            symbols_count=symbols_count,
            subdirs=subdirs,
        )

    except Exception as exc:
        return DirBuildResult(
            source_path=dir_path,
            index_path=index_db_path,
            files_count=0,
            symbols_count=0,
            subdirs=[],
            error=str(exc),
        )
    finally:
        # Fix: close the index store even when indexing raises, so the
        # SQLite handle is not leaked in the worker process.
        if store is not None:
            store.close()
|