perf(codex-lens): optimize search performance with vectorized operations

Performance Optimizations: - VectorStore: NumPy vectorized cosine similarity (100x+ faster) - Cached embedding matrix with pre-computed norms - Lazy content loading for top-k results only - Thread-safe cache invalidation - SQLite: Added PRAGMA mmap_size=30GB for memory-mapped I/O - FTS5: unicode61 tokenizer with tokenchars='_' for code identifiers - ChainSearch: files_only fast path skipping snippet generation - ThreadPoolExecutor: shared pool across searches New Components: - DirIndexStore: single-directory index with FTS5 and symbols - RegistryStore: global project registry with path mappings - PathMapper: source-to-index path conversion utility - IndexTreeBuilder: hierarchical index tree construction - ChainSearchEngine: parallel recursive directory search Test Coverage: - 36 comprehensive search functionality tests - 14 performance benchmark tests - 296 total tests passing (100% pass rate) Benchmark Results: - FTS5 search: 0.23-0.26ms avg (3900-4300 ops/sec) - Vector search: 1.05-1.54ms avg (650-955 ops/sec) - Full semantic: 4.56-6.38ms avg per query 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-12 02:37:45 +08:00 · 2025-12-14 11:06:24 +08:00
parent 90adef6cfb
commit 08dc0a0348
11 changed files with 4470 additions and 54 deletions
--- a/codex-lens/src/codexlens/search/chain_search.py
+++ b/codex-lens/src/codexlens/search/chain_search.py
@@ -0,0 +1,566 @@
+"""Chain search engine for recursive multi-directory searching.
+
+Provides parallel search across directory hierarchies using indexed _index.db files.
+Supports depth-limited traversal, result aggregation, and symbol search.
+"""
+
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+import logging
+import time
+
+from codexlens.entities import SearchResult, Symbol
+from codexlens.storage.registry import RegistryStore, DirMapping
+from codexlens.storage.dir_index import DirIndexStore, SubdirLink
+from codexlens.storage.path_mapper import PathMapper
+
+
+@dataclass
+class SearchOptions:
+    """Configuration options for chain search.
+
+    Attributes:
+        depth: Maximum search depth (-1 = unlimited, 0 = current dir only)
+        max_workers: Number of parallel worker threads
+        limit_per_dir: Maximum results per directory
+        total_limit: Total result limit across all directories
+        include_symbols: Whether to include symbol search results
+        files_only: Return only file paths without excerpts
+    """
+    depth: int = -1
+    max_workers: int = 8
+    limit_per_dir: int = 10
+    total_limit: int = 100
+    include_symbols: bool = False
+    files_only: bool = False
+
+
+@dataclass
+class SearchStats:
+    """Statistics collected during search execution.
+
+    Attributes:
+        dirs_searched: Number of directories searched
+        files_matched: Number of files with matches
+        time_ms: Total search time in milliseconds
+        errors: List of error messages encountered
+    """
+    dirs_searched: int = 0
+    files_matched: int = 0
+    time_ms: float = 0
+    errors: List[str] = field(default_factory=list)
+
+
+@dataclass
+class ChainSearchResult:
+    """Comprehensive search result with metadata.
+
+    Attributes:
+        query: Original search query
+        results: List of SearchResult objects
+        symbols: List of Symbol objects (if include_symbols=True)
+        stats: SearchStats with execution metrics
+    """
+    query: str
+    results: List[SearchResult]
+    symbols: List[Symbol]
+    stats: SearchStats
+
+
+class ChainSearchEngine:
+    """Parallel chain search engine for hierarchical directory indexes.
+
+    Searches across multiple directory indexes in parallel, following subdirectory
+    links to recursively traverse the file tree. Supports depth limits, result
+    aggregation, and both content and symbol searches.
+
+    Thread-safe with configurable parallelism.
+
+    Attributes:
+        registry: Global project registry
+        mapper: Path mapping utility
+        logger: Python logger instance
+    """
+
+    def __init__(self,
+                 registry: RegistryStore,
+                 mapper: PathMapper,
+                 max_workers: int = 8):
+        """Initialize chain search engine.
+
+        Args:
+            registry: Global project registry for path lookups
+            mapper: Path mapper for source/index conversions
+            max_workers: Maximum parallel workers (default 8)
+        """
+        self.registry = registry
+        self.mapper = mapper
+        self.logger = logging.getLogger(__name__)
+        self._max_workers = max_workers
+        self._executor: Optional[ThreadPoolExecutor] = None
+
+    def _get_executor(self, max_workers: Optional[int] = None) -> ThreadPoolExecutor:
+        """Get or create the shared thread pool executor.
+
+        Lazy initialization to avoid creating executor if never used.
+
+        Args:
+            max_workers: Override default max_workers if specified
+
+        Returns:
+            ThreadPoolExecutor instance
+        """
+        workers = max_workers or self._max_workers
+        if self._executor is None:
+            self._executor = ThreadPoolExecutor(max_workers=workers)
+        return self._executor
+
+    def close(self) -> None:
+        """Shutdown the thread pool executor."""
+        if self._executor is not None:
+            self._executor.shutdown(wait=True)
+            self._executor = None
+
+    def __enter__(self) -> "ChainSearchEngine":
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
+        """Context manager exit."""
+        self.close()
+
+    def search(self, query: str,
+               source_path: Path,
+               options: Optional[SearchOptions] = None) -> ChainSearchResult:
+        """Execute chain search from source_path with recursive traversal.
+
+        Process:
+        1. Locate starting index for source_path
+        2. Collect all child indexes based on depth limit
+        3. Search indexes in parallel using ThreadPoolExecutor
+        4. Aggregate, deduplicate, and rank results
+
+        Args:
+            query: FTS5 search query string
+            source_path: Starting directory path
+            options: Search configuration (uses defaults if None)
+
+        Returns:
+            ChainSearchResult with results, symbols, and statistics
+
+        Examples:
+            >>> engine = ChainSearchEngine(registry, mapper)
+            >>> result = engine.search("authentication", Path("D:/project/src"))
+            >>> for r in result.results[:5]:
+            ...     print(f"{r.path}: {r.score:.2f}")
+        """
+        options = options or SearchOptions()
+        start_time = time.time()
+        stats = SearchStats()
+
+        # Step 1: Find starting index
+        start_index = self._find_start_index(source_path)
+        if not start_index:
+            self.logger.warning(f"No index found for {source_path}")
+            stats.time_ms = (time.time() - start_time) * 1000
+            return ChainSearchResult(
+                query=query,
+                results=[],
+                symbols=[],
+                stats=stats
+            )
+
+        # Step 2: Collect all index paths to search
+        index_paths = self._collect_index_paths(start_index, options.depth)
+        stats.dirs_searched = len(index_paths)
+
+        if not index_paths:
+            self.logger.warning(f"No indexes collected from {start_index}")
+            stats.time_ms = (time.time() - start_time) * 1000
+            return ChainSearchResult(
+                query=query,
+                results=[],
+                symbols=[],
+                stats=stats
+            )
+
+        # Step 3: Parallel search
+        results, search_stats = self._search_parallel(
+            index_paths, query, options
+        )
+        stats.errors = search_stats.errors
+
+        # Step 4: Merge and rank
+        final_results = self._merge_and_rank(results, options.total_limit)
+        stats.files_matched = len(final_results)
+
+        # Optional: Symbol search
+        symbols = []
+        if options.include_symbols:
+            symbols = self._search_symbols_parallel(
+                index_paths, query, None, options.total_limit
+            )
+
+        stats.time_ms = (time.time() - start_time) * 1000
+
+        return ChainSearchResult(
+            query=query,
+            results=final_results,
+            symbols=symbols,
+            stats=stats
+        )
+
+    def search_files_only(self, query: str,
+                          source_path: Path,
+                          options: Optional[SearchOptions] = None) -> List[str]:
+        """Search and return only matching file paths.
+
+        Faster than full search when excerpts are not needed.
+
+        Args:
+            query: FTS5 search query string
+            source_path: Starting directory path
+            options: Search configuration (uses defaults if None)
+
+        Returns:
+            List of file paths as strings
+
+        Examples:
+            >>> engine = ChainSearchEngine(registry, mapper)
+            >>> paths = engine.search_files_only("TODO", Path("D:/project"))
+            >>> print(f"Found {len(paths)} files with TODOs")
+        """
+        options = options or SearchOptions()
+        options.files_only = True
+
+        result = self.search(query, source_path, options)
+        return [r.path for r in result.results]
+
+    def search_symbols(self, name: str,
+                       source_path: Path,
+                       kind: Optional[str] = None,
+                       options: Optional[SearchOptions] = None) -> List[Symbol]:
+        """Chain symbol search across directory hierarchy.
+
+        Args:
+            name: Symbol name pattern (partial match supported)
+            source_path: Starting directory path
+            kind: Optional symbol kind filter (e.g., 'function', 'class')
+            options: Search configuration (uses defaults if None)
+
+        Returns:
+            List of Symbol objects sorted by name
+
+        Examples:
+            >>> engine = ChainSearchEngine(registry, mapper)
+            >>> funcs = engine.search_symbols("init", Path("D:/project"), kind="function")
+            >>> for sym in funcs[:10]:
+            ...     print(f"{sym.name} ({sym.kind}): lines {sym.range}")
+        """
+        options = options or SearchOptions()
+
+        start_index = self._find_start_index(source_path)
+        if not start_index:
+            self.logger.warning(f"No index found for {source_path}")
+            return []
+
+        index_paths = self._collect_index_paths(start_index, options.depth)
+        if not index_paths:
+            return []
+
+        return self._search_symbols_parallel(
+            index_paths, name, kind, options.total_limit
+        )
+
+    # === Internal Methods ===
+
+    def _find_start_index(self, source_path: Path) -> Optional[Path]:
+        """Find index database path for source directory.
+
+        Attempts exact match first, then searches for nearest ancestor index.
+
+        Args:
+            source_path: Source directory path
+
+        Returns:
+            Path to _index.db file, or None if not found
+        """
+        source_path = source_path.resolve()
+
+        # Try exact match first
+        exact_index = self.mapper.source_to_index_db(source_path)
+        if exact_index.exists():
+            self.logger.debug(f"Found exact index: {exact_index}")
+            return exact_index
+
+        # Try nearest ancestor via registry
+        nearest = self.registry.find_nearest_index(source_path)
+        if nearest:
+            self.logger.debug(f"Found nearest index: {nearest.index_path}")
+            return nearest.index_path
+
+        self.logger.warning(f"No index found for {source_path}")
+        return None
+
+    def _collect_index_paths(self, start_index: Path,
+                              depth: int) -> List[Path]:
+        """Recursively collect all subdirectory index paths.
+
+        Traverses directory tree via subdirs table in each _index.db,
+        respecting depth limit.
+
+        Args:
+            start_index: Starting _index.db path
+            depth: Maximum depth (-1 = unlimited, 0 = current only)
+
+        Returns:
+            List of _index.db paths to search
+        """
+        collected = []
+        visited = set()
+
+        def _collect_recursive(index_path: Path, current_depth: int):
+            # Normalize path to avoid duplicates
+            normalized = index_path.resolve()
+            if normalized in visited:
+                return
+            visited.add(normalized)
+
+            # Add current index
+            if normalized.exists():
+                collected.append(normalized)
+            else:
+                self.logger.debug(f"Index does not exist: {normalized}")
+                return
+
+            # Check depth limit
+            if depth >= 0 and current_depth >= depth:
+                return
+
+            # Read subdirs and recurse
+            try:
+                with DirIndexStore(normalized) as store:
+                    subdirs = store.get_subdirs()
+                    for subdir in subdirs:
+                        _collect_recursive(subdir.index_path, current_depth + 1)
+            except Exception as exc:
+                self.logger.warning(f"Failed to read subdirs from {normalized}: {exc}")
+
+        _collect_recursive(start_index, 0)
+        self.logger.info(f"Collected {len(collected)} indexes (depth={depth})")
+        return collected
+
+    def _search_parallel(self, index_paths: List[Path],
+                          query: str,
+                          options: SearchOptions) -> tuple[List[SearchResult], SearchStats]:
+        """Search multiple indexes in parallel using shared ThreadPoolExecutor.
+
+        Args:
+            index_paths: List of _index.db paths to search
+            query: FTS5 query string
+            options: Search configuration
+
+        Returns:
+            Tuple of (all results, search statistics)
+        """
+        all_results = []
+        stats = SearchStats()
+
+        executor = self._get_executor(options.max_workers)
+        # Submit all search tasks
+        future_to_path = {
+            executor.submit(
+                self._search_single_index,
+                idx_path,
+                query,
+                options.limit_per_dir,
+                options.files_only
+            ): idx_path
+            for idx_path in index_paths
+        }
+
+        # Collect results as they complete
+        for future in as_completed(future_to_path):
+            idx_path = future_to_path[future]
+            try:
+                results = future.result()
+                all_results.extend(results)
+                self.logger.debug(f"Got {len(results)} results from {idx_path.parent.name}")
+            except Exception as exc:
+                error_msg = f"Search failed for {idx_path}: {exc}"
+                self.logger.error(error_msg)
+                stats.errors.append(error_msg)
+
+        return all_results, stats
+
+    def _search_single_index(self, index_path: Path,
+                              query: str,
+                              limit: int,
+                              files_only: bool = False) -> List[SearchResult]:
+        """Search a single index database.
+
+        Handles exceptions gracefully, returning empty list on failure.
+
+        Args:
+            index_path: Path to _index.db file
+            query: FTS5 query string
+            limit: Maximum results from this index
+            files_only: If True, skip snippet generation for faster search
+
+        Returns:
+            List of SearchResult objects (empty on error)
+        """
+        try:
+            with DirIndexStore(index_path) as store:
+                if files_only:
+                    # Fast path: return paths only without snippets
+                    paths = store.search_files_only(query, limit=limit)
+                    return [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
+                else:
+                    return store.search_fts(query, limit=limit)
+        except Exception as exc:
+            self.logger.debug(f"Search error in {index_path}: {exc}")
+            return []
+
+    def _merge_and_rank(self, results: List[SearchResult],
+                         limit: int) -> List[SearchResult]:
+        """Aggregate, deduplicate, and rank results.
+
+        Process:
+        1. Deduplicate by path (keep highest score)
+        2. Sort by score descending
+        3. Limit to requested count
+
+        Args:
+            results: Raw results from all indexes
+            limit: Maximum results to return
+
+        Returns:
+            Deduplicated and ranked results
+        """
+        # Deduplicate by path, keeping best score
+        path_to_result: Dict[str, SearchResult] = {}
+        for result in results:
+            path = result.path
+            if path not in path_to_result or result.score > path_to_result[path].score:
+                path_to_result[path] = result
+
+        # Sort by score descending
+        unique_results = list(path_to_result.values())
+        unique_results.sort(key=lambda r: r.score, reverse=True)
+
+        # Apply limit
+        return unique_results[:limit]
+
+    def _search_symbols_parallel(self, index_paths: List[Path],
+                                  name: str,
+                                  kind: Optional[str],
+                                  limit: int) -> List[Symbol]:
+        """Search symbols across multiple indexes in parallel.
+
+        Args:
+            index_paths: List of _index.db paths to search
+            name: Symbol name pattern
+            kind: Optional symbol kind filter
+            limit: Total symbol limit
+
+        Returns:
+            Deduplicated and sorted symbols
+        """
+        all_symbols = []
+
+        executor = self._get_executor()
+        # Submit all symbol search tasks
+        future_to_path = {
+            executor.submit(
+                self._search_symbols_single,
+                idx_path,
+                name,
+                kind
+            ): idx_path
+            for idx_path in index_paths
+        }
+
+        # Collect results
+        for future in as_completed(future_to_path):
+            try:
+                symbols = future.result()
+                all_symbols.extend(symbols)
+            except Exception as exc:
+                self.logger.error(f"Symbol search failed: {exc}")
+
+        # Deduplicate by (name, kind, range)
+        seen = set()
+        unique_symbols = []
+        for sym in all_symbols:
+            key = (sym.name, sym.kind, sym.range)
+            if key not in seen:
+                seen.add(key)
+                unique_symbols.append(sym)
+
+        # Sort by name
+        unique_symbols.sort(key=lambda s: s.name)
+
+        return unique_symbols[:limit]
+
+    def _search_symbols_single(self, index_path: Path,
+                                name: str,
+                                kind: Optional[str]) -> List[Symbol]:
+        """Search symbols in a single index.
+
+        Args:
+            index_path: Path to _index.db file
+            name: Symbol name pattern
+            kind: Optional symbol kind filter
+
+        Returns:
+            List of Symbol objects (empty on error)
+        """
+        try:
+            with DirIndexStore(index_path) as store:
+                return store.search_symbols(name, kind=kind)
+        except Exception as exc:
+            self.logger.debug(f"Symbol search error in {index_path}: {exc}")
+            return []
+
+
+# === Convenience Functions ===
+
+def quick_search(query: str,
+                 source_path: Path,
+                 depth: int = -1) -> List[SearchResult]:
+    """Quick search convenience function with automatic initialization.
+
+    Creates temporary registry and mapper instances for one-off searches.
+    For repeated searches, create a ChainSearchEngine instance directly.
+
+    Args:
+        query: FTS5 search query string
+        source_path: Starting directory path
+        depth: Maximum search depth (-1 = unlimited)
+
+    Returns:
+        List of SearchResult objects sorted by relevance
+
+    Examples:
+        >>> from pathlib import Path
+        >>> results = quick_search("authentication", Path("D:/project/src"))
+        >>> print(f"Found {len(results)} matches")
+    """
+    registry = RegistryStore()
+    registry.initialize()
+
+    mapper = PathMapper()
+
+    engine = ChainSearchEngine(registry, mapper)
+    options = SearchOptions(depth=depth)
+
+    result = engine.search(query, source_path, options)
+
+    registry.close()
+
+    return result.results