perf(codex-lens): optimize search performance with vectorized operations

Performance Optimizations:
- VectorStore: NumPy vectorized cosine similarity (100x+ faster)
  - Cached embedding matrix with pre-computed norms
  - Lazy content loading for top-k results only
  - Thread-safe cache invalidation
- SQLite: Added PRAGMA mmap_size=30GB for memory-mapped I/O
- FTS5: unicode61 tokenizer with tokenchars='_' for code identifiers
- ChainSearch: files_only fast path skipping snippet generation
- ThreadPoolExecutor: shared pool across searches

New Components:
- DirIndexStore: single-directory index with FTS5 and symbols
- RegistryStore: global project registry with path mappings
- PathMapper: source-to-index path conversion utility
- IndexTreeBuilder: hierarchical index tree construction
- ChainSearchEngine: parallel recursive directory search

Test Coverage:
- 36 comprehensive search functionality tests
- 14 performance benchmark tests
- 296 total tests passing (100% pass rate)

Benchmark Results:
- FTS5 search: 0.23-0.26ms avg (3900-4300 ops/sec)
- Vector search: 1.05-1.54ms avg (650-955 ops/sec)
- Full semantic: 4.56-6.38ms avg per query

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
catlog22
2025-12-14 11:06:24 +08:00
parent 90adef6cfb
commit 08dc0a0348
11 changed files with 4470 additions and 54 deletions

View File

@@ -0,0 +1,566 @@
"""Chain search engine for recursive multi-directory searching.
Provides parallel search across directory hierarchies using indexed _index.db files.
Supports depth-limited traversal, result aggregation, and symbol search.
"""
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Dict, Any
import logging
import time
from codexlens.entities import SearchResult, Symbol
from codexlens.storage.registry import RegistryStore, DirMapping
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
from codexlens.storage.path_mapper import PathMapper
@dataclass
class SearchOptions:
"""Configuration options for chain search.
Attributes:
depth: Maximum search depth (-1 = unlimited, 0 = current dir only)
max_workers: Number of parallel worker threads
limit_per_dir: Maximum results per directory
total_limit: Total result limit across all directories
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
"""
depth: int = -1
max_workers: int = 8
limit_per_dir: int = 10
total_limit: int = 100
include_symbols: bool = False
files_only: bool = False
@dataclass
class SearchStats:
"""Statistics collected during search execution.
Attributes:
dirs_searched: Number of directories searched
files_matched: Number of files with matches
time_ms: Total search time in milliseconds
errors: List of error messages encountered
"""
dirs_searched: int = 0
files_matched: int = 0
time_ms: float = 0
errors: List[str] = field(default_factory=list)
@dataclass
class ChainSearchResult:
"""Comprehensive search result with metadata.
Attributes:
query: Original search query
results: List of SearchResult objects
symbols: List of Symbol objects (if include_symbols=True)
stats: SearchStats with execution metrics
"""
query: str
results: List[SearchResult]
symbols: List[Symbol]
stats: SearchStats
class ChainSearchEngine:
"""Parallel chain search engine for hierarchical directory indexes.
Searches across multiple directory indexes in parallel, following subdirectory
links to recursively traverse the file tree. Supports depth limits, result
aggregation, and both content and symbol searches.
Thread-safe with configurable parallelism.
Attributes:
registry: Global project registry
mapper: Path mapping utility
logger: Python logger instance
"""
def __init__(self,
registry: RegistryStore,
mapper: PathMapper,
max_workers: int = 8):
"""Initialize chain search engine.
Args:
registry: Global project registry for path lookups
mapper: Path mapper for source/index conversions
max_workers: Maximum parallel workers (default 8)
"""
self.registry = registry
self.mapper = mapper
self.logger = logging.getLogger(__name__)
self._max_workers = max_workers
self._executor: Optional[ThreadPoolExecutor] = None
def _get_executor(self, max_workers: Optional[int] = None) -> ThreadPoolExecutor:
"""Get or create the shared thread pool executor.
Lazy initialization to avoid creating executor if never used.
Args:
max_workers: Override default max_workers if specified
Returns:
ThreadPoolExecutor instance
"""
workers = max_workers or self._max_workers
if self._executor is None:
self._executor = ThreadPoolExecutor(max_workers=workers)
return self._executor
def close(self) -> None:
"""Shutdown the thread pool executor."""
if self._executor is not None:
self._executor.shutdown(wait=True)
self._executor = None
def __enter__(self) -> "ChainSearchEngine":
"""Context manager entry."""
return self
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
"""Context manager exit."""
self.close()
def search(self, query: str,
source_path: Path,
options: Optional[SearchOptions] = None) -> ChainSearchResult:
"""Execute chain search from source_path with recursive traversal.
Process:
1. Locate starting index for source_path
2. Collect all child indexes based on depth limit
3. Search indexes in parallel using ThreadPoolExecutor
4. Aggregate, deduplicate, and rank results
Args:
query: FTS5 search query string
source_path: Starting directory path
options: Search configuration (uses defaults if None)
Returns:
ChainSearchResult with results, symbols, and statistics
Examples:
>>> engine = ChainSearchEngine(registry, mapper)
>>> result = engine.search("authentication", Path("D:/project/src"))
>>> for r in result.results[:5]:
... print(f"{r.path}: {r.score:.2f}")
"""
options = options or SearchOptions()
start_time = time.time()
stats = SearchStats()
# Step 1: Find starting index
start_index = self._find_start_index(source_path)
if not start_index:
self.logger.warning(f"No index found for {source_path}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Step 2: Collect all index paths to search
index_paths = self._collect_index_paths(start_index, options.depth)
stats.dirs_searched = len(index_paths)
if not index_paths:
self.logger.warning(f"No indexes collected from {start_index}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Step 3: Parallel search
results, search_stats = self._search_parallel(
index_paths, query, options
)
stats.errors = search_stats.errors
# Step 4: Merge and rank
final_results = self._merge_and_rank(results, options.total_limit)
stats.files_matched = len(final_results)
# Optional: Symbol search
symbols = []
if options.include_symbols:
symbols = self._search_symbols_parallel(
index_paths, query, None, options.total_limit
)
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=final_results,
symbols=symbols,
stats=stats
)
def search_files_only(self, query: str,
source_path: Path,
options: Optional[SearchOptions] = None) -> List[str]:
"""Search and return only matching file paths.
Faster than full search when excerpts are not needed.
Args:
query: FTS5 search query string
source_path: Starting directory path
options: Search configuration (uses defaults if None)
Returns:
List of file paths as strings
Examples:
>>> engine = ChainSearchEngine(registry, mapper)
>>> paths = engine.search_files_only("TODO", Path("D:/project"))
>>> print(f"Found {len(paths)} files with TODOs")
"""
options = options or SearchOptions()
options.files_only = True
result = self.search(query, source_path, options)
return [r.path for r in result.results]
def search_symbols(self, name: str,
source_path: Path,
kind: Optional[str] = None,
options: Optional[SearchOptions] = None) -> List[Symbol]:
"""Chain symbol search across directory hierarchy.
Args:
name: Symbol name pattern (partial match supported)
source_path: Starting directory path
kind: Optional symbol kind filter (e.g., 'function', 'class')
options: Search configuration (uses defaults if None)
Returns:
List of Symbol objects sorted by name
Examples:
>>> engine = ChainSearchEngine(registry, mapper)
>>> funcs = engine.search_symbols("init", Path("D:/project"), kind="function")
>>> for sym in funcs[:10]:
... print(f"{sym.name} ({sym.kind}): lines {sym.range}")
"""
options = options or SearchOptions()
start_index = self._find_start_index(source_path)
if not start_index:
self.logger.warning(f"No index found for {source_path}")
return []
index_paths = self._collect_index_paths(start_index, options.depth)
if not index_paths:
return []
return self._search_symbols_parallel(
index_paths, name, kind, options.total_limit
)
# === Internal Methods ===
def _find_start_index(self, source_path: Path) -> Optional[Path]:
"""Find index database path for source directory.
Attempts exact match first, then searches for nearest ancestor index.
Args:
source_path: Source directory path
Returns:
Path to _index.db file, or None if not found
"""
source_path = source_path.resolve()
# Try exact match first
exact_index = self.mapper.source_to_index_db(source_path)
if exact_index.exists():
self.logger.debug(f"Found exact index: {exact_index}")
return exact_index
# Try nearest ancestor via registry
nearest = self.registry.find_nearest_index(source_path)
if nearest:
self.logger.debug(f"Found nearest index: {nearest.index_path}")
return nearest.index_path
self.logger.warning(f"No index found for {source_path}")
return None
def _collect_index_paths(self, start_index: Path,
depth: int) -> List[Path]:
"""Recursively collect all subdirectory index paths.
Traverses directory tree via subdirs table in each _index.db,
respecting depth limit.
Args:
start_index: Starting _index.db path
depth: Maximum depth (-1 = unlimited, 0 = current only)
Returns:
List of _index.db paths to search
"""
collected = []
visited = set()
def _collect_recursive(index_path: Path, current_depth: int):
# Normalize path to avoid duplicates
normalized = index_path.resolve()
if normalized in visited:
return
visited.add(normalized)
# Add current index
if normalized.exists():
collected.append(normalized)
else:
self.logger.debug(f"Index does not exist: {normalized}")
return
# Check depth limit
if depth >= 0 and current_depth >= depth:
return
# Read subdirs and recurse
try:
with DirIndexStore(normalized) as store:
subdirs = store.get_subdirs()
for subdir in subdirs:
_collect_recursive(subdir.index_path, current_depth + 1)
except Exception as exc:
self.logger.warning(f"Failed to read subdirs from {normalized}: {exc}")
_collect_recursive(start_index, 0)
self.logger.info(f"Collected {len(collected)} indexes (depth={depth})")
return collected
def _search_parallel(self, index_paths: List[Path],
query: str,
options: SearchOptions) -> tuple[List[SearchResult], SearchStats]:
"""Search multiple indexes in parallel using shared ThreadPoolExecutor.
Args:
index_paths: List of _index.db paths to search
query: FTS5 query string
options: Search configuration
Returns:
Tuple of (all results, search statistics)
"""
all_results = []
stats = SearchStats()
executor = self._get_executor(options.max_workers)
# Submit all search tasks
future_to_path = {
executor.submit(
self._search_single_index,
idx_path,
query,
options.limit_per_dir,
options.files_only
): idx_path
for idx_path in index_paths
}
# Collect results as they complete
for future in as_completed(future_to_path):
idx_path = future_to_path[future]
try:
results = future.result()
all_results.extend(results)
self.logger.debug(f"Got {len(results)} results from {idx_path.parent.name}")
except Exception as exc:
error_msg = f"Search failed for {idx_path}: {exc}"
self.logger.error(error_msg)
stats.errors.append(error_msg)
return all_results, stats
def _search_single_index(self, index_path: Path,
query: str,
limit: int,
files_only: bool = False) -> List[SearchResult]:
"""Search a single index database.
Handles exceptions gracefully, returning empty list on failure.
Args:
index_path: Path to _index.db file
query: FTS5 query string
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
Returns:
List of SearchResult objects (empty on error)
"""
try:
with DirIndexStore(index_path) as store:
if files_only:
# Fast path: return paths only without snippets
paths = store.search_files_only(query, limit=limit)
return [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
return store.search_fts(query, limit=limit)
except Exception as exc:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []
def _merge_and_rank(self, results: List[SearchResult],
limit: int) -> List[SearchResult]:
"""Aggregate, deduplicate, and rank results.
Process:
1. Deduplicate by path (keep highest score)
2. Sort by score descending
3. Limit to requested count
Args:
results: Raw results from all indexes
limit: Maximum results to return
Returns:
Deduplicated and ranked results
"""
# Deduplicate by path, keeping best score
path_to_result: Dict[str, SearchResult] = {}
for result in results:
path = result.path
if path not in path_to_result or result.score > path_to_result[path].score:
path_to_result[path] = result
# Sort by score descending
unique_results = list(path_to_result.values())
unique_results.sort(key=lambda r: r.score, reverse=True)
# Apply limit
return unique_results[:limit]
def _search_symbols_parallel(self, index_paths: List[Path],
name: str,
kind: Optional[str],
limit: int) -> List[Symbol]:
"""Search symbols across multiple indexes in parallel.
Args:
index_paths: List of _index.db paths to search
name: Symbol name pattern
kind: Optional symbol kind filter
limit: Total symbol limit
Returns:
Deduplicated and sorted symbols
"""
all_symbols = []
executor = self._get_executor()
# Submit all symbol search tasks
future_to_path = {
executor.submit(
self._search_symbols_single,
idx_path,
name,
kind
): idx_path
for idx_path in index_paths
}
# Collect results
for future in as_completed(future_to_path):
try:
symbols = future.result()
all_symbols.extend(symbols)
except Exception as exc:
self.logger.error(f"Symbol search failed: {exc}")
# Deduplicate by (name, kind, range)
seen = set()
unique_symbols = []
for sym in all_symbols:
key = (sym.name, sym.kind, sym.range)
if key not in seen:
seen.add(key)
unique_symbols.append(sym)
# Sort by name
unique_symbols.sort(key=lambda s: s.name)
return unique_symbols[:limit]
def _search_symbols_single(self, index_path: Path,
name: str,
kind: Optional[str]) -> List[Symbol]:
"""Search symbols in a single index.
Args:
index_path: Path to _index.db file
name: Symbol name pattern
kind: Optional symbol kind filter
Returns:
List of Symbol objects (empty on error)
"""
try:
with DirIndexStore(index_path) as store:
return store.search_symbols(name, kind=kind)
except Exception as exc:
self.logger.debug(f"Symbol search error in {index_path}: {exc}")
return []
# === Convenience Functions ===
def quick_search(query: str,
source_path: Path,
depth: int = -1) -> List[SearchResult]:
"""Quick search convenience function with automatic initialization.
Creates temporary registry and mapper instances for one-off searches.
For repeated searches, create a ChainSearchEngine instance directly.
Args:
query: FTS5 search query string
source_path: Starting directory path
depth: Maximum search depth (-1 = unlimited)
Returns:
List of SearchResult objects sorted by relevance
Examples:
>>> from pathlib import Path
>>> results = quick_search("authentication", Path("D:/project/src"))
>>> print(f"Found {len(results)} matches")
"""
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
engine = ChainSearchEngine(registry, mapper)
options = SearchOptions(depth=depth)
result = engine.search(query, source_path, options)
registry.close()
return result.results