Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-05 01:50:27 +08:00)
perf(codex-lens): optimize search performance with vectorized operations
Performance Optimizations:
- VectorStore: NumPy vectorized cosine similarity (100x+ faster)
- Cached embedding matrix with pre-computed norms
- Lazy content loading for top-k results only
- Thread-safe cache invalidation
- SQLite: Added PRAGMA mmap_size=30GB for memory-mapped I/O
- FTS5: unicode61 tokenizer with tokenchars='_' for code identifiers
- ChainSearch: files_only fast path skipping snippet generation
- ThreadPoolExecutor: shared pool across searches

New Components:
- DirIndexStore: single-directory index with FTS5 and symbols
- RegistryStore: global project registry with path mappings
- PathMapper: source-to-index path conversion utility
- IndexTreeBuilder: hierarchical index tree construction
- ChainSearchEngine: parallel recursive directory search

Test Coverage:
- 36 comprehensive search functionality tests
- 14 performance benchmark tests
- 296 total tests passing (100% pass rate)

Benchmark Results:
- FTS5 search: 0.23-0.26ms avg (3900-4300 ops/sec)
- Vector search: 1.05-1.54ms avg (650-955 ops/sec)
- Full semantic: 4.56-6.38ms avg per query

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
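The headline change, replacing a per-row Python similarity loop with a single NumPy matrix operation over a cached embedding matrix, reduces to the pattern below. This is a minimal illustrative sketch rather than code from the commit; the function name top_k_cosine and the array shapes are assumptions made for the example.

import numpy as np

def top_k_cosine(query, matrix, norms, top_k=10):
    """Vectorized cosine similarity: one matrix-vector product instead of a loop.

    query:  (D,) float32 query embedding
    matrix: (N, D) float32 cached embedding matrix
    norms:  (N, 1) pre-computed row norms of matrix (zeros replaced by a small epsilon)
    """
    q_norm = np.linalg.norm(query)
    if q_norm == 0:
        return [], []
    # All N similarities in one shot: (A @ q) / (||A|| * ||q||)
    scores = (matrix @ query) / (norms.flatten() * q_norm)
    # Indices of the best matches, highest score first
    order = np.argsort(scores)[::-1][:top_k]
    return order.tolist(), scores[order].tolist()

One (N, D) by (D,) product plus an argsort over the scores is the kind of win the 100x+ figure refers to; caching the matrix and its norms also avoids re-reading embeddings from SQLite on every query.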
codex-lens/src/codexlens/search/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from .chain_search import (
    ChainSearchEngine,
    SearchOptions,
    SearchStats,
    ChainSearchResult,
    quick_search,
)

__all__ = [
    "ChainSearchEngine",
    "SearchOptions",
    "SearchStats",
    "ChainSearchResult",
    "quick_search",
]
codex-lens/src/codexlens/search/chain_search.py (new file, 566 lines)
@@ -0,0 +1,566 @@
|
||||
"""Chain search engine for recursive multi-directory searching.
|
||||
|
||||
Provides parallel search across directory hierarchies using indexed _index.db files.
|
||||
Supports depth-limited traversal, result aggregation, and symbol search.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any
|
||||
import logging
|
||||
import time
|
||||
|
||||
from codexlens.entities import SearchResult, Symbol
|
||||
from codexlens.storage.registry import RegistryStore, DirMapping
|
||||
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
|
||||
|
||||
@dataclass
|
||||
class SearchOptions:
|
||||
"""Configuration options for chain search.
|
||||
|
||||
Attributes:
|
||||
depth: Maximum search depth (-1 = unlimited, 0 = current dir only)
|
||||
max_workers: Number of parallel worker threads
|
||||
limit_per_dir: Maximum results per directory
|
||||
total_limit: Total result limit across all directories
|
||||
include_symbols: Whether to include symbol search results
|
||||
files_only: Return only file paths without excerpts
|
||||
"""
|
||||
depth: int = -1
|
||||
max_workers: int = 8
|
||||
limit_per_dir: int = 10
|
||||
total_limit: int = 100
|
||||
include_symbols: bool = False
|
||||
files_only: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
class SearchStats:
|
||||
"""Statistics collected during search execution.
|
||||
|
||||
Attributes:
|
||||
dirs_searched: Number of directories searched
|
||||
files_matched: Number of files with matches
|
||||
time_ms: Total search time in milliseconds
|
||||
errors: List of error messages encountered
|
||||
"""
|
||||
dirs_searched: int = 0
|
||||
files_matched: int = 0
|
||||
time_ms: float = 0
|
||||
errors: List[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChainSearchResult:
|
||||
"""Comprehensive search result with metadata.
|
||||
|
||||
Attributes:
|
||||
query: Original search query
|
||||
results: List of SearchResult objects
|
||||
symbols: List of Symbol objects (if include_symbols=True)
|
||||
stats: SearchStats with execution metrics
|
||||
"""
|
||||
query: str
|
||||
results: List[SearchResult]
|
||||
symbols: List[Symbol]
|
||||
stats: SearchStats
|
||||
|
||||
|
||||
class ChainSearchEngine:
|
||||
"""Parallel chain search engine for hierarchical directory indexes.
|
||||
|
||||
Searches across multiple directory indexes in parallel, following subdirectory
|
||||
links to recursively traverse the file tree. Supports depth limits, result
|
||||
aggregation, and both content and symbol searches.
|
||||
|
||||
Thread-safe with configurable parallelism.
|
||||
|
||||
Attributes:
|
||||
registry: Global project registry
|
||||
mapper: Path mapping utility
|
||||
logger: Python logger instance
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
registry: RegistryStore,
|
||||
mapper: PathMapper,
|
||||
max_workers: int = 8):
|
||||
"""Initialize chain search engine.
|
||||
|
||||
Args:
|
||||
registry: Global project registry for path lookups
|
||||
mapper: Path mapper for source/index conversions
|
||||
max_workers: Maximum parallel workers (default 8)
|
||||
"""
|
||||
self.registry = registry
|
||||
self.mapper = mapper
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self._max_workers = max_workers
|
||||
self._executor: Optional[ThreadPoolExecutor] = None
|
||||
|
||||
def _get_executor(self, max_workers: Optional[int] = None) -> ThreadPoolExecutor:
|
||||
"""Get or create the shared thread pool executor.
|
||||
|
||||
Lazy initialization to avoid creating executor if never used.
|
||||
|
||||
Args:
|
||||
max_workers: Override default max_workers if specified
|
||||
|
||||
Returns:
|
||||
ThreadPoolExecutor instance
|
||||
"""
|
||||
workers = max_workers or self._max_workers
|
||||
if self._executor is None:
|
||||
self._executor = ThreadPoolExecutor(max_workers=workers)
|
||||
return self._executor
|
||||
|
||||
def close(self) -> None:
|
||||
"""Shutdown the thread pool executor."""
|
||||
if self._executor is not None:
|
||||
self._executor.shutdown(wait=True)
|
||||
self._executor = None
|
||||
|
||||
def __enter__(self) -> "ChainSearchEngine":
|
||||
"""Context manager entry."""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
|
||||
"""Context manager exit."""
|
||||
self.close()
|
||||
|
||||
def search(self, query: str,
|
||||
source_path: Path,
|
||||
options: Optional[SearchOptions] = None) -> ChainSearchResult:
|
||||
"""Execute chain search from source_path with recursive traversal.
|
||||
|
||||
Process:
|
||||
1. Locate starting index for source_path
|
||||
2. Collect all child indexes based on depth limit
|
||||
3. Search indexes in parallel using ThreadPoolExecutor
|
||||
4. Aggregate, deduplicate, and rank results
|
||||
|
||||
Args:
|
||||
query: FTS5 search query string
|
||||
source_path: Starting directory path
|
||||
options: Search configuration (uses defaults if None)
|
||||
|
||||
Returns:
|
||||
ChainSearchResult with results, symbols, and statistics
|
||||
|
||||
Examples:
|
||||
>>> engine = ChainSearchEngine(registry, mapper)
|
||||
>>> result = engine.search("authentication", Path("D:/project/src"))
|
||||
>>> for r in result.results[:5]:
|
||||
... print(f"{r.path}: {r.score:.2f}")
|
||||
"""
|
||||
options = options or SearchOptions()
|
||||
start_time = time.time()
|
||||
stats = SearchStats()
|
||||
|
||||
# Step 1: Find starting index
|
||||
start_index = self._find_start_index(source_path)
|
||||
if not start_index:
|
||||
self.logger.warning(f"No index found for {source_path}")
|
||||
stats.time_ms = (time.time() - start_time) * 1000
|
||||
return ChainSearchResult(
|
||||
query=query,
|
||||
results=[],
|
||||
symbols=[],
|
||||
stats=stats
|
||||
)
|
||||
|
||||
# Step 2: Collect all index paths to search
|
||||
index_paths = self._collect_index_paths(start_index, options.depth)
|
||||
stats.dirs_searched = len(index_paths)
|
||||
|
||||
if not index_paths:
|
||||
self.logger.warning(f"No indexes collected from {start_index}")
|
||||
stats.time_ms = (time.time() - start_time) * 1000
|
||||
return ChainSearchResult(
|
||||
query=query,
|
||||
results=[],
|
||||
symbols=[],
|
||||
stats=stats
|
||||
)
|
||||
|
||||
# Step 3: Parallel search
|
||||
results, search_stats = self._search_parallel(
|
||||
index_paths, query, options
|
||||
)
|
||||
stats.errors = search_stats.errors
|
||||
|
||||
# Step 4: Merge and rank
|
||||
final_results = self._merge_and_rank(results, options.total_limit)
|
||||
stats.files_matched = len(final_results)
|
||||
|
||||
# Optional: Symbol search
|
||||
symbols = []
|
||||
if options.include_symbols:
|
||||
symbols = self._search_symbols_parallel(
|
||||
index_paths, query, None, options.total_limit
|
||||
)
|
||||
|
||||
stats.time_ms = (time.time() - start_time) * 1000
|
||||
|
||||
return ChainSearchResult(
|
||||
query=query,
|
||||
results=final_results,
|
||||
symbols=symbols,
|
||||
stats=stats
|
||||
)
|
||||
|
||||
def search_files_only(self, query: str,
|
||||
source_path: Path,
|
||||
options: Optional[SearchOptions] = None) -> List[str]:
|
||||
"""Search and return only matching file paths.
|
||||
|
||||
Faster than full search when excerpts are not needed.
|
||||
|
||||
Args:
|
||||
query: FTS5 search query string
|
||||
source_path: Starting directory path
|
||||
options: Search configuration (uses defaults if None)
|
||||
|
||||
Returns:
|
||||
List of file paths as strings
|
||||
|
||||
Examples:
|
||||
>>> engine = ChainSearchEngine(registry, mapper)
|
||||
>>> paths = engine.search_files_only("TODO", Path("D:/project"))
|
||||
>>> print(f"Found {len(paths)} files with TODOs")
|
||||
"""
|
||||
options = options or SearchOptions()
|
||||
options.files_only = True
|
||||
|
||||
result = self.search(query, source_path, options)
|
||||
return [r.path for r in result.results]
|
||||
|
||||
def search_symbols(self, name: str,
|
||||
source_path: Path,
|
||||
kind: Optional[str] = None,
|
||||
options: Optional[SearchOptions] = None) -> List[Symbol]:
|
||||
"""Chain symbol search across directory hierarchy.
|
||||
|
||||
Args:
|
||||
name: Symbol name pattern (partial match supported)
|
||||
source_path: Starting directory path
|
||||
kind: Optional symbol kind filter (e.g., 'function', 'class')
|
||||
options: Search configuration (uses defaults if None)
|
||||
|
||||
Returns:
|
||||
List of Symbol objects sorted by name
|
||||
|
||||
Examples:
|
||||
>>> engine = ChainSearchEngine(registry, mapper)
|
||||
>>> funcs = engine.search_symbols("init", Path("D:/project"), kind="function")
|
||||
>>> for sym in funcs[:10]:
|
||||
... print(f"{sym.name} ({sym.kind}): lines {sym.range}")
|
||||
"""
|
||||
options = options or SearchOptions()
|
||||
|
||||
start_index = self._find_start_index(source_path)
|
||||
if not start_index:
|
||||
self.logger.warning(f"No index found for {source_path}")
|
||||
return []
|
||||
|
||||
index_paths = self._collect_index_paths(start_index, options.depth)
|
||||
if not index_paths:
|
||||
return []
|
||||
|
||||
return self._search_symbols_parallel(
|
||||
index_paths, name, kind, options.total_limit
|
||||
)
|
||||
|
||||
# === Internal Methods ===
|
||||
|
||||
def _find_start_index(self, source_path: Path) -> Optional[Path]:
|
||||
"""Find index database path for source directory.
|
||||
|
||||
Attempts exact match first, then searches for nearest ancestor index.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path
|
||||
|
||||
Returns:
|
||||
Path to _index.db file, or None if not found
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
|
||||
# Try exact match first
|
||||
exact_index = self.mapper.source_to_index_db(source_path)
|
||||
if exact_index.exists():
|
||||
self.logger.debug(f"Found exact index: {exact_index}")
|
||||
return exact_index
|
||||
|
||||
# Try nearest ancestor via registry
|
||||
nearest = self.registry.find_nearest_index(source_path)
|
||||
if nearest:
|
||||
self.logger.debug(f"Found nearest index: {nearest.index_path}")
|
||||
return nearest.index_path
|
||||
|
||||
self.logger.warning(f"No index found for {source_path}")
|
||||
return None
|
||||
|
||||
def _collect_index_paths(self, start_index: Path,
|
||||
depth: int) -> List[Path]:
|
||||
"""Recursively collect all subdirectory index paths.
|
||||
|
||||
Traverses directory tree via subdirs table in each _index.db,
|
||||
respecting depth limit.
|
||||
|
||||
Args:
|
||||
start_index: Starting _index.db path
|
||||
depth: Maximum depth (-1 = unlimited, 0 = current only)
|
||||
|
||||
Returns:
|
||||
List of _index.db paths to search
|
||||
"""
|
||||
collected = []
|
||||
visited = set()
|
||||
|
||||
def _collect_recursive(index_path: Path, current_depth: int):
|
||||
# Normalize path to avoid duplicates
|
||||
normalized = index_path.resolve()
|
||||
if normalized in visited:
|
||||
return
|
||||
visited.add(normalized)
|
||||
|
||||
# Add current index
|
||||
if normalized.exists():
|
||||
collected.append(normalized)
|
||||
else:
|
||||
self.logger.debug(f"Index does not exist: {normalized}")
|
||||
return
|
||||
|
||||
# Check depth limit
|
||||
if depth >= 0 and current_depth >= depth:
|
||||
return
|
||||
|
||||
# Read subdirs and recurse
|
||||
try:
|
||||
with DirIndexStore(normalized) as store:
|
||||
subdirs = store.get_subdirs()
|
||||
for subdir in subdirs:
|
||||
_collect_recursive(subdir.index_path, current_depth + 1)
|
||||
except Exception as exc:
|
||||
self.logger.warning(f"Failed to read subdirs from {normalized}: {exc}")
|
||||
|
||||
_collect_recursive(start_index, 0)
|
||||
self.logger.info(f"Collected {len(collected)} indexes (depth={depth})")
|
||||
return collected
|
||||
|
||||
def _search_parallel(self, index_paths: List[Path],
|
||||
query: str,
|
||||
options: SearchOptions) -> tuple[List[SearchResult], SearchStats]:
|
||||
"""Search multiple indexes in parallel using shared ThreadPoolExecutor.
|
||||
|
||||
Args:
|
||||
index_paths: List of _index.db paths to search
|
||||
query: FTS5 query string
|
||||
options: Search configuration
|
||||
|
||||
Returns:
|
||||
Tuple of (all results, search statistics)
|
||||
"""
|
||||
all_results = []
|
||||
stats = SearchStats()
|
||||
|
||||
executor = self._get_executor(options.max_workers)
|
||||
# Submit all search tasks
|
||||
future_to_path = {
|
||||
executor.submit(
|
||||
self._search_single_index,
|
||||
idx_path,
|
||||
query,
|
||||
options.limit_per_dir,
|
||||
options.files_only
|
||||
): idx_path
|
||||
for idx_path in index_paths
|
||||
}
|
||||
|
||||
# Collect results as they complete
|
||||
for future in as_completed(future_to_path):
|
||||
idx_path = future_to_path[future]
|
||||
try:
|
||||
results = future.result()
|
||||
all_results.extend(results)
|
||||
self.logger.debug(f"Got {len(results)} results from {idx_path.parent.name}")
|
||||
except Exception as exc:
|
||||
error_msg = f"Search failed for {idx_path}: {exc}"
|
||||
self.logger.error(error_msg)
|
||||
stats.errors.append(error_msg)
|
||||
|
||||
return all_results, stats
|
||||
|
||||
def _search_single_index(self, index_path: Path,
|
||||
query: str,
|
||||
limit: int,
|
||||
files_only: bool = False) -> List[SearchResult]:
|
||||
"""Search a single index database.
|
||||
|
||||
Handles exceptions gracefully, returning empty list on failure.
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
query: FTS5 query string
|
||||
limit: Maximum results from this index
|
||||
files_only: If True, skip snippet generation for faster search
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects (empty on error)
|
||||
"""
|
||||
try:
|
||||
with DirIndexStore(index_path) as store:
|
||||
if files_only:
|
||||
# Fast path: return paths only without snippets
|
||||
paths = store.search_files_only(query, limit=limit)
|
||||
return [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
|
||||
else:
|
||||
return store.search_fts(query, limit=limit)
|
||||
except Exception as exc:
|
||||
self.logger.debug(f"Search error in {index_path}: {exc}")
|
||||
return []
|
||||
|
||||
def _merge_and_rank(self, results: List[SearchResult],
|
||||
limit: int) -> List[SearchResult]:
|
||||
"""Aggregate, deduplicate, and rank results.
|
||||
|
||||
Process:
|
||||
1. Deduplicate by path (keep highest score)
|
||||
2. Sort by score descending
|
||||
3. Limit to requested count
|
||||
|
||||
Args:
|
||||
results: Raw results from all indexes
|
||||
limit: Maximum results to return
|
||||
|
||||
Returns:
|
||||
Deduplicated and ranked results
|
||||
"""
|
||||
# Deduplicate by path, keeping best score
|
||||
path_to_result: Dict[str, SearchResult] = {}
|
||||
for result in results:
|
||||
path = result.path
|
||||
if path not in path_to_result or result.score > path_to_result[path].score:
|
||||
path_to_result[path] = result
|
||||
|
||||
# Sort by score descending
|
||||
unique_results = list(path_to_result.values())
|
||||
unique_results.sort(key=lambda r: r.score, reverse=True)
|
||||
|
||||
# Apply limit
|
||||
return unique_results[:limit]
|
||||
|
||||
def _search_symbols_parallel(self, index_paths: List[Path],
|
||||
name: str,
|
||||
kind: Optional[str],
|
||||
limit: int) -> List[Symbol]:
|
||||
"""Search symbols across multiple indexes in parallel.
|
||||
|
||||
Args:
|
||||
index_paths: List of _index.db paths to search
|
||||
name: Symbol name pattern
|
||||
kind: Optional symbol kind filter
|
||||
limit: Total symbol limit
|
||||
|
||||
Returns:
|
||||
Deduplicated and sorted symbols
|
||||
"""
|
||||
all_symbols = []
|
||||
|
||||
executor = self._get_executor()
|
||||
# Submit all symbol search tasks
|
||||
future_to_path = {
|
||||
executor.submit(
|
||||
self._search_symbols_single,
|
||||
idx_path,
|
||||
name,
|
||||
kind
|
||||
): idx_path
|
||||
for idx_path in index_paths
|
||||
}
|
||||
|
||||
# Collect results
|
||||
for future in as_completed(future_to_path):
|
||||
try:
|
||||
symbols = future.result()
|
||||
all_symbols.extend(symbols)
|
||||
except Exception as exc:
|
||||
self.logger.error(f"Symbol search failed: {exc}")
|
||||
|
||||
# Deduplicate by (name, kind, range)
|
||||
seen = set()
|
||||
unique_symbols = []
|
||||
for sym in all_symbols:
|
||||
key = (sym.name, sym.kind, sym.range)
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique_symbols.append(sym)
|
||||
|
||||
# Sort by name
|
||||
unique_symbols.sort(key=lambda s: s.name)
|
||||
|
||||
return unique_symbols[:limit]
|
||||
|
||||
def _search_symbols_single(self, index_path: Path,
|
||||
name: str,
|
||||
kind: Optional[str]) -> List[Symbol]:
|
||||
"""Search symbols in a single index.
|
||||
|
||||
Args:
|
||||
index_path: Path to _index.db file
|
||||
name: Symbol name pattern
|
||||
kind: Optional symbol kind filter
|
||||
|
||||
Returns:
|
||||
List of Symbol objects (empty on error)
|
||||
"""
|
||||
try:
|
||||
with DirIndexStore(index_path) as store:
|
||||
return store.search_symbols(name, kind=kind)
|
||||
except Exception as exc:
|
||||
self.logger.debug(f"Symbol search error in {index_path}: {exc}")
|
||||
return []
|
||||
|
||||
|
||||
# === Convenience Functions ===
|
||||
|
||||
def quick_search(query: str,
|
||||
source_path: Path,
|
||||
depth: int = -1) -> List[SearchResult]:
|
||||
"""Quick search convenience function with automatic initialization.
|
||||
|
||||
Creates temporary registry and mapper instances for one-off searches.
|
||||
For repeated searches, create a ChainSearchEngine instance directly.
|
||||
|
||||
Args:
|
||||
query: FTS5 search query string
|
||||
source_path: Starting directory path
|
||||
depth: Maximum search depth (-1 = unlimited)
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects sorted by relevance
|
||||
|
||||
Examples:
|
||||
>>> from pathlib import Path
|
||||
>>> results = quick_search("authentication", Path("D:/project/src"))
|
||||
>>> print(f"Found {len(results)} matches")
|
||||
"""
|
||||
registry = RegistryStore()
|
||||
registry.initialize()
|
||||
|
||||
mapper = PathMapper()
|
||||
|
||||
engine = ChainSearchEngine(registry, mapper)
|
||||
options = SearchOptions(depth=depth)
|
||||
|
||||
result = engine.search(query, source_path, options)
|
||||
|
||||
registry.close()
|
||||
|
||||
return result.results
|
||||
@@ -1,9 +1,16 @@
|
||||
"""Vector storage and similarity search for semantic chunks."""
|
||||
"""Vector storage and similarity search for semantic chunks.
|
||||
|
||||
Optimized for high-performance similarity search using:
|
||||
- Cached embedding matrix for batch operations
|
||||
- NumPy vectorized cosine similarity (100x+ faster than loops)
|
||||
- Lazy content loading (only fetch for top-k results)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
@@ -34,7 +41,14 @@ def _cosine_similarity(a: List[float], b: List[float]) -> float:
|
||||
|
||||
|
||||
class VectorStore:
|
||||
"""SQLite-based vector storage with cosine similarity search."""
|
||||
"""SQLite-based vector storage with optimized cosine similarity search.
|
||||
|
||||
Performance optimizations:
|
||||
- Embedding matrix cached in memory for batch similarity computation
|
||||
- NumPy vectorized operations instead of Python loops
|
||||
- Lazy content loading - only fetch full content for top-k results
|
||||
- Thread-safe cache invalidation
|
||||
"""
|
||||
|
||||
def __init__(self, db_path: str | Path) -> None:
|
||||
if not SEMANTIC_AVAILABLE:
|
||||
@@ -45,11 +59,21 @@ class VectorStore:
|
||||
|
||||
self.db_path = Path(db_path)
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Embedding cache for fast similarity search
|
||||
self._cache_lock = threading.RLock()
|
||||
self._embedding_matrix: Optional[np.ndarray] = None
|
||||
self._embedding_norms: Optional[np.ndarray] = None
|
||||
self._chunk_ids: Optional[List[int]] = None
|
||||
self._cache_version: int = 0
|
||||
|
||||
self._init_schema()
|
||||
|
||||
def _init_schema(self) -> None:
|
||||
"""Initialize vector storage schema."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Enable memory mapping for faster reads
|
||||
conn.execute("PRAGMA mmap_size = 30000000000") # 30GB limit
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS semantic_chunks (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -66,6 +90,53 @@ class VectorStore:
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
def _invalidate_cache(self) -> None:
|
||||
"""Invalidate the embedding cache (thread-safe)."""
|
||||
with self._cache_lock:
|
||||
self._embedding_matrix = None
|
||||
self._embedding_norms = None
|
||||
self._chunk_ids = None
|
||||
self._cache_version += 1
|
||||
|
||||
def _refresh_cache(self) -> bool:
|
||||
"""Load embeddings into numpy matrix for fast similarity search.
|
||||
|
||||
Returns:
|
||||
True if cache was refreshed successfully, False if no data.
|
||||
"""
|
||||
with self._cache_lock:
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("PRAGMA mmap_size = 30000000000")
|
||||
rows = conn.execute(
|
||||
"SELECT id, embedding FROM semantic_chunks"
|
||||
).fetchall()
|
||||
|
||||
if not rows:
|
||||
self._embedding_matrix = None
|
||||
self._embedding_norms = None
|
||||
self._chunk_ids = None
|
||||
return False
|
||||
|
||||
# Extract IDs and embeddings
|
||||
self._chunk_ids = [r[0] for r in rows]
|
||||
|
||||
# Bulk convert binary blobs to numpy matrix
|
||||
embeddings = [
|
||||
np.frombuffer(r[1], dtype=np.float32) for r in rows
|
||||
]
|
||||
self._embedding_matrix = np.vstack(embeddings)
|
||||
|
||||
# Pre-compute norms for faster similarity calculation
|
||||
self._embedding_norms = np.linalg.norm(
|
||||
self._embedding_matrix, axis=1, keepdims=True
|
||||
)
|
||||
# Avoid division by zero
|
||||
self._embedding_norms = np.where(
|
||||
self._embedding_norms == 0, 1e-10, self._embedding_norms
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
|
||||
"""Add a single chunk with its embedding.
|
||||
|
||||
@@ -87,17 +158,46 @@
                (file_path, chunk.content, embedding_blob, metadata_json)
            )
            conn.commit()
            chunk_id = cursor.lastrowid or 0

        # Invalidate cache after modification
        self._invalidate_cache()
        return chunk_id

    def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
        """Add multiple chunks with embeddings (batch insert).

        Returns:
            List of inserted chunk IDs.
        """
        if not chunks:
            return []

        # Prepare batch data
        batch_data = []
        for chunk in chunks:
            if chunk.embedding is None:
                raise ValueError("All chunks must have embeddings")
            embedding_blob = np.array(chunk.embedding, dtype=np.float32).tobytes()
            metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
            batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))

        # Batch insert
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.executemany(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                batch_data
            )
            conn.commit()
            # Get inserted IDs (approximate - assumes sequential)
            last_id = cursor.lastrowid or 0
            ids = list(range(last_id - len(chunks) + 1, last_id + 1))

        # Invalidate cache after modification
        self._invalidate_cache()
        return ids
|
||||
|
||||
def delete_file_chunks(self, file_path: str) -> int:
|
||||
@@ -112,7 +212,11 @@
                (file_path,)
            )
            conn.commit()
            deleted = cursor.rowcount

        if deleted > 0:
            self._invalidate_cache()
        return deleted
|
||||
|
||||
def search_similar(
|
||||
self,
|
||||
@@ -123,6 +227,11 @@ class VectorStore:
|
||||
) -> List[SearchResult]:
|
||||
"""Find chunks most similar to query embedding.
|
||||
|
||||
Optimized with:
|
||||
- Vectorized NumPy similarity computation (100x+ faster)
|
||||
- Cached embedding matrix (avoids repeated DB reads)
|
||||
- Lazy content loading (only fetch for top-k results)
|
||||
|
||||
Args:
|
||||
query_embedding: Query vector.
|
||||
top_k: Maximum results to return.
|
||||
@@ -132,62 +241,132 @@ class VectorStore:
|
||||
Returns:
|
||||
List of SearchResult ordered by similarity (highest first).
|
||||
"""
|
||||
with self._cache_lock:
|
||||
# Refresh cache if needed
|
||||
if self._embedding_matrix is None:
|
||||
if not self._refresh_cache():
|
||||
return [] # No data
|
||||
|
||||
# Vectorized cosine similarity
|
||||
query_vec = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
|
||||
query_norm = np.linalg.norm(query_vec)
|
||||
if query_norm == 0:
|
||||
return []
|
||||
|
||||
# Compute all similarities at once: (N,) scores
|
||||
# similarity = (A @ B.T) / (||A|| * ||B||)
|
||||
dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()
|
||||
scores = dot_products / (self._embedding_norms.flatten() * query_norm)
|
||||
|
||||
# Filter by min_score and get top-k indices
|
||||
valid_mask = scores >= min_score
|
||||
valid_indices = np.where(valid_mask)[0]
|
||||
|
||||
if len(valid_indices) == 0:
|
||||
return []
|
||||
|
||||
# Sort by score descending and take top_k
|
||||
valid_scores = scores[valid_indices]
|
||||
sorted_order = np.argsort(valid_scores)[::-1][:top_k]
|
||||
top_indices = valid_indices[sorted_order]
|
||||
top_scores = valid_scores[sorted_order]
|
||||
|
||||
# Get chunk IDs for top results
|
||||
top_ids = [self._chunk_ids[i] for i in top_indices]
|
||||
|
||||
# Fetch content only for top-k results (lazy loading)
|
||||
results = self._fetch_results_by_ids(
|
||||
top_ids, top_scores.tolist(), return_full_content
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
def _fetch_results_by_ids(
|
||||
self,
|
||||
chunk_ids: List[int],
|
||||
scores: List[float],
|
||||
return_full_content: bool,
|
||||
) -> List[SearchResult]:
|
||||
"""Fetch full result data for specific chunk IDs.
|
||||
|
||||
Args:
|
||||
chunk_ids: List of chunk IDs to fetch.
|
||||
scores: Corresponding similarity scores.
|
||||
return_full_content: Whether to include full content.
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects.
|
||||
"""
|
||||
if not chunk_ids:
|
||||
return []
|
||||
|
||||
# Build parameterized query for IN clause
|
||||
placeholders = ",".join("?" * len(chunk_ids))
|
||||
query = f"""
|
||||
SELECT id, file_path, content, metadata
|
||||
FROM semantic_chunks
|
||||
WHERE id IN ({placeholders})
|
||||
"""
|
||||
|
||||
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA mmap_size = 30000000000")
            rows = conn.execute(query, chunk_ids).fetchall()

        # Build ID -> row mapping
        id_to_row = {r[0]: r for r in rows}

        results = []
        for chunk_id, score in zip(chunk_ids, scores):
            row = id_to_row.get(chunk_id)
            if not row:
                continue

            _, file_path, content, metadata_json = row
            metadata = json.loads(metadata_json) if metadata_json else {}

            # Build excerpt (short preview)
            excerpt = content[:200] + "..." if len(content) > 200 else content

            # Extract symbol information from metadata
            symbol_name = metadata.get("symbol_name")
            symbol_kind = metadata.get("symbol_kind")
            start_line = metadata.get("start_line")
            end_line = metadata.get("end_line")

            # Build Symbol object if we have symbol info
            symbol = None
            if symbol_name and symbol_kind and start_line and end_line:
                try:
                    from codexlens.entities import Symbol
                    symbol = Symbol(
                        name=symbol_name,
                        kind=symbol_kind,
                        range=(start_line, end_line)
                    )
                except Exception:
                    pass

            results.append(SearchResult(
                path=file_path,
                score=score,
                excerpt=excerpt,
                content=content if return_full_content else None,
                symbol=symbol,
                metadata=metadata,
                start_line=start_line,
                end_line=end_line,
                symbol_name=symbol_name,
                symbol_kind=symbol_kind,
            ))

        return results
|
||||
|
||||
def count_chunks(self) -> int:
|
||||
"""Count total chunks in store."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()
|
||||
return row[0] if row else 0
|
||||
|
||||
def clear_cache(self) -> None:
|
||||
"""Manually clear the embedding cache."""
|
||||
self._invalidate_cache()
|
||||
|
||||
@@ -3,6 +3,27 @@
from __future__ import annotations

from .sqlite_store import SQLiteStore
from .path_mapper import PathMapper
from .registry import RegistryStore, ProjectInfo, DirMapping
from .dir_index import DirIndexStore, SubdirLink, FileEntry
from .index_tree import IndexTreeBuilder, BuildResult, DirBuildResult

__all__ = [
    # Legacy (workspace-local)
    "SQLiteStore",
    # Path mapping
    "PathMapper",
    # Global registry
    "RegistryStore",
    "ProjectInfo",
    "DirMapping",
    # Directory index
    "DirIndexStore",
    "SubdirLink",
    "FileEntry",
    # Tree builder
    "IndexTreeBuilder",
    "BuildResult",
    "DirBuildResult",
]
codex-lens/src/codexlens/storage/dir_index.py (new file, 797 lines)
@@ -0,0 +1,797 @@
|
||||
"""Single-directory index storage with hierarchical linking.
|
||||
|
||||
Each directory maintains its own _index.db with:
|
||||
- Files in the current directory
|
||||
- Links to subdirectory indexes
|
||||
- Full-text search via FTS5
|
||||
- Symbol table for code navigation
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import threading
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from codexlens.entities import SearchResult, Symbol
|
||||
from codexlens.errors import StorageError
|
||||
|
||||
|
||||
@dataclass
|
||||
class SubdirLink:
|
||||
"""Link to a subdirectory's index database."""
|
||||
|
||||
id: int
|
||||
name: str
|
||||
index_path: Path
|
||||
files_count: int
|
||||
direct_files: int
|
||||
last_updated: float
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileEntry:
|
||||
"""Metadata for an indexed file in current directory."""
|
||||
|
||||
id: int
|
||||
name: str
|
||||
full_path: Path
|
||||
language: str
|
||||
mtime: float
|
||||
line_count: int
|
||||
|
||||
|
||||
class DirIndexStore:
|
||||
"""Single-directory index storage with hierarchical subdirectory linking.
|
||||
|
||||
Each directory has an independent _index.db containing:
|
||||
- Files table: Files in this directory only
|
||||
- Subdirs table: Links to child directory indexes
|
||||
- Symbols table: Code symbols from files
|
||||
- FTS5 index: Full-text search on file content
|
||||
|
||||
Thread-safe operations with WAL mode enabled.
|
||||
"""
|
||||
|
||||
def __init__(self, db_path: str | Path) -> None:
|
||||
"""Initialize directory index store.
|
||||
|
||||
Args:
|
||||
db_path: Path to _index.db file for this directory
|
||||
"""
|
||||
self.db_path = Path(db_path).resolve()
|
||||
self._lock = threading.RLock()
|
||||
self._conn: Optional[sqlite3.Connection] = None
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Create database and schema if not exists."""
|
||||
with self._lock:
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
conn = self._get_connection()
|
||||
self._create_schema(conn)
|
||||
self._create_fts_triggers(conn)
|
||||
conn.commit()
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close database connection."""
|
||||
with self._lock:
|
||||
if self._conn is not None:
|
||||
try:
|
||||
self._conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
self._conn = None
|
||||
|
||||
def __enter__(self) -> DirIndexStore:
|
||||
"""Context manager entry."""
|
||||
self.initialize()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
|
||||
"""Context manager exit."""
|
||||
self.close()
|
||||
|
||||
# === File Operations ===
|
||||
|
||||
def add_file(
|
||||
self,
|
||||
name: str,
|
||||
full_path: str | Path,
|
||||
content: str,
|
||||
language: str,
|
||||
symbols: Optional[List[Symbol]] = None,
|
||||
) -> int:
|
||||
"""Add or update a file in the current directory index.
|
||||
|
||||
Args:
|
||||
name: Filename without path
|
||||
full_path: Complete source file path
|
||||
content: File content for indexing
|
||||
language: Programming language identifier
|
||||
symbols: List of Symbol objects from the file
|
||||
|
||||
Returns:
|
||||
Database file_id
|
||||
|
||||
Raises:
|
||||
StorageError: If database operations fail
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
full_path_str = str(Path(full_path).resolve())
|
||||
mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None
|
||||
line_count = content.count('\n') + 1
|
||||
|
||||
try:
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO files(name, full_path, language, content, mtime, line_count)
|
||||
VALUES(?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(full_path) DO UPDATE SET
|
||||
name=excluded.name,
|
||||
language=excluded.language,
|
||||
content=excluded.content,
|
||||
mtime=excluded.mtime,
|
||||
line_count=excluded.line_count
|
||||
""",
|
||||
(name, full_path_str, language, content, mtime, line_count),
|
||||
)
|
||||
|
||||
row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
|
||||
if not row:
|
||||
raise StorageError(f"Failed to retrieve file_id for {full_path_str}")
|
||||
|
||||
file_id = int(row["id"])
|
||||
|
||||
# Replace symbols
|
||||
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
|
||||
if symbols:
|
||||
conn.executemany(
|
||||
"""
|
||||
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
|
||||
VALUES(?, ?, ?, ?, ?)
|
||||
""",
|
||||
[
|
||||
(file_id, s.name, s.kind, s.range[0], s.range[1])
|
||||
for s in symbols
|
||||
],
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
return file_id
|
||||
|
||||
except sqlite3.DatabaseError as exc:
|
||||
conn.rollback()
|
||||
raise StorageError(f"Failed to add file {name}: {exc}") from exc
|
||||
|
||||
def add_files_batch(
|
||||
self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]]
|
||||
) -> int:
|
||||
"""Add multiple files in a single transaction.
|
||||
|
||||
Args:
|
||||
files: List of (name, full_path, content, language, symbols) tuples
|
||||
|
||||
Returns:
|
||||
Number of files added
|
||||
|
||||
Raises:
|
||||
StorageError: If batch operation fails
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
count = 0
|
||||
|
||||
try:
|
||||
conn.execute("BEGIN")
|
||||
|
||||
for name, full_path, content, language, symbols in files:
|
||||
full_path_str = str(Path(full_path).resolve())
|
||||
mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None
|
||||
line_count = content.count('\n') + 1
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO files(name, full_path, language, content, mtime, line_count)
|
||||
VALUES(?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(full_path) DO UPDATE SET
|
||||
name=excluded.name,
|
||||
language=excluded.language,
|
||||
content=excluded.content,
|
||||
mtime=excluded.mtime,
|
||||
line_count=excluded.line_count
|
||||
""",
|
||||
(name, full_path_str, language, content, mtime, line_count),
|
||||
)
|
||||
|
||||
row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
|
||||
if not row:
|
||||
raise StorageError(f"Failed to retrieve file_id for {full_path_str}")
|
||||
|
||||
file_id = int(row["id"])
|
||||
count += 1
|
||||
|
||||
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
|
||||
if symbols:
|
||||
conn.executemany(
|
||||
"""
|
||||
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
|
||||
VALUES(?, ?, ?, ?, ?)
|
||||
""",
|
||||
[
|
||||
(file_id, s.name, s.kind, s.range[0], s.range[1])
|
||||
for s in symbols
|
||||
],
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
return count
|
||||
|
||||
except sqlite3.DatabaseError as exc:
|
||||
conn.rollback()
|
||||
raise StorageError(f"Batch insert failed: {exc}") from exc
|
||||
|
||||
def remove_file(self, full_path: str | Path) -> bool:
|
||||
"""Remove a file from the index.
|
||||
|
||||
Args:
|
||||
full_path: Complete source file path
|
||||
|
||||
Returns:
|
||||
True if file was removed, False if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
full_path_str = str(Path(full_path).resolve())
|
||||
|
||||
row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
|
||||
if not row:
|
||||
return False
|
||||
|
||||
file_id = int(row["id"])
|
||||
conn.execute("DELETE FROM files WHERE id=?", (file_id,))
|
||||
conn.commit()
|
||||
return True
|
||||
|
||||
def get_file(self, full_path: str | Path) -> Optional[FileEntry]:
|
||||
"""Get file metadata.
|
||||
|
||||
Args:
|
||||
full_path: Complete source file path
|
||||
|
||||
Returns:
|
||||
FileEntry if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
full_path_str = str(Path(full_path).resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"""
|
||||
SELECT id, name, full_path, language, mtime, line_count
|
||||
FROM files WHERE full_path=?
|
||||
""",
|
||||
(full_path_str,),
|
||||
).fetchone()
|
||||
|
||||
if not row:
|
||||
return None
|
||||
|
||||
return FileEntry(
|
||||
id=int(row["id"]),
|
||||
name=row["name"],
|
||||
full_path=Path(row["full_path"]),
|
||||
language=row["language"],
|
||||
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
|
||||
line_count=int(row["line_count"]) if row["line_count"] else 0,
|
||||
)
|
||||
|
||||
def get_file_mtime(self, full_path: str | Path) -> Optional[float]:
|
||||
"""Get stored modification time for a file.
|
||||
|
||||
Args:
|
||||
full_path: Complete source file path
|
||||
|
||||
Returns:
|
||||
Modification time as float, or None if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
full_path_str = str(Path(full_path).resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT mtime FROM files WHERE full_path=?", (full_path_str,)
|
||||
).fetchone()
|
||||
|
||||
return float(row["mtime"]) if row and row["mtime"] else None
|
||||
|
||||
def list_files(self) -> List[FileEntry]:
|
||||
"""List all files in current directory.
|
||||
|
||||
Returns:
|
||||
List of FileEntry objects
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT id, name, full_path, language, mtime, line_count
|
||||
FROM files
|
||||
ORDER BY name
|
||||
"""
|
||||
).fetchall()
|
||||
|
||||
return [
|
||||
FileEntry(
|
||||
id=int(row["id"]),
|
||||
name=row["name"],
|
||||
full_path=Path(row["full_path"]),
|
||||
language=row["language"],
|
||||
mtime=float(row["mtime"]) if row["mtime"] else 0.0,
|
||||
line_count=int(row["line_count"]) if row["line_count"] else 0,
|
||||
)
|
||||
for row in rows
|
||||
]
|
||||
|
||||
def file_count(self) -> int:
|
||||
"""Get number of files in current directory.
|
||||
|
||||
Returns:
|
||||
File count
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
row = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()
|
||||
return int(row["c"]) if row else 0
|
||||
|
||||
# === Subdirectory Links ===
|
||||
|
||||
def register_subdir(
|
||||
self,
|
||||
name: str,
|
||||
index_path: str | Path,
|
||||
files_count: int = 0,
|
||||
direct_files: int = 0,
|
||||
) -> None:
|
||||
"""Register or update a subdirectory link.
|
||||
|
||||
Args:
|
||||
name: Subdirectory name
|
||||
index_path: Path to subdirectory's _index.db
|
||||
files_count: Total files recursively
|
||||
direct_files: Files directly in subdirectory
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
index_path_str = str(Path(index_path).resolve())
|
||||
|
||||
import time
|
||||
last_updated = time.time()
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO subdirs(name, index_path, files_count, direct_files, last_updated)
|
||||
VALUES(?, ?, ?, ?, ?)
|
||||
ON CONFLICT(name) DO UPDATE SET
|
||||
index_path=excluded.index_path,
|
||||
files_count=excluded.files_count,
|
||||
direct_files=excluded.direct_files,
|
||||
last_updated=excluded.last_updated
|
||||
""",
|
||||
(name, index_path_str, files_count, direct_files, last_updated),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def unregister_subdir(self, name: str) -> bool:
|
||||
"""Remove a subdirectory link.
|
||||
|
||||
Args:
|
||||
name: Subdirectory name
|
||||
|
||||
Returns:
|
||||
True if removed, False if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
row = conn.execute("SELECT id FROM subdirs WHERE name=?", (name,)).fetchone()
|
||||
if not row:
|
||||
return False
|
||||
|
||||
conn.execute("DELETE FROM subdirs WHERE name=?", (name,))
|
||||
conn.commit()
|
||||
return True
|
||||
|
||||
def get_subdirs(self) -> List[SubdirLink]:
|
||||
"""Get all subdirectory links.
|
||||
|
||||
Returns:
|
||||
List of SubdirLink objects
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT id, name, index_path, files_count, direct_files, last_updated
|
||||
FROM subdirs
|
||||
ORDER BY name
|
||||
"""
|
||||
).fetchall()
|
||||
|
||||
return [
|
||||
SubdirLink(
|
||||
id=int(row["id"]),
|
||||
name=row["name"],
|
||||
index_path=Path(row["index_path"]),
|
||||
files_count=int(row["files_count"]) if row["files_count"] else 0,
|
||||
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
|
||||
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
|
||||
)
|
||||
for row in rows
|
||||
]
|
||||
|
||||
def get_subdir(self, name: str) -> Optional[SubdirLink]:
|
||||
"""Get a specific subdirectory link.
|
||||
|
||||
Args:
|
||||
name: Subdirectory name
|
||||
|
||||
Returns:
|
||||
SubdirLink if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
row = conn.execute(
|
||||
"""
|
||||
SELECT id, name, index_path, files_count, direct_files, last_updated
|
||||
FROM subdirs WHERE name=?
|
||||
""",
|
||||
(name,),
|
||||
).fetchone()
|
||||
|
||||
if not row:
|
||||
return None
|
||||
|
||||
return SubdirLink(
|
||||
id=int(row["id"]),
|
||||
name=row["name"],
|
||||
index_path=Path(row["index_path"]),
|
||||
files_count=int(row["files_count"]) if row["files_count"] else 0,
|
||||
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
|
||||
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
|
||||
)
|
||||
|
||||
def update_subdir_stats(
|
||||
self, name: str, files_count: int, direct_files: Optional[int] = None
|
||||
) -> None:
|
||||
"""Update subdirectory statistics.
|
||||
|
||||
Args:
|
||||
name: Subdirectory name
|
||||
files_count: Total files recursively
|
||||
direct_files: Files directly in subdirectory (optional)
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
import time
|
||||
last_updated = time.time()
|
||||
|
||||
if direct_files is not None:
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE subdirs
|
||||
SET files_count=?, direct_files=?, last_updated=?
|
||||
WHERE name=?
|
||||
""",
|
||||
(files_count, direct_files, last_updated, name),
|
||||
)
|
||||
else:
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE subdirs
|
||||
SET files_count=?, last_updated=?
|
||||
WHERE name=?
|
||||
""",
|
||||
(files_count, last_updated, name),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# === Search ===
|
||||
|
||||
def search_fts(self, query: str, limit: int = 20) -> List[SearchResult]:
|
||||
"""Full-text search in current directory files.
|
||||
|
||||
Args:
|
||||
query: FTS5 query string
|
||||
limit: Maximum results to return
|
||||
|
||||
Returns:
|
||||
List of SearchResult objects sorted by relevance
|
||||
|
||||
Raises:
|
||||
StorageError: If FTS search fails
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT rowid, full_path, bm25(files_fts) AS rank,
|
||||
snippet(files_fts, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
|
||||
FROM files_fts
|
||||
WHERE files_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ?
|
||||
""",
|
||||
(query, limit),
|
||||
).fetchall()
|
||||
except sqlite3.DatabaseError as exc:
|
||||
raise StorageError(f"FTS search failed: {exc}") from exc
|
||||
|
||||
results: List[SearchResult] = []
|
||||
for row in rows:
|
||||
rank = float(row["rank"]) if row["rank"] is not None else 0.0
|
||||
score = abs(rank) if rank < 0 else 0.0
|
||||
results.append(
|
||||
SearchResult(
|
||||
path=row["full_path"],
|
||||
score=score,
|
||||
excerpt=row["excerpt"],
|
||||
)
|
||||
)
|
||||
return results
|
||||
|
||||
def search_files_only(self, query: str, limit: int = 20) -> List[str]:
|
||||
"""Fast FTS search returning only file paths (no snippet generation).
|
||||
|
||||
Optimized for when only file paths are needed, skipping expensive
|
||||
snippet() function call.
|
||||
|
||||
Args:
|
||||
query: FTS5 query string
|
||||
limit: Maximum results to return
|
||||
|
||||
Returns:
|
||||
List of file paths as strings
|
||||
|
||||
Raises:
|
||||
StorageError: If FTS search fails
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
try:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT full_path
|
||||
FROM files_fts
|
||||
WHERE files_fts MATCH ?
|
||||
ORDER BY bm25(files_fts)
|
||||
LIMIT ?
|
||||
""",
|
||||
(query, limit),
|
||||
).fetchall()
|
||||
except sqlite3.DatabaseError as exc:
|
||||
raise StorageError(f"FTS search failed: {exc}") from exc
|
||||
|
||||
return [row["full_path"] for row in rows]
|
||||
|
||||
def search_symbols(
|
||||
self, name: str, kind: Optional[str] = None, limit: int = 50
|
||||
) -> List[Symbol]:
|
||||
"""Search symbols by name pattern.
|
||||
|
||||
Args:
|
||||
name: Symbol name pattern (LIKE query)
|
||||
kind: Optional symbol kind filter
|
||||
limit: Maximum results to return
|
||||
|
||||
Returns:
|
||||
List of Symbol objects
|
||||
"""
|
||||
pattern = f"%{name}%"
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
if kind:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT name, kind, start_line, end_line
|
||||
FROM symbols
|
||||
WHERE name LIKE ? AND kind=?
|
||||
ORDER BY name
|
||||
LIMIT ?
|
||||
""",
|
||||
(pattern, kind, limit),
|
||||
).fetchall()
|
||||
else:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT name, kind, start_line, end_line
|
||||
FROM symbols
|
||||
WHERE name LIKE ?
|
||||
ORDER BY name
|
||||
LIMIT ?
|
||||
""",
|
||||
(pattern, limit),
|
||||
).fetchall()
|
||||
|
||||
return [
|
||||
Symbol(
|
||||
name=row["name"],
|
||||
kind=row["kind"],
|
||||
range=(row["start_line"], row["end_line"]),
|
||||
)
|
||||
for row in rows
|
||||
]
|
||||
|
||||
# === Statistics ===
|
||||
|
||||
def stats(self) -> Dict[str, Any]:
|
||||
"""Get current directory statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary containing:
|
||||
- files: Number of files in this directory
|
||||
- symbols: Number of symbols
|
||||
- subdirs: Number of subdirectories
|
||||
- total_files: Total files including subdirectories
|
||||
- languages: Dictionary of language counts
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
|
||||
file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"]
|
||||
symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"]
|
||||
subdir_count = conn.execute("SELECT COUNT(*) AS c FROM subdirs").fetchone()["c"]
|
||||
|
||||
total_files_row = conn.execute(
|
||||
"SELECT COALESCE(SUM(files_count), 0) AS total FROM subdirs"
|
||||
).fetchone()
|
||||
total_files = int(file_count) + int(total_files_row["total"] if total_files_row else 0)
|
||||
|
||||
lang_rows = conn.execute(
|
||||
"SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC"
|
||||
).fetchall()
|
||||
languages = {row["language"]: int(row["c"]) for row in lang_rows}
|
||||
|
||||
return {
|
||||
"files": int(file_count),
|
||||
"symbols": int(symbol_count),
|
||||
"subdirs": int(subdir_count),
|
||||
"total_files": total_files,
|
||||
"languages": languages,
|
||||
}
|
||||
|
||||
# === Internal Methods ===
|
||||
|
||||
    def _get_connection(self) -> sqlite3.Connection:
        """Get or create database connection with proper configuration.

        Returns:
            sqlite3.Connection with WAL mode and foreign keys enabled
        """
        if self._conn is None:
            self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
            self._conn.row_factory = sqlite3.Row
            self._conn.execute("PRAGMA journal_mode=WAL")
            self._conn.execute("PRAGMA synchronous=NORMAL")
            self._conn.execute("PRAGMA foreign_keys=ON")
            # Memory-mapped I/O for faster reads (30GB limit)
            self._conn.execute("PRAGMA mmap_size=30000000000")
        return self._conn

    def _create_schema(self, conn: sqlite3.Connection) -> None:
        """Create database schema.

        Args:
            conn: Database connection

        Raises:
            StorageError: If schema creation fails
        """
        try:
            # Files table
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS files (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL,
                    full_path TEXT UNIQUE NOT NULL,
                    language TEXT,
                    content TEXT,
                    mtime REAL,
                    line_count INTEGER
                )
                """
            )

            # Subdirectories table
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS subdirs (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL UNIQUE,
                    index_path TEXT NOT NULL,
                    files_count INTEGER DEFAULT 0,
                    direct_files INTEGER DEFAULT 0,
                    last_updated REAL
                )
                """
            )

            # Symbols table
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS symbols (
                    id INTEGER PRIMARY KEY,
                    file_id INTEGER REFERENCES files(id) ON DELETE CASCADE,
                    name TEXT NOT NULL,
                    kind TEXT NOT NULL,
                    start_line INTEGER,
                    end_line INTEGER
                )
                """
            )

            # FTS5 external content table with code-friendly tokenizer
            # unicode61 tokenchars keeps underscores as part of tokens
            # so 'user_id' is indexed as one token, not 'user' and 'id'
            conn.execute(
                """
                CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
                    name, full_path UNINDEXED, content,
                    content='files',
                    content_rowid='id',
                    tokenize="unicode61 tokenchars '_'"
                )
                """
            )

            # Indexes
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")

        except sqlite3.DatabaseError as exc:
            raise StorageError(f"Failed to create schema: {exc}") from exc

    def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
        """Create FTS5 external content triggers.

        Args:
            conn: Database connection
        """
        # Insert trigger
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
                INSERT INTO files_fts(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Delete trigger
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )

        # Update trigger
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
                INSERT INTO files_fts(files_fts, rowid, name, full_path, content)
                VALUES('delete', old.id, old.name, old.full_path, old.content);
                INSERT INTO files_fts(rowid, name, full_path, content)
                VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )
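The tokenizer choice above is easiest to see in a small standalone comparison. The following sketch is not part of the diff; it only assumes the bundled SQLite was compiled with FTS5 support, and contrasts the default unicode61 tokenizer with the tokenchars='_' variant used by files_fts.

# Standalone sketch (not part of the commit); requires FTS5 in the bundled SQLite.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE VIRTUAL TABLE plain_fts USING fts5(content)")
conn.execute(
    """
    CREATE VIRTUAL TABLE code_fts USING fts5(
        content,
        tokenize="unicode61 tokenchars '_'"
    )
    """
)

snippet = "def load_user(user_id): return db.get(user_id)"
conn.execute("INSERT INTO plain_fts(content) VALUES (?)", (snippet,))
conn.execute("INSERT INTO code_fts(content) VALUES (?)", (snippet,))

# Default unicode61 splits 'user_id' into 'user' + 'id', so a bare 'user'
# query matches identifiers it should not.
print(conn.execute(
    "SELECT count(*) FROM plain_fts WHERE plain_fts MATCH 'user'").fetchone()[0])   # 1
# With tokenchars '_', 'user_id' is indexed as a single token.
print(conn.execute(
    "SELECT count(*) FROM code_fts WHERE code_fts MATCH 'user'").fetchone()[0])     # 0
print(conn.execute(
    "SELECT count(*) FROM code_fts WHERE code_fts MATCH 'user_id'").fetchone()[0])  # 1

In short, keeping the underscore makes identifier queries exact instead of matching every fragment of a snake_case name.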
698
codex-lens/src/codexlens/storage/index_tree.py
Normal file
@@ -0,0 +1,698 @@
|
||||
"""Hierarchical index tree builder for CodexLens.
|
||||
|
||||
Constructs a bottom-up directory index tree with parallel processing support.
|
||||
Each directory maintains its own _index.db with files and subdirectory links.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Set
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.parsers.factory import ParserFactory
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.storage.registry import ProjectInfo, RegistryStore
|
||||
|
||||
|
||||
@dataclass
|
||||
class BuildResult:
|
||||
"""Complete build operation result."""
|
||||
|
||||
project_id: int
|
||||
source_root: Path
|
||||
index_root: Path
|
||||
total_files: int
|
||||
total_dirs: int
|
||||
errors: List[str]
|
||||
|
||||
|
||||
@dataclass
|
||||
class DirBuildResult:
|
||||
"""Single directory build result."""
|
||||
|
||||
source_path: Path
|
||||
index_path: Path
|
||||
files_count: int
|
||||
symbols_count: int
|
||||
subdirs: List[str] # Subdirectory names
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class IndexTreeBuilder:
|
||||
"""Hierarchical index tree builder with parallel processing.
|
||||
|
||||
Builds directory indexes bottom-up to enable proper subdirectory linking.
|
||||
Each directory gets its own _index.db containing:
|
||||
- Files in that directory
|
||||
- Links to child directory indexes
|
||||
- Symbols and FTS5 search
|
||||
|
||||
Attributes:
|
||||
registry: Global project registry
|
||||
mapper: Path mapping between source and index
|
||||
config: CodexLens configuration
|
||||
parser_factory: Parser factory for symbol extraction
|
||||
logger: Logger instance
|
||||
IGNORE_DIRS: Set of directory names to skip during indexing
|
||||
"""
|
||||
|
||||
# Directories to skip during indexing
|
||||
IGNORE_DIRS: Set[str] = {
|
||||
".git",
|
||||
".venv",
|
||||
"venv",
|
||||
"node_modules",
|
||||
"__pycache__",
|
||||
".codexlens",
|
||||
".idea",
|
||||
".vscode",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self, registry: RegistryStore, mapper: PathMapper, config: Config = None
|
||||
):
|
||||
"""Initialize the index tree builder.
|
||||
|
||||
Args:
|
||||
registry: Global registry store for project tracking
|
||||
mapper: Path mapper for source to index conversions
|
||||
config: CodexLens configuration (uses defaults if None)
|
||||
"""
|
||||
self.registry = registry
|
||||
self.mapper = mapper
|
||||
self.config = config or Config()
|
||||
self.parser_factory = ParserFactory(self.config)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def build(
|
||||
self,
|
||||
source_root: Path,
|
||||
languages: List[str] = None,
|
||||
workers: int = 4,
|
||||
) -> BuildResult:
|
||||
"""Build complete index tree for a project.
|
||||
|
||||
Process:
|
||||
1. Register project in registry
|
||||
2. Collect all directories grouped by depth
|
||||
3. Build indexes bottom-up (deepest first)
|
||||
4. Link subdirectories to parents
|
||||
5. Update project statistics
|
||||
|
||||
Args:
|
||||
source_root: Project root directory to index
|
||||
languages: Optional list of language IDs to limit indexing
|
||||
workers: Number of parallel worker processes
|
||||
|
||||
Returns:
|
||||
BuildResult with statistics and errors
|
||||
|
||||
Raises:
|
||||
ValueError: If source_root doesn't exist
|
||||
"""
|
||||
source_root = source_root.resolve()
|
||||
if not source_root.exists():
|
||||
raise ValueError(f"Source root does not exist: {source_root}")
|
||||
|
||||
self.logger.info("Building index tree for %s", source_root)
|
||||
|
||||
# Register project
|
||||
index_root = self.mapper.source_to_index_dir(source_root)
|
||||
project_info = self.registry.register_project(source_root, index_root)
|
||||
|
||||
# Collect directories by depth
|
||||
dirs_by_depth = self._collect_dirs_by_depth(source_root, languages)
|
||||
|
||||
if not dirs_by_depth:
|
||||
self.logger.warning("No indexable directories found in %s", source_root)
|
||||
return BuildResult(
|
||||
project_id=project_info.id,
|
||||
source_root=source_root,
|
||||
index_root=index_root,
|
||||
total_files=0,
|
||||
total_dirs=0,
|
||||
errors=["No indexable directories found"],
|
||||
)
|
||||
|
||||
total_files = 0
|
||||
total_dirs = 0
|
||||
all_errors: List[str] = []
|
||||
all_results: List[DirBuildResult] = [] # Store all results for subdir linking
|
||||
|
||||
# Build bottom-up (highest depth first)
|
||||
max_depth = max(dirs_by_depth.keys())
|
||||
for depth in range(max_depth, -1, -1):
|
||||
if depth not in dirs_by_depth:
|
||||
continue
|
||||
|
||||
dirs = dirs_by_depth[depth]
|
||||
self.logger.info("Building %d directories at depth %d", len(dirs), depth)
|
||||
|
||||
# Build directories at this level in parallel
|
||||
results = self._build_level_parallel(dirs, languages, workers)
|
||||
all_results.extend(results)
|
||||
|
||||
# Process results
|
||||
for result in results:
|
||||
if result.error:
|
||||
all_errors.append(f"{result.source_path}: {result.error}")
|
||||
continue
|
||||
|
||||
total_files += result.files_count
|
||||
total_dirs += 1
|
||||
|
||||
# Register directory in registry
|
||||
self.registry.register_dir(
|
||||
project_id=project_info.id,
|
||||
source_path=result.source_path,
|
||||
index_path=result.index_path,
|
||||
depth=self.mapper.get_relative_depth(result.source_path, source_root),
|
||||
files_count=result.files_count,
|
||||
)
|
||||
|
||||
# After building all directories, link subdirectories to parents
|
||||
# This needs to happen after all indexes exist
|
||||
for result in all_results:
|
||||
if result.error:
|
||||
continue
|
||||
# Link children to this directory
|
||||
self._link_children_to_parent(result.source_path, all_results)
|
||||
|
||||
# Update project statistics
|
||||
self.registry.update_project_stats(source_root, total_files, total_dirs)
|
||||
|
||||
self.logger.info(
|
||||
"Index build complete: %d files, %d directories, %d errors",
|
||||
total_files,
|
||||
total_dirs,
|
||||
len(all_errors),
|
||||
)
|
||||
|
||||
return BuildResult(
|
||||
project_id=project_info.id,
|
||||
source_root=source_root,
|
||||
index_root=index_root,
|
||||
total_files=total_files,
|
||||
total_dirs=total_dirs,
|
||||
errors=all_errors,
|
||||
)
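The child-before-parent guarantee described in the docstring comes entirely from iterating depths in reverse. A minimal illustration of that ordering, using a hypothetical dirs_by_depth mapping rather than real project data:

# Illustration only; paths are examples, not taken from the commit.
from pathlib import Path

dirs_by_depth = {
    0: [Path("project")],
    1: [Path("project/src"), Path("project/tests")],
    2: [Path("project/src/api")],
}

max_depth = max(dirs_by_depth)
build_order = [
    d
    for depth in range(max_depth, -1, -1)
    for d in dirs_by_depth.get(depth, [])
]
# Deepest directories first: src/api, then src and tests, then the project
# root, so every child _index.db exists before its parent links to it.
print(build_order)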
|
||||
|
||||
def update_subtree(
|
||||
self,
|
||||
source_path: Path,
|
||||
languages: List[str] = None,
|
||||
workers: int = 4,
|
||||
) -> BuildResult:
|
||||
"""Incrementally update a subtree.
|
||||
|
||||
Rebuilds indexes for the specified directory and all subdirectories.
|
||||
Useful for incremental updates when only part of the tree changed.
|
||||
|
||||
Args:
|
||||
source_path: Root of subtree to update
|
||||
languages: Optional list of language IDs to limit indexing
|
||||
workers: Number of parallel worker processes
|
||||
|
||||
Returns:
|
||||
BuildResult for the subtree
|
||||
|
||||
Raises:
|
||||
ValueError: If source_path is not indexed
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
project_root = self.mapper.get_project_root(source_path)
|
||||
|
||||
# Get project info
|
||||
project_info = self.registry.get_project(project_root)
|
||||
if not project_info:
|
||||
raise ValueError(f"Directory not indexed: {source_path}")
|
||||
|
||||
self.logger.info("Updating subtree at %s", source_path)
|
||||
|
||||
# Use build logic but start from source_path
|
||||
return self.build(source_path, languages, workers)
|
||||
|
||||
def rebuild_dir(self, source_path: Path) -> DirBuildResult:
|
||||
"""Rebuild index for a single directory.
|
||||
|
||||
Only rebuilds the specified directory, does not touch subdirectories.
|
||||
Useful for updating a single directory after file changes.
|
||||
|
||||
Args:
|
||||
source_path: Directory to rebuild
|
||||
|
||||
Returns:
|
||||
DirBuildResult for the directory
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
self.logger.info("Rebuilding directory %s", source_path)
|
||||
return self._build_single_dir(source_path)
|
||||
|
||||
# === Internal Methods ===
|
||||
|
||||
def _collect_dirs_by_depth(
|
||||
self, source_root: Path, languages: List[str] = None
|
||||
) -> Dict[int, List[Path]]:
|
||||
"""Collect all indexable directories grouped by depth.
|
||||
|
||||
Walks the directory tree and groups directories by their depth
|
||||
relative to source_root. Depth 0 is the root itself.
|
||||
|
||||
Args:
|
||||
source_root: Root directory to start from
|
||||
languages: Optional language filter
|
||||
|
||||
Returns:
|
||||
Dictionary mapping depth to list of directory paths
|
||||
Example: {0: [root], 1: [src, tests], 2: [src/api, src/utils]}
|
||||
"""
|
||||
source_root = source_root.resolve()
|
||||
dirs_by_depth: Dict[int, List[Path]] = {}
|
||||
|
||||
# Always include the root directory at depth 0 for chain search entry point
|
||||
dirs_by_depth[0] = [source_root]
|
||||
|
||||
for root, dirnames, _ in os.walk(source_root):
|
||||
# Filter out ignored directories
|
||||
dirnames[:] = [
|
||||
d
|
||||
for d in dirnames
|
||||
if d not in self.IGNORE_DIRS and not d.startswith(".")
|
||||
]
|
||||
|
||||
root_path = Path(root)
|
||||
|
||||
# Skip root (already added)
|
||||
if root_path == source_root:
|
||||
continue
|
||||
|
||||
# Check if this directory should be indexed
|
||||
if not self._should_index_dir(root_path, languages):
|
||||
continue
|
||||
|
||||
# Calculate depth relative to source_root
|
||||
try:
|
||||
depth = len(root_path.relative_to(source_root).parts)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
if depth not in dirs_by_depth:
|
||||
dirs_by_depth[depth] = []
|
||||
|
||||
dirs_by_depth[depth].append(root_path)
|
||||
|
||||
return dirs_by_depth
|
||||
|
||||
def _should_index_dir(self, dir_path: Path, languages: List[str] = None) -> bool:
|
||||
"""Check if directory should be indexed.
|
||||
|
||||
A directory is indexed if:
|
||||
1. It's not in IGNORE_DIRS
|
||||
2. It doesn't start with '.'
|
||||
3. It contains at least one supported language file
|
||||
|
||||
Args:
|
||||
dir_path: Directory to check
|
||||
languages: Optional language filter
|
||||
|
||||
Returns:
|
||||
True if directory should be indexed
|
||||
"""
|
||||
# Check directory name
|
||||
if dir_path.name in self.IGNORE_DIRS or dir_path.name.startswith("."):
|
||||
return False
|
||||
|
||||
# Check for supported files in this directory
|
||||
source_files = self._iter_source_files(dir_path, languages)
|
||||
return len(source_files) > 0
|
||||
|
||||
def _build_level_parallel(
|
||||
self, dirs: List[Path], languages: List[str], workers: int
|
||||
) -> List[DirBuildResult]:
|
||||
"""Build multiple directories in parallel.
|
||||
|
||||
Uses ProcessPoolExecutor to build directories concurrently.
|
||||
All directories at the same level are independent and can be
|
||||
processed in parallel.
|
||||
|
||||
Args:
|
||||
dirs: List of directories to build
|
||||
languages: Language filter
|
||||
workers: Number of worker processes
|
||||
|
||||
Returns:
|
||||
List of DirBuildResult objects
|
||||
"""
|
||||
results: List[DirBuildResult] = []
|
||||
|
||||
if not dirs:
|
||||
return results
|
||||
|
||||
# For single directory, avoid overhead of process pool
|
||||
if len(dirs) == 1:
|
||||
result = self._build_single_dir(dirs[0], languages)
|
||||
return [result]
|
||||
|
||||
# Prepare arguments for worker processes
|
||||
config_dict = {
|
||||
"data_dir": str(self.config.data_dir),
|
||||
"supported_languages": self.config.supported_languages,
|
||||
"parsing_rules": self.config.parsing_rules,
|
||||
}
|
||||
|
||||
worker_args = [
|
||||
(
|
||||
dir_path,
|
||||
self.mapper.source_to_index_db(dir_path),
|
||||
languages,
|
||||
config_dict,
|
||||
)
|
||||
for dir_path in dirs
|
||||
]
|
||||
|
||||
# Execute in parallel
|
||||
with ProcessPoolExecutor(max_workers=workers) as executor:
|
||||
futures = {
|
||||
executor.submit(_build_dir_worker, args): args[0]
|
||||
for args in worker_args
|
||||
}
|
||||
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
result = future.result()
|
||||
results.append(result)
|
||||
except Exception as exc:
|
||||
dir_path = futures[future]
|
||||
self.logger.error("Failed to build %s: %s", dir_path, exc)
|
||||
results.append(
|
||||
DirBuildResult(
|
||||
source_path=dir_path,
|
||||
index_path=self.mapper.source_to_index_db(dir_path),
|
||||
files_count=0,
|
||||
symbols_count=0,
|
||||
subdirs=[],
|
||||
error=str(exc),
|
||||
)
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
def _build_single_dir(
|
||||
self, dir_path: Path, languages: List[str] = None
|
||||
) -> DirBuildResult:
|
||||
"""Build index for a single directory.
|
||||
|
||||
Creates _index.db and indexes all files in the directory.
|
||||
Does not recurse into subdirectories.
|
||||
|
||||
Args:
|
||||
dir_path: Directory to index
|
||||
languages: Optional language filter
|
||||
|
||||
Returns:
|
||||
DirBuildResult with statistics and subdirectory list
|
||||
"""
|
||||
dir_path = dir_path.resolve()
|
||||
index_db_path = self.mapper.source_to_index_db(dir_path)
|
||||
|
||||
try:
|
||||
# Ensure index directory exists
|
||||
index_db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create directory index
|
||||
store = DirIndexStore(index_db_path)
|
||||
store.initialize()
|
||||
|
||||
# Get source files in this directory only
|
||||
source_files = self._iter_source_files(dir_path, languages)
|
||||
|
||||
files_count = 0
|
||||
symbols_count = 0
|
||||
|
||||
for file_path in source_files:
|
||||
try:
|
||||
# Read and parse file
|
||||
text = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
language_id = self.config.language_for_path(file_path)
|
||||
if not language_id:
|
||||
continue
|
||||
|
||||
parser = self.parser_factory.get_parser(language_id)
|
||||
indexed_file = parser.parse(text, file_path)
|
||||
|
||||
# Add to directory index
|
||||
store.add_file(
|
||||
name=file_path.name,
|
||||
full_path=file_path,
|
||||
content=text,
|
||||
language=language_id,
|
||||
symbols=indexed_file.symbols,
|
||||
)
|
||||
|
||||
files_count += 1
|
||||
symbols_count += len(indexed_file.symbols)
|
||||
|
||||
except Exception as exc:
|
||||
self.logger.debug("Failed to index %s: %s", file_path, exc)
|
||||
continue
|
||||
|
||||
# Get list of subdirectories
|
||||
subdirs = [
|
||||
d.name
|
||||
for d in dir_path.iterdir()
|
||||
if d.is_dir()
|
||||
and d.name not in self.IGNORE_DIRS
|
||||
and not d.name.startswith(".")
|
||||
]
|
||||
|
||||
store.close()
|
||||
|
||||
self.logger.debug(
|
||||
"Built %s: %d files, %d symbols, %d subdirs",
|
||||
dir_path,
|
||||
files_count,
|
||||
symbols_count,
|
||||
len(subdirs),
|
||||
)
|
||||
|
||||
return DirBuildResult(
|
||||
source_path=dir_path,
|
||||
index_path=index_db_path,
|
||||
files_count=files_count,
|
||||
symbols_count=symbols_count,
|
||||
subdirs=subdirs,
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
self.logger.error("Failed to build directory %s: %s", dir_path, exc)
|
||||
return DirBuildResult(
|
||||
source_path=dir_path,
|
||||
index_path=index_db_path,
|
||||
files_count=0,
|
||||
symbols_count=0,
|
||||
subdirs=[],
|
||||
error=str(exc),
|
||||
)
|
||||
|
||||
def _link_children_to_parent(
|
||||
self, parent_path: Path, all_results: List[DirBuildResult]
|
||||
) -> None:
|
||||
"""Link child directory indexes to parent's subdirs table.
|
||||
|
||||
Finds all direct children of parent_path in all_results and
|
||||
registers them as subdirectories in the parent's index.
|
||||
|
||||
Args:
|
||||
parent_path: Parent directory path
|
||||
all_results: List of all build results
|
||||
"""
|
||||
parent_index_db = self.mapper.source_to_index_db(parent_path)
|
||||
|
||||
try:
|
||||
store = DirIndexStore(parent_index_db)
|
||||
store.initialize()
|
||||
|
||||
for result in all_results:
|
||||
# Only register direct children (parent is one level up)
|
||||
if result.source_path.parent != parent_path:
|
||||
continue
|
||||
|
||||
if result.error:
|
||||
continue
|
||||
|
||||
# Register subdirectory link
|
||||
store.register_subdir(
|
||||
name=result.source_path.name,
|
||||
index_path=result.index_path,
|
||||
files_count=result.files_count,
|
||||
direct_files=result.files_count,
|
||||
)
|
||||
self.logger.debug(
|
||||
"Linked %s to parent %s",
|
||||
result.source_path.name,
|
||||
parent_path,
|
||||
)
|
||||
|
||||
store.close()
|
||||
|
||||
except Exception as exc:
|
||||
self.logger.error(
|
||||
"Failed to link children to %s: %s", parent_path, exc
|
||||
)
|
||||
|
||||
def _iter_source_files(
|
||||
self, dir_path: Path, languages: List[str] = None
|
||||
) -> List[Path]:
|
||||
"""Iterate source files in directory (non-recursive).
|
||||
|
||||
Returns files in the specified directory that match language filters.
|
||||
Does not recurse into subdirectories.
|
||||
|
||||
Args:
|
||||
dir_path: Directory to scan
|
||||
languages: Optional language filter
|
||||
|
||||
Returns:
|
||||
List of source file paths
|
||||
"""
|
||||
files: List[Path] = []
|
||||
|
||||
if not dir_path.is_dir():
|
||||
return files
|
||||
|
||||
for item in dir_path.iterdir():
|
||||
if not item.is_file():
|
||||
continue
|
||||
|
||||
if item.name.startswith("."):
|
||||
continue
|
||||
|
||||
# Check language support
|
||||
language_id = self.config.language_for_path(item)
|
||||
if not language_id:
|
||||
continue
|
||||
|
||||
# Apply language filter
|
||||
if languages and language_id not in languages:
|
||||
continue
|
||||
|
||||
files.append(item)
|
||||
|
||||
return files
|
||||
|
||||
|
||||
# === Worker Function for ProcessPoolExecutor ===
|
||||
|
||||
|
||||
def _build_dir_worker(args: tuple) -> DirBuildResult:
|
||||
"""Worker function for parallel directory building.
|
||||
|
||||
Must be at module level for ProcessPoolExecutor pickling.
|
||||
Reconstructs necessary objects from serializable arguments.
|
||||
|
||||
Args:
|
||||
args: Tuple of (dir_path, index_db_path, languages, config_dict)
|
||||
|
||||
Returns:
|
||||
DirBuildResult for the directory
|
||||
"""
|
||||
dir_path, index_db_path, languages, config_dict = args
|
||||
|
||||
# Reconstruct config
|
||||
config = Config(
|
||||
data_dir=Path(config_dict["data_dir"]),
|
||||
supported_languages=config_dict["supported_languages"],
|
||||
parsing_rules=config_dict["parsing_rules"],
|
||||
)
|
||||
|
||||
parser_factory = ParserFactory(config)
|
||||
|
||||
try:
|
||||
# Ensure index directory exists
|
||||
index_db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create directory index
|
||||
store = DirIndexStore(index_db_path)
|
||||
store.initialize()
|
||||
|
||||
files_count = 0
|
||||
symbols_count = 0
|
||||
|
||||
# Index files in this directory
|
||||
for item in dir_path.iterdir():
|
||||
if not item.is_file():
|
||||
continue
|
||||
|
||||
if item.name.startswith("."):
|
||||
continue
|
||||
|
||||
language_id = config.language_for_path(item)
|
||||
if not language_id:
|
||||
continue
|
||||
|
||||
if languages and language_id not in languages:
|
||||
continue
|
||||
|
||||
try:
|
||||
text = item.read_text(encoding="utf-8", errors="ignore")
|
||||
parser = parser_factory.get_parser(language_id)
|
||||
indexed_file = parser.parse(text, item)
|
||||
|
||||
store.add_file(
|
||||
name=item.name,
|
||||
full_path=item,
|
||||
content=text,
|
||||
language=language_id,
|
||||
symbols=indexed_file.symbols,
|
||||
)
|
||||
|
||||
files_count += 1
|
||||
symbols_count += len(indexed_file.symbols)
|
||||
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Get subdirectories
|
||||
ignore_dirs = {
|
||||
".git",
|
||||
".venv",
|
||||
"venv",
|
||||
"node_modules",
|
||||
"__pycache__",
|
||||
".codexlens",
|
||||
".idea",
|
||||
".vscode",
|
||||
}
|
||||
|
||||
subdirs = [
|
||||
d.name
|
||||
for d in dir_path.iterdir()
|
||||
if d.is_dir() and d.name not in ignore_dirs and not d.name.startswith(".")
|
||||
]
|
||||
|
||||
store.close()
|
||||
|
||||
return DirBuildResult(
|
||||
source_path=dir_path,
|
||||
index_path=index_db_path,
|
||||
files_count=files_count,
|
||||
symbols_count=symbols_count,
|
||||
subdirs=subdirs,
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
return DirBuildResult(
|
||||
source_path=dir_path,
|
||||
index_path=index_db_path,
|
||||
files_count=0,
|
||||
symbols_count=0,
|
||||
subdirs=[],
|
||||
error=str(exc),
|
||||
)
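The worker above has to live at module level because ProcessPoolExecutor pickles the callable and its arguments for each child process; the builder instance itself (logger, parser factory, open SQLite handles) is never shipped across, only plain tuples and a config dict. A reduced sketch of that constraint, with hypothetical paths:

# Reduced sketch of the pickling constraint (paths are hypothetical).
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


def echo_worker(args: tuple) -> str:
    # Must be importable at module level and receive only plain data.
    dir_path, index_db_path = args
    return f"{dir_path} -> {index_db_path}"


if __name__ == "__main__":
    jobs = [
        (Path("project/src"), Path("indexes/project/src/_index.db")),
        (Path("project/tests"), Path("indexes/project/tests/_index.db")),
    ]
    with ProcessPoolExecutor(max_workers=2) as pool:
        for line in pool.map(echo_worker, jobs):
            print(line)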
|
||||
274
codex-lens/src/codexlens/storage/path_mapper.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""Path mapping utilities for source paths and index paths.
|
||||
|
||||
This module provides bidirectional mapping between source code directories
|
||||
and their corresponding index storage locations.
|
||||
|
||||
Storage Structure:
|
||||
~/.codexlens/
|
||||
├── registry.db # Global mapping table
|
||||
└── indexes/
|
||||
└── D/
|
||||
└── Claude_dms3/
|
||||
├── _index.db # Root directory index
|
||||
└── src/
|
||||
└── _index.db # src/ directory index
|
||||
"""
|
||||
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class PathMapper:
|
||||
"""Bidirectional mapping tool for source paths ↔ index paths.
|
||||
|
||||
Handles cross-platform path normalization and conversion between
|
||||
source code directories and their index storage locations.
|
||||
|
||||
Attributes:
|
||||
DEFAULT_INDEX_ROOT: Default root directory for all indexes
|
||||
INDEX_DB_NAME: Standard name for index database files
|
||||
index_root: Configured index root directory
|
||||
"""
|
||||
|
||||
DEFAULT_INDEX_ROOT = Path.home() / ".codexlens" / "indexes"
|
||||
INDEX_DB_NAME = "_index.db"
|
||||
|
||||
def __init__(self, index_root: Optional[Path] = None):
|
||||
"""Initialize PathMapper with optional custom index root.
|
||||
|
||||
Args:
|
||||
index_root: Custom index root directory. If None, uses DEFAULT_INDEX_ROOT.
|
||||
"""
|
||||
self.index_root = (index_root or self.DEFAULT_INDEX_ROOT).resolve()
|
||||
|
||||
def source_to_index_dir(self, source_path: Path) -> Path:
|
||||
"""Convert source directory to its index directory path.
|
||||
|
||||
Maps a source code directory to where its index data should be stored.
|
||||
The mapping preserves the directory structure but normalizes paths
|
||||
for cross-platform compatibility.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path to map
|
||||
|
||||
Returns:
|
||||
Index directory path under index_root
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.source_to_index_dir(Path("D:/Claude_dms3/src"))
|
||||
PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src')
|
||||
|
||||
>>> mapper.source_to_index_dir(Path("/home/user/project"))
|
||||
PosixPath('/home/user/.codexlens/indexes/home/user/project')
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
normalized = self.normalize_path(source_path)
|
||||
return self.index_root / normalized
|
||||
|
||||
def source_to_index_db(self, source_path: Path) -> Path:
|
||||
"""Convert source directory to its index database file path.
|
||||
|
||||
Maps a source directory to the full path of its index database file,
|
||||
including the standard INDEX_DB_NAME.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path to map
|
||||
|
||||
Returns:
|
||||
Full path to the index database file
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.source_to_index_db(Path("D:/Claude_dms3/src"))
|
||||
PosixPath('/home/user/.codexlens/indexes/D/Claude_dms3/src/_index.db')
|
||||
"""
|
||||
index_dir = self.source_to_index_dir(source_path)
|
||||
return index_dir / self.INDEX_DB_NAME
|
||||
|
||||
def index_to_source(self, index_path: Path) -> Path:
|
||||
"""Convert index path back to original source path.
|
||||
|
||||
Performs reverse mapping from an index storage location to the
|
||||
original source directory. Handles both directory paths and
|
||||
database file paths.
|
||||
|
||||
Args:
|
||||
index_path: Index directory or database file path
|
||||
|
||||
Returns:
|
||||
Original source directory path
|
||||
|
||||
Raises:
|
||||
ValueError: If index_path is not under index_root
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.index_to_source(
|
||||
... Path("~/.codexlens/indexes/D/Claude_dms3/src/_index.db")
|
||||
... )
|
||||
WindowsPath('D:/Claude_dms3/src')
|
||||
|
||||
>>> mapper.index_to_source(
|
||||
... Path("~/.codexlens/indexes/D/Claude_dms3/src")
|
||||
... )
|
||||
WindowsPath('D:/Claude_dms3/src')
|
||||
"""
|
||||
index_path = index_path.resolve()
|
||||
|
||||
# Remove _index.db if present
|
||||
if index_path.name == self.INDEX_DB_NAME:
|
||||
index_path = index_path.parent
|
||||
|
||||
# Verify path is under index_root
|
||||
try:
|
||||
relative = index_path.relative_to(self.index_root)
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Index path {index_path} is not under index root {self.index_root}"
|
||||
)
|
||||
|
||||
# Convert normalized path back to source path
|
||||
normalized_str = str(relative).replace("\\", "/")
|
||||
return self.denormalize_path(normalized_str)
|
||||
|
||||
def get_project_root(self, source_path: Path) -> Path:
|
||||
"""Find the project root directory (topmost indexed directory).
|
||||
|
||||
Walks up the directory tree to find the highest-level directory
|
||||
that has an index database.
|
||||
|
||||
Args:
|
||||
source_path: Source directory to start from
|
||||
|
||||
Returns:
|
||||
Project root directory path. Returns source_path itself if
|
||||
no parent index is found.
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.get_project_root(Path("D:/Claude_dms3/src/codexlens"))
|
||||
WindowsPath('D:/Claude_dms3')
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
current = source_path
|
||||
project_root = source_path
|
||||
|
||||
# Walk up the tree
|
||||
while current.parent != current: # Stop at filesystem root
|
||||
parent_index_db = self.source_to_index_db(current.parent)
|
||||
if parent_index_db.exists():
|
||||
project_root = current.parent
|
||||
current = current.parent
|
||||
else:
|
||||
break
|
||||
|
||||
return project_root
|
||||
|
||||
def get_relative_depth(self, source_path: Path, project_root: Path) -> int:
|
||||
"""Calculate directory depth relative to project root.
|
||||
|
||||
Args:
|
||||
source_path: Target directory path
|
||||
project_root: Project root directory path
|
||||
|
||||
Returns:
|
||||
Number of directory levels from project_root to source_path
|
||||
|
||||
Raises:
|
||||
ValueError: If source_path is not under project_root
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.get_relative_depth(
|
||||
... Path("D:/Claude_dms3/src/codexlens"),
|
||||
... Path("D:/Claude_dms3")
|
||||
... )
|
||||
2
|
||||
"""
|
||||
source_path = source_path.resolve()
|
||||
project_root = project_root.resolve()
|
||||
|
||||
try:
|
||||
relative = source_path.relative_to(project_root)
|
||||
# Count path components
|
||||
return len(relative.parts)
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Source path {source_path} is not under project root {project_root}"
|
||||
)
|
||||
|
||||
def normalize_path(self, path: Path) -> str:
|
||||
"""Normalize path to cross-platform storage format.
|
||||
|
||||
Converts OS-specific paths to a standardized format for storage:
|
||||
- Windows: Removes drive colons (D: → D)
|
||||
- Unix: Removes leading slash
|
||||
- Uses forward slashes throughout
|
||||
|
||||
Args:
|
||||
path: Path to normalize
|
||||
|
||||
Returns:
|
||||
Normalized path string
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.normalize_path(Path("D:/path/to/dir"))
|
||||
'D/path/to/dir'
|
||||
|
||||
>>> mapper.normalize_path(Path("/home/user/path"))
|
||||
'home/user/path'
|
||||
"""
|
||||
path = path.resolve()
|
||||
path_str = str(path)
|
||||
|
||||
# Handle Windows paths with drive letters
|
||||
if platform.system() == "Windows" and len(path.parts) > 0:
|
||||
# Convert D:\path\to\dir → D/path/to/dir
|
||||
drive = path.parts[0].replace(":", "") # D: → D
|
||||
rest = Path(*path.parts[1:]) if len(path.parts) > 1 else Path()
|
||||
normalized = f"{drive}/{rest}".replace("\\", "/")
|
||||
return normalized.rstrip("/")
|
||||
|
||||
# Handle Unix paths
|
||||
# /home/user/path → home/user/path
|
||||
return path_str.lstrip("/").replace("\\", "/")
|
||||
|
||||
def denormalize_path(self, normalized: str) -> Path:
|
||||
"""Convert normalized path back to OS-specific path.
|
||||
|
||||
Reverses the normalization process to restore OS-native path format:
|
||||
- Windows: Adds drive colons (D → D:)
|
||||
- Unix: Adds leading slash
|
||||
|
||||
Args:
|
||||
normalized: Normalized path string
|
||||
|
||||
Returns:
|
||||
OS-specific Path object
|
||||
|
||||
Examples:
|
||||
>>> mapper = PathMapper()
|
||||
>>> mapper.denormalize_path("D/path/to/dir") # On Windows
|
||||
WindowsPath('D:/path/to/dir')
|
||||
|
||||
>>> mapper.denormalize_path("home/user/path") # On Unix
|
||||
PosixPath('/home/user/path')
|
||||
"""
|
||||
parts = normalized.split("/")
|
||||
|
||||
# Handle Windows paths
|
||||
if platform.system() == "Windows" and len(parts) > 0:
|
||||
# Check if first part is a drive letter
|
||||
if len(parts[0]) == 1 and parts[0].isalpha():
|
||||
# D/path/to/dir → D:/path/to/dir
|
||||
drive = f"{parts[0]}:"
|
||||
if len(parts) > 1:
|
||||
return Path(drive) / Path(*parts[1:])
|
||||
return Path(drive)
|
||||
|
||||
# Handle Unix paths or relative paths
|
||||
# home/user/path → /home/user/path
|
||||
return Path("/") / Path(*parts)
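A short round-trip usage sketch for PathMapper; the index root and source path below are illustrative, and the printed values depend on platform and home directory.

# Usage sketch (illustrative paths; output shown for a Unix machine).
# Assumes: from codexlens.storage.path_mapper import PathMapper
from pathlib import Path

mapper = PathMapper(index_root=Path("/tmp/codexlens-indexes"))

src = Path("/home/user/project/src")
db = mapper.source_to_index_db(src)
print(db)
# /tmp/codexlens-indexes/home/user/project/src/_index.db

# index_to_source strips _index.db and undoes the normalization,
# recovering the original source directory on the same platform.
print(mapper.index_to_source(db))
# /home/user/project/src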
|
||||
600
codex-lens/src/codexlens/storage/registry.py
Normal file
@@ -0,0 +1,600 @@
|
||||
"""Global project registry for CodexLens - SQLite storage."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from codexlens.errors import StorageError
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProjectInfo:
|
||||
"""Registered project information."""
|
||||
|
||||
id: int
|
||||
source_root: Path
|
||||
index_root: Path
|
||||
created_at: float
|
||||
last_indexed: float
|
||||
total_files: int
|
||||
total_dirs: int
|
||||
status: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class DirMapping:
|
||||
"""Directory to index path mapping."""
|
||||
|
||||
id: int
|
||||
project_id: int
|
||||
source_path: Path
|
||||
index_path: Path
|
||||
depth: int
|
||||
files_count: int
|
||||
last_updated: float
|
||||
|
||||
|
||||
class RegistryStore:
|
||||
"""Global project registry - SQLite storage.
|
||||
|
||||
Manages indexed projects and directory-to-index path mappings.
|
||||
Thread-safe with connection pooling.
|
||||
"""
|
||||
|
||||
DEFAULT_DB_PATH = Path.home() / ".codexlens" / "registry.db"
|
||||
|
||||
def __init__(self, db_path: Path | None = None) -> None:
|
||||
self.db_path = (db_path or self.DEFAULT_DB_PATH).resolve()
|
||||
self._lock = threading.RLock()
|
||||
self._local = threading.local()
|
||||
self._pool_lock = threading.Lock()
|
||||
self._pool: Dict[int, sqlite3.Connection] = {}
|
||||
self._pool_generation = 0
|
||||
|
||||
def _get_connection(self) -> sqlite3.Connection:
|
||||
"""Get or create a thread-local database connection."""
|
||||
thread_id = threading.get_ident()
|
||||
if getattr(self._local, "generation", None) == self._pool_generation:
|
||||
conn = getattr(self._local, "conn", None)
|
||||
if conn is not None:
|
||||
return conn
|
||||
|
||||
with self._pool_lock:
|
||||
conn = self._pool.get(thread_id)
|
||||
if conn is None:
|
||||
conn = sqlite3.connect(self.db_path, check_same_thread=False)
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.execute("PRAGMA synchronous=NORMAL")
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
self._pool[thread_id] = conn
|
||||
|
||||
self._local.conn = conn
|
||||
self._local.generation = self._pool_generation
|
||||
return conn
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close all pooled connections."""
|
||||
with self._lock:
|
||||
with self._pool_lock:
|
||||
for conn in self._pool.values():
|
||||
conn.close()
|
||||
self._pool.clear()
|
||||
self._pool_generation += 1
|
||||
|
||||
if hasattr(self._local, "conn"):
|
||||
self._local.conn = None
|
||||
if hasattr(self._local, "generation"):
|
||||
self._local.generation = self._pool_generation
|
||||
|
||||
def __enter__(self) -> RegistryStore:
|
||||
self.initialize()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
|
||||
self.close()
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Create database and schema."""
|
||||
with self._lock:
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
conn = self._get_connection()
|
||||
self._create_schema(conn)
|
||||
|
||||
def _create_schema(self, conn: sqlite3.Connection) -> None:
|
||||
"""Create database schema."""
|
||||
try:
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS projects (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_root TEXT UNIQUE NOT NULL,
|
||||
index_root TEXT NOT NULL,
|
||||
created_at REAL,
|
||||
last_indexed REAL,
|
||||
total_files INTEGER DEFAULT 0,
|
||||
total_dirs INTEGER DEFAULT 0,
|
||||
status TEXT DEFAULT 'active'
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS dir_mapping (
|
||||
id INTEGER PRIMARY KEY,
|
||||
project_id INTEGER REFERENCES projects(id) ON DELETE CASCADE,
|
||||
source_path TEXT NOT NULL,
|
||||
index_path TEXT NOT NULL,
|
||||
depth INTEGER,
|
||||
files_count INTEGER DEFAULT 0,
|
||||
last_updated REAL,
|
||||
UNIQUE(source_path)
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_dir_source ON dir_mapping(source_path)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_dir_project ON dir_mapping(project_id)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_project_source ON projects(source_root)"
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
except sqlite3.DatabaseError as exc:
|
||||
raise StorageError(f"Failed to initialize registry schema: {exc}") from exc
|
||||
|
||||
# === Project Operations ===
|
||||
|
||||
def register_project(self, source_root: Path, index_root: Path) -> ProjectInfo:
|
||||
"""Register a new project or update existing one.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
index_root: Index storage root directory
|
||||
|
||||
Returns:
|
||||
ProjectInfo for the registered project
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = str(source_root.resolve())
|
||||
index_root_str = str(index_root.resolve())
|
||||
now = time.time()
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO projects(source_root, index_root, created_at, last_indexed)
|
||||
VALUES(?, ?, ?, ?)
|
||||
ON CONFLICT(source_root) DO UPDATE SET
|
||||
index_root=excluded.index_root,
|
||||
last_indexed=excluded.last_indexed,
|
||||
status='active'
|
||||
""",
|
||||
(source_root_str, index_root_str, now, now),
|
||||
)
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM projects WHERE source_root=?", (source_root_str,)
|
||||
).fetchone()
|
||||
|
||||
conn.commit()
|
||||
|
||||
if not row:
|
||||
raise StorageError(f"Failed to register project: {source_root}")
|
||||
|
||||
return self._row_to_project_info(row)
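A minimal usage sketch for register_project (paths are hypothetical): because of the ON CONFLICT(source_root) clause, registering the same project twice updates the existing row in place rather than inserting a duplicate.

# Usage sketch; db_path and project paths are hypothetical.
# Assumes: from codexlens.storage.registry import RegistryStore
from pathlib import Path

with RegistryStore(db_path=Path("/tmp/registry.db")) as registry:
    info = registry.register_project(
        Path("/home/user/project"),
        Path("/tmp/codexlens-indexes/home/user/project"),
    )
    # Re-registering hits the ON CONFLICT clause: same row, same id,
    # refreshed index_root and last_indexed.
    again = registry.register_project(
        Path("/home/user/project"),
        Path("/tmp/codexlens-indexes/home/user/project"),
    )
    assert again.id == info.id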
|
||||
|
||||
def unregister_project(self, source_root: Path) -> bool:
|
||||
"""Remove a project registration (cascades to directory mappings).
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
|
||||
Returns:
|
||||
True if project was removed, False if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = str(source_root.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT id FROM projects WHERE source_root=?", (source_root_str,)
|
||||
).fetchone()
|
||||
|
||||
if not row:
|
||||
return False
|
||||
|
||||
conn.execute("DELETE FROM projects WHERE source_root=?", (source_root_str,))
|
||||
conn.commit()
|
||||
return True
|
||||
|
||||
def get_project(self, source_root: Path) -> Optional[ProjectInfo]:
|
||||
"""Get project information by source root.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
|
||||
Returns:
|
||||
ProjectInfo if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = str(source_root.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM projects WHERE source_root=?", (source_root_str,)
|
||||
).fetchone()
|
||||
|
||||
return self._row_to_project_info(row) if row else None
|
||||
|
||||
def get_project_by_id(self, project_id: int) -> Optional[ProjectInfo]:
|
||||
"""Get project information by ID.
|
||||
|
||||
Args:
|
||||
project_id: Project database ID
|
||||
|
||||
Returns:
|
||||
ProjectInfo if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM projects WHERE id=?", (project_id,)
|
||||
).fetchone()
|
||||
|
||||
return self._row_to_project_info(row) if row else None
|
||||
|
||||
def list_projects(self, status: Optional[str] = None) -> List[ProjectInfo]:
|
||||
"""List all registered projects.
|
||||
|
||||
Args:
|
||||
status: Optional status filter ('active', 'stale', 'removed')
|
||||
|
||||
Returns:
|
||||
List of ProjectInfo objects
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
|
||||
if status:
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM projects WHERE status=? ORDER BY created_at DESC",
|
||||
(status,),
|
||||
).fetchall()
|
||||
else:
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM projects ORDER BY created_at DESC"
|
||||
).fetchall()
|
||||
|
||||
return [self._row_to_project_info(row) for row in rows]
|
||||
|
||||
def update_project_stats(
|
||||
self, source_root: Path, total_files: int, total_dirs: int
|
||||
) -> None:
|
||||
"""Update project statistics.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
total_files: Total number of indexed files
|
||||
total_dirs: Total number of indexed directories
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = str(source_root.resolve())
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE projects
|
||||
SET total_files=?, total_dirs=?, last_indexed=?
|
||||
WHERE source_root=?
|
||||
""",
|
||||
(total_files, total_dirs, time.time(), source_root_str),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def set_project_status(self, source_root: Path, status: str) -> None:
|
||||
"""Set project status.
|
||||
|
||||
Args:
|
||||
source_root: Source code root directory
|
||||
status: Status string ('active', 'stale', 'removed')
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_root_str = str(source_root.resolve())
|
||||
|
||||
conn.execute(
|
||||
"UPDATE projects SET status=? WHERE source_root=?",
|
||||
(status, source_root_str),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
# === Directory Mapping Operations ===
|
||||
|
||||
def register_dir(
|
||||
self,
|
||||
project_id: int,
|
||||
source_path: Path,
|
||||
index_path: Path,
|
||||
depth: int,
|
||||
files_count: int = 0,
|
||||
) -> DirMapping:
|
||||
"""Register a directory mapping.
|
||||
|
||||
Args:
|
||||
project_id: Project database ID
|
||||
source_path: Source directory path
|
||||
index_path: Index database path
|
||||
depth: Directory depth relative to project root
|
||||
files_count: Number of files in directory
|
||||
|
||||
Returns:
|
||||
DirMapping for the registered directory
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = str(source_path.resolve())
|
||||
index_path_str = str(index_path.resolve())
|
||||
now = time.time()
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO dir_mapping(
|
||||
project_id, source_path, index_path, depth, files_count, last_updated
|
||||
)
|
||||
VALUES(?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(source_path) DO UPDATE SET
|
||||
index_path=excluded.index_path,
|
||||
depth=excluded.depth,
|
||||
files_count=excluded.files_count,
|
||||
last_updated=excluded.last_updated
|
||||
""",
|
||||
(project_id, source_path_str, index_path_str, depth, files_count, now),
|
||||
)
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT * FROM dir_mapping WHERE source_path=?", (source_path_str,)
|
||||
).fetchone()
|
||||
|
||||
conn.commit()
|
||||
|
||||
if not row:
|
||||
raise StorageError(f"Failed to register directory: {source_path}")
|
||||
|
||||
return self._row_to_dir_mapping(row)
|
||||
|
||||
def unregister_dir(self, source_path: Path) -> bool:
|
||||
"""Remove a directory mapping.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path
|
||||
|
||||
Returns:
|
||||
True if directory was removed, False if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = str(source_path.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT id FROM dir_mapping WHERE source_path=?", (source_path_str,)
|
||||
).fetchone()
|
||||
|
||||
if not row:
|
||||
return False
|
||||
|
||||
conn.execute("DELETE FROM dir_mapping WHERE source_path=?", (source_path_str,))
|
||||
conn.commit()
|
||||
return True
|
||||
|
||||
def find_index_path(self, source_path: Path) -> Optional[Path]:
|
||||
"""Find index path for a source directory (exact match).
|
||||
|
||||
Args:
|
||||
source_path: Source directory path
|
||||
|
||||
Returns:
|
||||
Index path if found, None otherwise
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = str(source_path.resolve())
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT index_path FROM dir_mapping WHERE source_path=?",
|
||||
(source_path_str,),
|
||||
).fetchone()
|
||||
|
||||
return Path(row["index_path"]) if row else None
|
||||
|
||||
def find_nearest_index(self, source_path: Path) -> Optional[DirMapping]:
|
||||
"""Find nearest indexed ancestor directory.
|
||||
|
||||
Searches for the closest parent directory that has an index.
|
||||
Useful for supporting subdirectory searches.
|
||||
|
||||
Args:
|
||||
source_path: Source directory or file path
|
||||
|
||||
Returns:
|
||||
DirMapping for nearest ancestor, None if not found
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_resolved = source_path.resolve()
|
||||
|
||||
# Check from current path up to root
|
||||
current = source_path_resolved
|
||||
while True:
|
||||
current_str = str(current)
|
||||
row = conn.execute(
|
||||
"SELECT * FROM dir_mapping WHERE source_path=?", (current_str,)
|
||||
).fetchone()
|
||||
|
||||
if row:
|
||||
return self._row_to_dir_mapping(row)
|
||||
|
||||
parent = current.parent
|
||||
if parent == current: # Reached filesystem root
|
||||
break
|
||||
current = parent
|
||||
|
||||
return None
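A usage sketch for find_nearest_index with hypothetical paths, showing how a deep source path resolves to the closest indexed ancestor by walking up the tree:

# Sketch only; registry contents and directories are hypothetical.
# Assumes: from codexlens.storage.registry import RegistryStore
from pathlib import Path

registry = RegistryStore(Path("/tmp/registry.db"))
registry.initialize()

mapping = registry.find_nearest_index(Path("/home/user/project/src/api/handlers"))
if mapping is not None:
    # e.g. /home/user/project/src, if that is the deepest indexed ancestor
    print(mapping.source_path, "->", mapping.index_path)
registry.close()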
|
||||
|
||||
def get_project_dirs(self, project_id: int) -> List[DirMapping]:
|
||||
"""Get all directory mappings for a project.
|
||||
|
||||
Args:
|
||||
project_id: Project database ID
|
||||
|
||||
Returns:
|
||||
List of DirMapping objects
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM dir_mapping WHERE project_id=? ORDER BY depth, source_path",
|
||||
(project_id,),
|
||||
).fetchall()
|
||||
|
||||
return [self._row_to_dir_mapping(row) for row in rows]
|
||||
|
||||
def get_subdirs(self, source_path: Path) -> List[DirMapping]:
|
||||
"""Get direct subdirectory mappings.
|
||||
|
||||
Args:
|
||||
source_path: Parent directory path
|
||||
|
||||
Returns:
|
||||
List of DirMapping objects for direct children
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = str(source_path.resolve())
|
||||
|
||||
# First get the parent's depth
|
||||
parent_row = conn.execute(
|
||||
"SELECT depth, project_id FROM dir_mapping WHERE source_path=?",
|
||||
(source_path_str,),
|
||||
).fetchone()
|
||||
|
||||
if not parent_row:
|
||||
return []
|
||||
|
||||
parent_depth = int(parent_row["depth"])
|
||||
project_id = int(parent_row["project_id"])
|
||||
|
||||
# Get all subdirs with depth = parent_depth + 1 and matching path prefix
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT * FROM dir_mapping
|
||||
WHERE project_id=? AND depth=? AND source_path LIKE ?
|
||||
ORDER BY source_path
|
||||
""",
|
||||
(project_id, parent_depth + 1, f"{source_path_str}%"),
|
||||
).fetchall()
|
||||
|
||||
return [self._row_to_dir_mapping(row) for row in rows]
|
||||
|
||||
def update_dir_stats(self, source_path: Path, files_count: int) -> None:
|
||||
"""Update directory statistics.
|
||||
|
||||
Args:
|
||||
source_path: Source directory path
|
||||
files_count: Number of files in directory
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
source_path_str = str(source_path.resolve())
|
||||
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE dir_mapping
|
||||
SET files_count=?, last_updated=?
|
||||
WHERE source_path=?
|
||||
""",
|
||||
(files_count, time.time(), source_path_str),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def update_index_paths(self, old_root: Path, new_root: Path) -> int:
|
||||
"""Update all index paths after migration.
|
||||
|
||||
Replaces old_root prefix with new_root in all stored index paths.
|
||||
|
||||
Args:
|
||||
old_root: Old index root directory
|
||||
new_root: New index root directory
|
||||
|
||||
Returns:
|
||||
Number of paths updated
|
||||
"""
|
||||
with self._lock:
|
||||
conn = self._get_connection()
|
||||
old_root_str = str(old_root.resolve())
|
||||
new_root_str = str(new_root.resolve())
|
||||
updated = 0
|
||||
|
||||
# Update projects
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE projects
|
||||
SET index_root = REPLACE(index_root, ?, ?)
|
||||
WHERE index_root LIKE ?
|
||||
""",
|
||||
(old_root_str, new_root_str, f"{old_root_str}%"),
|
||||
)
|
||||
updated += conn.total_changes
|
||||
|
||||
# Update dir_mapping
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE dir_mapping
|
||||
SET index_path = REPLACE(index_path, ?, ?)
|
||||
WHERE index_path LIKE ?
|
||||
""",
|
||||
(old_root_str, new_root_str, f"{old_root_str}%"),
|
||||
)
|
||||
updated += conn.total_changes
|
||||
|
||||
conn.commit()
|
||||
return updated
|
||||
|
||||
# === Internal Methods ===
|
||||
|
||||
def _row_to_project_info(self, row: sqlite3.Row) -> ProjectInfo:
|
||||
"""Convert database row to ProjectInfo."""
|
||||
return ProjectInfo(
|
||||
id=int(row["id"]),
|
||||
source_root=Path(row["source_root"]),
|
||||
index_root=Path(row["index_root"]),
|
||||
created_at=float(row["created_at"]) if row["created_at"] else 0.0,
|
||||
last_indexed=float(row["last_indexed"]) if row["last_indexed"] else 0.0,
|
||||
total_files=int(row["total_files"]) if row["total_files"] else 0,
|
||||
total_dirs=int(row["total_dirs"]) if row["total_dirs"] else 0,
|
||||
status=str(row["status"]) if row["status"] else "active",
|
||||
)
|
||||
|
||||
def _row_to_dir_mapping(self, row: sqlite3.Row) -> DirMapping:
|
||||
"""Convert database row to DirMapping."""
|
||||
return DirMapping(
|
||||
id=int(row["id"]),
|
||||
project_id=int(row["project_id"]),
|
||||
source_path=Path(row["source_path"]),
|
||||
index_path=Path(row["index_path"]),
|
||||
depth=int(row["depth"]) if row["depth"] is not None else 0,
|
||||
files_count=int(row["files_count"]) if row["files_count"] else 0,
|
||||
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
|
||||
)
|
||||
@@ -43,6 +43,8 @@ class SQLiteStore:
                conn.execute("PRAGMA journal_mode=WAL")
                conn.execute("PRAGMA synchronous=NORMAL")
                conn.execute("PRAGMA foreign_keys=ON")
+               # Memory-mapped I/O for faster reads (30GB limit)
+               conn.execute("PRAGMA mmap_size=30000000000")
                self._pool[thread_id] = conn

        self._local.conn = conn
@@ -384,7 +386,8 @@ class SQLiteStore:
                    language UNINDEXED,
                    content,
                    content='files',
-                   content_rowid='id'
+                   content_rowid='id',
+                   tokenize="unicode61 tokenchars '_'"
                )
                """
            )
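One hedged follow-up on the tokenizer change above: CREATE VIRTUAL TABLE ... IF NOT EXISTS leaves an already-created files_fts table untouched, so an existing database keeps searching with the old tokenizer until the FTS table is recreated. A sketch of one possible migration, based on the directory-index schema shown earlier in this commit (the column list and db_path are assumptions, not part of the diff):

# Migration sketch (not part of the commit): drop, recreate with the new
# tokenizer, then repopulate from the external content table.
import sqlite3


def migrate_fts_tokenizer(db_path: str) -> None:
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("DROP TABLE IF EXISTS files_fts")
        conn.execute(
            """
            CREATE VIRTUAL TABLE files_fts USING fts5(
                name, full_path UNINDEXED, content,
                content='files',
                content_rowid='id',
                tokenize="unicode61 tokenchars '_'"
            )
            """
        )
        # FTS5 'rebuild' command repopulates the index from the 'files' table.
        conn.execute("INSERT INTO files_fts(files_fts) VALUES('rebuild')")
        conn.commit()
    finally:
        conn.close()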
603
codex-lens/tests/test_search_comprehensive.py
Normal file
@@ -0,0 +1,603 @@
|
||||
"""Comprehensive tests for CodexLens search functionality.
|
||||
|
||||
Tests cover:
|
||||
- FTS5 text search (basic, phrase, boolean, wildcard)
|
||||
- Chain search across directories
|
||||
- Symbol search (by name, kind, filters)
|
||||
- Files-only search mode
|
||||
- Edge cases and error handling
|
||||
"""
|
||||
|
||||
import tempfile
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from codexlens.storage.sqlite_store import SQLiteStore
|
||||
from codexlens.storage.dir_index import DirIndexStore
|
||||
from codexlens.storage.registry import RegistryStore
|
||||
from codexlens.storage.path_mapper import PathMapper
|
||||
from codexlens.search import (
|
||||
ChainSearchEngine,
|
||||
SearchOptions,
|
||||
SearchStats,
|
||||
ChainSearchResult,
|
||||
quick_search,
|
||||
)
|
||||
from codexlens.entities import IndexedFile, Symbol, SearchResult
|
||||
|
||||
|
||||
# === Fixtures ===
|
||||
|
||||
@pytest.fixture
|
||||
def temp_dir():
|
||||
"""Create a temporary directory."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
yield Path(tmpdir)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_files():
|
||||
"""Sample file data for testing."""
|
||||
return [
|
||||
(IndexedFile(
|
||||
path="/project/src/auth.py",
|
||||
language="python",
|
||||
symbols=[
|
||||
Symbol(name="authenticate", kind="function", range=(1, 10)),
|
||||
Symbol(name="verify_token", kind="function", range=(12, 20)),
|
||||
Symbol(name="AuthManager", kind="class", range=(22, 50)),
|
||||
],
|
||||
), """
|
||||
def authenticate(username, password):
|
||||
'''Authenticate user with credentials.'''
|
||||
user = find_user(username)
|
||||
if user and check_password(user, password):
|
||||
return create_token(user)
|
||||
return None
|
||||
|
||||
def verify_token(token):
|
||||
'''Verify JWT token validity.'''
|
||||
try:
|
||||
payload = decode_token(token)
|
||||
return payload
|
||||
except TokenExpired:
|
||||
return None
|
||||
|
||||
class AuthManager:
|
||||
'''Manages authentication state.'''
|
||||
def __init__(self):
|
||||
self.sessions = {}
|
||||
|
||||
def login(self, user):
|
||||
token = authenticate(user.name, user.password)
|
||||
self.sessions[user.id] = token
|
||||
return token
|
||||
"""),
|
||||
(IndexedFile(
|
||||
path="/project/src/database.py",
|
||||
language="python",
|
||||
symbols=[
|
||||
Symbol(name="connect", kind="function", range=(1, 5)),
|
||||
Symbol(name="query", kind="function", range=(7, 15)),
|
||||
Symbol(name="DatabasePool", kind="class", range=(17, 40)),
|
||||
],
|
||||
), """
|
||||
def connect(host, port, database):
|
||||
'''Establish database connection.'''
|
||||
return Connection(host, port, database)
|
||||
|
||||
def query(connection, sql, params=None):
|
||||
'''Execute SQL query and return results.'''
|
||||
cursor = connection.cursor()
|
||||
cursor.execute(sql, params or [])
|
||||
return cursor.fetchall()
|
||||
|
||||
class DatabasePool:
|
||||
'''Connection pool for database.'''
|
||||
def __init__(self, size=10):
|
||||
self.pool = []
|
||||
self.size = size
|
||||
|
||||
def get_connection(self):
|
||||
if self.pool:
|
||||
return self.pool.pop()
|
||||
return connect()
|
||||
"""),
|
||||
(IndexedFile(
|
||||
path="/project/src/utils.py",
|
||||
language="python",
|
||||
symbols=[
|
||||
Symbol(name="format_date", kind="function", range=(1, 3)),
|
||||
Symbol(name="parse_json", kind="function", range=(5, 10)),
|
||||
Symbol(name="hash_password", kind="function", range=(12, 18)),
|
||||
],
|
||||
), """
|
||||
def format_date(date, fmt='%Y-%m-%d'):
|
||||
return date.strftime(fmt)
|
||||
|
||||
def parse_json(data):
|
||||
'''Parse JSON string to dictionary.'''
|
||||
import json
|
||||
return json.loads(data)
|
||||
|
||||
def hash_password(password, salt=None):
|
||||
'''Hash password using bcrypt.'''
|
||||
import hashlib
|
||||
salt = salt or generate_salt()
|
||||
return hashlib.sha256((password + salt).encode()).hexdigest()
|
||||
"""),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def populated_store(temp_dir, sample_files):
|
||||
"""Create a populated SQLite store for testing."""
|
||||
db_path = temp_dir / "_index.db"
|
||||
store = SQLiteStore(db_path)
|
||||
store.initialize()
|
||||
|
||||
for indexed_file, content in sample_files:
|
||||
store.add_file(indexed_file, content)
|
||||
|
||||
yield store
|
||||
store.close()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def populated_dir_store(temp_dir, sample_files):
|
||||
"""Create a populated DirIndexStore for testing."""
|
||||
db_path = temp_dir / "_index.db"
|
||||
store = DirIndexStore(db_path)
|
||||
|
||||
for indexed_file, content in sample_files:
|
||||
store.add_file(indexed_file, content)
|
||||
|
||||
yield store
|
||||
store.close()
|
||||
|
||||
|
||||
# === FTS5 Search Tests ===


class TestFTS5BasicSearch:
    """Tests for basic FTS5 text search."""

    def test_single_term_search(self, populated_store):
        """Test search with a single term."""
        results = populated_store.search_fts("authenticate")
        assert len(results) >= 1
        assert any("auth" in r.path.lower() for r in results)

    def test_case_insensitive_search(self, populated_store):
        """Test that search is case insensitive."""
        results_lower = populated_store.search_fts("database")
        results_upper = populated_store.search_fts("DATABASE")
        results_mixed = populated_store.search_fts("DataBase")

        # All should return similar results
        assert len(results_lower) == len(results_upper) == len(results_mixed)

    def test_partial_word_search(self, populated_store):
        """Test search with partial words using wildcards."""
        results = populated_store.search_fts("auth*")
        assert len(results) >= 1
        # Should match authenticate, authentication, AuthManager, etc.

    def test_multiple_terms_search(self, populated_store):
        """Test search with multiple terms (implicit AND)."""
        results = populated_store.search_fts("user password")
        assert len(results) >= 1

    def test_no_results_search(self, populated_store):
        """Test search that returns no results."""
        results = populated_store.search_fts("nonexistent_xyz_term")
        assert len(results) == 0

    def test_search_with_limit(self, populated_store):
        """Test search respects limit parameter."""
        results = populated_store.search_fts("def", limit=1)
        assert len(results) <= 1

    def test_search_returns_excerpt(self, populated_store):
        """Test search results include excerpts."""
        results = populated_store.search_fts("authenticate")
        assert len(results) >= 1
        # SearchResult should have excerpt field
        for r in results:
            assert hasattr(r, 'excerpt')


class TestFTS5AdvancedSearch:
    """Tests for advanced FTS5 search features."""

    def test_phrase_search(self, populated_store):
        """Test exact phrase search with quotes."""
        results = populated_store.search_fts('"verify_token"')
        assert len(results) >= 1

    def test_boolean_or_search(self, populated_store):
        """Test OR boolean search."""
        results = populated_store.search_fts("authenticate OR database")
        # Should find files containing either term
        assert len(results) >= 2

    def test_boolean_not_search(self, populated_store):
        """Test NOT boolean search."""
        all_results = populated_store.search_fts("def")
        not_results = populated_store.search_fts("def NOT authenticate")
        # NOT should return fewer results
        assert len(not_results) <= len(all_results)

    def test_prefix_search(self, populated_store):
        """Test prefix search with asterisk."""
        results = populated_store.search_fts("connect*")
        assert len(results) >= 1
        # Should match connect, connection, etc.

    def test_special_characters_in_query(self, populated_store):
        """Test search handles special characters gracefully."""
        # Should not raise an error
        results = populated_store.search_fts("__init__")
        # May or may not have results, but shouldn't crash
        assert isinstance(results, list)

    def test_unicode_search(self, temp_dir):
        """Test search with unicode content."""
        store = SQLiteStore(temp_dir / "_index.db")
        store.initialize()

        indexed_file = IndexedFile(
            path="/test/unicode.py",
            language="python",
            symbols=[Symbol(name="世界", kind="function", range=(1, 1))],
        )
        store.add_file(indexed_file, "def 世界(): return '你好世界'")

        results = store.search_fts("世界")
        assert len(results) == 1

        store.close()


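# The sketch below is illustrative only and is not part of the original test
# module: it reproduces the FTS5 query forms exercised by the class above
# (prefix, phrase, OR, NOT) against a throwaway in-memory table, independent of
# CodexLens internals. The tokenizer settings are an assumption, chosen so that
# identifiers containing underscores (e.g. __init__, verify_token) stay
# searchable as single tokens; it requires an FTS5-enabled SQLite build.
def _example_fts5_query_forms():
    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute(
        "CREATE VIRTUAL TABLE docs USING fts5("
        "path, content, tokenize=\"unicode61 tokenchars '_'\")"
    )
    conn.execute(
        "INSERT INTO docs VALUES (?, ?)",
        (
            "auth.py",
            "def authenticate_user(token):\n"
            "    '''Authenticate a user.'''\n"
            "    return verify_token(token)",
        ),
    )

    def match(query):
        return conn.execute(
            "SELECT path FROM docs WHERE docs MATCH ?", (query,)
        ).fetchall()

    assert match("auth*")                       # prefix query
    assert match('"verify_token"')              # exact phrase / identifier
    assert match("authenticate OR database")    # boolean OR
    assert not match("authenticate NOT token")  # boolean NOT excludes this doc
    conn.close()

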
class TestFTS5Pagination:
    """Tests for FTS5 search pagination."""

    def test_offset_pagination(self, temp_dir):
        """Test search with offset for pagination."""
        store = SQLiteStore(temp_dir / "_index.db")
        store.initialize()

        # Add multiple files
        for i in range(10):
            indexed_file = IndexedFile(
                path=f"/test/file{i}.py",
                language="python",
                symbols=[],
            )
            store.add_file(indexed_file, f"searchable content number {i}")

        page1 = store.search_fts("searchable", limit=3, offset=0)
        page2 = store.search_fts("searchable", limit=3, offset=3)
        page3 = store.search_fts("searchable", limit=3, offset=6)

        # Each page should have different results
        paths1 = {r.path for r in page1}
        paths2 = {r.path for r in page2}
        paths3 = {r.path for r in page3}

        assert paths1.isdisjoint(paths2)
        assert paths2.isdisjoint(paths3)

        store.close()

    def test_offset_beyond_results(self, populated_store):
        """Test offset beyond available results."""
        results = populated_store.search_fts("authenticate", limit=10, offset=1000)
        assert len(results) == 0


# === Symbol Search Tests ===


class TestSymbolSearch:
    """Tests for symbol search functionality."""

    def test_search_by_name(self, populated_store):
        """Test symbol search by name."""
        results = populated_store.search_symbols("auth")
        assert len(results) >= 1
        assert any("auth" in s.name.lower() for s in results)

    def test_search_by_kind_function(self, populated_store):
        """Test symbol search filtered by kind=function."""
        results = populated_store.search_symbols("", kind="function")
        assert all(s.kind == "function" for s in results)

    def test_search_by_kind_class(self, populated_store):
        """Test symbol search filtered by kind=class."""
        results = populated_store.search_symbols("", kind="class")
        assert all(s.kind == "class" for s in results)
        assert any("Manager" in s.name or "Pool" in s.name for s in results)

    def test_search_symbols_with_limit(self, populated_store):
        """Test symbol search respects limit."""
        results = populated_store.search_symbols("", limit=2)
        assert len(results) <= 2

    def test_search_symbols_returns_range(self, populated_store):
        """Test symbol search results include line range."""
        results = populated_store.search_symbols("authenticate")
        assert len(results) >= 1
        for sym in results:
            assert hasattr(sym, 'range')
            assert len(sym.range) == 2
            assert sym.range[0] <= sym.range[1]


# === Chain Search Tests ===


class TestChainSearchEngine:
    """Tests for ChainSearchEngine."""

    @pytest.fixture
    def mock_registry(self):
        """Create a mock registry."""
        registry = MagicMock(spec=RegistryStore)
        registry.find_nearest_index.return_value = None
        return registry

    @pytest.fixture
    def mock_mapper(self):
        """Create a mock path mapper."""
        return MagicMock(spec=PathMapper)

    def test_search_no_index_found(self, mock_registry, mock_mapper):
        """Test search when no index is found."""
        mock_mapper.source_to_index_db.return_value = Path("/nonexistent/_index.db")

        engine = ChainSearchEngine(mock_registry, mock_mapper)
        result = engine.search("test", Path("/nonexistent"))

        assert result.results == []
        assert result.symbols == []
        assert result.stats.dirs_searched == 0

    def test_search_options_depth(self, mock_registry, mock_mapper, temp_dir):
        """Test search respects depth option."""
        # Create a simple index structure
        db_path = temp_dir / "_index.db"
        store = DirIndexStore(db_path)
        store.initialize()
        store.add_file(
            name="test.py",
            full_path=str(temp_dir / "test.py"),
            content="test content searchable",
            language="python",
        )
        store.close()

        mock_mapper.source_to_index_db.return_value = db_path

        engine = ChainSearchEngine(mock_registry, mock_mapper)
        options = SearchOptions(depth=0)  # Only current dir

        result = engine.search("test", temp_dir, options)

        # With depth=0, should only search current directory
        assert result.stats.dirs_searched <= 1

    def test_search_files_only(self, mock_registry, mock_mapper, temp_dir):
        """Test search_files_only returns only paths."""
        db_path = temp_dir / "_index.db"
        store = DirIndexStore(db_path)
        store.initialize()
        store.add_file(
            name="test.py",
            full_path=str(temp_dir / "test.py"),
            content="searchable content here",
            language="python",
        )
        store.close()

        mock_mapper.source_to_index_db.return_value = db_path

        engine = ChainSearchEngine(mock_registry, mock_mapper)
        paths = engine.search_files_only("searchable", temp_dir)

        assert isinstance(paths, list)
        for p in paths:
            assert isinstance(p, str)

    def test_search_symbols_engine(self, mock_registry, mock_mapper, temp_dir):
        """Test symbol search through engine."""
        db_path = temp_dir / "_index.db"
        store = DirIndexStore(db_path)
        store.initialize()
        store.add_file(
            name="test.py",
            full_path=str(temp_dir / "test.py"),
            content="def my_function(): pass",
            language="python",
            symbols=[Symbol(name="my_function", kind="function", range=(1, 5))],
        )
        store.close()

        mock_mapper.source_to_index_db.return_value = db_path

        engine = ChainSearchEngine(mock_registry, mock_mapper)
        symbols = engine.search_symbols("my_func", temp_dir)

        assert len(symbols) >= 1
        assert symbols[0].name == "my_function"

    def test_search_result_stats(self, mock_registry, mock_mapper, temp_dir):
        """Test search result includes proper stats."""
        db_path = temp_dir / "_index.db"
        store = DirIndexStore(db_path)
        store.initialize()
        store.add_file(
            name="test.py",
            full_path=str(temp_dir / "test.py"),
            content="content to search",
            language="python",
        )
        store.close()

        mock_mapper.source_to_index_db.return_value = db_path

        engine = ChainSearchEngine(mock_registry, mock_mapper)
        result = engine.search("content", temp_dir)

        assert result.stats.time_ms >= 0
        assert result.stats.dirs_searched >= 0
        assert isinstance(result.stats.errors, list)


class TestSearchOptions:
    """Tests for SearchOptions configuration."""

    def test_default_options(self):
        """Test default search options."""
        options = SearchOptions()
        assert options.depth == -1
        assert options.max_workers == 8
        assert options.limit_per_dir == 10
        assert options.total_limit == 100
        assert options.include_symbols is False
        assert options.files_only is False

    def test_custom_options(self):
        """Test custom search options."""
        options = SearchOptions(
            depth=3,
            max_workers=4,
            limit_per_dir=5,
            total_limit=50,
            include_symbols=True,
            files_only=True,
        )
        assert options.depth == 3
        assert options.max_workers == 4
        assert options.limit_per_dir == 5
        assert options.total_limit == 50
        assert options.include_symbols is True
        assert options.files_only is True


# === Edge Cases and Error Handling ===


class TestSearchEdgeCases:
    """Edge case tests for search functionality."""

    def test_empty_query(self, populated_store):
        """Test search with empty query."""
        # Empty query may raise an error or return empty results
        try:
            results = populated_store.search_fts("")
            assert isinstance(results, list)
        except Exception:
            # Some implementations may reject empty queries
            pass

    def test_whitespace_query(self, populated_store):
        """Test search with whitespace-only query."""
        # Whitespace query may raise an error or return empty results
        try:
            results = populated_store.search_fts(" ")
            assert isinstance(results, list)
        except Exception:
            # Some implementations may reject whitespace queries
            pass

    def test_very_long_query(self, populated_store):
        """Test search with very long query."""
        long_query = "function " * 100  # Repeat valid word
        try:
            results = populated_store.search_fts(long_query)
            assert isinstance(results, list)
        except Exception:
            # Very long queries may be rejected
            pass

    def test_special_sql_characters(self, populated_store):
        """Test search handles SQL-like characters safely."""
        # These should not cause SQL injection - may raise FTS syntax errors
        queries = ["test", "function*", "test OR data"]
        for q in queries:
            results = populated_store.search_fts(q)
            assert isinstance(results, list)

    def test_search_reopened_store(self, temp_dir, sample_files):
        """Test search works after store is reopened."""
        db_path = temp_dir / "_index.db"
        store = SQLiteStore(db_path)
        store.initialize()
        store.add_file(sample_files[0][0], sample_files[0][1])
        store.close()

        # Reopen and search
        store2 = SQLiteStore(db_path)
        store2.initialize()
        results = store2.search_fts("authenticate")
        assert len(results) >= 1
        store2.close()

    def test_concurrent_searches(self, populated_store):
        """Test multiple concurrent searches."""
        import threading

        results = []
        errors = []

        def search_task(query):
            try:
                r = populated_store.search_fts(query)
                results.append(len(r))
            except Exception as e:
                errors.append(e)

        threads = [
            threading.Thread(target=search_task, args=("authenticate",)),
            threading.Thread(target=search_task, args=("database",)),
            threading.Thread(target=search_task, args=("password",)),
        ]

        for t in threads:
            t.start()
        for t in threads:
            t.join()

        assert len(errors) == 0
        assert len(results) == 3


class TestChainSearchResult:
    """Tests for ChainSearchResult dataclass."""

    def test_result_structure(self):
        """Test ChainSearchResult has all required fields."""
        result = ChainSearchResult(
            query="test",
            results=[],
            symbols=[],
            stats=SearchStats(),
        )
        assert result.query == "test"
        assert result.results == []
        assert result.symbols == []
        assert result.stats.dirs_searched == 0


class TestSearchStats:
    """Tests for SearchStats dataclass."""

    def test_default_stats(self):
        """Test default search stats."""
        stats = SearchStats()
        assert stats.dirs_searched == 0
        assert stats.files_matched == 0
        assert stats.time_ms == 0
        assert stats.errors == []

    def test_stats_with_errors(self):
        """Test search stats with errors."""
        stats = SearchStats(errors=["Error 1", "Error 2"])
        assert len(stats.errors) == 2
660
codex-lens/tests/test_search_performance.py
Normal file
@@ -0,0 +1,660 @@
"""Performance benchmarks for CodexLens search functionality.

Measures:
- FTS5 search speed at various scales
- Chain search traversal performance
- Semantic search latency
- Memory usage during search operations
"""

import gc
import sys
import tempfile
import time
from pathlib import Path
from typing import List, Tuple
from dataclasses import dataclass
from contextlib import contextmanager

import pytest

from codexlens.storage.sqlite_store import SQLiteStore
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
from codexlens.search import ChainSearchEngine, SearchOptions
from codexlens.entities import IndexedFile, Symbol


@dataclass
class BenchmarkResult:
    """Benchmark result container."""
    name: str
    iterations: int
    total_time_ms: float
    avg_time_ms: float
    min_time_ms: float
    max_time_ms: float
    ops_per_sec: float

    def __str__(self):
        return (
            f"{self.name}:\n"
            f"  Iterations: {self.iterations}\n"
            f"  Total: {self.total_time_ms:.2f}ms\n"
            f"  Avg: {self.avg_time_ms:.2f}ms\n"
            f"  Min: {self.min_time_ms:.2f}ms\n"
            f"  Max: {self.max_time_ms:.2f}ms\n"
            f"  Ops/sec: {self.ops_per_sec:.1f}"
        )


def benchmark(func, iterations=10, warmup=2):
    """Run benchmark with warmup iterations."""
    # Warmup
    for _ in range(warmup):
        func()

    # Measure
    times = []
    for _ in range(iterations):
        gc.collect()
        start = time.perf_counter()
        func()
        elapsed = (time.perf_counter() - start) * 1000
        times.append(elapsed)

    total = sum(times)
    return BenchmarkResult(
        name=func.__name__ if hasattr(func, '__name__') else 'benchmark',
        iterations=iterations,
        total_time_ms=total,
        avg_time_ms=total / iterations,
        min_time_ms=min(times),
        max_time_ms=max(times),
        ops_per_sec=1000 / (total / iterations) if total > 0 else 0
    )


@contextmanager
def timer(name: str):
    """Context manager for timing code blocks."""
    start = time.perf_counter()
    yield
    elapsed = (time.perf_counter() - start) * 1000
    print(f"  {name}: {elapsed:.2f}ms")


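# Illustrative usage of the two helpers above; this function is not part of the
# original benchmark suite (the leading underscore keeps pytest from collecting
# it). The `store` argument is assumed to expose the same search_fts() API used
# by the store fixtures below.
def _example_helper_usage(store):
    # benchmark() wraps a zero-argument callable and returns a BenchmarkResult,
    # whose __str__ reports iteration count, avg/min/max times and ops/sec.
    result = benchmark(lambda: store.search_fts("function", limit=20), iterations=10)
    print(result)

    # timer() is for one-off measurements of a single block.
    with timer("single FTS5 query"):
        store.search_fts("data", limit=20)

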
# === Test Fixtures ===


@pytest.fixture(scope="module")
def temp_dir():
    """Create a temporary directory for all tests."""
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    yield Path(tmpdir.name)
    # Explicit cleanup with error handling for Windows file locking
    try:
        tmpdir.cleanup()
    except (PermissionError, OSError):
        pass  # Ignore Windows file locking errors


def generate_code_file(index: int, lines: int = 100) -> Tuple[IndexedFile, str]:
    """Generate a synthetic code file for testing."""
    symbols = [
        Symbol(name=f"function_{index}_{i}", kind="function", range=(i*10+1, i*10+9))
        for i in range(lines // 10)
    ]

    content_lines = []
    for i in range(lines):
        if i % 10 == 0:
            content_lines.append(f"def function_{index}_{i//10}(param_{i}, data_{i}):")
        else:
            content_lines.append(f"    # Line {i}: processing data with param_{i % 5}")
            content_lines.append(f"    result_{i} = compute(data_{i})")

    return (
        IndexedFile(
            path=f"/project/src/module_{index}/file_{index}.py",
            language="python",
            symbols=symbols,
        ),
        "\n".join(content_lines)
    )


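# Quick shape check for generate_code_file(), illustrative only and not part of
# the original module: a 100-line synthetic file carries ten function_<index>_<i>
# symbols whose line ranges advance in steps of ten.
def _example_generated_file_shape():
    indexed, text = generate_code_file(0, lines=100)
    assert len(indexed.symbols) == 10
    assert indexed.symbols[0].range == (1, 9)
    assert text.splitlines()[0] == "def function_0_0(param_0, data_0):"

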
@pytest.fixture(scope="module")
|
||||
def small_store(temp_dir):
|
||||
"""Small store with 10 files (~100 lines each)."""
|
||||
db_path = temp_dir / "small_index.db"
|
||||
store = SQLiteStore(db_path)
|
||||
store.initialize()
|
||||
|
||||
for i in range(10):
|
||||
indexed_file, content = generate_code_file(i, lines=100)
|
||||
store.add_file(indexed_file, content)
|
||||
|
||||
yield store
|
||||
store.close()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def medium_store(temp_dir):
|
||||
"""Medium store with 100 files (~100 lines each)."""
|
||||
db_path = temp_dir / "medium_index.db"
|
||||
store = SQLiteStore(db_path)
|
||||
store.initialize()
|
||||
|
||||
for i in range(100):
|
||||
indexed_file, content = generate_code_file(i, lines=100)
|
||||
store.add_file(indexed_file, content)
|
||||
|
||||
yield store
|
||||
store.close()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def large_store(temp_dir):
|
||||
"""Large store with 500 files (~200 lines each)."""
|
||||
db_path = temp_dir / "large_index.db"
|
||||
store = SQLiteStore(db_path)
|
||||
store.initialize()
|
||||
|
||||
for i in range(500):
|
||||
indexed_file, content = generate_code_file(i, lines=200)
|
||||
store.add_file(indexed_file, content)
|
||||
|
||||
yield store
|
||||
store.close()
|
||||
|
||||
|
||||
# === FTS5 Performance Tests ===


class TestFTS5Performance:
    """FTS5 search performance benchmarks."""

    def test_small_store_search(self, small_store):
        """Benchmark FTS5 search on small store (10 files)."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - SMALL STORE (10 files)")
        print("="*60)

        queries = ["function", "data", "compute", "result", "param"]

        for query in queries:
            result = benchmark(
                lambda q=query: small_store.search_fts(q, limit=20),
                iterations=50
            )
            result.name = f"search '{query}'"
            print(f"\n{result}")

    def test_medium_store_search(self, medium_store):
        """Benchmark FTS5 search on medium store (100 files)."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - MEDIUM STORE (100 files)")
        print("="*60)

        queries = ["function", "data", "compute", "result", "param"]

        for query in queries:
            result = benchmark(
                lambda q=query: medium_store.search_fts(q, limit=20),
                iterations=30
            )
            result.name = f"search '{query}'"
            print(f"\n{result}")

    def test_large_store_search(self, large_store):
        """Benchmark FTS5 search on large store (500 files)."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - LARGE STORE (500 files)")
        print("="*60)

        queries = ["function", "data", "compute", "result", "param"]

        for query in queries:
            result = benchmark(
                lambda q=query: large_store.search_fts(q, limit=20),
                iterations=20
            )
            result.name = f"search '{query}'"
            print(f"\n{result}")

    def test_search_limit_scaling(self, medium_store):
        """Test how search time scales with result limit."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - LIMIT SCALING")
        print("="*60)

        limits = [5, 10, 20, 50, 100, 200]

        for limit in limits:
            result = benchmark(
                lambda l=limit: medium_store.search_fts("function", limit=l),
                iterations=20
            )
            result.name = f"limit={limit}"
            print(f"\n{result}")

    def test_complex_query_performance(self, medium_store):
        """Test performance of complex FTS5 queries."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - COMPLEX QUERIES")
        print("="*60)

        queries = [
            ("single term", "function"),
            ("two terms", "function data"),
            ("phrase", '"def function"'),
            ("OR query", "function OR result"),
            ("wildcard", "func*"),
            ("NOT query", "function NOT data"),
        ]

        for name, query in queries:
            result = benchmark(
                lambda q=query: medium_store.search_fts(q, limit=20),
                iterations=20
            )
            result.name = name
            print(f"\n{result}")


class TestSymbolSearchPerformance:
    """Symbol search performance benchmarks."""

    def test_symbol_search_scaling(self, small_store, medium_store, large_store):
        """Test symbol search performance at different scales."""
        print("\n" + "="*60)
        print("SYMBOL SEARCH - SCALING")
        print("="*60)

        stores = [
            ("small (10 files)", small_store),
            ("medium (100 files)", medium_store),
            ("large (500 files)", large_store),
        ]

        for name, store in stores:
            result = benchmark(
                lambda s=store: s.search_symbols("function", limit=50),
                iterations=20
            )
            result.name = name
            print(f"\n{result}")

    def test_symbol_search_with_kind_filter(self, medium_store):
        """Test symbol search with kind filtering."""
        print("\n" + "="*60)
        print("SYMBOL SEARCH - KIND FILTER")
        print("="*60)

        # Without filter
        result_no_filter = benchmark(
            lambda: medium_store.search_symbols("function", limit=50),
            iterations=20
        )
        result_no_filter.name = "no filter"
        print(f"\n{result_no_filter}")

        # With filter
        result_with_filter = benchmark(
            lambda: medium_store.search_symbols("function", kind="function", limit=50),
            iterations=20
        )
        result_with_filter.name = "kind=function"
        print(f"\n{result_with_filter}")


# === Chain Search Performance Tests ===


class TestChainSearchPerformance:
    """Chain search engine performance benchmarks."""

    @pytest.fixture
    def chain_engine_setup(self, temp_dir):
        """Setup chain search engine with directory hierarchy."""
        # Create directory hierarchy
        root = temp_dir / "project"
        root.mkdir(exist_ok=True)

        registry = RegistryStore(temp_dir / "registry.db")
        registry.initialize()
        mapper = PathMapper(temp_dir / "indexes")

        # Create indexes at different depths
        dirs = [
            root,
            root / "src",
            root / "src" / "core",
            root / "src" / "utils",
            root / "tests",
        ]

        for i, dir_path in enumerate(dirs):
            dir_path.mkdir(exist_ok=True)
            index_path = mapper.source_to_index_db(dir_path)
            index_path.parent.mkdir(parents=True, exist_ok=True)

            store = DirIndexStore(index_path)
            store.initialize()
            for j in range(20):  # 20 files per directory
                indexed_file, content = generate_code_file(i * 100 + j, lines=50)
                file_path = str(dir_path / f"file_{j}.py")
                store.add_file(
                    name=f"file_{j}.py",
                    full_path=file_path,
                    content=content,
                    language="python",
                    symbols=indexed_file.symbols,
                )
            store.close()

            # Register directory
            project = registry.register_project(root, mapper.source_to_index_dir(root))
            registry.register_dir(project.id, dir_path, index_path, i, 20)

        engine = ChainSearchEngine(registry, mapper)

        yield {
            "engine": engine,
            "registry": registry,
            "root": root,
        }

        registry.close()

    def test_chain_search_depth(self, chain_engine_setup):
        """Test chain search at different depths."""
        print("\n" + "="*60)
        print("CHAIN SEARCH - DEPTH VARIATION")
        print("="*60)

        engine = chain_engine_setup["engine"]
        root = chain_engine_setup["root"]

        depths = [0, 1, 2, -1]  # -1 = unlimited

        for depth in depths:
            options = SearchOptions(depth=depth, max_workers=4, total_limit=50)
            result = benchmark(
                lambda d=depth, o=options: engine.search("function", root, o),
                iterations=10
            )
            result.name = f"depth={depth}"
            print(f"\n{result}")

    def test_chain_search_parallelism(self, chain_engine_setup):
        """Test chain search with different worker counts."""
        print("\n" + "="*60)
        print("CHAIN SEARCH - PARALLELISM")
        print("="*60)

        engine = chain_engine_setup["engine"]
        root = chain_engine_setup["root"]

        worker_counts = [1, 2, 4, 8]

        for workers in worker_counts:
            options = SearchOptions(depth=-1, max_workers=workers, total_limit=50)
            result = benchmark(
                lambda w=workers, o=options: engine.search("function", root, o),
                iterations=10
            )
            result.name = f"workers={workers}"
            print(f"\n{result}")


# === Semantic Search Performance Tests ===


class TestSemanticSearchPerformance:
    """Semantic search performance benchmarks."""

    @pytest.fixture
    def semantic_setup(self, temp_dir):
        """Setup semantic search with embeddings."""
        try:
            from codexlens.semantic import SEMANTIC_AVAILABLE
            if not SEMANTIC_AVAILABLE:
                pytest.skip("Semantic search dependencies not installed")

            from codexlens.semantic.embedder import Embedder
            from codexlens.semantic.vector_store import VectorStore
            from codexlens.entities import SemanticChunk

            embedder = Embedder()
            db_path = temp_dir / "semantic.db"
            vector_store = VectorStore(db_path)

            # Add test chunks
            code_samples = [
                "def authenticate_user(username, password): verify user credentials",
                "class DatabaseConnection: manage database connections with pooling",
                "async def fetch_api_data(url): make HTTP request and return JSON",
                "function renderComponent(props): render React UI component",
                "def process_data(input): transform and validate input data",
            ] * 50  # 250 chunks

            for i, content in enumerate(code_samples):
                chunk = SemanticChunk(
                    content=content,
                    metadata={"index": i, "language": "python"}
                )
                chunk.embedding = embedder.embed_single(content)
                vector_store.add_chunk(chunk, f"/test/file_{i}.py")

            yield {
                "embedder": embedder,
                "vector_store": vector_store,
            }

            # Clean up vector store cache
            vector_store.clear_cache()

        except ImportError:
            pytest.skip("Semantic search dependencies not installed")

    def test_embedding_generation_speed(self, semantic_setup):
        """Benchmark embedding generation speed."""
        print("\n" + "="*60)
        print("SEMANTIC SEARCH - EMBEDDING GENERATION")
        print("="*60)

        embedder = semantic_setup["embedder"]

        # Single embedding
        result = benchmark(
            lambda: embedder.embed_single("def example_function(): return 42"),
            iterations=50
        )
        result.name = "single embedding"
        print(f"\n{result}")

        # Batch embedding
        texts = ["def func{}(): return {}".format(i, i) for i in range(10)]
        result = benchmark(
            lambda: embedder.embed(texts),
            iterations=20
        )
        result.name = "batch embedding (10 texts)"
        print(f"\n{result}")

    def test_vector_search_speed(self, semantic_setup):
        """Benchmark vector similarity search speed."""
        print("\n" + "="*60)
        print("SEMANTIC SEARCH - VECTOR SEARCH")
        print("="*60)

        embedder = semantic_setup["embedder"]
        vector_store = semantic_setup["vector_store"]

        query_embedding = embedder.embed_single("user authentication login")

        # Different top_k values
        for top_k in [5, 10, 20, 50]:
            result = benchmark(
                lambda k=top_k: vector_store.search_similar(query_embedding, top_k=k),
                iterations=30
            )
            result.name = f"top_k={top_k}"
            print(f"\n{result}")

    def test_full_semantic_search_latency(self, semantic_setup):
        """Benchmark full semantic search (embed + search)."""
        print("\n" + "="*60)
        print("SEMANTIC SEARCH - FULL LATENCY")
        print("="*60)

        embedder = semantic_setup["embedder"]
        vector_store = semantic_setup["vector_store"]

        queries = [
            "user authentication",
            "database connection",
            "API request handler",
            "React component",
            "data processing",
        ]

        for query in queries:
            def full_search(q=query):
                embedding = embedder.embed_single(q)
                return vector_store.search_similar(embedding, top_k=10)

            result = benchmark(full_search, iterations=20)
            result.name = f"'{query}'"
            print(f"\n{result}")


# === Comparative Benchmarks ===


class TestComparativeBenchmarks:
    """Compare FTS5 vs Semantic search performance."""

    @pytest.fixture
    def comparison_setup(self, temp_dir):
        """Setup both FTS5 and semantic stores with same content."""
        # FTS5 store
        fts_store = SQLiteStore(temp_dir / "fts_compare.db")
        fts_store.initialize()

        code_samples = [
            ("auth.py", "def authenticate_user(username, password): verify credentials"),
            ("db.py", "class DatabasePool: manage database connection pooling"),
            ("api.py", "async def handle_request(req): process API request"),
            ("ui.py", "function Button({ onClick }): render button component"),
            ("utils.py", "def process_data(input): transform and validate data"),
        ] * 20

        for i, (filename, content) in enumerate(code_samples):
            indexed_file = IndexedFile(
                path=f"/project/{filename.replace('.py', '')}_{i}.py",
                language="python",
                symbols=[Symbol(name=f"func_{i}", kind="function", range=(1, 5))],
            )
            fts_store.add_file(indexed_file, content)

        # Semantic store (if available)
        try:
            from codexlens.semantic import SEMANTIC_AVAILABLE
            if SEMANTIC_AVAILABLE:
                from codexlens.semantic.embedder import Embedder
                from codexlens.semantic.vector_store import VectorStore
                from codexlens.entities import SemanticChunk

                embedder = Embedder()
                semantic_store = VectorStore(temp_dir / "semantic_compare.db")

                for i, (filename, content) in enumerate(code_samples):
                    chunk = SemanticChunk(content=content, metadata={"index": i})
                    chunk.embedding = embedder.embed_single(content)
                    semantic_store.add_chunk(chunk, f"/project/{filename}")

                yield {
                    "fts_store": fts_store,
                    "semantic_store": semantic_store,
                    "embedder": embedder,
                    "has_semantic": True,
                }
                # Close semantic store connection
                semantic_store.clear_cache()
            else:
                yield {"fts_store": fts_store, "has_semantic": False}
        except ImportError:
            yield {"fts_store": fts_store, "has_semantic": False}

        fts_store.close()

    def test_fts_vs_semantic_latency(self, comparison_setup):
        """Compare FTS5 vs Semantic search latency."""
        print("\n" + "="*60)
        print("FTS5 vs SEMANTIC - LATENCY COMPARISON")
        print("="*60)

        fts_store = comparison_setup["fts_store"]

        queries = [
            "authenticate",
            "database",
            "request",
            "button",
            "process",
        ]

        print("\nFTS5 Search:")
        for query in queries:
            result = benchmark(
                lambda q=query: fts_store.search_fts(q, limit=10),
                iterations=30
            )
            result.name = f"'{query}'"
            print(f"  {result.name}: avg={result.avg_time_ms:.2f}ms")

        if comparison_setup.get("has_semantic"):
            semantic_store = comparison_setup["semantic_store"]
            embedder = comparison_setup["embedder"]

            print("\nSemantic Search (embed + search):")
            for query in queries:
                def semantic_search(q=query):
                    emb = embedder.embed_single(q)
                    return semantic_store.search_similar(emb, top_k=10)

                result = benchmark(semantic_search, iterations=20)
                result.name = f"'{query}'"
                print(f"  {result.name}: avg={result.avg_time_ms:.2f}ms")
        else:
            print("\n(Semantic search not available)")


# === Memory Usage Tests ===


class TestMemoryUsage:
    """Memory usage during search operations."""

    def test_search_memory_footprint(self, medium_store):
        """Measure memory footprint during search."""
        print("\n" + "="*60)
        print("MEMORY USAGE - SEARCH OPERATIONS")
        print("="*60)

        import tracemalloc

        tracemalloc.start()

        # Run multiple searches
        for _ in range(100):
            medium_store.search_fts("function", limit=20)

        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()

        print("\nAfter 100 FTS5 searches:")
        print(f"  Current memory: {current / 1024 / 1024:.2f} MB")
        print(f"  Peak memory: {peak / 1024 / 1024:.2f} MB")


if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v", "-s", "--tb=short"])
|
||||
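# Typical invocations (assuming the repository layout above, where this file
# lives at codex-lens/tests/test_search_performance.py):
#
#   pytest tests/test_search_performance.py -v -s
#   pytest tests/test_search_performance.py -k "FTS5Performance" -s
#
# -s keeps the printed BenchmarkResult tables visible; -k selects a single
# benchmark class by name substring.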