feat(codex-lens): add unified reranker architecture and file watcher

Unified Reranker Architecture:
- Add BaseReranker ABC with factory pattern
- Implement 4 backends: ONNX (default), API, LiteLLM, Legacy
- Add .env configuration parsing for API credentials
- Migrate from sentence-transformers to optimum+onnxruntime
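
The reranker sources themselves are not among the files shown in this diff, so as a rough illustration only, here is a minimal sketch of the ABC-plus-factory shape described above (every name in it is an assumption, not the actual API):

from abc import ABC, abstractmethod
from typing import Dict, List, Tuple, Type

class BaseReranker(ABC):
    """Hypothetical base class that all reranker backends would implement."""

    @abstractmethod
    def rerank(self, query: str, documents: List[str]) -> List[Tuple[int, float]]:
        """Return (document index, score) pairs sorted by relevance."""

class OnnxReranker(BaseReranker):
    """Stub standing in for the default optimum+onnxruntime backend."""

    def rerank(self, query: str, documents: List[str]) -> List[Tuple[int, float]]:
        # A real backend would score each query/document pair with a model.
        return [(i, 0.0) for i in range(len(documents))]

_BACKENDS: Dict[str, Type[BaseReranker]] = {"onnx": OnnxReranker}

def create_reranker(backend: str = "onnx") -> BaseReranker:
    """Factory resolving a backend name to an implementation."""
    try:
        return _BACKENDS[backend]()
    except KeyError:
        raise ValueError(f"Unknown reranker backend: {backend}") from None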

File Watcher Module:
- Add real-time file system monitoring with watchdog
- Implement IncrementalIndexer for single-file updates
- Add WatcherManager with signal handling and graceful shutdown
- Add 'codexlens watch' CLI command
- Event filtering, debouncing, and deduplication
- Thread-safe design with proper resource cleanup

Tests: 16 watcher tests + 5 reranker test files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: catlog22
Date: 2026-01-01 13:23:52 +08:00
Parent: 8ac27548ad
Commit: 520f2d26f2
27 changed files with 3571 additions and 14 deletions

codexlens/watcher/__init__.py

@@ -0,0 +1,17 @@
"""File watcher module for real-time index updates."""
from .events import ChangeType, FileEvent, IndexResult, WatcherConfig, WatcherStats
from .file_watcher import FileWatcher
from .incremental_indexer import IncrementalIndexer
from .manager import WatcherManager
__all__ = [
"ChangeType",
"FileEvent",
"IndexResult",
"WatcherConfig",
"WatcherStats",
"FileWatcher",
"IncrementalIndexer",
"WatcherManager",
]
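
Assuming the package path codexlens/watcher (consistent with the codexlens.* and relative imports elsewhere in this commit), consumers can import everything from the package root:

from codexlens.watcher import FileWatcher, IncrementalIndexer, WatcherConfig, WatcherManager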

codexlens/watcher/events.py

@@ -0,0 +1,54 @@
"""Event types for file watcher."""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import List, Optional, Set
class ChangeType(Enum):
"""Type of file system change."""
CREATED = "created"
MODIFIED = "modified"
DELETED = "deleted"
MOVED = "moved"
@dataclass
class FileEvent:
"""A file system change event."""
path: Path
change_type: ChangeType
timestamp: float
old_path: Optional[Path] = None # For MOVED events
@dataclass
class WatcherConfig:
"""Configuration for file watcher."""
debounce_ms: int = 1000
ignored_patterns: Set[str] = field(default_factory=lambda: {
".git", ".venv", "venv", "node_modules",
"__pycache__", ".codexlens", ".idea", ".vscode",
})
languages: Optional[List[str]] = None # None = all supported
@dataclass
class IndexResult:
"""Result of processing file changes."""
files_indexed: int = 0
files_removed: int = 0
symbols_added: int = 0
errors: List[str] = field(default_factory=list)
@dataclass
class WatcherStats:
"""Runtime statistics for watcher."""
files_watched: int = 0
events_processed: int = 0
last_event_time: Optional[float] = None
is_running: bool = False
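
A short, self-contained example of constructing these types, taken directly from the dataclass definitions above (the import path is the package root shown in __init__.py):

import time
from pathlib import Path

from codexlens.watcher import ChangeType, FileEvent, WatcherConfig

config = WatcherConfig(debounce_ms=500, languages=["python"])
config.ignored_patterns.add("build")  # extend the default ignore set

event = FileEvent(
    path=Path("src/app.py"),
    change_type=ChangeType.MODIFIED,
    timestamp=time.time(),
)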

codexlens/watcher/file_watcher.py

@@ -0,0 +1,245 @@
"""File system watcher using watchdog library."""
from __future__ import annotations
import logging
import threading
import time
from pathlib import Path
from typing import Callable, Dict, List, Optional
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from .events import ChangeType, FileEvent, WatcherConfig
from ..config import Config
logger = logging.getLogger(__name__)
class _CodexLensHandler(FileSystemEventHandler):
"""Internal handler for watchdog events."""
def __init__(
self,
watcher: "FileWatcher",
on_event: Callable[[FileEvent], None],
) -> None:
super().__init__()
self._watcher = watcher
self._on_event = on_event
def on_created(self, event) -> None:
if event.is_directory:
return
self._emit(event.src_path, ChangeType.CREATED)
def on_modified(self, event) -> None:
if event.is_directory:
return
self._emit(event.src_path, ChangeType.MODIFIED)
def on_deleted(self, event) -> None:
if event.is_directory:
return
self._emit(event.src_path, ChangeType.DELETED)
def on_moved(self, event) -> None:
if event.is_directory:
return
self._emit(event.dest_path, ChangeType.MOVED, old_path=event.src_path)
def _emit(
self,
path: str,
change_type: ChangeType,
old_path: Optional[str] = None,
) -> None:
path_obj = Path(path)
# Filter out files that should not be indexed
if not self._watcher._should_index_file(path_obj):
return
event = FileEvent(
path=path_obj,
change_type=change_type,
timestamp=time.time(),
old_path=Path(old_path) if old_path else None,
)
self._on_event(event)
class FileWatcher:
"""File system watcher for monitoring directory changes.
    Uses the watchdog library for cross-platform file system monitoring.
    Debounced, deduplicated batches of events are forwarded to the
    on_changes callback.
Example:
def handle_changes(events: List[FileEvent]) -> None:
for event in events:
print(f"{event.change_type}: {event.path}")
watcher = FileWatcher(Path("."), WatcherConfig(), handle_changes)
watcher.start()
watcher.wait() # Block until stopped
"""
def __init__(
self,
root_path: Path,
config: WatcherConfig,
on_changes: Callable[[List[FileEvent]], None],
) -> None:
"""Initialize file watcher.
Args:
root_path: Directory to watch recursively
config: Watcher configuration
on_changes: Callback invoked with batched events
"""
self.root_path = Path(root_path).resolve()
self.config = config
self.on_changes = on_changes
self._observer: Optional[Observer] = None
self._running = False
self._stop_event = threading.Event()
self._lock = threading.RLock()
# Event queue for batching
self._event_queue: List[FileEvent] = []
self._queue_lock = threading.Lock()
# Debounce thread
self._debounce_thread: Optional[threading.Thread] = None
# Config instance for language checking
self._codexlens_config = Config()
def _should_index_file(self, path: Path) -> bool:
"""Check if file should be indexed based on extension and ignore patterns.
Args:
path: File path to check
Returns:
True if file should be indexed, False otherwise
"""
# Check against ignore patterns
parts = path.parts
for pattern in self.config.ignored_patterns:
if pattern in parts:
return False
# Check extension against supported languages
language = self._codexlens_config.language_for_path(path)
return language is not None
def _on_raw_event(self, event: FileEvent) -> None:
"""Handle raw event from watchdog handler."""
with self._queue_lock:
self._event_queue.append(event)
# Debouncing is handled by background thread
def _debounce_loop(self) -> None:
"""Background thread for debounced event batching."""
while self._running:
time.sleep(self.config.debounce_ms / 1000.0)
self._flush_events()
def _flush_events(self) -> None:
"""Flush queued events with deduplication."""
with self._queue_lock:
if not self._event_queue:
return
# Deduplicate: keep latest event per path
deduped: Dict[Path, FileEvent] = {}
for event in self._event_queue:
deduped[event.path] = event
events = list(deduped.values())
self._event_queue.clear()
if events:
try:
self.on_changes(events)
except Exception as exc:
logger.error("Error in on_changes callback: %s", exc)
def start(self) -> None:
"""Start watching the directory.
Non-blocking. Use wait() to block until stopped.
"""
with self._lock:
if self._running:
logger.warning("Watcher already running")
return
if not self.root_path.exists():
raise ValueError(f"Root path does not exist: {self.root_path}")
self._observer = Observer()
handler = _CodexLensHandler(self, self._on_raw_event)
self._observer.schedule(handler, str(self.root_path), recursive=True)
self._running = True
self._stop_event.clear()
self._observer.start()
# Start debounce thread
self._debounce_thread = threading.Thread(
target=self._debounce_loop,
daemon=True,
name="FileWatcher-Debounce",
)
self._debounce_thread.start()
logger.info("Started watching: %s", self.root_path)
def stop(self) -> None:
"""Stop watching the directory.
Gracefully stops the observer and flushes remaining events.
"""
with self._lock:
if not self._running:
return
self._running = False
self._stop_event.set()
if self._observer:
self._observer.stop()
self._observer.join(timeout=5.0)
self._observer = None
# Wait for debounce thread to finish
if self._debounce_thread and self._debounce_thread.is_alive():
self._debounce_thread.join(timeout=2.0)
self._debounce_thread = None
# Flush any remaining events
self._flush_events()
logger.info("Stopped watching: %s", self.root_path)
def wait(self) -> None:
"""Block until watcher is stopped.
Use Ctrl+C or call stop() from another thread to unblock.
"""
try:
while self._running:
self._stop_event.wait(timeout=1.0)
except KeyboardInterrupt:
logger.info("Received interrupt, stopping watcher...")
self.stop()
@property
def is_running(self) -> bool:
"""Check if watcher is currently running."""
return self._running
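
The debounce thread flushes the queue every debounce_ms, and _flush_events keeps only the most recent event per path. A self-contained illustration of that dedup rule:

import time
from pathlib import Path

from codexlens.watcher import ChangeType, FileEvent

burst = [
    FileEvent(Path("foo.py"), ChangeType.CREATED, time.time()),
    FileEvent(Path("foo.py"), ChangeType.MODIFIED, time.time()),
    FileEvent(Path("bar.py"), ChangeType.MODIFIED, time.time()),
]
deduped = {}
for event in burst:
    deduped[event.path] = event  # later events for a path overwrite earlier ones

assert deduped[Path("foo.py")].change_type is ChangeType.MODIFIED
assert len(deduped) == 2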

codexlens/watcher/incremental_indexer.py

@@ -0,0 +1,359 @@
"""Incremental indexer for processing file changes."""
from __future__ import annotations
import logging
import sqlite3
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
from codexlens.config import Config
from codexlens.parsers.factory import ParserFactory
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.global_index import GlobalSymbolIndex
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
from .events import ChangeType, FileEvent, IndexResult
logger = logging.getLogger(__name__)
@dataclass
class FileIndexResult:
"""Result of indexing a single file."""
path: Path
symbols_count: int
success: bool
error: Optional[str] = None
class IncrementalIndexer:
"""Incremental indexer for processing file change events.
Processes file events (create, modify, delete, move) and updates
the corresponding index databases incrementally.
Reuses existing infrastructure:
- ParserFactory for symbol extraction
- DirIndexStore for per-directory storage
- GlobalSymbolIndex for cross-file symbols
- PathMapper for source-to-index path conversion
Example:
indexer = IncrementalIndexer(registry, mapper, config)
result = indexer.process_changes([
FileEvent(Path("foo.py"), ChangeType.MODIFIED, time.time()),
])
print(f"Indexed {result.files_indexed} files")
"""
def __init__(
self,
registry: RegistryStore,
mapper: PathMapper,
config: Optional[Config] = None,
) -> None:
"""Initialize incremental indexer.
Args:
registry: Global project registry
mapper: Path mapper for source-to-index conversion
config: CodexLens configuration (uses defaults if None)
"""
self.registry = registry
self.mapper = mapper
self.config = config or Config()
self.parser_factory = ParserFactory(self.config)
self._global_index: Optional[GlobalSymbolIndex] = None
self._dir_stores: dict[Path, DirIndexStore] = {}
        self._lock = threading.RLock()
def _get_global_index(self, index_root: Path) -> Optional[GlobalSymbolIndex]:
"""Get or create global symbol index."""
if not self.config.global_symbol_index_enabled:
return None
if self._global_index is None:
global_db_path = index_root / GlobalSymbolIndex.DEFAULT_DB_NAME
if global_db_path.exists():
self._global_index = GlobalSymbolIndex(global_db_path)
return self._global_index
def _get_dir_store(self, dir_path: Path) -> Optional[DirIndexStore]:
"""Get DirIndexStore for a directory, if indexed."""
with self._lock:
if dir_path in self._dir_stores:
return self._dir_stores[dir_path]
index_db = self.mapper.source_to_index_db(dir_path)
if not index_db.exists():
logger.debug("No index found for directory: %s", dir_path)
return None
# Get index root for global index
index_root = self.mapper.source_to_index_dir(
self.mapper.get_project_root(dir_path) or dir_path
)
global_index = self._get_global_index(index_root)
store = DirIndexStore(
index_db,
config=self.config,
global_index=global_index,
)
self._dir_stores[dir_path] = store
return store
def process_changes(self, events: List[FileEvent]) -> IndexResult:
"""Process a batch of file change events.
Args:
events: List of file events to process
Returns:
IndexResult with statistics
"""
result = IndexResult()
for event in events:
try:
if event.change_type == ChangeType.CREATED:
file_result = self._index_file(event.path)
if file_result.success:
result.files_indexed += 1
result.symbols_added += file_result.symbols_count
else:
result.errors.append(file_result.error or f"Failed to index: {event.path}")
elif event.change_type == ChangeType.MODIFIED:
file_result = self._index_file(event.path)
if file_result.success:
result.files_indexed += 1
result.symbols_added += file_result.symbols_count
else:
result.errors.append(file_result.error or f"Failed to index: {event.path}")
elif event.change_type == ChangeType.DELETED:
self._remove_file(event.path)
result.files_removed += 1
elif event.change_type == ChangeType.MOVED:
# Remove from old location, add at new location
if event.old_path:
self._remove_file(event.old_path)
result.files_removed += 1
file_result = self._index_file(event.path)
if file_result.success:
result.files_indexed += 1
result.symbols_added += file_result.symbols_count
else:
result.errors.append(file_result.error or f"Failed to index: {event.path}")
except Exception as exc:
error_msg = f"Error processing {event.path}: {type(exc).__name__}: {exc}"
logger.error(error_msg)
result.errors.append(error_msg)
return result
def _index_file(self, path: Path) -> FileIndexResult:
"""Index a single file.
Args:
path: Path to the file to index
Returns:
FileIndexResult with status
"""
path = Path(path).resolve()
# Check if file exists
if not path.exists():
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=f"File not found: {path}",
)
# Check if language is supported
language = self.config.language_for_path(path)
if not language:
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=f"Unsupported language for: {path}",
)
# Get directory store
dir_path = path.parent
store = self._get_dir_store(dir_path)
if store is None:
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=f"Directory not indexed: {dir_path}",
)
# Read file content with fallback encodings
try:
content = path.read_text(encoding="utf-8")
except UnicodeDecodeError:
logger.debug("UTF-8 decode failed for %s, using fallback with errors='ignore'", path)
try:
content = path.read_text(encoding="utf-8", errors="ignore")
except Exception as exc:
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=f"Failed to read file: {exc}",
)
except Exception as exc:
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=f"Failed to read file: {exc}",
)
# Parse symbols
try:
parser = self.parser_factory.get_parser(language)
indexed_file = parser.parse(content, path)
except Exception as exc:
error_msg = f"Failed to parse {path}: {type(exc).__name__}: {exc}"
logger.error(error_msg)
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=error_msg,
)
# Update store with retry logic for transient database errors
max_retries = 3
for attempt in range(max_retries):
try:
store.add_file(
name=path.name,
full_path=str(path),
content=content,
language=language,
symbols=indexed_file.symbols,
relationships=indexed_file.relationships,
)
# Update merkle root
store.update_merkle_root()
logger.debug("Indexed file: %s (%d symbols)", path, len(indexed_file.symbols))
return FileIndexResult(
path=path,
symbols_count=len(indexed_file.symbols),
success=True,
)
            except sqlite3.OperationalError as exc:
                # Transient database errors (e.g., database locked)
                if attempt < max_retries - 1:
                    wait_time = 0.1 * (2 ** attempt)  # Exponential backoff
                    logger.debug(
                        "Database operation failed (attempt %d/%d), retrying in %.2fs: %s",
                        attempt + 1, max_retries, wait_time, exc,
                    )
                    time.sleep(wait_time)
                    continue
else:
error_msg = f"Failed to store {path} after {max_retries} attempts: {exc}"
logger.error(error_msg)
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=error_msg,
)
except Exception as exc:
error_msg = f"Failed to store {path}: {type(exc).__name__}: {exc}"
logger.error(error_msg)
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error=error_msg,
)
# Should never reach here
return FileIndexResult(
path=path,
symbols_count=0,
success=False,
error="Unexpected error in indexing loop",
)
def _remove_file(self, path: Path) -> bool:
"""Remove a file from the index.
Args:
path: Path to the file to remove
Returns:
True if removed successfully
"""
path = Path(path).resolve()
dir_path = path.parent
store = self._get_dir_store(dir_path)
if store is None:
logger.debug("Cannot remove file, directory not indexed: %s", dir_path)
return False
# Retry logic for transient database errors
max_retries = 3
for attempt in range(max_retries):
try:
store.remove_file(str(path))
store.update_merkle_root()
logger.debug("Removed file from index: %s", path)
return True
            except sqlite3.OperationalError as exc:
                # Transient database errors (e.g., database locked)
                if attempt < max_retries - 1:
                    wait_time = 0.1 * (2 ** attempt)  # Exponential backoff
                    logger.debug(
                        "Database operation failed (attempt %d/%d), retrying in %.2fs: %s",
                        attempt + 1, max_retries, wait_time, exc,
                    )
                    time.sleep(wait_time)
                    continue
else:
logger.error("Failed to remove %s after %d attempts: %s", path, max_retries, exc)
return False
except Exception as exc:
logger.error("Failed to remove %s: %s", path, exc)
return False
# Should never reach here
return False
def close(self) -> None:
"""Close all open stores."""
with self._lock:
for store in self._dir_stores.values():
try:
store.close()
except Exception:
pass
self._dir_stores.clear()
if self._global_index:
try:
self._global_index.close()
except Exception:
pass
self._global_index = None
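
Feeding a batch of events straight to the indexer, wired up the same way WatcherManager.start() does below (the file's directory must already have an index):

import time
from pathlib import Path

from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
from codexlens.watcher import ChangeType, FileEvent, IncrementalIndexer

registry = RegistryStore()
registry.initialize()
indexer = IncrementalIndexer(registry, PathMapper())

result = indexer.process_changes([
    FileEvent(Path("src/foo.py").resolve(), ChangeType.MODIFIED, time.time()),
])
print(f"indexed={result.files_indexed} removed={result.files_removed} "
      f"errors={len(result.errors)}")

indexer.close()
registry.close()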

codexlens/watcher/manager.py

@@ -0,0 +1,194 @@
"""Watcher manager for coordinating file watching and incremental indexing."""
from __future__ import annotations
import logging
import signal
import threading
import time
from pathlib import Path
from typing import Callable, List, Optional
from codexlens.config import Config
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
from .events import FileEvent, IndexResult, WatcherConfig, WatcherStats
from .file_watcher import FileWatcher
from .incremental_indexer import IncrementalIndexer
logger = logging.getLogger(__name__)
class WatcherManager:
"""High-level manager for file watching and incremental indexing.
Coordinates FileWatcher and IncrementalIndexer with:
- Lifecycle management (start/stop)
- Signal handling (SIGINT/SIGTERM)
- Statistics tracking
- Graceful shutdown
"""
def __init__(
self,
root_path: Path,
config: Optional[Config] = None,
watcher_config: Optional[WatcherConfig] = None,
on_indexed: Optional[Callable[[IndexResult], None]] = None,
) -> None:
self.root_path = Path(root_path).resolve()
self.config = config or Config()
self.watcher_config = watcher_config or WatcherConfig()
self.on_indexed = on_indexed
self._registry: Optional[RegistryStore] = None
self._mapper: Optional[PathMapper] = None
self._watcher: Optional[FileWatcher] = None
self._indexer: Optional[IncrementalIndexer] = None
self._running = False
self._stop_event = threading.Event()
self._lock = threading.RLock()
# Statistics
self._stats = WatcherStats()
self._original_sigint = None
self._original_sigterm = None
def _handle_changes(self, events: List[FileEvent]) -> None:
"""Handle file change events from watcher."""
if not self._indexer or not events:
return
logger.info("Processing %d file changes", len(events))
result = self._indexer.process_changes(events)
# Update stats
self._stats.events_processed += len(events)
self._stats.last_event_time = time.time()
if result.files_indexed > 0 or result.files_removed > 0:
logger.info(
"Indexed %d files, removed %d files, %d errors",
result.files_indexed, result.files_removed, len(result.errors)
)
if self.on_indexed:
try:
self.on_indexed(result)
except Exception as exc:
logger.error("Error in on_indexed callback: %s", exc)
def _signal_handler(self, signum, frame) -> None:
"""Handle shutdown signals."""
logger.info("Received signal %d, stopping...", signum)
self.stop()
def _install_signal_handlers(self) -> None:
"""Install signal handlers for graceful shutdown."""
try:
self._original_sigint = signal.signal(signal.SIGINT, self._signal_handler)
if hasattr(signal, 'SIGTERM'):
self._original_sigterm = signal.signal(signal.SIGTERM, self._signal_handler)
except (ValueError, OSError):
# Signal handling not available (e.g., not main thread)
pass
def _restore_signal_handlers(self) -> None:
"""Restore original signal handlers."""
try:
if self._original_sigint is not None:
signal.signal(signal.SIGINT, self._original_sigint)
if self._original_sigterm is not None and hasattr(signal, 'SIGTERM'):
signal.signal(signal.SIGTERM, self._original_sigterm)
except (ValueError, OSError):
pass
def start(self) -> None:
"""Start watching and indexing."""
with self._lock:
if self._running:
logger.warning("WatcherManager already running")
return
# Validate path
if not self.root_path.exists():
raise ValueError(f"Root path does not exist: {self.root_path}")
# Initialize components
self._registry = RegistryStore()
self._registry.initialize()
self._mapper = PathMapper()
self._indexer = IncrementalIndexer(
self._registry, self._mapper, self.config
)
self._watcher = FileWatcher(
self.root_path, self.watcher_config, self._handle_changes
)
# Install signal handlers
self._install_signal_handlers()
# Start watcher
self._running = True
self._stats.is_running = True
self._stop_event.clear()
self._watcher.start()
logger.info("WatcherManager started for: %s", self.root_path)
def stop(self) -> None:
"""Stop watching and clean up."""
with self._lock:
if not self._running:
return
self._running = False
self._stats.is_running = False
self._stop_event.set()
# Stop watcher
if self._watcher:
self._watcher.stop()
self._watcher = None
# Close indexer
if self._indexer:
self._indexer.close()
self._indexer = None
# Close registry
if self._registry:
self._registry.close()
self._registry = None
# Restore signal handlers
self._restore_signal_handlers()
logger.info("WatcherManager stopped")
def wait(self) -> None:
"""Block until stopped."""
try:
while self._running:
self._stop_event.wait(timeout=1.0)
except KeyboardInterrupt:
logger.info("Interrupted, stopping...")
self.stop()
@property
def is_running(self) -> bool:
"""Check if manager is running."""
return self._running
def get_stats(self) -> WatcherStats:
"""Get runtime statistics."""
return WatcherStats(
files_watched=self._stats.files_watched,
events_processed=self._stats.events_processed,
last_event_time=self._stats.last_event_time,
is_running=self._running,
)
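
End-to-end usage following the API above: watch the current directory and block until interrupted (the manager installs its own SIGINT/SIGTERM handlers):

from pathlib import Path

from codexlens.watcher import IndexResult, WatcherConfig, WatcherManager

def report(result: IndexResult) -> None:
    print(f"indexed={result.files_indexed} errors={len(result.errors)}")

manager = WatcherManager(
    Path("."),
    watcher_config=WatcherConfig(debounce_ms=500),
    on_indexed=report,
)
manager.start()
manager.wait()  # returns once stop() runs, e.g. after Ctrl+C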