"""Typer commands for CodexLens."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from typing import Annotated, Any, Dict, Iterable, List, Optional
|
|
|
|
import typer
|
|
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
|
from rich.table import Table
|
|
|
|
from codexlens.config import Config
|
|
from codexlens.entities import IndexedFile, SearchResult, Symbol
|
|
from codexlens.errors import CodexLensError, ConfigError, ParseError, StorageError, SearchError
|
|
from codexlens.parsers.factory import ParserFactory
|
|
from codexlens.storage.path_mapper import PathMapper
|
|
from codexlens.storage.registry import RegistryStore, ProjectInfo
|
|
from codexlens.storage.index_tree import IndexTreeBuilder
|
|
from codexlens.storage.dir_index import DirIndexStore
|
|
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
|
|
from codexlens.watcher import WatcherManager, WatcherConfig
|
|
|
|
from .output import (
|
|
console,
|
|
print_json,
|
|
render_file_inspect,
|
|
render_search_results,
|
|
render_status,
|
|
render_symbols,
|
|
)
|
|
|
|
app = typer.Typer(help="CodexLens CLI — local code indexing and search.")
|
|
|
|
# Index subcommand group for reorganized commands
|
|
index_app = typer.Typer(help="Index management commands (init, embeddings, splade, binary, status, migrate, all)")
|
|
app.add_typer(index_app, name="index")


def _deprecated_command_warning(old_name: str, new_name: str) -> None:
    """Display deprecation warning for renamed commands.

    Args:
        old_name: The old command name being deprecated
        new_name: The new command name to use instead
    """
    console.print(
        f"[yellow]Warning:[/yellow] '{old_name}' is deprecated. "
        f"Use '{new_name}' instead."
    )


def _configure_logging(verbose: bool, json_mode: bool = False) -> None:
    """Configure logging level.

    In JSON mode, suppress INFO logs to keep stderr clean for error parsing.
    Only WARNING and above are shown to avoid mixing logs with JSON output.
    """
    if json_mode and not verbose:
        # In JSON mode, suppress INFO logs to keep stderr clean
        level = logging.WARNING
    else:
        level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=level, format="%(levelname)s %(message)s")
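
# Illustrative behavior of _configure_logging, derived from the branches above:
#   _configure_logging(verbose=False, json_mode=True)  -> WARNING (keeps stderr clean for JSON consumers)
#   _configure_logging(verbose=True,  json_mode=True)  -> DEBUG   (verbose wins over JSON quieting)
#   _configure_logging(verbose=False)                  -> INFO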


def _parse_languages(raw: Optional[List[str]]) -> Optional[List[str]]:
    if not raw:
        return None
    langs: List[str] = []
    for item in raw:
        for part in item.split(","):
            part = part.strip()
            if part:
                langs.append(part)
    return langs or None
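
# Illustrative examples of the flattening behavior:
#   _parse_languages(["python,typescript", "go"]) -> ["python", "typescript", "go"]
#   _parse_languages(["", " , "])                 -> None (empty parts are dropped)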


def _get_index_root() -> Path:
    """Get the index root directory from config or default.

    Priority order:
    1. CODEXLENS_INDEX_DIR environment variable
    2. index_dir from ~/.codexlens/config.json
    3. Default: ~/.codexlens/indexes
    """
    env_override = os.getenv("CODEXLENS_INDEX_DIR")
    if env_override:
        return Path(env_override).expanduser().resolve()

    # Read from config.json
    config_file = Path.home() / ".codexlens" / "config.json"
    if config_file.exists():
        try:
            cfg = json.loads(config_file.read_text(encoding="utf-8"))
            if "index_dir" in cfg:
                return Path(cfg["index_dir"]).expanduser().resolve()
        except (json.JSONDecodeError, OSError):
            pass  # Fall through to default

    return Path.home() / ".codexlens" / "indexes"
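
# Illustrative ~/.codexlens/config.json consumed above (directory value is an example):
#   {
#     "index_dir": "/data/codexlens/indexes"
#   }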


def _get_registry_path() -> Path:
    """Get the registry database path."""
    env_override = os.getenv("CODEXLENS_DATA_DIR")
    if env_override:
        return Path(env_override).expanduser().resolve() / "registry.db"
    return Path.home() / ".codexlens" / "registry.db"
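
# Illustrative resolution (example path):
#   CODEXLENS_DATA_DIR=/srv/codexlens  ->  /srv/codexlens/registry.db
#   (unset)                            ->  ~/.codexlens/registry.db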


@index_app.command("init")
def index_init(
    path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."),
    language: Optional[List[str]] = typer.Option(
        None,
        "--language",
        "-l",
        help="Limit indexing to specific languages (repeat or comma-separated).",
    ),
    workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes (default: auto-detect based on CPU count)."),
    force: bool = typer.Option(False, "--force", "-f", help="Force full reindex (skip incremental mode)."),
    no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation (if semantic deps installed)."),
    backend: Optional[str] = typer.Option(None, "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API). Defaults to settings.json config."),
    model: Optional[str] = typer.Option(None, "--model", "-m", help="Embedding model: profile name for fastembed or model name for litellm. Defaults to settings.json config."),
    max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls for embedding generation. Recommended: 4-8 for litellm backend."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Initialize or rebuild the index for a directory.

    Indexes are stored in ~/.codexlens/indexes/ with mirrored directory structure.
    Set CODEXLENS_INDEX_DIR to customize the index location.

    By default, uses incremental indexing (skip unchanged files).
    Use --force to rebuild all files regardless of modification time.

    If semantic search dependencies are installed, automatically generates embeddings
    after indexing completes. Use --no-embeddings to skip this step.

    Backend Options (--backend):
    - fastembed: Local ONNX-based embeddings (default, no API calls)
    - litellm: Remote API embeddings via ccw-litellm (requires API keys)

    Model Options (--model):
    - For fastembed backend: Use profile names (fast, code, multilingual, balanced)
    - For litellm backend: Use model names (e.g., text-embedding-3-small, text-embedding-ada-002)
    """
    _configure_logging(verbose, json_mode)
    config = Config()

    # Fallback to settings.json config if CLI params not provided
    config.load_settings()  # Ensure settings are loaded
    actual_backend = backend or config.embedding_backend
    actual_model = model or config.embedding_model

    languages = _parse_languages(language)
    base_path = path.expanduser().resolve()

    registry: RegistryStore | None = None
    try:
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        builder = IndexTreeBuilder(registry, mapper, config, incremental=not force)

        if force:
            console.print(f"[bold]Building index for:[/bold] {base_path} [yellow](FULL reindex)[/yellow]")
        else:
            console.print(f"[bold]Building index for:[/bold] {base_path} [dim](incremental)[/dim]")

        build_result = builder.build(
            source_root=base_path,
            languages=languages,
            workers=workers,
            force_full=force,
        )

        result = {
            "path": str(base_path),
            "files_indexed": build_result.total_files,
            "dirs_indexed": build_result.total_dirs,
            "index_root": str(build_result.index_root),
            "project_id": build_result.project_id,
            "languages": languages or sorted(config.supported_languages.keys()),
            "errors": len(build_result.errors),
        }

        if not json_mode:
            console.print(f"[green]OK[/green] Indexed [bold]{build_result.total_files}[/bold] files in [bold]{build_result.total_dirs}[/bold] directories")
            console.print(f" Index root: {build_result.index_root}")
            if build_result.errors:
                console.print(f" [yellow]Warnings:[/yellow] {len(build_result.errors)} errors")

        # Auto-generate embeddings if the requested backend is available
        if not no_embeddings:
            try:
                from codexlens.semantic import is_embedding_backend_available
                from codexlens.cli.embedding_manager import generate_embeddings_recursive, get_embeddings_status

                # Validate embedding backend
                valid_backends = ["fastembed", "litellm"]
                if actual_backend not in valid_backends:
                    error_msg = f"Invalid embedding backend: {actual_backend}. Must be one of: {', '.join(valid_backends)}"
                    if json_mode:
                        print_json(success=False, error=error_msg)
                    else:
                        console.print(f"[red]Error:[/red] {error_msg}")
                    raise typer.Exit(code=1)

                backend_available, backend_error = is_embedding_backend_available(actual_backend)

                if backend_available:
                    # Use the index root directory (not the _index.db file)
                    index_root = Path(build_result.index_root)

                    if not json_mode:
                        console.print("\n[bold]Generating embeddings...[/bold]")
                        console.print(f"Backend: [cyan]{actual_backend}[/cyan]")
                        console.print(f"Model: [cyan]{actual_model}[/cyan]")
                    else:
                        # Output progress message for JSON mode (parsed by Node.js)
                        print("Generating embeddings...", flush=True)

                    # Progress callback - outputs progress for both json and non-json modes
                    # Node.js parseProgressLine() expects formats like:
                    # - "Batch X: N files, M chunks"
                    # - "Processing N files"
                    # - "Finalizing index"
                    def progress_update(msg: str):
                        if json_mode:
                            # Output without prefix so Node.js can parse it
                            # Strip leading spaces that embedding_manager adds
                            print(msg.strip(), flush=True)
                        elif verbose:
                            console.print(f" {msg}")

                    embed_result = generate_embeddings_recursive(
                        index_root,
                        embedding_backend=actual_backend,
                        model_profile=actual_model,
                        force=False,  # Don't force regenerate during init
                        chunk_size=2000,
                        progress_callback=progress_update,  # Always use callback
                        max_workers=max_workers,
                    )

                    if embed_result["success"]:
                        embed_data = embed_result["result"]

                        # Output completion message for Node.js to parse
                        if json_mode:
                            print(f"Embeddings complete: {embed_data['total_chunks_created']} chunks", flush=True)

                        # Get comprehensive coverage statistics
                        status_result = get_embeddings_status(index_root)
                        if status_result["success"]:
                            coverage = status_result["result"]
                            result["embeddings"] = {
                                "generated": True,
                                "total_indexes": coverage["total_indexes"],
                                "total_files": coverage["total_files"],
                                "files_with_embeddings": coverage["files_with_embeddings"],
                                "coverage_percent": coverage["coverage_percent"],
                                "total_chunks": coverage["total_chunks"],
                            }
                        else:
                            result["embeddings"] = {
                                "generated": True,
                                "total_chunks": embed_data["total_chunks_created"],
                                "files_processed": embed_data["total_files_processed"],
                            }

                        if not json_mode:
                            console.print(f"[green]✓[/green] Generated embeddings for [bold]{embed_data['total_files_processed']}[/bold] files")
                            console.print(f" Total chunks: [bold]{embed_data['total_chunks_created']}[/bold]")
                            console.print(f" Indexes processed: [bold]{embed_data['indexes_successful']}/{embed_data['indexes_processed']}[/bold]")
                    else:
                        if not json_mode:
                            console.print(f"[yellow]Warning:[/yellow] Embedding generation failed: {embed_result.get('error', 'Unknown error')}")
                        result["embeddings"] = {
                            "generated": False,
                            "error": embed_result.get("error"),
                        }
                else:
                    if not json_mode and verbose:
                        console.print(f"[dim]Embedding backend '{actual_backend}' not available. Skipping embeddings.[/dim]")
                    result["embeddings"] = {
                        "generated": False,
                        "error": backend_error or "Embedding backend not available",
                    }
            except typer.Exit:
                # Re-raise explicit exits (e.g., invalid backend) so the broad
                # handler below does not swallow them and report success.
                raise
            except Exception as e:
                if not json_mode and verbose:
                    console.print(f"[yellow]Warning:[/yellow] Could not generate embeddings: {e}")
                result["embeddings"] = {
                    "generated": False,
                    "error": str(e),
                }
        else:
            result["embeddings"] = {
                "generated": False,
                "error": "Skipped (--no-embeddings)",
            }

        # Output final JSON result with embeddings status
        if json_mode:
            print_json(success=True, result=result)

    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Init failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except ConfigError as exc:
        if json_mode:
            print_json(success=False, error=f"Configuration error: {exc}")
        else:
            console.print(f"[red]Init failed (config):[/red] {exc}")
        raise typer.Exit(code=1)
    except ParseError as exc:
        if json_mode:
            print_json(success=False, error=f"Parse error: {exc}")
        else:
            console.print(f"[red]Init failed (parse):[/red] {exc}")
        raise typer.Exit(code=1)
    except PermissionError as exc:
        if json_mode:
            print_json(success=False, error=f"Permission denied: {exc}")
        else:
            console.print(f"[red]Init failed (permission denied):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Init failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
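
# Illustrative invocations (example paths and model names, based on the options above):
#   codexlens index init .
#   codexlens index init /path/to/project --force --workers 8
#   codexlens index init . --backend litellm --model text-embedding-3-small --max-workers 4
#   codexlens index init . --no-embeddings --language python,typescript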


@app.command()
def watch(
    path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to watch."),
    language: Optional[List[str]] = typer.Option(
        None,
        "--language",
        "-l",
        help="Limit watching to specific languages (repeat or comma-separated).",
    ),
    debounce: int = typer.Option(1000, "--debounce", "-d", min=100, max=10000, help="Debounce interval in milliseconds."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose logging."),
) -> None:
    """Watch directory for changes and update index incrementally.

    Monitors filesystem events and automatically updates the index
    when files are created, modified, or deleted.

    The directory must already be indexed (run 'codexlens init' first).

    Press Ctrl+C to stop watching.

    Examples:
        codexlens watch .
        codexlens watch /path/to/project --debounce 500 --verbose
        codexlens watch . --language python,typescript
    """
    _configure_logging(verbose)

    from codexlens.watcher.events import IndexResult

    base_path = path.expanduser().resolve()

    # Check if path is indexed
    mapper = PathMapper()
    index_db = mapper.source_to_index_db(base_path)
    if not index_db.exists():
        console.print(f"[red]Error:[/red] Directory not indexed: {base_path}")
        console.print("Run 'codexlens init' first to create the index.")
        raise typer.Exit(code=1)

    # Parse languages
    languages = _parse_languages(language)

    # Create watcher config
    watcher_config = WatcherConfig(
        debounce_ms=debounce,
        languages=languages,
    )

    # Callback for indexed files
    def on_indexed(result: IndexResult) -> None:
        if result.files_indexed > 0:
            console.print(f" [green]Indexed:[/green] {result.files_indexed} files ({result.symbols_added} symbols)")
        if result.files_removed > 0:
            console.print(f" [yellow]Removed:[/yellow] {result.files_removed} files")
        if result.errors:
            for error in result.errors[:3]:  # Show first 3 errors
                console.print(f" [red]Error:[/red] {error}")

    console.print(f"[bold]Watching:[/bold] {base_path}")
    console.print(f" Debounce: {debounce}ms")
    if languages:
        console.print(f" Languages: {', '.join(languages)}")
    console.print(" Press Ctrl+C to stop.\n")

    manager: WatcherManager | None = None
    try:
        manager = WatcherManager(
            root_path=base_path,
            watcher_config=watcher_config,
            on_indexed=on_indexed,
        )
        manager.start()
        manager.wait()
    except KeyboardInterrupt:
        pass
    except Exception as exc:
        console.print(f"[red]Error:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if manager is not None:
            manager.stop()
            console.print("\n[dim]Watcher stopped.[/dim]")


@app.command()
def search(
    query: str = typer.Argument(..., help="Search query."),
    path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."),
    limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
    offset: int = typer.Option(0, "--offset", min=0, help="Pagination offset - skip first N results."),
    depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
    files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
    method: str = typer.Option("dense_rerank", "--method", "-m", help="Search method: 'dense_rerank' (semantic, default), 'fts' (exact keyword)."),
    use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."),
    code_only: bool = typer.Option(False, "--code-only", help="Only return code files (excludes md, txt, json, yaml, xml, etc.)."),
    exclude_extensions: Optional[str] = typer.Option(None, "--exclude-extensions", help="Comma-separated list of file extensions to exclude (e.g., 'md,txt,json')."),
    # Hidden advanced options for backward compatibility
    weights: Optional[str] = typer.Option(
        None,
        "--weights", "-w",
        hidden=True,
        help="[Advanced] RRF weights as key=value pairs."
    ),
    cascade_strategy: Optional[str] = typer.Option(
        None,
        "--cascade-strategy",
        hidden=True,
        help="[Advanced] Cascade strategy for --method cascade."
    ),
    # Hidden deprecated parameter for backward compatibility
    mode: Optional[str] = typer.Option(None, "--mode", hidden=True, help="[DEPRECATED] Use --method instead."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Search indexed file contents.

    Uses chain search across directory indexes.
    Use --depth to limit search recursion (0 = current dir only).

    Search Methods:
    - dense_rerank (default): Semantic search using Dense embedding coarse retrieval +
      Cross-encoder reranking. Best for natural language queries and code understanding.
    - fts: Full-text search using FTS5 (unicode61 tokenizer). Best for exact code
      identifiers like function/class names. Use --use-fuzzy for typo tolerance.

    Method Selection Guide:
    - Code identifiers (function/class names): fts
    - Natural language queries: dense_rerank (default)
    - Typo-tolerant search: fts --use-fuzzy

    Requirements:
    The dense_rerank method requires pre-generated embeddings.
    Use 'codexlens embeddings-generate' to create embeddings first.

    Examples:
        # Default semantic search (dense_rerank)
        codexlens search "authentication logic"

        # Exact code identifier search
        codexlens search "authenticate_user" --method fts

        # Typo-tolerant fuzzy search
        codexlens search "authentcate" --method fts --use-fuzzy
    """
    _configure_logging(verbose, json_mode)
    search_path = path.expanduser().resolve()

    # Handle deprecated --mode parameter
    actual_method = method
    if mode is not None:
        # Show deprecation warning
        if not json_mode:
            console.print("[yellow]Warning: --mode is deprecated, use --method instead.[/yellow]")

        # Map old mode values to new method values
        mode_to_method = {
            "auto": "hybrid",
            "exact": "fts",
            "fuzzy": "fts",  # with use_fuzzy=True
            "hybrid": "hybrid",
            "vector": "vector",
            "pure-vector": "vector",
        }

        if mode in mode_to_method:
            actual_method = mode_to_method[mode]
            # Enable fuzzy for old fuzzy mode
            if mode == "fuzzy":
                use_fuzzy = True
        else:
            if json_mode:
                print_json(success=False, error=f"Invalid deprecated mode: {mode}. Use --method instead.")
            else:
                console.print(f"[red]Invalid deprecated mode:[/red] {mode}")
                console.print("[dim]Use --method with: fts, vector, splade, hybrid, cascade[/dim]")
            raise typer.Exit(code=1)

    # Configure search (load settings from file)
    config = Config.load()

    # Validate method - simplified interface exposes only dense_rerank and fts
    # Other methods (vector, splade, hybrid, cascade) are hidden but still work for backward compatibility
    valid_methods = ["fts", "dense_rerank", "vector", "splade", "hybrid", "cascade"]
    if actual_method not in valid_methods:
        if json_mode:
            print_json(success=False, error=f"Invalid method: {actual_method}. Use 'dense_rerank' (semantic) or 'fts' (exact keyword).")
        else:
            console.print(f"[red]Invalid method:[/red] {actual_method}")
            console.print("[dim]Use 'dense_rerank' (semantic, default) or 'fts' (exact keyword)[/dim]")
        raise typer.Exit(code=1)

    # Map dense_rerank to cascade method internally
    internal_cascade_strategy = cascade_strategy
    if actual_method == "dense_rerank":
        actual_method = "cascade"
        internal_cascade_strategy = "dense_rerank"

    # Validate cascade_strategy if provided (for advanced users)
    if internal_cascade_strategy is not None:
        valid_strategies = ["binary", "hybrid", "binary_rerank", "dense_rerank"]
        if internal_cascade_strategy not in valid_strategies:
            if json_mode:
                print_json(success=False, error=f"Invalid cascade strategy: {internal_cascade_strategy}. Must be one of: {', '.join(valid_strategies)}")
            else:
                console.print(f"[red]Invalid cascade strategy:[/red] {internal_cascade_strategy}")
                console.print(f"[dim]Valid strategies: {', '.join(valid_strategies)}[/dim]")
            raise typer.Exit(code=1)

    # Parse custom weights if provided
    hybrid_weights = None
    if weights:
        try:
            # Check if using key=value format (new) or legacy comma-separated format
            if "=" in weights:
                # New format: splade=0.4,vector=0.6 or exact=0.3,fuzzy=0.1,vector=0.6
                weight_dict = {}
                for pair in weights.split(","):
                    if "=" in pair:
                        key, val = pair.split("=", 1)
                        weight_dict[key.strip()] = float(val.strip())
                    else:
                        raise ValueError("Mixed format not supported - use all key=value pairs")

                # Validate and normalize weights
                weight_sum = sum(weight_dict.values())
                if abs(weight_sum - 1.0) > 0.01:
                    if not json_mode:
                        console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
                    weight_dict = {k: v / weight_sum for k, v in weight_dict.items()}

                hybrid_weights = weight_dict
            else:
                # Legacy format: 0.3,0.1,0.6 (exact,fuzzy,vector)
                weight_parts = [float(w.strip()) for w in weights.split(",")]
                if len(weight_parts) == 3:
                    weight_sum = sum(weight_parts)
                    if abs(weight_sum - 1.0) > 0.01:
                        if not json_mode:
                            console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
                        weight_parts = [w / weight_sum for w in weight_parts]
                    hybrid_weights = {
                        "exact": weight_parts[0],
                        "fuzzy": weight_parts[1],
                        "vector": weight_parts[2],
                    }
                elif len(weight_parts) == 2:
                    # Two values: assume splade,vector
                    weight_sum = sum(weight_parts)
                    if abs(weight_sum - 1.0) > 0.01:
                        if not json_mode:
                            console.print(f"[yellow]Warning: Weights sum to {weight_sum:.2f}, should sum to 1.0. Normalizing...[/yellow]")
                        weight_parts = [w / weight_sum for w in weight_parts]
                    hybrid_weights = {
                        "splade": weight_parts[0],
                        "vector": weight_parts[1],
                    }
                else:
                    if not json_mode:
                        console.print("[yellow]Warning: Invalid weights format. Using defaults.[/yellow]")
        except ValueError as e:
            if not json_mode:
                console.print(f"[yellow]Warning: Invalid weights format ({e}). Using defaults.[/yellow]")

    registry: RegistryStore | None = None
    try:
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        engine = ChainSearchEngine(registry, mapper, config=config)

        # Map method to SearchOptions flags:
        #   fts: FTS-only search (optionally with fuzzy)
        #   vector: Pure vector semantic search
        #   splade: SPLADE sparse neural search
        #   hybrid: RRF fusion of sparse + dense
        #   cascade: Two-stage binary + dense retrieval
        if actual_method == "fts":
            hybrid_mode = False
            enable_fuzzy = use_fuzzy
            enable_vector = False
            pure_vector = False
            enable_splade = False
            enable_cascade = False
        elif actual_method == "vector":
            hybrid_mode = True
            enable_fuzzy = False
            enable_vector = True
            pure_vector = True
            enable_splade = False
            enable_cascade = False
        elif actual_method == "splade":
            hybrid_mode = True
            enable_fuzzy = False
            enable_vector = False
            pure_vector = False
            enable_splade = True
            enable_cascade = False
        elif actual_method == "hybrid":
            hybrid_mode = True
            enable_fuzzy = use_fuzzy
            enable_vector = True
            pure_vector = False
            enable_splade = True  # SPLADE is preferred sparse in hybrid
            enable_cascade = False
        elif actual_method == "cascade":
            hybrid_mode = True
            enable_fuzzy = False
            enable_vector = True
            pure_vector = False
            enable_splade = False
            enable_cascade = True
        else:
            raise ValueError(f"Invalid method: {actual_method}")

        # Parse exclude_extensions from comma-separated string
        exclude_exts_list = None
        if exclude_extensions:
            exclude_exts_list = [ext.strip() for ext in exclude_extensions.split(',') if ext.strip()]

        options = SearchOptions(
            depth=depth,
            total_limit=limit,
            offset=offset,
            files_only=files_only,
            code_only=code_only,
            exclude_extensions=exclude_exts_list,
            hybrid_mode=hybrid_mode,
            enable_fuzzy=enable_fuzzy,
            enable_vector=enable_vector,
            pure_vector=pure_vector,
            enable_splade=enable_splade,
            enable_cascade=enable_cascade,
            hybrid_weights=hybrid_weights,
        )

        if files_only:
            file_paths = engine.search_files_only(query, search_path, options)
            payload = {"query": query, "count": len(file_paths), "files": file_paths}
            if json_mode:
                print_json(success=True, result=payload)
            else:
                for fp in file_paths:
                    console.print(fp)
        else:
            # Dispatch to cascade_search for cascade method
            if actual_method == "cascade":
                result = engine.cascade_search(query, search_path, k=limit, options=options, strategy=internal_cascade_strategy)
            else:
                result = engine.search(query, search_path, options)
            results_list = [
                {
                    "path": r.path,
                    "score": r.score,
                    "excerpt": r.excerpt,
                    "content": r.content,  # Full function/class body
                    "source": getattr(r, "search_source", None),
                    "symbol": getattr(r, "symbol", None),
                }
                for r in result.results
            ]

            payload = {
                "query": query,
                "method": actual_method,
                "count": len(results_list),
                "results": results_list,
                "stats": {
                    "dirs_searched": result.stats.dirs_searched,
                    "files_matched": result.stats.files_matched,
                    "time_ms": result.stats.time_ms,
                },
            }
            if json_mode:
                print_json(success=True, result=payload)
            else:
                render_search_results(result.results, verbose=verbose)
                console.print(f"[dim]Method: {actual_method} | Searched {result.stats.dirs_searched} directories in {result.stats.time_ms:.1f}ms[/dim]")

    except SearchError as exc:
        if json_mode:
            print_json(success=False, error=f"Search error: {exc}")
        else:
            console.print(f"[red]Search failed (query):[/red] {exc}")
        raise typer.Exit(code=1)
    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Search failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Search failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
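
# Illustrative invocations (example queries, based on the options above):
#   codexlens search "authentication logic" --path src --limit 10
#   codexlens search "parse_config" --method fts --code-only
#   codexlens search "token cache" --weights "splade=0.4,vector=0.6"   # hidden/advanced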


@app.command()
def symbol(
    name: str = typer.Argument(..., help="Symbol name to look up."),
    path: Path = typer.Option(Path("."), "--path", "-p", help="Directory to search from."),
    kind: Optional[str] = typer.Option(
        None,
        "--kind",
        "-k",
        help="Filter by kind (function|class|method).",
    ),
    limit: int = typer.Option(50, "--limit", "-n", min=1, max=500, help="Max symbols."),
    depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Look up symbols by name and optional kind."""
    _configure_logging(verbose, json_mode)
    search_path = path.expanduser().resolve()
    # Load runtime settings for the search engine (mirrors the 'search' command).
    config = Config.load()

    registry: RegistryStore | None = None
    try:
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        engine = ChainSearchEngine(registry, mapper, config=config)
        options = SearchOptions(depth=depth, total_limit=limit)

        syms = engine.search_symbols(name, search_path, kind=kind, options=options)

        payload = {"name": name, "kind": kind, "count": len(syms), "symbols": syms}
        if json_mode:
            print_json(success=True, result=payload)
        else:
            render_symbols(syms)

    except SearchError as exc:
        if json_mode:
            print_json(success=False, error=f"Search error: {exc}")
        else:
            console.print(f"[red]Symbol lookup failed (search):[/red] {exc}")
        raise typer.Exit(code=1)
    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Symbol lookup failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Symbol lookup failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
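
# Illustrative invocation (example symbol name):
#   codexlens symbol authenticate_user --kind function --json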


@app.command()
def inspect(
    file: Path = typer.Argument(..., exists=True, dir_okay=False, help="File to analyze."),
    symbols: bool = typer.Option(True, "--symbols/--no-symbols", help="Show discovered symbols."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Analyze a single file and display symbols."""
    _configure_logging(verbose, json_mode)
    config = Config()
    factory = ParserFactory(config)

    file_path = file.expanduser().resolve()
    try:
        text = file_path.read_text(encoding="utf-8", errors="ignore")
        language_id = config.language_for_path(file_path) or "unknown"
        parser = factory.get_parser(language_id)
        indexed = parser.parse(text, file_path)
        payload = {"file": indexed, "content_lines": len(text.splitlines())}
        if json_mode:
            print_json(success=True, result=payload)
        else:
            if symbols:
                render_file_inspect(indexed.path, indexed.language, indexed.symbols)
            else:
                render_status({"file": indexed.path, "language": indexed.language})
    except ParseError as exc:
        if json_mode:
            print_json(success=False, error=f"Parse error: {exc}")
        else:
            console.print(f"[red]Inspect failed (parse):[/red] {exc}")
        raise typer.Exit(code=1)
    except FileNotFoundError as exc:
        if json_mode:
            print_json(success=False, error=f"File not found: {exc}")
        else:
            console.print(f"[red]Inspect failed (file not found):[/red] {exc}")
        raise typer.Exit(code=1)
    except PermissionError as exc:
        if json_mode:
            print_json(success=False, error=f"Permission denied: {exc}")
        else:
            console.print(f"[red]Inspect failed (permission denied):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Inspect failed:[/red] {exc}")
        raise typer.Exit(code=1)
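
# Illustrative invocation (example file path):
#   codexlens inspect src/app.py --no-symbols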


@app.command()
def status(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Show index status and configuration."""
    _configure_logging(verbose, json_mode)

    registry: RegistryStore | None = None
    try:
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        # Get all projects
        projects = registry.list_projects()

        # Calculate total stats
        total_files = sum(p.total_files for p in projects)
        total_dirs = sum(p.total_dirs for p in projects)

        # Get index root size
        index_root = mapper.index_root
        index_size = 0
        if index_root.exists():
            for f in index_root.rglob("*"):
                if f.is_file():
                    index_size += f.stat().st_size

        # Check schema version and enabled features
        schema_version = None
        has_dual_fts = False
        if projects and index_root.exists():
            # Check first index database for features
            index_files = list(index_root.rglob("_index.db"))
            if index_files:
                try:
                    with DirIndexStore(index_files[0]) as store:
                        with store._lock:
                            conn = store._get_connection()
                            schema_version = store._get_schema_version(conn)
                            # Check if dual FTS tables exist
                            cursor = conn.execute(
                                "SELECT name FROM sqlite_master WHERE type='table' AND name IN ('files_fts_exact', 'files_fts_fuzzy')"
                            )
                            fts_tables = [row[0] for row in cursor.fetchall()]
                            has_dual_fts = len(fts_tables) == 2
                except Exception:
                    pass

        # Check embeddings coverage
        embeddings_info = None
        has_vector_search = False
        try:
            from codexlens.cli.embedding_manager import get_embeddings_status

            if index_root.exists():
                embed_status = get_embeddings_status(index_root)
                if embed_status["success"]:
                    embeddings_info = embed_status["result"]
                    # Enable vector search if coverage >= 50%
                    has_vector_search = embeddings_info["coverage_percent"] >= 50.0
        except ImportError:
            # Embedding manager not available
            pass
        except Exception as e:
            logging.debug(f"Failed to get embeddings status: {e}")

        stats = {
            "index_root": str(index_root),
            "registry_path": str(_get_registry_path()),
            "projects_count": len(projects),
            "total_files": total_files,
            "total_dirs": total_dirs,
            "index_size_bytes": index_size,
            "index_size_mb": round(index_size / (1024 * 1024), 2),
            "schema_version": schema_version,
            "features": {
                "exact_fts": True,  # Always available
                "fuzzy_fts": has_dual_fts,
                "hybrid_search": has_dual_fts,
                "vector_search": has_vector_search,
            },
        }

        # Add embeddings info if available
        if embeddings_info:
            stats["embeddings"] = embeddings_info

        if json_mode:
            print_json(success=True, result=stats)
        else:
            console.print("[bold]CodexLens Status[/bold]")
            console.print(f" Index Root: {stats['index_root']}")
            console.print(f" Registry: {stats['registry_path']}")
            console.print(f" Projects: {stats['projects_count']}")
            console.print(f" Total Files: {stats['total_files']}")
            console.print(f" Total Directories: {stats['total_dirs']}")
            console.print(f" Index Size: {stats['index_size_mb']} MB")
            if schema_version:
                console.print(f" Schema Version: {schema_version}")
            console.print("\n[bold]Search Backends:[/bold]")
            console.print(" Exact FTS: ✓ (unicode61)")
            if has_dual_fts:
                console.print(" Fuzzy FTS: ✓ (trigram)")
                console.print(" Hybrid Search: ✓ (RRF fusion)")
            else:
                console.print(" Fuzzy FTS: ✗ (run 'migrate' to enable)")
                console.print(" Hybrid Search: ✗ (run 'migrate' to enable)")

            if has_vector_search:
                console.print(" Vector Search: ✓ (embeddings available)")
            else:
                console.print(" Vector Search: ✗ (no embeddings or coverage < 50%)")

            # Display embeddings statistics if available
            if embeddings_info:
                console.print("\n[bold]Embeddings Coverage:[/bold]")
                console.print(f" Total Indexes: {embeddings_info['total_indexes']}")
                console.print(f" Total Files: {embeddings_info['total_files']}")
                console.print(f" Files with Embeddings: {embeddings_info['files_with_embeddings']}")
                console.print(f" Coverage: {embeddings_info['coverage_percent']:.1f}%")
                console.print(f" Total Chunks: {embeddings_info['total_chunks']}")

                # Display model information if available
                model_info = embeddings_info.get('model_info')
                if model_info:
                    console.print("\n[bold]Embedding Model:[/bold]")
                    console.print(f" Backend: [cyan]{model_info.get('backend', 'unknown')}[/cyan]")
                    console.print(f" Model: [cyan]{model_info.get('model_profile', 'unknown')}[/cyan] ({model_info.get('model_name', '')})")
                    console.print(f" Dimensions: {model_info.get('embedding_dim', 'unknown')}")
                    if model_info.get('updated_at'):
                        console.print(f" Last Updated: {model_info['updated_at']}")

    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Status failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Status failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
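
# Illustrative usage, assuming print_json wraps output as {"success": ..., "result": ...}
# (jq is an external tool, shown here only as an example consumer):
#   codexlens status --json | jq '.result.features'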


@app.command()
def projects(
    action: str = typer.Argument("list", help="Action: list, show, remove"),
    project_path: Optional[Path] = typer.Argument(None, help="Project path (for show/remove)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Manage registered projects in the global registry.

    Actions:
    - list: Show all registered projects
    - show <path>: Show details for a specific project
    - remove <path>: Remove a project from the registry
    """
    _configure_logging(verbose, json_mode)

    registry: RegistryStore | None = None
    try:
        registry = RegistryStore()
        registry.initialize()

        if action == "list":
            project_list = registry.list_projects()
            if json_mode:
                result = [
                    {
                        "id": p.id,
                        "source_root": str(p.source_root),
                        "index_root": str(p.index_root),
                        "total_files": p.total_files,
                        "total_dirs": p.total_dirs,
                        "status": p.status,
                    }
                    for p in project_list
                ]
                print_json(success=True, result=result)
            else:
                if not project_list:
                    console.print("[yellow]No projects registered.[/yellow]")
                else:
                    table = Table(title="Registered Projects")
                    table.add_column("ID", style="dim")
                    table.add_column("Source Root")
                    table.add_column("Files", justify="right")
                    table.add_column("Dirs", justify="right")
                    table.add_column("Status")

                    for p in project_list:
                        table.add_row(
                            str(p.id),
                            str(p.source_root),
                            str(p.total_files),
                            str(p.total_dirs),
                            p.status,
                        )
                    console.print(table)

        elif action == "show":
            if not project_path:
                raise typer.BadParameter("Project path required for 'show' action")

            project_path = project_path.expanduser().resolve()
            project_info = registry.get_project(project_path)

            if not project_info:
                if json_mode:
                    print_json(success=False, error=f"Project not found: {project_path}")
                else:
                    console.print(f"[red]Project not found:[/red] {project_path}")
                raise typer.Exit(code=1)

            if json_mode:
                result = {
                    "id": project_info.id,
                    "source_root": str(project_info.source_root),
                    "index_root": str(project_info.index_root),
                    "total_files": project_info.total_files,
                    "total_dirs": project_info.total_dirs,
                    "status": project_info.status,
                    "created_at": project_info.created_at,
                    "last_indexed": project_info.last_indexed,
                }
                print_json(success=True, result=result)
            else:
                console.print(f"[bold]Project:[/bold] {project_info.source_root}")
                console.print(f" ID: {project_info.id}")
                console.print(f" Index Root: {project_info.index_root}")
                console.print(f" Files: {project_info.total_files}")
                console.print(f" Directories: {project_info.total_dirs}")
                console.print(f" Status: {project_info.status}")

                # Show directory breakdown
                dirs = registry.get_project_dirs(project_info.id)
                if dirs:
                    console.print(f"\n [bold]Indexed Directories:[/bold] {len(dirs)}")
                    for d in dirs[:10]:
                        console.print(f" - {d.source_path.name}/ ({d.files_count} files)")
                    if len(dirs) > 10:
                        console.print(f" ... and {len(dirs) - 10} more")

        elif action == "remove":
            if not project_path:
                raise typer.BadParameter("Project path required for 'remove' action")

            project_path = project_path.expanduser().resolve()
            removed = registry.unregister_project(project_path)

            if removed:
                mapper = PathMapper()
                index_root = mapper.source_to_index_dir(project_path)
                if index_root.exists():
                    shutil.rmtree(index_root)

                if json_mode:
                    print_json(success=True, result={"removed": str(project_path)})
                else:
                    console.print(f"[green]Removed:[/green] {project_path}")
            else:
                if json_mode:
                    print_json(success=False, error=f"Project not found: {project_path}")
                else:
                    console.print(f"[yellow]Project not found:[/yellow] {project_path}")

        else:
            raise typer.BadParameter(f"Unknown action: {action}. Use list, show, or remove.")

    except typer.BadParameter:
        raise
    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Projects command failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except PermissionError as exc:
        if json_mode:
            print_json(success=False, error=f"Permission denied: {exc}")
        else:
            console.print(f"[red]Projects command failed (permission denied):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Projects command failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
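
# Illustrative invocations (example path):
#   codexlens projects list
#   codexlens projects show /path/to/project
#   codexlens projects remove /path/to/project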


@app.command()
def config(
    action: str = typer.Argument("show", help="Action: show, set, migrate"),
    key: Optional[str] = typer.Argument(None, help="Config key (for set action)."),
    value: Optional[str] = typer.Argument(None, help="Config value (for set action)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Manage CodexLens configuration.

    Actions:
    - show: Display current configuration
    - set <key> <value>: Set configuration value
    - migrate <new_path>: Migrate indexes to new location

    Config keys:
    - index_dir: Directory to store indexes (default: ~/.codexlens/indexes)
    - reranker_backend: Reranker backend (onnx, api, litellm, legacy)
    - reranker_model: Reranker model name
    - reranker_enabled: Enable reranking (true/false)
    - reranker_top_k: Number of results to rerank
    - reranker_api_provider: API provider for reranker (siliconflow, cohere, jina)
    - embedding_backend: Embedding backend (fastembed, litellm)
    - embedding_model: Embedding model profile or name
    """
    _configure_logging(verbose, json_mode)

    config_file = Path.home() / ".codexlens" / "config.json"

    def load_config() -> Dict[str, Any]:
        if config_file.exists():
            return json.loads(config_file.read_text(encoding="utf-8"))
        return {}

    def save_config(cfg: Dict[str, Any]) -> None:
        config_file.parent.mkdir(parents=True, exist_ok=True)
        config_file.write_text(json.dumps(cfg, indent=2), encoding="utf-8")

    try:
        if action == "show":
            cfg = load_config()
            current_index_dir = os.getenv("CODEXLENS_INDEX_DIR") or cfg.get("index_dir") or str(Path.home() / ".codexlens" / "indexes")

            result = {
                "config_file": str(config_file),
                "index_dir": current_index_dir,
                "env_override": os.getenv("CODEXLENS_INDEX_DIR"),
            }

            # Load settings.json for reranker and other runtime settings
            settings_file = Path.home() / ".codexlens" / "settings.json"
            if settings_file.exists():
                try:
                    settings = json.loads(settings_file.read_text(encoding="utf-8"))
                    # Extract reranker settings (flat keys for CCW compatibility)
                    reranker = settings.get("reranker", {})
                    if reranker.get("backend"):
                        result["reranker_backend"] = reranker["backend"]
                    if reranker.get("model"):
                        result["reranker_model"] = reranker["model"]
                    if reranker.get("enabled") is not None:
                        result["reranker_enabled"] = reranker["enabled"]
                    if reranker.get("top_k"):
                        result["reranker_top_k"] = reranker["top_k"]
                    if reranker.get("api_provider"):
                        result["reranker_api_provider"] = reranker["api_provider"]
                    # Extract embedding settings
                    embedding = settings.get("embedding", {})
                    if embedding.get("backend"):
                        result["embedding_backend"] = embedding["backend"]
                    if embedding.get("model"):
                        result["embedding_model"] = embedding["model"]
                except (json.JSONDecodeError, OSError):
                    pass  # Settings file not readable, continue with defaults

            # Load .env overrides from global ~/.codexlens/.env
            env_overrides: Dict[str, str] = {}
            try:
                from codexlens.env_config import load_global_env
                env_overrides = load_global_env()
            except ImportError:
                pass

            # Apply .env overrides (highest priority) and track them
            if env_overrides.get("EMBEDDING_MODEL"):
                result["embedding_model"] = env_overrides["EMBEDDING_MODEL"]
                result["embedding_model_source"] = ".env"
            if env_overrides.get("EMBEDDING_BACKEND"):
                result["embedding_backend"] = env_overrides["EMBEDDING_BACKEND"]
                result["embedding_backend_source"] = ".env"
            if env_overrides.get("RERANKER_MODEL"):
                result["reranker_model"] = env_overrides["RERANKER_MODEL"]
                result["reranker_model_source"] = ".env"
            if env_overrides.get("RERANKER_BACKEND"):
                result["reranker_backend"] = env_overrides["RERANKER_BACKEND"]
                result["reranker_backend_source"] = ".env"
            if env_overrides.get("RERANKER_ENABLED"):
                result["reranker_enabled"] = env_overrides["RERANKER_ENABLED"].lower() in ("true", "1", "yes", "on")
                result["reranker_enabled_source"] = ".env"
            if env_overrides.get("RERANKER_PROVIDER") or os.getenv("RERANKER_PROVIDER"):
                result["reranker_api_provider"] = env_overrides.get("RERANKER_PROVIDER") or os.getenv("RERANKER_PROVIDER")

            if json_mode:
                print_json(success=True, result=result)
            else:
                console.print("[bold]CodexLens Configuration[/bold]")
                console.print(f" Config File: {result['config_file']}")
                console.print(f" Index Directory: {result['index_dir']}")
                if result['env_override']:
                    console.print(" [dim](Override via CODEXLENS_INDEX_DIR)[/dim]")

                # Show embedding settings
                console.print("\n[bold]Embedding[/bold]")
                backend = result.get('embedding_backend', 'fastembed')
                backend_source = result.get('embedding_backend_source', 'settings.json')
                console.print(f" Backend: {backend} [dim]({backend_source})[/dim]")
                model = result.get('embedding_model', 'code')
                model_source = result.get('embedding_model_source', 'settings.json')
                console.print(f" Model: {model} [dim]({model_source})[/dim]")

                # Show reranker settings
                console.print("\n[bold]Reranker[/bold]")
                backend = result.get('reranker_backend', 'fastembed')
                backend_source = result.get('reranker_backend_source', 'settings.json')
                console.print(f" Backend: {backend} [dim]({backend_source})[/dim]")
                model = result.get('reranker_model', 'N/A')
                model_source = result.get('reranker_model_source', 'settings.json')
                console.print(f" Model: {model} [dim]({model_source})[/dim]")
                enabled = result.get('reranker_enabled', False)
                enabled_source = result.get('reranker_enabled_source', 'settings.json')
                console.print(f" Enabled: {enabled} [dim]({enabled_source})[/dim]")

        elif action == "set":
            if not key:
                raise typer.BadParameter("Config key required for 'set' action")
            if not value:
                raise typer.BadParameter("Config value required for 'set' action")

            cfg = load_config()

            if key == "index_dir":
                new_path = Path(value).expanduser().resolve()
                cfg["index_dir"] = str(new_path)
                save_config(cfg)

                if json_mode:
                    print_json(success=True, result={"key": key, "value": str(new_path)})
                else:
                    console.print(f"[green]Set {key}=[/green] {new_path}")
                    console.print("[yellow]Note: Existing indexes remain at old location. Use 'config migrate' to move them.[/yellow]")

            # Handle reranker and embedding settings (stored in settings.json)
            elif key in ("reranker_backend", "reranker_model", "reranker_enabled", "reranker_top_k",
                         "embedding_backend", "embedding_model", "reranker_api_provider"):
                settings_file = Path.home() / ".codexlens" / "settings.json"
                settings_file.parent.mkdir(parents=True, exist_ok=True)

                # Load existing settings
                settings: Dict[str, Any] = {}
                if settings_file.exists():
                    try:
                        settings = json.loads(settings_file.read_text(encoding="utf-8"))
                    except (json.JSONDecodeError, OSError):
                        pass

                # Ensure nested structures exist
                if "reranker" not in settings:
                    settings["reranker"] = {}
                if "embedding" not in settings:
                    settings["embedding"] = {}

                # Map flat keys to nested structure
                if key == "reranker_backend":
                    settings["reranker"]["backend"] = value
                elif key == "reranker_model":
                    settings["reranker"]["model"] = value
                elif key == "reranker_enabled":
                    settings["reranker"]["enabled"] = value.lower() in ("true", "1", "yes")
                elif key == "reranker_top_k":
                    settings["reranker"]["top_k"] = int(value)
                elif key == "reranker_api_provider":
                    settings["reranker"]["api_provider"] = value
                elif key == "embedding_backend":
                    settings["embedding"]["backend"] = value
                elif key == "embedding_model":
                    settings["embedding"]["model"] = value

                # Save settings
                settings_file.write_text(json.dumps(settings, indent=2), encoding="utf-8")

                if json_mode:
                    print_json(success=True, result={"key": key, "value": value})
                else:
                    console.print(f"[green]Set {key}=[/green] {value}")
            else:
                raise typer.BadParameter(f"Unknown config key: {key}")

        elif action == "migrate":
            if not key:
                raise typer.BadParameter("New path required for 'migrate' action")

            new_path = Path(key).expanduser().resolve()
            mapper = PathMapper()
            old_path = mapper.index_root

            if not old_path.exists():
                if json_mode:
                    print_json(success=False, error="No indexes to migrate")
                else:
                    console.print("[yellow]No indexes to migrate.[/yellow]")
                return

            # Create new directory
            new_path.mkdir(parents=True, exist_ok=True)

            # Count items to migrate
            items = list(old_path.iterdir())
            migrated = 0

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TextColumn("{task.completed}/{task.total}"),
                TimeElapsedColumn(),
                console=console,
            ) as progress:
                task = progress.add_task("Migrating indexes", total=len(items))

                for item in items:
                    dest = new_path / item.name
                    if item.is_dir():
                        shutil.copytree(item, dest, dirs_exist_ok=True)
                    else:
                        shutil.copy2(item, dest)
                    migrated += 1
                    progress.advance(task)

            # Update config
            cfg = load_config()
            cfg["index_dir"] = str(new_path)
            save_config(cfg)

            # Update registry paths
            registry = RegistryStore()
            registry.initialize()
            registry.update_index_paths(old_path, new_path)
            registry.close()

            result = {
                "migrated_from": str(old_path),
                "migrated_to": str(new_path),
                "items_migrated": migrated,
            }

            if json_mode:
                print_json(success=True, result=result)
            else:
                console.print(f"[green]Migrated {migrated} items to:[/green] {new_path}")
                console.print("[dim]Old indexes can be manually deleted after verifying migration.[/dim]")

        else:
            raise typer.BadParameter(f"Unknown action: {action}. Use show, set, or migrate.")

    except typer.BadParameter:
        raise
    except ConfigError as exc:
        if json_mode:
            print_json(success=False, error=f"Configuration error: {exc}")
        else:
            console.print(f"[red]Config command failed (config):[/red] {exc}")
        raise typer.Exit(code=1)
    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Config command failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except PermissionError as exc:
        if json_mode:
            print_json(success=False, error=f"Permission denied: {exc}")
        else:
            console.print(f"[red]Config command failed (permission denied):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Config command failed:[/red] {exc}")
        raise typer.Exit(code=1)
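
# Illustrative invocations (example values and paths):
#   codexlens config show
#   codexlens config set reranker_enabled true
#   codexlens config migrate /data/codexlens/indexes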


@app.command()
def migrate(
    path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to migrate."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Migrate project indexes to latest schema (Dual-FTS upgrade).

    Upgrades all _index.db files in the project to schema version 4, which includes:
    - Dual FTS tables (exact + fuzzy)
    - Encoding detection support
    - Incremental indexing metadata

    This is a safe operation that preserves all existing data.
    Progress is shown during migration.
    """
    _configure_logging(verbose, json_mode)
    base_path = path.expanduser().resolve()

    registry: RegistryStore | None = None
    try:
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        # Find project
        project_info = registry.get_project(base_path)
        if not project_info:
            raise CodexLensError(f"No index found for: {base_path}. Run 'codexlens init' first.")

        index_dir = mapper.source_to_index_dir(base_path)
        if not index_dir.exists():
            raise CodexLensError(f"Index directory not found: {index_dir}")

        # Find all _index.db files
        index_files = list(index_dir.rglob("_index.db"))

        if not index_files:
            if json_mode:
                print_json(success=True, result={"message": "No indexes to migrate", "migrated": 0})
            else:
                console.print("[yellow]No indexes found to migrate.[/yellow]")
            return

        migrated_count = 0
        error_count = 0
        already_migrated = 0

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TextColumn("({task.completed}/{task.total})"),
            TimeElapsedColumn(),
            console=console,
        ) as progress:
            task = progress.add_task(f"Migrating {len(index_files)} indexes...", total=len(index_files))

            for db_path in index_files:
                try:
                    store = DirIndexStore(db_path)

                    # Check current version
                    with store._lock:
                        conn = store._get_connection()
                        current_version = store._get_schema_version(conn)

                        if current_version >= DirIndexStore.SCHEMA_VERSION:
                            already_migrated += 1
                            if verbose:
                                progress.console.print(f"[dim]Already migrated: {db_path.parent.name}[/dim]")
                        elif current_version > 0:
                            # Apply migrations
                            store._apply_migrations(conn, current_version)
                            store._set_schema_version(conn, DirIndexStore.SCHEMA_VERSION)
                            conn.commit()
                            migrated_count += 1
                            if verbose:
                                progress.console.print(f"[green]Migrated: {db_path.parent.name} (v{current_version} → v{DirIndexStore.SCHEMA_VERSION})[/green]")
                        else:
                            # New database, initialize directly
                            store.initialize()
                            migrated_count += 1

                    store.close()

                except Exception as e:
                    error_count += 1
                    if verbose:
                        progress.console.print(f"[red]Error migrating {db_path}: {e}[/red]")

                progress.update(task, advance=1)

        result = {
            "path": str(base_path),
            "total_indexes": len(index_files),
            "migrated": migrated_count,
            "already_migrated": already_migrated,
            "errors": error_count,
        }

        if json_mode:
            print_json(success=True, result=result)
        else:
            console.print("[green]Migration complete:[/green]")
            console.print(f" Total indexes: {len(index_files)}")
            console.print(f" Migrated: {migrated_count}")
            console.print(f" Already up-to-date: {already_migrated}")
            if error_count > 0:
                console.print(f" [yellow]Errors: {error_count}[/yellow]")

    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Migration failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Migration failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()
|
|
|
|
|
|


@app.command()
def clean(
    path: Optional[Path] = typer.Argument(None, help="Project path to clean (removes project index)."),
    all_indexes: bool = typer.Option(False, "--all", "-a", help="Remove all indexes."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Remove CodexLens index data.

    Without arguments, shows current index size.
    With path, removes that project's indexes.
    With --all, removes all indexes (use with caution).
    """
    _configure_logging(verbose, json_mode)

    try:
        mapper = PathMapper()
        index_root = mapper.index_root

        if all_indexes:
            # Remove everything
            if not index_root.exists():
                if json_mode:
                    print_json(success=True, result={"cleaned": None, "message": "No indexes to clean"})
                else:
                    console.print("[yellow]No indexes to clean.[/yellow]")
                return

            # Calculate size before removal
            total_size = 0
            for f in index_root.rglob("*"):
                if f.is_file():
                    total_size += f.stat().st_size

            # Remove registry first
            registry_path = _get_registry_path()
            if registry_path.exists():
                registry_path.unlink()

            # Remove all indexes
            shutil.rmtree(index_root)

            result = {
                "cleaned": str(index_root),
                "size_freed_mb": round(total_size / (1024 * 1024), 2),
            }

            if json_mode:
                print_json(success=True, result=result)
            else:
                console.print(f"[green]Removed all indexes:[/green] {result['size_freed_mb']} MB freed")

        elif path:
            # Remove specific project
            project_path = path.expanduser().resolve()
            project_index = mapper.source_to_index_dir(project_path)

            if not project_index.exists():
                if json_mode:
                    print_json(success=False, error=f"No index found for: {project_path}")
                else:
                    console.print(f"[yellow]No index found for:[/yellow] {project_path}")
                return

            # Calculate size
            total_size = 0
            for f in project_index.rglob("*"):
                if f.is_file():
                    total_size += f.stat().st_size

            # Remove from registry
            registry = RegistryStore()
            registry.initialize()
            registry.unregister_project(project_path)
            registry.close()

            # Remove indexes
            shutil.rmtree(project_index)

            result = {
                "cleaned": str(project_path),
                "index_path": str(project_index),
                "size_freed_mb": round(total_size / (1024 * 1024), 2),
            }

            if json_mode:
                print_json(success=True, result=result)
            else:
                console.print(f"[green]Removed indexes for:[/green] {project_path}")
                console.print(f"  Freed: {result['size_freed_mb']} MB")

        else:
            # Show current status
            if not index_root.exists():
                if json_mode:
                    print_json(success=True, result={"index_root": str(index_root), "exists": False})
                else:
                    console.print("[yellow]No indexes found.[/yellow]")
                return

            total_size = 0
            for f in index_root.rglob("*"):
                if f.is_file():
                    total_size += f.stat().st_size

            registry = RegistryStore()
            registry.initialize()
            projects = registry.list_projects()
            registry.close()

            result = {
                "index_root": str(index_root),
                "projects_count": len(projects),
                "total_size_mb": round(total_size / (1024 * 1024), 2),
            }

            if json_mode:
                print_json(success=True, result=result)
            else:
                console.print("[bold]Index Status[/bold]")
                console.print(f"  Location: {result['index_root']}")
                console.print(f"  Projects: {result['projects_count']}")
                console.print(f"  Total Size: {result['total_size_mb']} MB")
                console.print("\n[dim]Use 'clean <path>' to remove a specific project or 'clean --all' to remove everything.[/dim]")

    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Clean failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except PermissionError as exc:
        if json_mode:
            print_json(success=False, error=f"Permission denied: {exc}")
        else:
            console.print(f"[red]Clean failed (permission denied):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Clean failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command("semantic-list")
|
|
def semantic_list(
|
|
path: Path = typer.Option(Path("."), "--path", "-p", help="Project path to list metadata from."),
|
|
offset: int = typer.Option(0, "--offset", "-o", min=0, help="Number of records to skip."),
|
|
limit: int = typer.Option(50, "--limit", "-n", min=1, max=100, help="Maximum records to return."),
|
|
tool_filter: Optional[str] = typer.Option(None, "--tool", "-t", help="Filter by LLM tool (gemini/qwen)."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
|
|
) -> None:
|
|
"""List semantic metadata entries for indexed files.
|
|
|
|
Shows files that have LLM-generated summaries and keywords.
|
|
Results are aggregated from all index databases in the project.
|
|
"""
|
|
_configure_logging(verbose, json_mode)
|
|
base_path = path.expanduser().resolve()
|
|
|
|
registry: Optional[RegistryStore] = None
|
|
try:
|
|
registry = RegistryStore()
|
|
registry.initialize()
|
|
mapper = PathMapper()
|
|
|
|
project_info = registry.get_project(base_path)
|
|
if not project_info:
|
|
raise CodexLensError(f"No index found for: {base_path}. Run 'codex-lens init' first.")
|
|
|
|
index_dir = Path(project_info.index_root)
|
|
if not index_dir.exists():
|
|
raise CodexLensError(f"Index directory not found: {index_dir}")
|
|
|
|
all_results: list = []
|
|
total_count = 0
|
|
|
|
index_files = sorted(index_dir.rglob("_index.db"))
|
|
|
|
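        # Each directory index is queried with a wide window (offset=0,
        # limit=1000) and pagination is applied only after the global sort
        # below, so ordering by generated_at stays consistent across
        # databases. Entries beyond a single database's 1000-record window
        # are not included.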
        for db_path in index_files:
            try:
                store = DirIndexStore(db_path)
                store.initialize()

                results, count = store.list_semantic_metadata(
                    offset=0,
                    limit=1000,
                    llm_tool=tool_filter,
                )

                source_dir = mapper.index_to_source(db_path.parent)
                for r in results:
                    r["source_dir"] = str(source_dir)

                all_results.extend(results)
                total_count += count

                store.close()
            except Exception as e:
                if verbose:
                    console.print(f"[yellow]Warning: Error reading {db_path}: {e}[/yellow]")

        all_results.sort(key=lambda x: x["generated_at"], reverse=True)
        paginated = all_results[offset : offset + limit]

        result = {
            "path": str(base_path),
            "total": total_count,
            "offset": offset,
            "limit": limit,
            "count": len(paginated),
            "entries": paginated,
        }

        if json_mode:
            print_json(success=True, result=result)
        else:
            if not paginated:
                console.print("[yellow]No semantic metadata found.[/yellow]")
                console.print("Run 'codex-lens enhance' to generate metadata for indexed files.")
            else:
                table = Table(title=f"Semantic Metadata ({total_count} total)")
                table.add_column("File", style="cyan", max_width=40)
                table.add_column("Language", style="dim")
                table.add_column("Purpose", max_width=30)
                table.add_column("Keywords", max_width=25)
                table.add_column("Tool")

                for entry in paginated:
                    keywords_str = ", ".join(entry["keywords"][:3])
                    if len(entry["keywords"]) > 3:
                        keywords_str += f" (+{len(entry['keywords']) - 3})"

                    table.add_row(
                        entry["file_name"],
                        entry["language"] or "-",
                        (entry["purpose"] or "-")[:30],
                        keywords_str or "-",
                        entry["llm_tool"] or "-",
                    )

                console.print(table)

                if total_count > len(paginated):
                    console.print(
                        f"[dim]Showing {offset + 1}-{offset + len(paginated)} of {total_count}. "
                        "Use --offset and --limit for pagination.[/dim]"
                    )

    except StorageError as exc:
        if json_mode:
            print_json(success=False, error=f"Storage error: {exc}")
        else:
            console.print(f"[red]Semantic-list failed (storage):[/red] {exc}")
        raise typer.Exit(code=1)
    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Semantic-list failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if registry is not None:
            registry.close()


# ==================== Model Management Commands ====================

@app.command(name="model-list")
def model_list(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """List available embedding models and their installation status.

    Shows 4 model profiles (fast, code, multilingual, balanced) with:
    - Installation status
    - Model size and dimensions
    - Use case recommendations
    """
    try:
        from codexlens.cli.model_manager import list_models

        result = list_models()

        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)

            data = result["result"]
            models = data["models"]
            cache_dir = data["cache_dir"]
            cache_exists = data["cache_exists"]

            console.print("[bold]Available Embedding Models:[/bold]")
            console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n")

            table = Table(show_header=True, header_style="bold")
            table.add_column("Profile", style="cyan")
            table.add_column("Model Name", style="blue")
            table.add_column("Dims", justify="right")
            table.add_column("Size (MB)", justify="right")
            table.add_column("Status", justify="center")
            table.add_column("Use Case", style="dim")

            for model in models:
                status_icon = "[green]✓[/green]" if model["installed"] else "[dim]—[/dim]"
                size_display = (
                    f"{model['actual_size_mb']:.1f}" if model["installed"]
                    else f"~{model['estimated_size_mb']}"
                )
                table.add_row(
                    model["profile"],
                    model["model_name"],
                    str(model["dimensions"]),
                    size_display,
                    status_icon,
                    (model["use_case"][:40] + "...") if len(model["use_case"]) > 40 else model["use_case"],
                )

            console.print(table)
            console.print("\n[dim]Use 'codexlens model-download <profile>' to download a model[/dim]")

    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
@app.command(name="model-download")
|
|
def model_download(
|
|
profile: str = typer.Argument(..., help="Model profile to download (fast, code, multilingual, balanced)."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Download an embedding model by profile name.
|
|
|
|
Example:
|
|
codexlens model-download code # Download code-optimized model
|
|
"""
|
|
try:
|
|
from codexlens.cli.model_manager import download_model
|
|
|
|
if not json_mode:
|
|
console.print(f"[bold]Downloading model:[/bold] {profile}")
|
|
console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n")
|
|
|
|
# Create progress callback for non-JSON mode
|
|
progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]")
|
|
|
|
result = download_model(profile, progress_callback=progress_callback)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
console.print(f"[green]✓[/green] Model downloaded successfully!")
|
|
console.print(f" Profile: {data['profile']}")
|
|
console.print(f" Model: {data['model_name']}")
|
|
console.print(f" Cache size: {data['cache_size_mb']:.1f} MB")
|
|
console.print(f" Location: [dim]{data['cache_path']}[/dim]")
|
|
|
|
except ImportError:
|
|
if json_mode:
|
|
print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
|
|
else:
|
|
console.print("[red]Error:[/red] fastembed not installed")
|
|
console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
|
|
raise typer.Exit(code=1)
|
|
|
|
|
|
@app.command(name="model-delete")
|
|
def model_delete(
|
|
profile: str = typer.Argument(..., help="Model profile to delete (fast, code, multilingual, balanced)."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Delete a downloaded embedding model from cache.
|
|
|
|
Example:
|
|
codexlens model-delete fast # Delete fast model
|
|
"""
|
|
from codexlens.cli.model_manager import delete_model
|
|
|
|
if not json_mode:
|
|
console.print(f"[bold yellow]Deleting model:[/bold yellow] {profile}")
|
|
|
|
result = delete_model(profile)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
console.print(f"[green]✓[/green] Model deleted successfully!")
|
|
console.print(f" Profile: {data['profile']}")
|
|
console.print(f" Model: {data['model_name']}")
|
|
console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB")
|
|
|
|
|
|
@app.command(name="model-download-custom")
|
|
def model_download_custom(
|
|
model_name: str = typer.Argument(..., help="Full HuggingFace model name (e.g., BAAI/bge-small-en-v1.5)."),
|
|
model_type: str = typer.Option("embedding", "--type", help="Model type: embedding or reranker."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Download a custom HuggingFace model by name.
|
|
|
|
This allows downloading any fastembed-compatible model from HuggingFace.
|
|
|
|
Example:
|
|
codexlens model-download-custom BAAI/bge-small-en-v1.5
|
|
codexlens model-download-custom BAAI/bge-reranker-base --type reranker
|
|
"""
|
|
try:
|
|
from codexlens.cli.model_manager import download_custom_model
|
|
|
|
if not json_mode:
|
|
console.print(f"[bold]Downloading custom model:[/bold] {model_name}")
|
|
console.print(f"[dim]Model type: {model_type}[/dim]")
|
|
console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n")
|
|
|
|
progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]")
|
|
|
|
result = download_custom_model(model_name, model_type=model_type, progress_callback=progress_callback)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
console.print(f"[green]✓[/green] Custom model downloaded successfully!")
|
|
console.print(f" Model: {data['model_name']}")
|
|
console.print(f" Type: {data['model_type']}")
|
|
console.print(f" Cache size: {data['cache_size_mb']:.1f} MB")
|
|
console.print(f" Location: [dim]{data['cache_path']}[/dim]")
|
|
|
|
except ImportError:
|
|
if json_mode:
|
|
print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
|
|
else:
|
|
console.print("[red]Error:[/red] fastembed not installed")
|
|
console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
|
|
raise typer.Exit(code=1)
|
|
|
|
|
|
@app.command(name="model-info")
|
|
def model_info(
|
|
profile: str = typer.Argument(..., help="Model profile to get info (fast, code, multilingual, balanced)."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Get detailed information about a model profile.
|
|
|
|
Example:
|
|
codexlens model-info code # Get code model details
|
|
"""
|
|
from codexlens.cli.model_manager import get_model_info
|
|
|
|
result = get_model_info(profile)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
console.print(f"[bold]Model Profile:[/bold] {data['profile']}")
|
|
console.print(f" Model name: {data['model_name']}")
|
|
console.print(f" Dimensions: {data['dimensions']}")
|
|
console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}")
|
|
if data['installed'] and data['actual_size_mb']:
|
|
console.print(f" Cache size: {data['actual_size_mb']:.1f} MB")
|
|
console.print(f" Location: [dim]{data['cache_path']}[/dim]")
|
|
else:
|
|
console.print(f" Estimated size: ~{data['estimated_size_mb']} MB")
|
|
console.print(f"\n Description: {data['description']}")
|
|
console.print(f" Use case: {data['use_case']}")
|
|
|
|
|
|


# ==================== Reranker Model Management Commands ====================


@app.command(name="reranker-model-list")
def reranker_model_list(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """List available reranker models and their installation status.

    Shows reranker model profiles with:
    - Installation status
    - Model size
    - Use case recommendations
    """
    try:
        from codexlens.cli.model_manager import list_reranker_models

        result = list_reranker_models()

        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)

            data = result["result"]
            models = data["models"]
            cache_dir = data["cache_dir"]
            cache_exists = data["cache_exists"]

            console.print("[bold]Available Reranker Models:[/bold]")
            console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n")

            table = Table(show_header=True, header_style="bold")
            table.add_column("Profile", style="cyan")
            table.add_column("Model", style="dim")
            table.add_column("Size", justify="right")
            table.add_column("Status")
            table.add_column("Description")

            for m in models:
                status = "[green]✓ Installed[/green]" if m["installed"] else "[dim]Not installed[/dim]"
                size = f"{m['actual_size_mb']:.1f} MB" if m["installed"] and m["actual_size_mb"] else f"~{m['estimated_size_mb']} MB"
                rec = " [yellow]★[/yellow]" if m.get("recommended") else ""
                table.add_row(m["profile"] + rec, m["model_name"], size, status, m["description"])

            console.print(table)
            console.print("\n[yellow]★[/yellow] = Recommended")

    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed reranker not available. Install with: pip install fastembed>=0.4.0")
        else:
            console.print("[red]Error:[/red] fastembed reranker not available")
            console.print("Install with: [cyan]pip install fastembed>=0.4.0[/cyan]")
        raise typer.Exit(code=1)
@app.command(name="reranker-model-download")
|
|
def reranker_model_download(
|
|
profile: str = typer.Argument(..., help="Reranker model profile to download."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Download a reranker model by profile name.
|
|
|
|
Example:
|
|
codexlens reranker-model-download ms-marco-mini # Download default reranker
|
|
"""
|
|
try:
|
|
from codexlens.cli.model_manager import download_reranker_model
|
|
|
|
if not json_mode:
|
|
console.print(f"[bold]Downloading reranker model:[/bold] {profile}")
|
|
console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n")
|
|
|
|
progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]")
|
|
|
|
result = download_reranker_model(profile, progress_callback=progress_callback)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
console.print(f"[green]✓[/green] Reranker model downloaded successfully!")
|
|
console.print(f" Profile: {data['profile']}")
|
|
console.print(f" Model: {data['model_name']}")
|
|
console.print(f" Cache size: {data['cache_size_mb']:.1f} MB")
|
|
console.print(f" Location: [dim]{data['cache_path']}[/dim]")
|
|
|
|
except ImportError:
|
|
if json_mode:
|
|
print_json(success=False, error="fastembed reranker not available. Install with: pip install fastembed>=0.4.0")
|
|
else:
|
|
console.print("[red]Error:[/red] fastembed reranker not available")
|
|
console.print("Install with: [cyan]pip install fastembed>=0.4.0[/cyan]")
|
|
raise typer.Exit(code=1)
|
|
|
|
|
|
@app.command(name="reranker-model-delete")
|
|
def reranker_model_delete(
|
|
profile: str = typer.Argument(..., help="Reranker model profile to delete."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Delete a downloaded reranker model from cache.
|
|
|
|
Example:
|
|
codexlens reranker-model-delete ms-marco-mini # Delete reranker model
|
|
"""
|
|
from codexlens.cli.model_manager import delete_reranker_model
|
|
|
|
if not json_mode:
|
|
console.print(f"[bold yellow]Deleting reranker model:[/bold yellow] {profile}")
|
|
|
|
result = delete_reranker_model(profile)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
console.print(f"[green]✓[/green] Reranker model deleted successfully!")
|
|
console.print(f" Profile: {data['profile']}")
|
|
console.print(f" Model: {data['model_name']}")
|
|
console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB")
|
|
|
|
|
|
@app.command(name="reranker-model-info")
|
|
def reranker_model_info(
|
|
profile: str = typer.Argument(..., help="Reranker model profile to get info."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Get detailed information about a reranker model profile.
|
|
|
|
Example:
|
|
codexlens reranker-model-info ms-marco-mini # Get reranker model details
|
|
"""
|
|
from codexlens.cli.model_manager import get_reranker_model_info
|
|
|
|
result = get_reranker_model_info(profile)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
console.print(f"[bold]Reranker Model Profile:[/bold] {data['profile']}")
|
|
console.print(f" Model name: {data['model_name']}")
|
|
console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}")
|
|
if data['installed'] and data['actual_size_mb']:
|
|
console.print(f" Cache size: {data['actual_size_mb']:.1f} MB")
|
|
console.print(f" Location: [dim]{data['cache_path']}[/dim]")
|
|
else:
|
|
console.print(f" Estimated size: ~{data['estimated_size_mb']} MB")
|
|
console.print(f" Recommended: {'[green]Yes[/green]' if data.get('recommended') else '[dim]No[/dim]'}")
|
|
console.print(f"\n Description: {data['description']}")
|
|
console.print(f" Use case: {data['use_case']}")
|
|
|
|
|
|


# ==================== Embedding Management Commands ====================

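# Deprecated command aliases stay registered (hidden=True, deprecated=True) so
# existing scripts keep working while _deprecated_command_warning points users
# at the reorganized 'codexlens index ...' commands.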
@app.command(name="embeddings-status", hidden=True, deprecated=True)
|
|
def embeddings_status(
|
|
path: Optional[Path] = typer.Argument(
|
|
None,
|
|
exists=True,
|
|
help="Path to specific _index.db file or directory containing indexes. If not specified, uses default index root.",
|
|
),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""[Deprecated] Use 'codexlens index status' instead.
|
|
|
|
Check embedding status for one or all indexes.
|
|
|
|
Shows embedding statistics including:
|
|
- Number of chunks generated
|
|
- File coverage percentage
|
|
- Files missing embeddings
|
|
|
|
Examples:
|
|
codexlens embeddings-status # Check all indexes
|
|
codexlens embeddings-status ~/.codexlens/indexes/project/_index.db # Check specific index
|
|
codexlens embeddings-status ~/projects/my-app # Check project (auto-finds index)
|
|
"""
|
|
_deprecated_command_warning("embeddings-status", "index status")
|
|
from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary
|
|
|
|
# Determine what to check
|
|
if path is None:
|
|
# Check all indexes in default root
|
|
index_root = _get_index_root()
|
|
result = get_embedding_stats_summary(index_root)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
total = data["total_indexes"]
|
|
with_emb = data["indexes_with_embeddings"]
|
|
total_chunks = data["total_chunks"]
|
|
|
|
console.print(f"[bold]Embedding Status Summary[/bold]")
|
|
console.print(f"Index root: [dim]{index_root}[/dim]\n")
|
|
console.print(f"Total indexes: {total}")
|
|
console.print(f"Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}")
|
|
console.print(f"Total chunks: {total_chunks:,}\n")
|
|
|
|
if data["indexes"]:
|
|
table = Table(show_header=True, header_style="bold")
|
|
table.add_column("Project", style="cyan")
|
|
table.add_column("Files", justify="right")
|
|
table.add_column("Chunks", justify="right")
|
|
table.add_column("Coverage", justify="right")
|
|
table.add_column("Status", justify="center")
|
|
|
|
for idx_stat in data["indexes"]:
|
|
status_icon = "[green]✓[/green]" if idx_stat["has_embeddings"] else "[dim]—[/dim]"
|
|
coverage = f"{idx_stat['coverage_percent']:.1f}%" if idx_stat["has_embeddings"] else "—"
|
|
|
|
table.add_row(
|
|
idx_stat["project"],
|
|
str(idx_stat["total_files"]),
|
|
f"{idx_stat['total_chunks']:,}" if idx_stat["has_embeddings"] else "0",
|
|
coverage,
|
|
status_icon,
|
|
)
|
|
|
|
console.print(table)
|
|
|
|
else:
|
|
# Check specific index or find index for project
|
|
target_path = path.expanduser().resolve()
|
|
|
|
if target_path.is_file() and target_path.name == "_index.db":
|
|
# Direct index file
|
|
index_path = target_path
|
|
elif target_path.is_dir():
|
|
# Try to find index for this project
|
|
registry = RegistryStore()
|
|
try:
|
|
registry.initialize()
|
|
mapper = PathMapper()
|
|
index_path = mapper.source_to_index_db(target_path)
|
|
|
|
if not index_path.exists():
|
|
console.print(f"[red]Error:[/red] No index found for {target_path}")
|
|
console.print("Run 'codexlens init' first to create an index")
|
|
raise typer.Exit(code=1)
|
|
finally:
|
|
registry.close()
|
|
else:
|
|
console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
|
|
raise typer.Exit(code=1)
|
|
|
|
result = check_index_embeddings(index_path)
|
|
|
|
if json_mode:
|
|
print_json(**result)
|
|
else:
|
|
if not result["success"]:
|
|
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
|
raise typer.Exit(code=1)
|
|
|
|
data = result["result"]
|
|
has_emb = data["has_embeddings"]
|
|
|
|
console.print(f"[bold]Embedding Status[/bold]")
|
|
console.print(f"Index: [dim]{data['index_path']}[/dim]\n")
|
|
|
|
if has_emb:
|
|
console.print(f"[green]✓[/green] Embeddings available")
|
|
console.print(f" Total chunks: {data['total_chunks']:,}")
|
|
console.print(f" Total files: {data['total_files']:,}")
|
|
console.print(f" Files with embeddings: {data['files_with_chunks']:,}/{data['total_files']}")
|
|
console.print(f" Coverage: {data['coverage_percent']:.1f}%")
|
|
|
|
if data["files_without_chunks"] > 0:
|
|
console.print(f"\n[yellow]Warning:[/yellow] {data['files_without_chunks']} files missing embeddings")
|
|
if data["missing_files_sample"]:
|
|
console.print(" Sample missing files:")
|
|
for file in data["missing_files_sample"]:
|
|
console.print(f" [dim]{file}[/dim]")
|
|
else:
|
|
console.print(f"[yellow]—[/yellow] No embeddings found")
|
|
console.print(f" Total files indexed: {data['total_files']:,}")
|
|
console.print("\n[dim]Generate embeddings with:[/dim]")
|
|
console.print(f" [cyan]codexlens embeddings-generate {index_path}[/cyan]")
|
|
|
|
|
|
@index_app.command("embeddings")
|
|
def index_embeddings(
|
|
path: Path = typer.Argument(
|
|
...,
|
|
exists=True,
|
|
help="Path to _index.db file or project directory.",
|
|
),
|
|
backend: str = typer.Option(
|
|
"fastembed",
|
|
"--backend",
|
|
"-b",
|
|
help="Embedding backend: fastembed (local) or litellm (remote API).",
|
|
),
|
|
model: str = typer.Option(
|
|
"code",
|
|
"--model",
|
|
"-m",
|
|
help="Model: profile name for fastembed (fast/code/multilingual/balanced) or model name for litellm (e.g. text-embedding-3-small).",
|
|
),
|
|
force: bool = typer.Option(
|
|
False,
|
|
"--force",
|
|
"-f",
|
|
help="Force regeneration even if embeddings exist.",
|
|
),
|
|
chunk_size: int = typer.Option(
|
|
2000,
|
|
"--chunk-size",
|
|
help="Maximum chunk size in characters.",
|
|
),
|
|
max_workers: int = typer.Option(
|
|
1,
|
|
"--max-workers",
|
|
"-w",
|
|
min=1,
|
|
help="Max concurrent API calls. Recommended: 4-8 for litellm backend. Default: 1 (sequential).",
|
|
),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
|
|
centralized: bool = typer.Option(
|
|
True,
|
|
"--centralized/--distributed",
|
|
"-c/-d",
|
|
help="Use centralized vector storage (default) or distributed per-directory indexes.",
|
|
),
|
|
) -> None:
|
|
"""Generate semantic embeddings for code search.
|
|
|
|
Creates vector embeddings for all files in an index to enable
|
|
semantic search capabilities. Embeddings are stored in the same
|
|
database as the FTS index.
|
|
|
|
Storage Modes:
|
|
- Default: Per-directory HNSW indexes alongside _index.db files
|
|
- Centralized: Single HNSW index at project root (_vectors.hnsw)
|
|
|
|
Embedding Backend Options:
|
|
- fastembed: Local ONNX-based embeddings (default, no API calls)
|
|
- litellm: Remote API embeddings via ccw-litellm (requires API keys)
|
|
|
|
Model Options:
|
|
For fastembed backend (profiles):
|
|
- fast: BAAI/bge-small-en-v1.5 (384 dims, ~80MB)
|
|
- code: jinaai/jina-embeddings-v2-base-code (768 dims, ~150MB) [recommended]
|
|
- multilingual: intfloat/multilingual-e5-large (1024 dims, ~1GB)
|
|
- balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dims, ~600MB)
|
|
|
|
For litellm backend (model names):
|
|
- text-embedding-3-small, text-embedding-3-large (OpenAI)
|
|
- text-embedding-ada-002 (OpenAI legacy)
|
|
- Any model supported by ccw-litellm
|
|
|
|
Examples:
|
|
codexlens index embeddings ~/projects/my-app # Auto-find index (fastembed, code profile)
|
|
codexlens index embeddings ~/.codexlens/indexes/project/_index.db # Specific index
|
|
codexlens index embeddings ~/projects/my-app --backend litellm --model text-embedding-3-small # Use LiteLLM
|
|
codexlens index embeddings ~/projects/my-app --model fast --force # Regenerate with fast profile
|
|
codexlens index embeddings ~/projects/my-app --centralized # Centralized vector storage
|
|
"""
|
|
_configure_logging(verbose, json_mode)
|
|
|
|
from codexlens.cli.embedding_manager import (
|
|
generate_embeddings,
|
|
generate_dense_embeddings_centralized,
|
|
scan_for_model_conflicts,
|
|
check_global_model_lock,
|
|
set_locked_model_config,
|
|
)
|
|
|
|
# Validate backend
|
|
valid_backends = ["fastembed", "litellm"]
|
|
if backend not in valid_backends:
|
|
error_msg = f"Invalid backend: {backend}. Must be one of: {', '.join(valid_backends)}"
|
|
if json_mode:
|
|
print_json(success=False, error=error_msg)
|
|
else:
|
|
console.print(f"[red]Error:[/red] {error_msg}")
|
|
console.print(f"[dim]Valid backends: {', '.join(valid_backends)}[/dim]")
|
|
raise typer.Exit(code=1)
|
|
|
|
# Resolve path
|
|
target_path = path.expanduser().resolve()
|
|
|
|
# Determine index path or root for centralized mode
|
|
index_path = None
|
|
index_root = None
|
|
|
|
if target_path.is_file() and target_path.name == "_index.db":
|
|
# Direct index file
|
|
index_path = target_path
|
|
index_root = target_path.parent
|
|
elif target_path.is_dir():
|
|
# Directory: Find index location from registry
|
|
registry = RegistryStore()
|
|
try:
|
|
registry.initialize()
|
|
mapper = PathMapper()
|
|
index_path = mapper.source_to_index_db(target_path)
|
|
|
|
if not index_path.exists():
|
|
console.print(f"[red]Error:[/red] No index found for {target_path}")
|
|
console.print("Run 'codexlens init' first to create an index")
|
|
raise typer.Exit(code=1)
|
|
index_root = index_path.parent # Use index directory for both modes
|
|
finally:
|
|
registry.close()
|
|
else:
|
|
console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
|
|
raise typer.Exit(code=1)
|
|
|
|
# Progress callback
|
|
def progress_update(msg: str):
|
|
if not json_mode and verbose:
|
|
console.print(f" {msg}")
|
|
|
|
console.print(f"[bold]Generating embeddings[/bold]")
|
|
if centralized:
|
|
effective_root = index_root if index_root else (index_path.parent if index_path else target_path)
|
|
console.print(f"Index root: [dim]{effective_root}[/dim]")
|
|
console.print(f"Mode: [green]Centralized[/green]")
|
|
else:
|
|
console.print(f"Index: [dim]{index_path}[/dim]")
|
|
console.print(f"Backend: [cyan]{backend}[/cyan]")
|
|
console.print(f"Model: [cyan]{model}[/cyan]")
|
|
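        # Concurrency mainly benefits the network-bound litellm backend (see
        # the --max-workers help); the default of 1 keeps runs sequential.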
        if max_workers > 1:
            console.print(f"Concurrency: [cyan]{max_workers} workers[/cyan]")
        console.print()

    # Check global model lock (prevents mixing different models)
    if not force:
        lock_result = check_global_model_lock(backend, model)
        if lock_result["has_conflict"]:
            locked = lock_result["locked_config"]
            if json_mode:
                print_json(
                    success=False,
                    error="Global model lock conflict",
                    code="MODEL_LOCKED",
                    locked_config=locked,
                    target_config=lock_result["target_config"],
                    hint="Use --force to override the lock and switch to a different model (will regenerate all embeddings)",
                )
                raise typer.Exit(code=1)
            else:
                console.print("[red]⛔ Global Model Lock Active[/red]")
                console.print(f"  Locked model: [cyan]{locked['backend']}/{locked['model']}[/cyan]")
                console.print(f"  Requested: [yellow]{backend}/{model}[/yellow]")
                console.print(f"  Locked at: {locked.get('locked_at', 'unknown')}")
                console.print()
                console.print("[dim]All indexes must use the same embedding model.[/dim]")
                console.print("[dim]Use --force to switch models (will regenerate all embeddings).[/dim]")
                raise typer.Exit(code=1)

    # Pre-check for model conflicts (only if not forcing)
    if not force:
        # Determine the index root for conflict scanning
        scan_root = index_root if index_root else (index_path.parent if index_path else None)

        if scan_root:
            conflict_result = scan_for_model_conflicts(scan_root, backend, model)

            if conflict_result["has_conflict"]:
                existing = conflict_result["existing_config"]
                conflict_count = len(conflict_result["conflicts"])

                if json_mode:
                    # JSON mode: return structured error for UI handling
                    print_json(
                        success=False,
                        error="Model conflict detected",
                        code="MODEL_CONFLICT",
                        existing_config=existing,
                        target_config=conflict_result["target_config"],
                        conflict_count=conflict_count,
                        conflicts=conflict_result["conflicts"][:5],  # Show first 5 conflicts
                        hint="Use --force to overwrite existing embeddings with the new model",
                    )
                    raise typer.Exit(code=1)
                else:
                    # Interactive mode: show warning and ask for confirmation
                    console.print("[yellow]⚠ Model Conflict Detected[/yellow]")
                    console.print(f"  Existing: [red]{existing['backend']}/{existing['model']}[/red] ({existing.get('embedding_dim', '?')} dim)")
                    console.print(f"  Requested: [green]{backend}/{model}[/green]")
                    console.print(f"  Affected indexes: [yellow]{conflict_count}[/yellow]")
                    console.print()
                    console.print("[dim]Mixing different embedding models in the same index is not supported.[/dim]")
                    console.print("[dim]Overwriting will delete all existing embeddings and regenerate with the new model.[/dim]")
                    console.print()

                    # Ask for confirmation
                    if typer.confirm("Overwrite existing embeddings with the new model?", default=False):
                        force = True
                        console.print("[green]Confirmed.[/green] Proceeding with overwrite...\n")
                    else:
                        console.print("[yellow]Cancelled.[/yellow] Use --force to skip this prompt.")
                        raise typer.Exit(code=0)
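    # Dispatch on storage mode: centralized builds one HNSW index at the
    # project root, while distributed writes per-directory vectors alongside
    # each _index.db.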
    if centralized:
        # Centralized mode: single HNSW index at project root
        if not index_root:
            index_root = index_path.parent if index_path else target_path
        result = generate_dense_embeddings_centralized(
            index_root,
            embedding_backend=backend,
            model_profile=model,
            force=force,
            chunk_size=chunk_size,
            progress_callback=progress_update,
            max_workers=max_workers,
        )
    else:
        result = generate_embeddings(
            index_path,
            embedding_backend=backend,
            model_profile=model,
            force=force,
            chunk_size=chunk_size,
            progress_callback=progress_update,
            max_workers=max_workers,
        )

    if json_mode:
        print_json(**result)
    else:
        if not result["success"]:
            error_msg = result.get("error", "Unknown error")
            console.print(f"[red]Error:[/red] {error_msg}")

            # Provide helpful hints
            if "already has" in error_msg:
                console.print("\n[dim]Use --force to regenerate existing embeddings[/dim]")
            elif "fastembed not available" in error_msg or "Semantic search not available" in error_msg:
                console.print("\n[dim]Install semantic dependencies:[/dim]")
                console.print("  [cyan]pip install codexlens[semantic][/cyan]")
            elif "ccw-litellm not available" in error_msg:
                console.print("\n[dim]Install LiteLLM backend dependencies:[/dim]")
                console.print("  [cyan]pip install ccw-litellm[/cyan]")

            raise typer.Exit(code=1)

        data = result["result"]

        # Set global model lock after successful generation
        # This prevents using different models for future indexes
        set_locked_model_config(backend, model)

        if centralized:
            # Centralized mode output
            elapsed = data.get("elapsed_time", 0)
            console.print("[green]✓[/green] Centralized embeddings generated successfully!")
            console.print(f"  Model: {data.get('model_name', model)}")
            console.print(f"  Chunks created: {data['chunks_created']:,}")
            console.print(f"  Files processed: {data['files_processed']}")
            if data.get("files_failed", 0) > 0:
                console.print(f"  [yellow]Files failed: {data['files_failed']}[/yellow]")
            console.print(f"  Central index: {data.get('central_index_path', 'N/A')}")
            console.print(f"  Time: {elapsed:.1f}s")
        else:
            # Single index mode output
            elapsed = data["elapsed_time"]

            console.print("[green]✓[/green] Embeddings generated successfully!")
            console.print(f"  Model: {data['model_name']}")
            console.print(f"  Chunks created: {data['chunks_created']:,}")
            console.print(f"  Files processed: {data['files_processed']}")

            if data["files_failed"] > 0:
                console.print(f"  [yellow]Files failed: {data['files_failed']}[/yellow]")
                if data["failed_files"]:
                    console.print("  [dim]First failures:[/dim]")
                    for file_path, error in data["failed_files"]:
                        console.print(f"    [dim]{file_path}: {error}[/dim]")

            console.print(f"  Time: {elapsed:.1f}s")

        console.print("\n[dim]Use vector search with:[/dim]")
        console.print("  [cyan]codexlens search 'your query' --mode pure-vector[/cyan]")


# ==================== GPU Management Commands ====================

@app.command(name="gpu-list")
def gpu_list(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """List available GPU devices for embedding acceleration.

    Shows all detected GPU devices with their capabilities and selection status.
    Discrete GPUs (NVIDIA, AMD) are automatically preferred over integrated GPUs.

    Examples:
        codexlens gpu-list          # List all GPUs
        codexlens gpu-list --json   # JSON output for scripting
    """
    from codexlens.semantic.gpu_support import get_gpu_devices, detect_gpu, get_selected_device_id

    gpu_info = detect_gpu()
    devices = get_gpu_devices()
    selected_id = get_selected_device_id()

    if json_mode:
        print_json(
            success=True,
            result={
                "devices": devices,
                "selected_device_id": selected_id,
                "gpu_available": gpu_info.gpu_available,
                "providers": gpu_info.onnx_providers,
            },
        )
    else:
        if not devices:
            console.print("[yellow]No GPU devices detected[/yellow]")
            console.print(f"ONNX Providers: [dim]{', '.join(gpu_info.onnx_providers)}[/dim]")
            return

        console.print("[bold]Available GPU Devices[/bold]\n")

        table = Table(show_header=True, header_style="bold")
        table.add_column("ID", justify="center")
        table.add_column("Name")
        table.add_column("Vendor", justify="center")
        table.add_column("Type", justify="center")
        table.add_column("Status", justify="center")

        for dev in devices:
            type_str = "[green]Discrete[/green]" if dev["is_discrete"] else "[dim]Integrated[/dim]"
            vendor_color = {
                "nvidia": "green",
                "amd": "red",
                "intel": "blue",
            }.get(dev["vendor"], "white")
            vendor_str = f"[{vendor_color}]{dev['vendor'].upper()}[/{vendor_color}]"

            status_parts = []
            if dev["is_preferred"]:
                status_parts.append("[cyan]Auto[/cyan]")
            if dev["is_selected"]:
                status_parts.append("[green]✓ Selected[/green]")

            status_str = " ".join(status_parts) if status_parts else "[dim]—[/dim]"

            table.add_row(
                str(dev["device_id"]),
                dev["name"],
                vendor_str,
                type_str,
                status_str,
            )

        console.print(table)
        console.print(f"\nONNX Providers: [dim]{', '.join(gpu_info.onnx_providers)}[/dim]")
        console.print("\n[dim]Select GPU with:[/dim]")
        console.print("  [cyan]codexlens gpu-select <device_id>[/cyan]")
@app.command(name="gpu-select")
|
|
def gpu_select(
|
|
device_id: int = typer.Argument(
|
|
...,
|
|
help="GPU device ID to use for embeddings. Use 'codexlens gpu-list' to see available IDs.",
|
|
),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Select a specific GPU device for embedding generation.
|
|
|
|
By default, CodexLens automatically selects the most powerful GPU (discrete over integrated).
|
|
Use this command to override the selection.
|
|
|
|
Examples:
|
|
codexlens gpu-select 1 # Use GPU device 1
|
|
codexlens gpu-select 0 --json # Select GPU 0 with JSON output
|
|
"""
|
|
from codexlens.semantic.gpu_support import set_selected_device_id, get_gpu_devices
|
|
from codexlens.semantic.embedder import clear_embedder_cache
|
|
|
|
devices = get_gpu_devices()
|
|
valid_ids = [dev["device_id"] for dev in devices]
|
|
|
|
if device_id not in valid_ids:
|
|
if json_mode:
|
|
print_json(success=False, error=f"Invalid device_id {device_id}. Valid IDs: {valid_ids}")
|
|
else:
|
|
console.print(f"[red]Error:[/red] Invalid device_id {device_id}")
|
|
console.print(f"Valid IDs: {valid_ids}")
|
|
console.print("\n[dim]Use 'codexlens gpu-list' to see available devices[/dim]")
|
|
raise typer.Exit(code=1)
|
|
|
|
success = set_selected_device_id(device_id)
|
|
|
|
if success:
|
|
# Clear embedder cache to force reload with new GPU
|
|
clear_embedder_cache()
|
|
|
|
device_name = next((dev["name"] for dev in devices if dev["device_id"] == device_id), "Unknown")
|
|
|
|
if json_mode:
|
|
print_json(
|
|
success=True,
|
|
result={
|
|
"device_id": device_id,
|
|
"device_name": device_name,
|
|
"message": f"GPU selection set to device {device_id}: {device_name}",
|
|
}
|
|
)
|
|
else:
|
|
console.print(f"[green]✓[/green] GPU selection updated")
|
|
console.print(f" Device ID: {device_id}")
|
|
console.print(f" Device: [cyan]{device_name}[/cyan]")
|
|
console.print("\n[dim]New embeddings will use this GPU[/dim]")
|
|
else:
|
|
if json_mode:
|
|
print_json(success=False, error="Failed to set GPU selection")
|
|
else:
|
|
console.print("[red]Error:[/red] Failed to set GPU selection")
|
|
raise typer.Exit(code=1)
|
|
|
|
|
|
@app.command(name="gpu-reset")
|
|
def gpu_reset(
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
) -> None:
|
|
"""Reset GPU selection to automatic detection.
|
|
|
|
Clears any manual GPU selection and returns to automatic selection
|
|
(discrete GPU preferred over integrated).
|
|
|
|
Examples:
|
|
codexlens gpu-reset # Reset to auto-detection
|
|
"""
|
|
from codexlens.semantic.gpu_support import set_selected_device_id, detect_gpu
|
|
from codexlens.semantic.embedder import clear_embedder_cache
|
|
|
|
set_selected_device_id(None)
|
|
clear_embedder_cache()
|
|
|
|
gpu_info = detect_gpu(force_refresh=True)
|
|
|
|
if json_mode:
|
|
print_json(
|
|
success=True,
|
|
result={
|
|
"message": "GPU selection reset to auto-detection",
|
|
"preferred_device_id": gpu_info.preferred_device_id,
|
|
"preferred_device_name": gpu_info.gpu_name,
|
|
}
|
|
)
|
|
else:
|
|
console.print("[green]✓[/green] GPU selection reset to auto-detection")
|
|
if gpu_info.preferred_device_id is not None:
|
|
console.print(f" Auto-selected device: {gpu_info.preferred_device_id}")
|
|
console.print(f" Device: [cyan]{gpu_info.gpu_name}[/cyan]")
|
|
|
|
|
|
|
|


# ==================== SPLADE Commands ====================

@index_app.command("splade")
def index_splade(
    path: Path = typer.Argument(..., help="Project path to index"),
    rebuild: bool = typer.Option(False, "--rebuild", "-r", help="Force rebuild SPLADE index"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
) -> None:
    """Generate SPLADE sparse index for an existing codebase.

    Encodes all semantic chunks with the SPLADE model and builds an inverted
    index for efficient sparse retrieval.

    This command discovers all _index.db files recursively in the project's
    index directory and builds SPLADE encodings for chunks across all of them.

    Examples:
        codexlens index splade ~/projects/my-app
        codexlens index splade . --rebuild
    """
    _configure_logging(verbose)

    from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available
    from codexlens.storage.splade_index import SpladeIndex
    from codexlens.semantic.vector_store import VectorStore

    # Check SPLADE availability
    ok, err = check_splade_available()
    if not ok:
        console.print(f"[red]SPLADE not available: {err}[/red]")
        console.print("[dim]Install with: pip install transformers torch[/dim]")
        raise typer.Exit(1)

    # Find index root directory
    target_path = path.expanduser().resolve()

    # Determine index root directory (containing _index.db files)
    if target_path.is_file() and target_path.name == "_index.db":
        index_root = target_path.parent
    elif target_path.is_dir():
        # Check for local .codexlens/_index.db
        local_index = target_path / ".codexlens" / "_index.db"
        if local_index.exists():
            index_root = local_index.parent
        else:
            # Try to find via registry
            registry = RegistryStore()
            try:
                registry.initialize()
                mapper = PathMapper()
                index_db = mapper.source_to_index_db(target_path)
                if not index_db.exists():
                    console.print(f"[red]Error:[/red] No index found for {target_path}")
                    console.print("Run 'codexlens init' first to create an index")
                    raise typer.Exit(1)
                index_root = index_db.parent
            finally:
                registry.close()
    else:
        console.print("[red]Error:[/red] Path must be _index.db file or indexed directory")
        raise typer.Exit(1)

    # Discover all _index.db files recursively
    all_index_dbs = sorted(index_root.rglob("_index.db"))
    if not all_index_dbs:
        console.print(f"[red]Error:[/red] No _index.db files found in {index_root}")
        raise typer.Exit(1)

    console.print(f"[blue]Discovered {len(all_index_dbs)} index databases[/blue]")

    # SPLADE index is stored alongside the root _index.db
    from codexlens.config import SPLADE_DB_NAME
    splade_db = index_root / SPLADE_DB_NAME

    if splade_db.exists() and not rebuild:
        console.print("[yellow]SPLADE index exists. Use --rebuild to regenerate.[/yellow]")
        return

    # If rebuilding, delete the existing SPLADE database
    if splade_db.exists() and rebuild:
        splade_db.unlink()

    # Collect all chunks from all distributed index databases.
    # Assign globally unique IDs to avoid collisions (each DB starts with ID 1).
    console.print(f"[blue]Loading chunks from {len(all_index_dbs)} distributed indexes...[/blue]")
    all_chunks = []  # (global_id, chunk, source_db_path) tuples
    total_files_checked = 0
    indexes_with_chunks = 0
    global_id = 0  # Sequential global ID across all databases
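    # Each chunk keeps its originating database path so SPLADE search results
    # can later be resolved back to the per-directory index they came from.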
    for index_db in all_index_dbs:
        total_files_checked += 1
        try:
            vector_store = VectorStore(index_db)
            chunks = vector_store.get_all_chunks()
            if chunks:
                indexes_with_chunks += 1
                # Assign sequential global IDs to avoid collisions
                for chunk in chunks:
                    global_id += 1
                    all_chunks.append((global_id, chunk, index_db))
                if verbose:
                    console.print(f"  [dim]{index_db.parent.name}: {len(chunks)} chunks[/dim]")
            vector_store.close()
        except Exception as e:
            if verbose:
                console.print(f"  [yellow]Warning: Failed to read {index_db}: {e}[/yellow]")

    if not all_chunks:
        console.print("[yellow]No chunks found in any index database[/yellow]")
        console.print(f"[dim]Checked {total_files_checked} index files, found 0 chunks[/dim]")
        console.print("[dim]Generate embeddings first with 'codexlens index embeddings'[/dim]")
        raise typer.Exit(1)

    console.print(f"[blue]Found {len(all_chunks)} chunks across {indexes_with_chunks} indexes[/blue]")
    console.print("[blue]Encoding with SPLADE...[/blue]")

    # Initialize SPLADE
    encoder = get_splade_encoder()
    splade_index = SpladeIndex(splade_db)
    splade_index.create_tables()

    # Encode in batches with progress bar
    chunk_metadata_batch = []
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("Encoding...", total=len(all_chunks))
        for global_id, chunk, source_db_path in all_chunks:
            sparse_vec = encoder.encode_text(chunk.content)
            splade_index.add_posting(global_id, sparse_vec)
            # Store chunk metadata for self-contained search.
            # Serialize metadata dict to JSON string.
            metadata_str = None
            if hasattr(chunk, 'metadata') and chunk.metadata:
                try:
                    metadata_str = json.dumps(chunk.metadata) if isinstance(chunk.metadata, dict) else chunk.metadata
                except Exception:
                    pass
            chunk_metadata_batch.append((
                global_id,
                chunk.file_path or "",
                chunk.content,
                metadata_str,
                str(source_db_path),
            ))
            progress.advance(task)

    # Batch insert chunk metadata
    if chunk_metadata_batch:
        splade_index.add_chunks_metadata_batch(chunk_metadata_batch)

    # Set metadata
    splade_index.set_metadata(
        model_name=encoder.model_name,
        vocab_size=encoder.vocab_size,
    )

    stats = splade_index.get_stats()
    console.print(f"[green]OK[/green] SPLADE index built: {stats['unique_chunks']} chunks, {stats['total_postings']} postings")
    console.print(f"  Source indexes: {indexes_with_chunks}")
    console.print(f"  Database: [dim]{splade_db}[/dim]")
@app.command("splade-status", hidden=True, deprecated=True)
|
|
def splade_status_command(
|
|
path: Path = typer.Argument(..., help="Project path"),
|
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
|
|
) -> None:
|
|
"""[Deprecated] Use 'codexlens index status' instead.
|
|
|
|
Show SPLADE index status and statistics.
|
|
|
|
Examples:
|
|
codexlens splade-status ~/projects/my-app
|
|
codexlens splade-status .
|
|
"""
|
|
_deprecated_command_warning("splade-status", "index status")
|
|
_configure_logging(verbose)
|
|
|
|
from codexlens.storage.splade_index import SpladeIndex
|
|
from codexlens.semantic.splade_encoder import check_splade_available
|
|
from codexlens.config import SPLADE_DB_NAME
|
|
|
|
# Find index database
|
|
target_path = path.expanduser().resolve()
|
|
|
|
if target_path.is_file() and target_path.name == "_index.db":
|
|
splade_db = target_path.parent / SPLADE_DB_NAME
|
|
elif target_path.is_dir():
|
|
# Check for local .codexlens/_splade.db
|
|
local_splade = target_path / ".codexlens" / SPLADE_DB_NAME
|
|
if local_splade.exists():
|
|
splade_db = local_splade
|
|
else:
|
|
# Try to find via registry
|
|
registry = RegistryStore()
|
|
try:
|
|
registry.initialize()
|
|
mapper = PathMapper()
|
|
index_db = mapper.source_to_index_db(target_path)
|
|
splade_db = index_db.parent / SPLADE_DB_NAME
|
|
finally:
|
|
registry.close()
|
|
else:
|
|
console.print(f"[red]Error:[/red] Path must be _index.db file or indexed directory")
|
|
raise typer.Exit(1)
|
|
|
|
if not splade_db.exists():
|
|
console.print("[yellow]No SPLADE index found[/yellow]")
|
|
console.print(f"[dim]Run 'codexlens splade-index {path}' to create one[/dim]")
|
|
return
|
|
|
|
splade_index = SpladeIndex(splade_db)
|
|
|
|
if not splade_index.has_index():
|
|
console.print("[yellow]SPLADE tables not initialized[/yellow]")
|
|
return
|
|
|
|
metadata = splade_index.get_metadata()
|
|
stats = splade_index.get_stats()
|
|
|
|
# Create status table
|
|
table = Table(title="SPLADE Index Status", show_header=False)
|
|
table.add_column("Property", style="cyan")
|
|
table.add_column("Value")
|
|
|
|
table.add_row("Database", str(splade_db))
|
|
if metadata:
|
|
table.add_row("Model", metadata['model_name'])
|
|
table.add_row("Vocab Size", str(metadata['vocab_size']))
|
|
table.add_row("Chunks", str(stats['unique_chunks']))
|
|
table.add_row("Unique Tokens", str(stats['unique_tokens']))
|
|
table.add_row("Total Postings", str(stats['total_postings']))
|
|
|
|
ok, err = check_splade_available()
|
|
status_text = "[green]Yes[/green]" if ok else f"[red]No[/red] - {err}"
|
|
table.add_row("SPLADE Available", status_text)
|
|
|
|
console.print(table)
|
|
|
|
|
|


# ==================== Watch Command ====================

@app.command()
def watch(
    path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to watch."),
    language: Optional[List[str]] = typer.Option(None, "--language", "-l", help="Languages to watch (comma-separated)."),
    debounce: int = typer.Option(1000, "--debounce", "-d", min=100, max=10000, help="Debounce interval in milliseconds."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Watch a directory for file changes and incrementally update the index.

    Monitors the specified directory for file system changes (create, modify, delete)
    and automatically updates the CodexLens index. The directory must already be indexed
    using 'codexlens init' before watching.

    Examples:
        # Watch current directory
        codexlens watch .

        # Watch with custom debounce interval
        codexlens watch . --debounce 2000

        # Watch only Python and JavaScript files
        codexlens watch . --language python,javascript

    Press Ctrl+C to stop watching.
    """
    _configure_logging(verbose)
    watch_path = path.expanduser().resolve()

    registry: RegistryStore | None = None
    try:
        # Validate that path is indexed
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        project_record = registry.find_by_source_path(str(watch_path))
        if not project_record:
            console.print(f"[red]Error:[/red] Directory is not indexed: {watch_path}")
            console.print("[dim]Run 'codexlens init' first to create an index.[/dim]")
            raise typer.Exit(code=1)

        # Parse languages
        languages = _parse_languages(language)

        # Create watcher config
        watcher_config = WatcherConfig(
            debounce_ms=debounce,
            languages=languages,
        )

        # Display startup message
        console.print(f"[green]Starting watcher for:[/green] {watch_path}")
        console.print(f"[dim]Debounce interval: {debounce}ms[/dim]")
        if languages:
            console.print(f"[dim]Watching languages: {', '.join(languages)}[/dim]")
        console.print("[dim]Press Ctrl+C to stop[/dim]\n")

        # Create and start watcher manager
        manager = WatcherManager(
            root_path=watch_path,
            watcher_config=watcher_config,
            on_indexed=lambda result: _display_index_result(result),
        )

        manager.start()
manager.wait()
|
|
|
|
except KeyboardInterrupt:
|
|
console.print("\n[yellow]Stopping watcher...[/yellow]")
|
|
except CodexLensError as exc:
|
|
console.print(f"[red]Watch failed:[/red] {exc}")
|
|
raise typer.Exit(code=1)
|
|
except Exception as exc:
|
|
console.print(f"[red]Unexpected error:[/red] {exc}")
|
|
raise typer.Exit(code=1)
|
|
finally:
|
|
if registry is not None:
|
|
registry.close()
|
|
|
|
|
|
def _display_index_result(result) -> None:
    """Display an incremental indexing result in real time.

    ``result`` is the summary object passed by WatcherManager's on_indexed
    callback; only files_indexed, files_removed and errors are used here.
    """
    if result.files_indexed > 0 or result.files_removed > 0:
        parts = []
        if result.files_indexed > 0:
            parts.append(f"[green]✓ Indexed {result.files_indexed} file(s)[/green]")
        if result.files_removed > 0:
            parts.append(f"[yellow]✗ Removed {result.files_removed} file(s)[/yellow]")
        console.print(" | ".join(parts))

    if result.errors:
        for error in result.errors[:3]:  # Show max 3 errors
            console.print(f"  [red]Error:[/red] {error}")
        if len(result.errors) > 3:
            console.print(f"  [dim]... and {len(result.errors) - 3} more errors[/dim]")


# ==================== Cascade Index Commands ====================


def get_binary_index_path(db_path: Path) -> Path:
    """Get the path for the binary ANN index file.

    Args:
        db_path: Path to the _index.db file

    Returns:
        Path to the binary index file (_index_binary.bin)
    """
    return db_path.parent / f"{db_path.stem}_binary.bin"


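# A minimal sketch of the packing step used by 'index binary' below, assuming
# pack_binary_embedding() binarizes by sign and packs bits numpy-style (the
# real implementation lives in codexlens.indexing.embedding):
#
#     import numpy as np
#
#     def _pack_example(vec) -> bytes:  # hypothetical helper
#         bits = (vec > 0).astype(np.uint8)   # 256 signs -> 256 bits
#         return np.packbits(bits).tobytes()  # 256 bits  -> 32 bytes
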
@index_app.command("binary")
|
|
def index_binary(
|
|
path: Annotated[Path, typer.Argument(help="Directory to index")],
|
|
force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False,
|
|
batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32,
|
|
json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
|
|
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False,
|
|
) -> None:
|
|
"""Generate cascade embeddings (binary + dense) for two-stage retrieval.
|
|
|
|
Cascade retrieval uses a two-stage approach:
|
|
1. Binary search (fast, 32 bytes/vector) -> coarse filtering
|
|
2. Dense rerank (precise, 8KB/vector) -> final results
|
|
|
|
This command:
|
|
- Finds all _index.db files in the directory
|
|
- Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk
|
|
- Stores embeddings in the database (embedding_binary, embedding_dense columns)
|
|
- Creates a BinaryANNIndex file for fast coarse retrieval
|
|
|
|
Examples:
|
|
codexlens index binary ~/projects/my-app
|
|
codexlens index binary . --force
|
|
codexlens index binary . --batch-size 64 --verbose
|
|
"""
|
|
_configure_logging(verbose, json_mode)
|
|
|
|
target_path = path.expanduser().resolve()
|
|
|
|
# Find index database(s)
|
|
if target_path.is_file() and target_path.name == "_index.db":
|
|
index_dbs = [target_path]
|
|
elif target_path.is_dir():
|
|
# Check local .codexlens/_index.db first
|
|
local_index = target_path / ".codexlens" / "_index.db"
|
|
if local_index.exists():
|
|
index_dbs = [local_index]
|
|
else:
|
|
# Find via registry
|
|
registry = RegistryStore()
|
|
try:
|
|
registry.initialize()
|
|
mapper = PathMapper()
|
|
index_db = mapper.source_to_index_db(target_path)
|
|
if not index_db.exists():
|
|
if json_mode:
|
|
print_json(success=False, error=f"No index found for {target_path}")
|
|
else:
|
|
console.print(f"[red]Error:[/red] No index found for {target_path}")
|
|
console.print("Run 'codexlens init' first to create an index")
|
|
raise typer.Exit(code=1)
|
|
# Find all _index.db files under the index root
|
|
index_root = index_db.parent
|
|
index_dbs = list(index_root.rglob("_index.db"))
|
|
finally:
|
|
registry.close()
|
|
else:
|
|
if json_mode:
|
|
print_json(success=False, error="Path must be _index.db file or indexed directory")
|
|
else:
|
|
console.print("[red]Error:[/red] Path must be _index.db file or indexed directory")
|
|
raise typer.Exit(code=1)
|
|
|
|
if not index_dbs:
|
|
if json_mode:
|
|
print_json(success=False, error="No index databases found")
|
|
else:
|
|
console.print("[yellow]No index databases found[/yellow]")
|
|
raise typer.Exit(code=1)
|
|
|
|
# Import cascade embedding backend
|
|
try:
|
|
from codexlens.indexing.embedding import CascadeEmbeddingBackend
|
|
from codexlens.semantic.ann_index import BinaryANNIndex
|
|
from codexlens.indexing.embedding import pack_binary_embedding
|
|
except ImportError as e:
|
|
error_msg = f"Cascade embedding dependencies not available: {e}"
|
|
if json_mode:
|
|
print_json(success=False, error=error_msg)
|
|
else:
|
|
console.print(f"[red]Error:[/red] {error_msg}")
|
|
console.print("[dim]Install with: pip install codexlens[semantic][/dim]")
|
|
raise typer.Exit(code=1)
|
|
|
|
if not json_mode:
|
|
console.print(f"[bold]Generating cascade embeddings[/bold]")
|
|
console.print(f"Path: [dim]{target_path}[/dim]")
|
|
console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]")
|
|
console.print(f"Batch size: [cyan]{batch_size}[/cyan]")
|
|
console.print()
|
|
|
|
# Initialize cascade embedding backend
|
|
try:
|
|
cascade_backend = CascadeEmbeddingBackend()
|
|
except Exception as e:
|
|
error_msg = f"Failed to initialize cascade embedding backend: {e}"
|
|
if json_mode:
|
|
print_json(success=False, error=error_msg)
|
|
else:
|
|
console.print(f"[red]Error:[/red] {error_msg}")
|
|
raise typer.Exit(code=1)
|
|
|
|
# Process statistics
|
|
total_chunks_processed = 0
|
|
total_indexes_processed = 0
|
|
total_indexes_successful = 0
|
|
total_binary_indexes_created = 0
|
|
errors_list: List[str] = []
|
|
|
|
# Process each index database
|
|
with Progress(
|
|
SpinnerColumn(),
|
|
TextColumn("[progress.description]{task.description}"),
|
|
BarColumn(),
|
|
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
|
|
TextColumn("({task.completed}/{task.total})"),
|
|
TimeElapsedColumn(),
|
|
console=console,
|
|
disable=json_mode,
|
|
) as progress:
|
|
db_task = progress.add_task("Processing indexes...", total=len(index_dbs))
|
|
|
|
for db_path in index_dbs:
|
|
total_indexes_processed += 1
|
|
index_name = db_path.parent.name
|
|
|
|
try:
|
|
# Open the index store
|
|
store = DirIndexStore(db_path)
|
|
store.initialize()
|
|
|
|
# Get connection for direct queries
|
|
conn = store._get_connection()
|
|
|
|
# Ensure cascade columns exist in semantic_chunks table
|
|
try:
|
|
conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB")
|
|
except Exception:
|
|
pass # Column already exists
|
|
try:
|
|
conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB")
|
|
except Exception:
|
|
pass # Column already exists
|
|
conn.commit()
|
|
|
|
# Check if semantic_chunks table exists and has data
|
|
try:
|
|
cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
|
|
chunk_count = cursor.fetchone()[0]
|
|
except Exception:
|
|
# semantic_chunks table doesn't exist or is empty
|
|
chunk_count = 0
|
|
|
|
if chunk_count == 0:
|
|
if verbose and not json_mode:
|
|
console.print(f" [dim]Skipping {index_name}: no chunks found[/dim]")
|
|
progress.advance(db_task)
|
|
store.close()
|
|
continue
|
|
|
|
# Check if embeddings already exist (unless force)
|
|
if not force:
|
|
cursor = conn.execute(
|
|
"SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
|
|
)
|
|
existing_count = cursor.fetchone()[0]
|
|
if existing_count > 0:
|
|
if verbose and not json_mode:
|
|
console.print(f" [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]")
|
|
progress.advance(db_task)
|
|
store.close()
|
|
continue
|
|
|
|
# If force, clear existing cascade embeddings
|
|
if force:
|
|
conn.execute(
|
|
"UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL"
|
|
)
|
|
conn.commit()
|
|
|
|
# Get all chunks
|
|
cursor = conn.execute("SELECT id, content FROM semantic_chunks")
|
|
chunks = cursor.fetchall()
|
|
|
|
if not chunks:
|
|
progress.advance(db_task)
|
|
store.close()
|
|
continue
|
|
|
|
if verbose and not json_mode:
|
|
console.print(f" Processing {index_name}: {len(chunks)} chunks")
|
|
|
|
# Process in batches
|
|
chunk_task = progress.add_task(
|
|
f" {index_name}", total=len(chunks)
|
|
)
|
|
|
|
# Prepare for BinaryANNIndex
|
|
binary_index_path = get_binary_index_path(db_path)
|
|
binary_ann_index = BinaryANNIndex(db_path, dim=256)
|
|
|
|
for i in range(0, len(chunks), batch_size):
|
|
batch_chunks = chunks[i:i + batch_size]
|
|
batch_ids = [c[0] for c in batch_chunks]
|
|
batch_contents = [c[1] for c in batch_chunks]
|
|
|
|
# Generate cascade embeddings
|
|
binary_embeddings, dense_embeddings = cascade_backend.encode_cascade(
|
|
batch_contents, batch_size=batch_size
|
|
)
|
|
|
|
# Pack binary embeddings and convert dense to bytes
|
|
packed_binaries = []
|
|
dense_bytes_list = []
|
|
|
|
for j in range(len(batch_ids)):
|
|
# Pack binary embedding (256 bits -> 32 bytes)
|
|
packed_binary = pack_binary_embedding(binary_embeddings[j])
|
|
packed_binaries.append(packed_binary)
|
|
|
|
# Convert dense embedding to bytes
|
|
import numpy as np
|
|
dense_blob = dense_embeddings[j].astype(np.float32).tobytes()
|
|
dense_bytes_list.append(dense_blob)
|
|
|
|
# Update database
|
|
for j, chunk_id in enumerate(batch_ids):
|
|
conn.execute(
|
|
"""
|
|
UPDATE semantic_chunks
|
|
SET embedding_binary = ?, embedding_dense = ?
|
|
WHERE id = ?
|
|
""",
|
|
(packed_binaries[j], dense_bytes_list[j], chunk_id)
|
|
)
|
|
|
|
# Add to binary ANN index
|
|
binary_ann_index.add_vectors(batch_ids, packed_binaries)
|
|
|
|
conn.commit()
|
|
total_chunks_processed += len(batch_ids)
|
|
progress.advance(chunk_task, len(batch_ids))
|
|
|
|
# Save binary ANN index
|
|
binary_ann_index.save()
|
|
total_binary_indexes_created += 1
|
|
|
|
progress.remove_task(chunk_task)
|
|
store.close()
|
|
total_indexes_successful += 1
|
|
|
|
except Exception as e:
|
|
error_msg = f"{index_name}: {e}"
|
|
errors_list.append(error_msg)
|
|
if verbose and not json_mode:
|
|
console.print(f" [red]Error processing {index_name}:[/red] {e}")
|
|
|
|
progress.advance(db_task)
|
|
|
|
# Build result
|
|
result = {
|
|
"path": str(target_path),
|
|
"indexes_processed": total_indexes_processed,
|
|
"indexes_successful": total_indexes_successful,
|
|
"chunks_processed": total_chunks_processed,
|
|
"binary_indexes_created": total_binary_indexes_created,
|
|
"errors": len(errors_list),
|
|
"error_details": errors_list[:5] if errors_list else [],
|
|
}
|
|
|
|
if json_mode:
|
|
print_json(success=True, result=result)
|
|
else:
|
|
console.print(f"\n[green]Cascade indexing complete[/green]")
|
|
console.print(f" Indexes processed: {total_indexes_processed}")
|
|
console.print(f" Indexes successful: {total_indexes_successful}")
|
|
console.print(f" Chunks processed: {total_chunks_processed:,}")
|
|
console.print(f" Binary indexes created: {total_binary_indexes_created}")
|
|
if errors_list:
|
|
console.print(f" [yellow]Errors: {len(errors_list)}[/yellow]")
|
|
for err in errors_list[:3]:
|
|
console.print(f" [dim]{err}[/dim]")
|
|
if len(errors_list) > 3:
|
|
console.print(f" [dim]... and {len(errors_list) - 3} more[/dim]")
|
|
|
|
|
|
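# For orientation: the coarse stage compares the packed 32-byte vectors by
# Hamming distance. A hedged, pure-Python equivalent (BinaryANNIndex's actual
# scoring may differ):
#
#     def _hamming(a: bytes, b: bytes) -> int:  # hypothetical helper
#         return sum(bin(x ^ y).count("1") for x, y in zip(a, b))
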
# ==================== Index Status Command ====================

@index_app.command("status")
def index_status(
    path: Optional[Path] = typer.Argument(
        None,
        help="Path to project directory or _index.db file. If not specified, uses default index root.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
) -> None:
    """Show comprehensive index status (embeddings + SPLADE).

    Shows combined status for all index types:
    - Dense vector embeddings (HNSW)
    - SPLADE sparse embeddings
    - Binary cascade embeddings

    Examples:
        codexlens index status                    # Check all indexes
        codexlens index status ~/projects/my-app  # Check specific project
        codexlens index status --json             # JSON output
    """
    _configure_logging(verbose, json_mode)

    from codexlens.cli.embedding_manager import get_embedding_stats_summary
    from codexlens.storage.splade_index import SpladeIndex
    from codexlens.semantic.splade_encoder import check_splade_available
    from codexlens.config import SPLADE_DB_NAME

    # Determine target path and index root
    if path is None:
        index_root = _get_index_root()
        target_path = None
    else:
        target_path = path.resolve()
        if target_path.is_file() and target_path.name == "_index.db":
            index_root = target_path.parent
        elif target_path.is_dir():
            # Try to find index for this project
            registry = RegistryStore()
            try:
                registry.initialize()
                mapper = PathMapper()
                index_path = mapper.source_to_index_db(target_path)
                if index_path.exists():
                    index_root = index_path.parent
                else:
                    if json_mode:
                        print_json(success=False, error=f"No index found for {target_path}")
                    else:
                        console.print(f"[red]Error:[/red] No index found for {target_path}")
                        console.print("Run 'codexlens index init' first to create an index")
                    raise typer.Exit(code=1)
            finally:
                registry.close()
        else:
            if json_mode:
                print_json(success=False, error="Path must be _index.db file or directory")
            else:
                console.print("[red]Error:[/red] Path must be _index.db file or directory")
            raise typer.Exit(code=1)

    # Get embeddings status
    embeddings_result = get_embedding_stats_summary(index_root)

    # Get SPLADE status
    splade_db = index_root / SPLADE_DB_NAME
    splade_status = {
        "available": False,
        "has_index": False,
        "stats": None,
        "metadata": None,
    }

    splade_available, splade_err = check_splade_available()
    splade_status["available"] = splade_available

    if splade_db.exists():
        try:
            splade_index = SpladeIndex(splade_db)
            if splade_index.has_index():
                splade_status["has_index"] = True
                splade_status["stats"] = splade_index.get_stats()
                splade_status["metadata"] = splade_index.get_metadata()
            splade_index.close()
        except Exception as e:
            if verbose:
                console.print(f"[yellow]Warning: Failed to read SPLADE index: {e}[/yellow]")

    # Build combined result
    result = {
        "index_root": str(index_root),
        "embeddings": embeddings_result.get("result") if embeddings_result.get("success") else None,
        "embeddings_error": embeddings_result.get("error") if not embeddings_result.get("success") else None,
        "splade": splade_status,
    }

    if json_mode:
        print_json(success=True, result=result)
    else:
        console.print("[bold]Index Status[/bold]")
        console.print(f"Index root: [dim]{index_root}[/dim]\n")

        # Embeddings section
        console.print("[bold]Dense Embeddings (HNSW):[/bold]")
        if embeddings_result.get("success"):
            data = embeddings_result["result"]
            total = data.get("total_indexes", 0)
            with_emb = data.get("indexes_with_embeddings", 0)
            total_chunks = data.get("total_chunks", 0)

            console.print(f"  Total indexes: {total}")
            console.print(f"  Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}")
            console.print(f"  Total chunks: {total_chunks:,}")
        else:
            console.print(f"  [yellow]--[/yellow] {embeddings_result.get('error', 'Not available')}")

        # SPLADE section
        console.print("\n[bold]SPLADE Sparse Index:[/bold]")
        if splade_status["has_index"]:
            stats = splade_status["stats"] or {}
            metadata = splade_status["metadata"] or {}
            console.print("  [green]OK[/green] SPLADE index available")
            console.print(f"  Chunks: {stats.get('unique_chunks', 0):,}")
            console.print(f"  Unique tokens: {stats.get('unique_tokens', 0):,}")
            console.print(f"  Total postings: {stats.get('total_postings', 0):,}")
            if metadata.get("model_name"):
                console.print(f"  Model: {metadata['model_name']}")
        elif splade_available:
            console.print("  [yellow]--[/yellow] No SPLADE index found")
            console.print("  [dim]Run 'codexlens index splade <path>' to create one[/dim]")
        else:
            console.print(f"  [yellow]--[/yellow] SPLADE not available: {splade_err}")

        # Runtime availability
        console.print("\n[bold]Runtime Availability:[/bold]")
        console.print(f"  SPLADE encoder: {'[green]Yes[/green]' if splade_available else f'[red]No[/red] ({splade_err})'}")


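# Shape of the --json payload produced by 'index status' above (values
# illustrative):
#
#     {
#       "index_root": "~/.codexlens/indexes/...",
#       "embeddings": {"total_indexes": 3, "indexes_with_embeddings": 3, ...},
#       "embeddings_error": null,
#       "splade": {"available": true, "has_index": true, ...}
#     }
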
# ==================== Index Update Command ====================

@index_app.command("update")
def index_update(
    file_path: Path = typer.Argument(..., exists=True, file_okay=True, dir_okay=False, help="Path to the file to update in the index."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Update the index for a single file incrementally.

    This is a lightweight command designed for use in hooks (e.g., Claude Code PostToolUse).
    It updates only the specified file without scanning the entire directory.

    The file's parent directory must already be indexed via 'codexlens index init'.

    Examples:
        codexlens index update src/main.py      # Update single file
        codexlens index update ./foo.ts --json  # JSON output for hooks
    """
    _configure_logging(verbose, json_mode)

    from codexlens.watcher.incremental_indexer import IncrementalIndexer

    registry: RegistryStore | None = None
    indexer: IncrementalIndexer | None = None

    try:
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()
        config = Config()

        resolved_path = file_path.resolve()

        # Check if project is indexed
        source_root = mapper.get_project_root(resolved_path)
        if not source_root or not registry.get_project(source_root):
            error_msg = f"Project containing file is not indexed: {file_path}"
            if json_mode:
                print_json(success=False, error=error_msg)
            else:
                console.print(f"[red]Error:[/red] {error_msg}")
                console.print("[dim]Run 'codexlens index init' on the project root first.[/dim]")
            raise typer.Exit(code=1)

        indexer = IncrementalIndexer(registry, mapper, config)
        result = indexer._index_file(resolved_path)

        if result.success:
            if json_mode:
                print_json(success=True, result={
                    "path": str(result.path),
                    "symbols_count": result.symbols_count,
                    "status": "updated",
                })
            else:
                console.print(f"[green]✓[/green] Updated index for [bold]{result.path.name}[/bold] ({result.symbols_count} symbols)")
        else:
            error_msg = result.error or f"Failed to update index for {file_path}"
            if json_mode:
                print_json(success=False, error=error_msg)
            else:
                console.print(f"[red]Error:[/red] {error_msg}")
            raise typer.Exit(code=1)

    except CodexLensError as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Update failed:[/red] {exc}")
        raise typer.Exit(code=1)
    finally:
        if indexer:
            indexer.close()
        if registry:
            registry.close()


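# Hook wiring sketch (illustrative): a Claude Code PostToolUse hook can run
#
#     codexlens index update "$CHANGED_FILE" --json
#
# and parse the JSON payload printed by print_json; $CHANGED_FILE stands in
# for however the hook supplies the edited path.
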
# ==================== Index All Command ====================

@index_app.command("all")
def index_all(
    path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."),
    language: Optional[List[str]] = typer.Option(
        None,
        "--language",
        "-l",
        help="Limit indexing to specific languages (repeat or comma-separated).",
    ),
    workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes."),
    force: bool = typer.Option(False, "--force", "-f", help="Force full reindex."),
    backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend: fastembed or litellm."),
    model: str = typer.Option("code", "--model", "-m", help="Embedding model profile or name."),
    max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls."),
    skip_splade: bool = typer.Option(False, "--skip-splade", help="Skip SPLADE index generation."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Run all indexing operations in sequence (init, embeddings, splade).

    This is a convenience command that runs the complete indexing pipeline:
    1. FTS index initialization (index init)
    2. Dense vector embeddings (index embeddings)
    3. SPLADE sparse index (index splade) - unless --skip-splade

    Examples:
        codexlens index all ~/projects/my-app
        codexlens index all . --force
        codexlens index all . --backend litellm --model text-embedding-3-small
        codexlens index all . --skip-splade
    """
    _configure_logging(verbose, json_mode)

    base_path = path.expanduser().resolve()
    results = {
        "path": str(base_path),
        "steps": {},
    }

    # Step 1: Run init
    if not json_mode:
        console.print("[bold]Step 1/3: Initializing FTS index...[/bold]")

    try:
        # Build the index tree directly (Config and IndexTreeBuilder are
        # imported at module level)
        config = Config()
        languages = _parse_languages(language)
        registry = RegistryStore()
        registry.initialize()
        mapper = PathMapper()

        builder = IndexTreeBuilder(registry, mapper, config, incremental=not force)
        build_result = builder.build(
            source_root=base_path,
            languages=languages,
            workers=workers,
            force_full=force,
        )

        results["steps"]["init"] = {
            "success": True,
            "files_indexed": build_result.total_files,
            "dirs_indexed": build_result.total_dirs,
            "index_root": str(build_result.index_root),
        }

        if not json_mode:
            console.print(f"  [green]OK[/green] Indexed {build_result.total_files} files in {build_result.total_dirs} directories")

        index_root = Path(build_result.index_root)
        registry.close()

    except Exception as e:
        results["steps"]["init"] = {"success": False, "error": str(e)}
        if json_mode:
            print_json(success=False, result=results, error=f"Init failed: {e}")
        else:
            console.print(f"  [red]Error:[/red] {e}")
        raise typer.Exit(code=1)

    # Step 2: Generate embeddings
    if not json_mode:
        console.print("\n[bold]Step 2/3: Generating dense embeddings...[/bold]")

    try:
        from codexlens.cli.embedding_manager import generate_dense_embeddings_centralized

        def progress_update(msg: str) -> None:
            if not json_mode and verbose:
                console.print(f"  {msg}")

        embed_result = generate_dense_embeddings_centralized(
            index_root,
            embedding_backend=backend,
            model_profile=model,
            force=force,
            chunk_size=2000,
            progress_callback=progress_update,
            max_workers=max_workers,
        )

        if embed_result["success"]:
            data = embed_result["result"]
            results["steps"]["embeddings"] = {
                "success": True,
                "chunks_created": data.get("chunks_created", 0),
                "files_processed": data.get("files_processed", 0),
            }
            if not json_mode:
                console.print(f"  [green]OK[/green] Generated {data.get('chunks_created', 0)} chunks for {data.get('files_processed', 0)} files")
        else:
            results["steps"]["embeddings"] = {
                "success": False,
                "error": embed_result.get("error"),
            }
            if not json_mode:
                console.print(f"  [yellow]Warning:[/yellow] {embed_result.get('error', 'Unknown error')}")

    except Exception as e:
        results["steps"]["embeddings"] = {"success": False, "error": str(e)}
        if not json_mode:
            console.print(f"  [yellow]Warning:[/yellow] {e}")

    # Step 3: Generate SPLADE index (unless skipped)
    if not skip_splade:
        if not json_mode:
            console.print("\n[bold]Step 3/3: Generating SPLADE index...[/bold]")

        try:
            from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available
            from codexlens.storage.splade_index import SpladeIndex
            from codexlens.semantic.vector_store import VectorStore
            from codexlens.config import SPLADE_DB_NAME

            ok, err = check_splade_available()
            if not ok:
                results["steps"]["splade"] = {"success": False, "error": f"SPLADE not available: {err}"}
                if not json_mode:
                    console.print(f"  [yellow]Skipped:[/yellow] SPLADE not available ({err})")
            else:
                # Discover all _index.db files
                all_index_dbs = sorted(index_root.rglob("_index.db"))
                if not all_index_dbs:
                    results["steps"]["splade"] = {"success": False, "error": "No index databases found"}
                    if not json_mode:
                        console.print("  [yellow]Skipped:[/yellow] No index databases found")
                else:
                    # Collect chunks from every index, assigning global ids
                    all_chunks = []
                    global_id = 0
                    for index_db in all_index_dbs:
                        try:
                            vector_store = VectorStore(index_db)
                            chunks = vector_store.get_all_chunks()
                            for chunk in chunks:
                                global_id += 1
                                all_chunks.append((global_id, chunk, index_db))
                            vector_store.close()
                        except Exception:
                            # Skip indexes whose chunks cannot be read
                            pass

                    if all_chunks:
                        splade_db = index_root / SPLADE_DB_NAME
                        if splade_db.exists() and force:
                            splade_db.unlink()

                        encoder = get_splade_encoder()
                        splade_index = SpladeIndex(splade_db)
                        splade_index.create_tables()

                        chunk_metadata_batch = []
                        for gid, chunk, source_db_path in all_chunks:
                            sparse_vec = encoder.encode_text(chunk.content)
                            splade_index.add_posting(gid, sparse_vec)
                            metadata_str = None
                            if hasattr(chunk, 'metadata') and chunk.metadata:
                                try:
                                    # json is imported at module level
                                    metadata_str = json.dumps(chunk.metadata) if isinstance(chunk.metadata, dict) else chunk.metadata
                                except Exception:
                                    pass
                            chunk_metadata_batch.append((
                                gid,
                                chunk.file_path or "",
                                chunk.content,
                                metadata_str,
                                str(source_db_path),
                            ))

                        if chunk_metadata_batch:
                            splade_index.add_chunks_metadata_batch(chunk_metadata_batch)

                        splade_index.set_metadata(
                            model_name=encoder.model_name,
                            vocab_size=encoder.vocab_size
                        )

                        stats = splade_index.get_stats()
                        results["steps"]["splade"] = {
                            "success": True,
                            "chunks": stats['unique_chunks'],
                            "postings": stats['total_postings'],
                        }
                        if not json_mode:
                            console.print(f"  [green]OK[/green] SPLADE index built: {stats['unique_chunks']} chunks, {stats['total_postings']} postings")
                    else:
                        results["steps"]["splade"] = {"success": False, "error": "No chunks found"}
                        if not json_mode:
                            console.print("  [yellow]Skipped:[/yellow] No chunks found in indexes")

        except Exception as e:
            results["steps"]["splade"] = {"success": False, "error": str(e)}
            if not json_mode:
                console.print(f"  [yellow]Warning:[/yellow] {e}")
    else:
        results["steps"]["splade"] = {"success": True, "skipped": True}
        if not json_mode:
            console.print("\n[bold]Step 3/3: SPLADE index...[/bold]")
            console.print("  [dim]Skipped (--skip-splade)[/dim]")

    # Summary
    if json_mode:
        print_json(success=True, result=results)
    else:
        console.print("\n[bold]Indexing Complete[/bold]")
        init_ok = results["steps"].get("init", {}).get("success", False)
        emb_ok = results["steps"].get("embeddings", {}).get("success", False)
        splade_ok = results["steps"].get("splade", {}).get("success", False)
        console.print(f"  FTS Index: {'[green]OK[/green]' if init_ok else '[red]Failed[/red]'}")
        console.print(f"  Embeddings: {'[green]OK[/green]' if emb_ok else '[yellow]Partial/Skipped[/yellow]'}")
        console.print(f"  SPLADE: {'[green]OK[/green]' if splade_ok else '[yellow]Partial/Skipped[/yellow]'}")


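# Shape of the --json payload emitted by 'index all' above (values
# illustrative):
#
#     {"path": "...",
#      "steps": {"init": {"success": true, "files_indexed": 120, ...},
#                "embeddings": {"success": true, "chunks_created": 900, ...},
#                "splade": {"success": true, "chunks": 900, "postings": 45000}}}
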
# ==================== Index Migration Commands ====================

# Index version for migration tracking (file-based version marker)
INDEX_FORMAT_VERSION = "2.0"
INDEX_VERSION_FILE = "_index_version.txt"


def _get_index_version(index_root: Path) -> Optional[str]:
    """Read the index format version from the version marker file.

    Args:
        index_root: Root directory of the index

    Returns:
        Version string if the marker file exists and is readable, None otherwise
    """
    version_file = index_root / INDEX_VERSION_FILE
    if version_file.exists():
        try:
            return version_file.read_text(encoding="utf-8").strip()
        except Exception:
            return None
    return None


def _set_index_version(index_root: Path, version: str) -> None:
    """Write the index format version to the version marker file.

    Args:
        index_root: Root directory of the index
        version: Version string to write
    """
    version_file = index_root / INDEX_VERSION_FILE
    version_file.write_text(version, encoding="utf-8")


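# The marker is a plain-text file at <index_root>/_index_version.txt whose
# entire contents are the version string, e.g. "2.0".
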
def _discover_distributed_splade(index_root: Path) -> List[Dict[str, Any]]:
    """Discover distributed SPLADE data in _index.db files.

    Scans all _index.db files for embedded splade_postings tables.
    This is the old distributed format that needs migration.

    Args:
        index_root: Root directory to scan

    Returns:
        List of dicts with db_path, posting_count, chunk_count
    """
    results = []

    for db_path in index_root.rglob("_index.db"):
        try:
            conn = sqlite3.connect(db_path, timeout=5.0)
            conn.row_factory = sqlite3.Row

            # Check if splade_postings table exists (old embedded format)
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='splade_postings'"
            )
            if cursor.fetchone():
                # Count postings and chunks
                try:
                    row = conn.execute(
                        "SELECT COUNT(*) as postings, COUNT(DISTINCT chunk_id) as chunks FROM splade_postings"
                    ).fetchone()
                    results.append({
                        "db_path": db_path,
                        "posting_count": row["postings"] if row else 0,
                        "chunk_count": row["chunks"] if row else 0,
                    })
                except Exception:
                    pass

            conn.close()
        except Exception:
            # Unreadable or locked databases are simply skipped
            pass

    return results


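# Equivalent manual probe for the old embedded format (illustrative):
#
#     sqlite3 path/to/_index.db "SELECT COUNT(*) FROM splade_postings"
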
def _discover_distributed_hnsw(index_root: Path) -> List[Dict[str, Any]]:
    """Discover distributed HNSW index files.

    Scans for .hnsw files that are stored alongside _index.db files.
    This is the old distributed format that needs migration.

    Args:
        index_root: Root directory to scan

    Returns:
        List of dicts with hnsw_path, size_bytes
    """
    results = []

    for hnsw_path in index_root.rglob("*.hnsw"):
        try:
            size = hnsw_path.stat().st_size
            results.append({
                "hnsw_path": hnsw_path,
                "size_bytes": size,
            })
        except Exception:
            pass

    return results


def _check_centralized_storage(index_root: Path) -> Dict[str, Any]:
    """Check for centralized storage files.

    Args:
        index_root: Root directory to check

    Returns:
        Dict with has_splade, has_vectors, splade_stats, vector_stats
    """
    from codexlens.config import SPLADE_DB_NAME, VECTORS_HNSW_NAME

    splade_db = index_root / SPLADE_DB_NAME
    vectors_hnsw = index_root / VECTORS_HNSW_NAME

    result = {
        "has_splade": splade_db.exists(),
        "has_vectors": vectors_hnsw.exists(),
        "splade_path": str(splade_db) if splade_db.exists() else None,
        "vectors_path": str(vectors_hnsw) if vectors_hnsw.exists() else None,
        "splade_stats": None,
        "vector_stats": None,
    }

    # Get SPLADE stats if the database exists
    if splade_db.exists():
        try:
            from codexlens.storage.splade_index import SpladeIndex
            splade = SpladeIndex(splade_db)
            if splade.has_index():
                result["splade_stats"] = splade.get_stats()
            splade.close()
        except Exception:
            pass

    # Get vector stats if the HNSW file exists
    if vectors_hnsw.exists():
        try:
            result["vector_stats"] = {
                "size_bytes": vectors_hnsw.stat().st_size,
            }
        except Exception:
            pass

    return result


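# In the centralized layout checked above, the index root holds a single
# _splade.db and a single _vectors.hnsw (SPLADE_DB_NAME / VECTORS_HNSW_NAME
# from codexlens.config) instead of per-directory copies next to each
# _index.db.
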
@index_app.command("migrate")
|
|
def index_migrate_cmd(
|
|
path: Annotated[Optional[str], typer.Argument(help="Project path to migrate")] = None,
|
|
dry_run: Annotated[bool, typer.Option("--dry-run", help="Show what would be migrated without making changes")] = False,
|
|
force: Annotated[bool, typer.Option("--force", help="Force migration even if already migrated")] = False,
|
|
json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
|
|
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose output")] = False,
|
|
) -> None:
|
|
"""Migrate old distributed index to new centralized architecture.
|
|
|
|
This command upgrades indexes from the old distributed storage format
|
|
(where SPLADE/vectors were stored in each _index.db) to the new centralized
|
|
format (single _splade.db and _vectors.hnsw at index root).
|
|
|
|
Migration Steps:
|
|
1. Detect if migration is needed (check version marker)
|
|
2. Discover distributed SPLADE data in _index.db files
|
|
3. Discover distributed .hnsw files
|
|
4. Report current status
|
|
5. Create version marker (unless --dry-run)
|
|
|
|
Use --dry-run to preview what would be migrated without making changes.
|
|
Use --force to re-run migration even if version marker exists.
|
|
|
|
Note: For full data migration (SPLADE/vectors consolidation), run:
|
|
codexlens index splade <path> --rebuild
|
|
codexlens index embeddings <path> --force
|
|
|
|
Examples:
|
|
codexlens index migrate ~/projects/my-app --dry-run
|
|
codexlens index migrate . --force
|
|
codexlens index migrate --json
|
|
"""
|
|
_configure_logging(verbose, json_mode)
|
|
|
|
# Resolve target path
|
|
if path:
|
|
target_path = Path(path).expanduser().resolve()
|
|
else:
|
|
target_path = Path.cwd()
|
|
|
|
if not target_path.exists():
|
|
if json_mode:
|
|
print_json(success=False, error=f"Path does not exist: {target_path}")
|
|
else:
|
|
console.print(f"[red]Error:[/red] Path does not exist: {target_path}")
|
|
raise typer.Exit(code=1)
|
|
|
|
# Find index root
|
|
registry: RegistryStore | None = None
|
|
index_root: Optional[Path] = None
|
|
|
|
try:
|
|
registry = RegistryStore()
|
|
registry.initialize()
|
|
mapper = PathMapper()
|
|
|
|
# Check if path is a project with an index
|
|
project_info = registry.get_project(target_path)
|
|
if project_info:
|
|
index_root = Path(project_info.index_root)
|
|
else:
|
|
# Try to find index via mapper
|
|
index_db = mapper.source_to_index_db(target_path)
|
|
if index_db.exists():
|
|
index_root = index_db.parent
|
|
finally:
|
|
if registry:
|
|
registry.close()
|
|
|
|
if not index_root or not index_root.exists():
|
|
if json_mode:
|
|
print_json(success=False, error=f"No index found for: {target_path}")
|
|
else:
|
|
console.print(f"[red]Error:[/red] No index found for: {target_path}")
|
|
console.print("[dim]Run 'codexlens init' first to create an index.[/dim]")
|
|
raise typer.Exit(code=1)
|
|
|
|
if not json_mode:
|
|
console.print(f"[bold]Index Migration Check[/bold]")
|
|
console.print(f"Source path: [dim]{target_path}[/dim]")
|
|
console.print(f"Index root: [dim]{index_root}[/dim]")
|
|
if dry_run:
|
|
console.print("[yellow]Mode: DRY RUN (no changes will be made)[/yellow]")
|
|
console.print()
|
|
|
|
# Check current version
|
|
current_version = _get_index_version(index_root)
|
|
needs_migration = current_version is None or (force and current_version != INDEX_FORMAT_VERSION)
|
|
|
|
if current_version and current_version >= INDEX_FORMAT_VERSION and not force:
|
|
result = {
|
|
"path": str(target_path),
|
|
"index_root": str(index_root),
|
|
"current_version": current_version,
|
|
"target_version": INDEX_FORMAT_VERSION,
|
|
"needs_migration": False,
|
|
"message": "Index is already at the latest version",
|
|
}
|
|
|
|
if json_mode:
|
|
print_json(success=True, result=result)
|
|
else:
|
|
console.print(f"[green]OK[/green] Index is already at version {current_version}")
|
|
console.print("[dim]No migration needed. Use --force to re-run migration.[/dim]")
|
|
return
|
|
|
|
# Discover distributed data
|
|
distributed_splade = _discover_distributed_splade(index_root)
|
|
distributed_hnsw = _discover_distributed_hnsw(index_root)
|
|
centralized = _check_centralized_storage(index_root)
|
|
|
|
# Count all _index.db files
|
|
all_index_dbs = list(index_root.rglob("_index.db"))
|
|
|
|
# Build migration report
|
|
migration_report = {
|
|
"path": str(target_path),
|
|
"index_root": str(index_root),
|
|
"dry_run": dry_run,
|
|
"current_version": current_version,
|
|
"target_version": INDEX_FORMAT_VERSION,
|
|
"needs_migration": needs_migration,
|
|
"discovery": {
|
|
"total_index_dbs": len(all_index_dbs),
|
|
"distributed_splade_count": len(distributed_splade),
|
|
"distributed_splade_total_postings": sum(d["posting_count"] for d in distributed_splade),
|
|
"distributed_hnsw_count": len(distributed_hnsw),
|
|
"distributed_hnsw_total_bytes": sum(d["size_bytes"] for d in distributed_hnsw),
|
|
},
|
|
"centralized": centralized,
|
|
"recommendations": [],
|
|
}
|
|
|
|
# Generate recommendations
|
|
if distributed_splade and not centralized["has_splade"]:
|
|
migration_report["recommendations"].append(
|
|
f"Run 'codexlens splade-index {target_path} --rebuild' to consolidate SPLADE data"
|
|
)
|
|
|
|
if distributed_hnsw and not centralized["has_vectors"]:
|
|
migration_report["recommendations"].append(
|
|
f"Run 'codexlens embeddings-generate {target_path} --recursive --force' to consolidate vector data"
|
|
)
|
|
|
|
if not distributed_splade and not distributed_hnsw:
|
|
migration_report["recommendations"].append(
|
|
"No distributed data found. Index may already be using centralized storage."
|
|
)
|
|
|
|
if json_mode:
|
|
# Perform migration action (set version marker) unless dry-run
|
|
if not dry_run and needs_migration:
|
|
_set_index_version(index_root, INDEX_FORMAT_VERSION)
|
|
migration_report["migrated"] = True
|
|
migration_report["new_version"] = INDEX_FORMAT_VERSION
|
|
else:
|
|
migration_report["migrated"] = False
|
|
|
|
print_json(success=True, result=migration_report)
|
|
else:
|
|
# Display discovery results
|
|
console.print("[bold]Discovery Results:[/bold]")
|
|
console.print(f" Total _index.db files: {len(all_index_dbs)}")
|
|
console.print()
|
|
|
|
# Distributed SPLADE
|
|
console.print("[bold]Distributed SPLADE Data:[/bold]")
|
|
if distributed_splade:
|
|
total_postings = sum(d["posting_count"] for d in distributed_splade)
|
|
total_chunks = sum(d["chunk_count"] for d in distributed_splade)
|
|
console.print(f" Found in {len(distributed_splade)} _index.db files")
|
|
console.print(f" Total postings: {total_postings:,}")
|
|
console.print(f" Total chunks: {total_chunks:,}")
|
|
if verbose:
|
|
for d in distributed_splade[:5]:
|
|
console.print(f" [dim]{d['db_path'].parent.name}: {d['posting_count']} postings[/dim]")
|
|
if len(distributed_splade) > 5:
|
|
console.print(f" [dim]... and {len(distributed_splade) - 5} more[/dim]")
|
|
else:
|
|
console.print(" [dim]None found (already centralized or not generated)[/dim]")
|
|
console.print()
|
|
|
|
# Distributed HNSW
|
|
console.print("[bold]Distributed HNSW Files:[/bold]")
|
|
if distributed_hnsw:
|
|
total_size = sum(d["size_bytes"] for d in distributed_hnsw)
|
|
console.print(f" Found {len(distributed_hnsw)} .hnsw files")
|
|
console.print(f" Total size: {total_size / (1024 * 1024):.1f} MB")
|
|
if verbose:
|
|
for d in distributed_hnsw[:5]:
|
|
console.print(f" [dim]{d['hnsw_path'].name}: {d['size_bytes'] / 1024:.1f} KB[/dim]")
|
|
if len(distributed_hnsw) > 5:
|
|
console.print(f" [dim]... and {len(distributed_hnsw) - 5} more[/dim]")
|
|
else:
|
|
console.print(" [dim]None found (already centralized or not generated)[/dim]")
|
|
console.print()
|
|
|
|
# Centralized storage status
|
|
console.print("[bold]Centralized Storage:[/bold]")
|
|
if centralized["has_splade"]:
|
|
stats = centralized.get("splade_stats") or {}
|
|
console.print(f" [green]OK[/green] _splade.db exists")
|
|
if stats:
|
|
console.print(f" Chunks: {stats.get('unique_chunks', 0):,}")
|
|
console.print(f" Postings: {stats.get('total_postings', 0):,}")
|
|
else:
|
|
console.print(f" [yellow]--[/yellow] _splade.db not found")
|
|
|
|
if centralized["has_vectors"]:
|
|
stats = centralized.get("vector_stats") or {}
|
|
size_mb = stats.get("size_bytes", 0) / (1024 * 1024)
|
|
console.print(f" [green]OK[/green] _vectors.hnsw exists ({size_mb:.1f} MB)")
|
|
else:
|
|
console.print(f" [yellow]--[/yellow] _vectors.hnsw not found")
|
|
console.print()
|
|
|
|
# Migration action
|
|
if not dry_run and needs_migration:
|
|
_set_index_version(index_root, INDEX_FORMAT_VERSION)
|
|
console.print(f"[green]OK[/green] Version marker created: {INDEX_FORMAT_VERSION}")
|
|
elif dry_run:
|
|
console.print(f"[yellow]DRY RUN:[/yellow] Would create version marker: {INDEX_FORMAT_VERSION}")
|
|
|
|
# Recommendations
|
|
if migration_report["recommendations"]:
|
|
console.print("\n[bold]Recommendations:[/bold]")
|
|
for rec in migration_report["recommendations"]:
|
|
console.print(f" [cyan]>[/cyan] {rec}")
|
|
|
|
|
|
# ==================== Deprecated Command Aliases ====================
# These commands maintain backward compatibility with the old CLI structure.
# They display deprecation warnings and delegate to the new `index` subcommands.

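# Each alias prints a warning of the form
#
#     Warning: 'cascade-index' is deprecated. Use 'index binary' instead.
#
# (see _deprecated_command_warning) and then forwards its arguments unchanged
# to the new implementation.
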
@app.command("embeddings-generate", hidden=True, deprecated=True)
|
|
def embeddings_generate_deprecated(
|
|
path: Path = typer.Argument(
|
|
...,
|
|
exists=True,
|
|
help="Path to _index.db file or project directory.",
|
|
),
|
|
backend: str = typer.Option(
|
|
"fastembed",
|
|
"--backend",
|
|
"-b",
|
|
help="Embedding backend: fastembed (local) or litellm (remote API).",
|
|
),
|
|
model: str = typer.Option(
|
|
"code",
|
|
"--model",
|
|
"-m",
|
|
help="Model: profile name for fastembed or model name for litellm.",
|
|
),
|
|
force: bool = typer.Option(
|
|
False,
|
|
"--force",
|
|
"-f",
|
|
help="Force regeneration even if embeddings exist.",
|
|
),
|
|
chunk_size: int = typer.Option(
|
|
2000,
|
|
"--chunk-size",
|
|
help="Maximum chunk size in characters.",
|
|
),
|
|
max_workers: int = typer.Option(
|
|
1,
|
|
"--max-workers",
|
|
"-w",
|
|
min=1,
|
|
help="Max concurrent API calls.",
|
|
),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
|
|
centralized: bool = typer.Option(
|
|
True,
|
|
"--centralized/--distributed",
|
|
"-c/-d",
|
|
help="Use centralized vector storage (default) or distributed.",
|
|
),
|
|
) -> None:
|
|
"""[Deprecated] Use 'codexlens index embeddings' instead."""
|
|
_deprecated_command_warning("embeddings-generate", "index embeddings")
|
|
index_embeddings(
|
|
path=path,
|
|
backend=backend,
|
|
model=model,
|
|
force=force,
|
|
chunk_size=chunk_size,
|
|
max_workers=max_workers,
|
|
json_mode=json_mode,
|
|
verbose=verbose,
|
|
centralized=centralized,
|
|
)
|
|
|
|
|
|
@app.command("init", hidden=True, deprecated=True)
|
|
def init_deprecated(
|
|
path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."),
|
|
language: Optional[List[str]] = typer.Option(None, "--language", "-l", help="Limit indexing to specific languages."),
|
|
workers: Optional[int] = typer.Option(None, "--workers", "-w", min=1, help="Parallel worker processes."),
|
|
force: bool = typer.Option(False, "--force", "-f", help="Force full reindex."),
|
|
no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation."),
|
|
backend: str = typer.Option("fastembed", "--backend", "-b", help="Embedding backend."),
|
|
model: str = typer.Option("code", "--model", "-m", help="Embedding model."),
|
|
max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls."),
|
|
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
|
|
) -> None:
|
|
"""[Deprecated] Use 'codexlens index init' instead."""
|
|
_deprecated_command_warning("init", "index init")
|
|
index_init(
|
|
path=path,
|
|
language=language,
|
|
workers=workers,
|
|
force=force,
|
|
no_embeddings=no_embeddings,
|
|
backend=backend,
|
|
model=model,
|
|
max_workers=max_workers,
|
|
json_mode=json_mode,
|
|
verbose=verbose,
|
|
)
|
|
|
|
|
|
@app.command("splade-index", hidden=True, deprecated=True)
|
|
def splade_index_deprecated(
|
|
path: Path = typer.Argument(..., help="Project path to index"),
|
|
rebuild: bool = typer.Option(False, "--rebuild", "-r", help="Force rebuild SPLADE index"),
|
|
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
|
|
) -> None:
|
|
"""[Deprecated] Use 'codexlens index splade' instead."""
|
|
_deprecated_command_warning("splade-index", "index splade")
|
|
index_splade(
|
|
path=path,
|
|
rebuild=rebuild,
|
|
verbose=verbose,
|
|
)
|
|
|
|
|
|
@app.command("cascade-index", hidden=True, deprecated=True)
|
|
def cascade_index_deprecated(
|
|
path: Annotated[Path, typer.Argument(help="Directory to index")],
|
|
force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False,
|
|
batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32,
|
|
json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
|
|
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False,
|
|
) -> None:
|
|
"""[Deprecated] Use 'codexlens index binary' instead."""
|
|
_deprecated_command_warning("cascade-index", "index binary")
|
|
index_binary(
|
|
path=path,
|
|
force=force,
|
|
batch_size=batch_size,
|
|
json_mode=json_mode,
|
|
verbose=verbose,
|
|
)
|
|
|
|
|
|
@app.command("index-migrate", hidden=True, deprecated=True)
|
|
def index_migrate_deprecated(
|
|
path: Annotated[Optional[str], typer.Argument(help="Project path to migrate")] = None,
|
|
dry_run: Annotated[bool, typer.Option("--dry-run", help="Show what would be migrated")] = False,
|
|
force: Annotated[bool, typer.Option("--force", help="Force migration")] = False,
|
|
json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
|
|
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose output")] = False,
|
|
) -> None:
|
|
"""[Deprecated] Use 'codexlens index migrate' instead."""
|
|
_deprecated_command_warning("index-migrate", "index migrate")
|
|
index_migrate_cmd(
|
|
path=path,
|
|
dry_run=dry_run,
|
|
force=force,
|
|
json_mode=json_mode,
|
|
verbose=verbose,
|
|
)
|
|
|