Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields in the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
This commit is contained in:
catlog22
2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions

View File

@@ -2,6 +2,25 @@
from __future__ import annotations
import sys
import os
# Windows consoles default to a legacy code page (e.g. GBK) that mangles
# non-ASCII output; force UTF-8 so CJK text renders correctly.
if sys.platform == "win32":
    # Advertise UTF-8 to child processes / Python I/O without clobbering
    # an explicit user setting.
    os.environ.setdefault("PYTHONIOENCODING", "utf-8")
    try:
        # Switch both standard streams to UTF-8 where the runtime allows it.
        for _stream in (sys.stdout, sys.stderr):
            if hasattr(_stream, "reconfigure"):
                _stream.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Some environments (captured/wrapped streams) lack reconfigure
        # support; keep whatever encoding is already in place.
        pass
from .commands import app
__all__ = ["app"]

View File

@@ -181,31 +181,46 @@ def search(
limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector, pure-vector."),
weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""Search indexed file contents using SQLite FTS5.
"""Search indexed file contents using SQLite FTS5 or semantic vectors.
Uses chain search across directory indexes.
Use --depth to limit search recursion (0 = current dir only).
Search Modes:
- exact: Exact FTS using unicode61 tokenizer (default)
- fuzzy: Fuzzy FTS using trigram tokenizer
- hybrid: RRF fusion of exact + fuzzy (recommended)
- vector: Semantic vector search (future)
- exact: Exact FTS using unicode61 tokenizer (default) - for code identifiers
- fuzzy: Fuzzy FTS using trigram tokenizer - for typo-tolerant search
- hybrid: RRF fusion of exact + fuzzy + vector (recommended) - best recall
- vector: Vector search with exact FTS fallback - semantic + keyword
- pure-vector: Pure semantic vector search only - natural language queries
Vector Search Requirements:
Vector search modes require pre-generated embeddings.
Use 'codexlens embeddings-generate' to create embeddings first.
Hybrid Mode:
Default weights: exact=0.4, fuzzy=0.3, vector=0.3
Use --weights to customize (e.g., --weights 0.5,0.3,0.2)
Examples:
# Exact code search
codexlens search "authenticate_user" --mode exact
# Semantic search (requires embeddings)
codexlens search "how to verify user credentials" --mode pure-vector
# Best of both worlds
codexlens search "authentication" --mode hybrid
"""
_configure_logging(verbose)
search_path = path.expanduser().resolve()
# Validate mode
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
valid_modes = ["exact", "fuzzy", "hybrid", "vector", "pure-vector"]
if mode not in valid_modes:
if json_mode:
print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}")
@@ -244,8 +259,18 @@ def search(
engine = ChainSearchEngine(registry, mapper)
# Map mode to options
hybrid_mode = mode == "hybrid"
enable_fuzzy = mode in ["fuzzy", "hybrid"]
if mode == "exact":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False
elif mode == "fuzzy":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False
elif mode == "vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False # Vector + exact fallback
elif mode == "pure-vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True # Pure vector only
elif mode == "hybrid":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False
else:
raise ValueError(f"Invalid mode: {mode}")
options = SearchOptions(
depth=depth,
@@ -253,6 +278,8 @@ def search(
files_only=files_only,
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
enable_vector=enable_vector,
pure_vector=pure_vector,
hybrid_weights=hybrid_weights,
)
@@ -1573,3 +1600,483 @@ def semantic_list(
finally:
if registry is not None:
registry.close()
# ==================== Model Management Commands ====================


@app.command(name="model-list")
def model_list(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """List available embedding models and their installation status.

    Shows 4 model profiles (fast, code, multilingual, balanced) with:
    - Installation status
    - Model size and dimensions
    - Use case recommendations
    """
    try:
        from codexlens.cli.model_manager import list_models

        result = list_models()
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            models = data["models"]
            cache_dir = data["cache_dir"]
            cache_exists = data["cache_exists"]
            console.print("[bold]Available Embedding Models:[/bold]")
            console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n")
            table = Table(show_header=True, header_style="bold")
            table.add_column("Profile", style="cyan")
            table.add_column("Model Name", style="blue")
            table.add_column("Dims", justify="right")
            table.add_column("Size (MB)", justify="right")
            table.add_column("Status", justify="center")
            table.add_column("Use Case", style="dim")
            for model in models:
                status_icon = "[green]✓[/green]" if model["installed"] else "[dim]—[/dim]"
                # Installed models show their measured on-disk size; others
                # show the catalog estimate prefixed with "~".
                size_display = (
                    f"{model['actual_size_mb']:.1f}" if model["installed"]
                    else f"~{model['estimated_size_mb']}"
                )
                table.add_row(
                    model["profile"],
                    model["model_name"],
                    str(model["dimensions"]),
                    size_display,
                    status_icon,
                    model["use_case"][:40] + "..." if len(model["use_case"]) > 40 else model["use_case"],
                )
            console.print(table)
            console.print("\n[dim]Use 'codexlens model-download <profile>' to download a model[/dim]")
    except typer.Exit:
        # Re-raise CLI exits untouched: typer.Exit subclasses RuntimeError,
        # so the broad handler below would otherwise swallow it and report
        # a bogus "Model-list failed: 1" error.
        raise
    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-list failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-download")
def model_download(
    profile: str = typer.Argument(..., help="Model profile to download (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Download an embedding model by profile name.

    Example:
        codexlens model-download code  # Download code-optimized model
    """
    try:
        from codexlens.cli.model_manager import download_model

        if not json_mode:
            console.print(f"[bold]Downloading model:[/bold] {profile}")
            console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n")
        # Create progress callback for non-JSON mode only, so --json output
        # stays machine-parseable.
        progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]")
        result = download_model(profile, progress_callback=progress_callback)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[green]✓[/green] Model downloaded successfully!")
            console.print(f"  Profile: {data['profile']}")
            console.print(f"  Model: {data['model_name']}")
            console.print(f"  Cache size: {data['cache_size_mb']:.1f} MB")
            console.print(f"  Location: [dim]{data['cache_path']}[/dim]")
    except typer.Exit:
        # typer.Exit subclasses RuntimeError; re-raise it so the broad
        # handler below does not convert a normal CLI exit into a bogus
        # "Model-download failed" error.
        raise
    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-download failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-delete")
def model_delete(
    profile: str = typer.Argument(..., help="Model profile to delete (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Delete a downloaded embedding model from cache.

    Example:
        codexlens model-delete fast  # Delete fast model
    """
    try:
        from codexlens.cli.model_manager import delete_model

        if not json_mode:
            console.print(f"[bold yellow]Deleting model:[/bold yellow] {profile}")
        result = delete_model(profile)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[green]✓[/green] Model deleted successfully!")
            console.print(f"  Profile: {data['profile']}")
            console.print(f"  Model: {data['model_name']}")
            console.print(f"  Freed space: {data['deleted_size_mb']:.1f} MB")
    except typer.Exit:
        # Re-raise CLI exits; the broad Exception handler below would
        # otherwise swallow them (typer.Exit subclasses RuntimeError).
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-delete failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-info")
def model_info(
    profile: str = typer.Argument(..., help="Model profile to get info (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Get detailed information about a model profile.

    Example:
        codexlens model-info code  # Get code model details
    """
    try:
        from codexlens.cli.model_manager import get_model_info

        result = get_model_info(profile)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[bold]Model Profile:[/bold] {data['profile']}")
            console.print(f"  Model name: {data['model_name']}")
            console.print(f"  Dimensions: {data['dimensions']}")
            console.print(f"  Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}")
            if data['installed'] and data['actual_size_mb']:
                console.print(f"  Cache size: {data['actual_size_mb']:.1f} MB")
                console.print(f"  Location: [dim]{data['cache_path']}[/dim]")
            else:
                console.print(f"  Estimated size: ~{data['estimated_size_mb']} MB")
            console.print(f"\n  Description: {data['description']}")
            console.print(f"  Use case: {data['use_case']}")
    except typer.Exit:
        # Re-raise CLI exits so the broad handler below does not swallow
        # them (typer.Exit subclasses RuntimeError).
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-info failed:[/red] {exc}")
        raise typer.Exit(code=1)
# ==================== Embedding Management Commands ====================


@app.command(name="embeddings-status")
def embeddings_status(
    path: Optional[Path] = typer.Argument(
        None,
        exists=True,
        help="Path to specific _index.db file or directory containing indexes. If not specified, uses default index root.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Check embedding status for one or all indexes.

    Shows embedding statistics including:
    - Number of chunks generated
    - File coverage percentage
    - Files missing embeddings

    Examples:
        codexlens embeddings-status  # Check all indexes
        codexlens embeddings-status ~/.codexlens/indexes/project/_index.db  # Check specific index
        codexlens embeddings-status ~/projects/my-app  # Check project (auto-finds index)
    """
    try:
        from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary

        # Determine what to check
        if path is None:
            # No argument: summarize every index under the default root.
            index_root = _get_index_root()
            result = get_embedding_stats_summary(index_root)
            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)
                data = result["result"]
                total = data["total_indexes"]
                with_emb = data["indexes_with_embeddings"]
                total_chunks = data["total_chunks"]
                console.print(f"[bold]Embedding Status Summary[/bold]")
                console.print(f"Index root: [dim]{index_root}[/dim]\n")
                console.print(f"Total indexes: {total}")
                console.print(f"Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}")
                console.print(f"Total chunks: {total_chunks:,}\n")
                if data["indexes"]:
                    table = Table(show_header=True, header_style="bold")
                    table.add_column("Project", style="cyan")
                    table.add_column("Files", justify="right")
                    table.add_column("Chunks", justify="right")
                    table.add_column("Coverage", justify="right")
                    table.add_column("Status", justify="center")
                    for idx_stat in data["indexes"]:
                        status_icon = "[green]✓[/green]" if idx_stat["has_embeddings"] else "[dim]—[/dim]"
                        coverage = f"{idx_stat['coverage_percent']:.1f}%" if idx_stat["has_embeddings"] else ""
                        table.add_row(
                            idx_stat["project"],
                            str(idx_stat["total_files"]),
                            f"{idx_stat['total_chunks']:,}" if idx_stat["has_embeddings"] else "0",
                            coverage,
                            status_icon,
                        )
                    console.print(table)
        else:
            # Check specific index or find index for project
            target_path = path.expanduser().resolve()
            if target_path.is_file() and target_path.name == "_index.db":
                # Direct index file
                index_path = target_path
            elif target_path.is_dir():
                # Directory given: resolve it to its index via the registry.
                registry = RegistryStore()
                try:
                    registry.initialize()
                    mapper = PathMapper()
                    index_path = mapper.source_to_index_db(target_path)
                    if not index_path.exists():
                        console.print(f"[red]Error:[/red] No index found for {target_path}")
                        console.print("Run 'codexlens init' first to create an index")
                        raise typer.Exit(code=1)
                finally:
                    registry.close()
            else:
                console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
                raise typer.Exit(code=1)
            result = check_index_embeddings(index_path)
            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)
                data = result["result"]
                has_emb = data["has_embeddings"]
                console.print(f"[bold]Embedding Status[/bold]")
                console.print(f"Index: [dim]{data['index_path']}[/dim]\n")
                if has_emb:
                    console.print(f"[green]✓[/green] Embeddings available")
                    console.print(f"  Total chunks: {data['total_chunks']:,}")
                    console.print(f"  Total files: {data['total_files']:,}")
                    console.print(f"  Files with embeddings: {data['files_with_chunks']:,}/{data['total_files']}")
                    console.print(f"  Coverage: {data['coverage_percent']:.1f}%")
                    if data["files_without_chunks"] > 0:
                        console.print(f"\n[yellow]Warning:[/yellow] {data['files_without_chunks']} files missing embeddings")
                        if data["missing_files_sample"]:
                            console.print("  Sample missing files:")
                            for file in data["missing_files_sample"]:
                                console.print(f"    [dim]{file}[/dim]")
                else:
                    console.print(f"[yellow]—[/yellow] No embeddings found")
                    console.print(f"  Total files indexed: {data['total_files']:,}")
                    console.print("\n[dim]Generate embeddings with:[/dim]")
                    console.print(f"  [cyan]codexlens embeddings-generate {index_path}[/cyan]")
    except typer.Exit:
        # typer.Exit subclasses RuntimeError; re-raise so the broad handler
        # below does not report a spurious "Embeddings-status failed: 1".
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Embeddings-status failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="embeddings-generate")
def embeddings_generate(
    path: Path = typer.Argument(
        ...,
        exists=True,
        help="Path to _index.db file or project directory.",
    ),
    model: str = typer.Option(
        "code",
        "--model",
        "-m",
        help="Model profile: fast, code, multilingual, balanced.",
    ),
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Force regeneration even if embeddings exist.",
    ),
    chunk_size: int = typer.Option(
        2000,
        "--chunk-size",
        help="Maximum chunk size in characters.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
) -> None:
    """Generate semantic embeddings for code search.

    Creates vector embeddings for all files in an index to enable
    semantic search capabilities. Embeddings are stored in the same
    database as the FTS index.

    Model Profiles:
        - fast: BAAI/bge-small-en-v1.5 (384 dims, ~80MB)
        - code: jinaai/jina-embeddings-v2-base-code (768 dims, ~150MB) [recommended]
        - multilingual: intfloat/multilingual-e5-large (1024 dims, ~1GB)
        - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dims, ~600MB)

    Examples:
        codexlens embeddings-generate ~/projects/my-app  # Auto-find index for project
        codexlens embeddings-generate ~/.codexlens/indexes/project/_index.db  # Specific index
        codexlens embeddings-generate ~/projects/my-app --model fast --force  # Regenerate with fast model
    """
    _configure_logging(verbose)
    try:
        from codexlens.cli.embedding_manager import generate_embeddings

        # Resolve path
        target_path = path.expanduser().resolve()
        if target_path.is_file() and target_path.name == "_index.db":
            # Direct index file
            index_path = target_path
        elif target_path.is_dir():
            # Directory given: resolve it to its index via the registry.
            registry = RegistryStore()
            try:
                registry.initialize()
                mapper = PathMapper()
                index_path = mapper.source_to_index_db(target_path)
                if not index_path.exists():
                    console.print(f"[red]Error:[/red] No index found for {target_path}")
                    console.print("Run 'codexlens init' first to create an index")
                    raise typer.Exit(code=1)
            finally:
                registry.close()
        else:
            console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
            raise typer.Exit(code=1)

        # Progress callback: only chatty in verbose, non-JSON runs.
        def progress_update(msg: str):
            if not json_mode and verbose:
                console.print(f" {msg}")

        # Header is human-facing only; printing it in --json mode would
        # corrupt the machine-readable output stream.
        if not json_mode:
            console.print(f"[bold]Generating embeddings[/bold]")
            console.print(f"Index: [dim]{index_path}[/dim]")
            console.print(f"Model: [cyan]{model}[/cyan]\n")
        result = generate_embeddings(
            index_path,
            model_profile=model,
            force=force,
            chunk_size=chunk_size,
            progress_callback=progress_update,
        )
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                error_msg = result.get("error", "Unknown error")
                console.print(f"[red]Error:[/red] {error_msg}")
                # Provide helpful hints
                if "already has" in error_msg:
                    console.print("\n[dim]Use --force to regenerate existing embeddings[/dim]")
                elif "Semantic search not available" in error_msg:
                    console.print("\n[dim]Install semantic dependencies:[/dim]")
                    console.print("  [cyan]pip install codexlens[semantic][/cyan]")
                raise typer.Exit(code=1)
            data = result["result"]
            elapsed = data["elapsed_time"]
            console.print(f"[green]✓[/green] Embeddings generated successfully!")
            console.print(f"  Model: {data['model_name']}")
            console.print(f"  Chunks created: {data['chunks_created']:,}")
            console.print(f"  Files processed: {data['files_processed']}")
            if data["files_failed"] > 0:
                console.print(f"  [yellow]Files failed: {data['files_failed']}[/yellow]")
                if data["failed_files"]:
                    console.print("  [dim]First failures:[/dim]")
                    for file_path, error in data["failed_files"]:
                        console.print(f"    [dim]{file_path}: {error}[/dim]")
            console.print(f"  Time: {elapsed:.1f}s")
            console.print("\n[dim]Use vector search with:[/dim]")
            console.print("  [cyan]codexlens search 'your query' --mode pure-vector[/cyan]")
    except typer.Exit:
        # typer.Exit subclasses RuntimeError; re-raise so the broad handler
        # below does not report a spurious "Embeddings-generate failed: 1".
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Embeddings-generate failed:[/red] {exc}")
        raise typer.Exit(code=1)

View File

@@ -0,0 +1,331 @@
"""Embedding Manager - Manage semantic embeddings for code indexes."""
import logging
import sqlite3
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
# Semantic-search support is optional: the imports are guarded so this
# module stays importable (and can report unavailability) when the
# "semantic" extras are not installed.
try:
    from codexlens.semantic import SEMANTIC_AVAILABLE

    if SEMANTIC_AVAILABLE:
        from codexlens.semantic.embedder import Embedder
        from codexlens.semantic.vector_store import VectorStore
        from codexlens.semantic.chunker import Chunker, ChunkConfig
except ImportError:
    # codexlens.semantic itself is missing; treat semantic features as absent.
    SEMANTIC_AVAILABLE = False

logger = logging.getLogger(__name__)
def check_index_embeddings(index_path: Path) -> Dict[str, Any]:
    """Check if an index has embeddings and return statistics.

    Args:
        index_path: Path to _index.db file

    Returns:
        Dictionary with ``success`` plus, on success, a ``result`` payload
        containing chunk/file counts, coverage percentage, and a sample of
        files still missing embeddings.
    """
    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }
    try:
        # sqlite3's context manager only wraps transactions, it does NOT
        # close the connection — close explicitly so repeated status checks
        # do not leak database handles.
        conn = sqlite3.connect(index_path)
        try:
            # Check if semantic_chunks table exists
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
            )
            table_exists = cursor.fetchone() is not None
            if not table_exists:
                # Count total indexed files even without embeddings
                cursor = conn.execute("SELECT COUNT(*) FROM files")
                total_files = cursor.fetchone()[0]
                return {
                    "success": True,
                    "result": {
                        "has_embeddings": False,
                        "total_chunks": 0,
                        "total_files": total_files,
                        "files_with_chunks": 0,
                        "files_without_chunks": total_files,
                        "coverage_percent": 0.0,
                        "missing_files_sample": [],
                        "index_path": str(index_path),
                    },
                }
            # Count total chunks
            cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
            total_chunks = cursor.fetchone()[0]
            # Count total indexed files
            cursor = conn.execute("SELECT COUNT(*) FROM files")
            total_files = cursor.fetchone()[0]
            # Count files with embeddings
            cursor = conn.execute(
                "SELECT COUNT(DISTINCT file_path) FROM semantic_chunks"
            )
            files_with_chunks = cursor.fetchone()[0]
            # Get a sample of files without embeddings
            cursor = conn.execute("""
                SELECT full_path
                FROM files
                WHERE full_path NOT IN (
                    SELECT DISTINCT file_path FROM semantic_chunks
                )
                LIMIT 5
            """)
            missing_files = [row[0] for row in cursor.fetchall()]
            return {
                "success": True,
                "result": {
                    "has_embeddings": total_chunks > 0,
                    "total_chunks": total_chunks,
                    "total_files": total_files,
                    "files_with_chunks": files_with_chunks,
                    "files_without_chunks": total_files - files_with_chunks,
                    "coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 1),
                    "missing_files_sample": missing_files,
                    "index_path": str(index_path),
                },
            }
        finally:
            conn.close()
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to check embeddings: {str(e)}",
        }
def generate_embeddings(
    index_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    progress_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Generate embeddings for an index.

    Args:
        index_path: Path to _index.db file
        model_profile: Model profile (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters
        progress_callback: Optional callback receiving progress messages

    Returns:
        Result dictionary with generation statistics
    """
    if not SEMANTIC_AVAILABLE:
        return {
            "success": False,
            "error": "Semantic search not available. Install with: pip install codexlens[semantic]",
        }
    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }
    # Check existing chunks
    status = check_index_embeddings(index_path)
    if not status["success"]:
        return status
    existing_chunks = status["result"]["total_chunks"]
    if existing_chunks > 0 and not force:
        return {
            "success": False,
            "error": f"Index already has {existing_chunks} chunks. Use --force to regenerate.",
            "existing_chunks": existing_chunks,
        }
    if force and existing_chunks > 0:
        if progress_callback:
            progress_callback(f"Clearing {existing_chunks} existing chunks...")
        try:
            # Close explicitly: sqlite3's context manager only manages
            # transactions, not the connection lifetime.
            conn = sqlite3.connect(index_path)
            try:
                conn.execute("DELETE FROM semantic_chunks")
                conn.commit()
            finally:
                conn.close()
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to clear existing chunks: {str(e)}",
            }
    # Initialize components
    try:
        embedder = Embedder(profile=model_profile)
        vector_store = VectorStore(index_path)
        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
        if progress_callback:
            progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to initialize components: {str(e)}",
        }
    # Read files from index
    try:
        conn = sqlite3.connect(index_path)
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("SELECT full_path, content, language FROM files")
            files = cursor.fetchall()
        finally:
            conn.close()
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to read files: {str(e)}",
        }
    if len(files) == 0:
        return {
            "success": False,
            "error": "No files found in index",
        }
    if progress_callback:
        progress_callback(f"Processing {len(files)} files...")
    # Process each file; failures are collected rather than aborting the
    # whole run so one bad file cannot sink the batch.
    total_chunks = 0
    failed_files = []
    start_time = time.time()
    for idx, file_row in enumerate(files, 1):
        file_path = file_row["full_path"]
        content = file_row["content"]
        # NOTE(review): falls back to "python" when language is NULL —
        # confirm this default matches the chunker's expectations.
        language = file_row["language"] or "python"
        try:
            # Create chunks
            chunks = chunker.chunk_sliding_window(
                content,
                file_path=file_path,
                language=language
            )
            if not chunks:
                continue
            # Generate embeddings
            for chunk in chunks:
                embedding = embedder.embed_single(chunk.content)
                chunk.embedding = embedding
            # Store chunks
            vector_store.add_chunks(chunks, file_path)
            total_chunks += len(chunks)
            if progress_callback:
                progress_callback(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")
        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}")
            failed_files.append((file_path, str(e)))
    elapsed_time = time.time() - start_time
    return {
        "success": True,
        "result": {
            "chunks_created": total_chunks,
            "files_processed": len(files) - len(failed_files),
            "files_failed": len(failed_files),
            "elapsed_time": elapsed_time,
            "model_profile": model_profile,
            "model_name": embedder.model_name,
            "failed_files": failed_files[:5],  # First 5 failures
            "index_path": str(index_path),
        },
    }
def find_all_indexes(scan_dir: Path) -> List[Path]:
    """Recursively collect every ``_index.db`` file beneath *scan_dir*.

    Args:
        scan_dir: Directory to scan.

    Returns:
        Paths of all ``_index.db`` files found, in ``rglob`` traversal
        order; empty when *scan_dir* does not exist.
    """
    if not scan_dir.exists():
        return []
    return [candidate for candidate in scan_dir.rglob("_index.db")]
def get_embedding_stats_summary(index_root: Path) -> Dict[str, Any]:
    """Get summary statistics for all indexes in root directory.

    Args:
        index_root: Root directory containing indexes

    Returns:
        Summary statistics for all indexes: total index count, how many
        have embeddings, total chunk count, and a per-index breakdown.
    """
    indexes = find_all_indexes(index_root)
    if not indexes:
        return {
            "success": True,
            "result": {
                "total_indexes": 0,
                "indexes_with_embeddings": 0,
                "total_chunks": 0,
                "indexes": [],
            },
        }
    total_chunks = 0
    indexes_with_embeddings = 0
    index_stats = []
    for index_path in indexes:
        status = check_index_embeddings(index_path)
        # Indexes that fail the check are skipped rather than failing the
        # whole summary.
        if status["success"]:
            result = status["result"]
            has_emb = result["has_embeddings"]
            chunks = result["total_chunks"]
            if has_emb:
                indexes_with_embeddings += 1
                total_chunks += chunks
            # Extract project name from the index's parent directory
            project_name = index_path.parent.name
            index_stats.append({
                "project": project_name,
                "path": str(index_path),
                "has_embeddings": has_emb,
                "total_chunks": chunks,
                "total_files": result["total_files"],
                "coverage_percent": result.get("coverage_percent", 0),
            })
    return {
        "success": True,
        "result": {
            "total_indexes": len(indexes),
            "indexes_with_embeddings": indexes_with_embeddings,
            "total_chunks": total_chunks,
            "indexes": index_stats,
        },
    }

View File

@@ -0,0 +1,289 @@
"""Model Manager - Manage fastembed models for semantic search."""
import json
import os
import shutil
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
# fastembed is an optional extra; every function below degrades gracefully
# (returning an error dict) when it is absent.
try:
    from fastembed import TextEmbedding

    FASTEMBED_AVAILABLE = True
except ImportError:
    FASTEMBED_AVAILABLE = False

# Model profiles with metadata
# Maps a short profile alias to the backing HuggingFace model plus display
# metadata: embedding dimensions, approximate download size in MB, and
# guidance on when to choose each profile.
MODEL_PROFILES = {
    "fast": {
        "model_name": "BAAI/bge-small-en-v1.5",
        "dimensions": 384,
        "size_mb": 80,
        "description": "Fast, lightweight, English-optimized",
        "use_case": "Quick prototyping, resource-constrained environments",
    },
    "code": {
        "model_name": "jinaai/jina-embeddings-v2-base-code",
        "dimensions": 768,
        "size_mb": 150,
        "description": "Code-optimized, best for programming languages",
        "use_case": "Open source projects, code semantic search",
    },
    "multilingual": {
        "model_name": "intfloat/multilingual-e5-large",
        "dimensions": 1024,
        "size_mb": 1000,
        "description": "Multilingual + code support",
        "use_case": "Enterprise multilingual projects",
    },
    "balanced": {
        "model_name": "mixedbread-ai/mxbai-embed-large-v1",
        "dimensions": 1024,
        "size_mb": 600,
        "description": "High accuracy, general purpose",
        "use_case": "High-quality semantic search, balanced performance",
    },
}
def get_cache_dir() -> Path:
    """Get fastembed cache directory.

    Honors the ``HF_HOME`` environment variable when set; otherwise falls
    back to the platform default (``%LOCALAPPDATA%\\Temp\\fastembed_cache``
    on Windows, ``~/.cache/fastembed`` elsewhere).

    Returns:
        Path to cache directory
    """
    # An explicit HF_HOME always wins.
    if "HF_HOME" in os.environ:
        return Path(os.environ["HF_HOME"])
    if os.name == "nt":  # Windows
        local_appdata = os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local")
        return Path(local_appdata) / "Temp" / "fastembed_cache"
    # Unix-like
    return Path.home() / ".cache" / "fastembed"
def list_models() -> Dict[str, Any]:
    """List available model profiles and their installation status.

    Returns:
        Dictionary with model profiles, installed status, and cache info
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }
    cache_dir = get_cache_dir()
    cache_exists = cache_dir.exists()
    models = []
    for profile, info in MODEL_PROFILES.items():
        model_name = info["model_name"]
        # Check if model is cached (HF hub layout: models--{org}--{name})
        installed = False
        cache_size_mb = 0
        if cache_exists:
            model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
            if model_cache_path.exists():
                installed = True
                # Sum on-disk size of every file in the cached model dir
                total_size = sum(
                    f.stat().st_size
                    for f in model_cache_path.rglob("*")
                    if f.is_file()
                )
                cache_size_mb = round(total_size / (1024 * 1024), 1)
        models.append({
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb if installed else None,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
        })
    return {
        "success": True,
        "result": {
            "models": models,
            "cache_dir": str(cache_dir),
            "cache_exists": cache_exists,
        },
    }
def download_model(profile: str, progress_callback: Optional[Callable[[str], None]] = None) -> Dict[str, Any]:
    """Download a model by profile name.

    Args:
        profile: Model profile name (fast, code, multilingual, balanced)
        progress_callback: Optional callback function to report progress

    Returns:
        Result dictionary with success status
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    model_name = MODEL_PROFILES[profile]["model_name"]
    try:
        # Instantiating TextEmbedding triggers the download into the local
        # cache when the model is not already present; the instance itself
        # is not needed afterwards.
        if progress_callback:
            progress_callback(f"Downloading {model_name}...")
        TextEmbedding(model_name=model_name)
        if progress_callback:
            progress_callback(f"Model {model_name} downloaded successfully")
        # Get cache info
        cache_dir = get_cache_dir()
        model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
        cache_size = 0
        if model_cache_path.exists():
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob("*")
                if f.is_file()
            )
            cache_size = round(total_size / (1024 * 1024), 1)
        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "cache_size_mb": cache_size,
                "cache_path": str(model_cache_path),
            },
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to download model: {str(e)}",
        }
def delete_model(profile: str) -> Dict[str, Any]:
    """Delete a downloaded model from cache.

    Args:
        profile: Model profile name to delete

    Returns:
        Result dictionary with success status (includes freed size in MB)
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    model_name = MODEL_PROFILES[profile]["model_name"]
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
    if not model_cache_path.exists():
        return {
            "success": False,
            "error": f"Model {profile} ({model_name}) is not installed",
        }
    try:
        # Calculate size before deletion so we can report freed space
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        size_mb = round(total_size / (1024 * 1024), 1)
        # Delete model directory
        shutil.rmtree(model_cache_path)
        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "deleted_size_mb": size_mb,
                "cache_path": str(model_cache_path),
            },
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to delete model: {str(e)}",
        }
def get_model_info(profile: str) -> Dict[str, Any]:
    """Get detailed information about a model profile.

    Args:
        profile: Model profile name

    Returns:
        Result dictionary with model information (dimensions, sizes,
        installation status, cache path when installed)
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    info = MODEL_PROFILES[profile]
    model_name = info["model_name"]
    # Check installation status via the HF hub cache layout
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
    installed = model_cache_path.exists()
    cache_size_mb = None
    if installed:
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        cache_size_mb = round(total_size / (1024 * 1024), 1)
    return {
        "success": True,
        "result": {
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
            "cache_path": str(model_cache_path) if installed else None,
        },
    }

View File

@@ -3,6 +3,7 @@
from __future__ import annotations
import json
import sys
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any, Iterable, Mapping, Sequence
@@ -13,7 +14,9 @@ from rich.text import Text
from codexlens.entities import SearchResult, Symbol
console = Console()
# Force UTF-8 encoding for Windows console to properly display Chinese text
# Use force_terminal=True and legacy_windows=False to avoid GBK encoding issues
console = Console(force_terminal=True, legacy_windows=False)
def _to_jsonable(value: Any) -> Any: