Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields in the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
This commit is contained in:
catlog22
2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions

View File

@@ -18,3 +18,7 @@ Requires-Dist: pathspec>=0.11
Provides-Extra: semantic
Requires-Dist: numpy>=1.24; extra == "semantic"
Requires-Dist: fastembed>=0.2; extra == "semantic"
Provides-Extra: encoding
Requires-Dist: chardet>=5.0; extra == "encoding"
Provides-Extra: full
Requires-Dist: tiktoken>=0.5.0; extra == "full"

View File

@@ -11,15 +11,23 @@ src/codexlens/entities.py
src/codexlens/errors.py
src/codexlens/cli/__init__.py
src/codexlens/cli/commands.py
src/codexlens/cli/model_manager.py
src/codexlens/cli/output.py
src/codexlens/parsers/__init__.py
src/codexlens/parsers/encoding.py
src/codexlens/parsers/factory.py
src/codexlens/parsers/tokenizer.py
src/codexlens/parsers/treesitter_parser.py
src/codexlens/search/__init__.py
src/codexlens/search/chain_search.py
src/codexlens/search/hybrid_search.py
src/codexlens/search/query_parser.py
src/codexlens/search/ranking.py
src/codexlens/semantic/__init__.py
src/codexlens/semantic/chunker.py
src/codexlens/semantic/code_extractor.py
src/codexlens/semantic/embedder.py
src/codexlens/semantic/graph_analyzer.py
src/codexlens/semantic/llm_enhancer.py
src/codexlens/semantic/vector_store.py
src/codexlens/storage/__init__.py
@@ -30,21 +38,45 @@ src/codexlens/storage/migration_manager.py
src/codexlens/storage/path_mapper.py
src/codexlens/storage/registry.py
src/codexlens/storage/sqlite_store.py
src/codexlens/storage/sqlite_utils.py
src/codexlens/storage/migrations/__init__.py
src/codexlens/storage/migrations/migration_001_normalize_keywords.py
src/codexlens/storage/migrations/migration_002_add_token_metadata.py
src/codexlens/storage/migrations/migration_003_code_relationships.py
src/codexlens/storage/migrations/migration_004_dual_fts.py
src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py
tests/test_chain_search_engine.py
tests/test_cli_hybrid_search.py
tests/test_cli_output.py
tests/test_code_extractor.py
tests/test_config.py
tests/test_dual_fts.py
tests/test_encoding.py
tests/test_entities.py
tests/test_errors.py
tests/test_file_cache.py
tests/test_graph_analyzer.py
tests/test_graph_cli.py
tests/test_graph_storage.py
tests/test_hybrid_chunker.py
tests/test_hybrid_search_e2e.py
tests/test_incremental_indexing.py
tests/test_llm_enhancer.py
tests/test_parser_integration.py
tests/test_parsers.py
tests/test_performance_optimizations.py
tests/test_query_parser.py
tests/test_rrf_fusion.py
tests/test_schema_cleanup_migration.py
tests/test_search_comprehensive.py
tests/test_search_full_coverage.py
tests/test_search_performance.py
tests/test_semantic.py
tests/test_semantic_search.py
tests/test_storage.py
tests/test_token_chunking.py
tests/test_token_storage.py
tests/test_tokenizer.py
tests/test_tokenizer_performance.py
tests/test_treesitter_parser.py
tests/test_vector_search_full.py

View File

@@ -7,6 +7,12 @@ tree-sitter-javascript>=0.25
tree-sitter-typescript>=0.23
pathspec>=0.11
[encoding]
chardet>=5.0
[full]
tiktoken>=0.5.0
[semantic]
numpy>=1.24
fastembed>=0.2

View File

@@ -2,6 +2,25 @@
from __future__ import annotations

import sys
import os

# Force UTF-8 encoding for the Windows console.
# Windows terminals often default to a legacy code page (e.g. GBK), which
# garbles non-ASCII output; forcing UTF-8 keeps CJK characters readable.
if sys.platform == "win32":
    # Set the Python I/O encoding env var only if the user hasn't set one.
    os.environ.setdefault("PYTHONIOENCODING", "utf-8")
    # Reconfigure stdout/stderr to use UTF-8 where supported (Python 3.7+
    # TextIOWrapper.reconfigure); errors="replace" avoids encode crashes.
    try:
        if hasattr(sys.stdout, "reconfigure"):
            sys.stdout.reconfigure(encoding="utf-8", errors="replace")
        if hasattr(sys.stderr, "reconfigure"):
            sys.stderr.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Fallback: some environments (captured/redirected streams) don't
        # support reconfigure; degrade gracefully and keep the CLI usable.
        pass

from .commands import app

__all__ = ["app"]

View File

@@ -181,31 +181,46 @@ def search(
limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector, pure-vector."),
weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""Search indexed file contents using SQLite FTS5.
"""Search indexed file contents using SQLite FTS5 or semantic vectors.
Uses chain search across directory indexes.
Use --depth to limit search recursion (0 = current dir only).
Search Modes:
- exact: Exact FTS using unicode61 tokenizer (default)
- fuzzy: Fuzzy FTS using trigram tokenizer
- hybrid: RRF fusion of exact + fuzzy (recommended)
- vector: Semantic vector search (future)
- exact: Exact FTS using unicode61 tokenizer (default) - for code identifiers
- fuzzy: Fuzzy FTS using trigram tokenizer - for typo-tolerant search
- hybrid: RRF fusion of exact + fuzzy + vector (recommended) - best recall
- vector: Vector search with exact FTS fallback - semantic + keyword
- pure-vector: Pure semantic vector search only - natural language queries
Vector Search Requirements:
Vector search modes require pre-generated embeddings.
Use 'codexlens embeddings-generate' to create embeddings first.
Hybrid Mode:
Default weights: exact=0.4, fuzzy=0.3, vector=0.3
Use --weights to customize (e.g., --weights 0.5,0.3,0.2)
Examples:
# Exact code search
codexlens search "authenticate_user" --mode exact
# Semantic search (requires embeddings)
codexlens search "how to verify user credentials" --mode pure-vector
# Best of both worlds
codexlens search "authentication" --mode hybrid
"""
_configure_logging(verbose)
search_path = path.expanduser().resolve()
# Validate mode
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
valid_modes = ["exact", "fuzzy", "hybrid", "vector", "pure-vector"]
if mode not in valid_modes:
if json_mode:
print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}")
@@ -244,8 +259,18 @@ def search(
engine = ChainSearchEngine(registry, mapper)
# Map mode to options
hybrid_mode = mode == "hybrid"
enable_fuzzy = mode in ["fuzzy", "hybrid"]
if mode == "exact":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False
elif mode == "fuzzy":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False
elif mode == "vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False # Vector + exact fallback
elif mode == "pure-vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True # Pure vector only
elif mode == "hybrid":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False
else:
raise ValueError(f"Invalid mode: {mode}")
options = SearchOptions(
depth=depth,
@@ -253,6 +278,8 @@ def search(
files_only=files_only,
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
enable_vector=enable_vector,
pure_vector=pure_vector,
hybrid_weights=hybrid_weights,
)
@@ -1573,3 +1600,483 @@ def semantic_list(
finally:
if registry is not None:
registry.close()
# ==================== Model Management Commands ====================


@app.command(name="model-list")
def model_list(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """List available embedding models and their installation status.

    Shows 4 model profiles (fast, code, multilingual, balanced) with:
    - Installation status
    - Model size and dimensions
    - Use case recommendations
    """
    try:
        # Imported lazily: model_manager requires the optional fastembed extra.
        from codexlens.cli.model_manager import list_models

        result = list_models()
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            models = data["models"]
            cache_dir = data["cache_dir"]
            cache_exists = data["cache_exists"]
            console.print("[bold]Available Embedding Models:[/bold]")
            console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n")
            table = Table(show_header=True, header_style="bold")
            table.add_column("Profile", style="cyan")
            table.add_column("Model Name", style="blue")
            table.add_column("Dims", justify="right")
            table.add_column("Size (MB)", justify="right")
            table.add_column("Status", justify="center")
            table.add_column("Use Case", style="dim")
            for model in models:
                status_icon = "[green]✓[/green]" if model["installed"] else "[dim]—[/dim]"
                # Installed models show the measured cache size; others an estimate.
                size_display = (
                    f"{model['actual_size_mb']:.1f}" if model["installed"]
                    else f"~{model['estimated_size_mb']}"
                )
                table.add_row(
                    model["profile"],
                    model["model_name"],
                    str(model["dimensions"]),
                    size_display,
                    status_icon,
                    model["use_case"][:40] + "..." if len(model["use_case"]) > 40 else model["use_case"],
                )
            console.print(table)
            console.print("\n[dim]Use 'codexlens model-download <profile>' to download a model[/dim]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception, so without this clause the
        # deliberate exits above were swallowed by the handler below and
        # re-reported as a command failure.
        raise
    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-list failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-download")
def model_download(
    profile: str = typer.Argument(..., help="Model profile to download (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Download an embedding model by profile name.

    Example:
        codexlens model-download code  # Download code-optimized model
    """
    try:
        from codexlens.cli.model_manager import download_model

        if not json_mode:
            console.print(f"[bold]Downloading model:[/bold] {profile}")
            console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n")
        # Stream progress to the console only in human-readable mode.
        progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]")
        result = download_model(profile, progress_callback=progress_callback)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[green]✓[/green] Model downloaded successfully!")
            console.print(f" Profile: {data['profile']}")
            console.print(f" Model: {data['model_name']}")
            console.print(f" Cache size: {data['cache_size_mb']:.1f} MB")
            console.print(f" Location: [dim]{data['cache_path']}[/dim]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception; re-raise intended exits so
        # they are not re-reported as failures by the handlers below.
        raise
    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-download failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-delete")
def model_delete(
    profile: str = typer.Argument(..., help="Model profile to delete (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Delete a downloaded embedding model from cache.

    Example:
        codexlens model-delete fast  # Delete fast model
    """
    try:
        from codexlens.cli.model_manager import delete_model

        if not json_mode:
            console.print(f"[bold yellow]Deleting model:[/bold yellow] {profile}")
        result = delete_model(profile)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[green]✓[/green] Model deleted successfully!")
            console.print(f" Profile: {data['profile']}")
            console.print(f" Model: {data['model_name']}")
            console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception; re-raise intended exits so
        # the generic handler below doesn't report them as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-delete failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-info")
def model_info(
    profile: str = typer.Argument(..., help="Model profile to get info (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Get detailed information about a model profile.

    Example:
        codexlens model-info code  # Get code model details
    """
    try:
        from codexlens.cli.model_manager import get_model_info

        result = get_model_info(profile)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[bold]Model Profile:[/bold] {data['profile']}")
            console.print(f" Model name: {data['model_name']}")
            console.print(f" Dimensions: {data['dimensions']}")
            console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}")
            # Only installed models have a measured cache size/location.
            if data['installed'] and data['actual_size_mb']:
                console.print(f" Cache size: {data['actual_size_mb']:.1f} MB")
                console.print(f" Location: [dim]{data['cache_path']}[/dim]")
            else:
                console.print(f" Estimated size: ~{data['estimated_size_mb']} MB")
            console.print(f"\n Description: {data['description']}")
            console.print(f" Use case: {data['use_case']}")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception; re-raise intended exits so
        # the generic handler below doesn't report them as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-info failed:[/red] {exc}")
        raise typer.Exit(code=1)
# ==================== Embedding Management Commands ====================


@app.command(name="embeddings-status")
def embeddings_status(
    path: Optional[Path] = typer.Argument(
        None,
        exists=True,
        help="Path to specific _index.db file or directory containing indexes. If not specified, uses default index root.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Check embedding status for one or all indexes.

    Shows embedding statistics including:
    - Number of chunks generated
    - File coverage percentage
    - Files missing embeddings

    Examples:
        codexlens embeddings-status  # Check all indexes
        codexlens embeddings-status ~/.codexlens/indexes/project/_index.db  # Check specific index
        codexlens embeddings-status ~/projects/my-app  # Check project (auto-finds index)
    """
    try:
        from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary

        # Determine what to check: no path means summarize every known index.
        if path is None:
            index_root = _get_index_root()
            result = get_embedding_stats_summary(index_root)
            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)
                data = result["result"]
                total = data["total_indexes"]
                with_emb = data["indexes_with_embeddings"]
                total_chunks = data["total_chunks"]
                console.print(f"[bold]Embedding Status Summary[/bold]")
                console.print(f"Index root: [dim]{index_root}[/dim]\n")
                console.print(f"Total indexes: {total}")
                console.print(f"Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}")
                console.print(f"Total chunks: {total_chunks:,}\n")
                if data["indexes"]:
                    table = Table(show_header=True, header_style="bold")
                    table.add_column("Project", style="cyan")
                    table.add_column("Files", justify="right")
                    table.add_column("Chunks", justify="right")
                    table.add_column("Coverage", justify="right")
                    table.add_column("Status", justify="center")
                    for idx_stat in data["indexes"]:
                        status_icon = "[green]✓[/green]" if idx_stat["has_embeddings"] else "[dim]—[/dim]"
                        coverage = f"{idx_stat['coverage_percent']:.1f}%" if idx_stat["has_embeddings"] else ""
                        table.add_row(
                            idx_stat["project"],
                            str(idx_stat["total_files"]),
                            f"{idx_stat['total_chunks']:,}" if idx_stat["has_embeddings"] else "0",
                            coverage,
                            status_icon,
                        )
                    console.print(table)
        else:
            # Check a specific index file, or resolve a project dir to its index.
            target_path = path.expanduser().resolve()
            if target_path.is_file() and target_path.name == "_index.db":
                # Direct index file
                index_path = target_path
            elif target_path.is_dir():
                # Map the project directory to its index via the registry.
                registry = RegistryStore()
                try:
                    registry.initialize()
                    mapper = PathMapper()
                    index_path = mapper.source_to_index_db(target_path)
                    if not index_path.exists():
                        console.print(f"[red]Error:[/red] No index found for {target_path}")
                        console.print("Run 'codexlens init' first to create an index")
                        raise typer.Exit(code=1)
                finally:
                    registry.close()
            else:
                console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
                raise typer.Exit(code=1)
            result = check_index_embeddings(index_path)
            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)
                data = result["result"]
                has_emb = data["has_embeddings"]
                console.print(f"[bold]Embedding Status[/bold]")
                console.print(f"Index: [dim]{data['index_path']}[/dim]\n")
                if has_emb:
                    console.print(f"[green]✓[/green] Embeddings available")
                    console.print(f" Total chunks: {data['total_chunks']:,}")
                    console.print(f" Total files: {data['total_files']:,}")
                    console.print(f" Files with embeddings: {data['files_with_chunks']:,}/{data['total_files']}")
                    console.print(f" Coverage: {data['coverage_percent']:.1f}%")
                    if data["files_without_chunks"] > 0:
                        console.print(f"\n[yellow]Warning:[/yellow] {data['files_without_chunks']} files missing embeddings")
                        if data["missing_files_sample"]:
                            console.print(" Sample missing files:")
                            for file in data["missing_files_sample"]:
                                console.print(f" [dim]{file}[/dim]")
                else:
                    console.print(f"[yellow]—[/yellow] No embeddings found")
                    console.print(f" Total files indexed: {data['total_files']:,}")
                    console.print("\n[dim]Generate embeddings with:[/dim]")
                    console.print(f" [cyan]codexlens embeddings-generate {index_path}[/cyan]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception, so the intentional exits
        # above were previously caught below and mis-reported as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Embeddings-status failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="embeddings-generate")
def embeddings_generate(
    path: Path = typer.Argument(
        ...,
        exists=True,
        help="Path to _index.db file or project directory.",
    ),
    model: str = typer.Option(
        "code",
        "--model",
        "-m",
        help="Model profile: fast, code, multilingual, balanced.",
    ),
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Force regeneration even if embeddings exist.",
    ),
    chunk_size: int = typer.Option(
        2000,
        "--chunk-size",
        help="Maximum chunk size in characters.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
) -> None:
    """Generate semantic embeddings for code search.

    Creates vector embeddings for all files in an index to enable
    semantic search capabilities. Embeddings are stored in the same
    database as the FTS index.

    Model Profiles:
    - fast: BAAI/bge-small-en-v1.5 (384 dims, ~80MB)
    - code: jinaai/jina-embeddings-v2-base-code (768 dims, ~150MB) [recommended]
    - multilingual: intfloat/multilingual-e5-large (1024 dims, ~1GB)
    - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dims, ~600MB)

    Examples:
        codexlens embeddings-generate ~/projects/my-app  # Auto-find index for project
        codexlens embeddings-generate ~/.codexlens/indexes/project/_index.db  # Specific index
        codexlens embeddings-generate ~/projects/my-app --model fast --force  # Regenerate with fast model
    """
    _configure_logging(verbose)
    try:
        from codexlens.cli.embedding_manager import generate_embeddings

        # Resolve the argument to a concrete _index.db path.
        target_path = path.expanduser().resolve()
        if target_path.is_file() and target_path.name == "_index.db":
            # Direct index file
            index_path = target_path
        elif target_path.is_dir():
            # Map the project directory to its index via the registry.
            registry = RegistryStore()
            try:
                registry.initialize()
                mapper = PathMapper()
                index_path = mapper.source_to_index_db(target_path)
                if not index_path.exists():
                    console.print(f"[red]Error:[/red] No index found for {target_path}")
                    console.print("Run 'codexlens init' first to create an index")
                    raise typer.Exit(code=1)
            finally:
                registry.close()
        else:
            console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
            raise typer.Exit(code=1)

        # Progress callback — only emits in verbose, human-readable mode.
        def progress_update(msg: str):
            if not json_mode and verbose:
                console.print(f" {msg}")

        # Bug fix: these header lines previously printed even with --json,
        # corrupting the machine-readable output stream. Other commands
        # (e.g. model-download) already guard their console output this way.
        if not json_mode:
            console.print(f"[bold]Generating embeddings[/bold]")
            console.print(f"Index: [dim]{index_path}[/dim]")
            console.print(f"Model: [cyan]{model}[/cyan]\n")
        result = generate_embeddings(
            index_path,
            model_profile=model,
            force=force,
            chunk_size=chunk_size,
            progress_callback=progress_update,
        )
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                error_msg = result.get("error", "Unknown error")
                console.print(f"[red]Error:[/red] {error_msg}")
                # Provide helpful hints for the two common failure modes.
                if "already has" in error_msg:
                    console.print("\n[dim]Use --force to regenerate existing embeddings[/dim]")
                elif "Semantic search not available" in error_msg:
                    console.print("\n[dim]Install semantic dependencies:[/dim]")
                    console.print(" [cyan]pip install codexlens[semantic][/cyan]")
                raise typer.Exit(code=1)
            data = result["result"]
            elapsed = data["elapsed_time"]
            console.print(f"[green]✓[/green] Embeddings generated successfully!")
            console.print(f" Model: {data['model_name']}")
            console.print(f" Chunks created: {data['chunks_created']:,}")
            console.print(f" Files processed: {data['files_processed']}")
            if data["files_failed"] > 0:
                console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]")
                if data["failed_files"]:
                    console.print(" [dim]First failures:[/dim]")
                    for file_path, error in data["failed_files"]:
                        console.print(f" [dim]{file_path}: {error}[/dim]")
            console.print(f" Time: {elapsed:.1f}s")
            console.print("\n[dim]Use vector search with:[/dim]")
            console.print(" [cyan]codexlens search 'your query' --mode pure-vector[/cyan]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception, so the intentional exits
        # above were previously caught below and mis-reported as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Embeddings-generate failed:[/red] {exc}")
        raise typer.Exit(code=1)

View File

@@ -0,0 +1,331 @@
"""Embedding Manager - Manage semantic embeddings for code indexes."""
import logging
import sqlite3
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
except ImportError:
SEMANTIC_AVAILABLE = False
logger = logging.getLogger(__name__)
def check_index_embeddings(index_path: Path) -> Dict[str, Any]:
    """Check if an index has embeddings and return statistics.

    Args:
        index_path: Path to an ``_index.db`` SQLite file.

    Returns:
        ``{"success": False, "error": ...}`` when the index is missing or
        unreadable, otherwise ``{"success": True, "result": {...}}`` where the
        result holds: has_embeddings, total_chunks, total_files,
        files_with_chunks, files_without_chunks, coverage_percent,
        missing_files_sample (up to 5 paths), index_path.

    Note:
        The return annotation previously used the builtin ``any`` function
        instead of ``typing.Any``; fixed here.
    """
    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }
    try:
        with sqlite3.connect(index_path) as conn:
            # The semantic_chunks table only exists once embeddings were generated.
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
            )
            table_exists = cursor.fetchone() is not None

            # Total indexed files is reported in both branches.
            cursor = conn.execute("SELECT COUNT(*) FROM files")
            total_files = cursor.fetchone()[0]

            if not table_exists:
                # No embeddings at all: zero coverage across every indexed file.
                return {
                    "success": True,
                    "result": {
                        "has_embeddings": False,
                        "total_chunks": 0,
                        "total_files": total_files,
                        "files_with_chunks": 0,
                        "files_without_chunks": total_files,
                        "coverage_percent": 0.0,
                        "missing_files_sample": [],
                        "index_path": str(index_path),
                    },
                }

            cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
            total_chunks = cursor.fetchone()[0]

            cursor = conn.execute(
                "SELECT COUNT(DISTINCT file_path) FROM semantic_chunks"
            )
            files_with_chunks = cursor.fetchone()[0]

            # Sample up to 5 indexed files that have no chunks, for diagnostics.
            cursor = conn.execute("""
                SELECT full_path
                FROM files
                WHERE full_path NOT IN (
                    SELECT DISTINCT file_path FROM semantic_chunks
                )
                LIMIT 5
            """)
            missing_files = [row[0] for row in cursor.fetchall()]

            return {
                "success": True,
                "result": {
                    "has_embeddings": total_chunks > 0,
                    "total_chunks": total_chunks,
                    "total_files": total_files,
                    "files_with_chunks": files_with_chunks,
                    "files_without_chunks": total_files - files_with_chunks,
                    "coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 1),
                    "missing_files_sample": missing_files,
                    "index_path": str(index_path),
                },
            }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to check embeddings: {str(e)}",
        }
def generate_embeddings(
    index_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    progress_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Generate embeddings for an index.

    Chunks every file stored in the index, embeds each chunk, and persists
    the chunks through VectorStore into the same database.

    Args:
        index_path: Path to _index.db file
        model_profile: Model profile (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters
        progress_callback: Optional callback receiving progress messages

    Returns:
        ``{"success": True, "result": {...}}`` with generation statistics, or
        ``{"success": False, "error": ...}`` on any failure (missing optional
        dependency, missing index, pre-existing chunks without ``force``,
        component initialization or read errors, empty index).
    """
    if not SEMANTIC_AVAILABLE:
        return {
            "success": False,
            "error": "Semantic search not available. Install with: pip install codexlens[semantic]",
        }
    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }
    # Check existing chunks; refuse to silently overwrite without --force.
    status = check_index_embeddings(index_path)
    if not status["success"]:
        return status
    existing_chunks = status["result"]["total_chunks"]
    if existing_chunks > 0 and not force:
        return {
            "success": False,
            "error": f"Index already has {existing_chunks} chunks. Use --force to regenerate.",
            "existing_chunks": existing_chunks,
        }
    if force and existing_chunks > 0:
        if progress_callback:
            progress_callback(f"Clearing {existing_chunks} existing chunks...")
        try:
            with sqlite3.connect(index_path) as conn:
                conn.execute("DELETE FROM semantic_chunks")
                conn.commit()
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to clear existing chunks: {str(e)}",
            }
    # Initialize components (embedder may download/load the model here).
    try:
        embedder = Embedder(profile=model_profile)
        vector_store = VectorStore(index_path)
        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
        if progress_callback:
            progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to initialize components: {str(e)}",
        }
    # Read files from index.
    # NOTE(review): loads every file's content into memory at once — fine for
    # typical indexes, may need streaming for very large repositories.
    try:
        with sqlite3.connect(index_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("SELECT full_path, content, language FROM files")
            files = cursor.fetchall()
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to read files: {str(e)}",
        }
    if len(files) == 0:
        return {
            "success": False,
            "error": "No files found in index",
        }
    if progress_callback:
        progress_callback(f"Processing {len(files)} files...")
    # Process each file; per-file failures are collected, not fatal.
    total_chunks = 0
    failed_files = []
    start_time = time.time()
    for idx, file_row in enumerate(files, 1):
        file_path = file_row["full_path"]
        content = file_row["content"]
        # Fall back to "python" when the indexed language is NULL/empty.
        language = file_row["language"] or "python"
        try:
            # Create chunks
            chunks = chunker.chunk_sliding_window(
                content,
                file_path=file_path,
                language=language
            )
            if not chunks:
                continue
            # Generate embeddings, one chunk at a time.
            for chunk in chunks:
                embedding = embedder.embed_single(chunk.content)
                chunk.embedding = embedding
            # Store chunks
            vector_store.add_chunks(chunks, file_path)
            total_chunks += len(chunks)
            if progress_callback:
                progress_callback(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")
        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}")
            failed_files.append((file_path, str(e)))
    elapsed_time = time.time() - start_time
    return {
        "success": True,
        "result": {
            "chunks_created": total_chunks,
            # Files that chunked to nothing still count as processed here.
            "files_processed": len(files) - len(failed_files),
            "files_failed": len(failed_files),
            "elapsed_time": elapsed_time,
            "model_profile": model_profile,
            "model_name": embedder.model_name,
            "failed_files": failed_files[:5],  # First 5 failures
            "index_path": str(index_path),
        },
    }
def find_all_indexes(scan_dir: Path) -> List[Path]:
    """Recursively collect every ``_index.db`` file below *scan_dir*.

    Args:
        scan_dir: Root directory of the scan.

    Returns:
        Paths of all ``_index.db`` files found; empty when *scan_dir*
        does not exist.
    """
    found: List[Path] = []
    if scan_dir.exists():
        found.extend(scan_dir.rglob("_index.db"))
    return found
def get_embedding_stats_summary(index_root: Path) -> Dict[str, Any]:
    """Aggregate embedding statistics for every index under *index_root*.

    Args:
        index_root: Root directory scanned recursively for ``_index.db`` files.

    Returns:
        ``{"success": True, "result": {...}}`` with total_indexes,
        indexes_with_embeddings, total_chunks, and a per-index breakdown
        (project name, path, chunk counts, coverage). Indexes that fail the
        per-index check are silently skipped from the breakdown.

    Note:
        The return annotation previously used the builtin ``any`` function
        instead of ``typing.Any``; fixed here.
    """
    indexes = find_all_indexes(index_root)
    if not indexes:
        return {
            "success": True,
            "result": {
                "total_indexes": 0,
                "indexes_with_embeddings": 0,
                "total_chunks": 0,
                "indexes": [],
            },
        }
    total_chunks = 0
    indexes_with_embeddings = 0
    index_stats = []
    for index_path in indexes:
        status = check_index_embeddings(index_path)
        if not status["success"]:
            # Unreadable indexes are skipped rather than failing the summary.
            continue
        result = status["result"]
        has_emb = result["has_embeddings"]
        chunks = result["total_chunks"]
        if has_emb:
            indexes_with_embeddings += 1
            total_chunks += chunks
        # The index's parent directory name doubles as the project name.
        project_name = index_path.parent.name
        index_stats.append({
            "project": project_name,
            "path": str(index_path),
            "has_embeddings": has_emb,
            "total_chunks": chunks,
            "total_files": result["total_files"],
            "coverage_percent": result.get("coverage_percent", 0),
        })
    return {
        "success": True,
        "result": {
            "total_indexes": len(indexes),
            "indexes_with_embeddings": indexes_with_embeddings,
            "total_chunks": total_chunks,
            "indexes": index_stats,
        },
    }

View File

@@ -0,0 +1,289 @@
"""Model Manager - Manage fastembed models for semantic search."""
import json
import os
import shutil
from pathlib import Path
from typing import Dict, List, Optional
try:
from fastembed import TextEmbedding
FASTEMBED_AVAILABLE = True
except ImportError:
FASTEMBED_AVAILABLE = False
# Model profiles with metadata.
# Keys are the profile names accepted by the CLI commands (model-list,
# model-download, embeddings-generate --model). "size_mb" is a rough
# download-size estimate; "dimensions" is the embedding vector width.
MODEL_PROFILES = {
    "fast": {
        "model_name": "BAAI/bge-small-en-v1.5",
        "dimensions": 384,
        "size_mb": 80,
        "description": "Fast, lightweight, English-optimized",
        "use_case": "Quick prototyping, resource-constrained environments",
    },
    "code": {
        "model_name": "jinaai/jina-embeddings-v2-base-code",
        "dimensions": 768,
        "size_mb": 150,
        "description": "Code-optimized, best for programming languages",
        "use_case": "Open source projects, code semantic search",
    },
    "multilingual": {
        "model_name": "intfloat/multilingual-e5-large",
        "dimensions": 1024,
        "size_mb": 1000,
        "description": "Multilingual + code support",
        "use_case": "Enterprise multilingual projects",
    },
    "balanced": {
        "model_name": "mixedbread-ai/mxbai-embed-large-v1",
        "dimensions": 1024,
        "size_mb": 600,
        "description": "High accuracy, general purpose",
        "use_case": "High-quality semantic search, balanced performance",
    },
}
def get_cache_dir() -> Path:
    """Return the directory where fastembed caches downloaded models.

    Resolution order:
      1. ``HF_HOME`` environment variable, if set.
      2. Windows: ``%LOCALAPPDATA%\\Temp\\fastembed_cache``.
      3. Other platforms: ``~/.cache/fastembed``.
    """
    hf_home = os.environ.get("HF_HOME")
    if hf_home is not None:
        return Path(hf_home)
    if os.name == "nt":  # Windows
        local_appdata = os.environ.get(
            "LOCALAPPDATA", Path.home() / "AppData" / "Local"
        )
        return Path(local_appdata) / "Temp" / "fastembed_cache"
    # Unix-like default
    return Path.home() / ".cache" / "fastembed"
def list_models() -> Dict[str, Any]:
    """List available model profiles and their installation status.

    Returns:
        Result dictionary with a ``success`` flag. On success, ``result``
        holds per-profile entries (installation status and sizes), the
        cache directory path, and whether the cache directory exists.
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }
    cache_dir = get_cache_dir()
    cache_exists = cache_dir.exists()
    models = []
    for profile, info in MODEL_PROFILES.items():
        model_name = info["model_name"]
        # Hugging Face hub cache layout: models--{org}--{model}
        model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
        installed = cache_exists and model_cache_path.exists()
        cache_size_mb = 0
        if installed:
            # Actual on-disk size of all cached model files, in MB.
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob("*")
                if f.is_file()
            )
            cache_size_mb = round(total_size / (1024 * 1024), 1)
        models.append({
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb if installed else None,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
        })
    return {
        "success": True,
        "result": {
            "models": models,
            "cache_dir": str(cache_dir),
            "cache_exists": cache_exists,
        },
    }
def download_model(profile: str, progress_callback: Optional[Callable[[str], None]] = None) -> Dict[str, Any]:
    """Download a model by profile name.

    Args:
        profile: Model profile name (fast, code, multilingual, balanced)
        progress_callback: Optional callable receiving human-readable
            progress messages.

    Returns:
        Result dictionary with success status; on success, ``result``
        includes the cached model's size and path.
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    model_name = MODEL_PROFILES[profile]["model_name"]
    try:
        # Instantiating TextEmbedding triggers the download into the local
        # cache when the model is not already present; the instance itself
        # is not needed afterwards.
        if progress_callback:
            progress_callback(f"Downloading {model_name}...")
        TextEmbedding(model_name=model_name)
        if progress_callback:
            progress_callback(f"Model {model_name} downloaded successfully")
        # Report the on-disk size of the freshly cached model.
        cache_dir = get_cache_dir()
        model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
        cache_size = 0
        if model_cache_path.exists():
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob("*")
                if f.is_file()
            )
            cache_size = round(total_size / (1024 * 1024), 1)
        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "cache_size_mb": cache_size,
                "cache_path": str(model_cache_path),
            },
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to download model: {str(e)}",
        }
def delete_model(profile: str) -> Dict[str, Any]:
    """Delete a downloaded model from cache.

    Args:
        profile: Model profile name to delete

    Returns:
        Result dictionary with success status; on success, ``result``
        includes the number of megabytes freed and the removed path.
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    model_name = MODEL_PROFILES[profile]["model_name"]
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
    if not model_cache_path.exists():
        return {
            "success": False,
            "error": f"Model {profile} ({model_name}) is not installed",
        }
    try:
        # Measure the size before deletion so it can be reported.
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        size_mb = round(total_size / (1024 * 1024), 1)
        # Remove the entire cached model directory.
        shutil.rmtree(model_cache_path)
        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "deleted_size_mb": size_mb,
                "cache_path": str(model_cache_path),
            },
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to delete model: {str(e)}",
        }
def get_model_info(profile: str) -> Dict[str, Any]:
    """Get detailed information about a model profile.

    Args:
        profile: Model profile name

    Returns:
        Result dictionary with model metadata, installation status, and
        (when installed) the actual cache size and path.
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    info = MODEL_PROFILES[profile]
    model_name = info["model_name"]
    # Installation status is inferred from the presence of the model's
    # directory in the Hugging Face hub cache layout.
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
    installed = model_cache_path.exists()
    cache_size_mb = None
    if installed:
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        cache_size_mb = round(total_size / (1024 * 1024), 1)
    return {
        "success": True,
        "result": {
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
            "cache_path": str(model_cache_path) if installed else None,
        },
    }

View File

@@ -3,6 +3,7 @@
from __future__ import annotations
import json
import sys
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any, Iterable, Mapping, Sequence
@@ -13,7 +14,9 @@ from rich.text import Text
from codexlens.entities import SearchResult, Symbol
console = Console()
# Force UTF-8 encoding for Windows console to properly display Chinese text
# Use force_terminal=True and legacy_windows=False to avoid GBK encoding issues
console = Console(force_terminal=True, legacy_windows=False)
def _to_jsonable(value: Any) -> Any:

View File

@@ -13,6 +13,7 @@ class Symbol(BaseModel):
name: str = Field(..., min_length=1)
kind: str = Field(..., min_length=1)
range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive")
file: Optional[str] = Field(default=None, description="Full path to the file containing this symbol")
token_count: Optional[int] = Field(default=None, description="Token count for symbol content")
symbol_type: Optional[str] = Field(default=None, description="Extended symbol type for filtering")

View File

@@ -35,6 +35,8 @@ class SearchOptions:
include_semantic: Whether to include semantic keyword search results
hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
enable_vector: Enable vector semantic search (default False)
pure_vector: If True, only use vector search without FTS fallback (default False)
hybrid_weights: Custom RRF weights for hybrid search (optional)
"""
depth: int = -1
@@ -46,6 +48,8 @@ class SearchOptions:
include_semantic: bool = False
hybrid_mode: bool = False
enable_fuzzy: bool = True
enable_vector: bool = False
pure_vector: bool = False
hybrid_weights: Optional[Dict[str, float]] = None
@@ -494,6 +498,8 @@ class ChainSearchEngine:
options.include_semantic,
options.hybrid_mode,
options.enable_fuzzy,
options.enable_vector,
options.pure_vector,
options.hybrid_weights
): idx_path
for idx_path in index_paths
@@ -520,6 +526,8 @@ class ChainSearchEngine:
include_semantic: bool = False,
hybrid_mode: bool = False,
enable_fuzzy: bool = True,
enable_vector: bool = False,
pure_vector: bool = False,
hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
"""Search a single index database.
@@ -527,12 +535,14 @@ class ChainSearchEngine:
Args:
index_path: Path to _index.db file
query: FTS5 query string
query: FTS5 query string (for FTS) or natural language query (for vector)
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
include_semantic: If True, also search semantic keywords and merge results
hybrid_mode: If True, use hybrid search with RRF fusion
enable_fuzzy: Enable fuzzy FTS in hybrid mode
enable_vector: Enable vector semantic search
pure_vector: If True, only use vector search without FTS fallback
hybrid_weights: Custom RRF weights for hybrid search
Returns:
@@ -547,10 +557,11 @@ class ChainSearchEngine:
query,
limit=limit,
enable_fuzzy=enable_fuzzy,
enable_vector=False, # Vector search not yet implemented
enable_vector=enable_vector,
pure_vector=pure_vector,
)
else:
# Legacy single-FTS search
# Single-FTS search (exact or fuzzy mode)
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
@@ -558,7 +569,11 @@ class ChainSearchEngine:
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Use fuzzy FTS if enable_fuzzy=True (mode="fuzzy"), otherwise exact FTS
if enable_fuzzy:
fts_results = store.search_fts_fuzzy(query, limit=limit)
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:

View File

@@ -50,35 +50,68 @@ class HybridSearchEngine:
limit: int = 20,
enable_fuzzy: bool = True,
enable_vector: bool = False,
pure_vector: bool = False,
) -> List[SearchResult]:
"""Execute hybrid search with parallel retrieval and RRF fusion.
Args:
index_path: Path to _index.db file
query: FTS5 query string
query: FTS5 query string (for FTS) or natural language query (for vector)
limit: Maximum results to return after fusion
enable_fuzzy: Enable fuzzy FTS search (default True)
enable_vector: Enable vector search (default False)
pure_vector: If True, only use vector search without FTS fallback (default False)
Returns:
List of SearchResult objects sorted by fusion score
Examples:
>>> engine = HybridSearchEngine()
>>> results = engine.search(Path("project/_index.db"), "authentication")
>>> # Hybrid search (exact + fuzzy + vector)
>>> results = engine.search(Path("project/_index.db"), "authentication",
... enable_vector=True)
>>> # Pure vector search (semantic only)
>>> results = engine.search(Path("project/_index.db"),
... "how to authenticate users",
... enable_vector=True, pure_vector=True)
>>> for r in results[:5]:
... print(f"{r.path}: {r.score:.3f}")
"""
# Determine which backends to use
backends = {"exact": True} # Always use exact search
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
backends = {}
if pure_vector:
# Pure vector mode: only use vector search, no FTS fallback
if enable_vector:
backends["vector"] = True
else:
# Invalid configuration: pure_vector=True but enable_vector=False
self.logger.warning(
"pure_vector=True requires enable_vector=True. "
"Falling back to exact search. "
"To use pure vector search, enable vector search mode."
)
backends["exact"] = True
else:
# Hybrid mode: always include exact search as baseline
backends["exact"] = True
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
# Execute parallel searches
results_map = self._search_parallel(index_path, query, backends, limit)
# Provide helpful message if pure-vector mode returns no results
if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
self.logger.warning(
"Pure vector search returned no results. "
"This usually means embeddings haven't been generated. "
"Run: codexlens embeddings-generate %s",
index_path.parent if index_path.name == "_index.db" else index_path
)
# Apply RRF fusion
# Filter weights to only active backends
active_weights = {
@@ -195,17 +228,67 @@ class HybridSearchEngine:
def _search_vector(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute vector search (placeholder for future implementation).
"""Execute vector similarity search using semantic embeddings.
Args:
index_path: Path to _index.db file
query: Query string
query: Natural language query string
limit: Maximum results
Returns:
List of SearchResult objects (empty for now)
List of SearchResult objects ordered by semantic similarity
"""
# Placeholder for vector search integration
# Will be implemented when VectorStore is available
self.logger.debug("Vector search not yet implemented")
return []
try:
# Check if semantic chunks table exists
import sqlite3
conn = sqlite3.connect(index_path)
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
)
has_semantic_table = cursor.fetchone() is not None
conn.close()
if not has_semantic_table:
self.logger.info(
"No embeddings found in index. "
"Generate embeddings with: codexlens embeddings-generate %s",
index_path.parent if index_path.name == "_index.db" else index_path
)
return []
# Initialize embedder and vector store
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
embedder = Embedder(profile="code") # Use code-optimized model
vector_store = VectorStore(index_path)
# Check if vector store has data
if vector_store.count_chunks() == 0:
self.logger.info(
"Vector store is empty (0 chunks). "
"Generate embeddings with: codexlens embeddings-generate %s",
index_path.parent if index_path.name == "_index.db" else index_path
)
return []
# Generate query embedding
query_embedding = embedder.embed_single(query)
# Search for similar chunks
results = vector_store.search_similar(
query_embedding=query_embedding,
top_k=limit,
min_score=0.0, # Return all results, let RRF handle filtering
return_full_content=True,
)
self.logger.debug("Vector search found %d results", len(results))
return results
except ImportError as exc:
self.logger.debug("Semantic dependencies not available: %s", exc)
return []
except Exception as exc:
self.logger.error("Vector search error: %s", exc)
return []

View File

@@ -8,21 +8,64 @@ from . import SEMANTIC_AVAILABLE
class Embedder:
"""Generate embeddings for code chunks using fastembed (ONNX-based)."""
"""Generate embeddings for code chunks using fastembed (ONNX-based).
MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBEDDING_DIM = 384
Supported Model Profiles:
- fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized
- code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages
- multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support
- balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose
"""
def __init__(self, model_name: str | None = None) -> None:
# Model profiles for different use cases
MODELS = {
"fast": "BAAI/bge-small-en-v1.5", # 384 dim - Fast, lightweight
"code": "jinaai/jina-embeddings-v2-base-code", # 768 dim - Code-optimized
"multilingual": "intfloat/multilingual-e5-large", # 1024 dim - Multilingual
"balanced": "mixedbread-ai/mxbai-embed-large-v1", # 1024 dim - High accuracy
}
# Dimension mapping for each model
MODEL_DIMS = {
"BAAI/bge-small-en-v1.5": 384,
"jinaai/jina-embeddings-v2-base-code": 768,
"intfloat/multilingual-e5-large": 1024,
"mixedbread-ai/mxbai-embed-large-v1": 1024,
}
# Default model (fast profile)
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
DEFAULT_PROFILE = "fast"
def __init__(self, model_name: str | None = None, profile: str | None = None) -> None:
"""Initialize embedder with model or profile.
Args:
model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code")
profile: Model profile shortcut ("fast", "code", "multilingual", "balanced")
If both provided, model_name takes precedence.
"""
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self.model_name = model_name or self.MODEL_NAME
# Resolve model name from profile or use explicit name
if model_name:
self.model_name = model_name
elif profile and profile in self.MODELS:
self.model_name = self.MODELS[profile]
else:
self.model_name = self.DEFAULT_MODEL
self._model = None
@property
def embedding_dim(self) -> int:
"""Get embedding dimension for current model."""
return self.MODEL_DIMS.get(self.model_name, 768) # Default to 768 if unknown
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:

View File

@@ -27,7 +27,6 @@ class SubdirLink:
name: str
index_path: Path
files_count: int
direct_files: int
last_updated: float
@@ -57,7 +56,7 @@ class DirIndexStore:
# Schema version for migration tracking
# Increment this when schema changes require migration
SCHEMA_VERSION = 4
SCHEMA_VERSION = 5
def __init__(self, db_path: str | Path) -> None:
"""Initialize directory index store.
@@ -133,6 +132,11 @@ class DirIndexStore:
from codexlens.storage.migrations.migration_004_dual_fts import upgrade
upgrade(conn)
# Migration v4 -> v5: Remove unused/redundant fields
if from_version < 5:
from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
upgrade(conn)
def close(self) -> None:
"""Close database connection."""
with self._lock:
@@ -208,19 +212,17 @@ class DirIndexStore:
# Replace symbols
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -374,19 +376,17 @@ class DirIndexStore:
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -644,25 +644,22 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
import json
import time
keywords_json = json.dumps(keywords)
generated_at = time.time()
# Write to semantic_metadata table (for backward compatibility)
# Write to semantic_metadata table (without keywords column)
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?, ?)
INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
summary=excluded.summary,
keywords=excluded.keywords,
purpose=excluded.purpose,
llm_tool=excluded.llm_tool,
generated_at=excluded.generated_at
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
(file_id, summary, purpose, llm_tool, generated_at),
)
# Write to normalized keywords tables for optimized search
@@ -709,9 +706,10 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
# Get semantic metadata (without keywords column)
row = conn.execute(
"""
SELECT summary, keywords, purpose, llm_tool, generated_at
SELECT summary, purpose, llm_tool, generated_at
FROM semantic_metadata WHERE file_id=?
""",
(file_id,),
@@ -720,11 +718,23 @@ class DirIndexStore:
if not row:
return None
import json
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
return {
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -856,15 +866,14 @@ class DirIndexStore:
Returns:
Tuple of (list of metadata dicts, total count)
"""
import json
with self._lock:
conn = self._get_connection()
# Query semantic metadata without keywords column
base_query = """
SELECT f.id as file_id, f.name as file_name, f.full_path,
f.language, f.line_count,
sm.summary, sm.keywords, sm.purpose,
sm.summary, sm.purpose,
sm.llm_tool, sm.generated_at
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
@@ -892,14 +901,30 @@ class DirIndexStore:
results = []
for row in rows:
file_id = int(row["file_id"])
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
results.append({
"file_id": int(row["file_id"]),
"file_id": file_id,
"file_name": row["file_name"],
"full_path": row["full_path"],
"language": row["language"],
"line_count": int(row["line_count"]) if row["line_count"] else 0,
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -922,7 +947,7 @@ class DirIndexStore:
name: Subdirectory name
index_path: Path to subdirectory's _index.db
files_count: Total files recursively
direct_files: Files directly in subdirectory
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
@@ -931,17 +956,17 @@ class DirIndexStore:
import time
last_updated = time.time()
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
INSERT INTO subdirs(name, index_path, files_count, direct_files, last_updated)
VALUES(?, ?, ?, ?, ?)
INSERT INTO subdirs(name, index_path, files_count, last_updated)
VALUES(?, ?, ?, ?)
ON CONFLICT(name) DO UPDATE SET
index_path=excluded.index_path,
files_count=excluded.files_count,
direct_files=excluded.direct_files,
last_updated=excluded.last_updated
""",
(name, index_path_str, files_count, direct_files, last_updated),
(name, index_path_str, files_count, last_updated),
)
conn.commit()
@@ -974,7 +999,7 @@ class DirIndexStore:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
ORDER BY name
"""
@@ -986,7 +1011,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
for row in rows
@@ -1005,7 +1029,7 @@ class DirIndexStore:
conn = self._get_connection()
row = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs WHERE name=?
""",
(name,),
@@ -1019,7 +1043,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
@@ -1031,41 +1054,71 @@ class DirIndexStore:
Args:
name: Subdirectory name
files_count: Total files recursively
direct_files: Files directly in subdirectory (optional)
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
import time
last_updated = time.time()
if direct_files is not None:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, direct_files=?, last_updated=?
WHERE name=?
""",
(files_count, direct_files, last_updated, name),
)
else:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
conn.commit()
# === Search ===
def search_fts(self, query: str, limit: int = 20) -> List[SearchResult]:
@staticmethod
def _enhance_fts_query(query: str) -> str:
    """Enhance FTS5 query to support prefix matching for simple queries.

    For simple single-word or multi-word queries without FTS5 operators,
    automatically adds a prefix wildcard (*) to enable partial matching.

    Examples:
        "loadPack" -> "loadPack*"
        "load package" -> "load* package*"
        "load*" -> "load*" (already has wildcard, unchanged)
        "NOT test" -> "NOT test" (has FTS operator, unchanged)

    Args:
        query: Original FTS5 query string

    Returns:
        Enhanced query string with prefix wildcards for simple queries
    """
    # Leave queries with explicit FTS5 syntax untouched: wildcards or
    # quoted phrases anywhere in the string.
    if '*' in query or '"' in query:
        return query
    tokens = query.split()
    # Detect boolean/proximity operators token-wise so that operators at
    # the start or end of the query (e.g. "NOT test") are also recognized;
    # a plain substring check with surrounding spaces would miss them.
    if any(token.upper() in {'AND', 'OR', 'NOT', 'NEAR'} for token in tokens):
        return query
    # Simple query: add a prefix wildcard to every word.
    return ' '.join(f"{token}*" for token in tokens)
def search_fts(self, query: str, limit: int = 20, enhance_query: bool = False) -> List[SearchResult]:
"""Full-text search in current directory files.
Uses files_fts_exact (unicode61 tokenizer) for exact token matching.
For fuzzy/substring search, use search_fts_fuzzy() instead.
Best Practice (from industry analysis of Codanna/Code-Index-MCP):
- Default: Respects exact user input without modification
- Users can manually add wildcards (e.g., "loadPack*") for prefix matching
- Automatic enhancement (enhance_query=True) is NOT recommended as it can
violate user intent and bring unwanted noise in results
Args:
query: FTS5 query string
limit: Maximum results to return
enhance_query: If True, automatically add prefix wildcards for simple queries.
Default False to respect exact user input.
Returns:
List of SearchResult objects sorted by relevance
@@ -1073,19 +1126,23 @@ class DirIndexStore:
Raises:
StorageError: If FTS search fails
"""
# Only enhance query if explicitly requested (not default behavior)
# Best practice: Let users control wildcards manually
final_query = self._enhance_fts_query(query) if enhance_query else query
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts) AS rank,
snippet(files_fts, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts
WHERE files_fts MATCH ?
SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_exact
WHERE files_fts_exact MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
(final_query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS search failed: {exc}") from exc
@@ -1249,10 +1306,11 @@ class DirIndexStore:
if kind:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ? AND kind=?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ? AND s.kind=?
ORDER BY s.name
LIMIT ?
""",
(pattern, kind, limit),
@@ -1260,10 +1318,11 @@ class DirIndexStore:
else:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ?
ORDER BY s.name
LIMIT ?
""",
(pattern, limit),
@@ -1274,6 +1333,7 @@ class DirIndexStore:
name=row["name"],
kind=row["kind"],
range=(row["start_line"], row["end_line"]),
file=row["full_path"],
)
for row in rows
]
@@ -1359,7 +1419,7 @@ class DirIndexStore:
"""
)
# Subdirectories table
# Subdirectories table (v5: removed direct_files)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS subdirs (
@@ -1367,13 +1427,12 @@ class DirIndexStore:
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
direct_files INTEGER DEFAULT 0,
last_updated REAL
)
"""
)
# Symbols table
# Symbols table (v5: removed token_count and symbol_type)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS symbols (
@@ -1382,9 +1441,7 @@ class DirIndexStore:
name TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER,
end_line INTEGER,
token_count INTEGER,
symbol_type TEXT
end_line INTEGER
)
"""
)
@@ -1421,14 +1478,13 @@ class DirIndexStore:
"""
)
# Semantic metadata table
# Semantic metadata table (v5: removed keywords column)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS semantic_metadata (
id INTEGER PRIMARY KEY,
file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
summary TEXT,
keywords TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL
@@ -1473,13 +1529,12 @@ class DirIndexStore:
"""
)
# Indexes
# Indexes (v5: removed idx_symbols_type)
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")

View File

@@ -0,0 +1,188 @@
"""
Migration 005: Remove unused and redundant database fields.
This migration removes four problematic fields identified by Gemini analysis:
1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
- Data: Migrated to normalized file_keywords table in migration 001
- Impact: Column now redundant, remove to prevent sync issues
2. **symbols.token_count** (unused - always NULL)
- Data: Never populated, always NULL
- Impact: No data loss, just removes unused column
3. **symbols.symbol_type** (redundant - duplicates kind)
- Data: Redundant with symbols.kind field
- Impact: No data loss, kind field contains same information
4. **subdirs.direct_files** (unused - never displayed)
- Data: Never used in queries or display logic
- Impact: No data loss, just removes unused column
Schema changes use table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""
import logging
from sqlite3 import Connection, Cursor

log = logging.getLogger(__name__)


def _table_exists(cursor: Cursor, table: str) -> bool:
    """Return True if *table* is present in sqlite_master."""
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table,),
    )
    return cursor.fetchone() is not None


def _rebuild_table(
    cursor: Cursor,
    table: str,
    create_sql: str,
    columns: list[str],
    index_statements: list[str],
) -> None:
    """Recreate *table* without its deprecated columns.

    Implements the SQLite table-recreation pattern: create a ``<table>_new``
    shadow table via *create_sql*, copy the surviving *columns* across, drop
    the original, rename the shadow into place, and rebuild the given
    *index_statements*.  The caller owns transaction handling.

    Args:
        cursor: Cursor on the database being migrated.
        table: Name of the existing table to rebuild.
        create_sql: CREATE TABLE statement for ``<table>_new``.
        columns: Columns to carry over (must exist in both old and new table).
        index_statements: CREATE INDEX statements to re-run after the rename.
    """
    cursor.execute(create_sql)
    col_list = ", ".join(columns)
    cursor.execute(
        f"INSERT INTO {table}_new ({col_list}) SELECT {col_list} FROM {table}"
    )
    cursor.execute(f"DROP TABLE {table}")
    cursor.execute(f"ALTER TABLE {table}_new RENAME TO {table}")
    for statement in index_statements:
        cursor.execute(statement)


def upgrade(db_conn: Connection) -> None:
    """Remove unused and redundant fields from schema.

    Drops ``semantic_metadata.keywords``, ``symbols.token_count``,
    ``symbols.symbol_type`` and ``subdirs.direct_files`` by rebuilding each
    table without the deprecated column(s).  All rebuilds run inside a single
    explicit transaction; a best-effort VACUUM reclaims space afterwards.
    Tables that are absent from the database are skipped.

    Args:
        db_conn: The SQLite database connection.

    Raises:
        Exception: Re-raised (after attempting ROLLBACK) if any rebuild fails.
    """
    cursor = db_conn.cursor()
    try:
        cursor.execute("BEGIN TRANSACTION")

        # Step 1: Remove semantic_metadata.keywords (superseded by the
        # normalized file_keywords table populated in migration 001).
        log.info("Removing semantic_metadata.keywords column...")
        if _table_exists(cursor, "semantic_metadata"):
            _rebuild_table(
                cursor,
                "semantic_metadata",
                """
                CREATE TABLE semantic_metadata_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL UNIQUE,
                    summary TEXT,
                    purpose TEXT,
                    llm_tool TEXT,
                    generated_at REAL,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
                """,
                ["id", "file_id", "summary", "purpose", "llm_tool", "generated_at"],
                [
                    "CREATE INDEX IF NOT EXISTS idx_semantic_file "
                    "ON semantic_metadata(file_id)"
                ],
            )
            log.info("Removed semantic_metadata.keywords column")
        else:
            log.info("semantic_metadata table does not exist, skipping")

        # Step 2: Remove symbols.token_count (never populated) and
        # symbols.symbol_type (duplicate of symbols.kind).
        log.info("Removing symbols.token_count and symbols.symbol_type columns...")
        if _table_exists(cursor, "symbols"):
            _rebuild_table(
                cursor,
                "symbols",
                """
                CREATE TABLE symbols_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL,
                    name TEXT NOT NULL,
                    kind TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
                """,
                ["id", "file_id", "name", "kind", "start_line", "end_line"],
                # idx_symbols_type is intentionally not recreated: it indexed
                # the dropped symbol_type column.
                [
                    "CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)",
                    "CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)",
                ],
            )
            log.info("Removed symbols.token_count and symbols.symbol_type columns")
        else:
            log.info("symbols table does not exist, skipping")

        # Step 3: Remove subdirs.direct_files (never queried or displayed).
        log.info("Removing subdirs.direct_files column...")
        if _table_exists(cursor, "subdirs"):
            _rebuild_table(
                cursor,
                "subdirs",
                """
                CREATE TABLE subdirs_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL UNIQUE,
                    index_path TEXT NOT NULL,
                    files_count INTEGER DEFAULT 0,
                    last_updated REAL
                )
                """,
                ["id", "name", "index_path", "files_count", "last_updated"],
                ["CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)"],
            )
            log.info("Removed subdirs.direct_files column")
        else:
            log.info("subdirs table does not exist, skipping")

        cursor.execute("COMMIT")
        log.info("Migration 005 completed successfully")

        # VACUUM cannot run inside a transaction, so it happens after COMMIT
        # and is best-effort: failure here must not fail the migration.
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
            log.info("VACUUM completed successfully")
        except Exception as exc:
            log.warning("VACUUM failed (non-critical): %s", exc)
    except Exception as exc:
        log.error("Migration 005 failed: %s", exc)
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            # ROLLBACK can itself fail (e.g. no transaction active); the
            # original error below is the one worth surfacing.
            pass
        raise
def downgrade(db_conn: Connection) -> None:
    """Reject downgrade attempts for the schema-cleanup migration.

    The columns dropped by ``upgrade`` (keywords, token_count, symbol_type,
    direct_files) were unused or redundant and their contents are gone, so
    this migration is deliberately one-way: a warning is logged and the
    attempt is refused.

    Args:
        db_conn: The SQLite database connection.

    Raises:
        NotImplementedError: Always; the removed data cannot be restored.
    """
    log.warning(
        "Migration 005 downgrade not supported - removed fields are unused/redundant. "
        "Data cannot be restored."
    )
    message = "Migration 005 downgrade not supported - this is a one-way migration"
    raise NotImplementedError(message)