Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-09 02:24:11 +08:00)
Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields from the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
@@ -18,3 +18,7 @@ Requires-Dist: pathspec>=0.11
Provides-Extra: semantic
Requires-Dist: numpy>=1.24; extra == "semantic"
Requires-Dist: fastembed>=0.2; extra == "semantic"
Provides-Extra: encoding
Requires-Dist: chardet>=5.0; extra == "encoding"
Provides-Extra: full
Requires-Dist: tiktoken>=0.5.0; extra == "full"
@@ -11,15 +11,23 @@ src/codexlens/entities.py
src/codexlens/errors.py
src/codexlens/cli/__init__.py
src/codexlens/cli/commands.py
src/codexlens/cli/model_manager.py
src/codexlens/cli/output.py
src/codexlens/parsers/__init__.py
src/codexlens/parsers/encoding.py
src/codexlens/parsers/factory.py
src/codexlens/parsers/tokenizer.py
src/codexlens/parsers/treesitter_parser.py
src/codexlens/search/__init__.py
src/codexlens/search/chain_search.py
src/codexlens/search/hybrid_search.py
src/codexlens/search/query_parser.py
src/codexlens/search/ranking.py
src/codexlens/semantic/__init__.py
src/codexlens/semantic/chunker.py
src/codexlens/semantic/code_extractor.py
src/codexlens/semantic/embedder.py
src/codexlens/semantic/graph_analyzer.py
src/codexlens/semantic/llm_enhancer.py
src/codexlens/semantic/vector_store.py
src/codexlens/storage/__init__.py
@@ -30,21 +38,45 @@ src/codexlens/storage/migration_manager.py
src/codexlens/storage/path_mapper.py
src/codexlens/storage/registry.py
src/codexlens/storage/sqlite_store.py
src/codexlens/storage/sqlite_utils.py
src/codexlens/storage/migrations/__init__.py
src/codexlens/storage/migrations/migration_001_normalize_keywords.py
src/codexlens/storage/migrations/migration_002_add_token_metadata.py
src/codexlens/storage/migrations/migration_003_code_relationships.py
src/codexlens/storage/migrations/migration_004_dual_fts.py
src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py
tests/test_chain_search_engine.py
tests/test_cli_hybrid_search.py
tests/test_cli_output.py
tests/test_code_extractor.py
tests/test_config.py
tests/test_dual_fts.py
tests/test_encoding.py
tests/test_entities.py
tests/test_errors.py
tests/test_file_cache.py
tests/test_graph_analyzer.py
tests/test_graph_cli.py
tests/test_graph_storage.py
tests/test_hybrid_chunker.py
tests/test_hybrid_search_e2e.py
tests/test_incremental_indexing.py
tests/test_llm_enhancer.py
tests/test_parser_integration.py
tests/test_parsers.py
tests/test_performance_optimizations.py
tests/test_query_parser.py
tests/test_rrf_fusion.py
tests/test_schema_cleanup_migration.py
tests/test_search_comprehensive.py
tests/test_search_full_coverage.py
tests/test_search_performance.py
tests/test_semantic.py
tests/test_semantic_search.py
tests/test_storage.py
tests/test_token_chunking.py
tests/test_token_storage.py
tests/test_tokenizer.py
tests/test_tokenizer_performance.py
tests/test_treesitter_parser.py
tests/test_vector_search_full.py
@@ -7,6 +7,12 @@ tree-sitter-javascript>=0.25
tree-sitter-typescript>=0.23
pathspec>=0.11

[encoding]
chardet>=5.0

[full]
tiktoken>=0.5.0

[semantic]
numpy>=1.24
fastembed>=0.2
@@ -2,6 +2,25 @@

from __future__ import annotations

import sys
import os

# Force UTF-8 encoding for Windows console
# This ensures Chinese characters display correctly instead of GBK garbled text
if sys.platform == "win32":
    # Set environment variable for Python I/O encoding
    os.environ.setdefault("PYTHONIOENCODING", "utf-8")

    # Reconfigure stdout/stderr to use UTF-8 if possible
    try:
        if hasattr(sys.stdout, "reconfigure"):
            sys.stdout.reconfigure(encoding="utf-8", errors="replace")
        if hasattr(sys.stderr, "reconfigure"):
            sys.stderr.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Fallback: some environments don't support reconfigure
        pass

from .commands import app

__all__ = ["app"]
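The reconfigure guard above can be checked interactively; a minimal sketch (not part of the commit), assuming a Python 3.7+ console:

```python
import sys

# Mirrors the package's guard: reconfigure() exists on real text streams,
# but may be missing on redirected/captured streams, hence the hasattr check.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

print(sys.stdout.encoding)  # expected: "utf-8" after importing codexlens.cli
```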
@@ -181,31 +181,46 @@ def search(
    limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
    depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
    files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
    mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector."),
    mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector, pure-vector."),
    weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
    """Search indexed file contents using SQLite FTS5.
    """Search indexed file contents using SQLite FTS5 or semantic vectors.

    Uses chain search across directory indexes.
    Use --depth to limit search recursion (0 = current dir only).

    Search Modes:
      - exact: Exact FTS using unicode61 tokenizer (default)
      - fuzzy: Fuzzy FTS using trigram tokenizer
      - hybrid: RRF fusion of exact + fuzzy (recommended)
      - vector: Semantic vector search (future)
      - exact: Exact FTS using unicode61 tokenizer (default) - for code identifiers
      - fuzzy: Fuzzy FTS using trigram tokenizer - for typo-tolerant search
      - hybrid: RRF fusion of exact + fuzzy + vector (recommended) - best recall
      - vector: Vector search with exact FTS fallback - semantic + keyword
      - pure-vector: Pure semantic vector search only - natural language queries

    Vector Search Requirements:
      Vector search modes require pre-generated embeddings.
      Use 'codexlens embeddings-generate' to create embeddings first.

    Hybrid Mode:
      Default weights: exact=0.4, fuzzy=0.3, vector=0.3
      Use --weights to customize (e.g., --weights 0.5,0.3,0.2)

    Examples:
      # Exact code search
      codexlens search "authenticate_user" --mode exact

      # Semantic search (requires embeddings)
      codexlens search "how to verify user credentials" --mode pure-vector

      # Best of both worlds
      codexlens search "authentication" --mode hybrid
    """
    _configure_logging(verbose)
    search_path = path.expanduser().resolve()

    # Validate mode
    valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
    valid_modes = ["exact", "fuzzy", "hybrid", "vector", "pure-vector"]
    if mode not in valid_modes:
        if json_mode:
            print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}")
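The --weights option takes a comma-separated triple that overrides the defaults documented above. The helper that actually fills hybrid_weights is outside this hunk, so the function name below is hypothetical; a minimal sketch of the parsing it implies:

```python
from typing import Dict, Optional

def parse_weights(raw: Optional[str]) -> Optional[Dict[str, float]]:
    """Hypothetical helper: turn '0.5,0.3,0.2' into RRF weights."""
    if raw is None:
        return None  # fall back to defaults: exact=0.4, fuzzy=0.3, vector=0.3
    # Raises ValueError if the string does not contain exactly three numbers.
    exact, fuzzy, vector = (float(part) for part in raw.split(","))
    return {"exact": exact, "fuzzy": fuzzy, "vector": vector}

assert parse_weights("0.5,0.3,0.2") == {"exact": 0.5, "fuzzy": 0.3, "vector": 0.2}
```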
@@ -244,8 +259,18 @@ def search(
    engine = ChainSearchEngine(registry, mapper)

    # Map mode to options
    hybrid_mode = mode == "hybrid"
    enable_fuzzy = mode in ["fuzzy", "hybrid"]
    if mode == "exact":
        hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False
    elif mode == "fuzzy":
        hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False
    elif mode == "vector":
        hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False  # Vector + exact fallback
    elif mode == "pure-vector":
        hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True  # Pure vector only
    elif mode == "hybrid":
        hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False
    else:
        raise ValueError(f"Invalid mode: {mode}")

    options = SearchOptions(
        depth=depth,
@@ -253,6 +278,8 @@ def search(
        files_only=files_only,
        hybrid_mode=hybrid_mode,
        enable_fuzzy=enable_fuzzy,
        enable_vector=enable_vector,
        pure_vector=pure_vector,
        hybrid_weights=hybrid_weights,
    )
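The if/elif chain above encodes a fixed truth table; the same mapping can be expressed as a dict, which makes the per-mode flags easier to scan at a glance. A sketch only, not the committed code:

```python
# (hybrid_mode, enable_fuzzy, enable_vector, pure_vector) per search mode
MODE_FLAGS = {
    "exact":       (False, False, False, False),
    "fuzzy":       (False, True,  False, False),
    "vector":      (True,  False, True,  False),  # vector + exact FTS fallback
    "pure-vector": (True,  False, True,  True),   # vector only, no fallback
    "hybrid":      (True,  True,  True,  False),  # exact + fuzzy + vector via RRF
}

hybrid_mode, enable_fuzzy, enable_vector, pure_vector = MODE_FLAGS["hybrid"]
```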
@@ -1573,3 +1600,483 @@ def semantic_list(
    finally:
        if registry is not None:
            registry.close()


# ==================== Model Management Commands ====================


@app.command(name="model-list")
def model_list(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """List available embedding models and their installation status.

    Shows 4 model profiles (fast, code, multilingual, balanced) with:
      - Installation status
      - Model size and dimensions
      - Use case recommendations
    """
    try:
        from codexlens.cli.model_manager import list_models

        result = list_models()

        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)

            data = result["result"]
            models = data["models"]
            cache_dir = data["cache_dir"]
            cache_exists = data["cache_exists"]

            console.print("[bold]Available Embedding Models:[/bold]")
            console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n")

            table = Table(show_header=True, header_style="bold")
            table.add_column("Profile", style="cyan")
            table.add_column("Model Name", style="blue")
            table.add_column("Dims", justify="right")
            table.add_column("Size (MB)", justify="right")
            table.add_column("Status", justify="center")
            table.add_column("Use Case", style="dim")

            for model in models:
                status_icon = "[green]✓[/green]" if model["installed"] else "[dim]—[/dim]"
                size_display = (
                    f"{model['actual_size_mb']:.1f}" if model["installed"]
                    else f"~{model['estimated_size_mb']}"
                )
                table.add_row(
                    model["profile"],
                    model["model_name"],
                    str(model["dimensions"]),
                    size_display,
                    status_icon,
                    model["use_case"][:40] + "..." if len(model["use_case"]) > 40 else model["use_case"],
                )

            console.print(table)
            console.print("\n[dim]Use 'codexlens model-download <profile>' to download a model[/dim]")

    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-list failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-download")
|
||||
def model_download(
|
||||
profile: str = typer.Argument(..., help="Model profile to download (fast, code, multilingual, balanced)."),
|
||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||
) -> None:
|
||||
"""Download an embedding model by profile name.
|
||||
|
||||
Example:
|
||||
codexlens model-download code # Download code-optimized model
|
||||
"""
|
||||
try:
|
||||
from codexlens.cli.model_manager import download_model
|
||||
|
||||
if not json_mode:
|
||||
console.print(f"[bold]Downloading model:[/bold] {profile}")
|
||||
console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n")
|
||||
|
||||
# Create progress callback for non-JSON mode
|
||||
progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]")
|
||||
|
||||
result = download_model(profile, progress_callback=progress_callback)
|
||||
|
||||
if json_mode:
|
||||
print_json(**result)
|
||||
else:
|
||||
if not result["success"]:
|
||||
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
data = result["result"]
|
||||
console.print(f"[green]✓[/green] Model downloaded successfully!")
|
||||
console.print(f" Profile: {data['profile']}")
|
||||
console.print(f" Model: {data['model_name']}")
|
||||
console.print(f" Cache size: {data['cache_size_mb']:.1f} MB")
|
||||
console.print(f" Location: [dim]{data['cache_path']}[/dim]")
|
||||
|
||||
except ImportError:
|
||||
if json_mode:
|
||||
print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
|
||||
else:
|
||||
console.print("[red]Error:[/red] fastembed not installed")
|
||||
console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
|
||||
raise typer.Exit(code=1)
|
||||
except Exception as exc:
|
||||
if json_mode:
|
||||
print_json(success=False, error=str(exc))
|
||||
else:
|
||||
console.print(f"[red]Model-download failed:[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command(name="model-delete")
|
||||
def model_delete(
|
||||
profile: str = typer.Argument(..., help="Model profile to delete (fast, code, multilingual, balanced)."),
|
||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||
) -> None:
|
||||
"""Delete a downloaded embedding model from cache.
|
||||
|
||||
Example:
|
||||
codexlens model-delete fast # Delete fast model
|
||||
"""
|
||||
try:
|
||||
from codexlens.cli.model_manager import delete_model
|
||||
|
||||
if not json_mode:
|
||||
console.print(f"[bold yellow]Deleting model:[/bold yellow] {profile}")
|
||||
|
||||
result = delete_model(profile)
|
||||
|
||||
if json_mode:
|
||||
print_json(**result)
|
||||
else:
|
||||
if not result["success"]:
|
||||
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
data = result["result"]
|
||||
console.print(f"[green]✓[/green] Model deleted successfully!")
|
||||
console.print(f" Profile: {data['profile']}")
|
||||
console.print(f" Model: {data['model_name']}")
|
||||
console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB")
|
||||
|
||||
except Exception as exc:
|
||||
if json_mode:
|
||||
print_json(success=False, error=str(exc))
|
||||
else:
|
||||
console.print(f"[red]Model-delete failed:[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
@app.command(name="model-info")
|
||||
def model_info(
|
||||
profile: str = typer.Argument(..., help="Model profile to get info (fast, code, multilingual, balanced)."),
|
||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||
) -> None:
|
||||
"""Get detailed information about a model profile.
|
||||
|
||||
Example:
|
||||
codexlens model-info code # Get code model details
|
||||
"""
|
||||
try:
|
||||
from codexlens.cli.model_manager import get_model_info
|
||||
|
||||
result = get_model_info(profile)
|
||||
|
||||
if json_mode:
|
||||
print_json(**result)
|
||||
else:
|
||||
if not result["success"]:
|
||||
console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
data = result["result"]
|
||||
console.print(f"[bold]Model Profile:[/bold] {data['profile']}")
|
||||
console.print(f" Model name: {data['model_name']}")
|
||||
console.print(f" Dimensions: {data['dimensions']}")
|
||||
console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}")
|
||||
if data['installed'] and data['actual_size_mb']:
|
||||
console.print(f" Cache size: {data['actual_size_mb']:.1f} MB")
|
||||
console.print(f" Location: [dim]{data['cache_path']}[/dim]")
|
||||
else:
|
||||
console.print(f" Estimated size: ~{data['estimated_size_mb']} MB")
|
||||
console.print(f"\n Description: {data['description']}")
|
||||
console.print(f" Use case: {data['use_case']}")
|
||||
|
||||
except Exception as exc:
|
||||
if json_mode:
|
||||
print_json(success=False, error=str(exc))
|
||||
else:
|
||||
console.print(f"[red]Model-info failed:[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
# ==================== Embedding Management Commands ====================


@app.command(name="embeddings-status")
def embeddings_status(
    path: Optional[Path] = typer.Argument(
        None,
        exists=True,
        help="Path to specific _index.db file or directory containing indexes. If not specified, uses default index root.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Check embedding status for one or all indexes.

    Shows embedding statistics including:
      - Number of chunks generated
      - File coverage percentage
      - Files missing embeddings

    Examples:
      codexlens embeddings-status                                          # Check all indexes
      codexlens embeddings-status ~/.codexlens/indexes/project/_index.db   # Check specific index
      codexlens embeddings-status ~/projects/my-app                        # Check project (auto-finds index)
    """
    try:
        from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary

        # Determine what to check
        if path is None:
            # Check all indexes in default root
            index_root = _get_index_root()
            result = get_embedding_stats_summary(index_root)

            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)

                data = result["result"]
                total = data["total_indexes"]
                with_emb = data["indexes_with_embeddings"]
                total_chunks = data["total_chunks"]

                console.print(f"[bold]Embedding Status Summary[/bold]")
                console.print(f"Index root: [dim]{index_root}[/dim]\n")
                console.print(f"Total indexes: {total}")
                console.print(f"Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}")
                console.print(f"Total chunks: {total_chunks:,}\n")

                if data["indexes"]:
                    table = Table(show_header=True, header_style="bold")
                    table.add_column("Project", style="cyan")
                    table.add_column("Files", justify="right")
                    table.add_column("Chunks", justify="right")
                    table.add_column("Coverage", justify="right")
                    table.add_column("Status", justify="center")

                    for idx_stat in data["indexes"]:
                        status_icon = "[green]✓[/green]" if idx_stat["has_embeddings"] else "[dim]—[/dim]"
                        coverage = f"{idx_stat['coverage_percent']:.1f}%" if idx_stat["has_embeddings"] else "—"

                        table.add_row(
                            idx_stat["project"],
                            str(idx_stat["total_files"]),
                            f"{idx_stat['total_chunks']:,}" if idx_stat["has_embeddings"] else "0",
                            coverage,
                            status_icon,
                        )

                    console.print(table)

        else:
            # Check specific index or find index for project
            target_path = path.expanduser().resolve()

            if target_path.is_file() and target_path.name == "_index.db":
                # Direct index file
                index_path = target_path
            elif target_path.is_dir():
                # Try to find index for this project
                registry = RegistryStore()
                try:
                    registry.initialize()
                    mapper = PathMapper()
                    index_path = mapper.source_to_index_db(target_path)

                    if not index_path.exists():
                        console.print(f"[red]Error:[/red] No index found for {target_path}")
                        console.print("Run 'codexlens init' first to create an index")
                        raise typer.Exit(code=1)
                finally:
                    registry.close()
            else:
                console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
                raise typer.Exit(code=1)

            result = check_index_embeddings(index_path)

            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)

                data = result["result"]
                has_emb = data["has_embeddings"]

                console.print(f"[bold]Embedding Status[/bold]")
                console.print(f"Index: [dim]{data['index_path']}[/dim]\n")

                if has_emb:
                    console.print(f"[green]✓[/green] Embeddings available")
                    console.print(f"  Total chunks: {data['total_chunks']:,}")
                    console.print(f"  Total files: {data['total_files']:,}")
                    console.print(f"  Files with embeddings: {data['files_with_chunks']:,}/{data['total_files']}")
                    console.print(f"  Coverage: {data['coverage_percent']:.1f}%")

                    if data["files_without_chunks"] > 0:
                        console.print(f"\n[yellow]Warning:[/yellow] {data['files_without_chunks']} files missing embeddings")
                        if data["missing_files_sample"]:
                            console.print("  Sample missing files:")
                            for file in data["missing_files_sample"]:
                                console.print(f"    [dim]{file}[/dim]")
                else:
                    console.print(f"[yellow]—[/yellow] No embeddings found")
                    console.print(f"  Total files indexed: {data['total_files']:,}")
                    console.print("\n[dim]Generate embeddings with:[/dim]")
                    console.print(f"  [cyan]codexlens embeddings-generate {index_path}[/cyan]")

    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Embeddings-status failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="embeddings-generate")
|
||||
def embeddings_generate(
|
||||
path: Path = typer.Argument(
|
||||
...,
|
||||
exists=True,
|
||||
help="Path to _index.db file or project directory.",
|
||||
),
|
||||
model: str = typer.Option(
|
||||
"code",
|
||||
"--model",
|
||||
"-m",
|
||||
help="Model profile: fast, code, multilingual, balanced.",
|
||||
),
|
||||
force: bool = typer.Option(
|
||||
False,
|
||||
"--force",
|
||||
"-f",
|
||||
help="Force regeneration even if embeddings exist.",
|
||||
),
|
||||
chunk_size: int = typer.Option(
|
||||
2000,
|
||||
"--chunk-size",
|
||||
help="Maximum chunk size in characters.",
|
||||
),
|
||||
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
|
||||
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
|
||||
) -> None:
|
||||
"""Generate semantic embeddings for code search.
|
||||
|
||||
Creates vector embeddings for all files in an index to enable
|
||||
semantic search capabilities. Embeddings are stored in the same
|
||||
database as the FTS index.
|
||||
|
||||
Model Profiles:
|
||||
- fast: BAAI/bge-small-en-v1.5 (384 dims, ~80MB)
|
||||
- code: jinaai/jina-embeddings-v2-base-code (768 dims, ~150MB) [recommended]
|
||||
- multilingual: intfloat/multilingual-e5-large (1024 dims, ~1GB)
|
||||
- balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dims, ~600MB)
|
||||
|
||||
Examples:
|
||||
codexlens embeddings-generate ~/projects/my-app # Auto-find index for project
|
||||
codexlens embeddings-generate ~/.codexlens/indexes/project/_index.db # Specific index
|
||||
codexlens embeddings-generate ~/projects/my-app --model fast --force # Regenerate with fast model
|
||||
"""
|
||||
_configure_logging(verbose)
|
||||
|
||||
try:
|
||||
from codexlens.cli.embedding_manager import generate_embeddings
|
||||
|
||||
# Resolve path
|
||||
target_path = path.expanduser().resolve()
|
||||
|
||||
if target_path.is_file() and target_path.name == "_index.db":
|
||||
# Direct index file
|
||||
index_path = target_path
|
||||
elif target_path.is_dir():
|
||||
# Try to find index for this project
|
||||
registry = RegistryStore()
|
||||
try:
|
||||
registry.initialize()
|
||||
mapper = PathMapper()
|
||||
index_path = mapper.source_to_index_db(target_path)
|
||||
|
||||
if not index_path.exists():
|
||||
console.print(f"[red]Error:[/red] No index found for {target_path}")
|
||||
console.print("Run 'codexlens init' first to create an index")
|
||||
raise typer.Exit(code=1)
|
||||
finally:
|
||||
registry.close()
|
||||
else:
|
||||
console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
# Progress callback
|
||||
def progress_update(msg: str):
|
||||
if not json_mode and verbose:
|
||||
console.print(f" {msg}")
|
||||
|
||||
console.print(f"[bold]Generating embeddings[/bold]")
|
||||
console.print(f"Index: [dim]{index_path}[/dim]")
|
||||
console.print(f"Model: [cyan]{model}[/cyan]\n")
|
||||
|
||||
result = generate_embeddings(
|
||||
index_path,
|
||||
model_profile=model,
|
||||
force=force,
|
||||
chunk_size=chunk_size,
|
||||
progress_callback=progress_update,
|
||||
)
|
||||
|
||||
if json_mode:
|
||||
print_json(**result)
|
||||
else:
|
||||
if not result["success"]:
|
||||
error_msg = result.get("error", "Unknown error")
|
||||
console.print(f"[red]Error:[/red] {error_msg}")
|
||||
|
||||
# Provide helpful hints
|
||||
if "already has" in error_msg:
|
||||
console.print("\n[dim]Use --force to regenerate existing embeddings[/dim]")
|
||||
elif "Semantic search not available" in error_msg:
|
||||
console.print("\n[dim]Install semantic dependencies:[/dim]")
|
||||
console.print(" [cyan]pip install codexlens[semantic][/cyan]")
|
||||
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
data = result["result"]
|
||||
elapsed = data["elapsed_time"]
|
||||
|
||||
console.print(f"[green]✓[/green] Embeddings generated successfully!")
|
||||
console.print(f" Model: {data['model_name']}")
|
||||
console.print(f" Chunks created: {data['chunks_created']:,}")
|
||||
console.print(f" Files processed: {data['files_processed']}")
|
||||
|
||||
if data["files_failed"] > 0:
|
||||
console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]")
|
||||
if data["failed_files"]:
|
||||
console.print(" [dim]First failures:[/dim]")
|
||||
for file_path, error in data["failed_files"]:
|
||||
console.print(f" [dim]{file_path}: {error}[/dim]")
|
||||
|
||||
console.print(f" Time: {elapsed:.1f}s")
|
||||
|
||||
console.print("\n[dim]Use vector search with:[/dim]")
|
||||
console.print(" [cyan]codexlens search 'your query' --mode pure-vector[/cyan]")
|
||||
|
||||
except Exception as exc:
|
||||
if json_mode:
|
||||
print_json(success=False, error=str(exc))
|
||||
else:
|
||||
console.print(f"[red]Embeddings-generate failed:[/red] {exc}")
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
New file: codex-lens/src/codexlens/cli/embedding_manager.py (331 lines)
@@ -0,0 +1,331 @@
"""Embedding Manager - Manage semantic embeddings for code indexes."""

import logging
import sqlite3
import time
from pathlib import Path
from typing import Dict, List, Optional

try:
    from codexlens.semantic import SEMANTIC_AVAILABLE
    if SEMANTIC_AVAILABLE:
        from codexlens.semantic.embedder import Embedder
        from codexlens.semantic.vector_store import VectorStore
        from codexlens.semantic.chunker import Chunker, ChunkConfig
except ImportError:
    SEMANTIC_AVAILABLE = False

logger = logging.getLogger(__name__)


def check_index_embeddings(index_path: Path) -> Dict[str, any]:
    """Check if an index has embeddings and return statistics.

    Args:
        index_path: Path to _index.db file

    Returns:
        Dictionary with embedding statistics and status
    """
    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }

    try:
        with sqlite3.connect(index_path) as conn:
            # Check if semantic_chunks table exists
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
            )
            table_exists = cursor.fetchone() is not None

            if not table_exists:
                # Count total indexed files even without embeddings
                cursor = conn.execute("SELECT COUNT(*) FROM files")
                total_files = cursor.fetchone()[0]

                return {
                    "success": True,
                    "result": {
                        "has_embeddings": False,
                        "total_chunks": 0,
                        "total_files": total_files,
                        "files_with_chunks": 0,
                        "files_without_chunks": total_files,
                        "coverage_percent": 0.0,
                        "missing_files_sample": [],
                        "index_path": str(index_path),
                    },
                }

            # Count total chunks
            cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
            total_chunks = cursor.fetchone()[0]

            # Count total indexed files
            cursor = conn.execute("SELECT COUNT(*) FROM files")
            total_files = cursor.fetchone()[0]

            # Count files with embeddings
            cursor = conn.execute(
                "SELECT COUNT(DISTINCT file_path) FROM semantic_chunks"
            )
            files_with_chunks = cursor.fetchone()[0]

            # Get a sample of files without embeddings
            cursor = conn.execute("""
                SELECT full_path
                FROM files
                WHERE full_path NOT IN (
                    SELECT DISTINCT file_path FROM semantic_chunks
                )
                LIMIT 5
            """)
            missing_files = [row[0] for row in cursor.fetchall()]

            return {
                "success": True,
                "result": {
                    "has_embeddings": total_chunks > 0,
                    "total_chunks": total_chunks,
                    "total_files": total_files,
                    "files_with_chunks": files_with_chunks,
                    "files_without_chunks": total_files - files_with_chunks,
                    "coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 1),
                    "missing_files_sample": missing_files,
                    "index_path": str(index_path),
                },
            }

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to check embeddings: {str(e)}",
        }
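A quick usage sketch for check_index_embeddings, following the return shape above (the index path below is hypothetical):

```python
from pathlib import Path

status = check_index_embeddings(Path.home() / ".codexlens" / "indexes" / "my-app" / "_index.db")
if status["success"]:
    r = status["result"]
    # coverage_percent = files_with_chunks / total_files * 100, rounded to one decimal
    print(f"{r['files_with_chunks']}/{r['total_files']} files embedded "
          f"({r['coverage_percent']}%), {r['total_chunks']} chunks total")
else:
    print(status["error"])
```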
def generate_embeddings(
    index_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    progress_callback: Optional[callable] = None,
) -> Dict[str, any]:
    """Generate embeddings for an index.

    Args:
        index_path: Path to _index.db file
        model_profile: Model profile (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters
        progress_callback: Optional callback for progress updates

    Returns:
        Result dictionary with generation statistics
    """
    if not SEMANTIC_AVAILABLE:
        return {
            "success": False,
            "error": "Semantic search not available. Install with: pip install codexlens[semantic]",
        }

    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }

    # Check existing chunks
    status = check_index_embeddings(index_path)
    if not status["success"]:
        return status

    existing_chunks = status["result"]["total_chunks"]

    if existing_chunks > 0 and not force:
        return {
            "success": False,
            "error": f"Index already has {existing_chunks} chunks. Use --force to regenerate.",
            "existing_chunks": existing_chunks,
        }

    if force and existing_chunks > 0:
        if progress_callback:
            progress_callback(f"Clearing {existing_chunks} existing chunks...")

        try:
            with sqlite3.connect(index_path) as conn:
                conn.execute("DELETE FROM semantic_chunks")
                conn.commit()
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to clear existing chunks: {str(e)}",
            }

    # Initialize components
    try:
        embedder = Embedder(profile=model_profile)
        vector_store = VectorStore(index_path)
        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))

        if progress_callback:
            progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to initialize components: {str(e)}",
        }

    # Read files from index
    try:
        with sqlite3.connect(index_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("SELECT full_path, content, language FROM files")
            files = cursor.fetchall()
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to read files: {str(e)}",
        }

    if len(files) == 0:
        return {
            "success": False,
            "error": "No files found in index",
        }

    if progress_callback:
        progress_callback(f"Processing {len(files)} files...")

    # Process each file
    total_chunks = 0
    failed_files = []
    start_time = time.time()

    for idx, file_row in enumerate(files, 1):
        file_path = file_row["full_path"]
        content = file_row["content"]
        language = file_row["language"] or "python"

        try:
            # Create chunks
            chunks = chunker.chunk_sliding_window(
                content,
                file_path=file_path,
                language=language
            )

            if not chunks:
                continue

            # Generate embeddings
            for chunk in chunks:
                embedding = embedder.embed_single(chunk.content)
                chunk.embedding = embedding

            # Store chunks
            vector_store.add_chunks(chunks, file_path)
            total_chunks += len(chunks)

            if progress_callback:
                progress_callback(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")

        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}")
            failed_files.append((file_path, str(e)))

    elapsed_time = time.time() - start_time

    return {
        "success": True,
        "result": {
            "chunks_created": total_chunks,
            "files_processed": len(files) - len(failed_files),
            "files_failed": len(failed_files),
            "elapsed_time": elapsed_time,
            "model_profile": model_profile,
            "model_name": embedder.model_name,
            "failed_files": failed_files[:5],  # First 5 failures
            "index_path": str(index_path),
        },
    }
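The loop above calls embed_single once per chunk; ONNX-backed models like fastembed's are usually faster when fed a batch per file. A hedged sketch of what a batched variant of the per-file step could look like — it assumes a hypothetical embed_batch(texts) method that returns one vector per input, which is not part of this commit:

```python
def embed_file_chunks_batched(embedder, vector_store, chunker, file_path, content, language):
    """Hypothetical batched variant: one model call per file instead of one per chunk."""
    chunks = chunker.chunk_sliding_window(content, file_path=file_path, language=language)
    if not chunks:
        return 0
    # Assumption: embed_batch(texts) -> list of vectors, aligned with its input order.
    vectors = embedder.embed_batch([chunk.content for chunk in chunks])
    for chunk, vector in zip(chunks, vectors):
        chunk.embedding = vector
    vector_store.add_chunks(chunks, file_path)
    return len(chunks)
```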
def find_all_indexes(scan_dir: Path) -> List[Path]:
    """Find all _index.db files in directory tree.

    Args:
        scan_dir: Directory to scan

    Returns:
        List of paths to _index.db files
    """
    if not scan_dir.exists():
        return []

    return list(scan_dir.rglob("_index.db"))


def get_embedding_stats_summary(index_root: Path) -> Dict[str, any]:
    """Get summary statistics for all indexes in root directory.

    Args:
        index_root: Root directory containing indexes

    Returns:
        Summary statistics for all indexes
    """
    indexes = find_all_indexes(index_root)

    if not indexes:
        return {
            "success": True,
            "result": {
                "total_indexes": 0,
                "indexes_with_embeddings": 0,
                "total_chunks": 0,
                "indexes": [],
            },
        }

    total_chunks = 0
    indexes_with_embeddings = 0
    index_stats = []

    for index_path in indexes:
        status = check_index_embeddings(index_path)

        if status["success"]:
            result = status["result"]
            has_emb = result["has_embeddings"]
            chunks = result["total_chunks"]

            if has_emb:
                indexes_with_embeddings += 1
                total_chunks += chunks

            # Extract project name from path
            project_name = index_path.parent.name

            index_stats.append({
                "project": project_name,
                "path": str(index_path),
                "has_embeddings": has_emb,
                "total_chunks": chunks,
                "total_files": result["total_files"],
                "coverage_percent": result.get("coverage_percent", 0),
            })

    return {
        "success": True,
        "result": {
            "total_indexes": len(indexes),
            "indexes_with_embeddings": indexes_with_embeddings,
            "total_chunks": total_chunks,
            "indexes": index_stats,
        },
    }
New file: codex-lens/src/codexlens/cli/model_manager.py (289 lines)
@@ -0,0 +1,289 @@
"""Model Manager - Manage fastembed models for semantic search."""

import json
import os
import shutil
from pathlib import Path
from typing import Dict, List, Optional

try:
    from fastembed import TextEmbedding
    FASTEMBED_AVAILABLE = True
except ImportError:
    FASTEMBED_AVAILABLE = False


# Model profiles with metadata
MODEL_PROFILES = {
    "fast": {
        "model_name": "BAAI/bge-small-en-v1.5",
        "dimensions": 384,
        "size_mb": 80,
        "description": "Fast, lightweight, English-optimized",
        "use_case": "Quick prototyping, resource-constrained environments",
    },
    "code": {
        "model_name": "jinaai/jina-embeddings-v2-base-code",
        "dimensions": 768,
        "size_mb": 150,
        "description": "Code-optimized, best for programming languages",
        "use_case": "Open source projects, code semantic search",
    },
    "multilingual": {
        "model_name": "intfloat/multilingual-e5-large",
        "dimensions": 1024,
        "size_mb": 1000,
        "description": "Multilingual + code support",
        "use_case": "Enterprise multilingual projects",
    },
    "balanced": {
        "model_name": "mixedbread-ai/mxbai-embed-large-v1",
        "dimensions": 1024,
        "size_mb": 600,
        "description": "High accuracy, general purpose",
        "use_case": "High-quality semantic search, balanced performance",
    },
}
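MODEL_PROFILES duplicates the model-name/dimension data that Embedder keeps in its MODELS and MODEL_DIMS tables (see the embedder.py hunk later in this commit). A small sketch of a test asserting the two stay in sync, assuming both modules are importable:

```python
from codexlens.cli.model_manager import MODEL_PROFILES
from codexlens.semantic.embedder import Embedder

def test_profiles_match_embedder_tables():
    # Every CLI profile should resolve to the same model and dimensions
    # that the Embedder class advertises.
    for profile, info in MODEL_PROFILES.items():
        assert Embedder.MODELS[profile] == info["model_name"]
        assert Embedder.MODEL_DIMS[info["model_name"]] == info["dimensions"]
```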

def get_cache_dir() -> Path:
    """Get fastembed cache directory.

    Returns:
        Path to cache directory (usually ~/.cache/fastembed or %LOCALAPPDATA%\\Temp\\fastembed_cache)
    """
    # Check HF_HOME environment variable first
    if "HF_HOME" in os.environ:
        return Path(os.environ["HF_HOME"])

    # Default cache locations
    if os.name == "nt":  # Windows
        cache_dir = Path(os.environ.get("LOCALAPPDATA", Path.home() / "AppData" / "Local")) / "Temp" / "fastembed_cache"
    else:  # Unix-like
        cache_dir = Path.home() / ".cache" / "fastembed"

    return cache_dir
def list_models() -> Dict[str, any]:
    """List available model profiles and their installation status.

    Returns:
        Dictionary with model profiles, installed status, and cache info
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }

    cache_dir = get_cache_dir()
    cache_exists = cache_dir.exists()

    models = []
    for profile, info in MODEL_PROFILES.items():
        model_name = info["model_name"]

        # Check if model is cached
        installed = False
        cache_size_mb = 0

        if cache_exists:
            # Check for model directory in cache
            model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
            if model_cache_path.exists():
                installed = True
                # Calculate cache size
                total_size = sum(
                    f.stat().st_size
                    for f in model_cache_path.rglob("*")
                    if f.is_file()
                )
                cache_size_mb = round(total_size / (1024 * 1024), 1)

        models.append({
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb if installed else None,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
        })

    return {
        "success": True,
        "result": {
            "models": models,
            "cache_dir": str(cache_dir),
            "cache_exists": cache_exists,
        },
    }
def download_model(profile: str, progress_callback: Optional[callable] = None) -> Dict[str, any]:
    """Download a model by profile name.

    Args:
        profile: Model profile name (fast, code, multilingual, balanced)
        progress_callback: Optional callback function to report progress

    Returns:
        Result dictionary with success status
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }

    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }

    model_name = MODEL_PROFILES[profile]["model_name"]

    try:
        # Download model by instantiating TextEmbedding
        # This will automatically download to cache if not present
        if progress_callback:
            progress_callback(f"Downloading {model_name}...")

        embedder = TextEmbedding(model_name=model_name)

        if progress_callback:
            progress_callback(f"Model {model_name} downloaded successfully")

        # Get cache info
        cache_dir = get_cache_dir()
        model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"

        cache_size = 0
        if model_cache_path.exists():
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob("*")
                if f.is_file()
            )
            cache_size = round(total_size / (1024 * 1024), 1)

        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "cache_size_mb": cache_size,
                "cache_path": str(model_cache_path),
            },
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to download model: {str(e)}",
        }
def delete_model(profile: str) -> Dict[str, any]:
    """Delete a downloaded model from cache.

    Args:
        profile: Model profile name to delete

    Returns:
        Result dictionary with success status
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }

    model_name = MODEL_PROFILES[profile]["model_name"]
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"

    if not model_cache_path.exists():
        return {
            "success": False,
            "error": f"Model {profile} ({model_name}) is not installed",
        }

    try:
        # Calculate size before deletion
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        size_mb = round(total_size / (1024 * 1024), 1)

        # Delete model directory
        shutil.rmtree(model_cache_path)

        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "deleted_size_mb": size_mb,
                "cache_path": str(model_cache_path),
            },
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to delete model: {str(e)}",
        }
def get_model_info(profile: str) -> Dict[str, any]:
    """Get detailed information about a model profile.

    Args:
        profile: Model profile name

    Returns:
        Result dictionary with model information
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }

    info = MODEL_PROFILES[profile]
    model_name = info["model_name"]

    # Check installation status
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
    installed = model_cache_path.exists()

    cache_size_mb = None
    if installed:
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        cache_size_mb = round(total_size / (1024 * 1024), 1)

    return {
        "success": True,
        "result": {
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
            "cache_path": str(model_cache_path) if installed else None,
        },
    }
@@ -3,6 +3,7 @@
from __future__ import annotations

import json
import sys
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any, Iterable, Mapping, Sequence
@@ -13,7 +14,9 @@ from rich.text import Text

from codexlens.entities import SearchResult, Symbol

console = Console()
# Force UTF-8 encoding for Windows console to properly display Chinese text
# Use force_terminal=True and legacy_windows=False to avoid GBK encoding issues
console = Console(force_terminal=True, legacy_windows=False)


def _to_jsonable(value: Any) -> Any:
@@ -13,6 +13,7 @@ class Symbol(BaseModel):
    name: str = Field(..., min_length=1)
    kind: str = Field(..., min_length=1)
    range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive")
    file: Optional[str] = Field(default=None, description="Full path to the file containing this symbol")
    token_count: Optional[int] = Field(default=None, description="Token count for symbol content")
    symbol_type: Optional[str] = Field(default=None, description="Extended symbol type for filtering")
@@ -35,6 +35,8 @@ class SearchOptions:
        include_semantic: Whether to include semantic keyword search results
        hybrid_mode: Enable hybrid search with RRF fusion (default False)
        enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
        enable_vector: Enable vector semantic search (default False)
        pure_vector: If True, only use vector search without FTS fallback (default False)
        hybrid_weights: Custom RRF weights for hybrid search (optional)
    """
    depth: int = -1
@@ -46,6 +48,8 @@ class SearchOptions:
    include_semantic: bool = False
    hybrid_mode: bool = False
    enable_fuzzy: bool = True
    enable_vector: bool = False
    pure_vector: bool = False
    hybrid_weights: Optional[Dict[str, float]] = None
@@ -494,6 +498,8 @@ class ChainSearchEngine:
                    options.include_semantic,
                    options.hybrid_mode,
                    options.enable_fuzzy,
                    options.enable_vector,
                    options.pure_vector,
                    options.hybrid_weights
                ): idx_path
                for idx_path in index_paths
@@ -520,6 +526,8 @@ class ChainSearchEngine:
                      include_semantic: bool = False,
                      hybrid_mode: bool = False,
                      enable_fuzzy: bool = True,
                      enable_vector: bool = False,
                      pure_vector: bool = False,
                      hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
        """Search a single index database.

@@ -527,12 +535,14 @@ class ChainSearchEngine:

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            query: FTS5 query string (for FTS) or natural language query (for vector)
            limit: Maximum results from this index
            files_only: If True, skip snippet generation for faster search
            include_semantic: If True, also search semantic keywords and merge results
            hybrid_mode: If True, use hybrid search with RRF fusion
            enable_fuzzy: Enable fuzzy FTS in hybrid mode
            enable_vector: Enable vector semantic search
            pure_vector: If True, only use vector search without FTS fallback
            hybrid_weights: Custom RRF weights for hybrid search

        Returns:
@@ -547,10 +557,11 @@ class ChainSearchEngine:
                query,
                limit=limit,
                enable_fuzzy=enable_fuzzy,
                enable_vector=False,  # Vector search not yet implemented
                enable_vector=enable_vector,
                pure_vector=pure_vector,
            )
        else:
            # Legacy single-FTS search
            # Single-FTS search (exact or fuzzy mode)
            with DirIndexStore(index_path) as store:
                # Get FTS results
                if files_only:
@@ -558,7 +569,11 @@ class ChainSearchEngine:
                    paths = store.search_files_only(query, limit=limit)
                    fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
                else:
                    fts_results = store.search_fts(query, limit=limit)
                    # Use fuzzy FTS if enable_fuzzy=True (mode="fuzzy"), otherwise exact FTS
                    if enable_fuzzy:
                        fts_results = store.search_fts_fuzzy(query, limit=limit)
                    else:
                        fts_results = store.search_fts(query, limit=limit)

                # Optionally add semantic keyword results
                if include_semantic:
@@ -50,35 +50,68 @@ class HybridSearchEngine:
        limit: int = 20,
        enable_fuzzy: bool = True,
        enable_vector: bool = False,
        pure_vector: bool = False,
    ) -> List[SearchResult]:
        """Execute hybrid search with parallel retrieval and RRF fusion.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            query: FTS5 query string (for FTS) or natural language query (for vector)
            limit: Maximum results to return after fusion
            enable_fuzzy: Enable fuzzy FTS search (default True)
            enable_vector: Enable vector search (default False)
            pure_vector: If True, only use vector search without FTS fallback (default False)

        Returns:
            List of SearchResult objects sorted by fusion score

        Examples:
            >>> engine = HybridSearchEngine()
            >>> results = engine.search(Path("project/_index.db"), "authentication")
            >>> # Hybrid search (exact + fuzzy + vector)
            >>> results = engine.search(Path("project/_index.db"), "authentication",
            ...                         enable_vector=True)
            >>> # Pure vector search (semantic only)
            >>> results = engine.search(Path("project/_index.db"),
            ...                         "how to authenticate users",
            ...                         enable_vector=True, pure_vector=True)
            >>> for r in results[:5]:
            ...     print(f"{r.path}: {r.score:.3f}")
        """
        # Determine which backends to use
        backends = {"exact": True}  # Always use exact search
        if enable_fuzzy:
            backends["fuzzy"] = True
        if enable_vector:
            backends["vector"] = True
        backends = {}

        if pure_vector:
            # Pure vector mode: only use vector search, no FTS fallback
            if enable_vector:
                backends["vector"] = True
            else:
                # Invalid configuration: pure_vector=True but enable_vector=False
                self.logger.warning(
                    "pure_vector=True requires enable_vector=True. "
                    "Falling back to exact search. "
                    "To use pure vector search, enable vector search mode."
                )
                backends["exact"] = True
        else:
            # Hybrid mode: always include exact search as baseline
            backends["exact"] = True
            if enable_fuzzy:
                backends["fuzzy"] = True
            if enable_vector:
                backends["vector"] = True

        # Execute parallel searches
        results_map = self._search_parallel(index_path, query, backends, limit)

        # Provide helpful message if pure-vector mode returns no results
        if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
            self.logger.warning(
                "Pure vector search returned no results. "
                "This usually means embeddings haven't been generated. "
                "Run: codexlens embeddings-generate %s",
                index_path.parent if index_path.name == "_index.db" else index_path
            )

        # Apply RRF fusion
        # Filter weights to only active backends
        active_weights = {
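The fusion step referenced above combines the per-backend rankings with Reciprocal Rank Fusion. The engine's actual fusion code lies outside this hunk, so the sketch below is a generic illustration of weighted RRF with the default weights (exact=0.4, fuzzy=0.3, vector=0.3) and the conventional k=60 constant, which this implementation may or may not use:

```python
from collections import defaultdict, namedtuple

Hit = namedtuple("Hit", "path")

def rrf_fuse(results_map, weights, k=60):
    """Weighted Reciprocal Rank Fusion: score(d) = sum_b w_b / (k + rank_b(d))."""
    scores = defaultdict(float)
    for backend, hits in results_map.items():
        w = weights.get(backend, 0.0)
        for rank, hit in enumerate(hits, start=1):
            scores[hit.path] += w / (k + rank)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

# auth.py is ranked 1st by exact and 2nd by vector, so it wins the fusion.
fused = rrf_fuse(
    {"exact": [Hit("auth.py"), Hit("db.py")], "vector": [Hit("login.py"), Hit("auth.py")]},
    weights={"exact": 0.4, "fuzzy": 0.3, "vector": 0.3},
)
print(fused[0][0])  # auth.py
```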
@@ -195,17 +228,67 @@ class HybridSearchEngine:
    def _search_vector(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
-        """Execute vector search (placeholder for future implementation).
+        """Execute vector similarity search using semantic embeddings.

        Args:
            index_path: Path to _index.db file
-            query: Query string
+            query: Natural language query string
            limit: Maximum results

        Returns:
-            List of SearchResult objects (empty for now)
+            List of SearchResult objects ordered by semantic similarity
        """
-        # Placeholder for vector search integration
-        # Will be implemented when VectorStore is available
-        self.logger.debug("Vector search not yet implemented")
-        return []
+        try:
+            # Check if semantic chunks table exists
+            import sqlite3
+            conn = sqlite3.connect(index_path)
+            cursor = conn.execute(
+                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
+            )
+            has_semantic_table = cursor.fetchone() is not None
+            conn.close()
+
+            if not has_semantic_table:
+                self.logger.info(
+                    "No embeddings found in index. "
+                    "Generate embeddings with: codexlens embeddings-generate %s",
+                    index_path.parent if index_path.name == "_index.db" else index_path
+                )
+                return []
+
+            # Initialize embedder and vector store
+            from codexlens.semantic.embedder import Embedder
+            from codexlens.semantic.vector_store import VectorStore
+
+            embedder = Embedder(profile="code")  # Use code-optimized model
+            vector_store = VectorStore(index_path)
+
+            # Check if vector store has data
+            if vector_store.count_chunks() == 0:
+                self.logger.info(
+                    "Vector store is empty (0 chunks). "
+                    "Generate embeddings with: codexlens embeddings-generate %s",
+                    index_path.parent if index_path.name == "_index.db" else index_path
+                )
+                return []
+
+            # Generate query embedding
+            query_embedding = embedder.embed_single(query)
+
+            # Search for similar chunks
+            results = vector_store.search_similar(
+                query_embedding=query_embedding,
+                top_k=limit,
+                min_score=0.0,  # Return all results, let RRF handle filtering
+                return_full_content=True,
+            )
+
+            self.logger.debug("Vector search found %d results", len(results))
+            return results
+
+        except ImportError as exc:
+            self.logger.debug("Semantic dependencies not available: %s", exc)
+            return []
+        except Exception as exc:
+            self.logger.error("Vector search error: %s", exc)
+            return []
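
The min_score=0.0 call above keeps every chunk and defers filtering to the fusion stage. VectorStore.search_similar presumably ranks by cosine similarity between the query embedding and the stored chunk embeddings; a sketch of that scoring under the assumption of dense numpy vectors (not the actual VectorStore internals):

import numpy as np

def cosine_top_k(query_vec: np.ndarray, chunk_vecs: np.ndarray, top_k: int) -> list[tuple[int, float]]:
    """Return (chunk_index, similarity) pairs for the top_k most similar chunks."""
    q = query_vec / np.linalg.norm(query_vec)
    c = chunk_vecs / np.linalg.norm(chunk_vecs, axis=1, keepdims=True)
    sims = c @ q                      # normalized dot product == cosine similarity
    order = np.argsort(-sims)[:top_k]
    return [(int(i), float(sims[i])) for i in order]
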
@@ -8,21 +8,64 @@ from . import SEMANTIC_AVAILABLE


class Embedder:
-    """Generate embeddings for code chunks using fastembed (ONNX-based)."""
+    """Generate embeddings for code chunks using fastembed (ONNX-based).

-    MODEL_NAME = "BAAI/bge-small-en-v1.5"
-    EMBEDDING_DIM = 384
+    Supported Model Profiles:
+    - fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized
+    - code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages
+    - multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support
+    - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose
+    """

-    def __init__(self, model_name: str | None = None) -> None:
+    # Model profiles for different use cases
+    MODELS = {
+        "fast": "BAAI/bge-small-en-v1.5",  # 384 dim - Fast, lightweight
+        "code": "jinaai/jina-embeddings-v2-base-code",  # 768 dim - Code-optimized
+        "multilingual": "intfloat/multilingual-e5-large",  # 1024 dim - Multilingual
+        "balanced": "mixedbread-ai/mxbai-embed-large-v1",  # 1024 dim - High accuracy
+    }
+
+    # Dimension mapping for each model
+    MODEL_DIMS = {
+        "BAAI/bge-small-en-v1.5": 384,
+        "jinaai/jina-embeddings-v2-base-code": 768,
+        "intfloat/multilingual-e5-large": 1024,
+        "mixedbread-ai/mxbai-embed-large-v1": 1024,
+    }
+
+    # Default model (fast profile)
+    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
+    DEFAULT_PROFILE = "fast"
+
+    def __init__(self, model_name: str | None = None, profile: str | None = None) -> None:
+        """Initialize embedder with model or profile.
+
+        Args:
+            model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code")
+            profile: Model profile shortcut ("fast", "code", "multilingual", "balanced")
+                If both provided, model_name takes precedence.
+        """
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

-        self.model_name = model_name or self.MODEL_NAME
+        # Resolve model name from profile or use explicit name
+        if model_name:
+            self.model_name = model_name
+        elif profile and profile in self.MODELS:
+            self.model_name = self.MODELS[profile]
+        else:
+            self.model_name = self.DEFAULT_MODEL

        self._model = None

+    @property
+    def embedding_dim(self) -> int:
+        """Get embedding dimension for current model."""
+        return self.MODEL_DIMS.get(self.model_name, 768)  # Default to 768 if unknown
+
    def _load_model(self) -> None:
        """Lazy load the embedding model."""
        if self._model is not None:

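
Usage implied by the new profile parameter (a sketch, assuming the semantic extra is installed; dimensions follow the MODEL_DIMS table above):

from codexlens.semantic.embedder import Embedder

embedder = Embedder()                        # default "fast" profile, 384 dims
code_embedder = Embedder(profile="code")     # jinaai/jina-embeddings-v2-base-code, 768 dims
pinned = Embedder(model_name="intfloat/multilingual-e5-large")  # explicit name wins over profile

print(code_embedder.model_name, code_embedder.embedding_dim)
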
@@ -27,7 +27,6 @@ class SubdirLink:
    name: str
    index_path: Path
    files_count: int
-    direct_files: int
    last_updated: float

@@ -57,7 +56,7 @@ class DirIndexStore:

    # Schema version for migration tracking
    # Increment this when schema changes require migration
-    SCHEMA_VERSION = 4
+    SCHEMA_VERSION = 5

    def __init__(self, db_path: str | Path) -> None:
        """Initialize directory index store.
@@ -133,6 +132,11 @@ class DirIndexStore:
            from codexlens.storage.migrations.migration_004_dual_fts import upgrade
            upgrade(conn)

+        # Migration v4 -> v5: Remove unused/redundant fields
+        if from_version < 5:
+            from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
+            upgrade(conn)
+
    def close(self) -> None:
        """Close database connection."""
        with self._lock:
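
Because each gate tests from_version against the migration it guards, upgrades compose: a v3 database receives migrations 004 and 005 in order within a single call. Condensed, the chain behaves like this sketch (the method name is illustrative):

def _run_migrations(conn, from_version: int) -> None:
    # Gates fire cumulatively, so older databases receive every later migration in order.
    if from_version < 4:
        from codexlens.storage.migrations.migration_004_dual_fts import upgrade
        upgrade(conn)
    if from_version < 5:
        from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
        upgrade(conn)
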
@@ -208,19 +212,17 @@ class DirIndexStore:
            # Replace symbols
            conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
            if symbols:
-                # Extract token_count and symbol_type from symbol metadata if available
+                # Insert symbols without token_count and symbol_type
                symbol_rows = []
                for s in symbols:
-                    token_count = getattr(s, 'token_count', None)
-                    symbol_type = getattr(s, 'symbol_type', None) or s.kind
                    symbol_rows.append(
-                        (file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
+                        (file_id, s.name, s.kind, s.range[0], s.range[1])
                    )

                conn.executemany(
                    """
-                    INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
-                    VALUES(?, ?, ?, ?, ?, ?, ?)
+                    INSERT INTO symbols(file_id, name, kind, start_line, end_line)
+                    VALUES(?, ?, ?, ?, ?)
                    """,
                    symbol_rows,
                )
@@ -374,19 +376,17 @@ class DirIndexStore:

            conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
            if symbols:
-                # Extract token_count and symbol_type from symbol metadata if available
+                # Insert symbols without token_count and symbol_type
                symbol_rows = []
                for s in symbols:
-                    token_count = getattr(s, 'token_count', None)
-                    symbol_type = getattr(s, 'symbol_type', None) or s.kind
                    symbol_rows.append(
-                        (file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
+                        (file_id, s.name, s.kind, s.range[0], s.range[1])
                    )

                conn.executemany(
                    """
-                    INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
-                    VALUES(?, ?, ?, ?, ?, ?, ?)
+                    INSERT INTO symbols(file_id, name, kind, start_line, end_line)
+                    VALUES(?, ?, ?, ?, ?)
                    """,
                    symbol_rows,
                )
@@ -644,25 +644,22 @@ class DirIndexStore:
        with self._lock:
            conn = self._get_connection()

-            import json
            import time

-            keywords_json = json.dumps(keywords)
            generated_at = time.time()

-            # Write to semantic_metadata table (for backward compatibility)
+            # Write to semantic_metadata table (without keywords column)
            conn.execute(
                """
-                INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
-                VALUES(?, ?, ?, ?, ?, ?)
+                INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
+                VALUES(?, ?, ?, ?, ?)
                ON CONFLICT(file_id) DO UPDATE SET
                    summary=excluded.summary,
-                    keywords=excluded.keywords,
                    purpose=excluded.purpose,
                    llm_tool=excluded.llm_tool,
                    generated_at=excluded.generated_at
                """,
-                (file_id, summary, keywords_json, purpose, llm_tool, generated_at),
+                (file_id, summary, purpose, llm_tool, generated_at),
            )

            # Write to normalized keywords tables for optimized search
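
For context, the normalized layout that replaces the JSON keywords column looks roughly like this; the shape is inferred from the join queries elsewhere in this diff (the authoritative DDL lives in migration 001):

conn.executescript("""
    CREATE TABLE IF NOT EXISTS keywords (
        id      INTEGER PRIMARY KEY,
        keyword TEXT NOT NULL UNIQUE
    );
    CREATE TABLE IF NOT EXISTS file_keywords (
        file_id    INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
        keyword_id INTEGER NOT NULL REFERENCES keywords(id) ON DELETE CASCADE
    );
""")
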
@@ -709,9 +706,10 @@ class DirIndexStore:
        with self._lock:
            conn = self._get_connection()

+            # Get semantic metadata (without keywords column)
            row = conn.execute(
                """
-                SELECT summary, keywords, purpose, llm_tool, generated_at
+                SELECT summary, purpose, llm_tool, generated_at
                FROM semantic_metadata WHERE file_id=?
                """,
                (file_id,),
@@ -720,11 +718,23 @@ class DirIndexStore:
            if not row:
                return None

-            import json
+            # Get keywords from normalized file_keywords table
+            keyword_rows = conn.execute(
+                """
+                SELECT k.keyword
+                FROM file_keywords fk
+                JOIN keywords k ON fk.keyword_id = k.id
+                WHERE fk.file_id = ?
+                ORDER BY k.keyword
+                """,
+                (file_id,),
+            ).fetchall()
+
+            keywords = [kw["keyword"] for kw in keyword_rows]

            return {
                "summary": row["summary"],
-                "keywords": json.loads(row["keywords"]) if row["keywords"] else [],
+                "keywords": keywords,
                "purpose": row["purpose"],
                "llm_tool": row["llm_tool"],
                "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -856,15 +866,14 @@ class DirIndexStore:
        Returns:
            Tuple of (list of metadata dicts, total count)
        """
-        import json

        with self._lock:
            conn = self._get_connection()

+            # Query semantic metadata without keywords column
            base_query = """
                SELECT f.id as file_id, f.name as file_name, f.full_path,
                       f.language, f.line_count,
-                       sm.summary, sm.keywords, sm.purpose,
+                       sm.summary, sm.purpose,
                       sm.llm_tool, sm.generated_at
                FROM files f
                JOIN semantic_metadata sm ON f.id = sm.file_id
@@ -892,14 +901,30 @@ class DirIndexStore:

            results = []
            for row in rows:
+                file_id = int(row["file_id"])
+
+                # Get keywords from normalized file_keywords table
+                keyword_rows = conn.execute(
+                    """
+                    SELECT k.keyword
+                    FROM file_keywords fk
+                    JOIN keywords k ON fk.keyword_id = k.id
+                    WHERE fk.file_id = ?
+                    ORDER BY k.keyword
+                    """,
+                    (file_id,),
+                ).fetchall()
+
+                keywords = [kw["keyword"] for kw in keyword_rows]
+
                results.append({
-                    "file_id": int(row["file_id"]),
+                    "file_id": file_id,
                    "file_name": row["file_name"],
                    "full_path": row["full_path"],
                    "language": row["language"],
                    "line_count": int(row["line_count"]) if row["line_count"] else 0,
                    "summary": row["summary"],
-                    "keywords": json.loads(row["keywords"]) if row["keywords"] else [],
+                    "keywords": keywords,
                    "purpose": row["purpose"],
                    "llm_tool": row["llm_tool"],
                    "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
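
Note that the loop above issues one keyword query per result row (an N+1 pattern). Should that ever matter for large listings, an equivalent single-query formulation with SQLite's GROUP_CONCAT is possible; a sketch, not part of this change:

rows = conn.execute(
    """
    SELECT fk.file_id, GROUP_CONCAT(k.keyword, ',') AS kw_csv
    FROM file_keywords fk
    JOIN keywords k ON fk.keyword_id = k.id
    GROUP BY fk.file_id
    """
).fetchall()
keywords_by_file = {r["file_id"]: r["kw_csv"].split(",") if r["kw_csv"] else [] for r in rows}
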
@@ -922,7 +947,7 @@ class DirIndexStore:
            name: Subdirectory name
            index_path: Path to subdirectory's _index.db
            files_count: Total files recursively
-            direct_files: Files directly in subdirectory
+            direct_files: Deprecated parameter (no longer used)
        """
        with self._lock:
            conn = self._get_connection()
@@ -931,17 +956,17 @@ class DirIndexStore:
            import time
            last_updated = time.time()

+            # Note: direct_files parameter is deprecated but kept for backward compatibility
            conn.execute(
                """
-                INSERT INTO subdirs(name, index_path, files_count, direct_files, last_updated)
-                VALUES(?, ?, ?, ?, ?)
+                INSERT INTO subdirs(name, index_path, files_count, last_updated)
+                VALUES(?, ?, ?, ?)
                ON CONFLICT(name) DO UPDATE SET
                    index_path=excluded.index_path,
                    files_count=excluded.files_count,
-                    direct_files=excluded.direct_files,
                    last_updated=excluded.last_updated
                """,
-                (name, index_path_str, files_count, direct_files, last_updated),
+                (name, index_path_str, files_count, last_updated),
            )
            conn.commit()

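
The INSERT ... ON CONFLICT(name) DO UPDATE form is SQLite's native upsert: excluded refers to the row that failed to insert, so a repeated call with the same name updates in place. A standalone demonstration (requires SQLite 3.24+):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE subdirs (name TEXT PRIMARY KEY, files_count INTEGER)")
for count in (3, 7):  # the second insert conflicts on name and becomes an update
    conn.execute(
        "INSERT INTO subdirs(name, files_count) VALUES(?, ?) "
        "ON CONFLICT(name) DO UPDATE SET files_count=excluded.files_count",
        ("src", count),
    )
print(conn.execute("SELECT files_count FROM subdirs WHERE name='src'").fetchone())  # (7,)
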
@@ -974,7 +999,7 @@ class DirIndexStore:
            conn = self._get_connection()
            rows = conn.execute(
                """
-                SELECT id, name, index_path, files_count, direct_files, last_updated
+                SELECT id, name, index_path, files_count, last_updated
                FROM subdirs
                ORDER BY name
                """
@@ -986,7 +1011,6 @@ class DirIndexStore:
                    name=row["name"],
                    index_path=Path(row["index_path"]),
                    files_count=int(row["files_count"]) if row["files_count"] else 0,
-                    direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
                    last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
                )
                for row in rows
@@ -1005,7 +1029,7 @@ class DirIndexStore:
            conn = self._get_connection()
            row = conn.execute(
                """
-                SELECT id, name, index_path, files_count, direct_files, last_updated
+                SELECT id, name, index_path, files_count, last_updated
                FROM subdirs WHERE name=?
                """,
                (name,),
@@ -1019,7 +1043,6 @@ class DirIndexStore:
                name=row["name"],
                index_path=Path(row["index_path"]),
                files_count=int(row["files_count"]) if row["files_count"] else 0,
-                direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
                last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
            )

@@ -1031,41 +1054,71 @@ class DirIndexStore:
        Args:
            name: Subdirectory name
            files_count: Total files recursively
-            direct_files: Files directly in subdirectory (optional)
+            direct_files: Deprecated parameter (no longer used)
        """
        with self._lock:
            conn = self._get_connection()
            import time
            last_updated = time.time()

-            if direct_files is not None:
-                conn.execute(
-                    """
-                    UPDATE subdirs
-                    SET files_count=?, direct_files=?, last_updated=?
-                    WHERE name=?
-                    """,
-                    (files_count, direct_files, last_updated, name),
-                )
-            else:
-                conn.execute(
-                    """
-                    UPDATE subdirs
-                    SET files_count=?, last_updated=?
-                    WHERE name=?
-                    """,
-                    (files_count, last_updated, name),
-                )
+            # Note: direct_files parameter is deprecated but kept for backward compatibility
+            conn.execute(
+                """
+                UPDATE subdirs
+                SET files_count=?, last_updated=?
+                WHERE name=?
+                """,
+                (files_count, last_updated, name),
+            )
            conn.commit()

    # === Search ===

-    def search_fts(self, query: str, limit: int = 20) -> List[SearchResult]:
+    @staticmethod
+    def _enhance_fts_query(query: str) -> str:
+        """Enhance FTS5 query to support prefix matching for simple queries.
+
+        For simple single-word or multi-word queries without FTS5 operators,
+        automatically adds prefix wildcard (*) to enable partial matching.
+
+        Examples:
+            "loadPack" -> "loadPack*"
+            "load package" -> "load* package*"
+            "load*" -> "load*" (already has wildcard, unchanged)
+            "NOT test" -> "NOT test" (has FTS operator, unchanged)
+
+        Args:
+            query: Original FTS5 query string
+
+        Returns:
+            Enhanced query string with prefix wildcards for simple queries
+        """
+        # Don't modify if query already contains FTS5 operators or wildcards
+        # (pad with spaces so a leading or trailing operator is also caught)
+        padded = f" {query.upper()} "
+        if any(op in padded for op in (" AND ", " OR ", " NOT ", " NEAR ")) or "*" in query or '"' in query:
+            return query
+
+        # For simple queries, add prefix wildcard to each word
+        words = query.split()
+        enhanced_words = [f"{word}*" if not word.endswith('*') else word for word in words]
+        return ' '.join(enhanced_words)
+
+    def search_fts(self, query: str, limit: int = 20, enhance_query: bool = False) -> List[SearchResult]:
        """Full-text search in current directory files.

+        Uses files_fts_exact (unicode61 tokenizer) for exact token matching.
+        For fuzzy/substring search, use search_fts_fuzzy() instead.
+
+        Best Practice (from industry analysis of Codanna/Code-Index-MCP):
+        - Default: Respects exact user input without modification
+        - Users can manually add wildcards (e.g., "loadPack*") for prefix matching
+        - Automatic enhancement (enhance_query=True) is NOT recommended as it can
+          violate user intent and bring unwanted noise in results
+
        Args:
            query: FTS5 query string
            limit: Maximum results to return
+            enhance_query: If True, automatically add prefix wildcards for simple queries.
+                Default False to respect exact user input.

        Returns:
            List of SearchResult objects sorted by relevance
@@ -1073,19 +1126,23 @@ class DirIndexStore:
        Raises:
            StorageError: If FTS search fails
        """
+        # Only enhance query if explicitly requested (not default behavior)
+        # Best practice: Let users control wildcards manually
+        final_query = self._enhance_fts_query(query) if enhance_query else query
+
        with self._lock:
            conn = self._get_connection()
            try:
                rows = conn.execute(
                    """
-                    SELECT rowid, full_path, bm25(files_fts) AS rank,
-                           snippet(files_fts, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
-                    FROM files_fts
-                    WHERE files_fts MATCH ?
+                    SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
+                           snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
+                    FROM files_fts_exact
+                    WHERE files_fts_exact MATCH ?
                    ORDER BY rank
                    LIMIT ?
                    """,
-                    (query, limit),
+                    (final_query, limit),
                ).fetchall()
            except sqlite3.DatabaseError as exc:
                raise StorageError(f"FTS search failed: {exc}") from exc
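
Behavior of the opt-in enhancement, matching the _enhance_fts_query docstring above (a sketch; the index path is illustrative):

store = DirIndexStore("path/to/_index.db")

assert DirIndexStore._enhance_fts_query("loadPack") == "loadPack*"
assert DirIndexStore._enhance_fts_query("load package") == "load* package*"
assert DirIndexStore._enhance_fts_query("NOT test") == "NOT test"  # FTS operator: left unchanged

exact_hits = store.search_fts("loadPack")                       # exact token match only
prefix_hits = store.search_fts("loadPack", enhance_query=True)  # also matches loadPackage, ...
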
@@ -1249,10 +1306,11 @@ class DirIndexStore:
            if kind:
                rows = conn.execute(
                    """
-                    SELECT name, kind, start_line, end_line
-                    FROM symbols
-                    WHERE name LIKE ? AND kind=?
-                    ORDER BY name
+                    SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
+                    FROM symbols s
+                    JOIN files f ON s.file_id = f.id
+                    WHERE s.name LIKE ? AND s.kind=?
+                    ORDER BY s.name
                    LIMIT ?
                    """,
                    (pattern, kind, limit),
@@ -1260,10 +1318,11 @@ class DirIndexStore:
            else:
                rows = conn.execute(
                    """
-                    SELECT name, kind, start_line, end_line
-                    FROM symbols
-                    WHERE name LIKE ?
-                    ORDER BY name
+                    SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
+                    FROM symbols s
+                    JOIN files f ON s.file_id = f.id
+                    WHERE s.name LIKE ?
+                    ORDER BY s.name
                    LIMIT ?
                    """,
                    (pattern, limit),
@@ -1274,6 +1333,7 @@ class DirIndexStore:
                    name=row["name"],
                    kind=row["kind"],
                    range=(row["start_line"], row["end_line"]),
+                    file=row["full_path"],
                )
                for row in rows
            ]
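
With the JOIN in place each Symbol now carries its file path, so callers can jump straight to a location. A hypothetical call (the public method name is an assumption; only the WHERE name LIKE ? parameterization is shown in the diff):

matches = store.search_symbols(pattern="%load%", kind="function", limit=10)  # '%' = substring match
for sym in matches:
    print(f"{sym.file}:{sym.range[0]}-{sym.range[1]}  {sym.kind} {sym.name}")
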
@@ -1359,7 +1419,7 @@ class DirIndexStore:
            """
        )

-        # Subdirectories table
+        # Subdirectories table (v5: removed direct_files)
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS subdirs (
@@ -1367,13 +1427,12 @@ class DirIndexStore:
                name TEXT NOT NULL UNIQUE,
                index_path TEXT NOT NULL,
                files_count INTEGER DEFAULT 0,
-                direct_files INTEGER DEFAULT 0,
                last_updated REAL
            )
            """
        )

-        # Symbols table
+        # Symbols table (v5: removed token_count and symbol_type)
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS symbols (
@@ -1382,9 +1441,7 @@ class DirIndexStore:
                name TEXT NOT NULL,
                kind TEXT NOT NULL,
                start_line INTEGER,
-                end_line INTEGER,
-                token_count INTEGER,
-                symbol_type TEXT
+                end_line INTEGER
            )
            """
        )
@@ -1421,14 +1478,13 @@ class DirIndexStore:
            """
        )

-        # Semantic metadata table
+        # Semantic metadata table (v5: removed keywords column)
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS semantic_metadata (
                id INTEGER PRIMARY KEY,
                file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
                summary TEXT,
-                keywords TEXT,
                purpose TEXT,
                llm_tool TEXT,
                generated_at REAL
@@ -1473,13 +1529,12 @@ class DirIndexStore:
            """
        )

-        # Indexes
+        # Indexes (v5: removed idx_symbols_type)
        conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
-        conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")

@@ -0,0 +1,188 @@
"""
Migration 005: Remove unused and redundant database fields.

This migration removes four problematic fields identified by Gemini analysis:

1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
   - Data: Migrated to normalized file_keywords table in migration 001
   - Impact: Column now redundant, remove to prevent sync issues

2. **symbols.token_count** (unused - always NULL)
   - Data: Never populated, always NULL
   - Impact: No data loss, just removes unused column

3. **symbols.symbol_type** (redundant - duplicates kind)
   - Data: Redundant with symbols.kind field
   - Impact: No data loss, kind field contains same information

4. **subdirs.direct_files** (unused - never displayed)
   - Data: Never used in queries or display logic
   - Impact: No data loss, just removes unused column

Schema changes use the table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""

import logging
from sqlite3 import Connection

log = logging.getLogger(__name__)


def upgrade(db_conn: Connection):
    """Remove unused and redundant fields from schema.

    Args:
        db_conn: The SQLite database connection.
    """
    cursor = db_conn.cursor()

    try:
        cursor.execute("BEGIN TRANSACTION")

        # Step 1: Remove semantic_metadata.keywords
        log.info("Removing semantic_metadata.keywords column...")

        # Check if semantic_metadata table exists
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_metadata'"
        )
        if cursor.fetchone():
            cursor.execute("""
                CREATE TABLE semantic_metadata_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL UNIQUE,
                    summary TEXT,
                    purpose TEXT,
                    llm_tool TEXT,
                    generated_at REAL,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
            """)

            cursor.execute("""
                INSERT INTO semantic_metadata_new (id, file_id, summary, purpose, llm_tool, generated_at)
                SELECT id, file_id, summary, purpose, llm_tool, generated_at
                FROM semantic_metadata
            """)

            cursor.execute("DROP TABLE semantic_metadata")
            cursor.execute("ALTER TABLE semantic_metadata_new RENAME TO semantic_metadata")

            # Recreate index
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)"
            )
            log.info("Removed semantic_metadata.keywords column")
        else:
            log.info("semantic_metadata table does not exist, skipping")

        # Step 2: Remove symbols.token_count and symbols.symbol_type
        log.info("Removing symbols.token_count and symbols.symbol_type columns...")

        # Check if symbols table exists
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='symbols'"
        )
        if cursor.fetchone():
            cursor.execute("""
                CREATE TABLE symbols_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL,
                    name TEXT NOT NULL,
                    kind TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
            """)

            cursor.execute("""
                INSERT INTO symbols_new (id, file_id, name, kind, start_line, end_line)
                SELECT id, file_id, name, kind, start_line, end_line
                FROM symbols
            """)

            cursor.execute("DROP TABLE symbols")
            cursor.execute("ALTER TABLE symbols_new RENAME TO symbols")

            # Recreate indexes (excluding idx_symbols_type which indexed symbol_type)
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            log.info("Removed symbols.token_count and symbols.symbol_type columns")
        else:
            log.info("symbols table does not exist, skipping")

        # Step 3: Remove subdirs.direct_files
        log.info("Removing subdirs.direct_files column...")

        # Check if subdirs table exists
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='subdirs'"
        )
        if cursor.fetchone():
            cursor.execute("""
                CREATE TABLE subdirs_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL UNIQUE,
                    index_path TEXT NOT NULL,
                    files_count INTEGER DEFAULT 0,
                    last_updated REAL
                )
            """)

            cursor.execute("""
                INSERT INTO subdirs_new (id, name, index_path, files_count, last_updated)
                SELECT id, name, index_path, files_count, last_updated
                FROM subdirs
            """)

            cursor.execute("DROP TABLE subdirs")
            cursor.execute("ALTER TABLE subdirs_new RENAME TO subdirs")

            # Recreate index
            cursor.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
            log.info("Removed subdirs.direct_files column")
        else:
            log.info("subdirs table does not exist, skipping")

        cursor.execute("COMMIT")
        log.info("Migration 005 completed successfully")

        # Vacuum to reclaim space (outside transaction)
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
            log.info("VACUUM completed successfully")
        except Exception as e:
            log.warning(f"VACUUM failed (non-critical): {e}")

    except Exception as e:
        log.error(f"Migration 005 failed: {e}")
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            pass
        raise


def downgrade(db_conn: Connection):
    """Restore removed fields (data will be lost for keywords, token_count, symbol_type, direct_files).

    This is a placeholder - a true downgrade is not feasible as the data is lost.
    The migration is designed to be one-way since the removed fields are unused/redundant.

    Args:
        db_conn: The SQLite database connection.
    """
    log.warning(
        "Migration 005 downgrade not supported - removed fields are unused/redundant. "
        "Data cannot be restored."
    )
    raise NotImplementedError(
        "Migration 005 downgrade not supported - this is a one-way migration"
    )
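
Since upgrade() takes a plain sqlite3 connection, the migration can be exercised directly, e.g. in tests; a minimal sketch (the path is illustrative):

import sqlite3
from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade

conn = sqlite3.connect("path/to/_index.db")
conn.row_factory = sqlite3.Row
upgrade(conn)   # one-way: downgrade() raises NotImplementedError
conn.close()
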