Add comprehensive tests for schema cleanup migration and search comparison

- Implement tests for migration 005 to verify removal of deprecated fields in the database schema.
- Ensure that new databases are created with a clean schema.
- Validate that keywords are correctly extracted from the normalized file_keywords table.
- Test symbol insertion without deprecated fields and subdir operations without direct_files.
- Create a detailed search comparison test to evaluate vector search vs hybrid search performance.
- Add a script for reindexing projects to extract code relationships and verify GraphAnalyzer functionality.
- Include a test script to check TreeSitter parser availability and relationship extraction from sample files.
This commit is contained in:
catlog22
2025-12-16 19:27:05 +08:00
parent 3da0ef2adb
commit df23975a0b
61 changed files with 13114 additions and 366 deletions

View File

@@ -18,3 +18,7 @@ Requires-Dist: pathspec>=0.11
Provides-Extra: semantic
Requires-Dist: numpy>=1.24; extra == "semantic"
Requires-Dist: fastembed>=0.2; extra == "semantic"
Provides-Extra: encoding
Requires-Dist: chardet>=5.0; extra == "encoding"
Provides-Extra: full
Requires-Dist: tiktoken>=0.5.0; extra == "full"

View File

@@ -11,15 +11,23 @@ src/codexlens/entities.py
src/codexlens/errors.py
src/codexlens/cli/__init__.py
src/codexlens/cli/commands.py
src/codexlens/cli/model_manager.py
src/codexlens/cli/output.py
src/codexlens/parsers/__init__.py
src/codexlens/parsers/encoding.py
src/codexlens/parsers/factory.py
src/codexlens/parsers/tokenizer.py
src/codexlens/parsers/treesitter_parser.py
src/codexlens/search/__init__.py
src/codexlens/search/chain_search.py
src/codexlens/search/hybrid_search.py
src/codexlens/search/query_parser.py
src/codexlens/search/ranking.py
src/codexlens/semantic/__init__.py
src/codexlens/semantic/chunker.py
src/codexlens/semantic/code_extractor.py
src/codexlens/semantic/embedder.py
src/codexlens/semantic/graph_analyzer.py
src/codexlens/semantic/llm_enhancer.py
src/codexlens/semantic/vector_store.py
src/codexlens/storage/__init__.py
@@ -30,21 +38,45 @@ src/codexlens/storage/migration_manager.py
src/codexlens/storage/path_mapper.py
src/codexlens/storage/registry.py
src/codexlens/storage/sqlite_store.py
src/codexlens/storage/sqlite_utils.py
src/codexlens/storage/migrations/__init__.py
src/codexlens/storage/migrations/migration_001_normalize_keywords.py
src/codexlens/storage/migrations/migration_002_add_token_metadata.py
src/codexlens/storage/migrations/migration_003_code_relationships.py
src/codexlens/storage/migrations/migration_004_dual_fts.py
src/codexlens/storage/migrations/migration_005_cleanup_unused_fields.py
tests/test_chain_search_engine.py
tests/test_cli_hybrid_search.py
tests/test_cli_output.py
tests/test_code_extractor.py
tests/test_config.py
tests/test_dual_fts.py
tests/test_encoding.py
tests/test_entities.py
tests/test_errors.py
tests/test_file_cache.py
tests/test_graph_analyzer.py
tests/test_graph_cli.py
tests/test_graph_storage.py
tests/test_hybrid_chunker.py
tests/test_hybrid_search_e2e.py
tests/test_incremental_indexing.py
tests/test_llm_enhancer.py
tests/test_parser_integration.py
tests/test_parsers.py
tests/test_performance_optimizations.py
tests/test_query_parser.py
tests/test_rrf_fusion.py
tests/test_schema_cleanup_migration.py
tests/test_search_comprehensive.py
tests/test_search_full_coverage.py
tests/test_search_performance.py
tests/test_semantic.py
tests/test_semantic_search.py
tests/test_storage.py
tests/test_token_chunking.py
tests/test_token_storage.py
tests/test_tokenizer.py
tests/test_tokenizer_performance.py
tests/test_treesitter_parser.py
tests/test_vector_search_full.py

View File

@@ -7,6 +7,12 @@ tree-sitter-javascript>=0.25
tree-sitter-typescript>=0.23
pathspec>=0.11
[encoding]
chardet>=5.0
[full]
tiktoken>=0.5.0
[semantic]
numpy>=1.24
fastembed>=0.2

View File

@@ -2,6 +2,25 @@
from __future__ import annotations

import sys
import os

# Force UTF-8 encoding for the Windows console.
# Windows terminals often default to a legacy code page (e.g. GBK), which
# garbles non-ASCII output; forcing UTF-8 keeps CJK characters readable.
if sys.platform == "win32":
    # Set the Python I/O encoding env var only if the user hasn't set one.
    os.environ.setdefault("PYTHONIOENCODING", "utf-8")
    # Reconfigure stdout/stderr to use UTF-8 where supported (Python 3.7+
    # TextIOWrapper.reconfigure); errors="replace" avoids encode crashes.
    try:
        if hasattr(sys.stdout, "reconfigure"):
            sys.stdout.reconfigure(encoding="utf-8", errors="replace")
        if hasattr(sys.stderr, "reconfigure"):
            sys.stderr.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Fallback: some environments (captured/redirected streams) don't
        # support reconfigure; degrade gracefully and keep the CLI usable.
        pass

from .commands import app

__all__ = ["app"]

View File

@@ -181,31 +181,46 @@ def search(
limit: int = typer.Option(20, "--limit", "-n", min=1, max=500, help="Max results."),
depth: int = typer.Option(-1, "--depth", "-d", help="Search depth (-1 = unlimited, 0 = current only)."),
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector."),
mode: str = typer.Option("exact", "--mode", "-m", help="Search mode: exact, fuzzy, hybrid, vector, pure-vector."),
weights: Optional[str] = typer.Option(None, "--weights", help="Custom RRF weights as 'exact,fuzzy,vector' (e.g., '0.5,0.3,0.2')."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
) -> None:
"""Search indexed file contents using SQLite FTS5.
"""Search indexed file contents using SQLite FTS5 or semantic vectors.
Uses chain search across directory indexes.
Use --depth to limit search recursion (0 = current dir only).
Search Modes:
- exact: Exact FTS using unicode61 tokenizer (default)
- fuzzy: Fuzzy FTS using trigram tokenizer
- hybrid: RRF fusion of exact + fuzzy (recommended)
- vector: Semantic vector search (future)
- exact: Exact FTS using unicode61 tokenizer (default) - for code identifiers
- fuzzy: Fuzzy FTS using trigram tokenizer - for typo-tolerant search
- hybrid: RRF fusion of exact + fuzzy + vector (recommended) - best recall
- vector: Vector search with exact FTS fallback - semantic + keyword
- pure-vector: Pure semantic vector search only - natural language queries
Vector Search Requirements:
Vector search modes require pre-generated embeddings.
Use 'codexlens embeddings-generate' to create embeddings first.
Hybrid Mode:
Default weights: exact=0.4, fuzzy=0.3, vector=0.3
Use --weights to customize (e.g., --weights 0.5,0.3,0.2)
Examples:
# Exact code search
codexlens search "authenticate_user" --mode exact
# Semantic search (requires embeddings)
codexlens search "how to verify user credentials" --mode pure-vector
# Best of both worlds
codexlens search "authentication" --mode hybrid
"""
_configure_logging(verbose)
search_path = path.expanduser().resolve()
# Validate mode
valid_modes = ["exact", "fuzzy", "hybrid", "vector"]
valid_modes = ["exact", "fuzzy", "hybrid", "vector", "pure-vector"]
if mode not in valid_modes:
if json_mode:
print_json(success=False, error=f"Invalid mode: {mode}. Must be one of: {', '.join(valid_modes)}")
@@ -244,8 +259,18 @@ def search(
engine = ChainSearchEngine(registry, mapper)
# Map mode to options
hybrid_mode = mode == "hybrid"
enable_fuzzy = mode in ["fuzzy", "hybrid"]
if mode == "exact":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, False, False, False
elif mode == "fuzzy":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = False, True, False, False
elif mode == "vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, False # Vector + exact fallback
elif mode == "pure-vector":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, False, True, True # Pure vector only
elif mode == "hybrid":
hybrid_mode, enable_fuzzy, enable_vector, pure_vector = True, True, True, False
else:
raise ValueError(f"Invalid mode: {mode}")
options = SearchOptions(
depth=depth,
@@ -253,6 +278,8 @@ def search(
files_only=files_only,
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
enable_vector=enable_vector,
pure_vector=pure_vector,
hybrid_weights=hybrid_weights,
)
@@ -1573,3 +1600,483 @@ def semantic_list(
finally:
if registry is not None:
registry.close()
# ==================== Model Management Commands ====================


@app.command(name="model-list")
def model_list(
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """List available embedding models and their installation status.

    Shows 4 model profiles (fast, code, multilingual, balanced) with:
    - Installation status
    - Model size and dimensions
    - Use case recommendations
    """
    try:
        # Imported lazily: model_manager requires the optional fastembed extra.
        from codexlens.cli.model_manager import list_models

        result = list_models()
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            models = data["models"]
            cache_dir = data["cache_dir"]
            cache_exists = data["cache_exists"]
            console.print("[bold]Available Embedding Models:[/bold]")
            console.print(f"Cache directory: [dim]{cache_dir}[/dim] {'(exists)' if cache_exists else '(not found)'}\n")
            table = Table(show_header=True, header_style="bold")
            table.add_column("Profile", style="cyan")
            table.add_column("Model Name", style="blue")
            table.add_column("Dims", justify="right")
            table.add_column("Size (MB)", justify="right")
            table.add_column("Status", justify="center")
            table.add_column("Use Case", style="dim")
            for model in models:
                status_icon = "[green]✓[/green]" if model["installed"] else "[dim]—[/dim]"
                # Installed models show the measured cache size; others an estimate.
                size_display = (
                    f"{model['actual_size_mb']:.1f}" if model["installed"]
                    else f"~{model['estimated_size_mb']}"
                )
                table.add_row(
                    model["profile"],
                    model["model_name"],
                    str(model["dimensions"]),
                    size_display,
                    status_icon,
                    model["use_case"][:40] + "..." if len(model["use_case"]) > 40 else model["use_case"],
                )
            console.print(table)
            console.print("\n[dim]Use 'codexlens model-download <profile>' to download a model[/dim]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception, so without this clause the
        # deliberate exits above were swallowed by the handler below and
        # re-reported as a command failure.
        raise
    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-list failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-download")
def model_download(
    profile: str = typer.Argument(..., help="Model profile to download (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Download an embedding model by profile name.

    Example:
        codexlens model-download code  # Download code-optimized model
    """
    try:
        from codexlens.cli.model_manager import download_model

        if not json_mode:
            console.print(f"[bold]Downloading model:[/bold] {profile}")
            console.print("[dim]This may take a few minutes depending on your internet connection...[/dim]\n")
        # Stream progress to the console only in human-readable mode.
        progress_callback = None if json_mode else lambda msg: console.print(f"[cyan]{msg}[/cyan]")
        result = download_model(profile, progress_callback=progress_callback)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[green]✓[/green] Model downloaded successfully!")
            console.print(f" Profile: {data['profile']}")
            console.print(f" Model: {data['model_name']}")
            console.print(f" Cache size: {data['cache_size_mb']:.1f} MB")
            console.print(f" Location: [dim]{data['cache_path']}[/dim]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception; re-raise intended exits so
        # they are not re-reported as failures by the handlers below.
        raise
    except ImportError:
        if json_mode:
            print_json(success=False, error="fastembed not installed. Install with: pip install codexlens[semantic]")
        else:
            console.print("[red]Error:[/red] fastembed not installed")
            console.print("[yellow]Install with:[/yellow] pip install codexlens[semantic]")
        raise typer.Exit(code=1)
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-download failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-delete")
def model_delete(
    profile: str = typer.Argument(..., help="Model profile to delete (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Delete a downloaded embedding model from cache.

    Example:
        codexlens model-delete fast  # Delete fast model
    """
    try:
        from codexlens.cli.model_manager import delete_model

        if not json_mode:
            console.print(f"[bold yellow]Deleting model:[/bold yellow] {profile}")
        result = delete_model(profile)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[green]✓[/green] Model deleted successfully!")
            console.print(f" Profile: {data['profile']}")
            console.print(f" Model: {data['model_name']}")
            console.print(f" Freed space: {data['deleted_size_mb']:.1f} MB")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception; re-raise intended exits so
        # the generic handler below doesn't report them as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-delete failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="model-info")
def model_info(
    profile: str = typer.Argument(..., help="Model profile to get info (fast, code, multilingual, balanced)."),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Get detailed information about a model profile.

    Example:
        codexlens model-info code  # Get code model details
    """
    try:
        from codexlens.cli.model_manager import get_model_info

        result = get_model_info(profile)
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                raise typer.Exit(code=1)
            data = result["result"]
            console.print(f"[bold]Model Profile:[/bold] {data['profile']}")
            console.print(f" Model name: {data['model_name']}")
            console.print(f" Dimensions: {data['dimensions']}")
            console.print(f" Status: {'[green]Installed[/green]' if data['installed'] else '[dim]Not installed[/dim]'}")
            # Only installed models have a measured cache size/location.
            if data['installed'] and data['actual_size_mb']:
                console.print(f" Cache size: {data['actual_size_mb']:.1f} MB")
                console.print(f" Location: [dim]{data['cache_path']}[/dim]")
            else:
                console.print(f" Estimated size: ~{data['estimated_size_mb']} MB")
            console.print(f"\n Description: {data['description']}")
            console.print(f" Use case: {data['use_case']}")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception; re-raise intended exits so
        # the generic handler below doesn't report them as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Model-info failed:[/red] {exc}")
        raise typer.Exit(code=1)
# ==================== Embedding Management Commands ====================


@app.command(name="embeddings-status")
def embeddings_status(
    path: Optional[Path] = typer.Argument(
        None,
        exists=True,
        help="Path to specific _index.db file or directory containing indexes. If not specified, uses default index root.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
) -> None:
    """Check embedding status for one or all indexes.

    Shows embedding statistics including:
    - Number of chunks generated
    - File coverage percentage
    - Files missing embeddings

    Examples:
        codexlens embeddings-status  # Check all indexes
        codexlens embeddings-status ~/.codexlens/indexes/project/_index.db  # Check specific index
        codexlens embeddings-status ~/projects/my-app  # Check project (auto-finds index)
    """
    try:
        from codexlens.cli.embedding_manager import check_index_embeddings, get_embedding_stats_summary

        # Determine what to check: no path means summarize every known index.
        if path is None:
            index_root = _get_index_root()
            result = get_embedding_stats_summary(index_root)
            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)
                data = result["result"]
                total = data["total_indexes"]
                with_emb = data["indexes_with_embeddings"]
                total_chunks = data["total_chunks"]
                console.print(f"[bold]Embedding Status Summary[/bold]")
                console.print(f"Index root: [dim]{index_root}[/dim]\n")
                console.print(f"Total indexes: {total}")
                console.print(f"Indexes with embeddings: [{'green' if with_emb > 0 else 'yellow'}]{with_emb}[/]/{total}")
                console.print(f"Total chunks: {total_chunks:,}\n")
                if data["indexes"]:
                    table = Table(show_header=True, header_style="bold")
                    table.add_column("Project", style="cyan")
                    table.add_column("Files", justify="right")
                    table.add_column("Chunks", justify="right")
                    table.add_column("Coverage", justify="right")
                    table.add_column("Status", justify="center")
                    for idx_stat in data["indexes"]:
                        status_icon = "[green]✓[/green]" if idx_stat["has_embeddings"] else "[dim]—[/dim]"
                        coverage = f"{idx_stat['coverage_percent']:.1f}%" if idx_stat["has_embeddings"] else ""
                        table.add_row(
                            idx_stat["project"],
                            str(idx_stat["total_files"]),
                            f"{idx_stat['total_chunks']:,}" if idx_stat["has_embeddings"] else "0",
                            coverage,
                            status_icon,
                        )
                    console.print(table)
        else:
            # Check a specific index file, or resolve a project dir to its index.
            target_path = path.expanduser().resolve()
            if target_path.is_file() and target_path.name == "_index.db":
                # Direct index file
                index_path = target_path
            elif target_path.is_dir():
                # Map the project directory to its index via the registry.
                registry = RegistryStore()
                try:
                    registry.initialize()
                    mapper = PathMapper()
                    index_path = mapper.source_to_index_db(target_path)
                    if not index_path.exists():
                        console.print(f"[red]Error:[/red] No index found for {target_path}")
                        console.print("Run 'codexlens init' first to create an index")
                        raise typer.Exit(code=1)
                finally:
                    registry.close()
            else:
                console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
                raise typer.Exit(code=1)
            result = check_index_embeddings(index_path)
            if json_mode:
                print_json(**result)
            else:
                if not result["success"]:
                    console.print(f"[red]Error:[/red] {result.get('error', 'Unknown error')}")
                    raise typer.Exit(code=1)
                data = result["result"]
                has_emb = data["has_embeddings"]
                console.print(f"[bold]Embedding Status[/bold]")
                console.print(f"Index: [dim]{data['index_path']}[/dim]\n")
                if has_emb:
                    console.print(f"[green]✓[/green] Embeddings available")
                    console.print(f" Total chunks: {data['total_chunks']:,}")
                    console.print(f" Total files: {data['total_files']:,}")
                    console.print(f" Files with embeddings: {data['files_with_chunks']:,}/{data['total_files']}")
                    console.print(f" Coverage: {data['coverage_percent']:.1f}%")
                    if data["files_without_chunks"] > 0:
                        console.print(f"\n[yellow]Warning:[/yellow] {data['files_without_chunks']} files missing embeddings")
                        if data["missing_files_sample"]:
                            console.print(" Sample missing files:")
                            for file in data["missing_files_sample"]:
                                console.print(f" [dim]{file}[/dim]")
                else:
                    console.print(f"[yellow]—[/yellow] No embeddings found")
                    console.print(f" Total files indexed: {data['total_files']:,}")
                    console.print("\n[dim]Generate embeddings with:[/dim]")
                    console.print(f" [cyan]codexlens embeddings-generate {index_path}[/cyan]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception, so the intentional exits
        # above were previously caught below and mis-reported as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Embeddings-status failed:[/red] {exc}")
        raise typer.Exit(code=1)
@app.command(name="embeddings-generate")
def embeddings_generate(
    path: Path = typer.Argument(
        ...,
        exists=True,
        help="Path to _index.db file or project directory.",
    ),
    model: str = typer.Option(
        "code",
        "--model",
        "-m",
        help="Model profile: fast, code, multilingual, balanced.",
    ),
    force: bool = typer.Option(
        False,
        "--force",
        "-f",
        help="Force regeneration even if embeddings exist.",
    ),
    chunk_size: int = typer.Option(
        2000,
        "--chunk-size",
        help="Maximum chunk size in characters.",
    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
) -> None:
    """Generate semantic embeddings for code search.

    Creates vector embeddings for all files in an index to enable
    semantic search capabilities. Embeddings are stored in the same
    database as the FTS index.

    Model Profiles:
    - fast: BAAI/bge-small-en-v1.5 (384 dims, ~80MB)
    - code: jinaai/jina-embeddings-v2-base-code (768 dims, ~150MB) [recommended]
    - multilingual: intfloat/multilingual-e5-large (1024 dims, ~1GB)
    - balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dims, ~600MB)

    Examples:
        codexlens embeddings-generate ~/projects/my-app  # Auto-find index for project
        codexlens embeddings-generate ~/.codexlens/indexes/project/_index.db  # Specific index
        codexlens embeddings-generate ~/projects/my-app --model fast --force  # Regenerate with fast model
    """
    _configure_logging(verbose)
    try:
        from codexlens.cli.embedding_manager import generate_embeddings

        # Resolve the argument to a concrete _index.db path.
        target_path = path.expanduser().resolve()
        if target_path.is_file() and target_path.name == "_index.db":
            # Direct index file
            index_path = target_path
        elif target_path.is_dir():
            # Map the project directory to its index via the registry.
            registry = RegistryStore()
            try:
                registry.initialize()
                mapper = PathMapper()
                index_path = mapper.source_to_index_db(target_path)
                if not index_path.exists():
                    console.print(f"[red]Error:[/red] No index found for {target_path}")
                    console.print("Run 'codexlens init' first to create an index")
                    raise typer.Exit(code=1)
            finally:
                registry.close()
        else:
            console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
            raise typer.Exit(code=1)

        # Progress callback — only emits in verbose, human-readable mode.
        def progress_update(msg: str):
            if not json_mode and verbose:
                console.print(f" {msg}")

        # Bug fix: these header lines previously printed even with --json,
        # corrupting the machine-readable output stream. Other commands
        # (e.g. model-download) already guard their console output this way.
        if not json_mode:
            console.print(f"[bold]Generating embeddings[/bold]")
            console.print(f"Index: [dim]{index_path}[/dim]")
            console.print(f"Model: [cyan]{model}[/cyan]\n")
        result = generate_embeddings(
            index_path,
            model_profile=model,
            force=force,
            chunk_size=chunk_size,
            progress_callback=progress_update,
        )
        if json_mode:
            print_json(**result)
        else:
            if not result["success"]:
                error_msg = result.get("error", "Unknown error")
                console.print(f"[red]Error:[/red] {error_msg}")
                # Provide helpful hints for the two common failure modes.
                if "already has" in error_msg:
                    console.print("\n[dim]Use --force to regenerate existing embeddings[/dim]")
                elif "Semantic search not available" in error_msg:
                    console.print("\n[dim]Install semantic dependencies:[/dim]")
                    console.print(" [cyan]pip install codexlens[semantic][/cyan]")
                raise typer.Exit(code=1)
            data = result["result"]
            elapsed = data["elapsed_time"]
            console.print(f"[green]✓[/green] Embeddings generated successfully!")
            console.print(f" Model: {data['model_name']}")
            console.print(f" Chunks created: {data['chunks_created']:,}")
            console.print(f" Files processed: {data['files_processed']}")
            if data["files_failed"] > 0:
                console.print(f" [yellow]Files failed: {data['files_failed']}[/yellow]")
                if data["failed_files"]:
                    console.print(" [dim]First failures:[/dim]")
                    for file_path, error in data["failed_files"]:
                        console.print(f" [dim]{file_path}: {error}[/dim]")
            console.print(f" Time: {elapsed:.1f}s")
            console.print("\n[dim]Use vector search with:[/dim]")
            console.print(" [cyan]codexlens search 'your query' --mode pure-vector[/cyan]")
    except typer.Exit:
        # Bug fix: typer.Exit subclasses Exception, so the intentional exits
        # above were previously caught below and mis-reported as failures.
        raise
    except Exception as exc:
        if json_mode:
            print_json(success=False, error=str(exc))
        else:
            console.print(f"[red]Embeddings-generate failed:[/red] {exc}")
        raise typer.Exit(code=1)

View File

@@ -0,0 +1,331 @@
"""Embedding Manager - Manage semantic embeddings for code indexes."""
import logging
import sqlite3
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
except ImportError:
SEMANTIC_AVAILABLE = False
logger = logging.getLogger(__name__)
def check_index_embeddings(index_path: Path) -> Dict[str, Any]:
    """Check if an index has embeddings and return statistics.

    Args:
        index_path: Path to an ``_index.db`` SQLite file.

    Returns:
        ``{"success": False, "error": ...}`` when the index is missing or
        unreadable, otherwise ``{"success": True, "result": {...}}`` where the
        result holds: has_embeddings, total_chunks, total_files,
        files_with_chunks, files_without_chunks, coverage_percent,
        missing_files_sample (up to 5 paths), index_path.

    Note:
        The return annotation previously used the builtin ``any`` function
        instead of ``typing.Any``; fixed here.
    """
    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }
    try:
        with sqlite3.connect(index_path) as conn:
            # The semantic_chunks table only exists once embeddings were generated.
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
            )
            table_exists = cursor.fetchone() is not None

            # Total indexed files is reported in both branches.
            cursor = conn.execute("SELECT COUNT(*) FROM files")
            total_files = cursor.fetchone()[0]

            if not table_exists:
                # No embeddings at all: zero coverage across every indexed file.
                return {
                    "success": True,
                    "result": {
                        "has_embeddings": False,
                        "total_chunks": 0,
                        "total_files": total_files,
                        "files_with_chunks": 0,
                        "files_without_chunks": total_files,
                        "coverage_percent": 0.0,
                        "missing_files_sample": [],
                        "index_path": str(index_path),
                    },
                }

            cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
            total_chunks = cursor.fetchone()[0]

            cursor = conn.execute(
                "SELECT COUNT(DISTINCT file_path) FROM semantic_chunks"
            )
            files_with_chunks = cursor.fetchone()[0]

            # Sample up to 5 indexed files that have no chunks, for diagnostics.
            cursor = conn.execute("""
                SELECT full_path
                FROM files
                WHERE full_path NOT IN (
                    SELECT DISTINCT file_path FROM semantic_chunks
                )
                LIMIT 5
            """)
            missing_files = [row[0] for row in cursor.fetchall()]

            return {
                "success": True,
                "result": {
                    "has_embeddings": total_chunks > 0,
                    "total_chunks": total_chunks,
                    "total_files": total_files,
                    "files_with_chunks": files_with_chunks,
                    "files_without_chunks": total_files - files_with_chunks,
                    "coverage_percent": round((files_with_chunks / total_files * 100) if total_files > 0 else 0, 1),
                    "missing_files_sample": missing_files,
                    "index_path": str(index_path),
                },
            }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to check embeddings: {str(e)}",
        }
def generate_embeddings(
    index_path: Path,
    model_profile: str = "code",
    force: bool = False,
    chunk_size: int = 2000,
    progress_callback: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Generate embeddings for an index.

    Chunks every file stored in the index, embeds each chunk, and persists
    the chunks through VectorStore into the same database.

    Args:
        index_path: Path to _index.db file
        model_profile: Model profile (fast, code, multilingual, balanced)
        force: If True, regenerate even if embeddings exist
        chunk_size: Maximum chunk size in characters
        progress_callback: Optional callback receiving progress messages

    Returns:
        ``{"success": True, "result": {...}}`` with generation statistics, or
        ``{"success": False, "error": ...}`` on any failure (missing optional
        dependency, missing index, pre-existing chunks without ``force``,
        component initialization or read errors, empty index).
    """
    if not SEMANTIC_AVAILABLE:
        return {
            "success": False,
            "error": "Semantic search not available. Install with: pip install codexlens[semantic]",
        }
    if not index_path.exists():
        return {
            "success": False,
            "error": f"Index not found: {index_path}",
        }
    # Check existing chunks; refuse to silently overwrite without --force.
    status = check_index_embeddings(index_path)
    if not status["success"]:
        return status
    existing_chunks = status["result"]["total_chunks"]
    if existing_chunks > 0 and not force:
        return {
            "success": False,
            "error": f"Index already has {existing_chunks} chunks. Use --force to regenerate.",
            "existing_chunks": existing_chunks,
        }
    if force and existing_chunks > 0:
        if progress_callback:
            progress_callback(f"Clearing {existing_chunks} existing chunks...")
        try:
            with sqlite3.connect(index_path) as conn:
                conn.execute("DELETE FROM semantic_chunks")
                conn.commit()
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to clear existing chunks: {str(e)}",
            }
    # Initialize components (embedder may download/load the model here).
    try:
        embedder = Embedder(profile=model_profile)
        vector_store = VectorStore(index_path)
        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
        if progress_callback:
            progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to initialize components: {str(e)}",
        }
    # Read files from index.
    # NOTE(review): loads every file's content into memory at once — fine for
    # typical indexes, may need streaming for very large repositories.
    try:
        with sqlite3.connect(index_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("SELECT full_path, content, language FROM files")
            files = cursor.fetchall()
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to read files: {str(e)}",
        }
    if len(files) == 0:
        return {
            "success": False,
            "error": "No files found in index",
        }
    if progress_callback:
        progress_callback(f"Processing {len(files)} files...")
    # Process each file; per-file failures are collected, not fatal.
    total_chunks = 0
    failed_files = []
    start_time = time.time()
    for idx, file_row in enumerate(files, 1):
        file_path = file_row["full_path"]
        content = file_row["content"]
        # Fall back to "python" when the indexed language is NULL/empty.
        language = file_row["language"] or "python"
        try:
            # Create chunks
            chunks = chunker.chunk_sliding_window(
                content,
                file_path=file_path,
                language=language
            )
            if not chunks:
                continue
            # Generate embeddings, one chunk at a time.
            for chunk in chunks:
                embedding = embedder.embed_single(chunk.content)
                chunk.embedding = embedding
            # Store chunks
            vector_store.add_chunks(chunks, file_path)
            total_chunks += len(chunks)
            if progress_callback:
                progress_callback(f"[{idx}/{len(files)}] {file_path}: {len(chunks)} chunks")
        except Exception as e:
            logger.error(f"Failed to process {file_path}: {e}")
            failed_files.append((file_path, str(e)))
    elapsed_time = time.time() - start_time
    return {
        "success": True,
        "result": {
            "chunks_created": total_chunks,
            # Files that chunked to nothing still count as processed here.
            "files_processed": len(files) - len(failed_files),
            "files_failed": len(failed_files),
            "elapsed_time": elapsed_time,
            "model_profile": model_profile,
            "model_name": embedder.model_name,
            "failed_files": failed_files[:5],  # First 5 failures
            "index_path": str(index_path),
        },
    }
def find_all_indexes(scan_dir: Path) -> List[Path]:
    """Recursively collect every ``_index.db`` file below *scan_dir*.

    Args:
        scan_dir: Root directory of the scan.

    Returns:
        Paths of all ``_index.db`` files found; empty when *scan_dir*
        does not exist.
    """
    found: List[Path] = []
    if scan_dir.exists():
        found.extend(scan_dir.rglob("_index.db"))
    return found
def get_embedding_stats_summary(index_root: Path) -> Dict[str, Any]:
    """Aggregate embedding statistics for every index under *index_root*.

    Args:
        index_root: Root directory scanned recursively for ``_index.db`` files.

    Returns:
        ``{"success": True, "result": {...}}`` with total_indexes,
        indexes_with_embeddings, total_chunks, and a per-index breakdown
        (project name, path, chunk counts, coverage). Indexes that fail the
        per-index check are silently skipped from the breakdown.

    Note:
        The return annotation previously used the builtin ``any`` function
        instead of ``typing.Any``; fixed here.
    """
    indexes = find_all_indexes(index_root)
    if not indexes:
        return {
            "success": True,
            "result": {
                "total_indexes": 0,
                "indexes_with_embeddings": 0,
                "total_chunks": 0,
                "indexes": [],
            },
        }
    total_chunks = 0
    indexes_with_embeddings = 0
    index_stats = []
    for index_path in indexes:
        status = check_index_embeddings(index_path)
        if not status["success"]:
            # Unreadable indexes are skipped rather than failing the summary.
            continue
        result = status["result"]
        has_emb = result["has_embeddings"]
        chunks = result["total_chunks"]
        if has_emb:
            indexes_with_embeddings += 1
            total_chunks += chunks
        # The index's parent directory name doubles as the project name.
        project_name = index_path.parent.name
        index_stats.append({
            "project": project_name,
            "path": str(index_path),
            "has_embeddings": has_emb,
            "total_chunks": chunks,
            "total_files": result["total_files"],
            "coverage_percent": result.get("coverage_percent", 0),
        })
    return {
        "success": True,
        "result": {
            "total_indexes": len(indexes),
            "indexes_with_embeddings": indexes_with_embeddings,
            "total_chunks": total_chunks,
            "indexes": index_stats,
        },
    }

View File

@@ -0,0 +1,289 @@
"""Model Manager - Manage fastembed models for semantic search."""
import json
import os
import shutil
from pathlib import Path
from typing import Dict, List, Optional
try:
from fastembed import TextEmbedding
FASTEMBED_AVAILABLE = True
except ImportError:
FASTEMBED_AVAILABLE = False
# Model profiles with metadata.
# Keys are the profile names accepted by the CLI commands (model-list,
# model-download, embeddings-generate --model). "size_mb" is a rough
# download-size estimate; "dimensions" is the embedding vector width.
MODEL_PROFILES = {
    "fast": {
        "model_name": "BAAI/bge-small-en-v1.5",
        "dimensions": 384,
        "size_mb": 80,
        "description": "Fast, lightweight, English-optimized",
        "use_case": "Quick prototyping, resource-constrained environments",
    },
    "code": {
        "model_name": "jinaai/jina-embeddings-v2-base-code",
        "dimensions": 768,
        "size_mb": 150,
        "description": "Code-optimized, best for programming languages",
        "use_case": "Open source projects, code semantic search",
    },
    "multilingual": {
        "model_name": "intfloat/multilingual-e5-large",
        "dimensions": 1024,
        "size_mb": 1000,
        "description": "Multilingual + code support",
        "use_case": "Enterprise multilingual projects",
    },
    "balanced": {
        "model_name": "mixedbread-ai/mxbai-embed-large-v1",
        "dimensions": 1024,
        "size_mb": 600,
        "description": "High accuracy, general purpose",
        "use_case": "High-quality semantic search, balanced performance",
    },
}
def get_cache_dir() -> Path:
    """Return the directory where fastembed caches downloaded models.

    Resolution order:
      1. ``HF_HOME`` environment variable, if set.
      2. Windows: ``%LOCALAPPDATA%\\Temp\\fastembed_cache``.
      3. Other platforms: ``~/.cache/fastembed``.
    """
    hf_home = os.environ.get("HF_HOME")
    if hf_home is not None:
        return Path(hf_home)
    if os.name == "nt":  # Windows
        local_appdata = os.environ.get(
            "LOCALAPPDATA", Path.home() / "AppData" / "Local"
        )
        return Path(local_appdata) / "Temp" / "fastembed_cache"
    # Unix-like default
    return Path.home() / ".cache" / "fastembed"
def list_models() -> Dict[str, Any]:
    """List available model profiles and their installation status.

    Returns:
        Result dictionary with a ``success`` flag. On success, ``result``
        holds per-profile entries (installation status and sizes), the
        cache directory path, and whether the cache directory exists.
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }
    cache_dir = get_cache_dir()
    cache_exists = cache_dir.exists()
    models = []
    for profile, info in MODEL_PROFILES.items():
        model_name = info["model_name"]
        # Hugging Face hub cache layout: models--{org}--{model}
        model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
        installed = cache_exists and model_cache_path.exists()
        cache_size_mb = 0
        if installed:
            # Actual on-disk size of all cached model files, in MB.
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob("*")
                if f.is_file()
            )
            cache_size_mb = round(total_size / (1024 * 1024), 1)
        models.append({
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb if installed else None,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
        })
    return {
        "success": True,
        "result": {
            "models": models,
            "cache_dir": str(cache_dir),
            "cache_exists": cache_exists,
        },
    }
def download_model(profile: str, progress_callback: Optional[Callable[[str], None]] = None) -> Dict[str, Any]:
    """Download a model by profile name.

    Args:
        profile: Model profile name (fast, code, multilingual, balanced)
        progress_callback: Optional callable receiving human-readable
            progress messages.

    Returns:
        Result dictionary with success status; on success, ``result``
        includes the cached model's size and path.
    """
    if not FASTEMBED_AVAILABLE:
        return {
            "success": False,
            "error": "fastembed not installed. Install with: pip install codexlens[semantic]",
        }
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    model_name = MODEL_PROFILES[profile]["model_name"]
    try:
        # Instantiating TextEmbedding triggers the download into the local
        # cache when the model is not already present; the instance itself
        # is not needed afterwards.
        if progress_callback:
            progress_callback(f"Downloading {model_name}...")
        TextEmbedding(model_name=model_name)
        if progress_callback:
            progress_callback(f"Model {model_name} downloaded successfully")
        # Report the on-disk size of the freshly cached model.
        cache_dir = get_cache_dir()
        model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
        cache_size = 0
        if model_cache_path.exists():
            total_size = sum(
                f.stat().st_size
                for f in model_cache_path.rglob("*")
                if f.is_file()
            )
            cache_size = round(total_size / (1024 * 1024), 1)
        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "cache_size_mb": cache_size,
                "cache_path": str(model_cache_path),
            },
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to download model: {str(e)}",
        }
def delete_model(profile: str) -> Dict[str, Any]:
    """Delete a downloaded model from cache.

    Args:
        profile: Model profile name to delete

    Returns:
        Result dictionary with success status; on success, ``result``
        includes the number of megabytes freed and the removed path.
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    model_name = MODEL_PROFILES[profile]["model_name"]
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
    if not model_cache_path.exists():
        return {
            "success": False,
            "error": f"Model {profile} ({model_name}) is not installed",
        }
    try:
        # Measure the size before deletion so it can be reported.
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        size_mb = round(total_size / (1024 * 1024), 1)
        # Remove the entire cached model directory.
        shutil.rmtree(model_cache_path)
        return {
            "success": True,
            "result": {
                "profile": profile,
                "model_name": model_name,
                "deleted_size_mb": size_mb,
                "cache_path": str(model_cache_path),
            },
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to delete model: {str(e)}",
        }
def get_model_info(profile: str) -> Dict[str, Any]:
    """Get detailed information about a model profile.

    Args:
        profile: Model profile name

    Returns:
        Result dictionary with model metadata, installation status, and
        (when installed) the actual cache size and path.
    """
    if profile not in MODEL_PROFILES:
        return {
            "success": False,
            "error": f"Unknown profile: {profile}. Available: {', '.join(MODEL_PROFILES.keys())}",
        }
    info = MODEL_PROFILES[profile]
    model_name = info["model_name"]
    # Installation status is inferred from the presence of the model's
    # directory in the Hugging Face hub cache layout.
    cache_dir = get_cache_dir()
    model_cache_path = cache_dir / f"models--{model_name.replace('/', '--')}"
    installed = model_cache_path.exists()
    cache_size_mb = None
    if installed:
        total_size = sum(
            f.stat().st_size
            for f in model_cache_path.rglob("*")
            if f.is_file()
        )
        cache_size_mb = round(total_size / (1024 * 1024), 1)
    return {
        "success": True,
        "result": {
            "profile": profile,
            "model_name": model_name,
            "dimensions": info["dimensions"],
            "estimated_size_mb": info["size_mb"],
            "actual_size_mb": cache_size_mb,
            "description": info["description"],
            "use_case": info["use_case"],
            "installed": installed,
            "cache_path": str(model_cache_path) if installed else None,
        },
    }

View File

@@ -3,6 +3,7 @@
from __future__ import annotations
import json
import sys
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any, Iterable, Mapping, Sequence
@@ -13,7 +14,9 @@ from rich.text import Text
from codexlens.entities import SearchResult, Symbol
console = Console()
# Force UTF-8 encoding for Windows console to properly display Chinese text
# Use force_terminal=True and legacy_windows=False to avoid GBK encoding issues
console = Console(force_terminal=True, legacy_windows=False)
def _to_jsonable(value: Any) -> Any:

View File

@@ -13,6 +13,7 @@ class Symbol(BaseModel):
name: str = Field(..., min_length=1)
kind: str = Field(..., min_length=1)
range: Tuple[int, int] = Field(..., description="(start_line, end_line), 1-based inclusive")
file: Optional[str] = Field(default=None, description="Full path to the file containing this symbol")
token_count: Optional[int] = Field(default=None, description="Token count for symbol content")
symbol_type: Optional[str] = Field(default=None, description="Extended symbol type for filtering")

View File

@@ -35,6 +35,8 @@ class SearchOptions:
include_semantic: Whether to include semantic keyword search results
hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
enable_vector: Enable vector semantic search (default False)
pure_vector: If True, only use vector search without FTS fallback (default False)
hybrid_weights: Custom RRF weights for hybrid search (optional)
"""
depth: int = -1
@@ -46,6 +48,8 @@ class SearchOptions:
include_semantic: bool = False
hybrid_mode: bool = False
enable_fuzzy: bool = True
enable_vector: bool = False
pure_vector: bool = False
hybrid_weights: Optional[Dict[str, float]] = None
@@ -494,6 +498,8 @@ class ChainSearchEngine:
options.include_semantic,
options.hybrid_mode,
options.enable_fuzzy,
options.enable_vector,
options.pure_vector,
options.hybrid_weights
): idx_path
for idx_path in index_paths
@@ -520,6 +526,8 @@ class ChainSearchEngine:
include_semantic: bool = False,
hybrid_mode: bool = False,
enable_fuzzy: bool = True,
enable_vector: bool = False,
pure_vector: bool = False,
hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
"""Search a single index database.
@@ -527,12 +535,14 @@ class ChainSearchEngine:
Args:
index_path: Path to _index.db file
query: FTS5 query string
query: FTS5 query string (for FTS) or natural language query (for vector)
limit: Maximum results from this index
files_only: If True, skip snippet generation for faster search
include_semantic: If True, also search semantic keywords and merge results
hybrid_mode: If True, use hybrid search with RRF fusion
enable_fuzzy: Enable fuzzy FTS in hybrid mode
enable_vector: Enable vector semantic search
pure_vector: If True, only use vector search without FTS fallback
hybrid_weights: Custom RRF weights for hybrid search
Returns:
@@ -547,10 +557,11 @@ class ChainSearchEngine:
query,
limit=limit,
enable_fuzzy=enable_fuzzy,
enable_vector=False, # Vector search not yet implemented
enable_vector=enable_vector,
pure_vector=pure_vector,
)
else:
# Legacy single-FTS search
# Single-FTS search (exact or fuzzy mode)
with DirIndexStore(index_path) as store:
# Get FTS results
if files_only:
@@ -558,7 +569,11 @@ class ChainSearchEngine:
paths = store.search_files_only(query, limit=limit)
fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
else:
fts_results = store.search_fts(query, limit=limit)
# Use fuzzy FTS if enable_fuzzy=True (mode="fuzzy"), otherwise exact FTS
if enable_fuzzy:
fts_results = store.search_fts_fuzzy(query, limit=limit)
else:
fts_results = store.search_fts(query, limit=limit)
# Optionally add semantic keyword results
if include_semantic:

View File

@@ -50,35 +50,68 @@ class HybridSearchEngine:
limit: int = 20,
enable_fuzzy: bool = True,
enable_vector: bool = False,
pure_vector: bool = False,
) -> List[SearchResult]:
"""Execute hybrid search with parallel retrieval and RRF fusion.
Args:
index_path: Path to _index.db file
query: FTS5 query string
query: FTS5 query string (for FTS) or natural language query (for vector)
limit: Maximum results to return after fusion
enable_fuzzy: Enable fuzzy FTS search (default True)
enable_vector: Enable vector search (default False)
pure_vector: If True, only use vector search without FTS fallback (default False)
Returns:
List of SearchResult objects sorted by fusion score
Examples:
>>> engine = HybridSearchEngine()
>>> results = engine.search(Path("project/_index.db"), "authentication")
>>> # Hybrid search (exact + fuzzy + vector)
>>> results = engine.search(Path("project/_index.db"), "authentication",
... enable_vector=True)
>>> # Pure vector search (semantic only)
>>> results = engine.search(Path("project/_index.db"),
... "how to authenticate users",
... enable_vector=True, pure_vector=True)
>>> for r in results[:5]:
... print(f"{r.path}: {r.score:.3f}")
"""
# Determine which backends to use
backends = {"exact": True} # Always use exact search
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
backends = {}
if pure_vector:
# Pure vector mode: only use vector search, no FTS fallback
if enable_vector:
backends["vector"] = True
else:
# Invalid configuration: pure_vector=True but enable_vector=False
self.logger.warning(
"pure_vector=True requires enable_vector=True. "
"Falling back to exact search. "
"To use pure vector search, enable vector search mode."
)
backends["exact"] = True
else:
# Hybrid mode: always include exact search as baseline
backends["exact"] = True
if enable_fuzzy:
backends["fuzzy"] = True
if enable_vector:
backends["vector"] = True
# Execute parallel searches
results_map = self._search_parallel(index_path, query, backends, limit)
# Provide helpful message if pure-vector mode returns no results
if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
self.logger.warning(
"Pure vector search returned no results. "
"This usually means embeddings haven't been generated. "
"Run: codexlens embeddings-generate %s",
index_path.parent if index_path.name == "_index.db" else index_path
)
# Apply RRF fusion
# Filter weights to only active backends
active_weights = {
@@ -195,17 +228,67 @@ class HybridSearchEngine:
def _search_vector(
self, index_path: Path, query: str, limit: int
) -> List[SearchResult]:
"""Execute vector search (placeholder for future implementation).
"""Execute vector similarity search using semantic embeddings.
Args:
index_path: Path to _index.db file
query: Query string
query: Natural language query string
limit: Maximum results
Returns:
List of SearchResult objects (empty for now)
List of SearchResult objects ordered by semantic similarity
"""
# Placeholder for vector search integration
# Will be implemented when VectorStore is available
self.logger.debug("Vector search not yet implemented")
return []
try:
# Check if semantic chunks table exists
import sqlite3
conn = sqlite3.connect(index_path)
cursor = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
)
has_semantic_table = cursor.fetchone() is not None
conn.close()
if not has_semantic_table:
self.logger.info(
"No embeddings found in index. "
"Generate embeddings with: codexlens embeddings-generate %s",
index_path.parent if index_path.name == "_index.db" else index_path
)
return []
# Initialize embedder and vector store
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
embedder = Embedder(profile="code") # Use code-optimized model
vector_store = VectorStore(index_path)
# Check if vector store has data
if vector_store.count_chunks() == 0:
self.logger.info(
"Vector store is empty (0 chunks). "
"Generate embeddings with: codexlens embeddings-generate %s",
index_path.parent if index_path.name == "_index.db" else index_path
)
return []
# Generate query embedding
query_embedding = embedder.embed_single(query)
# Search for similar chunks
results = vector_store.search_similar(
query_embedding=query_embedding,
top_k=limit,
min_score=0.0, # Return all results, let RRF handle filtering
return_full_content=True,
)
self.logger.debug("Vector search found %d results", len(results))
return results
except ImportError as exc:
self.logger.debug("Semantic dependencies not available: %s", exc)
return []
except Exception as exc:
self.logger.error("Vector search error: %s", exc)
return []

View File

@@ -8,21 +8,64 @@ from . import SEMANTIC_AVAILABLE
class Embedder:
"""Generate embeddings for code chunks using fastembed (ONNX-based)."""
"""Generate embeddings for code chunks using fastembed (ONNX-based).
MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBEDDING_DIM = 384
Supported Model Profiles:
- fast: BAAI/bge-small-en-v1.5 (384 dim) - Fast, lightweight, English-optimized
- code: jinaai/jina-embeddings-v2-base-code (768 dim) - Code-optimized, best for programming languages
- multilingual: intfloat/multilingual-e5-large (1024 dim) - Multilingual + code support
- balanced: mixedbread-ai/mxbai-embed-large-v1 (1024 dim) - High accuracy, general purpose
"""
def __init__(self, model_name: str | None = None) -> None:
# Model profiles for different use cases
MODELS = {
"fast": "BAAI/bge-small-en-v1.5", # 384 dim - Fast, lightweight
"code": "jinaai/jina-embeddings-v2-base-code", # 768 dim - Code-optimized
"multilingual": "intfloat/multilingual-e5-large", # 1024 dim - Multilingual
"balanced": "mixedbread-ai/mxbai-embed-large-v1", # 1024 dim - High accuracy
}
# Dimension mapping for each model
MODEL_DIMS = {
"BAAI/bge-small-en-v1.5": 384,
"jinaai/jina-embeddings-v2-base-code": 768,
"intfloat/multilingual-e5-large": 1024,
"mixedbread-ai/mxbai-embed-large-v1": 1024,
}
# Default model (fast profile)
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"
DEFAULT_PROFILE = "fast"
def __init__(self, model_name: str | None = None, profile: str | None = None) -> None:
"""Initialize embedder with model or profile.
Args:
model_name: Explicit model name (e.g., "jinaai/jina-embeddings-v2-base-code")
profile: Model profile shortcut ("fast", "code", "multilingual", "balanced")
If both provided, model_name takes precedence.
"""
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self.model_name = model_name or self.MODEL_NAME
# Resolve model name from profile or use explicit name
if model_name:
self.model_name = model_name
elif profile and profile in self.MODELS:
self.model_name = self.MODELS[profile]
else:
self.model_name = self.DEFAULT_MODEL
self._model = None
@property
def embedding_dim(self) -> int:
"""Get embedding dimension for current model."""
return self.MODEL_DIMS.get(self.model_name, 768) # Default to 768 if unknown
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:

View File

@@ -27,7 +27,6 @@ class SubdirLink:
name: str
index_path: Path
files_count: int
direct_files: int
last_updated: float
@@ -57,7 +56,7 @@ class DirIndexStore:
# Schema version for migration tracking
# Increment this when schema changes require migration
SCHEMA_VERSION = 4
SCHEMA_VERSION = 5
def __init__(self, db_path: str | Path) -> None:
"""Initialize directory index store.
@@ -133,6 +132,11 @@ class DirIndexStore:
from codexlens.storage.migrations.migration_004_dual_fts import upgrade
upgrade(conn)
# Migration v4 -> v5: Remove unused/redundant fields
if from_version < 5:
from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
upgrade(conn)
def close(self) -> None:
"""Close database connection."""
with self._lock:
@@ -208,19 +212,17 @@ class DirIndexStore:
# Replace symbols
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -374,19 +376,17 @@ class DirIndexStore:
conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
if symbols:
# Extract token_count and symbol_type from symbol metadata if available
# Insert symbols without token_count and symbol_type
symbol_rows = []
for s in symbols:
token_count = getattr(s, 'token_count', None)
symbol_type = getattr(s, 'symbol_type', None) or s.kind
symbol_rows.append(
(file_id, s.name, s.kind, s.range[0], s.range[1], token_count, symbol_type)
(file_id, s.name, s.kind, s.range[0], s.range[1])
)
conn.executemany(
"""
INSERT INTO symbols(file_id, name, kind, start_line, end_line, token_count, symbol_type)
VALUES(?, ?, ?, ?, ?, ?, ?)
INSERT INTO symbols(file_id, name, kind, start_line, end_line)
VALUES(?, ?, ?, ?, ?)
""",
symbol_rows,
)
@@ -644,25 +644,22 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
import json
import time
keywords_json = json.dumps(keywords)
generated_at = time.time()
# Write to semantic_metadata table (for backward compatibility)
# Write to semantic_metadata table (without keywords column)
conn.execute(
"""
INSERT INTO semantic_metadata(file_id, summary, keywords, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?, ?)
INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
VALUES(?, ?, ?, ?, ?)
ON CONFLICT(file_id) DO UPDATE SET
summary=excluded.summary,
keywords=excluded.keywords,
purpose=excluded.purpose,
llm_tool=excluded.llm_tool,
generated_at=excluded.generated_at
""",
(file_id, summary, keywords_json, purpose, llm_tool, generated_at),
(file_id, summary, purpose, llm_tool, generated_at),
)
# Write to normalized keywords tables for optimized search
@@ -709,9 +706,10 @@ class DirIndexStore:
with self._lock:
conn = self._get_connection()
# Get semantic metadata (without keywords column)
row = conn.execute(
"""
SELECT summary, keywords, purpose, llm_tool, generated_at
SELECT summary, purpose, llm_tool, generated_at
FROM semantic_metadata WHERE file_id=?
""",
(file_id,),
@@ -720,11 +718,23 @@ class DirIndexStore:
if not row:
return None
import json
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
return {
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -856,15 +866,14 @@ class DirIndexStore:
Returns:
Tuple of (list of metadata dicts, total count)
"""
import json
with self._lock:
conn = self._get_connection()
# Query semantic metadata without keywords column
base_query = """
SELECT f.id as file_id, f.name as file_name, f.full_path,
f.language, f.line_count,
sm.summary, sm.keywords, sm.purpose,
sm.summary, sm.purpose,
sm.llm_tool, sm.generated_at
FROM files f
JOIN semantic_metadata sm ON f.id = sm.file_id
@@ -892,14 +901,30 @@ class DirIndexStore:
results = []
for row in rows:
file_id = int(row["file_id"])
# Get keywords from normalized file_keywords table
keyword_rows = conn.execute(
"""
SELECT k.keyword
FROM file_keywords fk
JOIN keywords k ON fk.keyword_id = k.id
WHERE fk.file_id = ?
ORDER BY k.keyword
""",
(file_id,),
).fetchall()
keywords = [kw["keyword"] for kw in keyword_rows]
results.append({
"file_id": int(row["file_id"]),
"file_id": file_id,
"file_name": row["file_name"],
"full_path": row["full_path"],
"language": row["language"],
"line_count": int(row["line_count"]) if row["line_count"] else 0,
"summary": row["summary"],
"keywords": json.loads(row["keywords"]) if row["keywords"] else [],
"keywords": keywords,
"purpose": row["purpose"],
"llm_tool": row["llm_tool"],
"generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
@@ -922,7 +947,7 @@ class DirIndexStore:
name: Subdirectory name
index_path: Path to subdirectory's _index.db
files_count: Total files recursively
direct_files: Files directly in subdirectory
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
@@ -931,17 +956,17 @@ class DirIndexStore:
import time
last_updated = time.time()
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
INSERT INTO subdirs(name, index_path, files_count, direct_files, last_updated)
VALUES(?, ?, ?, ?, ?)
INSERT INTO subdirs(name, index_path, files_count, last_updated)
VALUES(?, ?, ?, ?)
ON CONFLICT(name) DO UPDATE SET
index_path=excluded.index_path,
files_count=excluded.files_count,
direct_files=excluded.direct_files,
last_updated=excluded.last_updated
""",
(name, index_path_str, files_count, direct_files, last_updated),
(name, index_path_str, files_count, last_updated),
)
conn.commit()
@@ -974,7 +999,7 @@ class DirIndexStore:
conn = self._get_connection()
rows = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs
ORDER BY name
"""
@@ -986,7 +1011,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
for row in rows
@@ -1005,7 +1029,7 @@ class DirIndexStore:
conn = self._get_connection()
row = conn.execute(
"""
SELECT id, name, index_path, files_count, direct_files, last_updated
SELECT id, name, index_path, files_count, last_updated
FROM subdirs WHERE name=?
""",
(name,),
@@ -1019,7 +1043,6 @@ class DirIndexStore:
name=row["name"],
index_path=Path(row["index_path"]),
files_count=int(row["files_count"]) if row["files_count"] else 0,
direct_files=int(row["direct_files"]) if row["direct_files"] else 0,
last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
)
@@ -1031,41 +1054,71 @@ class DirIndexStore:
Args:
name: Subdirectory name
files_count: Total files recursively
direct_files: Files directly in subdirectory (optional)
direct_files: Deprecated parameter (no longer used)
"""
with self._lock:
conn = self._get_connection()
import time
last_updated = time.time()
if direct_files is not None:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, direct_files=?, last_updated=?
WHERE name=?
""",
(files_count, direct_files, last_updated, name),
)
else:
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
# Note: direct_files parameter is deprecated but kept for backward compatibility
conn.execute(
"""
UPDATE subdirs
SET files_count=?, last_updated=?
WHERE name=?
""",
(files_count, last_updated, name),
)
conn.commit()
# === Search ===
def search_fts(self, query: str, limit: int = 20) -> List[SearchResult]:
@staticmethod
def _enhance_fts_query(query: str) -> str:
    """Enhance FTS5 query to support prefix matching for simple queries.

    For simple single-word or multi-word queries without FTS5 operators,
    automatically adds a prefix wildcard (*) to enable partial matching.

    Examples:
        "loadPack" -> "loadPack*"
        "load package" -> "load* package*"
        "load*" -> "load*" (already has wildcard, unchanged)
        "NOT test" -> "NOT test" (has FTS operator, unchanged)

    Args:
        query: Original FTS5 query string

    Returns:
        Enhanced query string with prefix wildcards for simple queries
    """
    # Leave queries with explicit FTS5 syntax untouched: wildcards or
    # quoted phrases anywhere in the string.
    if '*' in query or '"' in query:
        return query
    tokens = query.split()
    # Detect boolean/proximity operators token-wise so that operators at
    # the start or end of the query (e.g. "NOT test") are also recognized;
    # a plain substring check with surrounding spaces would miss them.
    if any(token.upper() in {'AND', 'OR', 'NOT', 'NEAR'} for token in tokens):
        return query
    # Simple query: add a prefix wildcard to every word.
    return ' '.join(f"{token}*" for token in tokens)
def search_fts(self, query: str, limit: int = 20, enhance_query: bool = False) -> List[SearchResult]:
"""Full-text search in current directory files.
Uses files_fts_exact (unicode61 tokenizer) for exact token matching.
For fuzzy/substring search, use search_fts_fuzzy() instead.
Best Practice (from industry analysis of Codanna/Code-Index-MCP):
- Default: Respects exact user input without modification
- Users can manually add wildcards (e.g., "loadPack*") for prefix matching
- Automatic enhancement (enhance_query=True) is NOT recommended as it can
violate user intent and bring unwanted noise in results
Args:
query: FTS5 query string
limit: Maximum results to return
enhance_query: If True, automatically add prefix wildcards for simple queries.
Default False to respect exact user input.
Returns:
List of SearchResult objects sorted by relevance
@@ -1073,19 +1126,23 @@ class DirIndexStore:
Raises:
StorageError: If FTS search fails
"""
# Only enhance query if explicitly requested (not default behavior)
# Best practice: Let users control wildcards manually
final_query = self._enhance_fts_query(query) if enhance_query else query
with self._lock:
conn = self._get_connection()
try:
rows = conn.execute(
"""
SELECT rowid, full_path, bm25(files_fts) AS rank,
snippet(files_fts, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts
WHERE files_fts MATCH ?
SELECT rowid, full_path, bm25(files_fts_exact) AS rank,
snippet(files_fts_exact, 2, '[bold red]', '[/bold red]', '...', 20) AS excerpt
FROM files_fts_exact
WHERE files_fts_exact MATCH ?
ORDER BY rank
LIMIT ?
""",
(query, limit),
(final_query, limit),
).fetchall()
except sqlite3.DatabaseError as exc:
raise StorageError(f"FTS search failed: {exc}") from exc
@@ -1249,10 +1306,11 @@ class DirIndexStore:
if kind:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ? AND kind=?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ? AND s.kind=?
ORDER BY s.name
LIMIT ?
""",
(pattern, kind, limit),
@@ -1260,10 +1318,11 @@ class DirIndexStore:
else:
rows = conn.execute(
"""
SELECT name, kind, start_line, end_line
FROM symbols
WHERE name LIKE ?
ORDER BY name
SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
FROM symbols s
JOIN files f ON s.file_id = f.id
WHERE s.name LIKE ?
ORDER BY s.name
LIMIT ?
""",
(pattern, limit),
@@ -1274,6 +1333,7 @@ class DirIndexStore:
name=row["name"],
kind=row["kind"],
range=(row["start_line"], row["end_line"]),
file=row["full_path"],
)
for row in rows
]
@@ -1359,7 +1419,7 @@ class DirIndexStore:
"""
)
# Subdirectories table
# Subdirectories table (v5: removed direct_files)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS subdirs (
@@ -1367,13 +1427,12 @@ class DirIndexStore:
name TEXT NOT NULL UNIQUE,
index_path TEXT NOT NULL,
files_count INTEGER DEFAULT 0,
direct_files INTEGER DEFAULT 0,
last_updated REAL
)
"""
)
# Symbols table
# Symbols table (v5: removed token_count and symbol_type)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS symbols (
@@ -1382,9 +1441,7 @@ class DirIndexStore:
name TEXT NOT NULL,
kind TEXT NOT NULL,
start_line INTEGER,
end_line INTEGER,
token_count INTEGER,
symbol_type TEXT
end_line INTEGER
)
"""
)
@@ -1421,14 +1478,13 @@ class DirIndexStore:
"""
)
# Semantic metadata table
# Semantic metadata table (v5: removed keywords column)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS semantic_metadata (
id INTEGER PRIMARY KEY,
file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
summary TEXT,
keywords TEXT,
purpose TEXT,
llm_tool TEXT,
generated_at REAL
@@ -1473,13 +1529,12 @@ class DirIndexStore:
"""
)
# Indexes
# Indexes (v5: removed idx_symbols_type)
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")

View File

@@ -0,0 +1,188 @@
"""
Migration 005: Remove unused and redundant database fields.
This migration removes four problematic fields identified by Gemini analysis:
1. **semantic_metadata.keywords** (deprecated - replaced by file_keywords table)
- Data: Migrated to normalized file_keywords table in migration 001
- Impact: Column now redundant, remove to prevent sync issues
2. **symbols.token_count** (unused - always NULL)
- Data: Never populated, always NULL
- Impact: No data loss, just removes unused column
3. **symbols.symbol_type** (redundant - duplicates kind)
- Data: Redundant with symbols.kind field
- Impact: No data loss, kind field contains same information
4. **subdirs.direct_files** (unused - never displayed)
- Data: Never used in queries or display logic
- Impact: No data loss, just removes unused column
Schema changes use table recreation pattern (SQLite best practice):
- Create new table without deprecated columns
- Copy data from old table
- Drop old table
- Rename new table
- Recreate indexes
"""
import logging
from sqlite3 import Connection, Cursor

log = logging.getLogger(__name__)


def _table_exists(cursor: Cursor, table: str) -> bool:
    """Return True if *table* is present in sqlite_master."""
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table,),
    )
    return cursor.fetchone() is not None


def _rebuild_table(
    cursor: Cursor,
    table: str,
    create_sql: str,
    columns: list[str],
    index_statements: list[str],
) -> None:
    """Recreate *table* without its deprecated columns.

    Implements the SQLite table-recreation pattern: create a ``<table>_new``
    shadow table via *create_sql*, copy the surviving *columns* across, drop
    the original, rename the shadow into place, and rebuild the given
    *index_statements*.  The caller owns transaction handling.

    Args:
        cursor: Cursor on the database being migrated.
        table: Name of the existing table to rebuild.
        create_sql: CREATE TABLE statement for ``<table>_new``.
        columns: Columns to carry over (must exist in both old and new table).
        index_statements: CREATE INDEX statements to re-run after the rename.
    """
    cursor.execute(create_sql)
    col_list = ", ".join(columns)
    cursor.execute(
        f"INSERT INTO {table}_new ({col_list}) SELECT {col_list} FROM {table}"
    )
    cursor.execute(f"DROP TABLE {table}")
    cursor.execute(f"ALTER TABLE {table}_new RENAME TO {table}")
    for statement in index_statements:
        cursor.execute(statement)


def upgrade(db_conn: Connection) -> None:
    """Remove unused and redundant fields from schema.

    Drops ``semantic_metadata.keywords``, ``symbols.token_count``,
    ``symbols.symbol_type`` and ``subdirs.direct_files`` by rebuilding each
    table without the deprecated column(s).  All rebuilds run inside a single
    explicit transaction; a best-effort VACUUM reclaims space afterwards.
    Tables that are absent from the database are skipped.

    Args:
        db_conn: The SQLite database connection.

    Raises:
        Exception: Re-raised (after attempting ROLLBACK) if any rebuild fails.
    """
    cursor = db_conn.cursor()
    try:
        cursor.execute("BEGIN TRANSACTION")

        # Step 1: Remove semantic_metadata.keywords (superseded by the
        # normalized file_keywords table populated in migration 001).
        log.info("Removing semantic_metadata.keywords column...")
        if _table_exists(cursor, "semantic_metadata"):
            _rebuild_table(
                cursor,
                "semantic_metadata",
                """
                CREATE TABLE semantic_metadata_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL UNIQUE,
                    summary TEXT,
                    purpose TEXT,
                    llm_tool TEXT,
                    generated_at REAL,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
                """,
                ["id", "file_id", "summary", "purpose", "llm_tool", "generated_at"],
                [
                    "CREATE INDEX IF NOT EXISTS idx_semantic_file "
                    "ON semantic_metadata(file_id)"
                ],
            )
            log.info("Removed semantic_metadata.keywords column")
        else:
            log.info("semantic_metadata table does not exist, skipping")

        # Step 2: Remove symbols.token_count (never populated) and
        # symbols.symbol_type (duplicate of symbols.kind).
        log.info("Removing symbols.token_count and symbols.symbol_type columns...")
        if _table_exists(cursor, "symbols"):
            _rebuild_table(
                cursor,
                "symbols",
                """
                CREATE TABLE symbols_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_id INTEGER NOT NULL,
                    name TEXT NOT NULL,
                    kind TEXT,
                    start_line INTEGER,
                    end_line INTEGER,
                    FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
                )
                """,
                ["id", "file_id", "name", "kind", "start_line", "end_line"],
                # idx_symbols_type is intentionally not recreated: it indexed
                # the dropped symbol_type column.
                [
                    "CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)",
                    "CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)",
                ],
            )
            log.info("Removed symbols.token_count and symbols.symbol_type columns")
        else:
            log.info("symbols table does not exist, skipping")

        # Step 3: Remove subdirs.direct_files (never queried or displayed).
        log.info("Removing subdirs.direct_files column...")
        if _table_exists(cursor, "subdirs"):
            _rebuild_table(
                cursor,
                "subdirs",
                """
                CREATE TABLE subdirs_new (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT NOT NULL UNIQUE,
                    index_path TEXT NOT NULL,
                    files_count INTEGER DEFAULT 0,
                    last_updated REAL
                )
                """,
                ["id", "name", "index_path", "files_count", "last_updated"],
                ["CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)"],
            )
            log.info("Removed subdirs.direct_files column")
        else:
            log.info("subdirs table does not exist, skipping")

        cursor.execute("COMMIT")
        log.info("Migration 005 completed successfully")

        # VACUUM cannot run inside a transaction, so it happens after COMMIT
        # and is best-effort: failure here must not fail the migration.
        try:
            log.info("Running VACUUM to reclaim space...")
            cursor.execute("VACUUM")
            log.info("VACUUM completed successfully")
        except Exception as exc:
            log.warning("VACUUM failed (non-critical): %s", exc)
    except Exception as exc:
        log.error("Migration 005 failed: %s", exc)
        try:
            cursor.execute("ROLLBACK")
        except Exception:
            # ROLLBACK can itself fail (e.g. no transaction active); the
            # original error below is the one worth surfacing.
            pass
        raise
def downgrade(db_conn: Connection) -> None:
    """Reject downgrade attempts for the schema-cleanup migration.

    The columns dropped by ``upgrade`` (keywords, token_count, symbol_type,
    direct_files) were unused or redundant and their contents are gone, so
    this migration is deliberately one-way: a warning is logged and the
    attempt is refused.

    Args:
        db_conn: The SQLite database connection.

    Raises:
        NotImplementedError: Always; the removed data cannot be restored.
    """
    log.warning(
        "Migration 005 downgrade not supported - removed fields are unused/redundant. "
        "Data cannot be restored."
    )
    message = "Migration 005 downgrade not supported - this is a one-way migration"
    raise NotImplementedError(message)