Fix CodexLens embeddings generation to achieve 100% coverage

Previously, embeddings were only generated for root directory files (1.6% coverage, 5/303 files). This fix implements recursive processing across all subdirectory indexes, achieving 100% coverage with 2,042 semantic chunks across all 303 files in 26 index databases.

Key improvements:

1. **Recursive embeddings generation** (embedding_manager.py):
   - Add generate_embeddings_recursive() to process all _index.db files in the directory tree
   - Add get_embeddings_status() for comprehensive coverage statistics
   - Add discover_all_index_dbs() helper for recursive file discovery

2. **Enhanced CLI commands** (commands.py):
   - embeddings-generate: Add --recursive flag for full project coverage
   - init: Use recursive generation by default for complete indexing
   - status: Display embeddings coverage statistics with 50% threshold

3. **Smart search routing improvements** (smart-search.ts):
   - Add 50% embeddings coverage threshold for hybrid mode routing
   - Auto-fallback to exact mode when coverage is insufficient
   - Strip ANSI color codes from JSON output for correct parsing
   - Add embeddings_coverage_percent to IndexStatus and SearchMetadata
   - Provide clear warnings with actionable suggestions

4. **Documentation and analysis**:
   - Add SMART_SEARCH_ANALYSIS.md with initial investigation
   - Add SMART_SEARCH_CORRECTED_ANALYSIS.md revealing the true extent of the issue
   - Add EMBEDDINGS_FIX_SUMMARY.md with complete fix summary
   - Add check_embeddings.py script for coverage verification

Results:
- Coverage improved from 1.6% (5/303 files) to 100% (303/303 files) - 62.5x increase
- Semantic chunks increased from 10 to 2,042 - 204x increase
- All 26 subdirectory indexes now have embeddings vs just 1

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-10 02:24:35 +08:00 · 2025-12-17 17:54:33 +08:00
parent d06a3ca12e
commit 74a830694c
7 changed files with 1540 additions and 346 deletions
--- a/codex-lens/src/codexlens/cli/commands.py
+++ b/codex-lens/src/codexlens/cli/commands.py
@@ -142,11 +142,11 @@ def init(
        if not no_embeddings:
            try:
                from codexlens.semantic import SEMANTIC_AVAILABLE
-                from codexlens.cli.embedding_manager import generate_embeddings
+                from codexlens.cli.embedding_manager import generate_embeddings_recursive, get_embeddings_status

                if SEMANTIC_AVAILABLE:
-                    # Find the index file
-                    index_path = Path(build_result.index_root) / "_index.db"
+                    # Use the index root directory (not the _index.db file)
+                    index_root = Path(build_result.index_root)

                    if not json_mode:
                        console.print("\n[bold]Generating embeddings...[/bold]")
@@ -157,8 +157,8 @@ def init(
                        if not json_mode and verbose:
                            console.print(f"  {msg}")

-                    embed_result = generate_embeddings(
-                        index_path,
+                    embed_result = generate_embeddings_recursive(
+                        index_root,
                        model_profile=embedding_model,
                        force=False,  # Don't force regenerate during init
                        chunk_size=2000,
@@ -167,29 +167,56 @@ def init(

                    if embed_result["success"]:
                        embed_data = embed_result["result"]
-                        result["embeddings_generated"] = True
-                        result["embeddings_count"] = embed_data["chunks_embedded"]
+                        
+                        # Get comprehensive coverage statistics
+                        status_result = get_embeddings_status(index_root)
+                        if status_result["success"]:
+                            coverage = status_result["result"]
+                            result["embeddings"] = {
+                                "generated": True,
+                                "total_indexes": coverage["total_indexes"],
+                                "total_files": coverage["total_files"],
+                                "files_with_embeddings": coverage["files_with_embeddings"],
+                                "coverage_percent": coverage["coverage_percent"],
+                                "total_chunks": coverage["total_chunks"],
+                            }
+                        else:
+                            result["embeddings"] = {
+                                "generated": True,
+                                "total_chunks": embed_data["total_chunks_created"],
+                                "files_processed": embed_data["total_files_processed"],
+                            }

                        if not json_mode:
-                            console.print(f"[green]✓[/green] Generated [bold]{embed_data['chunks_embedded']}[/bold] embeddings in {embed_data['elapsed_time']:.1f}s")
+                            console.print(f"[green]✓[/green] Generated embeddings for [bold]{embed_data['total_files_processed']}[/bold] files")
+                            console.print(f"  Total chunks: [bold]{embed_data['total_chunks_created']}[/bold]")
+                            console.print(f"  Indexes processed: [bold]{embed_data['indexes_successful']}/{embed_data['indexes_processed']}[/bold]")
                    else:
                        if not json_mode:
                            console.print(f"[yellow]Warning:[/yellow] Embedding generation failed: {embed_result.get('error', 'Unknown error')}")
-                        result["embeddings_generated"] = False
-                        result["embeddings_error"] = embed_result.get("error")
+                        result["embeddings"] = {
+                            "generated": False,
+                            "error": embed_result.get("error"),
+                        }
                else:
                    if not json_mode and verbose:
                        console.print("[dim]Semantic search not available. Skipping embeddings.[/dim]")
-                    result["embeddings_generated"] = False
-                    result["embeddings_error"] = "Semantic dependencies not installed"
+                    result["embeddings"] = {
+                        "generated": False,
+                        "error": "Semantic dependencies not installed",
+                    }
            except Exception as e:
                if not json_mode and verbose:
                    console.print(f"[yellow]Warning:[/yellow] Could not generate embeddings: {e}")
-                result["embeddings_generated"] = False
-                result["embeddings_error"] = str(e)
+                result["embeddings"] = {
+                    "generated": False,
+                    "error": str(e),
+                }
        else:
-            result["embeddings_generated"] = False
-            result["embeddings_error"] = "Skipped (--no-embeddings)"
+            result["embeddings"] = {
+                "generated": False,
+                "error": "Skipped (--no-embeddings)",
+            }

    except StorageError as exc:
        if json_mode:
@@ -611,6 +638,24 @@ def status(
                except Exception:
                    pass

+        # Check embeddings coverage
+        embeddings_info = None
+        has_vector_search = False
+        try:
+            from codexlens.cli.embedding_manager import get_embeddings_status
+            
+            if index_root.exists():
+                embed_status = get_embeddings_status(index_root)
+                if embed_status["success"]:
+                    embeddings_info = embed_status["result"]
+                    # Enable vector search if coverage >= 50%
+                    has_vector_search = embeddings_info["coverage_percent"] >= 50.0
+        except ImportError:
+            # Embedding manager not available
+            pass
+        except Exception as e:
+            logger.debug(f"Failed to get embeddings status: {e}")
+
        stats = {
            "index_root": str(index_root),
            "registry_path": str(_get_registry_path()),
@@ -624,9 +669,13 @@ def status(
                "exact_fts": True,  # Always available
                "fuzzy_fts": has_dual_fts,
                "hybrid_search": has_dual_fts,
-                "vector_search": False,  # Not yet implemented
+                "vector_search": has_vector_search,
            },
        }
+        
+        # Add embeddings info if available
+        if embeddings_info:
+            stats["embeddings"] = embeddings_info

        if json_mode:
            print_json(success=True, result=stats)
@@ -648,7 +697,20 @@ def status(
            else:
                console.print(f"  Fuzzy FTS: ✗ (run 'migrate' to enable)")
                console.print(f"  Hybrid Search: ✗ (run 'migrate' to enable)")
-            console.print(f"  Vector Search: ✗ (future)")
+            
+            if has_vector_search:
+                console.print(f"  Vector Search: ✓ (embeddings available)")
+            else:
+                console.print(f"  Vector Search: ✗ (no embeddings or coverage < 50%)")
+            
+            # Display embeddings statistics if available
+            if embeddings_info:
+                console.print("\n[bold]Embeddings Coverage:[/bold]")
+                console.print(f"  Total Indexes: {embeddings_info['total_indexes']}")
+                console.print(f"  Total Files: {embeddings_info['total_files']}")
+                console.print(f"  Files with Embeddings: {embeddings_info['files_with_embeddings']}")
+                console.print(f"  Coverage: {embeddings_info['coverage_percent']:.1f}%")
+                console.print(f"  Total Chunks: {embeddings_info['total_chunks']}")

    except StorageError as exc:
        if json_mode:
@@ -1885,6 +1947,12 @@ def embeddings_generate(
        "--chunk-size",
        help="Maximum chunk size in characters.",
    ),
+    recursive: bool = typer.Option(
+        False,
+        "--recursive",
+        "-r",
+        help="Recursively process all _index.db files in directory tree.",
+    ),
    json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
 ) -> None:
@@ -1908,28 +1976,42 @@ def embeddings_generate(
    _configure_logging(verbose)

    try:
-        from codexlens.cli.embedding_manager import generate_embeddings
+        from codexlens.cli.embedding_manager import generate_embeddings, generate_embeddings_recursive

        # Resolve path
        target_path = path.expanduser().resolve()

+        # Determine if we should use recursive mode
+        use_recursive = False
+        index_path = None
+        index_root = None
+
        if target_path.is_file() and target_path.name == "_index.db":
            # Direct index file
            index_path = target_path
+            if recursive:
+                # Use parent directory for recursive processing
+                use_recursive = True
+                index_root = target_path.parent
        elif target_path.is_dir():
-            # Try to find index for this project
-            registry = RegistryStore()
-            try:
-                registry.initialize()
-                mapper = PathMapper()
-                index_path = mapper.source_to_index_db(target_path)
+            if recursive:
+                # Recursive mode: process all _index.db files in directory tree
+                use_recursive = True
+                index_root = target_path
+            else:
+                # Non-recursive: Try to find index for this project
+                registry = RegistryStore()
+                try:
+                    registry.initialize()
+                    mapper = PathMapper()
+                    index_path = mapper.source_to_index_db(target_path)

-                if not index_path.exists():
-                    console.print(f"[red]Error:[/red] No index found for {target_path}")
-                    console.print("Run 'codexlens init' first to create an index")
-                    raise typer.Exit(code=1)
-            finally:
-                registry.close()
+                    if not index_path.exists():
+                        console.print(f"[red]Error:[/red] No index found for {target_path}")
+                        console.print("Run 'codexlens init' first to create an index")
+                        raise typer.Exit(code=1)
+                finally:
+                    registry.close()
        else:
            console.print(f"[red]Error:[/red] Path must be _index.db file or directory")
            raise typer.Exit(code=1)
@@ -1940,16 +2022,29 @@ def embeddings_generate(
                console.print(f"  {msg}")

        console.print(f"[bold]Generating embeddings[/bold]")
-        console.print(f"Index: [dim]{index_path}[/dim]")
+        if use_recursive:
+            console.print(f"Index root: [dim]{index_root}[/dim]")
+            console.print(f"Mode: [yellow]Recursive[/yellow]")
+        else:
+            console.print(f"Index: [dim]{index_path}[/dim]")
        console.print(f"Model: [cyan]{model}[/cyan]\n")

-        result = generate_embeddings(
-            index_path,
-            model_profile=model,
-            force=force,
-            chunk_size=chunk_size,
-            progress_callback=progress_update,
-        )
+        if use_recursive:
+            result = generate_embeddings_recursive(
+                index_root,
+                model_profile=model,
+                force=force,
+                chunk_size=chunk_size,
+                progress_callback=progress_update,
+            )
+        else:
+            result = generate_embeddings(
+                index_path,
+                model_profile=model,
+                force=force,
+                chunk_size=chunk_size,
+                progress_callback=progress_update,
+            )

        if json_mode:
            print_json(**result)
@@ -1968,21 +2063,45 @@ def embeddings_generate(
                raise typer.Exit(code=1)

            data = result["result"]
-            elapsed = data["elapsed_time"]

-            console.print(f"[green]✓[/green] Embeddings generated successfully!")
-            console.print(f"  Model: {data['model_name']}")
-            console.print(f"  Chunks created: {data['chunks_created']:,}")
-            console.print(f"  Files processed: {data['files_processed']}")
+            if use_recursive:
+                # Recursive mode output
+                console.print(f"[green]✓[/green] Recursive embeddings generation complete!")
+                console.print(f"  Indexes processed: {data['indexes_processed']}")
+                console.print(f"  Indexes successful: {data['indexes_successful']}")
+                if data['indexes_failed'] > 0:
+                    console.print(f"  [yellow]Indexes failed: {data['indexes_failed']}[/yellow]")
+                console.print(f"  Total chunks created: {data['total_chunks_created']:,}")
+                console.print(f"  Total files processed: {data['total_files_processed']}")
+                if data['total_files_failed'] > 0:
+                    console.print(f"  [yellow]Total files failed: {data['total_files_failed']}[/yellow]")
+                console.print(f"  Model profile: {data['model_profile']}")

-            if data["files_failed"] > 0:
-                console.print(f"  [yellow]Files failed: {data['files_failed']}[/yellow]")
-                if data["failed_files"]:
-                    console.print("  [dim]First failures:[/dim]")
-                    for file_path, error in data["failed_files"]:
-                        console.print(f"    [dim]{file_path}: {error}[/dim]")
+                # Show details if verbose
+                if verbose and data.get('details'):
+                    console.print("\n[dim]Index details:[/dim]")
+                    for detail in data['details']:
+                        status_icon = "[green]✓[/green]" if detail['success'] else "[red]✗[/red]"
+                        console.print(f"  {status_icon} {detail['path']}")
+                        if not detail['success'] and detail.get('error'):
+                            console.print(f"    [dim]Error: {detail['error']}[/dim]")
+            else:
+                # Single index mode output
+                elapsed = data["elapsed_time"]

-            console.print(f"  Time: {elapsed:.1f}s")
+                console.print(f"[green]✓[/green] Embeddings generated successfully!")
+                console.print(f"  Model: {data['model_name']}")
+                console.print(f"  Chunks created: {data['chunks_created']:,}")
+                console.print(f"  Files processed: {data['files_processed']}")
+
+                if data["files_failed"] > 0:
+                    console.print(f"  [yellow]Files failed: {data['files_failed']}[/yellow]")
+                    if data["failed_files"]:
+                        console.print("  [dim]First failures:[/dim]")
+                        for file_path, error in data["failed_files"]:
+                            console.print(f"    [dim]{file_path}: {error}[/dim]")
+
+                console.print(f"  Time: {elapsed:.1f}s")

            console.print("\n[dim]Use vector search with:[/dim]")
            console.print("  [cyan]codexlens search 'your query' --mode pure-vector[/cyan]")
--- a/codex-lens/src/codexlens/cli/embedding_manager.py
+++ b/codex-lens/src/codexlens/cli/embedding_manager.py
@@ -255,6 +255,21 @@ def generate_embeddings(
    }


+def discover_all_index_dbs(index_root: Path) -> List[Path]:
+    """Recursively find every file named _index.db at or below index_root.
+
+    Args:
+        index_root: Root directory to scan for _index.db files
+
+    Returns:
+        Sorted list of paths to _index.db files; empty list if index_root does not exist
+    """
+    if not index_root.exists():  # nothing to scan when the root is missing
+        return []
+
+    return sorted(index_root.rglob("_index.db"))  # sorted so the result order is deterministic
+
+
 def find_all_indexes(scan_dir: Path) -> List[Path]:
    """Find all _index.db files in directory tree.

@@ -270,6 +285,146 @@ def find_all_indexes(scan_dir: Path) -> List[Path]:
    return list(scan_dir.rglob("_index.db"))


+
+def generate_embeddings_recursive(
+    index_root: Path,
+    model_profile: str = "code",
+    force: bool = False,
+    chunk_size: int = 2000,
+    progress_callback: Optional[callable] = None,
+) -> Dict[str, any]:
+    """Run generate_embeddings() on every _index.db under index_root and aggregate the results.
+
+    Args:
+        index_root: Root index directory containing _index.db files
+        model_profile: Model profile (fast, code, multilingual, balanced)
+        force: If True, regenerate even if embeddings exist
+        chunk_size: Maximum chunk size in characters
+        progress_callback: Optional callable invoked with a single status-message string
+
+    Returns:
+        Dict with "success" plus either "result" (aggregate counters and per-index "details") or "error" (no index databases found)
+    """
+    # Discover all _index.db files
+    index_files = discover_all_index_dbs(index_root)
+
+    if not index_files:  # no databases at all is reported as a failure
+        return {
+            "success": False,
+            "error": f"No index databases found in {index_root}",
+        }
+
+    if progress_callback:
+        progress_callback(f"Found {len(index_files)} index databases to process")
+
+    # Process each index database
+    all_results = []  # one summary dict per index, returned as "details"
+    total_chunks = 0
+    total_files_processed = 0
+    total_files_failed = 0
+
+    for idx, index_path in enumerate(index_files, 1):  # 1-based counter for the progress message
+        if progress_callback:
+            try:
+                rel_path = index_path.relative_to(index_root)
+            except ValueError:  # index_path is not relative to index_root; report it unchanged
+                rel_path = index_path
+            progress_callback(f"[{idx}/{len(index_files)}] Processing {rel_path}")
+
+        result = generate_embeddings(
+            index_path,
+            model_profile=model_profile,
+            force=force,
+            chunk_size=chunk_size,
+            progress_callback=None,  # Don't cascade callbacks
+        )
+
+        all_results.append({
+            "path": str(index_path),
+            "success": result["success"],
+            "result": result.get("result"),
+            "error": result.get("error"),
+        })
+
+        if result["success"]:  # failed indexes add nothing to the aggregate counters
+            data = result["result"]
+            total_chunks += data["chunks_created"]
+            total_files_processed += data["files_processed"]
+            total_files_failed += data["files_failed"]
+
+    successful = sum(1 for r in all_results if r["success"])  # number of indexes that succeeded
+
+    return {
+        "success": successful > 0,  # partial failure still counts as overall success
+        "result": {
+            "indexes_processed": len(index_files),
+            "indexes_successful": successful,
+            "indexes_failed": len(index_files) - successful,
+            "total_chunks_created": total_chunks,
+            "total_files_processed": total_files_processed,
+            "total_files_failed": total_files_failed,
+            "model_profile": model_profile,
+            "details": all_results,
+        },
+    }
+
+
+def get_embeddings_status(index_root: Path) -> Dict[str, any]:
+    """Aggregate check_index_embeddings() results over every _index.db under index_root.
+
+    Args:
+        index_root: Root index directory
+
+    Returns:
+        Dict with "success" (always True here) and "result" holding index/file/chunk totals and "coverage_percent"
+    """
+    index_files = discover_all_index_dbs(index_root)
+
+    if not index_files:  # no databases found: report zeroed statistics, still a success
+        return {
+            "success": True,
+            "result": {
+                "total_indexes": 0,
+                "total_files": 0,
+                "files_with_embeddings": 0,
+                "files_without_embeddings": 0,
+                "total_chunks": 0,
+                "coverage_percent": 0.0,
+                "indexes_with_embeddings": 0,
+                "indexes_without_embeddings": 0,
+            },
+        }
+
+    total_files = 0
+    files_with_embeddings = 0
+    total_chunks = 0
+    indexes_with_embeddings = 0
+
+    for index_path in index_files:
+        status = check_index_embeddings(index_path)
+        if status["success"]:  # a failed check adds no counts but the index stays in total_indexes
+            result = status["result"]
+            total_files += result["total_files"]
+            files_with_embeddings += result["files_with_chunks"]
+            total_chunks += result["total_chunks"]
+            if result["has_embeddings"]:
+                indexes_with_embeddings += 1
+
+    return {
+        "success": True,
+        "result": {
+            "total_indexes": len(index_files),
+            "total_files": total_files,
+            "files_with_embeddings": files_with_embeddings,
+            "files_without_embeddings": total_files - files_with_embeddings,
+            "total_chunks": total_chunks,
+            "coverage_percent": round((files_with_embeddings / total_files * 100) if total_files > 0 else 0, 1),  # guard avoids division by zero; one decimal place
+            "indexes_with_embeddings": indexes_with_embeddings,
+            "indexes_without_embeddings": len(index_files) - indexes_with_embeddings,  # includes indexes whose check failed
+        },
+    }
+
+
 def get_embedding_stats_summary(index_root: Path) -> Dict[str, any]:
    """Get summary statistics for all indexes in root directory.