feat: Implement cascade indexing command and benchmark script for performance evaluation

catlog22
2026-01-02 11:24:06 +08:00
parent e21d801523
commit da68ba0b82
4 changed files with 984 additions and 3 deletions

View File

@@ -0,0 +1,402 @@
#!/usr/bin/env python
"""Benchmark script for comparing cascade search strategies.
Compares:
- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
- hybrid: FTS+SPLADE+Vector coarse ranking + CrossEncoder fine ranking
Usage:
python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import statistics
import sys
import time
import traceback
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Dict, Any
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.config import Config
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
@dataclass
class BenchmarkResult:
"""Result from a single benchmark run."""
strategy: str
query: str
latency_ms: float
num_results: int
top_result: Optional[str]
error: Optional[str] = None
@dataclass
class BenchmarkSummary:
"""Aggregated benchmark statistics."""
strategy: str
total_queries: int
successful_queries: int
avg_latency_ms: float
min_latency_ms: float
max_latency_ms: float
p50_latency_ms: float
p95_latency_ms: float
p99_latency_ms: float
avg_results: float
errors: List[str]
# Default test queries covering different scenarios
DEFAULT_QUERIES = [
# Code patterns
"def search",
"class Engine",
"import numpy",
"async def",
"raise ValueError",
# Semantic queries
"how to parse json",
"database connection",
"error handling",
"authentication logic",
"file read write",
# Technical terms
"embedding vector",
"cosine similarity",
"binary quantization",
"hamming distance",
"reranking",
]
def percentile(data: List[float], p: float) -> float:
"""Calculate percentile of sorted data."""
if not data:
return 0.0
sorted_data = sorted(data)
k = (len(sorted_data) - 1) * (p / 100)
f = int(k)
c = f + 1 if f + 1 < len(sorted_data) else f
return sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f])
def run_single_benchmark(
engine: ChainSearchEngine,
query: str,
source_path: Path,
strategy: str,
options: Optional[SearchOptions] = None,
) -> BenchmarkResult:
"""Run a single benchmark query."""
gc.collect()
start_time = time.perf_counter()
try:
result = engine.cascade_search(
query=query,
source_path=source_path,
k=10,
coarse_k=100,
options=options,
strategy=strategy,
)
elapsed_ms = (time.perf_counter() - start_time) * 1000
top_result = None
if result.results:
r = result.results[0]
line = r.start_line or 0
top_result = f"{r.path}:{line}"
return BenchmarkResult(
strategy=strategy,
query=query,
latency_ms=elapsed_ms,
num_results=len(result.results),
top_result=top_result,
)
except Exception as e:
elapsed_ms = (time.perf_counter() - start_time) * 1000
return BenchmarkResult(
strategy=strategy,
query=query,
latency_ms=elapsed_ms,
num_results=0,
top_result=None,
error=str(e),
)
def run_benchmarks(
source_path: Path,
queries: List[str],
strategies: List[str],
warmup_runs: int = 2,
options: Optional[SearchOptions] = None,
) -> Dict[str, List[BenchmarkResult]]:
"""Run benchmarks for all queries and strategies."""
print(f"\n{'='*60}")
print(f"Cascade Search Benchmark")
print(f"{'='*60}")
print(f"Source: {source_path}")
print(f"Queries: {len(queries)}")
print(f"Strategies: {strategies}")
print(f"Warmup runs: {warmup_runs}")
print(f"{'='*60}\n")
# Initialize engine
config = Config()
registry = RegistryStore() # Uses default path
registry.initialize()
mapper = PathMapper() # Uses default path
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
results: Dict[str, List[BenchmarkResult]] = {s: [] for s in strategies}
# Warmup phase
if warmup_runs > 0:
print(f"Running {warmup_runs} warmup queries...")
warmup_query = queries[0] if queries else "test"
for strategy in strategies:
for _ in range(warmup_runs):
try:
run_single_benchmark(engine, warmup_query, source_path, strategy, options)
except Exception:
pass
print("Warmup complete.\n")
# Benchmark phase
total_runs = len(queries) * len(strategies)
current_run = 0
for query in queries:
for strategy in strategies:
current_run += 1
print(f"[{current_run}/{total_runs}] {strategy}: '{query[:40]}...' ", end="", flush=True)
result = run_single_benchmark(engine, query, source_path, strategy, options)
results[strategy].append(result)
if result.error:
print(f"ERROR: {result.error[:50]}")
else:
print(f"{result.latency_ms:.1f}ms, {result.num_results} results")
return results
def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]:
"""Generate summary statistics for each strategy."""
summaries = {}
for strategy, benchmark_results in results.items():
latencies = [r.latency_ms for r in benchmark_results if r.error is None]
result_counts = [r.num_results for r in benchmark_results if r.error is None]
errors = [r.error for r in benchmark_results if r.error is not None]
if latencies:
summary = BenchmarkSummary(
strategy=strategy,
total_queries=len(benchmark_results),
successful_queries=len(latencies),
avg_latency_ms=statistics.mean(latencies),
min_latency_ms=min(latencies),
max_latency_ms=max(latencies),
p50_latency_ms=percentile(latencies, 50),
p95_latency_ms=percentile(latencies, 95),
p99_latency_ms=percentile(latencies, 99),
avg_results=statistics.mean(result_counts) if result_counts else 0,
errors=errors,
)
else:
summary = BenchmarkSummary(
strategy=strategy,
total_queries=len(benchmark_results),
successful_queries=0,
avg_latency_ms=0,
min_latency_ms=0,
max_latency_ms=0,
p50_latency_ms=0,
p95_latency_ms=0,
p99_latency_ms=0,
avg_results=0,
errors=errors,
)
summaries[strategy] = summary
return summaries
def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None:
"""Print formatted comparison table."""
print(f"\n{'='*80}")
print("BENCHMARK RESULTS COMPARISON")
print(f"{'='*80}\n")
# Header
print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}")
print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}")
binary = summaries.get("binary")
hybrid = summaries.get("hybrid")
if not binary or not hybrid:
print("Missing results for comparison")
return
metrics = [
("Total Queries", binary.total_queries, hybrid.total_queries),
("Successful", binary.successful_queries, hybrid.successful_queries),
("Avg Latency (ms)", binary.avg_latency_ms, hybrid.avg_latency_ms),
("Min Latency (ms)", binary.min_latency_ms, hybrid.min_latency_ms),
("Max Latency (ms)", binary.max_latency_ms, hybrid.max_latency_ms),
("P50 Latency (ms)", binary.p50_latency_ms, hybrid.p50_latency_ms),
("P95 Latency (ms)", binary.p95_latency_ms, hybrid.p95_latency_ms),
("P99 Latency (ms)", binary.p99_latency_ms, hybrid.p99_latency_ms),
("Avg Results", binary.avg_results, hybrid.avg_results),
]
for name, b_val, h_val in metrics:
if isinstance(b_val, float):
diff = b_val - h_val
diff_str = f"{diff:+.2f}" if diff != 0 else "0.00"
speedup = h_val / b_val if b_val > 0 else 0
if "Latency" in name and speedup > 1:
diff_str += f" (binary {speedup:.1f}x faster)"
print(f"{name:<25} {b_val:>15.2f} {h_val:>15.2f} {diff_str:>15}")
else:
diff = b_val - h_val
print(f"{name:<25} {b_val:>15} {h_val:>15} {diff:>+15}")
# Errors
print(f"\n{'Errors:':<25}")
print(f" Binary: {len(binary.errors)}")
for err in binary.errors[:3]:
print(f" - {err[:60]}...")
print(f" Hybrid: {len(hybrid.errors)}")
for err in hybrid.errors[:3]:
print(f" - {err[:60]}...")
# Winner
print(f"\n{'='*80}")
if binary.avg_latency_ms < hybrid.avg_latency_ms and binary.successful_queries > 0:
speedup = hybrid.avg_latency_ms / binary.avg_latency_ms
print(f"[WINNER] Binary ({speedup:.2f}x faster average latency)")
elif hybrid.avg_latency_ms < binary.avg_latency_ms and hybrid.successful_queries > 0:
speedup = binary.avg_latency_ms / hybrid.avg_latency_ms
print(f"[WINNER] Hybrid ({speedup:.2f}x faster average latency)")
else:
print("No clear winner (check errors)")
print(f"{'='*80}\n")
def save_results(
results: Dict[str, List[BenchmarkResult]],
summaries: Dict[str, BenchmarkSummary],
output_path: Path,
) -> None:
"""Save benchmark results to JSON file."""
data = {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"summaries": {k: asdict(v) for k, v in summaries.items()},
"details": {
k: [asdict(r) for r in v]
for k, v in results.items()
},
}
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)
print(f"Results saved to: {output_path}")
def main():
parser = argparse.ArgumentParser(description="Benchmark cascade search strategies")
parser.add_argument(
"--source", "-s",
type=Path,
default=Path(__file__).parent.parent / "src",
help="Source directory to search (default: ./src)",
)
parser.add_argument(
"--queries", "-q",
type=int,
default=len(DEFAULT_QUERIES),
help=f"Number of queries to run (default: {len(DEFAULT_QUERIES)})",
)
parser.add_argument(
"--warmup", "-w",
type=int,
default=2,
help="Number of warmup runs (default: 2)",
)
parser.add_argument(
"--output", "-o",
type=Path,
default=Path(__file__).parent / "results" / "cascade_benchmark.json",
help="Output file for results (default: benchmarks/results/cascade_benchmark.json)",
)
parser.add_argument(
"--strategies",
nargs="+",
default=["binary", "hybrid"],
choices=["binary", "hybrid"],
help="Strategies to benchmark (default: both)",
)
args = parser.parse_args()
# Validate source path
if not args.source.exists():
print(f"Error: Source path does not exist: {args.source}")
sys.exit(1)
# Select queries
queries = DEFAULT_QUERIES[:args.queries]
# Run benchmarks
try:
results = run_benchmarks(
source_path=args.source,
queries=queries,
strategies=args.strategies,
warmup_runs=args.warmup,
)
# Generate summaries
summaries = summarize_results(results)
# Print comparison
print_comparison_table(summaries)
# Save results
save_results(results, summaries, args.output)
except KeyboardInterrupt:
print("\nBenchmark interrupted.")
sys.exit(1)
except Exception as e:
print(f"\nBenchmark failed: {e}")
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()
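The percentile helper above interpolates linearly between the two nearest ranks. A quick standalone check of that formula (editorial sketch, not part of the commit):

# Worked example of the linear-interpolation percentile used by the benchmark.
# For data = [10, 20, 30, 40] and p = 95: k = (4 - 1) * 0.95 = 2.85, f = 2, c = 3,
# so the result is 30 + 0.85 * (40 - 30) = 38.5.
import math
data = sorted([40, 10, 30, 20])
p = 95
k = (len(data) - 1) * (p / 100)
f = int(k)
c = f + 1 if f + 1 < len(data) else f
value = data[f] + (k - f) * (data[c] - data[f])
assert math.isclose(value, 38.5)

A typical run against the repository's own sources, using the flags defined in main():

python benchmarks/cascade_benchmark.py --source ./src --queries 5 --warmup 1 --strategies binary hybrid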

View File

@@ -0,0 +1,277 @@
{
"timestamp": "2026-01-02 11:22:34",
"summaries": {
"binary": {
"strategy": "binary",
"total_queries": 15,
"successful_queries": 15,
"avg_latency_ms": 850.328753333209,
"min_latency_ms": 750.9617999967304,
"max_latency_ms": 1015.733200001705,
"p50_latency_ms": 847.9711999971187,
"p95_latency_ms": 976.768470002571,
"p99_latency_ms": 1007.9402540018782,
"avg_results": 0,
"errors": []
},
"hybrid": {
"strategy": "hybrid",
"total_queries": 15,
"successful_queries": 15,
"avg_latency_ms": 821.3745733330143,
"min_latency_ms": 720.5589000004693,
"max_latency_ms": 943.0299999949057,
"p50_latency_ms": 819.5875000019441,
"p95_latency_ms": 916.3381599981221,
"p99_latency_ms": 937.691631995549,
"avg_results": 0,
"errors": []
}
},
"details": {
"binary": [
{
"strategy": "binary",
"query": "def search",
"latency_ms": 862.7266999974381,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "class Engine",
"latency_ms": 773.8472999990336,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "import numpy",
"latency_ms": 858.1023000006098,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "async def",
"latency_ms": 877.2815999982413,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "raise ValueError",
"latency_ms": 824.3320999972639,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "how to parse json",
"latency_ms": 948.0362000031164,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "database connection",
"latency_ms": 789.3126000053599,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "error handling",
"latency_ms": 960.0693000029423,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "authentication logic",
"latency_ms": 757.247900000948,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "file read write",
"latency_ms": 750.9617999967304,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "embedding vector",
"latency_ms": 871.1426000008942,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "cosine similarity",
"latency_ms": 817.1380999992834,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "binary quantization",
"latency_ms": 1015.733200001705,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "hamming distance",
"latency_ms": 847.9711999971187,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "reranking",
"latency_ms": 801.028399997449,
"num_results": 0,
"top_result": null,
"error": null
}
],
"hybrid": [
{
"strategy": "hybrid",
"query": "def search",
"latency_ms": 720.5589000004693,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "class Engine",
"latency_ms": 792.9914000051212,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "import numpy",
"latency_ms": 943.0299999949057,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "async def",
"latency_ms": 819.5875000019441,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "raise ValueError",
"latency_ms": 835.5114000005415,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "how to parse json",
"latency_ms": 867.8118999960134,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "database connection",
"latency_ms": 824.6361999990768,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "error handling",
"latency_ms": 742.638600000646,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "authentication logic",
"latency_ms": 840.4286999939359,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "file read write",
"latency_ms": 810.9049000049708,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "embedding vector",
"latency_ms": 876.5335000061896,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "cosine similarity",
"latency_ms": 797.3090999948909,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "binary quantization",
"latency_ms": 767.9803999999422,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "hamming distance",
"latency_ms": 775.7972999970661,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "reranking",
"latency_ms": 904.8987999995006,
"num_results": 0,
"top_result": null,
"error": null
}
]
}
}
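The results file above follows the structure written by save_results(): a timestamp, per-strategy summaries, and per-query details. Note that in this run every query returned zero results for both strategies, so the comparison reflects pipeline latency only. A minimal sketch (not part of the commit) for reading the file back, assuming the script's default --output path:

import json
from pathlib import Path

results_path = Path("benchmarks/results/cascade_benchmark.json")  # default --output location
data = json.loads(results_path.read_text(encoding="utf-8"))
for strategy, summary in data["summaries"].items():
    # Keys mirror the BenchmarkSummary dataclass serialized via asdict().
    print(f"{strategy}: avg {summary['avg_latency_ms']:.1f} ms, "
          f"p95 {summary['p95_latency_ms']:.1f} ms, "
          f"{summary['successful_queries']}/{summary['total_queries']} queries ok")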

View File

@@ -7,7 +7,7 @@ import logging
import os
import shutil
from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Annotated, Any, Dict, Iterable, List, Optional
import typer
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
@@ -2721,3 +2721,305 @@ def _display_index_result(result) -> None:
console.print(f" [red]Error:[/red] {error}")
if len(result.errors) > 3:
console.print(f" [dim]... and {len(result.errors) - 3} more errors[/dim]")
# ==================== Cascade Index Commands ====================
def get_binary_index_path(db_path: Path) -> Path:
"""Get the path for binary ANN index file.
Args:
db_path: Path to the _index.db file
Returns:
Path to the binary index file (_index_binary.bin)
"""
return db_path.parent / f"{db_path.stem}_binary.bin"
@app.command("cascade-index")
def cascade_index(
path: Annotated[Path, typer.Argument(help="Directory to index")],
force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False,
batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32,
json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False,
) -> None:
"""Generate cascade embeddings (binary + dense) for two-stage retrieval.
Cascade retrieval uses a two-stage approach:
1. Binary search (fast, 32 bytes/vector) -> coarse filtering
2. Dense rerank (precise, 8KB/vector) -> final results
This command:
- Finds all _index.db files in the directory
- Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk
- Stores embeddings in the database (embedding_binary, embedding_dense columns)
- Creates a BinaryANNIndex file for fast coarse retrieval
Examples:
codexlens cascade-index ~/projects/my-app
codexlens cascade-index . --force
codexlens cascade-index . --batch-size 64 --verbose
"""
_configure_logging(verbose, json_mode)
target_path = path.expanduser().resolve()
# Find index database(s)
if target_path.is_file() and target_path.name == "_index.db":
index_dbs = [target_path]
elif target_path.is_dir():
# Check local .codexlens/_index.db first
local_index = target_path / ".codexlens" / "_index.db"
if local_index.exists():
index_dbs = [local_index]
else:
# Find via registry
registry = RegistryStore()
try:
registry.initialize()
mapper = PathMapper()
index_db = mapper.source_to_index_db(target_path)
if not index_db.exists():
if json_mode:
print_json(success=False, error=f"No index found for {target_path}")
else:
console.print(f"[red]Error:[/red] No index found for {target_path}")
console.print("Run 'codexlens init' first to create an index")
raise typer.Exit(code=1)
# Find all _index.db files under the index root
index_root = index_db.parent
index_dbs = list(index_root.rglob("_index.db"))
finally:
registry.close()
else:
if json_mode:
print_json(success=False, error="Path must be _index.db file or indexed directory")
else:
console.print("[red]Error:[/red] Path must be _index.db file or indexed directory")
raise typer.Exit(code=1)
if not index_dbs:
if json_mode:
print_json(success=False, error="No index databases found")
else:
console.print("[yellow]No index databases found[/yellow]")
raise typer.Exit(code=1)
# Import cascade embedding backend
try:
from codexlens.indexing.embedding import CascadeEmbeddingBackend
from codexlens.semantic.ann_index import BinaryANNIndex
from codexlens.indexing.embedding import pack_binary_embedding
except ImportError as e:
error_msg = f"Cascade embedding dependencies not available: {e}"
if json_mode:
print_json(success=False, error=error_msg)
else:
console.print(f"[red]Error:[/red] {error_msg}")
console.print("[dim]Install with: pip install codexlens[semantic][/dim]")
raise typer.Exit(code=1)
if not json_mode:
console.print(f"[bold]Generating cascade embeddings[/bold]")
console.print(f"Path: [dim]{target_path}[/dim]")
console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]")
console.print(f"Batch size: [cyan]{batch_size}[/cyan]")
console.print()
# Initialize cascade embedding backend
try:
cascade_backend = CascadeEmbeddingBackend()
except Exception as e:
error_msg = f"Failed to initialize cascade embedding backend: {e}"
if json_mode:
print_json(success=False, error=error_msg)
else:
console.print(f"[red]Error:[/red] {error_msg}")
raise typer.Exit(code=1)
# Process statistics
total_chunks_processed = 0
total_indexes_processed = 0
total_indexes_successful = 0
total_binary_indexes_created = 0
errors_list: List[str] = []
# Process each index database
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
TextColumn("({task.completed}/{task.total})"),
TimeElapsedColumn(),
console=console,
disable=json_mode,
) as progress:
db_task = progress.add_task("Processing indexes...", total=len(index_dbs))
for db_path in index_dbs:
total_indexes_processed += 1
index_name = db_path.parent.name
try:
# Open the index store
store = DirIndexStore(db_path)
store.initialize()
# Get connection for direct queries
conn = store._get_connection()
# Ensure cascade columns exist in semantic_chunks table
try:
conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB")
except Exception:
pass # Column already exists
try:
conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB")
except Exception:
pass # Column already exists
conn.commit()
# Check if semantic_chunks table exists and has data
try:
cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
chunk_count = cursor.fetchone()[0]
except Exception:
# semantic_chunks table doesn't exist yet
chunk_count = 0
if chunk_count == 0:
if verbose and not json_mode:
console.print(f" [dim]Skipping {index_name}: no chunks found[/dim]")
progress.advance(db_task)
store.close()
continue
# Check if embeddings already exist (unless force)
if not force:
cursor = conn.execute(
"SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
)
existing_count = cursor.fetchone()[0]
if existing_count > 0:
if verbose and not json_mode:
console.print(f" [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]")
progress.advance(db_task)
store.close()
continue
# If force, clear existing cascade embeddings
if force:
conn.execute(
"UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL"
)
conn.commit()
# Get all chunks
cursor = conn.execute("SELECT id, content FROM semantic_chunks")
chunks = cursor.fetchall()
if not chunks:
progress.advance(db_task)
store.close()
continue
if verbose and not json_mode:
console.print(f" Processing {index_name}: {len(chunks)} chunks")
# Process in batches
chunk_task = progress.add_task(
f" {index_name}", total=len(chunks)
)
# Prepare for BinaryANNIndex
binary_index_path = get_binary_index_path(db_path)
binary_ann_index = BinaryANNIndex(db_path, dim=256)
for i in range(0, len(chunks), batch_size):
batch_chunks = chunks[i:i + batch_size]
batch_ids = [c[0] for c in batch_chunks]
batch_contents = [c[1] for c in batch_chunks]
# Generate cascade embeddings
binary_embeddings, dense_embeddings = cascade_backend.encode_cascade(
batch_contents, batch_size=batch_size
)
# Pack binary embeddings and convert dense to bytes
packed_binaries = []
dense_bytes_list = []
for j in range(len(batch_ids)):
# Pack binary embedding (256 bits -> 32 bytes)
packed_binary = pack_binary_embedding(binary_embeddings[j])
packed_binaries.append(packed_binary)
# Convert dense embedding to bytes
import numpy as np
dense_blob = dense_embeddings[j].astype(np.float32).tobytes()
dense_bytes_list.append(dense_blob)
# Update database
for j, chunk_id in enumerate(batch_ids):
conn.execute(
"""
UPDATE semantic_chunks
SET embedding_binary = ?, embedding_dense = ?
WHERE id = ?
""",
(packed_binaries[j], dense_bytes_list[j], chunk_id)
)
# Add to binary ANN index
binary_ann_index.add_vectors(batch_ids, packed_binaries)
conn.commit()
total_chunks_processed += len(batch_ids)
progress.advance(chunk_task, len(batch_ids))
# Save binary ANN index
binary_ann_index.save()
total_binary_indexes_created += 1
progress.remove_task(chunk_task)
store.close()
total_indexes_successful += 1
except Exception as e:
error_msg = f"{index_name}: {e}"
errors_list.append(error_msg)
if verbose and not json_mode:
console.print(f" [red]Error processing {index_name}:[/red] {e}")
progress.advance(db_task)
# Build result
result = {
"path": str(target_path),
"indexes_processed": total_indexes_processed,
"indexes_successful": total_indexes_successful,
"chunks_processed": total_chunks_processed,
"binary_indexes_created": total_binary_indexes_created,
"errors": len(errors_list),
"error_details": errors_list[:5] if errors_list else [],
}
if json_mode:
print_json(success=True, result=result)
else:
console.print(f"\n[green]Cascade indexing complete[/green]")
console.print(f" Indexes processed: {total_indexes_processed}")
console.print(f" Indexes successful: {total_indexes_successful}")
console.print(f" Chunks processed: {total_chunks_processed:,}")
console.print(f" Binary indexes created: {total_binary_indexes_created}")
if errors_list:
console.print(f" [yellow]Errors: {len(errors_list)}[/yellow]")
for err in errors_list[:3]:
console.print(f" [dim]{err}[/dim]")
if len(errors_list) > 3:
console.print(f" [dim]... and {len(errors_list) - 3} more[/dim]")
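For orientation, the two BLOB columns populated above serve the two cascade stages: embedding_binary holds each 256-bit vector bit-packed into 32 bytes (coarse candidates are typically ranked by Hamming distance over such packed bits), and embedding_dense holds the float32 vector used for the precise rerank. The sketch below is an editorial illustration (not the BinaryANNIndex or ChainSearchEngine implementation; hamming_distances is a hypothetical helper) showing how blobs in these formats can be scored and decoded with NumPy:

import numpy as np

def hamming_distances(query_packed, candidates_packed):
    """Popcount of XOR between a packed 256-bit query and each packed candidate."""
    q = np.frombuffer(query_packed, dtype=np.uint8)                 # 32 bytes = 256 bits
    c = np.frombuffer(b"".join(candidates_packed), dtype=np.uint8)
    c = c.reshape(len(candidates_packed), q.size)
    return np.unpackbits(np.bitwise_xor(c, q), axis=1).sum(axis=1)

# Coarse stage: keep the coarse_k candidates with the smallest Hamming distance.
rng = np.random.default_rng(0)
candidates = [np.packbits(rng.integers(0, 2, 256).astype(np.uint8)).tobytes() for _ in range(5)]
print(hamming_distances(candidates[0], candidates))  # first entry is 0 (identical vector)

# Fine stage: dense vectors round-trip through the embedding_dense BLOB column
# exactly as written above (.astype(np.float32).tobytes()).
dense_blob = rng.random(768).astype(np.float32).tobytes()  # 768 = this commit's reduced test TARGET_DIM
dense_vec = np.frombuffer(dense_blob, dtype=np.float32)
assert dense_vec.shape == (768,)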

View File

@@ -265,8 +265,8 @@ class DenseEmbeddingBackend(BaseEmbedder):
Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion
"""
DEFAULT_MODEL = "BAAI/bge-large-en-v1.5" # 1024 dim, high quality
TARGET_DIM = 2048
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, use small for testing
TARGET_DIM = 768 # Reduced target for faster testing
def __init__(
self,