diff --git a/codex-lens/benchmarks/cascade_benchmark.py b/codex-lens/benchmarks/cascade_benchmark.py
new file mode 100644
index 00000000..14461479
--- /dev/null
+++ b/codex-lens/benchmarks/cascade_benchmark.py
@@ -0,0 +1,402 @@
+#!/usr/bin/env python
+"""Benchmark script for comparing cascade search strategies.
+
+Compares:
+- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
+- hybrid: FTS+SPLADE+Vector coarse ranking + CrossEncoder fine ranking
+
+Usage:
+    python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]
+"""
+
+from __future__ import annotations
+
+import argparse
+import gc
+import json
+import os
+import statistics
+import sys
+import time
+import traceback
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
+from codexlens.config import Config
+from codexlens.storage.registry import RegistryStore
+from codexlens.storage.path_mapper import PathMapper
+
+
+@dataclass
+class BenchmarkResult:
+    """Result from a single benchmark run."""
+    strategy: str
+    query: str
+    latency_ms: float
+    num_results: int
+    top_result: Optional[str]
+    error: Optional[str] = None
+
+
+@dataclass
+class BenchmarkSummary:
+    """Aggregated benchmark statistics."""
+    strategy: str
+    total_queries: int
+    successful_queries: int
+    avg_latency_ms: float
+    min_latency_ms: float
+    max_latency_ms: float
+    p50_latency_ms: float
+    p95_latency_ms: float
+    p99_latency_ms: float
+    avg_results: float
+    errors: List[str]
+
+
+# Default test queries covering different scenarios
+DEFAULT_QUERIES = [
+    # Code patterns
+    "def search",
+    "class Engine",
+    "import numpy",
+    "async def",
+    "raise ValueError",
+    # Semantic queries
+    "how to parse json",
+    "database connection",
+    "error handling",
+    "authentication logic",
+    "file read write",
+    # Technical terms
+    "embedding vector",
+    "cosine similarity",
+    "binary quantization",
+    "hamming distance",
+    "reranking",
+]
+
+
+def percentile(data: List[float], p: float) -> float:
+    """Calculate the p-th percentile of data using linear interpolation."""
+    if not data:
+        return 0.0
+    sorted_data = sorted(data)
+    k = (len(sorted_data) - 1) * (p / 100)
+    f = int(k)
+    c = f + 1 if f + 1 < len(sorted_data) else f
+    return sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f])
+
+
+def run_single_benchmark(
+    engine: ChainSearchEngine,
+    query: str,
+    source_path: Path,
+    strategy: str,
+    options: Optional[SearchOptions] = None,
+) -> BenchmarkResult:
+    """Run a single benchmark query."""
+    gc.collect()
+
+    start_time = time.perf_counter()
+    try:
+        result = engine.cascade_search(
+            query=query,
+            source_path=source_path,
+            k=10,
+            coarse_k=100,
+            options=options,
+            strategy=strategy,
+        )
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
+
+        top_result = None
+        if result.results:
+            r = result.results[0]
+            line = r.start_line or 0
+            top_result = f"{r.path}:{line}"
+
+        return BenchmarkResult(
+            strategy=strategy,
+            query=query,
+            latency_ms=elapsed_ms,
+            num_results=len(result.results),
+            top_result=top_result,
+        )
+    except Exception as e:
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
+        return BenchmarkResult(
+            strategy=strategy,
+            query=query,
+            latency_ms=elapsed_ms,
+            num_results=0,
+            top_result=None,
+            error=str(e),
+        )
+def run_benchmarks(
+    source_path: Path,
+    queries: List[str],
+    strategies: List[str],
+    warmup_runs: int = 2,
+    options: Optional[SearchOptions] = None,
+) -> Dict[str, List[BenchmarkResult]]:
+    """Run benchmarks for all queries and strategies."""
+
+    print(f"\n{'='*60}")
+    print("Cascade Search Benchmark")
+    print(f"{'='*60}")
+    print(f"Source: {source_path}")
+    print(f"Queries: {len(queries)}")
+    print(f"Strategies: {strategies}")
+    print(f"Warmup runs: {warmup_runs}")
+    print(f"{'='*60}\n")
+
+    # Initialize engine
+    config = Config()
+    registry = RegistryStore()  # Uses default path
+    registry.initialize()
+    mapper = PathMapper()  # Uses default path
+    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
+
+    results: Dict[str, List[BenchmarkResult]] = {s: [] for s in strategies}
+
+    # Warmup phase
+    if warmup_runs > 0:
+        print(f"Running {warmup_runs} warmup queries...")
+        warmup_query = queries[0] if queries else "test"
+        for strategy in strategies:
+            for _ in range(warmup_runs):
+                try:
+                    run_single_benchmark(engine, warmup_query, source_path, strategy, options)
+                except Exception:
+                    pass
+        print("Warmup complete.\n")
+
+    # Benchmark phase
+    total_runs = len(queries) * len(strategies)
+    current_run = 0
+
+    for query in queries:
+        for strategy in strategies:
+            current_run += 1
+            print(f"[{current_run}/{total_runs}] {strategy}: '{query[:40]}...' ", end="", flush=True)
+
+            result = run_single_benchmark(engine, query, source_path, strategy, options)
+            results[strategy].append(result)
+
+            if result.error:
+                print(f"ERROR: {result.error[:50]}")
+            else:
+                print(f"{result.latency_ms:.1f}ms, {result.num_results} results")
+
+    return results
+
+
+def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]:
+    """Generate summary statistics for each strategy."""
+    summaries = {}
+
+    for strategy, benchmark_results in results.items():
+        latencies = [r.latency_ms for r in benchmark_results if r.error is None]
+        result_counts = [r.num_results for r in benchmark_results if r.error is None]
+        errors = [r.error for r in benchmark_results if r.error is not None]
+
+        if latencies:
+            summary = BenchmarkSummary(
+                strategy=strategy,
+                total_queries=len(benchmark_results),
+                successful_queries=len(latencies),
+                avg_latency_ms=statistics.mean(latencies),
+                min_latency_ms=min(latencies),
+                max_latency_ms=max(latencies),
+                p50_latency_ms=percentile(latencies, 50),
+                p95_latency_ms=percentile(latencies, 95),
+                p99_latency_ms=percentile(latencies, 99),
+                avg_results=statistics.mean(result_counts) if result_counts else 0,
+                errors=errors,
+            )
+        else:
+            summary = BenchmarkSummary(
+                strategy=strategy,
+                total_queries=len(benchmark_results),
+                successful_queries=0,
+                avg_latency_ms=0,
+                min_latency_ms=0,
+                max_latency_ms=0,
+                p50_latency_ms=0,
+                p95_latency_ms=0,
+                p99_latency_ms=0,
+                avg_results=0,
+                errors=errors,
+            )
+
+        summaries[strategy] = summary
+
+    return summaries
+
+
+def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None:
+    """Print formatted comparison table."""
+    print(f"\n{'='*80}")
+    print("BENCHMARK RESULTS COMPARISON")
+    print(f"{'='*80}\n")
+
+    # Header
+    print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}")
+    print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}")
+
+    binary = summaries.get("binary")
+    hybrid = summaries.get("hybrid")
+
+    if not binary or not hybrid:
+        print("Missing results for comparison")
+        return
+    metrics = [
+        ("Total Queries", binary.total_queries, hybrid.total_queries),
+        ("Successful", binary.successful_queries, hybrid.successful_queries),
+        ("Avg Latency (ms)", binary.avg_latency_ms, hybrid.avg_latency_ms),
+        ("Min Latency (ms)", binary.min_latency_ms, hybrid.min_latency_ms),
+        ("Max Latency (ms)", binary.max_latency_ms, hybrid.max_latency_ms),
+        ("P50 Latency (ms)", binary.p50_latency_ms, hybrid.p50_latency_ms),
+        ("P95 Latency (ms)", binary.p95_latency_ms, hybrid.p95_latency_ms),
+        ("P99 Latency (ms)", binary.p99_latency_ms, hybrid.p99_latency_ms),
+        ("Avg Results", binary.avg_results, hybrid.avg_results),
+    ]
+
+    for name, b_val, h_val in metrics:
+        if isinstance(b_val, float):
+            diff = b_val - h_val
+            diff_str = f"{diff:+.2f}" if diff != 0 else "0.00"
+            speedup = h_val / b_val if b_val > 0 else 0
+            if "Latency" in name and speedup > 1:
+                diff_str += f" (binary {speedup:.1f}x faster)"
+            print(f"{name:<25} {b_val:>15.2f} {h_val:>15.2f} {diff_str:>15}")
+        else:
+            diff = b_val - h_val
+            print(f"{name:<25} {b_val:>15} {h_val:>15} {diff:>+15}")
+
+    # Errors
+    print(f"\n{'Errors:':<25}")
+    print(f"  Binary: {len(binary.errors)}")
+    for err in binary.errors[:3]:
+        print(f"    - {err[:60]}...")
+    print(f"  Hybrid: {len(hybrid.errors)}")
+    for err in hybrid.errors[:3]:
+        print(f"    - {err[:60]}...")
+
+    # Winner
+    print(f"\n{'='*80}")
+    if binary.avg_latency_ms < hybrid.avg_latency_ms and binary.successful_queries > 0:
+        speedup = hybrid.avg_latency_ms / binary.avg_latency_ms
+        print(f"[WINNER] Binary ({speedup:.2f}x faster average latency)")
+    elif hybrid.avg_latency_ms < binary.avg_latency_ms and hybrid.successful_queries > 0:
+        speedup = binary.avg_latency_ms / hybrid.avg_latency_ms
+        print(f"[WINNER] Hybrid ({speedup:.2f}x faster average latency)")
+    else:
+        print("No clear winner (check errors)")
+    print(f"{'='*80}\n")
+
+
+def save_results(
+    results: Dict[str, List[BenchmarkResult]],
+    summaries: Dict[str, BenchmarkSummary],
+    output_path: Path,
+) -> None:
+    """Save benchmark results to JSON file."""
+    data = {
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+        "summaries": {k: asdict(v) for k, v in summaries.items()},
+        "details": {
+            k: [asdict(r) for r in v]
+            for k, v in results.items()
+        },
+    }
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2)
+
+    print(f"Results saved to: {output_path}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark cascade search strategies")
+    parser.add_argument(
+        "--source", "-s",
+        type=Path,
+        default=Path(__file__).parent.parent / "src",
+        help="Source directory to search (default: ./src)",
+    )
+    parser.add_argument(
+        "--queries", "-q",
+        type=int,
+        default=len(DEFAULT_QUERIES),
+        help=f"Number of queries to run (default: {len(DEFAULT_QUERIES)})",
+    )
+    parser.add_argument(
+        "--warmup", "-w",
+        type=int,
+        default=2,
+        help="Number of warmup runs (default: 2)",
+    )
+    parser.add_argument(
+        "--output", "-o",
+        type=Path,
+        default=Path(__file__).parent / "results" / "cascade_benchmark.json",
+        help="Output file for results (default: benchmarks/results/cascade_benchmark.json)",
+    )
+    parser.add_argument(
+        "--strategies",
+        nargs="+",
+        default=["binary", "hybrid"],
+        choices=["binary", "hybrid"],
+        help="Strategies to benchmark (default: both)",
+    )
+
+    args = parser.parse_args()
+
+    # Validate source path
+    if not args.source.exists():
+        print(f"Error: Source path does not exist: {args.source}")
+        sys.exit(1)
+
+    # Select queries
+    queries = DEFAULT_QUERIES[:args.queries]
+
+    # Run benchmarks
+    try:
+        results = run_benchmarks(
+            source_path=args.source,
+            queries=queries,
+            strategies=args.strategies,
+            warmup_runs=args.warmup,
+        )
+
+        # Generate summaries
+        summaries = summarize_results(results)
+
+        # Print comparison
+        print_comparison_table(summaries)
+
+        # Save results
+        save_results(results, summaries, args.output)
+
+    except KeyboardInterrupt:
+        print("\nBenchmark interrupted.")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\nBenchmark failed: {e}")
+        traceback.print_exc()
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
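The `percentile` helper above uses the same linear interpolation as `numpy.percentile`'s default method. A quick worked check, with illustrative values:

```python
import numpy as np

data = [100.0, 200.0, 300.0, 400.0]
# rank = (n - 1) * p / 100 = 3 * 0.95 = 2.85 -> interpolate between index 2 and 3
# 300 + 0.85 * (400 - 300) = 385.0
assert abs(np.percentile(data, 95) - 385.0) < 1e-9
```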
diff --git a/codex-lens/benchmarks/results/cascade_benchmark.json b/codex-lens/benchmarks/results/cascade_benchmark.json
new file mode 100644
index 00000000..ad5a2dbe
--- /dev/null
+++ b/codex-lens/benchmarks/results/cascade_benchmark.json
@@ -0,0 +1,277 @@
+{
+  "timestamp": "2026-01-02 11:22:34",
+  "summaries": {
+    "binary": {
+      "strategy": "binary",
+      "total_queries": 15,
+      "successful_queries": 15,
+      "avg_latency_ms": 850.328753333209,
+      "min_latency_ms": 750.9617999967304,
+      "max_latency_ms": 1015.733200001705,
+      "p50_latency_ms": 847.9711999971187,
+      "p95_latency_ms": 976.768470002571,
+      "p99_latency_ms": 1007.9402540018782,
+      "avg_results": 0,
+      "errors": []
+    },
+    "hybrid": {
+      "strategy": "hybrid",
+      "total_queries": 15,
+      "successful_queries": 15,
+      "avg_latency_ms": 821.3745733330143,
+      "min_latency_ms": 720.5589000004693,
+      "max_latency_ms": 943.0299999949057,
+      "p50_latency_ms": 819.5875000019441,
+      "p95_latency_ms": 916.3381599981221,
+      "p99_latency_ms": 937.691631995549,
+      "avg_results": 0,
+      "errors": []
+    }
+  },
+  "details": {
+    "binary": [
+      {
+        "strategy": "binary",
+        "query": "def search",
+        "latency_ms": 862.7266999974381,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "class Engine",
+        "latency_ms": 773.8472999990336,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "import numpy",
+        "latency_ms": 858.1023000006098,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "async def",
+        "latency_ms": 877.2815999982413,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "raise ValueError",
+        "latency_ms": 824.3320999972639,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "how to parse json",
+        "latency_ms": 948.0362000031164,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "database connection",
+        "latency_ms": 789.3126000053599,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "error handling",
+        "latency_ms": 960.0693000029423,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "authentication logic",
+        "latency_ms": 757.247900000948,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "file read write",
+        "latency_ms": 750.9617999967304,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "embedding vector",
+        "latency_ms": 871.1426000008942,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "cosine similarity",
+        "latency_ms": 817.1380999992834,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
+        "strategy": "binary",
+        "query": "binary quantization",
+        "latency_ms": 1015.733200001705,
+        "num_results": 0,
+        "top_result": null,
+        "error": null
+      },
+      {
"strategy": "binary", + "query": "hamming distance", + "latency_ms": 847.9711999971187, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "binary", + "query": "reranking", + "latency_ms": 801.028399997449, + "num_results": 0, + "top_result": null, + "error": null + } + ], + "hybrid": [ + { + "strategy": "hybrid", + "query": "def search", + "latency_ms": 720.5589000004693, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "class Engine", + "latency_ms": 792.9914000051212, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "import numpy", + "latency_ms": 943.0299999949057, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "async def", + "latency_ms": 819.5875000019441, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "raise ValueError", + "latency_ms": 835.5114000005415, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "how to parse json", + "latency_ms": 867.8118999960134, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "database connection", + "latency_ms": 824.6361999990768, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "error handling", + "latency_ms": 742.638600000646, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "authentication logic", + "latency_ms": 840.4286999939359, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "file read write", + "latency_ms": 810.9049000049708, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "embedding vector", + "latency_ms": 876.5335000061896, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "cosine similarity", + "latency_ms": 797.3090999948909, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "binary quantization", + "latency_ms": 767.9803999999422, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "hamming distance", + "latency_ms": 775.7972999970661, + "num_results": 0, + "top_result": null, + "error": null + }, + { + "strategy": "hybrid", + "query": "reranking", + "latency_ms": 904.8987999995006, + "num_results": 0, + "top_result": null, + "error": null + } + ] + } +} \ No newline at end of file diff --git a/codex-lens/src/codexlens/cli/commands.py b/codex-lens/src/codexlens/cli/commands.py index 699c39a8..02f01bcd 100644 --- a/codex-lens/src/codexlens/cli/commands.py +++ b/codex-lens/src/codexlens/cli/commands.py @@ -7,7 +7,7 @@ import logging import os import shutil from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional +from typing import Annotated, Any, Dict, Iterable, List, Optional import typer from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn @@ -2721,3 +2721,305 @@ def _display_index_result(result) -> None: console.print(f" [red]Error:[/red] {error}") if len(result.errors) > 3: console.print(f" [dim]... 
and {len(result.errors) - 3} more errors[/dim]") + + + +# ==================== Cascade Index Commands ==================== + + +def get_binary_index_path(db_path: Path) -> Path: + """Get the path for binary ANN index file. + + Args: + db_path: Path to the _index.db file + + Returns: + Path to the binary index file (_index_binary.bin) + """ + return db_path.parent / f"{db_path.stem}_binary.bin" + + +@app.command("cascade-index") +def cascade_index( + path: Annotated[Path, typer.Argument(help="Directory to index")], + force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False, + batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32, + json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False, + verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False, +) -> None: + """Generate cascade embeddings (binary + dense) for two-stage retrieval. + + Cascade retrieval uses a two-stage approach: + 1. Binary search (fast, 32 bytes/vector) -> coarse filtering + 2. Dense rerank (precise, 8KB/vector) -> final results + + This command: + - Finds all _index.db files in the directory + - Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk + - Stores embeddings in the database (embedding_binary, embedding_dense columns) + - Creates a BinaryANNIndex file for fast coarse retrieval + + Examples: + codexlens cascade-index ~/projects/my-app + codexlens cascade-index . --force + codexlens cascade-index . --batch-size 64 --verbose + """ + _configure_logging(verbose, json_mode) + + target_path = path.expanduser().resolve() + + # Find index database(s) + if target_path.is_file() and target_path.name == "_index.db": + index_dbs = [target_path] + elif target_path.is_dir(): + # Check local .codexlens/_index.db first + local_index = target_path / ".codexlens" / "_index.db" + if local_index.exists(): + index_dbs = [local_index] + else: + # Find via registry + registry = RegistryStore() + try: + registry.initialize() + mapper = PathMapper() + index_db = mapper.source_to_index_db(target_path) + if not index_db.exists(): + if json_mode: + print_json(success=False, error=f"No index found for {target_path}") + else: + console.print(f"[red]Error:[/red] No index found for {target_path}") + console.print("Run 'codexlens init' first to create an index") + raise typer.Exit(code=1) + # Find all _index.db files under the index root + index_root = index_db.parent + index_dbs = list(index_root.rglob("_index.db")) + finally: + registry.close() + else: + if json_mode: + print_json(success=False, error="Path must be _index.db file or indexed directory") + else: + console.print("[red]Error:[/red] Path must be _index.db file or indexed directory") + raise typer.Exit(code=1) + + if not index_dbs: + if json_mode: + print_json(success=False, error="No index databases found") + else: + console.print("[yellow]No index databases found[/yellow]") + raise typer.Exit(code=1) + + # Import cascade embedding backend + try: + from codexlens.indexing.embedding import CascadeEmbeddingBackend + from codexlens.semantic.ann_index import BinaryANNIndex + from codexlens.indexing.embedding import pack_binary_embedding + except ImportError as e: + error_msg = f"Cascade embedding dependencies not available: {e}" + if json_mode: + print_json(success=False, error=error_msg) + else: + console.print(f"[red]Error:[/red] {error_msg}") + console.print("[dim]Install with: pip install 
codexlens[semantic][/dim]") + raise typer.Exit(code=1) + + if not json_mode: + console.print(f"[bold]Generating cascade embeddings[/bold]") + console.print(f"Path: [dim]{target_path}[/dim]") + console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]") + console.print(f"Batch size: [cyan]{batch_size}[/cyan]") + console.print() + + # Initialize cascade embedding backend + try: + cascade_backend = CascadeEmbeddingBackend() + except Exception as e: + error_msg = f"Failed to initialize cascade embedding backend: {e}" + if json_mode: + print_json(success=False, error=error_msg) + else: + console.print(f"[red]Error:[/red] {error_msg}") + raise typer.Exit(code=1) + + # Process statistics + total_chunks_processed = 0 + total_indexes_processed = 0 + total_indexes_successful = 0 + total_binary_indexes_created = 0 + errors_list: List[str] = [] + + # Process each index database + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TextColumn("({task.completed}/{task.total})"), + TimeElapsedColumn(), + console=console, + disable=json_mode, + ) as progress: + db_task = progress.add_task("Processing indexes...", total=len(index_dbs)) + + for db_path in index_dbs: + total_indexes_processed += 1 + index_name = db_path.parent.name + + try: + # Open the index store + store = DirIndexStore(db_path) + store.initialize() + + # Get connection for direct queries + conn = store._get_connection() + + # Ensure cascade columns exist in semantic_chunks table + try: + conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB") + except Exception: + pass # Column already exists + try: + conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB") + except Exception: + pass # Column already exists + conn.commit() + + # Check if semantic_chunks table exists and has data + try: + cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks") + chunk_count = cursor.fetchone()[0] + except Exception: + # semantic_chunks table doesn't exist or is empty + chunk_count = 0 + + if chunk_count == 0: + if verbose and not json_mode: + console.print(f" [dim]Skipping {index_name}: no chunks found[/dim]") + progress.advance(db_task) + store.close() + continue + + # Check if embeddings already exist (unless force) + if not force: + cursor = conn.execute( + "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL" + ) + existing_count = cursor.fetchone()[0] + if existing_count > 0: + if verbose and not json_mode: + console.print(f" [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]") + progress.advance(db_task) + store.close() + continue + + # If force, clear existing cascade embeddings + if force: + conn.execute( + "UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL" + ) + conn.commit() + + # Get all chunks + cursor = conn.execute("SELECT id, content FROM semantic_chunks") + chunks = cursor.fetchall() + + if not chunks: + progress.advance(db_task) + store.close() + continue + + if verbose and not json_mode: + console.print(f" Processing {index_name}: {len(chunks)} chunks") + + # Process in batches + chunk_task = progress.add_task( + f" {index_name}", total=len(chunks) + ) + + # Prepare for BinaryANNIndex + binary_index_path = get_binary_index_path(db_path) + binary_ann_index = BinaryANNIndex(db_path, dim=256) + + for i in range(0, len(chunks), batch_size): + batch_chunks = chunks[i:i + batch_size] + batch_ids = [c[0] 
diff --git a/codex-lens/src/codexlens/indexing/embedding.py b/codex-lens/src/codexlens/indexing/embedding.py
index 6082a599..4175f3e5 100644
--- a/codex-lens/src/codexlens/indexing/embedding.py
+++ b/codex-lens/src/codexlens/indexing/embedding.py
@@ -265,8 +265,8 @@ class DenseEmbeddingBackend(BaseEmbedder):
     Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion
     """
 
-    DEFAULT_MODEL = "BAAI/bge-large-en-v1.5"  # 1024 dim, high quality
-    TARGET_DIM = 2048
+    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"  # 384 dim, use small for testing
+    TARGET_DIM = 768  # Reduced target for faster testing
 
     def __init__(
         self,
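The `embedding.py` change swaps the dense model to `bge-small-en-v1.5` (384 native dims) and halves `TARGET_DIM` to 768, which still implies a 2x expansion from the model's native width. The expansion strategy itself is not shown in this diff; one possibility, sketched here purely as an assumption, is tiling plus re-normalization, which preserves cosine similarity exactly when the target is an integer multiple of the native dimension:

```python
import numpy as np

def expand_to_target(emb: np.ndarray, target_dim: int = 768) -> np.ndarray:
    """Tile a lower-dim embedding up to target_dim and re-normalize.

    Illustrative only -- the actual DenseEmbeddingBackend expansion is
    not visible in this diff.
    """
    reps = -(-target_dim // emb.shape[-1])  # ceil division
    out = np.tile(emb, reps)[:target_dim]
    return out / (np.linalg.norm(out) + 1e-12)

e = np.random.default_rng(1).standard_normal(384).astype(np.float32)
assert expand_to_target(e).shape == (768,)
```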