Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-05 01:50:27 +08:00
feat: Implement cascade indexing command and benchmark script for performance evaluation
402  codex-lens/benchmarks/cascade_benchmark.py  Normal file
@@ -0,0 +1,402 @@
#!/usr/bin/env python
"""Benchmark script for comparing cascade search strategies.

Compares:
- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
- hybrid: FTS+SPLADE+Vector coarse ranking + CrossEncoder fine ranking

Usage:
    python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]
"""

from __future__ import annotations

import argparse
import gc
import json
import statistics
import sys
import time
import traceback
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.config import Config
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper


@dataclass
class BenchmarkResult:
    """Result from a single benchmark run."""
    strategy: str
    query: str
    latency_ms: float
    num_results: int
    top_result: Optional[str]
    error: Optional[str] = None


@dataclass
class BenchmarkSummary:
    """Aggregated benchmark statistics."""
    strategy: str
    total_queries: int
    successful_queries: int
    avg_latency_ms: float
    min_latency_ms: float
    max_latency_ms: float
    p50_latency_ms: float
    p95_latency_ms: float
    p99_latency_ms: float
    avg_results: float
    errors: List[str]


# Default test queries covering different scenarios
DEFAULT_QUERIES = [
    # Code patterns
    "def search",
    "class Engine",
    "import numpy",
    "async def",
    "raise ValueError",
    # Semantic queries
    "how to parse json",
    "database connection",
    "error handling",
    "authentication logic",
    "file read write",
    # Technical terms
    "embedding vector",
    "cosine similarity",
    "binary quantization",
    "hamming distance",
    "reranking",
]


def percentile(data: List[float], p: float) -> float:
    """Return the p-th percentile of data using linear interpolation."""
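    # Worked example (editor's note): for p=95 over 15 samples,
    # k = (15 - 1) * 0.95 = 13.3, so f = 13, c = 14, and the result
    # interpolates 30% of the way from sorted_data[13] to sorted_data[14].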
    if not data:
        return 0.0
    sorted_data = sorted(data)
    k = (len(sorted_data) - 1) * (p / 100)
    f = int(k)
    c = f + 1 if f + 1 < len(sorted_data) else f
    return sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f])


def run_single_benchmark(
|
||||
engine: ChainSearchEngine,
|
||||
query: str,
|
||||
source_path: Path,
|
||||
strategy: str,
|
||||
options: Optional[SearchOptions] = None,
|
||||
) -> BenchmarkResult:
|
||||
"""Run a single benchmark query."""
|
||||
gc.collect()
|
||||
|
||||
start_time = time.perf_counter()
|
||||
try:
|
||||
result = engine.cascade_search(
|
||||
query=query,
|
||||
source_path=source_path,
|
||||
k=10,
|
||||
coarse_k=100,
|
||||
options=options,
|
||||
strategy=strategy,
|
||||
)
|
||||
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||||
|
||||
top_result = None
|
||||
if result.results:
|
||||
r = result.results[0]
|
||||
line = r.start_line or 0
|
||||
top_result = f"{r.path}:{line}"
|
||||
|
||||
return BenchmarkResult(
|
||||
strategy=strategy,
|
||||
query=query,
|
||||
latency_ms=elapsed_ms,
|
||||
num_results=len(result.results),
|
||||
top_result=top_result,
|
||||
)
|
||||
except Exception as e:
|
||||
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||||
return BenchmarkResult(
|
||||
strategy=strategy,
|
||||
query=query,
|
||||
latency_ms=elapsed_ms,
|
||||
num_results=0,
|
||||
top_result=None,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
|
||||
def run_benchmarks(
|
||||
source_path: Path,
|
||||
queries: List[str],
|
||||
strategies: List[str],
|
||||
warmup_runs: int = 2,
|
||||
options: Optional[SearchOptions] = None,
|
||||
) -> Dict[str, List[BenchmarkResult]]:
|
||||
"""Run benchmarks for all queries and strategies."""
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Cascade Search Benchmark")
|
||||
print(f"{'='*60}")
|
||||
print(f"Source: {source_path}")
|
||||
print(f"Queries: {len(queries)}")
|
||||
print(f"Strategies: {strategies}")
|
||||
print(f"Warmup runs: {warmup_runs}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
# Initialize engine
|
||||
config = Config()
|
||||
registry = RegistryStore() # Uses default path
|
||||
registry.initialize()
|
||||
mapper = PathMapper() # Uses default path
|
||||
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
|
||||
|
||||
results: Dict[str, List[BenchmarkResult]] = {s: [] for s in strategies}
|
||||
|
||||
# Warmup phase
|
||||
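    # (Warmup runs load models and warm caches so the timed runs below
    # measure steady-state latency rather than first-call overhead.)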
    if warmup_runs > 0:
        print(f"Running {warmup_runs} warmup queries...")
        warmup_query = queries[0] if queries else "test"
        for strategy in strategies:
            for _ in range(warmup_runs):
                try:
                    run_single_benchmark(engine, warmup_query, source_path, strategy, options)
                except Exception:
                    pass
        print("Warmup complete.\n")

    # Benchmark phase
    total_runs = len(queries) * len(strategies)
    current_run = 0

    for query in queries:
        for strategy in strategies:
            current_run += 1
            print(f"[{current_run}/{total_runs}] {strategy}: '{query[:40]}...' ", end="", flush=True)

            result = run_single_benchmark(engine, query, source_path, strategy, options)
            results[strategy].append(result)

            if result.error:
                print(f"ERROR: {result.error[:50]}")
            else:
                print(f"{result.latency_ms:.1f}ms, {result.num_results} results")

    return results


def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]:
    """Generate summary statistics for each strategy."""
    summaries = {}

    for strategy, benchmark_results in results.items():
        latencies = [r.latency_ms for r in benchmark_results if r.error is None]
        result_counts = [r.num_results for r in benchmark_results if r.error is None]
        errors = [r.error for r in benchmark_results if r.error is not None]

        if latencies:
            summary = BenchmarkSummary(
                strategy=strategy,
                total_queries=len(benchmark_results),
                successful_queries=len(latencies),
                avg_latency_ms=statistics.mean(latencies),
                min_latency_ms=min(latencies),
                max_latency_ms=max(latencies),
                p50_latency_ms=percentile(latencies, 50),
                p95_latency_ms=percentile(latencies, 95),
                p99_latency_ms=percentile(latencies, 99),
                avg_results=statistics.mean(result_counts) if result_counts else 0,
                errors=errors,
            )
        else:
            summary = BenchmarkSummary(
                strategy=strategy,
                total_queries=len(benchmark_results),
                successful_queries=0,
                avg_latency_ms=0,
                min_latency_ms=0,
                max_latency_ms=0,
                p50_latency_ms=0,
                p95_latency_ms=0,
                p99_latency_ms=0,
                avg_results=0,
                errors=errors,
            )

        summaries[strategy] = summary

    return summaries


def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None:
    """Print formatted comparison table."""
    print(f"\n{'='*80}")
    print("BENCHMARK RESULTS COMPARISON")
    print(f"{'='*80}\n")

    # Header
    print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}")
    print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}")

    binary = summaries.get("binary")
    hybrid = summaries.get("hybrid")

    if not binary or not hybrid:
        print("Missing results for comparison")
        return

    metrics = [
        ("Total Queries", binary.total_queries, hybrid.total_queries),
        ("Successful", binary.successful_queries, hybrid.successful_queries),
        ("Avg Latency (ms)", binary.avg_latency_ms, hybrid.avg_latency_ms),
        ("Min Latency (ms)", binary.min_latency_ms, hybrid.min_latency_ms),
        ("Max Latency (ms)", binary.max_latency_ms, hybrid.max_latency_ms),
        ("P50 Latency (ms)", binary.p50_latency_ms, hybrid.p50_latency_ms),
        ("P95 Latency (ms)", binary.p95_latency_ms, hybrid.p95_latency_ms),
        ("P99 Latency (ms)", binary.p99_latency_ms, hybrid.p99_latency_ms),
        ("Avg Results", binary.avg_results, hybrid.avg_results),
    ]

    for name, b_val, h_val in metrics:
        if isinstance(b_val, float):
            diff = b_val - h_val
            diff_str = f"{diff:+.2f}" if diff != 0 else "0.00"
            speedup = h_val / b_val if b_val > 0 else 0
            if "Latency" in name and speedup > 1:
                diff_str += f" (binary {speedup:.1f}x faster)"
            print(f"{name:<25} {b_val:>15.2f} {h_val:>15.2f} {diff_str:>15}")
        else:
            diff = b_val - h_val
            print(f"{name:<25} {b_val:>15} {h_val:>15} {diff:>+15}")

    # Errors
    print(f"\n{'Errors:':<25}")
    print(f"  Binary: {len(binary.errors)}")
    for err in binary.errors[:3]:
        print(f"    - {err[:60]}...")
    print(f"  Hybrid: {len(hybrid.errors)}")
    for err in hybrid.errors[:3]:
        print(f"    - {err[:60]}...")

    # Winner
    print(f"\n{'='*80}")
    if binary.avg_latency_ms < hybrid.avg_latency_ms and binary.successful_queries > 0:
        speedup = hybrid.avg_latency_ms / binary.avg_latency_ms
        print(f"[WINNER] Binary ({speedup:.2f}x faster average latency)")
    elif hybrid.avg_latency_ms < binary.avg_latency_ms and hybrid.successful_queries > 0:
        speedup = binary.avg_latency_ms / hybrid.avg_latency_ms
        print(f"[WINNER] Hybrid ({speedup:.2f}x faster average latency)")
    else:
        print("No clear winner (check errors)")
    print(f"{'='*80}\n")


def save_results(
    results: Dict[str, List[BenchmarkResult]],
    summaries: Dict[str, BenchmarkSummary],
    output_path: Path,
) -> None:
    """Save benchmark results to JSON file."""
    data = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "summaries": {k: asdict(v) for k, v in summaries.items()},
        "details": {
            k: [asdict(r) for r in v]
            for k, v in results.items()
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    print(f"Results saved to: {output_path}")


def main():
    parser = argparse.ArgumentParser(description="Benchmark cascade search strategies")
    parser.add_argument(
        "--source", "-s",
        type=Path,
        default=Path(__file__).parent.parent / "src",
        help="Source directory to search (default: ./src)",
    )
    parser.add_argument(
        "--queries", "-q",
        type=int,
        default=len(DEFAULT_QUERIES),
        help=f"Number of queries to run (default: {len(DEFAULT_QUERIES)})",
    )
    parser.add_argument(
        "--warmup", "-w",
        type=int,
        default=2,
        help="Number of warmup runs (default: 2)",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=Path(__file__).parent / "results" / "cascade_benchmark.json",
        help="Output file for results (default: benchmarks/results/cascade_benchmark.json)",
    )
    parser.add_argument(
        "--strategies",
        nargs="+",
        default=["binary", "hybrid"],
        choices=["binary", "hybrid"],
        help="Strategies to benchmark (default: both)",
    )

    args = parser.parse_args()

    # Validate source path
    if not args.source.exists():
        print(f"Error: Source path does not exist: {args.source}")
        sys.exit(1)

    # Select queries
    queries = DEFAULT_QUERIES[:args.queries]

    # Run benchmarks
    try:
        results = run_benchmarks(
            source_path=args.source,
            queries=queries,
            strategies=args.strategies,
            warmup_runs=args.warmup,
        )

        # Generate summaries
        summaries = summarize_results(results)

        # Print comparison
        print_comparison_table(summaries)

        # Save results
        save_results(results, summaries, args.output)

    except KeyboardInterrupt:
        print("\nBenchmark interrupted.")
        sys.exit(1)
    except Exception as e:
        print(f"\nBenchmark failed: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
277  codex-lens/benchmarks/results/cascade_benchmark.json  Normal file
@@ -0,0 +1,277 @@
{
  "timestamp": "2026-01-02 11:22:34",
  "summaries": {
    "binary": {
      "strategy": "binary",
      "total_queries": 15,
      "successful_queries": 15,
      "avg_latency_ms": 850.328753333209,
      "min_latency_ms": 750.9617999967304,
      "max_latency_ms": 1015.733200001705,
      "p50_latency_ms": 847.9711999971187,
      "p95_latency_ms": 976.768470002571,
      "p99_latency_ms": 1007.9402540018782,
      "avg_results": 0,
      "errors": []
    },
    "hybrid": {
      "strategy": "hybrid",
      "total_queries": 15,
      "successful_queries": 15,
      "avg_latency_ms": 821.3745733330143,
      "min_latency_ms": 720.5589000004693,
      "max_latency_ms": 943.0299999949057,
      "p50_latency_ms": 819.5875000019441,
      "p95_latency_ms": 916.3381599981221,
      "p99_latency_ms": 937.691631995549,
      "avg_results": 0,
      "errors": []
    }
  },
  "details": {
    "binary": [
      {
        "strategy": "binary",
        "query": "def search",
        "latency_ms": 862.7266999974381,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "class Engine",
        "latency_ms": 773.8472999990336,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "import numpy",
        "latency_ms": 858.1023000006098,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "async def",
        "latency_ms": 877.2815999982413,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "raise ValueError",
        "latency_ms": 824.3320999972639,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "how to parse json",
        "latency_ms": 948.0362000031164,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "database connection",
        "latency_ms": 789.3126000053599,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "error handling",
        "latency_ms": 960.0693000029423,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "authentication logic",
        "latency_ms": 757.247900000948,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "file read write",
        "latency_ms": 750.9617999967304,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "embedding vector",
        "latency_ms": 871.1426000008942,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "cosine similarity",
        "latency_ms": 817.1380999992834,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "binary quantization",
        "latency_ms": 1015.733200001705,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "hamming distance",
        "latency_ms": 847.9711999971187,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "binary",
        "query": "reranking",
        "latency_ms": 801.028399997449,
        "num_results": 0,
        "top_result": null,
        "error": null
      }
    ],
    "hybrid": [
      {
        "strategy": "hybrid",
        "query": "def search",
        "latency_ms": 720.5589000004693,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "class Engine",
        "latency_ms": 792.9914000051212,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "import numpy",
        "latency_ms": 943.0299999949057,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "async def",
        "latency_ms": 819.5875000019441,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "raise ValueError",
        "latency_ms": 835.5114000005415,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "how to parse json",
        "latency_ms": 867.8118999960134,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "database connection",
        "latency_ms": 824.6361999990768,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "error handling",
        "latency_ms": 742.638600000646,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "authentication logic",
        "latency_ms": 840.4286999939359,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "file read write",
        "latency_ms": 810.9049000049708,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "embedding vector",
        "latency_ms": 876.5335000061896,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "cosine similarity",
        "latency_ms": 797.3090999948909,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "binary quantization",
        "latency_ms": 767.9803999999422,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "hamming distance",
        "latency_ms": 775.7972999970661,
        "num_results": 0,
        "top_result": null,
        "error": null
      },
      {
        "strategy": "hybrid",
        "query": "reranking",
        "latency_ms": 904.8987999995006,
        "num_results": 0,
        "top_result": null,
        "error": null
      }
    ]
  }
}
@@ -7,7 +7,7 @@ import logging
 import os
 import shutil
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Annotated, Any, Dict, Iterable, List, Optional
 
 import typer
 from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
@@ -2721,3 +2721,305 @@ def _display_index_result(result) -> None:
            console.print(f"  [red]Error:[/red] {error}")
        if len(result.errors) > 3:
            console.print(f"    [dim]... and {len(result.errors) - 3} more errors[/dim]")


# ==================== Cascade Index Commands ====================


def get_binary_index_path(db_path: Path) -> Path:
    """Get the path for binary ANN index file.

    Args:
        db_path: Path to the _index.db file

    Returns:
        Path to the binary index file (_index_binary.bin)
    """
    return db_path.parent / f"{db_path.stem}_binary.bin"


@app.command("cascade-index")
def cascade_index(
    path: Annotated[Path, typer.Argument(help="Directory to index")],
    force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False,
    batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32,
    json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False,
) -> None:
    """Generate cascade embeddings (binary + dense) for two-stage retrieval.

    Cascade retrieval uses a two-stage approach:
    1. Binary search (fast, 32 bytes/vector) -> coarse filtering
    2. Dense rerank (precise, 8KB/vector) -> final results

    This command:
    - Finds all _index.db files in the directory
    - Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk
    - Stores embeddings in the database (embedding_binary, embedding_dense columns)
    - Creates a BinaryANNIndex file for fast coarse retrieval

    Examples:
        codexlens cascade-index ~/projects/my-app
        codexlens cascade-index . --force
        codexlens cascade-index . --batch-size 64 --verbose
    """
    _configure_logging(verbose, json_mode)

    target_path = path.expanduser().resolve()

    # Find index database(s)
    if target_path.is_file() and target_path.name == "_index.db":
        index_dbs = [target_path]
    elif target_path.is_dir():
        # Check local .codexlens/_index.db first
        local_index = target_path / ".codexlens" / "_index.db"
        if local_index.exists():
            index_dbs = [local_index]
        else:
            # Find via registry
            registry = RegistryStore()
            try:
                registry.initialize()
                mapper = PathMapper()
                index_db = mapper.source_to_index_db(target_path)
                if not index_db.exists():
                    if json_mode:
                        print_json(success=False, error=f"No index found for {target_path}")
                    else:
                        console.print(f"[red]Error:[/red] No index found for {target_path}")
                        console.print("Run 'codexlens init' first to create an index")
                    raise typer.Exit(code=1)
                # Find all _index.db files under the index root
                index_root = index_db.parent
                index_dbs = list(index_root.rglob("_index.db"))
            finally:
                registry.close()
    else:
        if json_mode:
            print_json(success=False, error="Path must be _index.db file or indexed directory")
        else:
            console.print("[red]Error:[/red] Path must be _index.db file or indexed directory")
        raise typer.Exit(code=1)

    if not index_dbs:
        if json_mode:
            print_json(success=False, error="No index databases found")
        else:
            console.print("[yellow]No index databases found[/yellow]")
        raise typer.Exit(code=1)

    # Import cascade embedding dependencies (numpy hoisted out of the batch loop)
    try:
        import numpy as np

        from codexlens.indexing.embedding import CascadeEmbeddingBackend, pack_binary_embedding
        from codexlens.semantic.ann_index import BinaryANNIndex
    except ImportError as e:
        error_msg = f"Cascade embedding dependencies not available: {e}"
        if json_mode:
            print_json(success=False, error=error_msg)
        else:
            console.print(f"[red]Error:[/red] {error_msg}")
            console.print("[dim]Install with: pip install codexlens[semantic][/dim]")
        raise typer.Exit(code=1)

    if not json_mode:
        console.print("[bold]Generating cascade embeddings[/bold]")
        console.print(f"Path: [dim]{target_path}[/dim]")
        console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]")
        console.print(f"Batch size: [cyan]{batch_size}[/cyan]")
        console.print()

    # Initialize cascade embedding backend
    try:
        cascade_backend = CascadeEmbeddingBackend()
    except Exception as e:
        error_msg = f"Failed to initialize cascade embedding backend: {e}"
        if json_mode:
            print_json(success=False, error=error_msg)
        else:
            console.print(f"[red]Error:[/red] {error_msg}")
        raise typer.Exit(code=1)

    # Process statistics
    total_chunks_processed = 0
    total_indexes_processed = 0
    total_indexes_successful = 0
    total_binary_indexes_created = 0
    errors_list: List[str] = []

    # Process each index database
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn("({task.completed}/{task.total})"),
        TimeElapsedColumn(),
        console=console,
        disable=json_mode,
    ) as progress:
        db_task = progress.add_task("Processing indexes...", total=len(index_dbs))

        for db_path in index_dbs:
            total_indexes_processed += 1
            index_name = db_path.parent.name

            try:
                # Open the index store
                store = DirIndexStore(db_path)
                store.initialize()

                # Get connection for direct queries
                conn = store._get_connection()

                # Ensure cascade columns exist in semantic_chunks table
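                # (SQLite's ALTER TABLE has no IF NOT EXISTS for columns, so each
                # ALTER is attempted and the "duplicate column" error is swallowed.)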
                try:
                    conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB")
                except Exception:
                    pass  # Column already exists
                try:
                    conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB")
                except Exception:
                    pass  # Column already exists
                conn.commit()

                # Check if semantic_chunks table exists and has data
                try:
                    cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
                    chunk_count = cursor.fetchone()[0]
                except Exception:
                    # semantic_chunks table doesn't exist or is empty
                    chunk_count = 0

                if chunk_count == 0:
                    if verbose and not json_mode:
                        console.print(f"  [dim]Skipping {index_name}: no chunks found[/dim]")
                    progress.advance(db_task)
                    store.close()
                    continue

                # Check if embeddings already exist (unless force)
                if not force:
                    cursor = conn.execute(
                        "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
                    )
                    existing_count = cursor.fetchone()[0]
                    if existing_count > 0:
                        if verbose and not json_mode:
                            console.print(f"  [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]")
                        progress.advance(db_task)
                        store.close()
                        continue

                # If force, clear existing cascade embeddings
                if force:
                    conn.execute(
                        "UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL"
                    )
                    conn.commit()

                # Get all chunks
                cursor = conn.execute("SELECT id, content FROM semantic_chunks")
                chunks = cursor.fetchall()

                if not chunks:
                    progress.advance(db_task)
                    store.close()
                    continue

                if verbose and not json_mode:
                    console.print(f"  Processing {index_name}: {len(chunks)} chunks")

                # Process in batches
                chunk_task = progress.add_task(
                    f"  {index_name}", total=len(chunks)
                )

                # Prepare for BinaryANNIndex
                binary_index_path = get_binary_index_path(db_path)
                binary_ann_index = BinaryANNIndex(db_path, dim=256)

                for i in range(0, len(chunks), batch_size):
                    batch_chunks = chunks[i:i + batch_size]
                    batch_ids = [c[0] for c in batch_chunks]
                    batch_contents = [c[1] for c in batch_chunks]

                    # Generate cascade embeddings
                    binary_embeddings, dense_embeddings = cascade_backend.encode_cascade(
                        batch_contents, batch_size=batch_size
                    )

                    # Pack binary embeddings and convert dense to bytes
                    packed_binaries = []
                    dense_bytes_list = []

                    for j in range(len(batch_ids)):
                        # Pack binary embedding (256 bits -> 32 bytes)
                        packed_binary = pack_binary_embedding(binary_embeddings[j])
                        packed_binaries.append(packed_binary)

                        # Convert dense embedding to bytes
                        dense_blob = dense_embeddings[j].astype(np.float32).tobytes()
                        dense_bytes_list.append(dense_blob)

                    # Update database
                    for j, chunk_id in enumerate(batch_ids):
                        conn.execute(
                            """
                            UPDATE semantic_chunks
                            SET embedding_binary = ?, embedding_dense = ?
                            WHERE id = ?
                            """,
                            (packed_binaries[j], dense_bytes_list[j], chunk_id)
                        )

                    # Add to binary ANN index
                    binary_ann_index.add_vectors(batch_ids, packed_binaries)

                    conn.commit()
                    total_chunks_processed += len(batch_ids)
                    progress.advance(chunk_task, len(batch_ids))

                # Save binary ANN index
                binary_ann_index.save()
                total_binary_indexes_created += 1

                progress.remove_task(chunk_task)
                store.close()
                total_indexes_successful += 1

            except Exception as e:
                error_msg = f"{index_name}: {e}"
                errors_list.append(error_msg)
                if verbose and not json_mode:
                    console.print(f"  [red]Error processing {index_name}:[/red] {e}")

            progress.advance(db_task)

    # Build result
    result = {
        "path": str(target_path),
        "indexes_processed": total_indexes_processed,
        "indexes_successful": total_indexes_successful,
        "chunks_processed": total_chunks_processed,
        "binary_indexes_created": total_binary_indexes_created,
        "errors": len(errors_list),
        "error_details": errors_list[:5] if errors_list else [],
    }

    if json_mode:
        print_json(success=True, result=result)
    else:
        console.print("\n[green]Cascade indexing complete[/green]")
        console.print(f"  Indexes processed: {total_indexes_processed}")
        console.print(f"  Indexes successful: {total_indexes_successful}")
        console.print(f"  Chunks processed: {total_chunks_processed:,}")
        console.print(f"  Binary indexes created: {total_binary_indexes_created}")
        if errors_list:
            console.print(f"  [yellow]Errors: {len(errors_list)}[/yellow]")
            for err in errors_list[:3]:
                console.print(f"    [dim]{err}[/dim]")
            if len(errors_list) > 3:
                console.print(f"    [dim]... and {len(errors_list) - 3} more[/dim]")

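For illustration, the two-stage flow that cascade-index prepares can be sketched with plain numpy. This is a minimal sketch, not code from this commit: pack_binary_sketch, hamming_topk, and dense_rerank are hypothetical stand-ins for the real pack_binary_embedding / BinaryANNIndex / cascade_search machinery.

import numpy as np

def pack_binary_sketch(embedding: np.ndarray) -> np.ndarray:
    # Binarize by sign, then pack 8 bits per byte: 256 dims -> 32 uint8 bytes.
    return np.packbits((embedding > 0).astype(np.uint8))

def hamming_topk(query_packed: np.ndarray, db_packed: np.ndarray, k: int) -> np.ndarray:
    # Stage 1 (coarse): XOR then popcount gives the Hamming distance from the
    # query to every stored vector; keep the k closest row indices.
    dists = np.unpackbits(query_packed ^ db_packed, axis=1).sum(axis=1)
    return np.argsort(dists)[:k]

def dense_rerank(query_dense: np.ndarray, candidates: np.ndarray,
                 db_dense: np.ndarray, k: int) -> np.ndarray:
    # Stage 2 (fine): cosine similarity over float32 vectors, candidates only.
    cand = db_dense[candidates]
    sims = cand @ query_dense / (
        np.linalg.norm(cand, axis=1) * np.linalg.norm(query_dense) + 1e-9
    )
    return candidates[np.argsort(-sims)[:k]]

# coarse_ids = hamming_topk(q_bin, all_bin, k=100)  # cheap scan, 32 B/vector
# final_ids = dense_rerank(q_dense, coarse_ids, all_dense, k=10)

Because the coarse pass scans only 32-byte codes and the fine pass touches just the ~100 surviving rows of 8 KB dense vectors, most of the corpus is never read at full precision.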
@@ -265,8 +265,8 @@ class DenseEmbeddingBackend(BaseEmbedder):
     Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion
     """
 
-    DEFAULT_MODEL = "BAAI/bge-large-en-v1.5"  # 1024 dim, high quality
-    TARGET_DIM = 2048
+    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"  # 384 dim, use small for testing
+    TARGET_DIM = 768  # Reduced target for faster testing
 
     def __init__(
         self,
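The "optional expansion" mentioned in the docstring (a 384-dim bge-small output versus the 768-dim TARGET_DIM) is not shown in this hunk. One plausible scheme, purely an assumption here, is tiling plus renormalization; expand_embedding below is a hypothetical sketch, not the backend's actual method:

import numpy as np

def expand_embedding(vec: np.ndarray, target_dim: int) -> np.ndarray:
    # Repeat the base vector until it covers target_dim, then L2-renormalize.
    # Tiling a 384-dim bge-small vector twice yields 768 dims and preserves
    # cosine similarities exactly when target_dim is an integer multiple.
    reps = -(-target_dim // vec.shape[0])  # ceiling division
    out = np.tile(vec, reps)[:target_dim]
    return out / (np.linalg.norm(out) + 1e-9)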