mirror of https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00

feat: Implement cascade indexing command and benchmark script for performance evaluation

codex-lens/benchmarks/cascade_benchmark.py (new file, 402 lines)
@@ -0,0 +1,402 @@
#!/usr/bin/env python
"""Benchmark script for comparing cascade search strategies.

Compares:
- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
- hybrid: FTS+SPLADE+Vector coarse ranking + CrossEncoder fine ranking

Usage:
    python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]
"""

from __future__ import annotations

import argparse
import gc
import json
import os
import statistics
import sys
import time
import traceback
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Dict, Any

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.config import Config
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper


@dataclass
class BenchmarkResult:
    """Result from a single benchmark run."""
    strategy: str
    query: str
    latency_ms: float
    num_results: int
    top_result: Optional[str]
    error: Optional[str] = None


@dataclass
class BenchmarkSummary:
    """Aggregated benchmark statistics."""
    strategy: str
    total_queries: int
    successful_queries: int
    avg_latency_ms: float
    min_latency_ms: float
    max_latency_ms: float
    p50_latency_ms: float
    p95_latency_ms: float
    p99_latency_ms: float
    avg_results: float
    errors: List[str]


# Default test queries covering different scenarios
DEFAULT_QUERIES = [
    # Code patterns
    "def search",
    "class Engine",
    "import numpy",
    "async def",
    "raise ValueError",
    # Semantic queries
    "how to parse json",
    "database connection",
    "error handling",
    "authentication logic",
    "file read write",
    # Technical terms
    "embedding vector",
    "cosine similarity",
    "binary quantization",
    "hamming distance",
    "reranking",
]


def percentile(data: List[float], p: float) -> float:
    """Calculate percentile of sorted data."""
    if not data:
        return 0.0
    sorted_data = sorted(data)
    k = (len(sorted_data) - 1) * (p / 100)
    f = int(k)
    c = f + 1 if f + 1 < len(sorted_data) else f
    return sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f])


def run_single_benchmark(
    engine: ChainSearchEngine,
    query: str,
    source_path: Path,
    strategy: str,
    options: Optional[SearchOptions] = None,
) -> BenchmarkResult:
    """Run a single benchmark query."""
    gc.collect()

    start_time = time.perf_counter()
    try:
        result = engine.cascade_search(
            query=query,
            source_path=source_path,
            k=10,
            coarse_k=100,
            options=options,
            strategy=strategy,
        )
        elapsed_ms = (time.perf_counter() - start_time) * 1000

        top_result = None
        if result.results:
            r = result.results[0]
            line = r.start_line or 0
            top_result = f"{r.path}:{line}"

        return BenchmarkResult(
            strategy=strategy,
            query=query,
            latency_ms=elapsed_ms,
            num_results=len(result.results),
            top_result=top_result,
        )
    except Exception as e:
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        return BenchmarkResult(
            strategy=strategy,
            query=query,
            latency_ms=elapsed_ms,
            num_results=0,
            top_result=None,
            error=str(e),
        )


def run_benchmarks(
    source_path: Path,
    queries: List[str],
    strategies: List[str],
    warmup_runs: int = 2,
    options: Optional[SearchOptions] = None,
) -> Dict[str, List[BenchmarkResult]]:
    """Run benchmarks for all queries and strategies."""

    print(f"\n{'='*60}")
    print("Cascade Search Benchmark")
    print(f"{'='*60}")
    print(f"Source: {source_path}")
    print(f"Queries: {len(queries)}")
    print(f"Strategies: {strategies}")
    print(f"Warmup runs: {warmup_runs}")
    print(f"{'='*60}\n")

    # Initialize engine
    config = Config()
    registry = RegistryStore()  # Uses default path
    registry.initialize()
    mapper = PathMapper()  # Uses default path
    engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)

    results: Dict[str, List[BenchmarkResult]] = {s: [] for s in strategies}

    # Warmup phase
    if warmup_runs > 0:
        print(f"Running {warmup_runs} warmup queries...")
        warmup_query = queries[0] if queries else "test"
        for strategy in strategies:
            for _ in range(warmup_runs):
                try:
                    run_single_benchmark(engine, warmup_query, source_path, strategy, options)
                except Exception:
                    pass
        print("Warmup complete.\n")

    # Benchmark phase
    total_runs = len(queries) * len(strategies)
    current_run = 0

    for query in queries:
        for strategy in strategies:
            current_run += 1
            print(f"[{current_run}/{total_runs}] {strategy}: '{query[:40]}...' ", end="", flush=True)

            result = run_single_benchmark(engine, query, source_path, strategy, options)
            results[strategy].append(result)

            if result.error:
                print(f"ERROR: {result.error[:50]}")
            else:
                print(f"{result.latency_ms:.1f}ms, {result.num_results} results")

    return results


def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]:
    """Generate summary statistics for each strategy."""
    summaries = {}

    for strategy, benchmark_results in results.items():
        latencies = [r.latency_ms for r in benchmark_results if r.error is None]
        result_counts = [r.num_results for r in benchmark_results if r.error is None]
        errors = [r.error for r in benchmark_results if r.error is not None]

        if latencies:
            summary = BenchmarkSummary(
                strategy=strategy,
                total_queries=len(benchmark_results),
                successful_queries=len(latencies),
                avg_latency_ms=statistics.mean(latencies),
                min_latency_ms=min(latencies),
                max_latency_ms=max(latencies),
                p50_latency_ms=percentile(latencies, 50),
                p95_latency_ms=percentile(latencies, 95),
                p99_latency_ms=percentile(latencies, 99),
                avg_results=statistics.mean(result_counts) if result_counts else 0,
                errors=errors,
            )
        else:
            summary = BenchmarkSummary(
                strategy=strategy,
                total_queries=len(benchmark_results),
                successful_queries=0,
                avg_latency_ms=0,
                min_latency_ms=0,
                max_latency_ms=0,
                p50_latency_ms=0,
                p95_latency_ms=0,
                p99_latency_ms=0,
                avg_results=0,
                errors=errors,
            )

        summaries[strategy] = summary

    return summaries


def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None:
    """Print formatted comparison table."""
    print(f"\n{'='*80}")
    print("BENCHMARK RESULTS COMPARISON")
    print(f"{'='*80}\n")

    # Header
    print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}")
    print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}")

    binary = summaries.get("binary")
    hybrid = summaries.get("hybrid")

    if not binary or not hybrid:
        print("Missing results for comparison")
        return

    metrics = [
        ("Total Queries", binary.total_queries, hybrid.total_queries),
        ("Successful", binary.successful_queries, hybrid.successful_queries),
        ("Avg Latency (ms)", binary.avg_latency_ms, hybrid.avg_latency_ms),
        ("Min Latency (ms)", binary.min_latency_ms, hybrid.min_latency_ms),
        ("Max Latency (ms)", binary.max_latency_ms, hybrid.max_latency_ms),
        ("P50 Latency (ms)", binary.p50_latency_ms, hybrid.p50_latency_ms),
        ("P95 Latency (ms)", binary.p95_latency_ms, hybrid.p95_latency_ms),
        ("P99 Latency (ms)", binary.p99_latency_ms, hybrid.p99_latency_ms),
        ("Avg Results", binary.avg_results, hybrid.avg_results),
    ]

    for name, b_val, h_val in metrics:
        if isinstance(b_val, float):
            diff = b_val - h_val
            diff_str = f"{diff:+.2f}" if diff != 0 else "0.00"
            speedup = h_val / b_val if b_val > 0 else 0
            if "Latency" in name and speedup > 1:
                diff_str += f" ({speedup:.1f}x faster)"
            print(f"{name:<25} {b_val:>15.2f} {h_val:>15.2f} {diff_str:>15}")
        else:
            diff = b_val - h_val
            print(f"{name:<25} {b_val:>15} {h_val:>15} {diff:>+15}")

    # Errors
    print(f"\n{'Errors:':<25}")
    print(f"  Binary: {len(binary.errors)}")
    for err in binary.errors[:3]:
        print(f"    - {err[:60]}...")
    print(f"  Hybrid: {len(hybrid.errors)}")
    for err in hybrid.errors[:3]:
        print(f"    - {err[:60]}...")

    # Winner
    print(f"\n{'='*80}")
    if binary.avg_latency_ms < hybrid.avg_latency_ms and binary.successful_queries > 0:
        speedup = hybrid.avg_latency_ms / binary.avg_latency_ms
        print(f"[WINNER] Binary ({speedup:.2f}x faster average latency)")
    elif hybrid.avg_latency_ms < binary.avg_latency_ms and hybrid.successful_queries > 0:
        speedup = binary.avg_latency_ms / hybrid.avg_latency_ms
        print(f"[WINNER] Hybrid ({speedup:.2f}x faster average latency)")
    else:
        print("No clear winner (check errors)")
    print(f"{'='*80}\n")


def save_results(
    results: Dict[str, List[BenchmarkResult]],
    summaries: Dict[str, BenchmarkSummary],
    output_path: Path,
) -> None:
    """Save benchmark results to JSON file."""
    data = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "summaries": {k: asdict(v) for k, v in summaries.items()},
        "details": {
            k: [asdict(r) for r in v]
            for k, v in results.items()
        },
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    print(f"Results saved to: {output_path}")


def main():
    parser = argparse.ArgumentParser(description="Benchmark cascade search strategies")
    parser.add_argument(
        "--source", "-s",
        type=Path,
        default=Path(__file__).parent.parent / "src",
        help="Source directory to search (default: ./src)",
    )
    parser.add_argument(
        "--queries", "-q",
        type=int,
        default=len(DEFAULT_QUERIES),
        help=f"Number of queries to run (default: {len(DEFAULT_QUERIES)})",
    )
    parser.add_argument(
        "--warmup", "-w",
        type=int,
        default=2,
        help="Number of warmup runs (default: 2)",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=Path(__file__).parent / "results" / "cascade_benchmark.json",
        help="Output file for results (default: benchmarks/results/cascade_benchmark.json)",
    )
    parser.add_argument(
        "--strategies",
        nargs="+",
        default=["binary", "hybrid"],
        choices=["binary", "hybrid"],
        help="Strategies to benchmark (default: both)",
    )

    args = parser.parse_args()

    # Validate source path
    if not args.source.exists():
        print(f"Error: Source path does not exist: {args.source}")
        sys.exit(1)

    # Select queries
    queries = DEFAULT_QUERIES[:args.queries]

    # Run benchmarks
    try:
        results = run_benchmarks(
            source_path=args.source,
            queries=queries,
            strategies=args.strategies,
            warmup_runs=args.warmup,
        )

        # Generate summaries
        summaries = summarize_results(results)

        # Print comparison
        print_comparison_table(summaries)

        # Save results
        save_results(results, summaries, args.output)

    except KeyboardInterrupt:
        print("\nBenchmark interrupted.")
        sys.exit(1)
    except Exception as e:
        print(f"\nBenchmark failed: {e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
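Note: the percentile helper above uses linear interpolation between adjacent sorted samples. A small worked example, illustrative only and not part of the commit:

# Worked example of the linear-interpolation percentile used above:
# for latencies [700, 800, 900, 1000] and p = 95,
# k = (4 - 1) * 0.95 = 2.85, f = 2, c = 3,
# so the value is 900 + 0.85 * (1000 - 900) = 985.0.
sorted_data = [700.0, 800.0, 900.0, 1000.0]
k = (len(sorted_data) - 1) * (95 / 100)
f = int(k)
c = f + 1 if f + 1 < len(sorted_data) else f
value = sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f])
assert abs(value - 985.0) < 1e-9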
codex-lens/benchmarks/results/cascade_benchmark.json (new file, 277 lines)
@@ -0,0 +1,277 @@
{
  "timestamp": "2026-01-02 11:22:34",
  "summaries": {
    "binary": {
      "strategy": "binary",
      "total_queries": 15,
      "successful_queries": 15,
      "avg_latency_ms": 850.328753333209,
      "min_latency_ms": 750.9617999967304,
      "max_latency_ms": 1015.733200001705,
      "p50_latency_ms": 847.9711999971187,
      "p95_latency_ms": 976.768470002571,
      "p99_latency_ms": 1007.9402540018782,
      "avg_results": 0,
      "errors": []
    },
    "hybrid": {
      "strategy": "hybrid",
      "total_queries": 15,
      "successful_queries": 15,
      "avg_latency_ms": 821.3745733330143,
      "min_latency_ms": 720.5589000004693,
      "max_latency_ms": 943.0299999949057,
      "p50_latency_ms": 819.5875000019441,
      "p95_latency_ms": 916.3381599981221,
      "p99_latency_ms": 937.691631995549,
      "avg_results": 0,
      "errors": []
    }
  },
  "details": {
    "binary": [
      {"strategy": "binary", "query": "def search", "latency_ms": 862.7266999974381, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "class Engine", "latency_ms": 773.8472999990336, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "import numpy", "latency_ms": 858.1023000006098, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "async def", "latency_ms": 877.2815999982413, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "raise ValueError", "latency_ms": 824.3320999972639, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "how to parse json", "latency_ms": 948.0362000031164, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "database connection", "latency_ms": 789.3126000053599, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "error handling", "latency_ms": 960.0693000029423, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "authentication logic", "latency_ms": 757.247900000948, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "file read write", "latency_ms": 750.9617999967304, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "embedding vector", "latency_ms": 871.1426000008942, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "cosine similarity", "latency_ms": 817.1380999992834, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "binary quantization", "latency_ms": 1015.733200001705, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "hamming distance", "latency_ms": 847.9711999971187, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "binary", "query": "reranking", "latency_ms": 801.028399997449, "num_results": 0, "top_result": null, "error": null}
    ],
    "hybrid": [
      {"strategy": "hybrid", "query": "def search", "latency_ms": 720.5589000004693, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "class Engine", "latency_ms": 792.9914000051212, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "import numpy", "latency_ms": 943.0299999949057, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "async def", "latency_ms": 819.5875000019441, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "raise ValueError", "latency_ms": 835.5114000005415, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "how to parse json", "latency_ms": 867.8118999960134, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "database connection", "latency_ms": 824.6361999990768, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "error handling", "latency_ms": 742.638600000646, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "authentication logic", "latency_ms": 840.4286999939359, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "file read write", "latency_ms": 810.9049000049708, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "embedding vector", "latency_ms": 876.5335000061896, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "cosine similarity", "latency_ms": 797.3090999948909, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "binary quantization", "latency_ms": 767.9803999999422, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "hamming distance", "latency_ms": 775.7972999970661, "num_results": 0, "top_result": null, "error": null},
      {"strategy": "hybrid", "query": "reranking", "latency_ms": 904.8987999995006, "num_results": 0, "top_result": null, "error": null}
    ]
  }
}
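For reference, a minimal sketch (illustrative only, assuming the layout written by save_results in the benchmark script above) of reading the saved results back and printing the headline numbers:

import json
from pathlib import Path

# Load the committed results file and print per-strategy headline latencies.
data = json.loads(Path("benchmarks/results/cascade_benchmark.json").read_text(encoding="utf-8"))
for name, summary in data["summaries"].items():
    print(f"{name}: avg={summary['avg_latency_ms']:.1f}ms p95={summary['p95_latency_ms']:.1f}ms")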
@@ -7,7 +7,7 @@ import logging
 import os
 import shutil
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Annotated, Any, Dict, Iterable, List, Optional

 import typer
 from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
@@ -2721,3 +2721,305 @@ def _display_index_result(result) -> None:
            console.print(f"    [red]Error:[/red] {error}")
        if len(result.errors) > 3:
            console.print(f"    [dim]... and {len(result.errors) - 3} more errors[/dim]")


# ==================== Cascade Index Commands ====================


def get_binary_index_path(db_path: Path) -> Path:
    """Get the path for binary ANN index file.

    Args:
        db_path: Path to the _index.db file

    Returns:
        Path to the binary index file (_index_binary.bin)
    """
    return db_path.parent / f"{db_path.stem}_binary.bin"


@app.command("cascade-index")
def cascade_index(
    path: Annotated[Path, typer.Argument(help="Directory to index")],
    force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False,
    batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32,
    json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False,
) -> None:
    """Generate cascade embeddings (binary + dense) for two-stage retrieval.

    Cascade retrieval uses a two-stage approach:
    1. Binary search (fast, 32 bytes/vector) -> coarse filtering
    2. Dense rerank (precise, 8KB/vector) -> final results

    This command:
    - Finds all _index.db files in the directory
    - Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk
    - Stores embeddings in the database (embedding_binary, embedding_dense columns)
    - Creates a BinaryANNIndex file for fast coarse retrieval

    Examples:
        codexlens cascade-index ~/projects/my-app
        codexlens cascade-index . --force
        codexlens cascade-index . --batch-size 64 --verbose
    """
    _configure_logging(verbose, json_mode)

    target_path = path.expanduser().resolve()

    # Find index database(s)
    if target_path.is_file() and target_path.name == "_index.db":
        index_dbs = [target_path]
    elif target_path.is_dir():
        # Check local .codexlens/_index.db first
        local_index = target_path / ".codexlens" / "_index.db"
        if local_index.exists():
            index_dbs = [local_index]
        else:
            # Find via registry
            registry = RegistryStore()
            try:
                registry.initialize()
                mapper = PathMapper()
                index_db = mapper.source_to_index_db(target_path)
                if not index_db.exists():
                    if json_mode:
                        print_json(success=False, error=f"No index found for {target_path}")
                    else:
                        console.print(f"[red]Error:[/red] No index found for {target_path}")
                        console.print("Run 'codexlens init' first to create an index")
                    raise typer.Exit(code=1)
                # Find all _index.db files under the index root
                index_root = index_db.parent
                index_dbs = list(index_root.rglob("_index.db"))
            finally:
                registry.close()
    else:
        if json_mode:
            print_json(success=False, error="Path must be _index.db file or indexed directory")
        else:
            console.print("[red]Error:[/red] Path must be _index.db file or indexed directory")
        raise typer.Exit(code=1)

    if not index_dbs:
        if json_mode:
            print_json(success=False, error="No index databases found")
        else:
            console.print("[yellow]No index databases found[/yellow]")
        raise typer.Exit(code=1)

    # Import cascade embedding backend
    try:
        from codexlens.indexing.embedding import CascadeEmbeddingBackend
        from codexlens.semantic.ann_index import BinaryANNIndex
        from codexlens.indexing.embedding import pack_binary_embedding
    except ImportError as e:
        error_msg = f"Cascade embedding dependencies not available: {e}"
        if json_mode:
            print_json(success=False, error=error_msg)
        else:
            console.print(f"[red]Error:[/red] {error_msg}")
            console.print("[dim]Install with: pip install codexlens[semantic][/dim]")
        raise typer.Exit(code=1)

    if not json_mode:
        console.print(f"[bold]Generating cascade embeddings[/bold]")
        console.print(f"Path: [dim]{target_path}[/dim]")
        console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]")
        console.print(f"Batch size: [cyan]{batch_size}[/cyan]")
        console.print()

    # Initialize cascade embedding backend
    try:
        cascade_backend = CascadeEmbeddingBackend()
    except Exception as e:
        error_msg = f"Failed to initialize cascade embedding backend: {e}"
        if json_mode:
            print_json(success=False, error=error_msg)
        else:
            console.print(f"[red]Error:[/red] {error_msg}")
        raise typer.Exit(code=1)

    # Process statistics
    total_chunks_processed = 0
    total_indexes_processed = 0
    total_indexes_successful = 0
    total_binary_indexes_created = 0
    errors_list: List[str] = []

    # Process each index database
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn("({task.completed}/{task.total})"),
        TimeElapsedColumn(),
        console=console,
        disable=json_mode,
    ) as progress:
        db_task = progress.add_task("Processing indexes...", total=len(index_dbs))

        for db_path in index_dbs:
            total_indexes_processed += 1
            index_name = db_path.parent.name

            try:
                # Open the index store
                store = DirIndexStore(db_path)
                store.initialize()

                # Get connection for direct queries
                conn = store._get_connection()

                # Ensure cascade columns exist in semantic_chunks table
                try:
                    conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB")
                except Exception:
                    pass  # Column already exists
                try:
                    conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB")
                except Exception:
                    pass  # Column already exists
                conn.commit()

                # Check if semantic_chunks table exists and has data
                try:
                    cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
                    chunk_count = cursor.fetchone()[0]
                except Exception:
                    # semantic_chunks table doesn't exist or is empty
                    chunk_count = 0

                if chunk_count == 0:
                    if verbose and not json_mode:
                        console.print(f"  [dim]Skipping {index_name}: no chunks found[/dim]")
                    progress.advance(db_task)
                    store.close()
                    continue

                # Check if embeddings already exist (unless force)
                if not force:
                    cursor = conn.execute(
                        "SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
                    )
                    existing_count = cursor.fetchone()[0]
                    if existing_count > 0:
                        if verbose and not json_mode:
                            console.print(f"  [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]")
                        progress.advance(db_task)
                        store.close()
                        continue

                # If force, clear existing cascade embeddings
                if force:
                    conn.execute(
                        "UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL"
                    )
                    conn.commit()

                # Get all chunks
                cursor = conn.execute("SELECT id, content FROM semantic_chunks")
                chunks = cursor.fetchall()

                if not chunks:
                    progress.advance(db_task)
                    store.close()
                    continue

                if verbose and not json_mode:
                    console.print(f"  Processing {index_name}: {len(chunks)} chunks")

                # Process in batches
                chunk_task = progress.add_task(
                    f"  {index_name}", total=len(chunks)
                )

                # Prepare for BinaryANNIndex
                binary_index_path = get_binary_index_path(db_path)
                binary_ann_index = BinaryANNIndex(db_path, dim=256)

                for i in range(0, len(chunks), batch_size):
                    batch_chunks = chunks[i:i + batch_size]
                    batch_ids = [c[0] for c in batch_chunks]
                    batch_contents = [c[1] for c in batch_chunks]

                    # Generate cascade embeddings
                    binary_embeddings, dense_embeddings = cascade_backend.encode_cascade(
                        batch_contents, batch_size=batch_size
                    )

                    # Pack binary embeddings and convert dense to bytes
                    packed_binaries = []
                    dense_bytes_list = []

                    for j in range(len(batch_ids)):
                        # Pack binary embedding (256 bits -> 32 bytes)
                        packed_binary = pack_binary_embedding(binary_embeddings[j])
                        packed_binaries.append(packed_binary)

                        # Convert dense embedding to bytes
                        import numpy as np
                        dense_blob = dense_embeddings[j].astype(np.float32).tobytes()
                        dense_bytes_list.append(dense_blob)

                    # Update database
                    for j, chunk_id in enumerate(batch_ids):
                        conn.execute(
                            """
                            UPDATE semantic_chunks
                            SET embedding_binary = ?, embedding_dense = ?
                            WHERE id = ?
                            """,
                            (packed_binaries[j], dense_bytes_list[j], chunk_id)
                        )

                    # Add to binary ANN index
                    binary_ann_index.add_vectors(batch_ids, packed_binaries)

                    conn.commit()
                    total_chunks_processed += len(batch_ids)
                    progress.advance(chunk_task, len(batch_ids))

                # Save binary ANN index
                binary_ann_index.save()
                total_binary_indexes_created += 1

                progress.remove_task(chunk_task)
                store.close()
                total_indexes_successful += 1

            except Exception as e:
                error_msg = f"{index_name}: {e}"
                errors_list.append(error_msg)
                if verbose and not json_mode:
                    console.print(f"  [red]Error processing {index_name}:[/red] {e}")

            progress.advance(db_task)

    # Build result
    result = {
        "path": str(target_path),
        "indexes_processed": total_indexes_processed,
        "indexes_successful": total_indexes_successful,
        "chunks_processed": total_chunks_processed,
        "binary_indexes_created": total_binary_indexes_created,
        "errors": len(errors_list),
        "error_details": errors_list[:5] if errors_list else [],
    }

    if json_mode:
        print_json(success=True, result=result)
    else:
        console.print(f"\n[green]Cascade indexing complete[/green]")
        console.print(f"  Indexes processed: {total_indexes_processed}")
        console.print(f"  Indexes successful: {total_indexes_successful}")
        console.print(f"  Chunks processed: {total_chunks_processed:,}")
        console.print(f"  Binary indexes created: {total_binary_indexes_created}")
        if errors_list:
            console.print(f"  [yellow]Errors: {len(errors_list)}[/yellow]")
            for err in errors_list[:3]:
                console.print(f"    [dim]{err}[/dim]")
            if len(errors_list) > 3:
                console.print(f"    [dim]... and {len(errors_list) - 3} more[/dim]")
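The cascade-index command above stores 32-byte packed binary vectors plus float32 dense vectors per chunk. The two-stage retrieval its docstring describes (binary coarse filtering, then dense rerank) can be sketched roughly as follows; this is an illustrative numpy sketch under assumed data shapes, not the project's ChainSearchEngine implementation:

import numpy as np

def cascade_rank(query_bin, query_dense, packed_bins, dense_vecs, coarse_k=100, k=10):
    # Stage 1: Hamming distance over packed bits (uint8, 32 bytes per 256-bit vector).
    # XOR, unpack to bits, and count set bits per row.
    dists = np.unpackbits(packed_bins ^ query_bin, axis=1).sum(axis=1)
    coarse = np.argsort(dists)[:coarse_k]
    # Stage 2: cosine similarity on the dense float32 vectors of the survivors.
    cand = dense_vecs[coarse]
    sims = cand @ query_dense / (
        np.linalg.norm(cand, axis=1) * np.linalg.norm(query_dense) + 1e-9
    )
    # Return indices of the top-k chunks in the original arrays.
    return coarse[np.argsort(-sims)[:k]]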
@@ -265,8 +265,8 @@ class DenseEmbeddingBackend(BaseEmbedder):
     Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion
     """

-    DEFAULT_MODEL = "BAAI/bge-large-en-v1.5"  # 1024 dim, high quality
-    TARGET_DIM = 2048
+    DEFAULT_MODEL = "BAAI/bge-small-en-v1.5"  # 384 dim, use small for testing
+    TARGET_DIM = 768  # Reduced target for faster testing

     def __init__(
         self,
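The hunk above swaps the dense backend to a smaller test configuration; the "optional expansion" note refers to growing the model's native output up to TARGET_DIM. How the project performs that expansion is not shown in this diff; purely as an illustration, a zero-padding variant might look like this (hypothetical helper, not the repository's code):

import numpy as np

def expand_to_target(vec: np.ndarray, target_dim: int = 768) -> np.ndarray:
    # Hypothetical illustration only: pad a shorter embedding with zeros up to target_dim.
    out = np.zeros(target_dim, dtype=np.float32)
    n = min(vec.shape[0], target_dim)
    out[:n] = vec.astype(np.float32)[:n]
    return out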