feat: Implement cascade indexing command and benchmark script for performance evaluation

catlog22
2026-01-02 11:24:06 +08:00
parent e21d801523
commit da68ba0b82
4 changed files with 984 additions and 3 deletions

View File

@@ -0,0 +1,402 @@
#!/usr/bin/env python
"""Benchmark script for comparing cascade search strategies.
Compares:
- binary: 256-dim binary coarse ranking + 2048-dim dense fine ranking
- hybrid: FTS+SPLADE+Vector coarse ranking + CrossEncoder fine ranking
Usage:
python benchmarks/cascade_benchmark.py [--source PATH] [--queries N] [--warmup N]
"""
from __future__ import annotations
import argparse
import gc
import json
import os
import statistics
import sys
import time
import traceback
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Dict, Any
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.search.chain_search import ChainSearchEngine, SearchOptions
from codexlens.config import Config
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
@dataclass
class BenchmarkResult:
"""Result from a single benchmark run."""
strategy: str
query: str
latency_ms: float
num_results: int
top_result: Optional[str]
error: Optional[str] = None
@dataclass
class BenchmarkSummary:
"""Aggregated benchmark statistics."""
strategy: str
total_queries: int
successful_queries: int
avg_latency_ms: float
min_latency_ms: float
max_latency_ms: float
p50_latency_ms: float
p95_latency_ms: float
p99_latency_ms: float
avg_results: float
errors: List[str]
# Default test queries covering different scenarios
DEFAULT_QUERIES = [
# Code patterns
"def search",
"class Engine",
"import numpy",
"async def",
"raise ValueError",
# Semantic queries
"how to parse json",
"database connection",
"error handling",
"authentication logic",
"file read write",
# Technical terms
"embedding vector",
"cosine similarity",
"binary quantization",
"hamming distance",
"reranking",
]
def percentile(data: List[float], p: float) -> float:
"""Calculate percentile of sorted data."""
if not data:
return 0.0
sorted_data = sorted(data)
k = (len(sorted_data) - 1) * (p / 100)
f = int(k)
c = f + 1 if f + 1 < len(sorted_data) else f
return sorted_data[f] + (k - f) * (sorted_data[c] - sorted_data[f])
def run_single_benchmark(
engine: ChainSearchEngine,
query: str,
source_path: Path,
strategy: str,
options: Optional[SearchOptions] = None,
) -> BenchmarkResult:
"""Run a single benchmark query."""
gc.collect()
start_time = time.perf_counter()
try:
result = engine.cascade_search(
query=query,
source_path=source_path,
k=10,
coarse_k=100,
options=options,
strategy=strategy,
)
elapsed_ms = (time.perf_counter() - start_time) * 1000
top_result = None
if result.results:
r = result.results[0]
line = r.start_line or 0
top_result = f"{r.path}:{line}"
return BenchmarkResult(
strategy=strategy,
query=query,
latency_ms=elapsed_ms,
num_results=len(result.results),
top_result=top_result,
)
except Exception as e:
elapsed_ms = (time.perf_counter() - start_time) * 1000
return BenchmarkResult(
strategy=strategy,
query=query,
latency_ms=elapsed_ms,
num_results=0,
top_result=None,
error=str(e),
)
def run_benchmarks(
source_path: Path,
queries: List[str],
strategies: List[str],
warmup_runs: int = 2,
options: Optional[SearchOptions] = None,
) -> Dict[str, List[BenchmarkResult]]:
"""Run benchmarks for all queries and strategies."""
print(f"\n{'='*60}")
print(f"Cascade Search Benchmark")
print(f"{'='*60}")
print(f"Source: {source_path}")
print(f"Queries: {len(queries)}")
print(f"Strategies: {strategies}")
print(f"Warmup runs: {warmup_runs}")
print(f"{'='*60}\n")
# Initialize engine
config = Config()
registry = RegistryStore() # Uses default path
registry.initialize()
mapper = PathMapper() # Uses default path
engine = ChainSearchEngine(registry=registry, mapper=mapper, config=config)
results: Dict[str, List[BenchmarkResult]] = {s: [] for s in strategies}
# Warmup phase
if warmup_runs > 0:
print(f"Running {warmup_runs} warmup queries...")
warmup_query = queries[0] if queries else "test"
for strategy in strategies:
for _ in range(warmup_runs):
try:
run_single_benchmark(engine, warmup_query, source_path, strategy, options)
except Exception:
pass
print("Warmup complete.\n")
# Benchmark phase
total_runs = len(queries) * len(strategies)
current_run = 0
for query in queries:
for strategy in strategies:
current_run += 1
print(f"[{current_run}/{total_runs}] {strategy}: '{query[:40]}...' ", end="", flush=True)
result = run_single_benchmark(engine, query, source_path, strategy, options)
results[strategy].append(result)
if result.error:
print(f"ERROR: {result.error[:50]}")
else:
print(f"{result.latency_ms:.1f}ms, {result.num_results} results")
return results
def summarize_results(results: Dict[str, List[BenchmarkResult]]) -> Dict[str, BenchmarkSummary]:
"""Generate summary statistics for each strategy."""
summaries = {}
for strategy, benchmark_results in results.items():
latencies = [r.latency_ms for r in benchmark_results if r.error is None]
result_counts = [r.num_results for r in benchmark_results if r.error is None]
errors = [r.error for r in benchmark_results if r.error is not None]
if latencies:
summary = BenchmarkSummary(
strategy=strategy,
total_queries=len(benchmark_results),
successful_queries=len(latencies),
avg_latency_ms=statistics.mean(latencies),
min_latency_ms=min(latencies),
max_latency_ms=max(latencies),
p50_latency_ms=percentile(latencies, 50),
p95_latency_ms=percentile(latencies, 95),
p99_latency_ms=percentile(latencies, 99),
avg_results=statistics.mean(result_counts) if result_counts else 0,
errors=errors,
)
else:
summary = BenchmarkSummary(
strategy=strategy,
total_queries=len(benchmark_results),
successful_queries=0,
avg_latency_ms=0,
min_latency_ms=0,
max_latency_ms=0,
p50_latency_ms=0,
p95_latency_ms=0,
p99_latency_ms=0,
avg_results=0,
errors=errors,
)
summaries[strategy] = summary
return summaries
def print_comparison_table(summaries: Dict[str, BenchmarkSummary]) -> None:
"""Print formatted comparison table."""
print(f"\n{'='*80}")
print("BENCHMARK RESULTS COMPARISON")
print(f"{'='*80}\n")
# Header
print(f"{'Metric':<25} {'Binary':>15} {'Hybrid':>15} {'Diff':>15}")
print(f"{'-'*25} {'-'*15} {'-'*15} {'-'*15}")
binary = summaries.get("binary")
hybrid = summaries.get("hybrid")
if not binary or not hybrid:
print("Missing results for comparison")
return
metrics = [
("Total Queries", binary.total_queries, hybrid.total_queries),
("Successful", binary.successful_queries, hybrid.successful_queries),
("Avg Latency (ms)", binary.avg_latency_ms, hybrid.avg_latency_ms),
("Min Latency (ms)", binary.min_latency_ms, hybrid.min_latency_ms),
("Max Latency (ms)", binary.max_latency_ms, hybrid.max_latency_ms),
("P50 Latency (ms)", binary.p50_latency_ms, hybrid.p50_latency_ms),
("P95 Latency (ms)", binary.p95_latency_ms, hybrid.p95_latency_ms),
("P99 Latency (ms)", binary.p99_latency_ms, hybrid.p99_latency_ms),
("Avg Results", binary.avg_results, hybrid.avg_results),
]
for name, b_val, h_val in metrics:
if isinstance(b_val, float):
diff = b_val - h_val
diff_str = f"{diff:+.2f}" if diff != 0 else "0.00"
speedup = h_val / b_val if b_val > 0 else 0
if "Latency" in name and speedup > 1:
diff_str += f" (binary {speedup:.1f}x faster)"
print(f"{name:<25} {b_val:>15.2f} {h_val:>15.2f} {diff_str:>15}")
else:
diff = b_val - h_val
print(f"{name:<25} {b_val:>15} {h_val:>15} {diff:>+15}")
# Errors
print(f"\n{'Errors:':<25}")
print(f" Binary: {len(binary.errors)}")
for err in binary.errors[:3]:
print(f" - {err[:60]}...")
print(f" Hybrid: {len(hybrid.errors)}")
for err in hybrid.errors[:3]:
print(f" - {err[:60]}...")
# Winner
print(f"\n{'='*80}")
if binary.avg_latency_ms < hybrid.avg_latency_ms and binary.successful_queries > 0:
speedup = hybrid.avg_latency_ms / binary.avg_latency_ms
print(f"[WINNER] Binary ({speedup:.2f}x faster average latency)")
elif hybrid.avg_latency_ms < binary.avg_latency_ms and hybrid.successful_queries > 0:
speedup = binary.avg_latency_ms / hybrid.avg_latency_ms
print(f"[WINNER] Hybrid ({speedup:.2f}x faster average latency)")
else:
print("No clear winner (check errors)")
print(f"{'='*80}\n")
def save_results(
results: Dict[str, List[BenchmarkResult]],
summaries: Dict[str, BenchmarkSummary],
output_path: Path,
) -> None:
"""Save benchmark results to JSON file."""
data = {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"summaries": {k: asdict(v) for k, v in summaries.items()},
"details": {
k: [asdict(r) for r in v]
for k, v in results.items()
},
}
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)
print(f"Results saved to: {output_path}")
def main():
parser = argparse.ArgumentParser(description="Benchmark cascade search strategies")
parser.add_argument(
"--source", "-s",
type=Path,
default=Path(__file__).parent.parent / "src",
help="Source directory to search (default: ./src)",
)
parser.add_argument(
"--queries", "-q",
type=int,
default=len(DEFAULT_QUERIES),
help=f"Number of queries to run (default: {len(DEFAULT_QUERIES)})",
)
parser.add_argument(
"--warmup", "-w",
type=int,
default=2,
help="Number of warmup runs (default: 2)",
)
parser.add_argument(
"--output", "-o",
type=Path,
default=Path(__file__).parent / "results" / "cascade_benchmark.json",
help="Output file for results (default: benchmarks/results/cascade_benchmark.json)",
)
parser.add_argument(
"--strategies",
nargs="+",
default=["binary", "hybrid"],
choices=["binary", "hybrid"],
help="Strategies to benchmark (default: both)",
)
args = parser.parse_args()
# Validate source path
if not args.source.exists():
print(f"Error: Source path does not exist: {args.source}")
sys.exit(1)
# Select queries
queries = DEFAULT_QUERIES[:args.queries]
# Run benchmarks
try:
results = run_benchmarks(
source_path=args.source,
queries=queries,
strategies=args.strategies,
warmup_runs=args.warmup,
)
# Generate summaries
summaries = summarize_results(results)
# Print comparison
print_comparison_table(summaries)
# Save results
save_results(results, summaries, args.output)
except KeyboardInterrupt:
print("\nBenchmark interrupted.")
sys.exit(1)
except Exception as e:
print(f"\nBenchmark failed: {e}")
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
main()
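The percentile helper above interpolates linearly between the two nearest ranks. A quick standalone check of that formula (editorial sketch, not part of the commit):

# Worked example of the linear-interpolation percentile used by the benchmark.
# For data = [10, 20, 30, 40] and p = 95: k = (4 - 1) * 0.95 = 2.85, f = 2, c = 3,
# so the result is 30 + 0.85 * (40 - 30) = 38.5.
import math
data = sorted([40, 10, 30, 20])
p = 95
k = (len(data) - 1) * (p / 100)
f = int(k)
c = f + 1 if f + 1 < len(data) else f
value = data[f] + (k - f) * (data[c] - data[f])
assert math.isclose(value, 38.5)

A typical run against the repository's own sources, using the flags defined in main():

python benchmarks/cascade_benchmark.py --source ./src --queries 5 --warmup 1 --strategies binary hybrid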

View File

@@ -0,0 +1,277 @@
{
"timestamp": "2026-01-02 11:22:34",
"summaries": {
"binary": {
"strategy": "binary",
"total_queries": 15,
"successful_queries": 15,
"avg_latency_ms": 850.328753333209,
"min_latency_ms": 750.9617999967304,
"max_latency_ms": 1015.733200001705,
"p50_latency_ms": 847.9711999971187,
"p95_latency_ms": 976.768470002571,
"p99_latency_ms": 1007.9402540018782,
"avg_results": 0,
"errors": []
},
"hybrid": {
"strategy": "hybrid",
"total_queries": 15,
"successful_queries": 15,
"avg_latency_ms": 821.3745733330143,
"min_latency_ms": 720.5589000004693,
"max_latency_ms": 943.0299999949057,
"p50_latency_ms": 819.5875000019441,
"p95_latency_ms": 916.3381599981221,
"p99_latency_ms": 937.691631995549,
"avg_results": 0,
"errors": []
}
},
"details": {
"binary": [
{
"strategy": "binary",
"query": "def search",
"latency_ms": 862.7266999974381,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "class Engine",
"latency_ms": 773.8472999990336,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "import numpy",
"latency_ms": 858.1023000006098,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "async def",
"latency_ms": 877.2815999982413,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "raise ValueError",
"latency_ms": 824.3320999972639,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "how to parse json",
"latency_ms": 948.0362000031164,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "database connection",
"latency_ms": 789.3126000053599,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "error handling",
"latency_ms": 960.0693000029423,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "authentication logic",
"latency_ms": 757.247900000948,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "file read write",
"latency_ms": 750.9617999967304,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "embedding vector",
"latency_ms": 871.1426000008942,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "cosine similarity",
"latency_ms": 817.1380999992834,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "binary quantization",
"latency_ms": 1015.733200001705,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "hamming distance",
"latency_ms": 847.9711999971187,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "binary",
"query": "reranking",
"latency_ms": 801.028399997449,
"num_results": 0,
"top_result": null,
"error": null
}
],
"hybrid": [
{
"strategy": "hybrid",
"query": "def search",
"latency_ms": 720.5589000004693,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "class Engine",
"latency_ms": 792.9914000051212,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "import numpy",
"latency_ms": 943.0299999949057,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "async def",
"latency_ms": 819.5875000019441,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "raise ValueError",
"latency_ms": 835.5114000005415,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "how to parse json",
"latency_ms": 867.8118999960134,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "database connection",
"latency_ms": 824.6361999990768,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "error handling",
"latency_ms": 742.638600000646,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "authentication logic",
"latency_ms": 840.4286999939359,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "file read write",
"latency_ms": 810.9049000049708,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "embedding vector",
"latency_ms": 876.5335000061896,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "cosine similarity",
"latency_ms": 797.3090999948909,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "binary quantization",
"latency_ms": 767.9803999999422,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "hamming distance",
"latency_ms": 775.7972999970661,
"num_results": 0,
"top_result": null,
"error": null
},
{
"strategy": "hybrid",
"query": "reranking",
"latency_ms": 904.8987999995006,
"num_results": 0,
"top_result": null,
"error": null
}
]
}
}
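The results file above follows the structure written by save_results(): a timestamp, per-strategy summaries, and per-query details. Note that in this run every query returned zero results for both strategies, so the comparison reflects pipeline latency only. A minimal sketch (not part of the commit) for reading the file back, assuming the script's default --output path:

import json
from pathlib import Path

results_path = Path("benchmarks/results/cascade_benchmark.json")  # default --output location
data = json.loads(results_path.read_text(encoding="utf-8"))
for strategy, summary in data["summaries"].items():
    # Keys mirror the BenchmarkSummary dataclass serialized via asdict().
    print(f"{strategy}: avg {summary['avg_latency_ms']:.1f} ms, "
          f"p95 {summary['p95_latency_ms']:.1f} ms, "
          f"{summary['successful_queries']}/{summary['total_queries']} queries ok")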

View File

@@ -7,7 +7,7 @@ import logging
import os
import shutil
from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Annotated, Any, Dict, Iterable, List, Optional
import typer
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
@@ -2721,3 +2721,305 @@ def _display_index_result(result) -> None:
console.print(f" [red]Error:[/red] {error}")
if len(result.errors) > 3:
console.print(f" [dim]... and {len(result.errors) - 3} more errors[/dim]")
# ==================== Cascade Index Commands ====================
def get_binary_index_path(db_path: Path) -> Path:
"""Get the path for binary ANN index file.
Args:
db_path: Path to the _index.db file
Returns:
Path to the binary index file (_index_binary.bin)
"""
return db_path.parent / f"{db_path.stem}_binary.bin"
@app.command("cascade-index")
def cascade_index(
path: Annotated[Path, typer.Argument(help="Directory to index")],
force: Annotated[bool, typer.Option("--force", "-f", help="Force regenerate")] = False,
batch_size: Annotated[int, typer.Option("--batch-size", "-b", help="Batch size for embedding")] = 32,
json_mode: Annotated[bool, typer.Option("--json", help="Output JSON response")] = False,
verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable verbose logging")] = False,
) -> None:
"""Generate cascade embeddings (binary + dense) for two-stage retrieval.
Cascade retrieval uses a two-stage approach:
1. Binary search (fast, 32 bytes/vector) -> coarse filtering
2. Dense rerank (precise, 8KB/vector) -> final results
This command:
- Finds all _index.db files in the directory
- Generates binary (256-dim) and dense (2048-dim) embeddings for each chunk
- Stores embeddings in the database (embedding_binary, embedding_dense columns)
- Creates a BinaryANNIndex file for fast coarse retrieval
Examples:
codexlens cascade-index ~/projects/my-app
codexlens cascade-index . --force
codexlens cascade-index . --batch-size 64 --verbose
"""
_configure_logging(verbose, json_mode)
target_path = path.expanduser().resolve()
# Find index database(s)
if target_path.is_file() and target_path.name == "_index.db":
index_dbs = [target_path]
elif target_path.is_dir():
# Check local .codexlens/_index.db first
local_index = target_path / ".codexlens" / "_index.db"
if local_index.exists():
index_dbs = [local_index]
else:
# Find via registry
registry = RegistryStore()
try:
registry.initialize()
mapper = PathMapper()
index_db = mapper.source_to_index_db(target_path)
if not index_db.exists():
if json_mode:
print_json(success=False, error=f"No index found for {target_path}")
else:
console.print(f"[red]Error:[/red] No index found for {target_path}")
console.print("Run 'codexlens init' first to create an index")
raise typer.Exit(code=1)
# Find all _index.db files under the index root
index_root = index_db.parent
index_dbs = list(index_root.rglob("_index.db"))
finally:
registry.close()
else:
if json_mode:
print_json(success=False, error="Path must be _index.db file or indexed directory")
else:
console.print("[red]Error:[/red] Path must be _index.db file or indexed directory")
raise typer.Exit(code=1)
if not index_dbs:
if json_mode:
print_json(success=False, error="No index databases found")
else:
console.print("[yellow]No index databases found[/yellow]")
raise typer.Exit(code=1)
# Import cascade embedding backend
try:
from codexlens.indexing.embedding import CascadeEmbeddingBackend
from codexlens.semantic.ann_index import BinaryANNIndex
from codexlens.indexing.embedding import pack_binary_embedding
except ImportError as e:
error_msg = f"Cascade embedding dependencies not available: {e}"
if json_mode:
print_json(success=False, error=error_msg)
else:
console.print(f"[red]Error:[/red] {error_msg}")
console.print("[dim]Install with: pip install codexlens[semantic][/dim]")
raise typer.Exit(code=1)
if not json_mode:
console.print(f"[bold]Generating cascade embeddings[/bold]")
console.print(f"Path: [dim]{target_path}[/dim]")
console.print(f"Index databases: [cyan]{len(index_dbs)}[/cyan]")
console.print(f"Batch size: [cyan]{batch_size}[/cyan]")
console.print()
# Initialize cascade embedding backend
try:
cascade_backend = CascadeEmbeddingBackend()
except Exception as e:
error_msg = f"Failed to initialize cascade embedding backend: {e}"
if json_mode:
print_json(success=False, error=error_msg)
else:
console.print(f"[red]Error:[/red] {error_msg}")
raise typer.Exit(code=1)
# Process statistics
total_chunks_processed = 0
total_indexes_processed = 0
total_indexes_successful = 0
total_binary_indexes_created = 0
errors_list: List[str] = []
# Process each index database
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
TextColumn("({task.completed}/{task.total})"),
TimeElapsedColumn(),
console=console,
disable=json_mode,
) as progress:
db_task = progress.add_task("Processing indexes...", total=len(index_dbs))
for db_path in index_dbs:
total_indexes_processed += 1
index_name = db_path.parent.name
try:
# Open the index store
store = DirIndexStore(db_path)
store.initialize()
# Get connection for direct queries
conn = store._get_connection()
# Ensure cascade columns exist in semantic_chunks table
try:
conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_binary BLOB")
except Exception:
pass # Column already exists
try:
conn.execute("ALTER TABLE semantic_chunks ADD COLUMN embedding_dense BLOB")
except Exception:
pass # Column already exists
conn.commit()
# Check if semantic_chunks table exists and has data
try:
cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
chunk_count = cursor.fetchone()[0]
except Exception:
# semantic_chunks table doesn't exist yet
chunk_count = 0
if chunk_count == 0:
if verbose and not json_mode:
console.print(f" [dim]Skipping {index_name}: no chunks found[/dim]")
progress.advance(db_task)
store.close()
continue
# Check if embeddings already exist (unless force)
if not force:
cursor = conn.execute(
"SELECT COUNT(*) FROM semantic_chunks WHERE embedding_binary IS NOT NULL"
)
existing_count = cursor.fetchone()[0]
if existing_count > 0:
if verbose and not json_mode:
console.print(f" [dim]Skipping {index_name}: embeddings exist (use --force to regenerate)[/dim]")
progress.advance(db_task)
store.close()
continue
# If force, clear existing cascade embeddings
if force:
conn.execute(
"UPDATE semantic_chunks SET embedding_binary = NULL, embedding_dense = NULL"
)
conn.commit()
# Get all chunks
cursor = conn.execute("SELECT id, content FROM semantic_chunks")
chunks = cursor.fetchall()
if not chunks:
progress.advance(db_task)
store.close()
continue
if verbose and not json_mode:
console.print(f" Processing {index_name}: {len(chunks)} chunks")
# Process in batches
chunk_task = progress.add_task(
f" {index_name}", total=len(chunks)
)
# Prepare for BinaryANNIndex
binary_index_path = get_binary_index_path(db_path)
binary_ann_index = BinaryANNIndex(db_path, dim=256)
for i in range(0, len(chunks), batch_size):
batch_chunks = chunks[i:i + batch_size]
batch_ids = [c[0] for c in batch_chunks]
batch_contents = [c[1] for c in batch_chunks]
# Generate cascade embeddings
binary_embeddings, dense_embeddings = cascade_backend.encode_cascade(
batch_contents, batch_size=batch_size
)
# Pack binary embeddings and convert dense to bytes
packed_binaries = []
dense_bytes_list = []
for j in range(len(batch_ids)):
# Pack binary embedding (256 bits -> 32 bytes)
packed_binary = pack_binary_embedding(binary_embeddings[j])
packed_binaries.append(packed_binary)
# Convert dense embedding to bytes
import numpy as np
dense_blob = dense_embeddings[j].astype(np.float32).tobytes()
dense_bytes_list.append(dense_blob)
# Update database
for j, chunk_id in enumerate(batch_ids):
conn.execute(
"""
UPDATE semantic_chunks
SET embedding_binary = ?, embedding_dense = ?
WHERE id = ?
""",
(packed_binaries[j], dense_bytes_list[j], chunk_id)
)
# Add to binary ANN index
binary_ann_index.add_vectors(batch_ids, packed_binaries)
conn.commit()
total_chunks_processed += len(batch_ids)
progress.advance(chunk_task, len(batch_ids))
# Save binary ANN index
binary_ann_index.save()
total_binary_indexes_created += 1
progress.remove_task(chunk_task)
store.close()
total_indexes_successful += 1
except Exception as e:
error_msg = f"{index_name}: {e}"
errors_list.append(error_msg)
if verbose and not json_mode:
console.print(f" [red]Error processing {index_name}:[/red] {e}")
progress.advance(db_task)
# Build result
result = {
"path": str(target_path),
"indexes_processed": total_indexes_processed,
"indexes_successful": total_indexes_successful,
"chunks_processed": total_chunks_processed,
"binary_indexes_created": total_binary_indexes_created,
"errors": len(errors_list),
"error_details": errors_list[:5] if errors_list else [],
}
if json_mode:
print_json(success=True, result=result)
else:
console.print(f"\n[green]Cascade indexing complete[/green]")
console.print(f" Indexes processed: {total_indexes_processed}")
console.print(f" Indexes successful: {total_indexes_successful}")
console.print(f" Chunks processed: {total_chunks_processed:,}")
console.print(f" Binary indexes created: {total_binary_indexes_created}")
if errors_list:
console.print(f" [yellow]Errors: {len(errors_list)}[/yellow]")
for err in errors_list[:3]:
console.print(f" [dim]{err}[/dim]")
if len(errors_list) > 3:
console.print(f" [dim]... and {len(errors_list) - 3} more[/dim]")
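For orientation, the two BLOB columns populated above serve the two cascade stages: embedding_binary holds each 256-bit vector bit-packed into 32 bytes (coarse candidates are typically ranked by Hamming distance over such packed bits), and embedding_dense holds the float32 vector used for the precise rerank. The sketch below is an editorial illustration (not the BinaryANNIndex or ChainSearchEngine implementation; hamming_distances is a hypothetical helper) showing how blobs in these formats can be scored and decoded with NumPy:

import numpy as np

def hamming_distances(query_packed, candidates_packed):
    """Popcount of XOR between a packed 256-bit query and each packed candidate."""
    q = np.frombuffer(query_packed, dtype=np.uint8)                 # 32 bytes = 256 bits
    c = np.frombuffer(b"".join(candidates_packed), dtype=np.uint8)
    c = c.reshape(len(candidates_packed), q.size)
    return np.unpackbits(np.bitwise_xor(c, q), axis=1).sum(axis=1)

# Coarse stage: keep the coarse_k candidates with the smallest Hamming distance.
rng = np.random.default_rng(0)
candidates = [np.packbits(rng.integers(0, 2, 256).astype(np.uint8)).tobytes() for _ in range(5)]
print(hamming_distances(candidates[0], candidates))  # first entry is 0 (identical vector)

# Fine stage: dense vectors round-trip through the embedding_dense BLOB column
# exactly as written above (.astype(np.float32).tobytes()).
dense_blob = rng.random(768).astype(np.float32).tobytes()  # 768 = this commit's reduced test TARGET_DIM
dense_vec = np.frombuffer(dense_blob, dtype=np.float32)
assert dense_vec.shape == (768,)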

View File

@@ -265,8 +265,8 @@ class DenseEmbeddingBackend(BaseEmbedder):
Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion
"""
DEFAULT_MODEL = "BAAI/bge-large-en-v1.5" # 1024 dim, high quality
TARGET_DIM = 2048
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, use small for testing
TARGET_DIM = 768 # Reduced target for faster testing
def __init__(
self,