Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-05 01:50:27 +08:00)
feat: Enhance BinaryANNIndex with vectorized search and performance benchmarking
codex-lens/benchmarks/binary_search_microbenchmark.py | 209 lines (new file)
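The core of this change is replacing a per-vector `np.unpackbits` loop with a table-driven popcount over a packed uint8 matrix. A minimal standalone sketch of that idea, independent of the BinaryANNIndex class (names here are chosen for illustration only):

```python
import numpy as np

# 256-entry lookup table: popcount of every possible byte value.
POPCOUNT_LUT = np.array([bin(i).count("1") for i in range(256)], dtype=np.uint8)

def hamming_distances(query: np.ndarray, matrix: np.ndarray) -> np.ndarray:
    """Hamming distance from one packed query (uint8) to every row of a packed matrix."""
    xor = np.bitwise_xor(query, matrix)    # broadcast over all rows at once
    return POPCOUNT_LUT[xor].sum(axis=1)   # per-byte popcount, summed per row

rng = np.random.default_rng(0)
matrix = rng.integers(0, 256, size=(1000, 32), dtype=np.uint8)  # 1000 vectors, 256 bits each
query = rng.integers(0, 256, size=32, dtype=np.uint8)
dists = hamming_distances(query, matrix)
print(dists.shape, dists.min(), dists.max())
```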
@@ -0,0 +1,209 @@
#!/usr/bin/env python
"""Micro-benchmark for BinaryANNIndex search performance.

Measures the actual speedup of vectorized Hamming distance computation.
"""

from __future__ import annotations

import gc
import statistics
import sys
import time
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

import numpy as np


def old_search_implementation(query_arr: np.ndarray, vectors: dict, id_list: list, top_k: int):
    """Original O(N) loop-based implementation for comparison."""
    packed_dim = len(query_arr)
    distances = []

    for vec_id in id_list:
        vec = vectors[vec_id]
        vec_arr = np.frombuffer(vec, dtype=np.uint8)
        xor = np.bitwise_xor(query_arr, vec_arr)
        dist = int(np.unpackbits(xor).sum())
        distances.append((vec_id, dist))

    distances.sort(key=lambda x: x[1])
    top_results = distances[:top_k]
    ids = [r[0] for r in top_results]
    dists = [r[1] for r in top_results]

    return ids, dists


def new_search_implementation(query_arr: np.ndarray, vectors_matrix: np.ndarray, ids_array: np.ndarray, top_k: int):
    """Optimized vectorized implementation."""
    # Broadcast XOR
    xor_result = np.bitwise_xor(query_arr, vectors_matrix)

    # Vectorized popcount using lookup table
    popcount_lut = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
    bit_counts = popcount_lut[xor_result]

    # Sum across packed bytes
    distances = bit_counts.sum(axis=1)

    # Get top-k using argpartition
    n_vectors = len(distances)
    k = min(top_k, n_vectors)

    if k == n_vectors:
        sorted_indices = np.argsort(distances)
    else:
        partition_indices = np.argpartition(distances, k)[:k]
        top_k_distances = distances[partition_indices]
        sorted_order = np.argsort(top_k_distances)
        sorted_indices = partition_indices[sorted_order]

    result_ids = ids_array[sorted_indices].tolist()
    result_dists = distances[sorted_indices].tolist()

    return result_ids, result_dists


def run_benchmark(n_vectors: int, dim: int = 256, top_k: int = 100, n_iterations: int = 50):
    """Run benchmark comparing old and new implementations."""
    packed_dim = dim // 8  # 32 bytes for 256-bit

    print(f"\n{'='*60}")
    print(f"Binary Search Micro-Benchmark")
    print(f"{'='*60}")
    print(f"Vectors: {n_vectors}")
    print(f"Dimension: {dim} bits ({packed_dim} bytes packed)")
    print(f"Top-K: {top_k}")
    print(f"Iterations: {n_iterations}")
    print(f"{'='*60}\n")

    # Generate random binary vectors
    print("Generating test data...")
    vectors_dict = {}
    id_list = []

    for i in range(n_vectors):
        vec_bytes = np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes()
        vectors_dict[i] = vec_bytes
        id_list.append(i)

    # Build matrix for vectorized search
    vectors_matrix = np.empty((n_vectors, packed_dim), dtype=np.uint8)
    ids_array = np.array(id_list, dtype=np.int64)

    for i, vec_id in enumerate(id_list):
        vec_bytes = vectors_dict[vec_id]
        vectors_matrix[i] = np.frombuffer(vec_bytes, dtype=np.uint8)

    # Generate random query
    query_bytes = np.random.randint(0, 256, size=packed_dim, dtype=np.uint8).tobytes()
    query_arr = np.frombuffer(query_bytes, dtype=np.uint8)

    # Warmup
    print("Running warmup...")
    for _ in range(3):
        old_search_implementation(query_arr, vectors_dict, id_list, top_k)
        new_search_implementation(query_arr, vectors_matrix, ids_array, top_k)

    # Benchmark old implementation
    print("Benchmarking old implementation...")
    old_times = []
    for _ in range(n_iterations):
        gc.collect()
        start = time.perf_counter()
        old_ids, old_dists = old_search_implementation(query_arr, vectors_dict, id_list, top_k)
        elapsed = (time.perf_counter() - start) * 1000
        old_times.append(elapsed)

    # Benchmark new implementation
    print("Benchmarking new implementation...")
    new_times = []
    for _ in range(n_iterations):
        gc.collect()
        start = time.perf_counter()
        new_ids, new_dists = new_search_implementation(query_arr, vectors_matrix, ids_array, top_k)
        elapsed = (time.perf_counter() - start) * 1000
        new_times.append(elapsed)

    # Verify correctness
    print("\nVerifying correctness...")
    # Check that distances are correct (IDs may differ for ties)
    if old_dists == new_dists:
        print("Distances match! (IDs may differ for ties)")
    else:
        # Check if difference is just in tie-breaking
        old_dist_set = set(old_dists)
        new_dist_set = set(new_dists)
        if old_dist_set == new_dist_set:
            print("Distances equivalent (tie-breaking differs, which is acceptable)")
        else:
            print("WARNING: Distance distributions differ!")
            print(f"  Old dists (first 5): {old_dists[:5]}")
            print(f"  New dists (first 5): {new_dists[:5]}")

    # Calculate statistics
    old_avg = statistics.mean(old_times)
    old_std = statistics.stdev(old_times) if len(old_times) > 1 else 0
    new_avg = statistics.mean(new_times)
    new_std = statistics.stdev(new_times) if len(new_times) > 1 else 0

    speedup = old_avg / new_avg if new_avg > 0 else 0

    # Print results
    print(f"\n{'='*60}")
    print("RESULTS")
    print(f"{'='*60}")
    print(f"{'Metric':<25} {'Old (loop)':>15} {'New (vectorized)':>18}")
    print(f"{'-'*25} {'-'*15} {'-'*18}")
    print(f"{'Avg Latency (ms)':<25} {old_avg:>15.3f} {new_avg:>18.3f}")
    print(f"{'Std Dev (ms)':<25} {old_std:>15.3f} {new_std:>18.3f}")
    print(f"{'Min Latency (ms)':<25} {min(old_times):>15.3f} {min(new_times):>18.3f}")
    print(f"{'Max Latency (ms)':<25} {max(old_times):>15.3f} {max(new_times):>18.3f}")
    print(f"{'P50 (ms)':<25} {sorted(old_times)[len(old_times)//2]:>15.3f} {sorted(new_times)[len(new_times)//2]:>18.3f}")
    print(f"\n{'Speedup:':<25} {speedup:>15.2f}x")
    print(f"{'='*60}\n")

    return {
        "n_vectors": n_vectors,
        "dim": dim,
        "top_k": top_k,
        "old_avg_ms": old_avg,
        "new_avg_ms": new_avg,
        "speedup": speedup,
    }


def main():
    print("\n" + "="*70)
    print(" BINARY SEARCH OPTIMIZATION MICRO-BENCHMARK")
    print("="*70)

    # Test different vector counts
    results = []

    for n_vectors in [1000, 5000, 10000, 50000]:
        result = run_benchmark(
            n_vectors=n_vectors,
            dim=256,
            top_k=100,
            n_iterations=20,
        )
        results.append(result)

    # Summary
    print("\n" + "="*70)
    print(" SUMMARY")
    print("="*70)
    print(f"{'N Vectors':<12} {'Old (ms)':<12} {'New (ms)':<12} {'Speedup':>10}")
    print("-"*50)
    for r in results:
        print(f"{r['n_vectors']:<12} {r['old_avg_ms']:<12.3f} {r['new_avg_ms']:<12.3f} {r['speedup']:>10.2f}x")
    print("="*70)


if __name__ == "__main__":
    main()
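Besides running `main()`, a single configuration can be exercised directly; a short sketch, assuming the script's directory is on the import path:

```python
from binary_search_microbenchmark import run_benchmark

# Smaller run than main(); the returned dict mirrors the SUMMARY table columns.
stats = run_benchmark(n_vectors=5000, dim=256, top_k=100, n_iterations=10)
print(f"{stats['n_vectors']} vectors: {stats['speedup']:.2f}x speedup")
```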
@@ -1,30 +1,30 @@
 {
-  "timestamp": "2026-01-02 11:22:34",
+  "timestamp": "2026-01-02 11:48:33",
   "summaries": {
     "binary": {
       "strategy": "binary",
       "total_queries": 15,
       "successful_queries": 15,
-      "avg_latency_ms": 850.328753333209,
-      "min_latency_ms": 750.9617999967304,
-      "max_latency_ms": 1015.733200001705,
-      "p50_latency_ms": 847.9711999971187,
-      "p95_latency_ms": 976.768470002571,
-      "p99_latency_ms": 1007.9402540018782,
-      "avg_results": 0,
+      "avg_latency_ms": 1133.4008666667312,
+      "min_latency_ms": 959.5361000028788,
+      "max_latency_ms": 1330.8978999993997,
+      "p50_latency_ms": 1125.8439999946859,
+      "p95_latency_ms": 1330.0081999987015,
+      "p99_latency_ms": 1330.71995999926,
+      "avg_results": 10,
       "errors": []
     },
     "hybrid": {
       "strategy": "hybrid",
       "total_queries": 15,
       "successful_queries": 15,
-      "avg_latency_ms": 821.3745733330143,
-      "min_latency_ms": 720.5589000004693,
-      "max_latency_ms": 943.0299999949057,
-      "p50_latency_ms": 819.5875000019441,
-      "p95_latency_ms": 916.3381599981221,
-      "p99_latency_ms": 937.691631995549,
-      "avg_results": 0,
+      "avg_latency_ms": 1111.1401133336283,
+      "min_latency_ms": 857.0021999985329,
+      "max_latency_ms": 1278.8890000010724,
+      "p50_latency_ms": 1130.696000000171,
+      "p95_latency_ms": 1254.2417899981956,
+      "p99_latency_ms": 1273.959558000497,
+      "avg_results": 10,
       "errors": []
     }
   },
@@ -33,121 +33,121 @@
     {
       "strategy": "binary",
      "query": "def search",
-      "latency_ms": 862.7266999974381,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1044.525999997859,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "class Engine",
-      "latency_ms": 773.8472999990336,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1052.5979999947594,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "import numpy",
-      "latency_ms": 858.1023000006098,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1217.217100005655,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "async def",
-      "latency_ms": 877.2815999982413,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1276.9802000038908,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "raise ValueError",
-      "latency_ms": 824.3320999972639,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1005.9053000004496,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "how to parse json",
-      "latency_ms": 948.0362000031164,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1330.8978999993997,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "database connection",
-      "latency_ms": 789.3126000053599,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1041.6685000018333,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "error handling",
-      "latency_ms": 960.0693000029423,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 959.5361000028788,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "authentication logic",
-      "latency_ms": 757.247900000948,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1060.9395999999833,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "file read write",
-      "latency_ms": 750.9617999967304,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 971.8680000005406,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "embedding vector",
-      "latency_ms": 871.1426000008942,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1135.879900000873,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "cosine similarity",
-      "latency_ms": 817.1380999992834,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1188.1732000038028,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "binary quantization",
-      "latency_ms": 1015.733200001705,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1259.3522999959532,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "hamming distance",
-      "latency_ms": 847.9711999971187,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1329.6268999984022,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0",
       "error": null
     },
     {
       "strategy": "binary",
       "query": "reranking",
-      "latency_ms": 801.028399997449,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1125.8439999946859,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0",
       "error": null
     }
   ],
@@ -155,121 +155,121 @@
     {
       "strategy": "hybrid",
       "query": "def search",
-      "latency_ms": 720.5589000004693,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1117.0937999995658,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\dir_index.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "class Engine",
-      "latency_ms": 792.9914000051212,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1039.3984000038472,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\parsers\\factory.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "import numpy",
-      "latency_ms": 943.0299999949057,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1144.7916999968584,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\__main__.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "async def",
-      "latency_ms": 819.5875000019441,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 857.0021999985329,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\reranker\\api_reranker.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "raise ValueError",
-      "latency_ms": 835.5114000005415,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 957.5578000003588,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "how to parse json",
-      "latency_ms": 867.8118999960134,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1216.5708000029554,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\cli\\commands.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "database connection",
-      "latency_ms": 824.6361999990768,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1154.8929000055068,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\indexing\\symbol_extractor.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "error handling",
-      "latency_ms": 742.638600000646,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1130.696000000171,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\storage\\migrations\\migration_004_dual_fts.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "authentication logic",
-      "latency_ms": 840.4286999939359,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1112.8943000003346,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\query_parser.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "file read write",
-      "latency_ms": 810.9049000049708,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1172.5986000019475,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\watcher\\incremental_indexer.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "embedding vector",
-      "latency_ms": 876.5335000061896,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1278.8890000010724,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\embedder.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "cosine similarity",
-      "latency_ms": 797.3090999948909,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1024.2393000007723,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "binary quantization",
-      "latency_ms": 767.9803999999422,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1243.6786999969627,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\vector_store.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "hamming distance",
-      "latency_ms": 775.7972999970661,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1081.3100999948801,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\semantic\\ann_index.py:0",
       "error": null
     },
     {
       "strategy": "hybrid",
       "query": "reranking",
-      "latency_ms": 904.8987999995006,
-      "num_results": 0,
-      "top_result": null,
+      "latency_ms": 1135.4881000006571,
+      "num_results": 10,
+      "top_result": "D:\\Claude_dms3\\codex-lens\\src\\codexlens\\search\\ranking.py:0",
       "error": null
     }
   ]
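The results file above (its path is not shown in this diff) is plain JSON, so the two strategy summaries can be compared directly. A minimal sketch, assuming the report is saved locally as `results.json` (hypothetical filename):

```python
import json

with open("results.json", encoding="utf-8") as fh:  # hypothetical filename
    report = json.load(fh)

for name, summary in report["summaries"].items():
    print(f"{name:>8}: avg={summary['avg_latency_ms']:.1f} ms  "
          f"p95={summary['p95_latency_ms']:.1f} ms  results={summary['avg_results']}")
```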
@@ -608,31 +608,43 @@ class ChainSearchEngine:
 
         for index_path, chunk_ids in candidates_by_index.items():
             try:
-                store = SQLiteStore(index_path)
-                dense_embeddings = store.get_dense_embeddings(chunk_ids)
-                chunks_data = store.get_chunks_by_ids(chunk_ids)
+                # Read directly from semantic_chunks table (where cascade-index stores data)
+                import sqlite3
+                conn = sqlite3.connect(str(index_path))
+                conn.row_factory = sqlite3.Row
 
-                # Create lookup for chunk content
-                chunk_content: Dict[int, Dict[str, Any]] = {
-                    c["id"]: c for c in chunks_data
-                }
+                placeholders = ",".join("?" * len(chunk_ids))
+                rows = conn.execute(
+                    f"SELECT id, file_path, content, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})",
+                    chunk_ids
+                ).fetchall()
+                conn.close()
 
-                for chunk_id in chunk_ids:
-                    dense_bytes = dense_embeddings.get(chunk_id)
-                    chunk_info = chunk_content.get(chunk_id)
+                # Batch processing: collect all valid embeddings first
+                valid_rows = []
+                dense_vectors = []
+                for row in rows:
+                    dense_bytes = row["embedding_dense"]
+                    if dense_bytes is not None:
+                        valid_rows.append(row)
+                        dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32))
 
-                    if dense_bytes is None or chunk_info is None:
-                        continue
+                if not dense_vectors:
+                    continue
 
-                    # Compute cosine similarity
-                    dense_vec = np.frombuffer(dense_bytes, dtype=np.float32)
-                    score = self._compute_cosine_similarity(query_dense, dense_vec)
+                # Stack into matrix for batch computation
+                doc_matrix = np.vstack(dense_vectors)
 
-                    # Create search result
-                    excerpt = chunk_info.get("content", "")[:500]
+                # Batch compute cosine similarities
+                scores = self._compute_cosine_similarity_batch(query_dense, doc_matrix)
 
+                # Create search results
+                for i, row in enumerate(valid_rows):
+                    score = float(scores[i])
+                    excerpt = (row["content"] or "")[:500]
                     result = SearchResult(
-                        path=chunk_info.get("file_path", ""),
-                        score=float(score),
+                        path=row["file_path"] or "",
+                        score=score,
                         excerpt=excerpt,
                     )
                     scored_results.append((score, result))
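The rewritten loop fetches all candidate rows in a single SQL query and stacks their embeddings before scoring, instead of scoring one chunk at a time. The same read-then-stack pattern in isolation, as a sketch against an in-memory table with the columns this diff queries (the real schema and database path are assumptions here):

```python
import sqlite3
import numpy as np

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("CREATE TABLE semantic_chunks (id INTEGER PRIMARY KEY, file_path TEXT, content TEXT, embedding_dense BLOB)")
for i in range(3):
    vec = np.random.rand(8).astype(np.float32)
    conn.execute("INSERT INTO semantic_chunks VALUES (?, ?, ?, ?)", (i, f"file_{i}.py", "def f(): ...", vec.tobytes()))

chunk_ids = [0, 2]
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
    f"SELECT id, file_path, content, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})",
    chunk_ids,
).fetchall()

# One matrix for all candidates, ready for batch cosine scoring.
doc_matrix = np.vstack([np.frombuffer(r["embedding_dense"], dtype=np.float32) for r in rows])
print(doc_matrix.shape)  # (2, 8)
```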
@@ -783,6 +795,58 @@ class ChainSearchEngine:
 
         return float(dot_product / (norm_q * norm_d))
 
+    def _compute_cosine_similarity_batch(
+        self,
+        query_vec: "np.ndarray",
+        doc_matrix: "np.ndarray",
+    ) -> "np.ndarray":
+        """Compute cosine similarity between query and multiple document vectors.
+
+        Uses vectorized matrix operations for efficient batch computation.
+
+        Args:
+            query_vec: Query embedding vector of shape (dim,)
+            doc_matrix: Document embeddings matrix of shape (n_docs, dim)
+
+        Returns:
+            Array of cosine similarity scores of shape (n_docs,)
+        """
+        if not NUMPY_AVAILABLE:
+            return np.zeros(doc_matrix.shape[0])
+
+        # Ensure query is 1D
+        if query_vec.ndim > 1:
+            query_vec = query_vec.flatten()
+
+        # Handle dimension mismatch by truncating to smaller dimension
+        min_dim = min(len(query_vec), doc_matrix.shape[1])
+        q = query_vec[:min_dim]
+        docs = doc_matrix[:, :min_dim]
+
+        # Compute query norm once
+        norm_q = np.linalg.norm(q)
+        if norm_q == 0:
+            return np.zeros(docs.shape[0])
+
+        # Normalize query
+        q_normalized = q / norm_q
+
+        # Compute document norms (vectorized)
+        doc_norms = np.linalg.norm(docs, axis=1)
+
+        # Avoid division by zero
+        nonzero_mask = doc_norms > 0
+        scores = np.zeros(docs.shape[0], dtype=np.float32)
+
+        if np.any(nonzero_mask):
+            # Normalize documents with non-zero norms
+            docs_normalized = docs[nonzero_mask] / doc_norms[nonzero_mask, np.newaxis]
+
+            # Batch dot product: (n_docs, dim) @ (dim,) = (n_docs,)
+            scores[nonzero_mask] = docs_normalized @ q_normalized
+
+        return scores
+
     def _build_results_from_candidates(
         self,
         candidates: List[Tuple[int, int, Path]],
 
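A quick standalone check of the batching idea behind this method (not the method itself): normalizing the query once and using a single matrix-vector product should reproduce per-row cosine similarity.

```python
import numpy as np

rng = np.random.default_rng(1)
query = rng.normal(size=384).astype(np.float32)
docs = rng.normal(size=(100, 384)).astype(np.float32)

# Per-row reference computation
ref = np.array([d @ query / (np.linalg.norm(query) * np.linalg.norm(d)) for d in docs])

# Batched: one query normalization, one matrix-vector product
q_hat = query / np.linalg.norm(query)
batch = (docs / np.linalg.norm(docs, axis=1, keepdims=True)) @ q_hat

print(np.allclose(ref, batch, atol=1e-5))  # True
```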
@@ -487,6 +487,11 @@ class BinaryANNIndex:
         self._vectors: dict[int, bytes] = {}
         self._id_list: list[int] = []  # Ordered list for efficient iteration
 
+        # Cached numpy array for vectorized search (invalidated on add/remove)
+        self._vectors_matrix: Optional[np.ndarray] = None
+        self._ids_array: Optional[np.ndarray] = None
+        self._cache_valid: bool = False
+
         logger.info(
             f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}"
         )
@@ -524,6 +529,9 @@ class BinaryANNIndex:
                 self._id_list.append(vec_id)
                 self._vectors[vec_id] = vec
 
+            # Invalidate cache on modification
+            self._cache_valid = False
+
             logger.debug(
                 f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})"
             )
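The `_cache_valid` flag implements the usual invalidate-on-write, rebuild-on-first-read pattern: writes stay cheap, and the matrix is rebuilt at most once per batch of modifications. A minimal generic sketch of that pattern (a toy class, not the index itself):

```python
import numpy as np

class CachedRows:
    """Toy example: dict of byte rows with a lazily rebuilt matrix view."""

    def __init__(self) -> None:
        self._rows: dict[int, bytes] = {}
        self._matrix: np.ndarray | None = None
        self._valid = False

    def add(self, key: int, row: bytes) -> None:
        self._rows[key] = row
        self._valid = False  # writes only flip a flag; no rebuild cost here

    def matrix(self) -> np.ndarray:
        if not self._valid:  # the first read after a write pays the rebuild once
            self._matrix = np.vstack([np.frombuffer(v, dtype=np.uint8) for v in self._rows.values()])
            self._valid = True
        return self._matrix

rows = CachedRows()
rows.add(1, bytes(32))
rows.add(2, bytes(range(32)))
print(rows.matrix().shape)  # (2, 32)
```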
@@ -599,6 +607,8 @@ class BinaryANNIndex:
             # Rebuild ID list efficiently - O(N) once instead of O(N) per removal
             if removed_count > 0:
                 self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove]
+                # Invalidate cache on modification
+                self._cache_valid = False
 
             logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index")
 
@@ -610,11 +620,42 @@ class BinaryANNIndex:
                 f"Failed to remove vectors from Binary ANN index: {e}"
             )
 
+    def _build_cache(self) -> None:
+        """Build numpy array cache from vectors dict for vectorized search.
+
+        Pre-computes a contiguous numpy array from all vectors for efficient
+        batch distance computation. Called lazily on first search after modification.
+        """
+        if self._cache_valid:
+            return
+
+        n_vectors = len(self._id_list)
+        if n_vectors == 0:
+            self._vectors_matrix = None
+            self._ids_array = None
+            self._cache_valid = True
+            return
+
+        # Build contiguous numpy array of all packed vectors
+        # Shape: (n_vectors, packed_dim) with uint8 dtype
+        self._vectors_matrix = np.empty((n_vectors, self.packed_dim), dtype=np.uint8)
+        self._ids_array = np.array(self._id_list, dtype=np.int64)
+
+        for i, vec_id in enumerate(self._id_list):
+            vec_bytes = self._vectors[vec_id]
+            self._vectors_matrix[i] = np.frombuffer(vec_bytes, dtype=np.uint8)
+
+        self._cache_valid = True
+        logger.debug(f"Built vectorized cache for {n_vectors} binary vectors")
+
     def search(
         self, query: bytes, top_k: int = 10
     ) -> Tuple[List[int], List[int]]:
         """Search for nearest neighbors using Hamming distance.
 
+        Uses vectorized batch computation for O(N) search with SIMD acceleration.
+        Pre-computes and caches numpy arrays for efficient repeated queries.
+
         Args:
             query: Packed binary query vector (size: packed_dim bytes)
             top_k: Number of nearest neighbors to return
@@ -638,27 +679,48 @@
             if len(self._vectors) == 0:
                 return [], []
 
-            # Compute Hamming distances to all vectors
+            # Build cache if needed (lazy initialization)
+            self._build_cache()
+
+            if self._vectors_matrix is None or self._ids_array is None:
+                return [], []
+
+            # Vectorized Hamming distance computation
+            # 1. Convert query to numpy array
             query_arr = np.frombuffer(query, dtype=np.uint8)
-            distances = []
-
-            for vec_id in self._id_list:
-                vec = self._vectors[vec_id]
-                vec_arr = np.frombuffer(vec, dtype=np.uint8)
-                # XOR and popcount for Hamming distance
-                xor = np.bitwise_xor(query_arr, vec_arr)
-                dist = int(np.unpackbits(xor).sum())
-                distances.append((vec_id, dist))
+            # 2. Broadcast XOR: (1, packed_dim) XOR (n_vectors, packed_dim)
+            #    Result shape: (n_vectors, packed_dim)
+            xor_result = np.bitwise_xor(query_arr, self._vectors_matrix)
 
-            # Sort by distance (ascending)
-            distances.sort(key=lambda x: x[1])
+            # 3. Vectorized popcount using lookup table for efficiency
+            #    np.unpackbits is slow for large arrays, use popcount LUT instead
+            popcount_lut = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
+            bit_counts = popcount_lut[xor_result]
 
-            # Return top-k
-            top_results = distances[:top_k]
-            ids = [r[0] for r in top_results]
-            dists = [r[1] for r in top_results]
+            # 4. Sum across packed bytes to get Hamming distance per vector
+            distances = bit_counts.sum(axis=1)
 
-            return ids, dists
+            # 5. Get top-k using argpartition (O(N) instead of O(N log N) for full sort)
+            n_vectors = len(distances)
+            k = min(top_k, n_vectors)
+
+            if k == n_vectors:
+                # No partitioning needed, just sort all
+                sorted_indices = np.argsort(distances)
+            else:
+                # Use argpartition for O(N) partial sort
+                partition_indices = np.argpartition(distances, k)[:k]
+                # Sort only the top-k
+                top_k_distances = distances[partition_indices]
+                sorted_order = np.argsort(top_k_distances)
+                sorted_indices = partition_indices[sorted_order]
+
+            # 6. Return results
+            result_ids = self._ids_array[sorted_indices].tolist()
+            result_dists = distances[sorted_indices].tolist()
+
+            return result_ids, result_dists
 
         except Exception as e:
             raise StorageError(f"Failed to search Binary ANN index: {e}")
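A small sanity check of the two claims the comments above make: the table-driven popcount agrees with the unpackbits-based distance it replaces, and argpartition returns the same top-k distances as a full sort (only tie order may differ). This is an illustrative sketch, not part of the change.

```python
import numpy as np

rng = np.random.default_rng(2)
matrix = rng.integers(0, 256, size=(2000, 32), dtype=np.uint8)
query = rng.integers(0, 256, size=32, dtype=np.uint8)

lut = np.array([bin(i).count("1") for i in range(256)], dtype=np.uint8)
xor = np.bitwise_xor(query, matrix)

dist_lut = lut[xor].sum(axis=1)
dist_unpack = np.unpackbits(xor, axis=1).sum(axis=1)
print(np.array_equal(dist_lut, dist_unpack))  # True: identical Hamming distances

k = 10
top_partition = np.sort(dist_lut[np.argpartition(dist_lut, k)[:k]])
top_full_sort = np.sort(dist_lut)[:k]
print(np.array_equal(top_partition, top_full_sort))  # True: same top-k distances
```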
@@ -797,6 +859,7 @@ class BinaryANNIndex:
             # Clear existing data
             self._vectors.clear()
             self._id_list.clear()
+            self._cache_valid = False
 
             # Read vectors
             for _ in range(num_vectors):
@@ -853,6 +916,9 @@ class BinaryANNIndex:
         with self._lock:
             self._vectors.clear()
             self._id_list.clear()
+            self._vectors_matrix = None
+            self._ids_array = None
+            self._cache_valid = False
             logger.debug("Cleared binary index")