mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-19 18:58:47 +08:00
fix: correct embedder API call in SearchPipeline and add E2E test script
SearchPipeline.search() called self._embedder.embed() which doesn't exist on BaseEmbedder/FastEmbedEmbedder — only embed_single() and embed_batch() are defined. This was masked by MockEmbedder in tests. Changed to embed_single() which is the correct API for single-query embedding. Also added scripts/test_small_e2e.py for quick end-to-end validation of indexing pipeline and all search features on a small file set. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
193
codex-lens-v2/scripts/test_small_e2e.py
Normal file
193
codex-lens-v2/scripts/test_small_e2e.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
"""
|
||||||
|
Small-folder end-to-end test: index the src/codexlens package (~21 .py files) and verify
|
||||||
|
indexing pipeline + all search features work correctly.
|
||||||
|
|
||||||
|
Usage: python scripts/test_small_e2e.py
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from codexlens.config import Config
|
||||||
|
from codexlens.core.factory import create_ann_index, create_binary_index
|
||||||
|
from codexlens.embed.local import FastEmbedEmbedder
|
||||||
|
from codexlens.indexing import IndexingPipeline
|
||||||
|
from codexlens.rerank.base import BaseReranker
|
||||||
|
from codexlens.search.fts import FTSEngine
|
||||||
|
from codexlens.search.pipeline import SearchPipeline
|
||||||
|
|
||||||
|
|
||||||
|
class KeywordReranker(BaseReranker):
    """Offline reranker that scores documents by keyword overlap with the query.

    Meant for testing: it needs no model download and no network access.
    """

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Return one score per document: the share of query words it contains.

        Both query and documents are lower-cased and split on whitespace.
        The denominator is clamped to 1, so an empty query scores 0.0 for
        every document instead of dividing by zero.
        """
        query_terms = set(query.lower().split())
        denominator = max(len(query_terms), 1)
        return [
            float(len(query_terms & set(text.lower().split()))) / denominator
            for text in documents
        ]
|
||||||
|
|
||||||
|
# Two levels up from this script (which lives in <project>/scripts/).
PROJECT = Path(__file__).parent.parent
# Source tree that gets indexed (~21 .py files, small).
TARGET_DIR = PROJECT.joinpath("src", "codexlens")
# Scratch directory passed to the index factories; main() removes it afterwards.
INDEX_DIR = PROJECT.joinpath(".test_index_cache")
# Source-file suffixes of interest (not referenced by the code visible here).
EXTENSIONS = {".py"}
|
||||||
|
|
||||||
|
# Running tallies of check() outcomes; main() reads them for the final summary.
passed = 0
failed = 0


def check(name: str, condition: bool, detail: str = "") -> None:
    """Record the outcome of one named check and print a PASS/FAIL line.

    Args:
        name: Label printed next to the outcome.
        condition: Truthy when the check succeeded.
        detail: Optional diagnostic text appended to a FAIL line.

    Side effects:
        Increments the module-level ``passed`` or ``failed`` counter.
    """
    global passed, failed
    if condition:
        passed += 1
        print(f" [PASS] {name}")
        return

    failed += 1
    # Several callers pass no detail; only emit the separator when there is
    # something to show, so the line does not end in a dangling " — ".
    suffix = f" — {detail}" if detail else ""
    print(f" [FAIL] {name}{suffix}")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run the small end-to-end smoke test and return a process exit code.

    Indexes every ``*.py`` file under ``TARGET_DIR`` using a
    ``FastEmbedEmbedder`` (not a mock), then exercises the binary store, the
    ANN index, the FTS engine and the full ``SearchPipeline``. Each
    expectation is recorded through ``check()``.

    The on-disk index directory is removed in a ``finally`` clause, so it is
    cleaned up even when a step raises.

    Returns:
        0 when no check failed, 1 otherwise.
    """
    import shutil  # only needed for the cleanup in the ``finally`` clause

    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    try:
        config = Config(
            embed_model="BAAI/bge-small-en-v1.5",
            embed_dim=384,
            embed_batch_size=32,
            hnsw_ef=100,
            hnsw_M=16,
            binary_top_k=100,
            ann_top_k=30,
            reranker_model="BAAI/bge-reranker-base",
            reranker_top_k=10,
        )

        files = [p for p in TARGET_DIR.rglob("*.py") if p.is_file()]
        print(f"Target: {TARGET_DIR} ({len(files)} .py files)\n")

        # -- 1. IndexingPipeline ------------------------------------------
        print("=== 1. IndexingPipeline (parallel) ===")
        embedder = FastEmbedEmbedder(config)
        binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config)
        ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config)
        fts = FTSEngine(":memory:")

        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments.
        t0 = time.perf_counter()
        stats = IndexingPipeline(
            embedder=embedder,
            binary_store=binary_store,
            ann_index=ann_index,
            fts=fts,
            config=config,
        ).index_files(files, root=TARGET_DIR, max_chunk_chars=800, chunk_overlap=100)
        elapsed = time.perf_counter() - t0

        check("files_processed > 0", stats.files_processed > 0, f"got {stats.files_processed}")
        check("chunks_created > 0", stats.chunks_created > 0, f"got {stats.chunks_created}")
        check("indexing completed", elapsed < 120, f"took {elapsed:.1f}s")
        print(f" Stats: {stats.files_processed} files, {stats.chunks_created} chunks, {elapsed:.1f}s\n")

        # -- 2. BinaryStore coarse search ---------------------------------
        print("=== 2. BinaryStore coarse search ===")
        q_vec = embedder.embed_single("def search")
        b_ids, b_dists = binary_store.coarse_search(q_vec, top_k=10)
        check("binary returns results", len(b_ids) > 0, f"got {len(b_ids)}")
        # Inspect the returned element types directly: wrapping each value in
        # int() first would make the isinstance test trivially true.
        check(
            "binary ids are ints",
            all(isinstance(i, (int, np.integer)) for i in b_ids),
            f"got types {sorted({type(i).__name__ for i in b_ids})}",
        )
        print(f" Top 5 binary IDs: {b_ids[:5]}\n")

        # -- 3. ANNIndex fine search --------------------------------------
        print("=== 3. ANNIndex fine search ===")
        a_ids, a_dists = ann_index.fine_search(q_vec, top_k=10)
        check("ann returns results", len(a_ids) > 0, f"got {len(a_ids)}")
        check(
            "ann scores are floats",
            all(isinstance(d, (float, np.floating)) for d in a_dists),
            f"got types {sorted({type(d).__name__ for d in a_dists})}",
        )
        print(f" Top 5 ANN IDs: {a_ids[:5]}\n")

        # -- 4. FTSEngine (exact + fuzzy) ---------------------------------
        print("=== 4. FTSEngine search ===")
        exact = fts.exact_search("def search", top_k=5)
        fuzzy = fts.fuzzy_search("embedd", top_k=5)
        check("exact search returns results", len(exact) > 0, f"got {len(exact)}")
        check("fuzzy search returns results", len(fuzzy) > 0, f"got {len(fuzzy)}")
        print(f" Exact hits: {len(exact)}, Fuzzy hits: {len(fuzzy)}\n")

        # -- 5. SearchPipeline (FTS + vector + fusion + rerank) -----------
        print("=== 5. SearchPipeline (full pipeline) ===")
        reranker = KeywordReranker()
        search = SearchPipeline(
            embedder=embedder,
            binary_store=binary_store,
            ann_index=ann_index,
            reranker=reranker,
            fts=fts,
            config=config,
        )

        queries = [
            ("def embed_single", "code symbol search"),
            ("search pipeline fusion", "natural language search"),
            ("Config dataclass", "exact match search"),
            ("binary store hamming", "domain-specific search"),
            ("", "empty query handling"),
        ]

        for query, desc in queries:
            t0 = time.perf_counter()
            results = search.search(query, top_k=5)
            ms = (time.perf_counter() - t0) * 1000

            if query == "":
                # The empty query only has to come back as a list.
                check(f"{desc}: no crash", isinstance(results, list))
            else:
                check(f"{desc}: returns results", len(results) > 0, f"'{query}' got 0 results")
                if results:
                    check(f"{desc}: has scores", all(r.score >= 0 for r in results))
                    check(f"{desc}: has paths", all(r.path for r in results))
                    check(f"{desc}: respects top_k", len(results) <= 5)
                    print(f" Top result: [{results[0].score:.3f}] {results[0].path}")
            print(f" Latency: {ms:.0f}ms")

        # -- 6. Result quality (sanity) -----------------------------------
        print("\n=== 6. Result quality sanity checks ===")
        quality_cases = [
            ("BinaryStore add coarse_search",
             "BinaryStore query -> binary.py in results",
             ("binary",)),
            ("FTSEngine exact_search fuzzy_search",
             "FTSEngine query -> fts.py in results",
             ("fts",)),
            ("IndexingPipeline parallel queue",
             "Pipeline query -> pipeline in results",
             ("pipeline", "indexing")),
        ]
        for query, label, needles in quality_cases:
            # An empty result set must count as a failure; skipping the check
            # would let a broken pipeline report "ALL TESTS PASSED".
            paths = [r.path for r in (search.search(query, top_k=3) or [])]
            check(
                label,
                any(needle in p for p in paths for needle in needles),
                f"got paths: {paths}",
            )

        # -- Summary ------------------------------------------------------
        print(f"\n{'=' * 50}")
        print(f"Results: {passed} passed, {failed} failed, {passed + failed} total")
        if failed == 0:
            print("ALL TESTS PASSED")
        else:
            print(f"WARNING: {failed} test(s) failed")
        print(f"{'=' * 50}")

        return 0 if failed == 0 else 1
    finally:
        # Always remove the scratch index, including when a step above raised.
        shutil.rmtree(INDEX_DIR, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Use main()'s return value (0 = all checks passed, 1 = failures) as the
    # process exit status.
    raise SystemExit(main())
|
||||||
@@ -100,7 +100,7 @@ class SearchPipeline:
|
|||||||
weights = get_adaptive_weights(intent, cfg.fusion_weights)
|
weights = get_adaptive_weights(intent, cfg.fusion_weights)
|
||||||
|
|
||||||
# 2. Embed query
|
# 2. Embed query
|
||||||
query_vec = self._embedder.embed([query])[0]
|
query_vec = self._embedder.embed_single(query)
|
||||||
|
|
||||||
# 3. Parallel vector + FTS search
|
# 3. Parallel vector + FTS search
|
||||||
vector_results: list[tuple[int, float]] = []
|
vector_results: list[tuple[int, float]] = []
|
||||||
|
|||||||
Reference in New Issue
Block a user