diff --git a/codex-lens-v2/scripts/test_small_e2e.py b/codex-lens-v2/scripts/test_small_e2e.py
new file mode 100644
index 00000000..1e6b3b64
--- /dev/null
+++ b/codex-lens-v2/scripts/test_small_e2e.py
@@ -0,0 +1,206 @@
+"""
+Small-folder end-to-end test: index the src/codexlens tree (~21 .py files) and
+verify the indexing pipeline + all search features work correctly.
+
+Usage: python scripts/test_small_e2e.py
+"""
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+import numpy as np
+from codexlens.config import Config
+from codexlens.core.factory import create_ann_index, create_binary_index
+from codexlens.embed.local import FastEmbedEmbedder
+from codexlens.indexing import IndexingPipeline
+from codexlens.rerank.base import BaseReranker
+from codexlens.search.fts import FTSEngine
+from codexlens.search.pipeline import SearchPipeline
+
+
+class KeywordReranker(BaseReranker):
+    """Simple keyword-overlap reranker for testing without network."""
+    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
+        q_words = set(query.lower().split())
+        scores = []
+        for doc in documents:
+            d_words = set(doc.lower().split())
+            overlap = len(q_words & d_words)
+            scores.append(float(overlap) / max(len(q_words), 1))
+        return scores
+
+PROJECT = Path(__file__).parent.parent
+TARGET_DIR = PROJECT / "src" / "codexlens"  # ~21 .py files, small
+INDEX_DIR = PROJECT / ".test_index_cache"
+EXTENSIONS = {".py"}
+
+passed = 0
+failed = 0
+
+
+def check(name: str, condition: bool, detail: str = ""):
+    global passed, failed
+    if condition:
+        passed += 1
+        print(f"  [PASS] {name}")
+    else:
+        failed += 1
+        print(f"  [FAIL] {name} — {detail}")
+
+
+def main():
+    INDEX_DIR.mkdir(parents=True, exist_ok=True)
+
+    config = Config(
+        embed_model="BAAI/bge-small-en-v1.5",
+        embed_dim=384,
+        embed_batch_size=32,
+        hnsw_ef=100,
+        hnsw_M=16,
+        binary_top_k=100,
+        ann_top_k=30,
+        reranker_model="BAAI/bge-reranker-base",
+        reranker_top_k=10,
+    )
+
+    files = [p for p in TARGET_DIR.rglob("*") if p.is_file() and p.suffix in EXTENSIONS]
+    print(f"Target: {TARGET_DIR} ({len(files)} .py files)\n")
+
+    # ── 1. Test IndexingPipeline ──────────────────────────────
+    print("=== 1. IndexingPipeline (parallel) ===")
+    embedder = FastEmbedEmbedder(config)
+    binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config)
+    ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config)
+    fts = FTSEngine(":memory:")
+
+    t0 = time.time()
+    stats = IndexingPipeline(
+        embedder=embedder,
+        binary_store=binary_store,
+        ann_index=ann_index,
+        fts=fts,
+        config=config,
+    ).index_files(files, root=TARGET_DIR, max_chunk_chars=800, chunk_overlap=100)
+    elapsed = time.time() - t0
+
+    check("files_processed > 0", stats.files_processed > 0, f"got {stats.files_processed}")
+    check("chunks_created > 0", stats.chunks_created > 0, f"got {stats.chunks_created}")
+    check("indexing completed", elapsed < 120, f"took {elapsed:.1f}s")
+    print(f"  Stats: {stats.files_processed} files, {stats.chunks_created} chunks, {elapsed:.1f}s\n")
+
+    # ── 2. Test BinaryStore (pre-allocated, coarse search) ────
+    print("=== 2. BinaryStore coarse search ===")
+    q_vec = embedder.embed_single("def search")
+    b_ids, b_dists = binary_store.coarse_search(q_vec, top_k=10)
+    check("binary returns results", len(b_ids) > 0, f"got {len(b_ids)}")
+    check("binary ids are ints", all(int(i) == i for i in b_ids))
+    print(f"  Top 5 binary IDs: {b_ids[:5]}\n")
+
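+    # Note: sections 2 and 3 probe the two vector stages in isolation, reusing
+    # the same q_vec: a coarse Hamming scan over the binary store, then the
+    # HNSW index over full-precision vectors. (binary_top_k=100 vs ann_top_k=30
+    # in the config suggest a coarse-to-fine cascade in the full pipeline.)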
+    # ── 3. Test ANNIndex (fine search) ────────────────────────
+    print("=== 3. ANNIndex fine search ===")
+    a_ids, a_dists = ann_index.fine_search(q_vec, top_k=10)
+    check("ann returns results", len(a_ids) > 0, f"got {len(a_ids)}")
+    check("ann distances are finite floats", all(np.isfinite(float(d)) for d in a_dists))
+    print(f"  Top 5 ANN IDs: {a_ids[:5]}\n")
+
+    # ── 4. Test FTSEngine (exact + fuzzy) ─────────────────────
+    print("=== 4. FTSEngine search ===")
+    exact = fts.exact_search("def search", top_k=5)
+    fuzzy = fts.fuzzy_search("embedd", top_k=5)
+    check("exact search returns results", len(exact) > 0, f"got {len(exact)}")
+    check("fuzzy search returns results", len(fuzzy) > 0, f"got {len(fuzzy)}")
+    print(f"  Exact hits: {len(exact)}, Fuzzy hits: {len(fuzzy)}\n")
+
+    # ── 5. Test SearchPipeline (parallel FTS||vector + fusion + rerank) ──
+    print("=== 5. SearchPipeline (full pipeline) ===")
+    reranker = KeywordReranker()
+    search = SearchPipeline(
+        embedder=embedder,
+        binary_store=binary_store,
+        ann_index=ann_index,
+        reranker=reranker,
+        fts=fts,
+        config=config,
+    )
+
+    queries = [
+        ("def embed_single", "code symbol search"),
+        ("search pipeline fusion", "natural language search"),
+        ("Config dataclass", "exact match search"),
+        ("binary store hamming", "domain-specific search"),
+        ("", "empty query handling"),
+    ]
+
+    for query, desc in queries:
+        t0 = time.time()
+        results = search.search(query, top_k=5)
+        ms = (time.time() - t0) * 1000
+
+        if query == "":
+            check(f"{desc}: no crash", isinstance(results, list))
+        else:
+            check(f"{desc}: returns results", len(results) > 0, f"'{query}' got 0 results")
+            if results:
+                check(f"{desc}: has scores", all(r.score >= 0 for r in results))
+                check(f"{desc}: has paths", all(r.path for r in results))
+                check(f"{desc}: respects top_k", len(results) <= 5)
+                print(f"  Top result: [{results[0].score:.3f}] {results[0].path}")
+        print(f"  Latency: {ms:.0f}ms")
+
+    # ── 6. Test result quality (sanity) ───────────────────────
+    print("\n=== 6. Result quality sanity checks ===")
+    r1 = search.search("BinaryStore add coarse_search", top_k=3)
+    if r1:
+        paths = [r.path for r in r1]
+        check("BinaryStore query -> binary.py in results",
+              any("binary" in p for p in paths),
+              f"got paths: {paths}")
+
+    r2 = search.search("FTSEngine exact_search fuzzy_search", top_k=3)
+    if r2:
+        paths = [r.path for r in r2]
+        check("FTSEngine query -> fts.py in results",
+              any("fts" in p for p in paths),
+              f"got paths: {paths}")
+
+    r3 = search.search("IndexingPipeline parallel queue", top_k=3)
+    if r3:
+        paths = [r.path for r in r3]
+        check("Pipeline query -> pipeline in results",
+              any("pipeline" in p or "indexing" in p for p in paths),
+              f"got paths: {paths}")
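+
+    # One more probe in the same pattern (hedged: assumes chunk paths retain the
+    # package directory). FastEmbedEmbedder lives under codexlens/embed/, so an
+    # embedder-centric query should surface that package.
+    r4 = search.search("FastEmbedEmbedder embed_single", top_k=3)
+    if r4:
+        paths = [r.path for r in r4]
+        check("Embedder query -> embed module in results",
+              any("embed" in p for p in paths),
+              f"got paths: {paths}")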
+
+    # ── Summary ───────────────────────────────────────────────
+    print(f"\n{'=' * 50}")
+    print(f"Results: {passed} passed, {failed} failed, {passed + failed} total")
+    if failed == 0:
+        print("ALL TESTS PASSED")
+    else:
+        print(f"WARNING: {failed} test(s) failed")
+    print(f"{'=' * 50}")
+
+    # Cleanup
+    import shutil
+    shutil.rmtree(INDEX_DIR, ignore_errors=True)
+
+    return 0 if failed == 0 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/codex-lens-v2/src/codexlens/search/pipeline.py b/codex-lens-v2/src/codexlens/search/pipeline.py
index 21e2810e..d3eb51e4 100644
--- a/codex-lens-v2/src/codexlens/search/pipeline.py
+++ b/codex-lens-v2/src/codexlens/search/pipeline.py
@@ -100,7 +100,7 @@ class SearchPipeline:
         weights = get_adaptive_weights(intent, cfg.fusion_weights)
 
         # 2. Embed query
-        query_vec = self._embedder.embed([query])[0]
+        query_vec = self._embedder.embed_single(query)
 
         # 3. Parallel vector + FTS search
         vector_results: list[tuple[int, float]] = []