Implement search and reranking functionality with FTS and embedding support

- Added BaseReranker abstract class for defining reranking interfaces. - Implemented FastEmbedReranker using fastembed's TextCrossEncoder for scoring document-query pairs. - Introduced FTSEngine for full-text search capabilities using SQLite FTS5. - Developed SearchPipeline to integrate embedding, binary search, ANN indexing, FTS, and reranking. - Added fusion methods for combining results from different search strategies using Reciprocal Rank Fusion. - Created unit and integration tests for the new search and reranking components. - Established configuration management for search parameters and models.
2026-03-19 18:58:47 +08:00 · 2026-03-16 23:03:17 +08:00
parent 5a4b18d9b1
commit de4158597b
41 changed files with 2655 additions and 1848 deletions
--- a/codex-lens-v2/scripts/index_and_search.py
+++ b/codex-lens-v2/scripts/index_and_search.py
@@ -0,0 +1,128 @@
+"""
+对 D:/Claude_dms3 仓库进行索引并测试搜索。
+用法: python scripts/index_and_search.py
+"""
+import sys
+import time
+from pathlib import Path
+
+# 确保 src 可被导入
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from codexlens.config import Config
+from codexlens.core.factory import create_ann_index, create_binary_index
+from codexlens.embed.local import FastEmbedEmbedder
+from codexlens.indexing import IndexingPipeline
+from codexlens.rerank.local import FastEmbedReranker
+from codexlens.search.fts import FTSEngine
+from codexlens.search.pipeline import SearchPipeline
+
+# ─── 配置 ──────────────────────────────────────────────────────────────────
+REPO_ROOT = Path("D:/Claude_dms3")
+INDEX_DIR = Path("D:/Claude_dms3/codex-lens-v2/.index_cache")
+EXTENSIONS = {".py", ".ts", ".js", ".md"}
+MAX_FILE_SIZE = 50_000   # bytes
+MAX_CHUNK_CHARS = 800    # 每个 chunk 的最大字符数
+CHUNK_OVERLAP = 100
+
+# ─── 文件收集 ───────────────────────────────────────────────────────────────
+SKIP_DIRS = {
+    ".git", "node_modules", "__pycache__", ".pytest_cache",
+    "dist", "build", ".venv", "venv", ".cache", ".index_cache",
+    "codex-lens-v2",  # 不索引自身
+}
+
+def collect_files(root: Path) -> list[Path]:
+    files = []
+    for p in root.rglob("*"):
+        if any(part in SKIP_DIRS for part in p.parts):
+            continue
+        if p.is_file() and p.suffix in EXTENSIONS:
+            if p.stat().st_size <= MAX_FILE_SIZE:
+                files.append(p)
+    return files
+
+# ─── 主流程 ─────────────────────────────────────────────────────────────────
+def main():
+    INDEX_DIR.mkdir(parents=True, exist_ok=True)
+
+    # 1. 使用小 profile 加快速度
+    config = Config(
+        embed_model="BAAI/bge-small-en-v1.5",
+        embed_dim=384,
+        embed_batch_size=32,
+        hnsw_ef=100,
+        hnsw_M=16,
+        binary_top_k=100,
+        ann_top_k=30,
+        reranker_top_k=10,
+    )
+
+    print("=== codex-lens-v2 索引测试 ===\n")
+
+    # 2. 收集文件
+    print(f"[1/4] 扫描 {REPO_ROOT} ...")
+    files = collect_files(REPO_ROOT)
+    print(f"      找到 {len(files)} 个文件")
+
+    # 3. 初始化组件
+    print(f"\n[2/4] 加载嵌入模型 (bge-small-en-v1.5, dim=384) ...")
+    embedder = FastEmbedEmbedder(config)
+    binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config)
+    ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config)
+    fts = FTSEngine(":memory:")   # 内存 FTS，不持久化
+
+    # 4. 使用 IndexingPipeline 并行索引 (chunk -> embed -> index)
+    print(f"[3/4] 并行索引 {len(files)} 个文件 ...")
+    pipeline = IndexingPipeline(
+        embedder=embedder,
+        binary_store=binary_store,
+        ann_index=ann_index,
+        fts=fts,
+        config=config,
+    )
+    stats = pipeline.index_files(
+        files,
+        root=REPO_ROOT,
+        max_chunk_chars=MAX_CHUNK_CHARS,
+        chunk_overlap=CHUNK_OVERLAP,
+        max_file_size=MAX_FILE_SIZE,
+    )
+    print(f"      索引完成: {stats.files_processed} 文件, {stats.chunks_created} chunks ({stats.duration_seconds:.1f}s)")
+
+    # 5. 搜索测试
+    print(f"\n[4/4] 构建 SearchPipeline ...")
+    reranker = FastEmbedReranker(config)
+    pipeline = SearchPipeline(
+        embedder=embedder,
+        binary_store=binary_store,
+        ann_index=ann_index,
+        reranker=reranker,
+        fts=fts,
+        config=config,
+    )
+
+    queries = [
+        "authentication middleware function",
+        "def embed_single",
+        "RRF fusion weights",
+        "fastembed TextCrossEncoder reranker",
+        "how to search code semantic",
+    ]
+
+    print("\n" + "=" * 60)
+    for query in queries:
+        t0 = time.time()
+        results = pipeline.search(query, top_k=5)
+        elapsed = time.time() - t0
+        print(f"\nQuery: {query!r}  ({elapsed*1000:.0f}ms)")
+        if results:
+            for r in results:
+                print(f"  [{r.score:.3f}] {r.path}")
+        else:
+            print("  (无结果)")
+    print("=" * 60)
+    print("\n测试完成 ✓")
+
+if __name__ == "__main__":
+    main()