Implement search and reranking functionality with FTS and embedding support

- Added BaseReranker abstract class for defining reranking interfaces.
- Implemented FastEmbedReranker using fastembed's TextCrossEncoder for scoring document-query pairs.
- Introduced FTSEngine for full-text search capabilities using SQLite FTS5.
- Developed SearchPipeline to integrate embedding, binary search, ANN indexing, FTS, and reranking.
- Added fusion methods for combining results from different search strategies using Reciprocal Rank Fusion.
- Created unit and integration tests for the new search and reranking components.
- Established configuration management for search parameters and models.
This commit is contained in:
catlog22
2026-03-16 23:03:17 +08:00
parent 5a4b18d9b1
commit de4158597b
41 changed files with 2655 additions and 1848 deletions

View File

@@ -0,0 +1,128 @@
"""
Index the D:/Claude_dms3 repository and run test searches against the result.
Usage: python scripts/index_and_search.py
"""
import sys
import time
from pathlib import Path
# Make sure `src` is importable: prepend <this script's parent dir>/../src to
# sys.path. This has to run before the `codexlens` imports that follow.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.config import Config
from codexlens.core.factory import create_ann_index, create_binary_index
from codexlens.embed.local import FastEmbedEmbedder
from codexlens.indexing import IndexingPipeline
from codexlens.rerank.local import FastEmbedReranker
from codexlens.search.fts import FTSEngine
from codexlens.search.pipeline import SearchPipeline
# ─── Configuration ──────────────────────────────────────────────────────────
# Repository that gets scanned, and the directory (inside it) that is created
# for and handed to the index factories.
REPO_ROOT = Path("D:/Claude_dms3")
INDEX_DIR = REPO_ROOT / "codex-lens-v2" / ".index_cache"

# File selection: only these suffixes, and only files up to MAX_FILE_SIZE.
EXTENSIONS = {".md", ".js", ".ts", ".py"}
MAX_FILE_SIZE = 50 * 1000  # bytes

# Chunking parameters passed to the indexing pipeline.
MAX_CHUNK_CHARS = 800  # maximum number of characters per chunk
CHUNK_OVERLAP = 100
# ─── File collection ────────────────────────────────────────────────────────
# Names that exclude a path from collection when they appear as one of its
# components.
SKIP_DIRS = {
    ".git",
    "node_modules",
    "__pycache__",
    ".pytest_cache",
    "dist",
    "build",
    ".venv",
    "venv",
    ".cache",
    ".index_cache",
    "codex-lens-v2",  # do not index this tool itself
}
def collect_files(
    root: Path,
    *,
    extensions: "set[str] | None" = None,
    skip_dirs: "set[str] | None" = None,
    max_file_size: "int | None" = None,
) -> list[Path]:
    """Return the files under *root* that should be indexed.

    A path is kept when it is a regular file, its suffix is in the extension
    set, its size does not exceed the size limit, and none of its components
    *below root* is in the skip set.

    Args:
        root: Directory to scan recursively.
        extensions: Allowed file suffixes (compared case-sensitively, leading
            dot included). Defaults to the module-level ``EXTENSIONS``.
        skip_dirs: Component names that exclude a path. Defaults to the
            module-level ``SKIP_DIRS``.
        max_file_size: Largest accepted file size in bytes (inclusive).
            Defaults to the module-level ``MAX_FILE_SIZE``.

    Returns:
        The matching paths, in the order ``root.rglob("*")`` yields them.
    """
    # Resolve defaults lazily so callers that pass everything explicitly do
    # not depend on the module-level constants.
    wanted = EXTENSIONS if extensions is None else extensions
    skipped = SKIP_DIRS if skip_dirs is None else skip_dirs
    size_limit = MAX_FILE_SIZE if max_file_size is None else max_file_size

    files: list[Path] = []
    for p in root.rglob("*"):
        # Only look at the components beneath root. Testing the full path
        # would also match the directories leading up to root, so a repo
        # checked out under e.g. ".../build/" would yield nothing at all.
        if any(part in skipped for part in p.relative_to(root).parts):
            continue
        # Cheap suffix test first; is_file()/stat() touch the filesystem.
        if p.suffix not in wanted or not p.is_file():
            continue
        if p.stat().st_size <= size_limit:
            files.append(p)
    return files
# ─── Main flow ──────────────────────────────────────────────────────────────
def main():
    """Index the files under REPO_ROOT, then run a few sample searches.

    Steps, as printed to stdout:
      1. scan REPO_ROOT with ``collect_files``;
      2. build the embedder, binary store, ANN index and an in-memory FTS engine;
      3. feed the collected files through ``IndexingPipeline.index_files``;
      4. build a ``SearchPipeline`` over the same components plus a reranker,
         run each sample query with ``top_k=5`` and print score, path and
         per-query latency.

    Creates INDEX_DIR if it does not exist. Returns nothing.
    """
    INDEX_DIR.mkdir(parents=True, exist_ok=True)

    # 1. Use a small profile to keep the run fast.
    config = Config(
        embed_model="BAAI/bge-small-en-v1.5",
        embed_dim=384,
        embed_batch_size=32,
        hnsw_ef=100,
        hnsw_M=16,
        binary_top_k=100,
        ann_top_k=30,
        reranker_top_k=10,
    )
    print("=== codex-lens-v2 索引测试 ===\n")

    # 2. Collect files.
    print(f"[1/4] 扫描 {REPO_ROOT} ...")
    files = collect_files(REPO_ROOT)
    print(f" 找到 {len(files)} 个文件")

    # 3. Initialise the components shared by indexing and search.
    print("\n[2/4] 加载嵌入模型 (bge-small-en-v1.5, dim=384) ...")
    embedder = FastEmbedEmbedder(config)
    binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config)
    ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config)
    fts = FTSEngine(":memory:")  # in-memory FTS, not persisted

    # 4. Index through IndexingPipeline (chunk -> embed -> index; the original
    #    author's note describes this step as parallel).
    print(f"[3/4] 并行索引 {len(files)} 个文件 ...")
    indexer = IndexingPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        fts=fts,
        config=config,
    )
    stats = indexer.index_files(
        files,
        root=REPO_ROOT,
        max_chunk_chars=MAX_CHUNK_CHARS,
        chunk_overlap=CHUNK_OVERLAP,
        max_file_size=MAX_FILE_SIZE,
    )
    print(f" 索引完成: {stats.files_processed} 文件, {stats.chunks_created} chunks ({stats.duration_seconds:.1f}s)")

    # 5. Search test.
    print("\n[4/4] 构建 SearchPipeline ...")
    reranker = FastEmbedReranker(config)
    searcher = SearchPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        reranker=reranker,
        fts=fts,
        config=config,
    )
    queries = [
        "authentication middleware function",
        "def embed_single",
        "RRF fusion weights",
        "fastembed TextCrossEncoder reranker",
        "how to search code semantic",
    ]
    print("\n" + "=" * 60)
    for query in queries:
        # perf_counter is the clock meant for measuring short intervals;
        # time.time() follows the wall clock and can jump if it is adjusted.
        t0 = time.perf_counter()
        results = searcher.search(query, top_k=5)
        elapsed = time.perf_counter() - t0
        print(f"\nQuery: {query!r} ({elapsed*1000:.0f}ms)")
        if results:
            for r in results:
                print(f" [{r.score:.3f}] {r.path}")
        else:
            print(" (无结果)")
    print("=" * 60)
    print("\n测试完成 ✓")
# Run the index-and-search demo only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()