Implement search and reranking functionality with FTS and embedding support

- Added BaseReranker abstract class for defining reranking interfaces.
- Implemented FastEmbedReranker using fastembed's TextCrossEncoder for scoring document-query pairs.
- Introduced FTSEngine for full-text search capabilities using SQLite FTS5.
- Developed SearchPipeline to integrate embedding, binary search, ANN indexing, FTS, and reranking.
- Added fusion methods for combining results from different search strategies using Reciprocal Rank Fusion.
- Created unit and integration tests for the new search and reranking components.
- Established configuration management for search parameters and models.
This commit is contained in:
catlog22
2026-03-16 23:03:17 +08:00
parent 5a4b18d9b1
commit de4158597b
41 changed files with 2655 additions and 1848 deletions

View File

@@ -0,0 +1,5 @@
import os
import sys

# Make the repo-local ``src`` tree shadow any installed codexlens package.
_SRC_DIR = os.path.join(os.path.dirname(__file__), "src")
sys.path.insert(0, _SRC_DIR)

View File

@@ -0,0 +1,36 @@
# PEP 517 build configuration: hatchling builds the wheel/sdist.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "codex-lens-v2"
version = "0.1.0"
description = "Minimal code semantic search library with 2-stage pipeline"
requires-python = ">=3.10"
# Core install is dependency-free; all heavy deps are opt-in extras below.
dependencies = []

[project.optional-dependencies]
# Semantic search stack: ANN index + embeddings.
semantic = [
    "hnswlib>=0.8.0",
    "numpy>=1.26",
    "fastembed>=0.4.0,<2.0",
]
# GPU execution for ONNX-based embedding/reranking.
gpu = [
    "onnxruntime-gpu>=1.16",
]
# Alternative ANN/binary backends (pick CPU or GPU build of faiss).
faiss-cpu = [
    "faiss-cpu>=1.7.4",
]
faiss-gpu = [
    "faiss-gpu>=1.7.4",
]
# HTTP client for the remote reranker API backend.
reranker-api = [
    "httpx>=0.25",
]
dev = [
    "pytest>=7.0",
    "pytest-cov",
]

# Ship only the package sources from src/ in the wheel.
[tool.hatch.build.targets.wheel]
packages = ["src/codexlens"]

View File

@@ -0,0 +1,128 @@
"""
对 D:/Claude_dms3 仓库进行索引并测试搜索。
用法: python scripts/index_and_search.py
"""
import sys
import time
from pathlib import Path
# 确保 src 可被导入
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from codexlens.config import Config
from codexlens.core.factory import create_ann_index, create_binary_index
from codexlens.embed.local import FastEmbedEmbedder
from codexlens.indexing import IndexingPipeline
from codexlens.rerank.local import FastEmbedReranker
from codexlens.search.fts import FTSEngine
from codexlens.search.pipeline import SearchPipeline
# ─── 配置 ──────────────────────────────────────────────────────────────────
REPO_ROOT = Path("D:/Claude_dms3")
INDEX_DIR = Path("D:/Claude_dms3/codex-lens-v2/.index_cache")
EXTENSIONS = {".py", ".ts", ".js", ".md"}
MAX_FILE_SIZE = 50_000 # bytes
MAX_CHUNK_CHARS = 800 # 每个 chunk 的最大字符数
CHUNK_OVERLAP = 100
# ─── 文件收集 ───────────────────────────────────────────────────────────────
SKIP_DIRS = {
".git", "node_modules", "__pycache__", ".pytest_cache",
"dist", "build", ".venv", "venv", ".cache", ".index_cache",
"codex-lens-v2", # 不索引自身
}
def collect_files(root: Path) -> list[Path]:
    """Return indexable files under *root*: allowed extension, size-capped, outside SKIP_DIRS."""
    selected: list[Path] = []
    for entry in root.rglob("*"):
        if any(part in SKIP_DIRS for part in entry.parts):
            continue
        if not entry.is_file() or entry.suffix not in EXTENSIONS:
            continue
        if entry.stat().st_size <= MAX_FILE_SIZE:
            selected.append(entry)
    return selected
# ─── 主流程 ─────────────────────────────────────────────────────────────────
def main():
    """Index the target repository with a small profile, then run sample searches."""
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    # 1. Use a small profile for speed
    config = Config(
        embed_model="BAAI/bge-small-en-v1.5",
        embed_dim=384,
        embed_batch_size=32,
        hnsw_ef=100,
        hnsw_M=16,
        binary_top_k=100,
        ann_top_k=30,
        reranker_top_k=10,
    )
    print("=== codex-lens-v2 索引测试 ===\n")
    # 2. Collect candidate files
    print(f"[1/4] 扫描 {REPO_ROOT} ...")
    files = collect_files(REPO_ROOT)
    print(f" 找到 {len(files)} 个文件")
    # 3. Initialize components
    print(f"\n[2/4] 加载嵌入模型 (bge-small-en-v1.5, dim=384) ...")
    embedder = FastEmbedEmbedder(config)
    binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config)
    ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config)
    fts = FTSEngine(":memory:") # in-memory FTS, not persisted
    # 4. Index files in parallel via IndexingPipeline (chunk -> embed -> index)
    print(f"[3/4] 并行索引 {len(files)} 个文件 ...")
    pipeline = IndexingPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        fts=fts,
        config=config,
    )
    stats = pipeline.index_files(
        files,
        root=REPO_ROOT,
        max_chunk_chars=MAX_CHUNK_CHARS,
        chunk_overlap=CHUNK_OVERLAP,
        max_file_size=MAX_FILE_SIZE,
    )
    print(f" 索引完成: {stats.files_processed} 文件, {stats.chunks_created} chunks ({stats.duration_seconds:.1f}s)")
    # 5. Search smoke test
    print(f"\n[4/4] 构建 SearchPipeline ...")
    reranker = FastEmbedReranker(config)
    # NOTE(review): rebinds ``pipeline`` from the IndexingPipeline above.
    pipeline = SearchPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        reranker=reranker,
        fts=fts,
        config=config,
    )
    queries = [
        "authentication middleware function",
        "def embed_single",
        "RRF fusion weights",
        "fastembed TextCrossEncoder reranker",
        "how to search code semantic",
    ]
    print("\n" + "=" * 60)
    for query in queries:
        t0 = time.time()
        results = pipeline.search(query, top_k=5)
        elapsed = time.time() - t0
        print(f"\nQuery: {query!r} ({elapsed*1000:.0f}ms)")
        if results:
            for r in results:
                print(f" [{r.score:.3f}] {r.path}")
        else:
            print(" (无结果)")
    print("=" * 60)
    print("\n测试完成 ✓")
if __name__ == "__main__":
main()

View File

View File

@@ -0,0 +1,99 @@
from __future__ import annotations
import logging
from dataclasses import dataclass, field
log = logging.getLogger(__name__)
@dataclass
class Config:
# Embedding
embed_model: str = "jinaai/jina-embeddings-v2-base-code"
embed_dim: int = 768
embed_batch_size: int = 64
# GPU / execution providers
device: str = "auto" # 'auto', 'cuda', 'cpu'
embed_providers: list[str] | None = None # explicit ONNX providers override
# Backend selection: 'auto', 'faiss', 'hnswlib'
ann_backend: str = "auto"
binary_backend: str = "auto"
# Indexing pipeline
index_workers: int = 2 # number of parallel indexing workers
# HNSW index (ANNIndex)
hnsw_ef: int = 150
hnsw_M: int = 32
hnsw_ef_construction: int = 200
# Binary coarse search (BinaryStore)
binary_top_k: int = 200
# ANN fine search
ann_top_k: int = 50
# Reranker
reranker_model: str = "BAAI/bge-reranker-v2-m3"
reranker_top_k: int = 20
reranker_batch_size: int = 32
# API reranker (optional)
reranker_api_url: str = ""
reranker_api_key: str = ""
reranker_api_model: str = ""
reranker_api_max_tokens_per_batch: int = 2048
# FTS
fts_top_k: int = 50
# Fusion
fusion_k: int = 60 # RRF k parameter
fusion_weights: dict = field(default_factory=lambda: {
"exact": 0.25,
"fuzzy": 0.10,
"vector": 0.50,
"graph": 0.15,
})
def resolve_embed_providers(self) -> list[str]:
"""Return ONNX execution providers based on device config.
Priority: explicit embed_providers > device setting > auto-detect.
"""
if self.embed_providers is not None:
return list(self.embed_providers)
if self.device == "cuda":
return ["CUDAExecutionProvider", "CPUExecutionProvider"]
if self.device == "cpu":
return ["CPUExecutionProvider"]
# auto-detect
try:
import onnxruntime
available = onnxruntime.get_available_providers()
if "CUDAExecutionProvider" in available:
log.info("CUDA detected via onnxruntime, using GPU for embedding")
return ["CUDAExecutionProvider", "CPUExecutionProvider"]
except ImportError:
pass
return ["CPUExecutionProvider"]
@classmethod
def defaults(cls) -> "Config":
return cls()
@classmethod
def small(cls) -> "Config":
"""Smaller config for testing or small corpora."""
return cls(
hnsw_ef=50,
hnsw_M=16,
binary_top_k=50,
ann_top_k=20,
reranker_top_k=10,
)

View File

@@ -0,0 +1,13 @@
from .base import BaseANNIndex, BaseBinaryIndex
from .binary import BinaryStore
from .factory import create_ann_index, create_binary_index
from .index import ANNIndex
__all__ = [
"BaseANNIndex",
"BaseBinaryIndex",
"ANNIndex",
"BinaryStore",
"create_ann_index",
"create_binary_index",
]

View File

@@ -0,0 +1,83 @@
from __future__ import annotations
from abc import ABC, abstractmethod
import numpy as np
class BaseANNIndex(ABC):
    """Abstract base class for approximate nearest neighbor indexes.

    Defines the float-vector fine-search contract shared by the hnswlib
    and FAISS implementations.
    """
    @abstractmethod
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors with corresponding IDs.

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """
    @abstractmethod
    def fine_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search for nearest neighbors.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; None means the implementation's
                configured default.

        Returns:
            (ids, distances) as numpy arrays
        """
    @abstractmethod
    def save(self) -> None:
        """Persist index to disk."""
    @abstractmethod
    def load(self) -> None:
        """Load index from disk."""
    @abstractmethod
    def __len__(self) -> int:
        """Return the number of indexed items."""
class BaseBinaryIndex(ABC):
    """Abstract base class for binary vector indexes (Hamming distance).

    Implementations binary-quantize incoming float vectors and serve a
    cheap coarse search stage ahead of the ANN fine search.
    """
    @abstractmethod
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors (will be binary-quantized internally).

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """
    @abstractmethod
    def coarse_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search by Hamming distance.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; None means the implementation's
                configured default.

        Returns:
            (ids, distances) sorted ascending by distance
        """
    @abstractmethod
    def save(self) -> None:
        """Persist store to disk."""
    @abstractmethod
    def load(self) -> None:
        """Load store from disk."""
    @abstractmethod
    def __len__(self) -> int:
        """Return the number of stored items."""

View File

@@ -0,0 +1,173 @@
from __future__ import annotations
import logging
import math
from pathlib import Path
import numpy as np
from codexlens.config import Config
from codexlens.core.base import BaseBinaryIndex
logger = logging.getLogger(__name__)
class BinaryStore(BaseBinaryIndex):
    """Persistent binary vector store using numpy memmap.

    Stores binary-quantized float32 vectors as packed uint8 arrays on disk.
    Supports fast coarse search via XOR + popcount Hamming distance.

    NOTE(review): unlike the FAISS-backed indexes this class takes no lock;
    it appears to assume single-threaded access -- confirm before sharing
    an instance across threads.
    """
    def __init__(self, path: str | Path, dim: int, config: Config) -> None:
        """Open a store under directory *path* for *dim*-dimensional vectors."""
        self._dir = Path(path)
        self._dim = dim
        self._config = config
        # Row width on disk; np.packbits zero-pads the last byte when
        # dim % 8 != 0, and queries are padded identically, so Hamming
        # distances remain consistent.
        self._packed_bytes = math.ceil(dim / 8)
        self._bin_path = self._dir / "binary_store.bin"
        self._ids_path = self._dir / "binary_store_ids.npy"
        # Pre-allocated (possibly oversized) buffers; only [:_count] is valid.
        self._matrix: np.ndarray | None = None # shape (N, packed_bytes), uint8
        self._ids: np.ndarray | None = None # shape (N,), int64
        self._count: int = 0
        # Restore a previously persisted store if both files are present.
        if self._bin_path.exists() and self._ids_path.exists():
            self.load()
    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _quantize(self, vectors: np.ndarray) -> np.ndarray:
        """Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes)."""
        # Sign-bit quantization: strictly positive components map to 1-bits.
        binary = (vectors > 0).astype(np.uint8)
        packed = np.packbits(binary, axis=1)
        return packed
    def _quantize_single(self, vec: np.ndarray) -> np.ndarray:
        """Convert a single float32 vector (dim,) to packed uint8 (packed_bytes,)."""
        binary = (vec > 0).astype(np.uint8)
        return np.packbits(binary)
    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def _ensure_capacity(self, needed: int) -> None:
        """Grow pre-allocated matrix/ids arrays to fit *needed* total items."""
        if self._matrix is not None and self._matrix.shape[0] >= needed:
            return
        # Fresh store: allocate at least 1024 rows up front.
        new_cap = max(1024, needed)
        # Double until large enough
        if self._matrix is not None:
            cur_cap = self._matrix.shape[0]
            new_cap = max(cur_cap, 1024)
            while new_cap < needed:
                new_cap *= 2
        new_matrix = np.zeros((new_cap, self._packed_bytes), dtype=np.uint8)
        new_ids = np.zeros(new_cap, dtype=np.int64)
        # Copy the occupied prefix into the larger buffers.
        if self._matrix is not None and self._count > 0:
            new_matrix[: self._count] = self._matrix[: self._count]
            new_ids[: self._count] = self._ids[: self._count]
        self._matrix = new_matrix
        self._ids = new_ids
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors and their ids.

        Does NOT call save() internally -- callers must call save()
        explicitly after batch indexing.

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """
        if len(ids) == 0:
            return
        packed = self._quantize(vectors) # (N, packed_bytes)
        n = len(ids)
        self._ensure_capacity(self._count + n)
        # Append into the pre-allocated region, then advance the watermark.
        self._matrix[self._count : self._count + n] = packed
        self._ids[self._count : self._count + n] = ids.astype(np.int64)
        self._count += n
    def coarse_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search by Hamming distance.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; defaults to config.binary_top_k

        Returns:
            (ids, distances) sorted ascending by Hamming distance
        """
        if self._matrix is None or self._count == 0:
            return np.array([], dtype=np.int64), np.array([], dtype=np.int32)
        k = top_k if top_k is not None else self._config.binary_top_k
        k = min(k, self._count)
        query_bin = self._quantize_single(query_vec) # (packed_bytes,)
        # Slice to active region (matrix may be pre-allocated larger)
        active_matrix = self._matrix[: self._count]
        active_ids = self._ids[: self._count]
        # XOR then popcount via unpackbits
        xor = np.bitwise_xor(active_matrix, query_bin[np.newaxis, :]) # (N, packed_bytes)
        dists = np.unpackbits(xor, axis=1).sum(axis=1).astype(np.int32) # (N,)
        if k >= self._count:
            order = np.argsort(dists)
        else:
            # argpartition gets an unsorted top-k in O(N); sort only those k.
            part = np.argpartition(dists, k)[:k]
            order = part[np.argsort(dists[part])]
        return active_ids[order], dists[order]
    def save(self) -> None:
        """Flush binary store to disk.

        No-op on an empty store, so existing files are never truncated by
        an empty save.
        """
        if self._matrix is None or self._count == 0:
            return
        self._dir.mkdir(parents=True, exist_ok=True)
        # Write only the occupied portion of the pre-allocated matrix
        active_matrix = self._matrix[: self._count]
        mm = np.memmap(
            str(self._bin_path),
            dtype=np.uint8,
            mode="w+",
            shape=active_matrix.shape,
        )
        mm[:] = active_matrix
        mm.flush()
        del mm
        np.save(str(self._ids_path), self._ids[: self._count])
    def load(self) -> None:
        """Reload binary store from disk.

        The id file's length determines the expected row count of the
        binary matrix file.
        """
        ids = np.load(str(self._ids_path))
        n = len(ids)
        if n == 0:
            return
        mm = np.memmap(
            str(self._bin_path),
            dtype=np.uint8,
            mode="r",
            shape=(n, self._packed_bytes),
        )
        self._matrix = np.array(mm) # copy into RAM for mutation support
        del mm
        self._ids = ids.astype(np.int64)
        self._count = n
    def __len__(self) -> int:
        """Return the number of stored vectors."""
        return self._count

View File

@@ -0,0 +1,116 @@
from __future__ import annotations
import logging
from pathlib import Path
from codexlens.config import Config
from codexlens.core.base import BaseANNIndex, BaseBinaryIndex
logger = logging.getLogger(__name__)
try:
import faiss as _faiss # noqa: F401
_FAISS_AVAILABLE = True
except ImportError:
_FAISS_AVAILABLE = False
try:
import hnswlib as _hnswlib # noqa: F401
_HNSWLIB_AVAILABLE = True
except ImportError:
_HNSWLIB_AVAILABLE = False
def _has_faiss_gpu() -> bool:
    """Check whether faiss-gpu is available (has GPU resources)."""
    if not _FAISS_AVAILABLE:
        return False
    import faiss

    try:
        # Instantiating GPU resources is the reliable capability probe.
        faiss.StandardGpuResources()
    except (AttributeError, RuntimeError):
        return False
    return True
def create_ann_index(path: str | Path, dim: int, config: Config) -> BaseANNIndex:
    """Create an ANN index according to ``config.ann_backend``.

    'faiss' and 'hnswlib' force that backend; any other value (normally
    'auto') falls back through faiss -> hnswlib.

    Args:
        path: directory for index persistence
        dim: vector dimensionality
        config: project configuration

    Returns:
        A BaseANNIndex implementation

    Raises:
        ImportError: if no suitable backend is available
    """
    requested = config.ann_backend
    if requested == "faiss":
        from codexlens.core.faiss_index import FAISSANNIndex

        return FAISSANNIndex(path, dim, config)
    if requested == "hnswlib":
        from codexlens.core.index import ANNIndex

        return ANNIndex(path, dim, config)
    # Auto selection: prefer faiss when importable, else hnswlib.
    if _FAISS_AVAILABLE:
        from codexlens.core.faiss_index import FAISSANNIndex

        suffix = " (GPU available)" if _has_faiss_gpu() else " (CPU)"
        logger.info("Auto-selected FAISS ANN backend%s", suffix)
        return FAISSANNIndex(path, dim, config)
    if _HNSWLIB_AVAILABLE:
        from codexlens.core.index import ANNIndex

        logger.info("Auto-selected hnswlib ANN backend")
        return ANNIndex(path, dim, config)
    raise ImportError(
        "No ANN backend available. Install faiss-cpu, faiss-gpu, or hnswlib."
    )
def create_binary_index(
    path: str | Path, dim: int, config: Config
) -> BaseBinaryIndex:
    """Create a binary (Hamming) index according to ``config.binary_backend``.

    'faiss' forces the FAISS backend; 'hnswlib' selects the numpy
    BinaryStore (there is no hnswlib binary index); any other value
    (normally 'auto') tries faiss first, then the always-available
    numpy BinaryStore.

    Args:
        path: directory for index persistence
        dim: vector dimensionality
        config: project configuration

    Returns:
        A BaseBinaryIndex implementation

    Raises:
        ImportError: if no suitable backend is available
    """
    requested = config.binary_backend
    if requested == "faiss":
        from codexlens.core.faiss_index import FAISSBinaryIndex

        return FAISSBinaryIndex(path, dim, config)
    if requested == "hnswlib":
        from codexlens.core.binary import BinaryStore

        return BinaryStore(path, dim, config)
    if _FAISS_AVAILABLE:
        from codexlens.core.faiss_index import FAISSBinaryIndex

        logger.info("Auto-selected FAISS binary backend")
        return FAISSBinaryIndex(path, dim, config)
    # The numpy BinaryStore needs no optional dependencies.
    from codexlens.core.binary import BinaryStore

    logger.info("Auto-selected numpy BinaryStore backend")
    return BinaryStore(path, dim, config)

View File

@@ -0,0 +1,275 @@
from __future__ import annotations
import logging
import math
import threading
from pathlib import Path
import numpy as np
from codexlens.config import Config
from codexlens.core.base import BaseANNIndex, BaseBinaryIndex
logger = logging.getLogger(__name__)
try:
import faiss
_FAISS_AVAILABLE = True
except ImportError:
faiss = None # type: ignore[assignment]
_FAISS_AVAILABLE = False
def _try_gpu_index(index: "faiss.Index") -> "faiss.Index":
    """Transfer a FAISS index to GPU if faiss-gpu is available.

    Returns the GPU index on success, or the original CPU index on failure.
    """
    try:
        # Both calls can fail on CPU-only builds (AttributeError) or at
        # runtime without a usable device (RuntimeError).
        gpu_copy = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index)
    except (AttributeError, RuntimeError) as exc:
        logger.debug("GPU transfer unavailable, staying on CPU: %s", exc)
        return index
    logger.info("FAISS index transferred to GPU 0")
    return gpu_copy
def _to_cpu_for_save(index: "faiss.Index") -> "faiss.Index":
    """Convert a GPU index back to CPU for serialization."""
    try:
        cpu_copy = faiss.index_gpu_to_cpu(index)
    except (AttributeError, RuntimeError):
        # Already a CPU index, or this faiss build has no GPU support.
        return index
    return cpu_copy
class FAISSANNIndex(BaseANNIndex):
    """FAISS-based ANN index using IndexHNSWFlat with optional GPU.

    Uses Inner Product space with L2-normalized vectors for cosine similarity.
    Thread-safe via RLock.

    NOTE(review): IndexHNSWFlat assigns sequential labels and the ``ids``
    passed to :meth:`add` are ignored, so search results are insertion
    positions -- confirm callers map them back to their own ids.
    """
    def __init__(self, path: str | Path, dim: int, config: Config) -> None:
        """Prepare paths and lock; the index itself is lazily loaded on first use."""
        if not _FAISS_AVAILABLE:
            raise ImportError(
                "faiss is required. Install with: pip install faiss-cpu "
                "or pip install faiss-gpu"
            )
        self._path = Path(path)
        self._index_path = self._path / "faiss_ann.index"
        self._dim = dim
        self._config = config
        # Guards all index access; faiss objects are not safe for
        # concurrent mutation.
        self._lock = threading.RLock()
        # Lazily created/loaded via _ensure_loaded().
        self._index: faiss.Index | None = None
    def _ensure_loaded(self) -> None:
        """Load or initialize the index (caller holds lock)."""
        if self._index is not None:
            return
        self.load()
    def load(self) -> None:
        """Load index from disk or initialize a fresh one."""
        with self._lock:
            if self._index_path.exists():
                idx = faiss.read_index(str(self._index_path))
                logger.debug(
                    "Loaded FAISS ANN index from %s (%d items)",
                    self._index_path, idx.ntotal,
                )
            else:
                # HNSW with flat storage, M=32 by default
                m = self._config.hnsw_M
                idx = faiss.IndexHNSWFlat(self._dim, m, faiss.METRIC_INNER_PRODUCT)
                idx.hnsw.efConstruction = self._config.hnsw_ef_construction
                idx.hnsw.efSearch = self._config.hnsw_ef
                logger.debug(
                    "Initialized fresh FAISS HNSW index (dim=%d, M=%d)",
                    self._dim, m,
                )
            # Move to GPU when faiss-gpu is usable; otherwise stays on CPU.
            self._index = _try_gpu_index(idx)
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add L2-normalized float32 vectors.

        Vectors are normalized before insertion so that Inner Product
        distance equals cosine similarity.

        Args:
            ids: shape (N,) int64 -- currently unused by FAISS flat index
                but kept for API compatibility. FAISS uses sequential IDs.
            vectors: shape (N, dim) float32
        """
        if len(ids) == 0:
            return
        vecs = np.ascontiguousarray(vectors, dtype=np.float32)
        # Normalize for cosine similarity via Inner Product
        faiss.normalize_L2(vecs)
        with self._lock:
            self._ensure_loaded()
            self._index.add(vecs)
    def fine_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search for nearest neighbors.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; defaults to config.ann_top_k

        Returns:
            (ids, distances) as numpy arrays. For IP space, higher = more
            similar, but distances are returned as-is for consumer handling.
        """
        k = top_k if top_k is not None else self._config.ann_top_k
        with self._lock:
            self._ensure_loaded()
            count = self._index.ntotal
            if count == 0:
                return np.array([], dtype=np.int64), np.array([], dtype=np.float32)
            k = min(k, count)
            # Set efSearch for HNSW accuracy
            try:
                self._index.hnsw.efSearch = max(self._config.hnsw_ef, k)
            except AttributeError:
                pass # GPU index may not expose hnsw attribute directly
            q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
            # Queries must be normalized the same way as stored vectors.
            faiss.normalize_L2(q)
            distances, labels = self._index.search(q, k)
            # Single-query batch: take row 0, cast to documented dtypes.
            return labels[0].astype(np.int64), distances[0].astype(np.float32)
    def save(self) -> None:
        """Save index to disk."""
        with self._lock:
            if self._index is None:
                # Never loaded/added; nothing to persist.
                return
            self._path.mkdir(parents=True, exist_ok=True)
            # GPU indexes cannot be serialized directly; copy to CPU first.
            cpu_index = _to_cpu_for_save(self._index)
            faiss.write_index(cpu_index, str(self._index_path))
    def __len__(self) -> int:
        """Return the number of indexed vectors."""
        with self._lock:
            if self._index is None:
                return 0
            return self._index.ntotal
class FAISSBinaryIndex(BaseBinaryIndex):
    """FAISS-based binary index using IndexBinaryFlat for Hamming distance.

    Vectors are binary-quantized (sign bit) before insertion.
    Thread-safe via RLock.

    NOTE(review): faiss.IndexBinaryFlat takes its dimension in bits and
    expects it to be a multiple of 8, while ``_packed_bytes`` uses ceil --
    confirm callers always pass dims divisible by 8.
    """
    def __init__(self, path: str | Path, dim: int, config: Config) -> None:
        """Prepare paths and lock; the index itself is lazily loaded on first use."""
        if not _FAISS_AVAILABLE:
            raise ImportError(
                "faiss is required. Install with: pip install faiss-cpu "
                "or pip install faiss-gpu"
            )
        self._path = Path(path)
        self._index_path = self._path / "faiss_binary.index"
        self._dim = dim
        self._config = config
        # Bytes per packed vector row (dim bits, 8 per byte).
        self._packed_bytes = math.ceil(dim / 8)
        # Guards all faiss index access.
        self._lock = threading.RLock()
        # Lazily created/loaded via _ensure_loaded().
        self._index: faiss.IndexBinary | None = None
    def _ensure_loaded(self) -> None:
        # Load or initialize on first use (caller holds the lock).
        if self._index is not None:
            return
        self.load()
    def _quantize(self, vectors: np.ndarray) -> np.ndarray:
        """Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes)."""
        # Sign-bit quantization: strictly positive components become 1-bits.
        binary = (vectors > 0).astype(np.uint8)
        return np.packbits(binary, axis=1)
    def _quantize_single(self, vec: np.ndarray) -> np.ndarray:
        """Convert a single float32 vector (dim,) to packed uint8 (1, packed_bytes)."""
        binary = (vec > 0).astype(np.uint8)
        # Reshape to a 1-row batch; faiss search expects 2-D input.
        return np.packbits(binary).reshape(1, -1)
    def load(self) -> None:
        """Load binary index from disk or initialize a fresh one."""
        with self._lock:
            if self._index_path.exists():
                idx = faiss.read_index_binary(str(self._index_path))
                logger.debug(
                    "Loaded FAISS binary index from %s (%d items)",
                    self._index_path, idx.ntotal,
                )
            else:
                # IndexBinaryFlat takes dimension in bits
                idx = faiss.IndexBinaryFlat(self._dim)
                logger.debug(
                    "Initialized fresh FAISS binary index (dim_bits=%d)", self._dim,
                )
            self._index = idx
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors (binary-quantized internally).

        Args:
            ids: shape (N,) int64 -- kept for API compatibility; the flat
                index assigns sequential labels itself.
            vectors: shape (N, dim) float32
        """
        if len(ids) == 0:
            return
        packed = self._quantize(vectors)
        # faiss requires C-contiguous uint8 input.
        packed = np.ascontiguousarray(packed, dtype=np.uint8)
        with self._lock:
            self._ensure_loaded()
            self._index.add(packed)
    def coarse_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search by Hamming distance.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; defaults to config.binary_top_k

        Returns:
            (ids, distances) sorted ascending by Hamming distance
        """
        with self._lock:
            self._ensure_loaded()
            if self._index.ntotal == 0:
                return np.array([], dtype=np.int64), np.array([], dtype=np.int32)
            k = top_k if top_k is not None else self._config.binary_top_k
            # Never request more results than stored vectors.
            k = min(k, self._index.ntotal)
            q = self._quantize_single(query_vec)
            q = np.ascontiguousarray(q, dtype=np.uint8)
            distances, labels = self._index.search(q, k)
            # Single-query batch: take row 0, cast to documented dtypes.
            return labels[0].astype(np.int64), distances[0].astype(np.int32)
    def save(self) -> None:
        """Save binary index to disk."""
        with self._lock:
            if self._index is None:
                # Never loaded/added; nothing to persist.
                return
            self._path.mkdir(parents=True, exist_ok=True)
            faiss.write_index_binary(self._index, str(self._index_path))
    def __len__(self) -> int:
        """Return the number of stored vectors."""
        with self._lock:
            if self._index is None:
                return 0
            return self._index.ntotal

View File

@@ -0,0 +1,136 @@
from __future__ import annotations
import logging
import threading
from pathlib import Path
import numpy as np
from codexlens.config import Config
from codexlens.core.base import BaseANNIndex
logger = logging.getLogger(__name__)
try:
import hnswlib
_HNSWLIB_AVAILABLE = True
except ImportError:
_HNSWLIB_AVAILABLE = False
class ANNIndex(BaseANNIndex):
    """HNSW-based approximate nearest neighbor index.

    Backed by hnswlib with cosine space. Lazy-loads on first use,
    thread-safe via RLock. Unlike the FAISS backend, caller-supplied ids
    are stored and returned by searches.
    """
    def __init__(self, path: str | Path, dim: int, config: Config) -> None:
        """Prepare paths and lock; the index itself is lazily loaded on first use."""
        if not _HNSWLIB_AVAILABLE:
            raise ImportError("hnswlib is required. Install with: pip install hnswlib")
        self._path = Path(path)
        self._hnsw_path = self._path / "ann_index.hnsw"
        self._dim = dim
        self._config = config
        # Guards all index access; hnswlib indexes are not safe for
        # concurrent mutation.
        self._lock = threading.RLock()
        # Lazily created/loaded via _ensure_loaded().
        self._index: hnswlib.Index | None = None
    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _ensure_loaded(self) -> None:
        """Load or initialize the index (caller holds lock)."""
        if self._index is not None:
            return
        self.load()
    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def load(self) -> None:
        """Load index from disk or initialize a fresh one."""
        with self._lock:
            idx = hnswlib.Index(space="cosine", dim=self._dim)
            if self._hnsw_path.exists():
                # max_elements=0: presumably reuses the capacity stored in
                # the file; add() resizes later as needed -- confirm against
                # the hnswlib docs for the pinned version.
                idx.load_index(str(self._hnsw_path), max_elements=0)
                idx.set_ef(self._config.hnsw_ef)
                logger.debug("Loaded HNSW index from %s (%d items)", self._hnsw_path, idx.get_current_count())
            else:
                # Small initial capacity; add() grows it on demand.
                idx.init_index(
                    max_elements=1000,
                    ef_construction=self._config.hnsw_ef_construction,
                    M=self._config.hnsw_M,
                )
                idx.set_ef(self._config.hnsw_ef)
                logger.debug("Initialized fresh HNSW index (dim=%d)", self._dim)
            self._index = idx
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors.

        Does NOT call save() internally -- callers must call save()
        explicitly after batch indexing.

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """
        if len(ids) == 0:
            return
        vecs = np.ascontiguousarray(vectors, dtype=np.float32)
        with self._lock:
            self._ensure_loaded()
            # Expand capacity if needed
            current = self._index.get_current_count()
            max_el = self._index.get_max_elements()
            needed = current + len(ids)
            if needed > max_el:
                # Double (plus headroom) to amortize resize cost.
                new_cap = max(max_el * 2, needed + 100)
                self._index.resize_index(new_cap)
            self._index.add_items(vecs, ids.astype(np.int64))
    def fine_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search for nearest neighbors.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; defaults to config.ann_top_k

        Returns:
            (ids, distances) as numpy arrays
        """
        k = top_k if top_k is not None else self._config.ann_top_k
        with self._lock:
            self._ensure_loaded()
            count = self._index.get_current_count()
            if count == 0:
                return np.array([], dtype=np.int64), np.array([], dtype=np.float32)
            k = min(k, count)
            # ef must be >= k for hnswlib to return k results.
            self._index.set_ef(max(self._config.hnsw_ef, k))
            q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
            labels, distances = self._index.knn_query(q, k=k)
            # Single-query batch: take row 0, cast to documented dtypes.
            return labels[0].astype(np.int64), distances[0].astype(np.float32)
    def save(self) -> None:
        """Save index to disk (caller may or may not hold lock)."""
        with self._lock:
            if self._index is None:
                # Never loaded/added; nothing to persist.
                return
            self._path.mkdir(parents=True, exist_ok=True)
            self._index.save_index(str(self._hnsw_path))
    def __len__(self) -> int:
        """Return the number of indexed vectors."""
        with self._lock:
            if self._index is None:
                return 0
            return self._index.get_current_count()

View File

@@ -0,0 +1,4 @@
from .base import BaseEmbedder
from .local import FastEmbedEmbedder, EMBED_PROFILES
__all__ = ["BaseEmbedder", "FastEmbedEmbedder", "EMBED_PROFILES"]

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
from abc import ABC, abstractmethod
import numpy as np
class BaseEmbedder(ABC):
    """Abstract interface for text embedders used by the indexing and search pipelines."""
    @abstractmethod
    def embed_single(self, text: str) -> np.ndarray:
        """Embed a single text, returns float32 ndarray shape (dim,)."""
    @abstractmethod
    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a list of texts, returns list of float32 ndarrays."""

View File

@@ -0,0 +1,53 @@
from __future__ import annotations
import numpy as np
from ..config import Config
from .base import BaseEmbedder
# Named embedding profiles: size tier -> fastembed model id (output dims noted).
EMBED_PROFILES = {
    "small": "BAAI/bge-small-en-v1.5", # 384d
    "base": "BAAI/bge-base-en-v1.5", # 768d
    "large": "BAAI/bge-large-en-v1.5", # 1024d
    "code": "jinaai/jina-embeddings-v2-base-code", # 768d
}
class FastEmbedEmbedder(BaseEmbedder):
    """Embedder backed by fastembed.TextEmbedding with lazy model loading."""
    def __init__(self, config: Config) -> None:
        self._config = config
        self._model = None
    def _load(self) -> None:
        """Instantiate the fastembed model on first use; no-op afterwards."""
        if self._model is not None:
            return
        from fastembed import TextEmbedding

        providers = self._config.resolve_embed_providers()
        try:
            self._model = TextEmbedding(
                model_name=self._config.embed_model,
                providers=providers,
            )
        except TypeError:
            # Older fastembed versions may not accept providers kwarg
            self._model = TextEmbedding(model_name=self._config.embed_model)
    def embed_single(self, text: str) -> np.ndarray:
        """Embed a single text, returns float32 ndarray of shape (dim,)."""
        self._load()
        vectors = list(self._model.embed([text]))
        return vectors[0].astype(np.float32)
    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a list of texts in batches, returns list of float32 ndarrays."""
        self._load()
        step = self._config.embed_batch_size
        out: list[np.ndarray] = []
        for offset in range(0, len(texts), step):
            chunk = texts[offset : offset + step]
            out.extend(vec.astype(np.float32) for vec in self._model.embed(chunk))
        return out

View File

@@ -0,0 +1,5 @@
from __future__ import annotations
from .pipeline import IndexingPipeline, IndexStats
__all__ = ["IndexingPipeline", "IndexStats"]

View File

@@ -0,0 +1,277 @@
"""Three-stage parallel indexing pipeline: chunk -> embed -> index.
Uses threading.Thread with queue.Queue for producer-consumer handoff.
The GIL is acceptable because embedding (onnxruntime) releases it in C extensions.
"""
from __future__ import annotations
import logging
import queue
import threading
import time
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from codexlens.config import Config
from codexlens.core.binary import BinaryStore
from codexlens.core.index import ANNIndex
from codexlens.embed.base import BaseEmbedder
from codexlens.search.fts import FTSEngine
logger = logging.getLogger(__name__)
# Sentinel value to signal worker shutdown
_SENTINEL = None
# Defaults for chunking (can be overridden via index_files kwargs)
_DEFAULT_MAX_CHUNK_CHARS = 800
_DEFAULT_CHUNK_OVERLAP = 100
@dataclass
class IndexStats:
    """Statistics returned after indexing completes."""
    # Files that were read and produced at least one chunk.
    files_processed: int = 0
    # Total chunks pushed through the embed/index stages.
    chunks_created: int = 0
    # Wall-clock duration of the full pipeline run, in seconds.
    duration_seconds: float = 0.0
class IndexingPipeline:
"""Parallel 3-stage indexing pipeline with queue-based handoff.
Stage 1 (main thread): Read files, chunk text, push to embed_queue.
Stage 2 (embed worker): Pull text batches, call embed_batch(), push vectors to index_queue.
Stage 3 (index worker): Pull vectors+ids, call BinaryStore.add(), ANNIndex.add(), FTS.add_documents().
After all stages complete, save() is called on BinaryStore and ANNIndex exactly once.
"""
    def __init__(
        self,
        embedder: BaseEmbedder,
        binary_store: BinaryStore,
        ann_index: ANNIndex,
        fts: FTSEngine,
        config: Config,
    ) -> None:
        """Wire the pipeline stages to their backing components.

        Args:
            embedder: Produces vectors for chunk texts (stage 2).
            binary_store: Coarse Hamming index written in stage 3.
            ann_index: Fine ANN index written in stage 3.
            fts: Full-text search engine written in stage 3.
            config: Project configuration (batch sizes, worker counts).
        """
        self._embedder = embedder
        self._binary_store = binary_store
        self._ann_index = ann_index
        self._fts = fts
        self._config = config
def index_files(
self,
files: list[Path],
*,
root: Path | None = None,
max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
max_file_size: int = 50_000,
) -> IndexStats:
"""Run the 3-stage pipeline on the given files.
Args:
files: List of file paths to index.
root: Optional root for computing relative paths. If None, uses
each file's absolute path as its identifier.
max_chunk_chars: Maximum characters per chunk.
chunk_overlap: Character overlap between consecutive chunks.
max_file_size: Skip files larger than this (bytes).
Returns:
IndexStats with counts and timing.
"""
if not files:
return IndexStats()
t0 = time.monotonic()
embed_queue: queue.Queue = queue.Queue(maxsize=4)
index_queue: queue.Queue = queue.Queue(maxsize=4)
# Track errors from workers
worker_errors: list[Exception] = []
error_lock = threading.Lock()
def _record_error(exc: Exception) -> None:
with error_lock:
worker_errors.append(exc)
# --- Start workers ---
embed_thread = threading.Thread(
target=self._embed_worker,
args=(embed_queue, index_queue, _record_error),
daemon=True,
name="indexing-embed",
)
index_thread = threading.Thread(
target=self._index_worker,
args=(index_queue, _record_error),
daemon=True,
name="indexing-index",
)
embed_thread.start()
index_thread.start()
# --- Stage 1: chunk files (main thread) ---
chunk_id = 0
files_processed = 0
chunks_created = 0
for fpath in files:
try:
if fpath.stat().st_size > max_file_size:
continue
text = fpath.read_text(encoding="utf-8", errors="replace")
except Exception as exc:
logger.debug("Skipping %s: %s", fpath, exc)
continue
rel_path = str(fpath.relative_to(root)) if root else str(fpath)
file_chunks = self._chunk_text(text, rel_path, max_chunk_chars, chunk_overlap)
if not file_chunks:
continue
files_processed += 1
# Assign sequential IDs and push batch to embed queue
batch_ids = []
batch_texts = []
batch_paths = []
for chunk_text, path in file_chunks:
batch_ids.append(chunk_id)
batch_texts.append(chunk_text)
batch_paths.append(path)
chunk_id += 1
chunks_created += len(batch_ids)
embed_queue.put((batch_ids, batch_texts, batch_paths))
# Signal embed worker: no more data
embed_queue.put(_SENTINEL)
# Wait for workers to finish
embed_thread.join()
index_thread.join()
# --- Final flush ---
self._binary_store.save()
self._ann_index.save()
duration = time.monotonic() - t0
stats = IndexStats(
files_processed=files_processed,
chunks_created=chunks_created,
duration_seconds=round(duration, 2),
)
logger.info(
"Indexing complete: %d files, %d chunks in %.1fs",
stats.files_processed,
stats.chunks_created,
stats.duration_seconds,
)
# Raise first worker error if any occurred
if worker_errors:
raise worker_errors[0]
return stats
# ------------------------------------------------------------------
# Workers
# ------------------------------------------------------------------
def _embed_worker(
self,
in_q: queue.Queue,
out_q: queue.Queue,
on_error: callable,
) -> None:
"""Stage 2: Pull chunk batches, embed, push (ids, vecs, docs) to index queue."""
try:
while True:
item = in_q.get()
if item is _SENTINEL:
break
batch_ids, batch_texts, batch_paths = item
try:
vecs = self._embedder.embed_batch(batch_texts)
vec_array = np.array(vecs, dtype=np.float32)
id_array = np.array(batch_ids, dtype=np.int64)
out_q.put((id_array, vec_array, batch_texts, batch_paths))
except Exception as exc:
logger.error("Embed worker error: %s", exc)
on_error(exc)
finally:
# Signal index worker: no more data
out_q.put(_SENTINEL)
def _index_worker(
self,
in_q: queue.Queue,
on_error: callable,
) -> None:
"""Stage 3: Pull (ids, vecs, texts, paths), write to stores."""
while True:
item = in_q.get()
if item is _SENTINEL:
break
id_array, vec_array, texts, paths = item
try:
self._binary_store.add(id_array, vec_array)
self._ann_index.add(id_array, vec_array)
fts_docs = [
(int(id_array[i]), paths[i], texts[i])
for i in range(len(id_array))
]
self._fts.add_documents(fts_docs)
except Exception as exc:
logger.error("Index worker error: %s", exc)
on_error(exc)
# ------------------------------------------------------------------
# Chunking
# ------------------------------------------------------------------
@staticmethod
def _chunk_text(
text: str,
path: str,
max_chars: int,
overlap: int,
) -> list[tuple[str, str]]:
"""Split file text into overlapping chunks.
Returns list of (chunk_text, path) tuples.
"""
if not text.strip():
return []
chunks: list[tuple[str, str]] = []
lines = text.splitlines(keepends=True)
current: list[str] = []
current_len = 0
for line in lines:
if current_len + len(line) > max_chars and current:
chunk = "".join(current)
chunks.append((chunk, path))
# overlap: keep last N characters
tail = "".join(current)[-overlap:]
current = [tail] if tail else []
current_len = len(tail)
current.append(line)
current_len += len(line)
if current:
chunks.append(("".join(current), path))
return chunks

View File

@@ -0,0 +1,5 @@
from .base import BaseReranker
from .local import FastEmbedReranker
from .api import APIReranker
__all__ = ["BaseReranker", "FastEmbedReranker", "APIReranker"]

View File

@@ -0,0 +1,103 @@
from __future__ import annotations
import logging
import time
import httpx
from codexlens.config import Config
from .base import BaseReranker
logger = logging.getLogger(__name__)
class APIReranker(BaseReranker):
    """Reranker backed by a remote HTTP API (SiliconFlow/Cohere/Jina format).

    Documents are split into batches under a rough token budget, each batch
    is scored via the remote ``/rerank`` endpoint (with retry on transient
    failures), and scores are reassembled in the original document order.
    """

    def __init__(self, config: Config) -> None:
        self._config = config
        self._client = httpx.Client(
            headers={
                "Authorization": f"Bearer {config.reranker_api_key}",
                "Content-Type": "application/json",
            },
        )

    def close(self) -> None:
        """Release the underlying HTTP connection pool."""
        self._client.close()

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score every document against *query*; scores come back in input order."""
        if not documents:
            return []
        max_tokens = self._config.reranker_api_max_tokens_per_batch
        batches = self._split_batches(documents, max_tokens)
        scores = [0.0] * len(documents)
        for batch in batches:
            batch_scores = self._call_api_with_retry(query, batch)
            for orig_idx, score in batch_scores.items():
                scores[orig_idx] = score
        return scores

    def _split_batches(
        self, documents: list[str], max_tokens: int
    ) -> list[list[tuple[int, str]]]:
        """Group (original_index, text) pairs into batches under *max_tokens*.

        Token counts use the rough chars/4 heuristic; a single oversized
        document still forms its own batch.
        """
        batches: list[list[tuple[int, str]]] = []
        current_batch: list[tuple[int, str]] = []
        current_tokens = 0
        for idx, text in enumerate(documents):
            doc_tokens = len(text) // 4
            if current_tokens + doc_tokens > max_tokens and current_batch:
                batches.append(current_batch)
                current_batch = []
                current_tokens = 0
            current_batch.append((idx, text))
            current_tokens += doc_tokens
        if current_batch:
            batches.append(current_batch)
        return batches

    def _call_api_with_retry(
        self,
        query: str,
        docs: list[tuple[int, str]],
        max_retries: int = 3,
    ) -> dict[int, float]:
        """POST one batch to the rerank endpoint, retrying transient failures.

        Retries connection errors and HTTP 429/503 with exponential backoff;
        any other HTTP error is raised immediately via raise_for_status().

        Returns:
            Mapping of original document index -> relevance score.

        Raises:
            RuntimeError: when all retry attempts are exhausted.
        """
        url = self._config.reranker_api_url.rstrip("/") + "/rerank"
        payload = {
            "model": self._config.reranker_api_model,
            "query": query,
            "documents": [t for _, t in docs],
        }
        last_exc: Exception | None = None
        for attempt in range(max_retries):
            try:
                response = self._client.post(url, json=payload)
            except Exception as exc:
                last_exc = exc
                time.sleep((2 ** attempt) * 0.5)
                continue
            if response.status_code in (429, 503):
                # FIX: record the failure so the final RuntimeError message is
                # informative even when only HTTP errors occurred (previously
                # it reported "Last error: None").
                last_exc = RuntimeError(f"HTTP {response.status_code} from {url}")
                logger.warning(
                    "API reranker returned HTTP %s (attempt %d/%d), retrying...",
                    response.status_code,
                    attempt + 1,
                    max_retries,
                )
                time.sleep((2 ** attempt) * 0.5)
                continue
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            scores: dict[int, float] = {}
            for item in results:
                # The API returns indexes local to this batch; map them back
                # to the caller's original document indexes.
                local_idx = int(item["index"])
                orig_idx = docs[local_idx][0]
                scores[orig_idx] = float(item["relevance_score"])
            return scores
        raise RuntimeError(
            f"API reranker failed after {max_retries} attempts. Last error: {last_exc}"
        )

View File

@@ -0,0 +1,8 @@
from __future__ import annotations
from abc import ABC, abstractmethod
class BaseReranker(ABC):
    """Interface for components that score query/document relevance."""

    @abstractmethod
    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Return one relevance score per entry of ``documents`` for ``query``."""
        ...

View File

@@ -0,0 +1,25 @@
from __future__ import annotations
from codexlens.config import Config
from .base import BaseReranker
class FastEmbedReranker(BaseReranker):
    """Local reranker backed by fastembed's TextCrossEncoder (lazily loaded)."""

    def __init__(self, config: Config) -> None:
        self._config = config
        self._model = None

    def _load(self) -> None:
        # Import lazily so fastembed is only required when reranking is used.
        if self._model is not None:
            return
        from fastembed.rerank.cross_encoder import TextCrossEncoder
        self._model = TextCrossEncoder(model_name=self._config.reranker_model)

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        self._load()
        # rerank() may yield results out of order; place each score back at
        # its original document position.  (Assumes each result exposes
        # .index and .score -- TODO confirm against the installed fastembed
        # version; the unit tests mock exactly this interface.)
        scores = [0.0] * len(documents)
        for result in self._model.rerank(query, documents):
            scores[result.index] = float(result.score)
        return scores

View File

@@ -0,0 +1,8 @@
from .fts import FTSEngine
from .fusion import reciprocal_rank_fusion, detect_query_intent, QueryIntent, DEFAULT_WEIGHTS
from .pipeline import SearchPipeline, SearchResult
__all__ = [
"FTSEngine", "reciprocal_rank_fusion", "detect_query_intent",
"QueryIntent", "DEFAULT_WEIGHTS", "SearchPipeline", "SearchResult",
]

View File

@@ -0,0 +1,69 @@
from __future__ import annotations
import sqlite3
from pathlib import Path
class FTSEngine:
    """Full-text search over document chunks using SQLite FTS5.

    Chunk content lives in an FTS5 virtual table keyed by rowid == chunk id;
    the companion ``docs_meta`` table maps each id to its source path.
    """

    def __init__(self, db_path: str | Path) -> None:
        # check_same_thread=False: callers query from worker threads.
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.execute(
            "CREATE VIRTUAL TABLE IF NOT EXISTS docs "
            "USING fts5(content, tokenize='porter unicode61')"
        )
        self._conn.execute(
            "CREATE TABLE IF NOT EXISTS docs_meta "
            "(id INTEGER PRIMARY KEY, path TEXT)"
        )
        self._conn.commit()

    def close(self) -> None:
        """Close the underlying SQLite connection (fixes: no way to release it)."""
        self._conn.close()

    def add_documents(self, docs: list[tuple[int, str, str]]) -> None:
        """Add documents in batch. docs: list of (id, path, content)."""
        if not docs:
            return
        self._conn.executemany(
            "INSERT OR REPLACE INTO docs_meta (id, path) VALUES (?, ?)",
            [(doc_id, path) for doc_id, path, content in docs],
        )
        self._conn.executemany(
            "INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
            [(doc_id, content) for doc_id, path, content in docs],
        )
        self._conn.commit()

    def exact_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
        """FTS5 MATCH query; returns (id, score) sorted best-first.

        SQLite's bm25() yields negative values (lower = better); scores are
        negated so higher means a better match.
        """
        return self._match(query, top_k)

    def fuzzy_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
        """Prefix search: append '*' to each token; returns (id, score) best-first."""
        tokens = query.strip().split()
        if not tokens:
            return []
        return self._match(" ".join(t + "*" for t in tokens), top_k)

    def _match(self, match_expr: str, top_k: int) -> list[tuple[int, float]]:
        """Shared MATCH helper; malformed FTS5 syntax yields an empty list."""
        try:
            rows = self._conn.execute(
                "SELECT rowid, bm25(docs) AS score FROM docs "
                "WHERE docs MATCH ? ORDER BY score LIMIT ?",
                (match_expr, top_k),
            ).fetchall()
        except sqlite3.OperationalError:
            return []
        # Negate bm25 so that higher is better for callers.
        return [(int(row[0]), -float(row[1])) for row in rows]

    def get_content(self, doc_id: int) -> str:
        """Retrieve stored content for a doc_id ('' if unknown)."""
        row = self._conn.execute(
            "SELECT content FROM docs WHERE rowid = ?", (doc_id,)
        ).fetchone()
        return row[0] if row else ""

View File

@@ -0,0 +1,106 @@
from __future__ import annotations
import re
from enum import Enum
# Relative weight of each retrieval source used during RRF fusion.
DEFAULT_WEIGHTS: dict[str, float] = {
    "exact": 0.25,
    "fuzzy": 0.10,
    "vector": 0.50,
    "graph": 0.15,
}
# Heuristic patterns consulted by detect_query_intent:
_CODE_CAMEL_RE = re.compile(r"[a-z][A-Z]")  # camelCase boundary
_CODE_SNAKE_RE = re.compile(r"\b[a-z_]+_[a-z_]+\b")  # snake_case identifier
_CODE_SYMBOLS_RE = re.compile(r"[.\[\](){}]|->|::")  # code-like punctuation
_CODE_KEYWORDS_RE = re.compile(r"\b(import|def|class|return|from|async|await|lambda|yield)\b")  # language keywords
_QUESTION_WORDS_RE = re.compile(r"\b(how|what|why|when|where|which|who|does|do|is|are|can|should)\b", re.IGNORECASE)  # NL question words
class QueryIntent(Enum):
    """Coarse classification of a search query for adaptive fusion weights."""

    CODE_SYMBOL = "code_symbol"
    NATURAL_LANGUAGE = "natural"
    MIXED = "mixed"
def detect_query_intent(query: str) -> QueryIntent:
    """Detect whether query is a code symbol, natural language, or mixed.

    Accumulates heuristic "code" and "natural language" signal counts, then
    classifies: strong one-sided evidence wins outright, otherwise the larger
    count wins, and a tie yields MIXED.
    """
    words = query.strip().split()
    word_count = len(words)
    code_signals = 0
    natural_signals = 0
    if _CODE_CAMEL_RE.search(query):
        code_signals += 2
    if _CODE_SNAKE_RE.search(query):
        code_signals += 2
    if _CODE_SYMBOLS_RE.search(query):
        code_signals += 2
    if _CODE_KEYWORDS_RE.search(query):
        code_signals += 2
    if "`" in query:
        code_signals += 1
    # Very short queries lean toward symbols; question words / length lean NL.
    if word_count < 4:
        code_signals += 1
    if _QUESTION_WORDS_RE.search(query):
        natural_signals += 2
    if word_count > 5:
        natural_signals += 2
    if code_signals == 0 and word_count >= 3:
        natural_signals += 1
    # Strong one-sided evidence wins outright.
    if code_signals >= 2 and natural_signals == 0:
        return QueryIntent.CODE_SYMBOL
    if natural_signals >= 2 and code_signals == 0:
        return QueryIntent.NATURAL_LANGUAGE
    # FIX: removed an exact duplicate of the first check that sat here -- it
    # was unreachable dead code.
    if natural_signals > code_signals:
        return QueryIntent.NATURAL_LANGUAGE
    if code_signals > natural_signals:
        return QueryIntent.CODE_SYMBOL
    return QueryIntent.MIXED
def get_adaptive_weights(intent: QueryIntent, base: dict | None = None) -> dict[str, float]:
    """Return a weight map tuned for the detected query intent.

    Starts from a copy of *base* (or DEFAULT_WEIGHTS when *base* is falsy)
    and shifts weight between the exact and vector sources; MIXED intent
    leaves the weights untouched.
    """
    adapted = dict(base or DEFAULT_WEIGHTS)
    if intent is QueryIntent.CODE_SYMBOL:
        # Symbol-like queries: favour exact keyword matching.
        adapted.update(exact=0.45, vector=0.35)
    elif intent is QueryIntent.NATURAL_LANGUAGE:
        # Question-like queries: favour semantic similarity.
        adapted.update(vector=0.65, exact=0.15)
    return adapted
def reciprocal_rank_fusion(
    results: dict[str, list[tuple[int, float]]],
    weights: dict[str, float] | None = None,
    k: int = 60,
) -> list[tuple[int, float]]:
    """Fuse ranked result lists using Reciprocal Rank Fusion.

    Args:
        results: Mapping of source name to (doc_id, score) pairs, each list
            sorted descending by score.
        weights: Weight per source; defaults to equal weight across sources.
        k: RRF smoothing constant (default 60).

    Returns:
        List of (doc_id, fused_score) pairs sorted descending by fused score.
    """
    if not results:
        return []
    if weights is None:
        share = 1.0 / len(results)
        weights = {name: share for name in results}
    fused: dict[int, float] = {}
    for name, ranked in results.items():
        source_weight = weights.get(name, 0.0)
        for position, (doc_id, _score) in enumerate(ranked, start=1):
            fused[doc_id] = fused.get(doc_id, 0.0) + source_weight * (1.0 / (k + position))
    return sorted(fused.items(), key=lambda item: item[1], reverse=True)

View File

@@ -0,0 +1,163 @@
from __future__ import annotations
import logging
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
import numpy as np
from ..config import Config
from ..core import ANNIndex, BinaryStore
from ..embed import BaseEmbedder
from ..rerank import BaseReranker
from .fts import FTSEngine
from .fusion import (
DEFAULT_WEIGHTS,
detect_query_intent,
get_adaptive_weights,
reciprocal_rank_fusion,
)
_log = logging.getLogger(__name__)
@dataclass
class SearchResult:
    """One ranked hit returned by SearchPipeline.search."""
    # Chunk id shared across the binary store, ANN index and FTS tables.
    id: int
    # Source path recorded at indexing time ("" when metadata is missing).
    path: str
    # Final ranking score for this hit.
    score: float
    # Short content preview (may be empty).
    snippet: str = ""
class SearchPipeline:
    """Hybrid search: vector funnel + full-text search + RRF fusion + rerank.

    Retrieval stages degrade gracefully: a failing stage logs a warning and
    contributes nothing instead of aborting the whole query.
    """

    # FIX: named constant instead of a magic 50 inline.  Number of fused
    # candidates forwarded to the (comparatively slow) reranker.
    _RERANK_CANDIDATES = 50

    def __init__(
        self,
        embedder: BaseEmbedder,
        binary_store: BinaryStore,
        ann_index: ANNIndex,
        reranker: BaseReranker,
        fts: FTSEngine,
        config: Config,
    ) -> None:
        self._embedder = embedder
        self._binary_store = binary_store
        self._ann_index = ann_index
        self._reranker = reranker
        self._fts = fts
        self._config = config

    # -- Helper: vector search (binary coarse + ANN fine) -----------------
    def _vector_search(
        self, query_vec: np.ndarray
    ) -> list[tuple[int, float]]:
        """Run binary coarse search then ANN fine search and intersect."""
        cfg = self._config
        # Stage A: cheap coarse candidates from the binary store.
        candidate_ids_list, _ = self._binary_store.coarse_search(
            query_vec, top_k=cfg.binary_top_k
        )
        candidate_ids = set(candidate_ids_list)
        # Stage B: precise ANN search over the full index.
        ann_ids, ann_scores = self._ann_index.fine_search(
            query_vec, top_k=cfg.ann_top_k
        )
        # Keep only results confirmed by both stages (2-stage funnel).
        vector_results: list[tuple[int, float]] = [
            (int(doc_id), float(score))
            for doc_id, score in zip(ann_ids, ann_scores)
            if int(doc_id) in candidate_ids
        ]
        if vector_results:
            return vector_results
        # Empty intersection: fall back to the plain ANN ranking.
        return [
            (int(doc_id), float(score))
            for doc_id, score in zip(ann_ids, ann_scores)
        ]

    # -- Helper: FTS search (exact + fuzzy) ------------------------------
    def _fts_search(
        self, query: str
    ) -> tuple[list[tuple[int, float]], list[tuple[int, float]]]:
        """Run exact and fuzzy full-text search."""
        cfg = self._config
        return (
            self._fts.exact_search(query, top_k=cfg.fts_top_k),
            self._fts.fuzzy_search(query, top_k=cfg.fts_top_k),
        )

    # -- Main search entry point -----------------------------------------
    def search(self, query: str, top_k: int | None = None) -> list[SearchResult]:
        """Search the index for *query* and return reranked results.

        Args:
            query: Free-form query string (code symbol or natural language).
            top_k: Maximum results to return; defaults to cfg.reranker_top_k.
        """
        cfg = self._config
        final_top_k = top_k if top_k is not None else cfg.reranker_top_k
        # 1. Detect intent -> adaptive fusion weights.
        intent = detect_query_intent(query)
        weights = get_adaptive_weights(intent, cfg.fusion_weights)
        # 2. Embed the query once; shared by both vector stages.
        query_vec = self._embedder.embed([query])[0]
        # 3. Vector and FTS searches run concurrently; each may fail
        #    independently without killing the query.
        vector_results: list[tuple[int, float]] = []
        exact_results: list[tuple[int, float]] = []
        fuzzy_results: list[tuple[int, float]] = []
        with ThreadPoolExecutor(max_workers=2) as pool:
            vec_future = pool.submit(self._vector_search, query_vec)
            fts_future = pool.submit(self._fts_search, query)
            try:
                vector_results = vec_future.result()
            except Exception:
                _log.warning("Vector search failed, using empty results", exc_info=True)
            try:
                exact_results, fuzzy_results = fts_future.result()
            except Exception:
                _log.warning("FTS search failed, using empty results", exc_info=True)
        # 4. RRF fusion over whichever sources produced results.
        fusion_input: dict[str, list[tuple[int, float]]] = {}
        if vector_results:
            fusion_input["vector"] = vector_results
        if exact_results:
            fusion_input["exact"] = exact_results
        if fuzzy_results:
            fusion_input["fuzzy"] = fuzzy_results
        if not fusion_input:
            return []
        fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k)
        # 5. Rerank the top fused candidates.  FIX: a reranker failure now
        #    falls back to the fusion scores instead of aborting -- consistent
        #    with the graceful degradation of the stages above.
        rerank_ids = [doc_id for doc_id, _ in fused[: self._RERANK_CANDIDATES]]
        contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids]
        try:
            rerank_scores = self._reranker.score_pairs(query, contents)
        except Exception:
            _log.warning("Reranker failed, falling back to fusion scores", exc_info=True)
            fused_by_id = dict(fused)
            rerank_scores = [fused_by_id[doc_id] for doc_id in rerank_ids]
        # 6. Sort by rerank score and build the result objects.
        ranked = sorted(
            zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True
        )
        results: list[SearchResult] = []
        for doc_id, score in ranked[:final_top_k]:
            # NOTE(review): reaches into FTSEngine's private connection for
            # the path lookup; consider a public FTSEngine accessor instead.
            path = self._fts._conn.execute(
                "SELECT path FROM docs_meta WHERE id = ?", (doc_id,)
            ).fetchone()
            results.append(
                SearchResult(
                    id=doc_id,
                    path=path[0] if path else "",
                    score=float(score),
                    snippet=self._fts.get_content(doc_id)[:200],
                )
            )
        return results

View File

View File

@@ -0,0 +1,108 @@
import pytest
import numpy as np
import tempfile
from pathlib import Path
from codexlens.config import Config
from codexlens.core import ANNIndex, BinaryStore
from codexlens.embed.base import BaseEmbedder
from codexlens.rerank.base import BaseReranker
from codexlens.search.fts import FTSEngine
from codexlens.search.pipeline import SearchPipeline
# Test documents: 20 code snippets with id, path, content
TEST_DOCS = [
(0, "auth.py", "def authenticate(user, password): return check_hash(password, user.hash)"),
(1, "auth.py", "def authorize(user, permission): return permission in user.roles"),
(2, "models.py", "class User: def __init__(self, name, email): self.name = name; self.email = email"),
(3, "models.py", "class Session: token = None; expires_at = None"),
(4, "middleware.py", "def auth_middleware(request): token = request.headers.get('Authorization')"),
(5, "utils.py", "def hash_password(password): import bcrypt; return bcrypt.hashpw(password)"),
(6, "config.py", "DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///db.sqlite3')"),
(7, "search.py", "def search_users(query): return User.objects.filter(name__icontains=query)"),
(8, "api.py", "def get_user(request, user_id): user = User.objects.get(id=user_id)"),
(9, "api.py", "def create_user(request): data = request.json(); user = User(**data)"),
(10, "tests.py", "def test_authenticate(): assert authenticate('admin', 'pass') is not None"),
(11, "tests.py", "def test_search(): results = search_users('alice'); assert len(results) > 0"),
(12, "router.py", "app.route('/users', methods=['GET'])(list_users)"),
(13, "router.py", "app.route('/login', methods=['POST'])(login_handler)"),
(14, "db.py", "def get_connection(): return sqlite3.connect(DATABASE_URL)"),
(15, "cache.py", "def cache_get(key): return redis_client.get(key)"),
(16, "cache.py", "def cache_set(key, value, ttl=3600): redis_client.setex(key, ttl, value)"),
(17, "errors.py", "class AuthError(Exception): status_code = 401"),
(18, "errors.py", "class NotFoundError(Exception): status_code = 404"),
(19, "validators.py", "def validate_email(email): return '@' in email and '.' in email.split('@')[1]"),
]
DIM = 32  # Use small dim for fast tests

def make_stable_vec(doc_id: int, dim: int = DIM) -> np.ndarray:
    """Generate a deterministic unit-norm float32 vector for a given doc_id."""
    generator = np.random.default_rng(seed=doc_id)
    raw = generator.standard_normal(dim).astype(np.float32)
    raw /= np.linalg.norm(raw)
    return raw
class MockEmbedder(BaseEmbedder):
    """Deterministic embedder: unit vectors derived from a hash of the text."""

    def embed_single(self, text: str) -> np.ndarray:
        generator = np.random.default_rng(seed=hash(text) % (2**31))
        vec = generator.standard_normal(DIM).astype(np.float32)
        vec /= np.linalg.norm(vec)
        return vec

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        return [self.embed_single(item) for item in texts]

    def embed(self, texts: list[str]) -> list[np.ndarray]:
        """Called by SearchPipeline as self._embedder.embed([query])[0]."""
        return self.embed_batch(texts)
class MockReranker(BaseReranker):
    """Scores each document by the fraction of query words it contains."""

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        query_words = set(query.lower().split())
        denom = max(len(query_words), 1)
        return [
            len(query_words & set(doc.lower().split())) / denom
            for doc in documents
        ]
@pytest.fixture
def config():
    """Small Config profile to keep test indexes fast to build."""
    return Config.small()  # hnsw_ef=50, hnsw_M=16, binary_top_k=50, ann_top_k=20, rerank_top_k=10
@pytest.fixture
def search_pipeline(tmp_path, config):
    """Build a full SearchPipeline with 20 test docs indexed."""
    embedder = MockEmbedder()
    binary_store = BinaryStore(tmp_path / "binary", dim=DIM, config=config)
    ann_index = ANNIndex(tmp_path / "ann.hnsw", dim=DIM, config=config)
    fts = FTSEngine(tmp_path / "fts.db")
    reranker = MockReranker()
    # Index all test docs
    # Vectors are derived deterministically from each doc's content, so the
    # fixture is stable across test runs.
    ids = np.array([d[0] for d in TEST_DOCS], dtype=np.int64)
    vectors = np.array([embedder.embed_single(d[2]) for d in TEST_DOCS], dtype=np.float32)
    binary_store.add(ids, vectors)
    ann_index.add(ids, vectors)
    fts.add_documents(TEST_DOCS)
    return SearchPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        reranker=reranker,
        fts=fts,
        config=config,
    )

View File

@@ -0,0 +1,44 @@
"""Integration tests for SearchPipeline using real components and mock embedder/reranker."""
from __future__ import annotations
def test_vector_search_returns_results(search_pipeline):
    """A natural-language query yields at least one scored result."""
    results = search_pipeline.search("authentication middleware")
    assert len(results) > 0
    assert all(isinstance(r.score, float) for r in results)
def test_exact_keyword_search(search_pipeline):
    """An exact identifier query surfaces the documents containing it."""
    results = search_pipeline.search("authenticate")
    assert len(results) > 0
    result_ids = {r.id for r in results}
    # Doc 0 and 10 both contain "authenticate"
    assert result_ids & {0, 10}, f"Expected doc 0 or 10 in results, got {result_ids}"
def test_pipeline_top_k_limit(search_pipeline):
    """An explicit top_k caps the number of returned results."""
    results = search_pipeline.search("user", top_k=5)
    assert len(results) <= 5
def test_search_result_fields_populated(search_pipeline):
    """Every result carries a valid id, score and path."""
    results = search_pipeline.search("password")
    assert len(results) > 0
    for r in results:
        assert r.id >= 0
        assert r.score >= 0
        assert isinstance(r.path, str)
def test_empty_query_handled(search_pipeline):
    """An empty query must not raise; an empty list is acceptable."""
    results = search_pipeline.search("")
    assert isinstance(results, list)  # no exception
def test_different_queries_give_different_results(search_pipeline):
    """Distinct queries should not produce identical rankings."""
    r1 = search_pipeline.search("authenticate user")
    r2 = search_pipeline.search("cache redis")
    # Results should differ (different top IDs or scores), unless both are empty
    ids1 = [r.id for r in r1]
    ids2 = [r.id for r in r2]
    assert ids1 != ids2 or len(r1) == 0

View File

View File

@@ -0,0 +1,31 @@
from codexlens.config import Config
def test_config_instantiates_no_args():
    """Config must be constructible with zero arguments."""
    cfg = Config()
    assert cfg is not None
def test_defaults_hnsw_ef():
    """The defaults() profile pins hnsw_ef to 150."""
    cfg = Config.defaults()
    assert cfg.hnsw_ef == 150
def test_defaults_hnsw_M():
    """The defaults() profile pins hnsw_M to 32."""
    cfg = Config.defaults()
    assert cfg.hnsw_M == 32
def test_small_hnsw_ef():
    """The small() profile lowers hnsw_ef to 50 for fast tests."""
    cfg = Config.small()
    assert cfg.hnsw_ef == 50
def test_custom_instantiation():
    """Keyword overrides take effect on construction."""
    cfg = Config(hnsw_ef=100)
    assert cfg.hnsw_ef == 100
def test_fusion_weights_keys():
    """fusion_weights must cover exactly the four retrieval sources."""
    cfg = Config()
    assert set(cfg.fusion_weights.keys()) == {"exact", "fuzzy", "vector", "graph"}

View File

@@ -0,0 +1,136 @@
"""Unit tests for BinaryStore and ANNIndex (no fastembed required)."""
from __future__ import annotations
import concurrent.futures
import tempfile
from pathlib import Path
import numpy as np
import pytest
from codexlens.config import Config
from codexlens.core import ANNIndex, BinaryStore
DIM = 32
RNG = np.random.default_rng(42)
def make_vectors(n: int, dim: int = DIM) -> np.ndarray:
    """Return *n* random float32 row vectors drawn from the shared module RNG."""
    return RNG.standard_normal((n, dim)).astype(np.float32)
def make_ids(n: int, start: int = 0) -> np.ndarray:
    """Return *n* sequential int64 ids beginning at *start*."""
    return np.arange(start, start + n, dtype=np.int64)
# ---------------------------------------------------------------------------
# BinaryStore tests
# ---------------------------------------------------------------------------
class TestBinaryStore:
    """Behavioral tests for the Hamming-distance coarse-search store."""

    def test_binary_store_add_and_search(self, tmp_path: Path) -> None:
        """add() then coarse_search() returns top_k ids with distances."""
        cfg = Config.small()
        store = BinaryStore(tmp_path, DIM, cfg)
        vecs = make_vectors(10)
        ids = make_ids(10)
        store.add(ids, vecs)
        assert len(store) == 10
        top_k = 5
        ret_ids, ret_dists = store.coarse_search(vecs[0], top_k=top_k)
        assert ret_ids.shape == (top_k,)
        assert ret_dists.shape == (top_k,)
        # distances are non-negative integers
        assert (ret_dists >= 0).all()
    def test_binary_hamming_correctness(self, tmp_path: Path) -> None:
        """A stored vector must be its own nearest neighbour at distance 0."""
        cfg = Config.small()
        store = BinaryStore(tmp_path, DIM, cfg)
        vecs = make_vectors(20)
        ids = make_ids(20)
        store.add(ids, vecs)
        # Query with the exact stored vector; it must be the top-1 result
        query = vecs[7]
        ret_ids, ret_dists = store.coarse_search(query, top_k=1)
        assert ret_ids[0] == 7
        assert ret_dists[0] == 0  # Hamming distance to itself is 0
    def test_binary_store_persist(self, tmp_path: Path) -> None:
        """save() followed by a fresh load preserves contents and search."""
        cfg = Config.small()
        store = BinaryStore(tmp_path, DIM, cfg)
        vecs = make_vectors(15)
        ids = make_ids(15)
        store.add(ids, vecs)
        store.save()
        # Load into a fresh instance
        store2 = BinaryStore(tmp_path, DIM, cfg)
        assert len(store2) == 15
        query = vecs[3]
        ret_ids, ret_dists = store2.coarse_search(query, top_k=1)
        assert ret_ids[0] == 3
        assert ret_dists[0] == 0
# ---------------------------------------------------------------------------
# ANNIndex tests
# ---------------------------------------------------------------------------
class TestANNIndex:
    """Behavioral tests for the ANN fine-search index."""

    def test_ann_index_add_and_search(self, tmp_path: Path) -> None:
        """add() then fine_search() returns the requested number of hits."""
        cfg = Config.small()
        idx = ANNIndex(tmp_path, DIM, cfg)
        vecs = make_vectors(50)
        ids = make_ids(50)
        idx.add(ids, vecs)
        assert len(idx) == 50
        ret_ids, ret_dists = idx.fine_search(vecs[0], top_k=5)
        assert len(ret_ids) == 5
        assert len(ret_dists) == 5
    def test_ann_index_thread_safety(self, tmp_path: Path) -> None:
        """Concurrent fine_search() calls must not raise."""
        cfg = Config.small()
        idx = ANNIndex(tmp_path, DIM, cfg)
        vecs = make_vectors(50)
        ids = make_ids(50)
        idx.add(ids, vecs)
        query = vecs[0]
        errors: list[Exception] = []
        def search() -> None:
            try:
                idx.fine_search(query, top_k=3)
            except Exception as exc:
                errors.append(exc)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
            futures = [pool.submit(search) for _ in range(5)]
            concurrent.futures.wait(futures)
        assert errors == [], f"Thread safety errors: {errors}"
    def test_ann_index_save_load(self, tmp_path: Path) -> None:
        """save() then load() into a new instance preserves the index."""
        cfg = Config.small()
        idx = ANNIndex(tmp_path, DIM, cfg)
        vecs = make_vectors(30)
        ids = make_ids(30)
        idx.add(ids, vecs)
        idx.save()
        # Load into a fresh instance
        idx2 = ANNIndex(tmp_path, DIM, cfg)
        idx2.load()
        assert len(idx2) == 30
        ret_ids, ret_dists = idx2.fine_search(vecs[10], top_k=1)
        assert len(ret_ids) == 1
        assert ret_ids[0] == 10

View File

@@ -0,0 +1,80 @@
from __future__ import annotations
import sys
import types
import unittest
from unittest.mock import MagicMock, patch
import numpy as np
def _make_fastembed_mock():
"""Build a minimal fastembed stub so imports succeed without the real package."""
fastembed_mod = types.ModuleType("fastembed")
fastembed_mod.TextEmbedding = MagicMock()
sys.modules.setdefault("fastembed", fastembed_mod)
return fastembed_mod
_make_fastembed_mock()
from codexlens.config import Config # noqa: E402
from codexlens.embed.base import BaseEmbedder # noqa: E402
from codexlens.embed.local import EMBED_PROFILES, FastEmbedEmbedder # noqa: E402
class TestEmbedSingle(unittest.TestCase):
    """embed_single must normalize model output to a float32 ndarray."""
    def test_embed_single_returns_float32_ndarray(self):
        config = Config()
        embedder = FastEmbedEmbedder(config)
        mock_model = MagicMock()
        mock_model.embed.return_value = iter([np.ones(384, dtype=np.float64)])
        # Inject mock model directly to bypass lazy load (no real fastembed needed)
        embedder._model = mock_model
        result = embedder.embed_single("hello world")
        self.assertIsInstance(result, np.ndarray)
        self.assertEqual(result.dtype, np.float32)
        self.assertEqual(result.shape, (384,))
class TestEmbedBatch(unittest.TestCase):
    """embed_batch must return one float32 array per input text."""
    def test_embed_batch_returns_list(self):
        config = Config()
        embedder = FastEmbedEmbedder(config)
        vecs = [np.ones(384, dtype=np.float64) * i for i in range(3)]
        mock_model = MagicMock()
        mock_model.embed.return_value = iter(vecs)
        embedder._model = mock_model
        result = embedder.embed_batch(["a", "b", "c"])
        self.assertIsInstance(result, list)
        self.assertEqual(len(result), 3)
        for arr in result:
            self.assertIsInstance(arr, np.ndarray)
            self.assertEqual(arr.dtype, np.float32)
class TestEmbedProfiles(unittest.TestCase):
    """The EMBED_PROFILES registry must stay complete and well-formed."""
    def test_embed_profiles_all_have_valid_keys(self):
        expected_keys = {"small", "base", "large", "code"}
        self.assertEqual(set(EMBED_PROFILES.keys()), expected_keys)
    def test_embed_profiles_model_ids_non_empty(self):
        for key, model_id in EMBED_PROFILES.items():
            self.assertIsInstance(model_id, str, msg=f"{key} model id should be str")
            self.assertTrue(len(model_id) > 0, msg=f"{key} model id should be non-empty")
class TestBaseEmbedderAbstract(unittest.TestCase):
    """BaseEmbedder is abstract and must not be instantiable."""
    def test_base_embedder_is_abstract(self):
        with self.assertRaises(TypeError):
            BaseEmbedder()  # type: ignore[abstract]
if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,179 @@
from __future__ import annotations
import types
from unittest.mock import MagicMock, patch
import pytest
from codexlens.config import Config
from codexlens.rerank.base import BaseReranker
from codexlens.rerank.local import FastEmbedReranker
from codexlens.rerank.api import APIReranker
# ---------------------------------------------------------------------------
# BaseReranker
# ---------------------------------------------------------------------------
def test_base_reranker_is_abstract():
with pytest.raises(TypeError):
BaseReranker() # type: ignore[abstract]
# ---------------------------------------------------------------------------
# FastEmbedReranker
# ---------------------------------------------------------------------------
def _make_rerank_result(index: int, score: float) -> object:
obj = types.SimpleNamespace(index=index, score=score)
return obj
def test_local_reranker_score_pairs_length():
config = Config()
reranker = FastEmbedReranker(config)
mock_results = [
_make_rerank_result(0, 0.9),
_make_rerank_result(1, 0.5),
_make_rerank_result(2, 0.1),
]
mock_model = MagicMock()
mock_model.rerank.return_value = iter(mock_results)
reranker._model = mock_model
docs = ["doc0", "doc1", "doc2"]
scores = reranker.score_pairs("query", docs)
assert len(scores) == 3
def test_local_reranker_preserves_order():
    """Scores must align with input order even when rerank yields out of order."""
    reranker = FastEmbedReranker(Config())
    # The mocked model emits results for index 2 first, then 1, then 0.
    fake_model = MagicMock()
    fake_model.rerank.return_value = iter(
        [
            _make_rerank_result(2, 0.1),
            _make_rerank_result(1, 0.5),
            _make_rerank_result(0, 0.9),
        ]
    )
    reranker._model = fake_model
    scores = reranker.score_pairs("query", ["doc0", "doc1", "doc2"])
    for got, want in zip(scores, [0.9, 0.5, 0.1]):
        assert got == pytest.approx(want)
# ---------------------------------------------------------------------------
# APIReranker
# ---------------------------------------------------------------------------
def _make_config(max_tokens_per_batch: int = 512) -> Config:
    """Return a Config pointing at a dummy rerank API endpoint."""
    settings = {
        "reranker_api_url": "https://api.example.com",
        "reranker_api_key": "test-key",
        "reranker_api_model": "test-model",
        "reranker_api_max_tokens_per_batch": max_tokens_per_batch,
    }
    return Config(**settings)
def test_api_reranker_batch_splitting():
    """_split_batches must keep each batch within the token budget (singletons excepted)."""
    config = _make_config(max_tokens_per_batch=512)
    with patch("httpx.Client"):
        reranker = APIReranker(config)
    # Ten documents of ~200 tokens (800 chars) each cannot all fit in one
    # 512-token batch: 200+200=400 fits, 400+200=600 does not.
    documents = ["x" * 800 for _ in range(10)]
    batches = reranker._split_batches(documents, max_tokens=512)
    assert len(batches) > 1
    for batch in batches:
        batch_tokens = sum(len(text) // 4 for _, text in batch)
        assert batch_tokens <= 512 or len(batch) == 1
def test_api_reranker_retry_on_429():
    """A 429 response must be retried; a later success returns scores for all docs."""
    config = _make_config()

    throttled = MagicMock()
    throttled.status_code = 429

    ok = MagicMock()
    ok.status_code = 200
    ok.raise_for_status = MagicMock()
    ok.json.return_value = {
        "results": [
            {"index": 0, "relevance_score": 0.8},
            {"index": 1, "relevance_score": 0.3},
        ]
    }

    with patch("httpx.Client") as client_cls:
        client = MagicMock()
        client_cls.return_value = client
        # Two throttled responses, then a success on the third attempt.
        client.post.side_effect = [throttled, throttled, ok]
        reranker = APIReranker(config)
        with patch("time.sleep"):
            scores_by_index = reranker._call_api_with_retry(
                "query",
                [(0, "doc0"), (1, "doc1")],
                max_retries=3,
            )
        assert client.post.call_count == 3
        assert 0 in scores_by_index
        assert 1 in scores_by_index
def test_api_reranker_merge_batches():
    """Scores from multiple API batches must be merged back to original positions."""
    config = _make_config(max_tokens_per_batch=100)
    # Four docs of 50 tokens (200 chars) each: two fit per 100-token batch
    # (50+50=100 <= 100, +50 overflows), so score_pairs should issue two calls
    # and stitch the per-batch results back into one score list.
    documents = ["x" * 200] * 4

    def fake_response(result_rows):
        response = MagicMock()
        response.status_code = 200
        response.raise_for_status = MagicMock()
        response.json.return_value = {"results": result_rows}
        return response

    first = fake_response(
        [
            {"index": 0, "relevance_score": 0.9},
            {"index": 1, "relevance_score": 0.8},
        ]
    )
    second = fake_response(
        [
            {"index": 0, "relevance_score": 0.7},
            {"index": 1, "relevance_score": 0.6},
        ]
    )

    with patch("httpx.Client") as client_cls:
        client = MagicMock()
        client_cls.return_value = client
        client.post.side_effect = [first, second]
        reranker = APIReranker(config)
        with patch("time.sleep"):
            scores = reranker.score_pairs("query", documents)
        assert len(scores) == 4
        # Every original document position must have received a score.
        assert all(score > 0 for score in scores)

View File

@@ -0,0 +1,156 @@
"""Unit tests for search layer: FTSEngine, fusion, and SearchPipeline."""
from __future__ import annotations
from unittest.mock import MagicMock
import pytest
from codexlens.search.fts import FTSEngine
from codexlens.search.fusion import (
DEFAULT_WEIGHTS,
QueryIntent,
detect_query_intent,
get_adaptive_weights,
reciprocal_rank_fusion,
)
from codexlens.search.pipeline import SearchPipeline, SearchResult
from codexlens.config import Config
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_fts(docs: list[tuple[int, str, str]] | None = None) -> FTSEngine:
    """Create an in-memory FTSEngine; pre-populate it when *docs* is non-empty."""
    engine = FTSEngine(":memory:")
    if not docs:
        return engine
    engine.add_documents(docs)
    return engine
# ---------------------------------------------------------------------------
# FTSEngine tests
# ---------------------------------------------------------------------------
def test_fts_add_and_exact_search():
    """An exact term must hit the document containing it, ranked first."""
    engine = make_fts(
        [
            (1, "a.py", "def authenticate user password login"),
            (2, "b.py", "connect to database with credentials"),
            (3, "c.py", "render template html response"),
        ]
    )
    hits = engine.exact_search("authenticate", top_k=10)
    hit_ids = [hit[0] for hit in hits]
    assert 1 in hit_ids, "doc 1 should match 'authenticate'"
    # If doc 2 sneaks into the results, doc 1 must still be ranked first.
    assert 2 not in hit_ids or hits[0][0] == 1
def test_fts_fuzzy_search_prefix():
    """A prefix query should match documents containing longer words."""
    engine = make_fts(
        [
            (10, "auth.py", "authentication token refresh"),
            (11, "db.py", "database connection pool"),
            (12, "ui.py", "render button click handler"),
        ]
    )
    hits = engine.fuzzy_search("auth", top_k=10)
    hit_ids = [hit[0] for hit in hits]
    assert 10 in hit_ids, "prefix 'auth' should match doc 10 with 'authentication'"
# ---------------------------------------------------------------------------
# RRF fusion tests
# ---------------------------------------------------------------------------
def test_rrf_fusion_ordering():
    """A doc ranked top-1 by both sources must come out first after fusion."""
    rankings = {
        "a": [(1, 0.9), (2, 0.5), (3, 0.2)],
        "b": [(1, 0.8), (3, 0.6), (2, 0.1)],
    }
    fused = reciprocal_rank_fusion(rankings)
    assert fused[0][0] == 1, "doc 1 agreed top by both sources must rank first"
def test_rrf_equal_weight_default():
    """Passing weights=None must fall back to defaults without crashing."""
    fused = reciprocal_rank_fusion(
        {
            "exact": [(5, 1.0), (6, 0.8)],
            "vector": [(6, 0.9), (5, 0.7)],
        },
        weights=None,
    )
    assert len(fused) == 2
    fused_ids = [entry[0] for entry in fused]
    assert 5 in fused_ids
    assert 6 in fused_ids
# ---------------------------------------------------------------------------
# detect_query_intent tests
# ---------------------------------------------------------------------------
def test_detect_intent_code_symbol():
    """Code-like syntax ('def ... ()') should be detected as a symbol query."""
    intent = detect_query_intent("def authenticate()")
    assert intent == QueryIntent.CODE_SYMBOL
def test_detect_intent_natural():
    """A plain-English question should be detected as natural language."""
    intent = detect_query_intent("how do I authenticate users")
    assert intent == QueryIntent.NATURAL_LANGUAGE
# ---------------------------------------------------------------------------
# SearchPipeline tests
# ---------------------------------------------------------------------------
def _make_pipeline(fts: FTSEngine, top_k: int = 5) -> SearchPipeline:
    """Build a SearchPipeline whose heavy stages (embed/ANN/rerank) are mocked."""
    cfg = Config.small()
    cfg.reranker_top_k = top_k

    embedder = MagicMock()
    embedder.embed.return_value = [[0.1] * cfg.embed_dim]

    binary_store = MagicMock()
    binary_store.coarse_search.return_value = ([1, 2, 3], None)

    ann_index = MagicMock()
    ann_index.fine_search.return_value = ([1, 2, 3], [0.9, 0.8, 0.7])

    reranker = MagicMock()
    # Deterministic descending scores, one per candidate content string.
    reranker.score_pairs.side_effect = (
        lambda _query, contents: [0.9 - pos * 0.1 for pos in range(len(contents))]
    )

    return SearchPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        reranker=reranker,
        fts=fts,
        config=cfg,
    )
def test_pipeline_search_returns_results():
    """A basic query through the mocked pipeline yields SearchResult objects."""
    fts = make_fts(
        [
            (1, "a.py", "test content alpha"),
            (2, "b.py", "test content beta"),
            (3, "c.py", "test content gamma"),
        ]
    )
    results = _make_pipeline(fts).search("test")
    assert len(results) > 0
    for item in results:
        assert isinstance(item, SearchResult)
def test_pipeline_top_k_limit():
    """An explicit top_k must bound the number of returned results."""
    corpus = [
        (1, "a.py", "hello world one"),
        (2, "b.py", "hello world two"),
        (3, "c.py", "hello world three"),
        (4, "d.py", "hello world four"),
        (5, "e.py", "hello world five"),
    ]
    pipeline = _make_pipeline(make_fts(corpus), top_k=2)
    results = pipeline.search("hello", top_k=2)
    assert len(results) <= 2, "pipeline must respect top_k limit"