mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-18 18:48:48 +08:00
refactor: rename package codexlens -> codexlens_search for independent distribution
Rename the v2 search engine package to `codexlens-search` (import as `codexlens_search`) so it can be installed independently and consumed by the original codex-lens as a dependency. This avoids package path conflicts since both previously used `src/codexlens/`. Changes: - Rename src/codexlens/ -> src/codexlens_search/ - Update pyproject.toml: name=codexlens-search, version=0.2.0 - Update all imports across source, tests, and scripts - Add public API exports in __init__.py (Config, SearchPipeline, IndexingPipeline, SearchResult, IndexStats) 37/37 tests pass. No functional changes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
20
codex-lens-v2/src/codexlens_search/__init__.py
Normal file
20
codex-lens-v2/src/codexlens_search/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""codexlens-search: Lightweight semantic code search engine.
|
||||
|
||||
Public API for consumers (e.g. codex-lens):
|
||||
|
||||
from codexlens_search import SearchPipeline, IndexingPipeline, Config
|
||||
from codexlens_search.core import create_ann_index, create_binary_index
|
||||
from codexlens_search.embed.local import FastEmbedEmbedder
|
||||
from codexlens_search.rerank.api import APIReranker
|
||||
"""
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.indexing import IndexingPipeline, IndexStats
|
||||
from codexlens_search.search.pipeline import SearchPipeline, SearchResult
|
||||
|
||||
__all__ = [
|
||||
"Config",
|
||||
"IndexingPipeline",
|
||||
"IndexStats",
|
||||
"SearchPipeline",
|
||||
"SearchResult",
|
||||
]
|
||||
99
codex-lens-v2/src/codexlens_search/config.py
Normal file
99
codex-lens-v2/src/codexlens_search/config.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Config:
    """Central configuration for the codexlens-search engine.

    All fields have usable defaults; construct with keyword overrides or
    use the ``defaults()`` / ``small()`` factory classmethods.
    """

    # Embedding
    embed_model: str = "jinaai/jina-embeddings-v2-base-code"
    embed_dim: int = 768
    embed_batch_size: int = 64

    # GPU / execution providers
    device: str = "auto"  # 'auto', 'cuda', 'cpu'
    embed_providers: list[str] | None = None  # explicit ONNX providers override

    # Backend selection: 'auto', 'faiss', 'hnswlib'
    ann_backend: str = "auto"
    binary_backend: str = "auto"

    # Indexing pipeline
    index_workers: int = 2  # number of parallel indexing workers

    # HNSW index (ANNIndex)
    hnsw_ef: int = 150
    hnsw_M: int = 32
    hnsw_ef_construction: int = 200

    # Binary coarse search (BinaryStore)
    binary_top_k: int = 200

    # ANN fine search
    ann_top_k: int = 50

    # Reranker
    reranker_model: str = "BAAI/bge-reranker-v2-m3"
    reranker_top_k: int = 20
    reranker_batch_size: int = 32

    # API reranker (optional)
    reranker_api_url: str = ""
    reranker_api_key: str = ""
    reranker_api_model: str = ""
    reranker_api_max_tokens_per_batch: int = 2048

    # FTS
    fts_top_k: int = 50

    # Fusion
    fusion_k: int = 60  # RRF k parameter
    # Precise value type (was bare ``dict``) so type checkers can validate
    # per-channel weight lookups; default_factory keeps the dict per-instance.
    fusion_weights: dict[str, float] = field(default_factory=lambda: {
        "exact": 0.25,
        "fuzzy": 0.10,
        "vector": 0.50,
        "graph": 0.15,
    })

    def resolve_embed_providers(self) -> list[str]:
        """Return ONNX execution providers based on device config.

        Priority: explicit embed_providers > device setting > auto-detect.
        """
        if self.embed_providers is not None:
            # Return a copy so callers cannot mutate the configured list.
            return list(self.embed_providers)

        if self.device == "cuda":
            return ["CUDAExecutionProvider", "CPUExecutionProvider"]

        if self.device == "cpu":
            return ["CPUExecutionProvider"]

        # auto-detect: probe onnxruntime for CUDA support; absence of
        # onnxruntime simply means CPU-only.
        try:
            import onnxruntime
            available = onnxruntime.get_available_providers()
            if "CUDAExecutionProvider" in available:
                logging.getLogger(__name__).info(
                    "CUDA detected via onnxruntime, using GPU for embedding"
                )
                return ["CUDAExecutionProvider", "CPUExecutionProvider"]
        except ImportError:
            pass

        return ["CPUExecutionProvider"]

    @classmethod
    def defaults(cls) -> "Config":
        """Return a Config with all default values."""
        return cls()

    @classmethod
    def small(cls) -> "Config":
        """Smaller config for testing or small corpora."""
        return cls(
            hnsw_ef=50,
            hnsw_M=16,
            binary_top_k=50,
            ann_top_k=20,
            reranker_top_k=10,
        )
|
||||
13
codex-lens-v2/src/codexlens_search/core/__init__.py
Normal file
13
codex-lens-v2/src/codexlens_search/core/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
"""Public exports for the core index backends."""
from .base import BaseANNIndex, BaseBinaryIndex
from .binary import BinaryStore
from .factory import create_ann_index, create_binary_index
from .index import ANNIndex

__all__ = [
    "BaseANNIndex",
    "BaseBinaryIndex",
    "ANNIndex",
    "BinaryStore",
    "create_ann_index",
    "create_binary_index",
]
|
||||
83
codex-lens-v2/src/codexlens_search/core/base.py
Normal file
83
codex-lens-v2/src/codexlens_search/core/base.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class BaseANNIndex(ABC):
    """Abstract base class for approximate nearest neighbor indexes.

    Concrete implementations in this package: ANNIndex (hnswlib) and
    FAISSANNIndex (faiss); see core.factory.create_ann_index.
    """

    @abstractmethod
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors with corresponding IDs.

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """

    @abstractmethod
    def fine_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search for nearest neighbors.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results

        Returns:
            (ids, distances) as numpy arrays
        """

    @abstractmethod
    def save(self) -> None:
        """Persist index to disk."""

    @abstractmethod
    def load(self) -> None:
        """Load index from disk."""

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of indexed items."""
|
||||
|
||||
|
||||
class BaseBinaryIndex(ABC):
    """Abstract base class for binary vector indexes (Hamming distance).

    Concrete implementations in this package: BinaryStore (numpy) and
    FAISSBinaryIndex (faiss); see core.factory.create_binary_index.
    """

    @abstractmethod
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors (will be binary-quantized internally).

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """

    @abstractmethod
    def coarse_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search by Hamming distance.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results

        Returns:
            (ids, distances) sorted ascending by distance
        """

    @abstractmethod
    def save(self) -> None:
        """Persist store to disk."""

    @abstractmethod
    def load(self) -> None:
        """Load store from disk."""

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of stored items."""
|
||||
173
codex-lens-v2/src/codexlens_search/core/binary.py
Normal file
173
codex-lens-v2/src/codexlens_search/core/binary.py
Normal file
@@ -0,0 +1,173 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import math
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.base import BaseBinaryIndex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BinaryStore(BaseBinaryIndex):
    """Persistent binary vector store using numpy memmap.

    Stores binary-quantized float32 vectors as packed uint8 arrays on disk.
    Supports fast coarse search via XOR + popcount Hamming distance.
    """

    def __init__(self, path: str | Path, dim: int, config: Config) -> None:
        """Open (or create) a store under *path* for *dim*-dimensional vectors."""
        self._dir = Path(path)
        self._dim = dim
        self._config = config
        # One bit per dimension, packed eight to a byte (last byte zero-padded).
        self._packed_bytes = math.ceil(dim / 8)

        self._bin_path = self._dir / "binary_store.bin"
        self._ids_path = self._dir / "binary_store_ids.npy"

        # In-RAM working buffers; capacity grows geometrically in
        # _ensure_capacity, so only the first _count rows are valid.
        self._matrix: np.ndarray | None = None  # shape (N, packed_bytes), uint8
        self._ids: np.ndarray | None = None  # shape (N,), int64
        self._count: int = 0

        # Hydrate eagerly when both files from a previous save() exist.
        if self._bin_path.exists() and self._ids_path.exists():
            self.load()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _quantize(self, vectors: np.ndarray) -> np.ndarray:
        """Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes)."""
        # Sign-bit quantization: strictly positive components map to bit 1.
        binary = (vectors > 0).astype(np.uint8)
        packed = np.packbits(binary, axis=1)
        return packed

    def _quantize_single(self, vec: np.ndarray) -> np.ndarray:
        """Convert a single float32 vector (dim,) to packed uint8 (packed_bytes,)."""
        binary = (vec > 0).astype(np.uint8)
        return np.packbits(binary)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def _ensure_capacity(self, needed: int) -> None:
        """Grow pre-allocated matrix/ids arrays to fit *needed* total items."""
        if self._matrix is not None and self._matrix.shape[0] >= needed:
            return

        # Fresh store: allocate at least 1024 slots up front.
        new_cap = max(1024, needed)
        # Existing store: double until large enough (amortized O(1) appends).
        if self._matrix is not None:
            cur_cap = self._matrix.shape[0]
            new_cap = max(cur_cap, 1024)
            while new_cap < needed:
                new_cap *= 2

        new_matrix = np.zeros((new_cap, self._packed_bytes), dtype=np.uint8)
        new_ids = np.zeros(new_cap, dtype=np.int64)

        # Copy over only the occupied prefix of the old buffers.
        if self._matrix is not None and self._count > 0:
            new_matrix[: self._count] = self._matrix[: self._count]
            new_ids[: self._count] = self._ids[: self._count]

        self._matrix = new_matrix
        self._ids = new_ids

    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors and their ids.

        Does NOT call save() internally -- callers must call save()
        explicitly after batch indexing.

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """
        if len(ids) == 0:
            return

        packed = self._quantize(vectors)  # (N, packed_bytes)
        n = len(ids)

        self._ensure_capacity(self._count + n)
        self._matrix[self._count : self._count + n] = packed
        self._ids[self._count : self._count + n] = ids.astype(np.int64)
        self._count += n

    def coarse_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search by Hamming distance.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; defaults to config.binary_top_k

        Returns:
            (ids, distances) sorted ascending by Hamming distance
        """
        if self._matrix is None or self._count == 0:
            return np.array([], dtype=np.int64), np.array([], dtype=np.int32)

        k = top_k if top_k is not None else self._config.binary_top_k
        k = min(k, self._count)

        query_bin = self._quantize_single(query_vec)  # (packed_bytes,)

        # Slice to active region (matrix may be pre-allocated larger)
        active_matrix = self._matrix[: self._count]
        active_ids = self._ids[: self._count]

        # XOR then popcount via unpackbits
        xor = np.bitwise_xor(active_matrix, query_bin[np.newaxis, :])  # (N, packed_bytes)
        dists = np.unpackbits(xor, axis=1).sum(axis=1).astype(np.int32)  # (N,)

        # Full sort only when everything is returned; otherwise argpartition
        # selects the k smallest in O(N) and only those k get sorted.
        if k >= self._count:
            order = np.argsort(dists)
        else:
            part = np.argpartition(dists, k)[:k]
            order = part[np.argsort(dists[part])]

        return active_ids[order], dists[order]

    def save(self) -> None:
        """Flush binary store to disk.

        No-op while the store is empty, so nothing is written (and no
        zero-length files are created) before the first add().
        """
        if self._matrix is None or self._count == 0:
            return
        self._dir.mkdir(parents=True, exist_ok=True)
        # Write only the occupied portion of the pre-allocated matrix
        active_matrix = self._matrix[: self._count]
        mm = np.memmap(
            str(self._bin_path),
            dtype=np.uint8,
            mode="w+",
            shape=active_matrix.shape,
        )
        mm[:] = active_matrix
        mm.flush()
        # Dropping the memmap reference releases the underlying file handle.
        del mm
        np.save(str(self._ids_path), self._ids[: self._count])

    def load(self) -> None:
        """Reload binary store from disk."""
        ids = np.load(str(self._ids_path))
        n = len(ids)
        if n == 0:
            return
        # The .bin file carries no header; its shape is reconstructed from
        # the id count and the configured dimensionality.
        mm = np.memmap(
            str(self._bin_path),
            dtype=np.uint8,
            mode="r",
            shape=(n, self._packed_bytes),
        )
        self._matrix = np.array(mm)  # copy into RAM for mutation support
        del mm
        self._ids = ids.astype(np.int64)
        self._count = n

    def __len__(self) -> int:
        """Return the number of stored items."""
        return self._count
|
||||
116
codex-lens-v2/src/codexlens_search/core/factory.py
Normal file
116
codex-lens-v2/src/codexlens_search/core/factory.py
Normal file
@@ -0,0 +1,116 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import faiss as _faiss # noqa: F401
|
||||
_FAISS_AVAILABLE = True
|
||||
except ImportError:
|
||||
_FAISS_AVAILABLE = False
|
||||
|
||||
try:
|
||||
import hnswlib as _hnswlib # noqa: F401
|
||||
_HNSWLIB_AVAILABLE = True
|
||||
except ImportError:
|
||||
_HNSWLIB_AVAILABLE = False
|
||||
|
||||
|
||||
def _has_faiss_gpu() -> bool:
    """Check whether faiss-gpu is available (has GPU resources)."""
    if not _FAISS_AVAILABLE:
        return False
    try:
        import faiss

        # Allocating GPU resources raises when the installed build lacks
        # GPU symbols (AttributeError) or no usable GPU exists (RuntimeError).
        faiss.StandardGpuResources()
    except (AttributeError, RuntimeError):
        return False
    return True
|
||||
|
||||
|
||||
def create_ann_index(path: str | Path, dim: int, config: Config) -> BaseANNIndex:
    """Create an ANN index based on config.ann_backend.

    Fallback chain for 'auto': faiss-gpu -> faiss-cpu -> hnswlib.

    Args:
        path: directory for index persistence
        dim: vector dimensionality
        config: project configuration

    Returns:
        A BaseANNIndex implementation

    Raises:
        ImportError: if no suitable backend is available
    """
    backend = config.ann_backend

    # Explicit backend requests import eagerly so a missing dependency
    # surfaces here as an ImportError rather than at first use.
    if backend == "faiss":
        from codexlens_search.core.faiss_index import FAISSANNIndex
        return FAISSANNIndex(path, dim, config)

    if backend == "hnswlib":
        from codexlens_search.core.index import ANNIndex
        return ANNIndex(path, dim, config)

    # auto: try faiss first, then hnswlib
    # NOTE(review): any unrecognized backend string silently falls through
    # to auto-selection here -- consider validating config.ann_backend.
    if _FAISS_AVAILABLE:
        from codexlens_search.core.faiss_index import FAISSANNIndex
        gpu_tag = " (GPU available)" if _has_faiss_gpu() else " (CPU)"
        logger.info("Auto-selected FAISS ANN backend%s", gpu_tag)
        return FAISSANNIndex(path, dim, config)

    if _HNSWLIB_AVAILABLE:
        from codexlens_search.core.index import ANNIndex
        logger.info("Auto-selected hnswlib ANN backend")
        return ANNIndex(path, dim, config)

    raise ImportError(
        "No ANN backend available. Install faiss-cpu, faiss-gpu, or hnswlib."
    )
|
||||
|
||||
|
||||
def create_binary_index(
    path: str | Path, dim: int, config: Config
) -> BaseBinaryIndex:
    """Create a binary index based on config.binary_backend.

    Fallback chain for 'auto': faiss -> numpy BinaryStore.

    Args:
        path: directory for index persistence
        dim: vector dimensionality
        config: project configuration

    Returns:
        A BaseBinaryIndex implementation

    Raises:
        ImportError: if no suitable backend is available
    """
    backend = config.binary_backend

    if backend == "faiss":
        from codexlens_search.core.faiss_index import FAISSBinaryIndex
        return FAISSBinaryIndex(path, dim, config)

    # The 'hnswlib' setting maps to the numpy BinaryStore (presumably
    # because hnswlib offers no Hamming-distance index -- confirm).
    if backend == "hnswlib":
        from codexlens_search.core.binary import BinaryStore
        return BinaryStore(path, dim, config)

    # auto: try faiss first, then numpy-based BinaryStore
    # NOTE(review): unrecognized backend strings fall through to auto,
    # mirroring create_ann_index.
    if _FAISS_AVAILABLE:
        from codexlens_search.core.faiss_index import FAISSBinaryIndex
        logger.info("Auto-selected FAISS binary backend")
        return FAISSBinaryIndex(path, dim, config)

    # numpy BinaryStore is always available (no extra deps)
    from codexlens_search.core.binary import BinaryStore
    logger.info("Auto-selected numpy BinaryStore backend")
    return BinaryStore(path, dim, config)
|
||||
275
codex-lens-v2/src/codexlens_search/core/faiss_index.py
Normal file
275
codex-lens-v2/src/codexlens_search/core/faiss_index.py
Normal file
@@ -0,0 +1,275 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import math
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import faiss
|
||||
_FAISS_AVAILABLE = True
|
||||
except ImportError:
|
||||
faiss = None # type: ignore[assignment]
|
||||
_FAISS_AVAILABLE = False
|
||||
|
||||
|
||||
def _try_gpu_index(index: "faiss.Index") -> "faiss.Index":
    """Transfer a FAISS index to GPU if faiss-gpu is available.

    Returns the GPU index on success, or the original CPU index on failure.
    """
    try:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        logger.info("FAISS index transferred to GPU 0")
        return gpu_index
    except (AttributeError, RuntimeError) as exc:
        # AttributeError: faiss-cpu build has no GPU symbols;
        # RuntimeError: GPU support compiled in but unusable at runtime.
        logger.debug("GPU transfer unavailable, staying on CPU: %s", exc)
        return index
|
||||
|
||||
|
||||
def _to_cpu_for_save(index: "faiss.Index") -> "faiss.Index":
    """Convert a GPU index back to CPU for serialization.

    Returns the original index unchanged when conversion is not possible
    (CPU-only build, or the index is already a CPU index).
    """
    try:
        return faiss.index_gpu_to_cpu(index)
    except (AttributeError, RuntimeError):
        return index
|
||||
|
||||
|
||||
class FAISSANNIndex(BaseANNIndex):
    """FAISS-based ANN index using IndexHNSWFlat with optional GPU.

    Uses Inner Product space with L2-normalized vectors for cosine similarity.
    Thread-safe via RLock.
    """

    def __init__(self, path: str | Path, dim: int, config: Config) -> None:
        """Prepare an index rooted at *path*; loading is deferred to first use."""
        if not _FAISS_AVAILABLE:
            raise ImportError(
                "faiss is required. Install with: pip install faiss-cpu "
                "or pip install faiss-gpu"
            )

        self._path = Path(path)
        self._index_path = self._path / "faiss_ann.index"
        self._dim = dim
        self._config = config
        # RLock because public methods hold the lock while calling load(),
        # which locks again.
        self._lock = threading.RLock()
        self._index: faiss.Index | None = None

    def _ensure_loaded(self) -> None:
        """Load or initialize the index (caller holds lock)."""
        if self._index is not None:
            return
        self.load()

    def load(self) -> None:
        """Load index from disk or initialize a fresh one."""
        with self._lock:
            if self._index_path.exists():
                idx = faiss.read_index(str(self._index_path))
                logger.debug(
                    "Loaded FAISS ANN index from %s (%d items)",
                    self._index_path, idx.ntotal,
                )
            else:
                # HNSW with flat storage, M=32 by default
                m = self._config.hnsw_M
                idx = faiss.IndexHNSWFlat(self._dim, m, faiss.METRIC_INNER_PRODUCT)
                idx.hnsw.efConstruction = self._config.hnsw_ef_construction
                idx.hnsw.efSearch = self._config.hnsw_ef
                logger.debug(
                    "Initialized fresh FAISS HNSW index (dim=%d, M=%d)",
                    self._dim, m,
                )
            # Best-effort GPU transfer; falls back to the CPU index.
            self._index = _try_gpu_index(idx)

    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add L2-normalized float32 vectors.

        Vectors are normalized before insertion so that Inner Product
        distance equals cosine similarity.

        Args:
            ids: shape (N,) int64 -- currently unused by FAISS flat index
                but kept for API compatibility. FAISS uses sequential IDs.
            vectors: shape (N, dim) float32
        """
        if len(ids) == 0:
            return

        # NOTE(review): ascontiguousarray returns the input array itself when
        # it is already contiguous float32, and normalize_L2 mutates in place,
        # so the caller's array may be normalized as a side effect -- confirm.
        vecs = np.ascontiguousarray(vectors, dtype=np.float32)
        # Normalize for cosine similarity via Inner Product
        faiss.normalize_L2(vecs)

        with self._lock:
            self._ensure_loaded()
            self._index.add(vecs)

    def fine_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search for nearest neighbors.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; defaults to config.ann_top_k

        Returns:
            (ids, distances) as numpy arrays. For IP space, higher = more
            similar, but distances are returned as-is for consumer handling.
        """
        k = top_k if top_k is not None else self._config.ann_top_k

        with self._lock:
            self._ensure_loaded()

            count = self._index.ntotal
            if count == 0:
                return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

            k = min(k, count)
            # Set efSearch for HNSW accuracy
            try:
                self._index.hnsw.efSearch = max(self._config.hnsw_ef, k)
            except AttributeError:
                pass  # GPU index may not expose hnsw attribute directly

            q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
            faiss.normalize_L2(q)
            distances, labels = self._index.search(q, k)
            # Labels are FAISS-internal sequential positions (see add()).
            return labels[0].astype(np.int64), distances[0].astype(np.float32)

    def save(self) -> None:
        """Save index to disk."""
        with self._lock:
            if self._index is None:
                return
            self._path.mkdir(parents=True, exist_ok=True)
            # A GPU index must be converted back to CPU before serialization.
            cpu_index = _to_cpu_for_save(self._index)
            faiss.write_index(cpu_index, str(self._index_path))

    def __len__(self) -> int:
        """Return the number of indexed items."""
        with self._lock:
            if self._index is None:
                return 0
            return self._index.ntotal
|
||||
|
||||
|
||||
class FAISSBinaryIndex(BaseBinaryIndex):
    """FAISS-based binary index using IndexBinaryFlat for Hamming distance.

    Vectors are binary-quantized (sign bit) before insertion.
    Thread-safe via RLock.
    """

    def __init__(self, path: str | Path, dim: int, config: Config) -> None:
        """Prepare a binary index rooted at *path*; loading is deferred."""
        if not _FAISS_AVAILABLE:
            raise ImportError(
                "faiss is required. Install with: pip install faiss-cpu "
                "or pip install faiss-gpu"
            )

        self._path = Path(path)
        self._index_path = self._path / "faiss_binary.index"
        self._dim = dim
        self._config = config
        # Bits packed eight per byte for IndexBinaryFlat's uint8 codes.
        self._packed_bytes = math.ceil(dim / 8)
        self._lock = threading.RLock()
        self._index: faiss.IndexBinary | None = None

    def _ensure_loaded(self) -> None:
        # Lazy initialization; caller holds the lock.
        if self._index is not None:
            return
        self.load()

    def _quantize(self, vectors: np.ndarray) -> np.ndarray:
        """Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes)."""
        # Sign-bit quantization: strictly positive components map to bit 1.
        binary = (vectors > 0).astype(np.uint8)
        return np.packbits(binary, axis=1)

    def _quantize_single(self, vec: np.ndarray) -> np.ndarray:
        """Convert a single float32 vector (dim,) to packed uint8 (1, packed_bytes)."""
        binary = (vec > 0).astype(np.uint8)
        return np.packbits(binary).reshape(1, -1)

    def load(self) -> None:
        """Load binary index from disk or initialize a fresh one."""
        with self._lock:
            if self._index_path.exists():
                idx = faiss.read_index_binary(str(self._index_path))
                logger.debug(
                    "Loaded FAISS binary index from %s (%d items)",
                    self._index_path, idx.ntotal,
                )
            else:
                # IndexBinaryFlat takes dimension in bits
                idx = faiss.IndexBinaryFlat(self._dim)
                logger.debug(
                    "Initialized fresh FAISS binary index (dim_bits=%d)", self._dim,
                )
            self._index = idx

    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors (binary-quantized internally).

        Args:
            ids: shape (N,) int64 -- kept for API compatibility
            vectors: shape (N, dim) float32
        """
        if len(ids) == 0:
            return

        packed = self._quantize(vectors)
        packed = np.ascontiguousarray(packed, dtype=np.uint8)

        with self._lock:
            self._ensure_loaded()
            # NOTE(review): ids are not passed to FAISS, so labels returned by
            # coarse_search are sequential insertion positions -- confirm that
            # callers map them back to their own ids.
            self._index.add(packed)

    def coarse_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search by Hamming distance.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; defaults to config.binary_top_k

        Returns:
            (ids, distances) sorted ascending by Hamming distance
        """
        with self._lock:
            self._ensure_loaded()

            if self._index.ntotal == 0:
                return np.array([], dtype=np.int64), np.array([], dtype=np.int32)

            k = top_k if top_k is not None else self._config.binary_top_k
            k = min(k, self._index.ntotal)

            q = self._quantize_single(query_vec)
            q = np.ascontiguousarray(q, dtype=np.uint8)
            distances, labels = self._index.search(q, k)
            return labels[0].astype(np.int64), distances[0].astype(np.int32)

    def save(self) -> None:
        """Save binary index to disk."""
        with self._lock:
            if self._index is None:
                return
            self._path.mkdir(parents=True, exist_ok=True)
            faiss.write_index_binary(self._index, str(self._index_path))

    def __len__(self) -> int:
        """Return the number of stored items."""
        with self._lock:
            if self._index is None:
                return 0
            return self._index.ntotal
|
||||
136
codex-lens-v2/src/codexlens_search/core/index.py
Normal file
136
codex-lens-v2/src/codexlens_search/core/index.py
Normal file
@@ -0,0 +1,136 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.base import BaseANNIndex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import hnswlib
|
||||
_HNSWLIB_AVAILABLE = True
|
||||
except ImportError:
|
||||
_HNSWLIB_AVAILABLE = False
|
||||
|
||||
|
||||
class ANNIndex(BaseANNIndex):
|
||||
"""HNSW-based approximate nearest neighbor index.
|
||||
|
||||
Lazy-loads on first use, thread-safe via RLock.
|
||||
"""
|
||||
|
||||
def __init__(self, path: str | Path, dim: int, config: Config) -> None:
|
||||
if not _HNSWLIB_AVAILABLE:
|
||||
raise ImportError("hnswlib is required. Install with: pip install hnswlib")
|
||||
|
||||
self._path = Path(path)
|
||||
self._hnsw_path = self._path / "ann_index.hnsw"
|
||||
self._dim = dim
|
||||
self._config = config
|
||||
self._lock = threading.RLock()
|
||||
self._index: hnswlib.Index | None = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _ensure_loaded(self) -> None:
|
||||
"""Load or initialize the index (caller holds lock)."""
|
||||
if self._index is not None:
|
||||
return
|
||||
self.load()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def load(self) -> None:
|
||||
"""Load index from disk or initialize a fresh one."""
|
||||
with self._lock:
|
||||
idx = hnswlib.Index(space="cosine", dim=self._dim)
|
||||
if self._hnsw_path.exists():
|
||||
idx.load_index(str(self._hnsw_path), max_elements=0)
|
||||
idx.set_ef(self._config.hnsw_ef)
|
||||
logger.debug("Loaded HNSW index from %s (%d items)", self._hnsw_path, idx.get_current_count())
|
||||
else:
|
||||
idx.init_index(
|
||||
max_elements=1000,
|
||||
ef_construction=self._config.hnsw_ef_construction,
|
||||
M=self._config.hnsw_M,
|
||||
)
|
||||
idx.set_ef(self._config.hnsw_ef)
|
||||
logger.debug("Initialized fresh HNSW index (dim=%d)", self._dim)
|
||||
self._index = idx
|
||||
|
||||
def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
|
||||
"""Add float32 vectors.
|
||||
|
||||
Does NOT call save() internally -- callers must call save()
|
||||
explicitly after batch indexing.
|
||||
|
||||
Args:
|
||||
ids: shape (N,) int64
|
||||
vectors: shape (N, dim) float32
|
||||
"""
|
||||
if len(ids) == 0:
|
||||
return
|
||||
|
||||
vecs = np.ascontiguousarray(vectors, dtype=np.float32)
|
||||
|
||||
with self._lock:
|
||||
self._ensure_loaded()
|
||||
# Expand capacity if needed
|
||||
current = self._index.get_current_count()
|
||||
max_el = self._index.get_max_elements()
|
||||
needed = current + len(ids)
|
||||
if needed > max_el:
|
||||
new_cap = max(max_el * 2, needed + 100)
|
||||
self._index.resize_index(new_cap)
|
||||
self._index.add_items(vecs, ids.astype(np.int64))
|
||||
|
||||
def fine_search(
|
||||
self, query_vec: np.ndarray, top_k: int | None = None
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
"""Search for nearest neighbors.
|
||||
|
||||
Args:
|
||||
query_vec: float32 vector of shape (dim,)
|
||||
top_k: number of results; defaults to config.ann_top_k
|
||||
|
||||
Returns:
|
||||
(ids, distances) as numpy arrays
|
||||
"""
|
||||
k = top_k if top_k is not None else self._config.ann_top_k
|
||||
|
||||
with self._lock:
|
||||
self._ensure_loaded()
|
||||
|
||||
count = self._index.get_current_count()
|
||||
if count == 0:
|
||||
return np.array([], dtype=np.int64), np.array([], dtype=np.float32)
|
||||
|
||||
k = min(k, count)
|
||||
self._index.set_ef(max(self._config.hnsw_ef, k))
|
||||
|
||||
q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
|
||||
labels, distances = self._index.knn_query(q, k=k)
|
||||
return labels[0].astype(np.int64), distances[0].astype(np.float32)
|
||||
|
||||
def save(self) -> None:
|
||||
"""Save index to disk (caller may or may not hold lock)."""
|
||||
with self._lock:
|
||||
if self._index is None:
|
||||
return
|
||||
self._path.mkdir(parents=True, exist_ok=True)
|
||||
self._index.save_index(str(self._hnsw_path))
|
||||
|
||||
def __len__(self) -> int:
    """Number of vectors currently stored in the index (0 when unloaded)."""
    with self._lock:
        return 0 if self._index is None else self._index.get_current_count()
|
||||
4
codex-lens-v2/src/codexlens_search/embed/__init__.py
Normal file
4
codex-lens-v2/src/codexlens_search/embed/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .base import BaseEmbedder
|
||||
from .local import FastEmbedEmbedder, EMBED_PROFILES
|
||||
|
||||
__all__ = ["BaseEmbedder", "FastEmbedEmbedder", "EMBED_PROFILES"]
|
||||
13
codex-lens-v2/src/codexlens_search/embed/base.py
Normal file
13
codex-lens-v2/src/codexlens_search/embed/base.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from __future__ import annotations
|
||||
from abc import ABC, abstractmethod
|
||||
import numpy as np
|
||||
|
||||
|
||||
class BaseEmbedder(ABC):
    """Abstract interface for text embedding backends.

    Implementations return float32 numpy vectors so downstream stores
    (binary store, ANN index) can consume them without conversion.
    """

    @abstractmethod
    def embed_single(self, text: str) -> np.ndarray:
        """Embed a single text, returns float32 ndarray shape (dim,)."""

    @abstractmethod
    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a list of texts, returns list of float32 ndarrays."""
|
||||
53
codex-lens-v2/src/codexlens_search/embed/local.py
Normal file
53
codex-lens-v2/src/codexlens_search/embed/local.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..config import Config
|
||||
from .base import BaseEmbedder
|
||||
|
||||
# Named embedding profiles: short alias -> fastembed model id.
# The output dimensionality of each model is noted inline.
EMBED_PROFILES = {
    "small": "BAAI/bge-small-en-v1.5",  # 384d
    "base": "BAAI/bge-base-en-v1.5",  # 768d
    "large": "BAAI/bge-large-en-v1.5",  # 1024d
    "code": "jinaai/jina-embeddings-v2-base-code",  # 768d
}
|
||||
|
||||
|
||||
class FastEmbedEmbedder(BaseEmbedder):
    """Embedder backed by fastembed.TextEmbedding with lazy model loading.

    The model is imported and instantiated only on the first embed call, so
    constructing the embedder stays cheap and does not require the fastembed
    dependency until embedding is actually performed.
    """

    def __init__(self, config: Config) -> None:
        self._config = config
        self._model = None  # loaded lazily by _load()

    def _load(self) -> None:
        """Lazy-load the fastembed TextEmbedding model on first use."""
        if self._model is not None:
            return
        from fastembed import TextEmbedding

        providers = self._config.resolve_embed_providers()
        try:
            self._model = TextEmbedding(
                model_name=self._config.embed_model,
                providers=providers,
            )
        except TypeError:
            # Older fastembed versions may not accept the providers kwarg.
            self._model = TextEmbedding(model_name=self._config.embed_model)

    def embed_single(self, text: str) -> np.ndarray:
        """Embed a single text, returns float32 ndarray of shape (dim,)."""
        self._load()
        result = list(self._model.embed([text]))
        return result[0].astype(np.float32)

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a list of texts in batches, returns list of float32 ndarrays.

        Returns early for empty input without triggering the model load
        (consistent with APIReranker.score_pairs' empty-input handling).
        """
        if not texts:
            return []
        self._load()
        batch_size = self._config.embed_batch_size
        results: list[np.ndarray] = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            for vec in self._model.embed(batch):
                results.append(vec.astype(np.float32))
        return results
|
||||
5
codex-lens-v2/src/codexlens_search/indexing/__init__.py
Normal file
5
codex-lens-v2/src/codexlens_search/indexing/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .pipeline import IndexingPipeline, IndexStats
|
||||
|
||||
__all__ = ["IndexingPipeline", "IndexStats"]
|
||||
277
codex-lens-v2/src/codexlens_search/indexing/pipeline.py
Normal file
277
codex-lens-v2/src/codexlens_search/indexing/pipeline.py
Normal file
@@ -0,0 +1,277 @@
|
||||
"""Three-stage parallel indexing pipeline: chunk -> embed -> index.
|
||||
|
||||
Uses threading.Thread with queue.Queue for producer-consumer handoff.
|
||||
The GIL is acceptable because embedding (onnxruntime) releases it in C extensions.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.binary import BinaryStore
|
||||
from codexlens_search.core.index import ANNIndex
|
||||
from codexlens_search.embed.base import BaseEmbedder
|
||||
from codexlens_search.search.fts import FTSEngine
|
||||
|
||||
# Module-level logger for the indexing pipeline.
logger = logging.getLogger(__name__)

# Sentinel value to signal worker shutdown
_SENTINEL = None

# Defaults for chunking (can be overridden via index_files kwargs)
_DEFAULT_MAX_CHUNK_CHARS = 800
_DEFAULT_CHUNK_OVERLAP = 100
|
||||
|
||||
|
||||
@dataclass
class IndexStats:
    """Statistics returned after indexing completes."""

    # Files that produced at least one chunk (skipped/empty files excluded).
    files_processed: int = 0
    # Total chunks pushed through the embed/index stages.
    chunks_created: int = 0
    # Wall-clock duration of the pipeline run, rounded to 2 decimal places.
    duration_seconds: float = 0.0
|
||||
|
||||
|
||||
class IndexingPipeline:
    """Parallel 3-stage indexing pipeline with queue-based handoff.

    Stage 1 (main thread): Read files, chunk text, push to embed_queue.
    Stage 2 (embed worker): Pull text batches, call embed_batch(), push vectors to index_queue.
    Stage 3 (index worker): Pull vectors+ids, call BinaryStore.add(), ANNIndex.add(), FTS.add_documents().

    After all stages complete, save() is called on BinaryStore and ANNIndex exactly once.
    """

    def __init__(
        self,
        embedder: BaseEmbedder,
        binary_store: BinaryStore,
        ann_index: ANNIndex,
        fts: FTSEngine,
        config: Config,
    ) -> None:
        self._embedder = embedder
        self._binary_store = binary_store
        self._ann_index = ann_index
        self._fts = fts
        self._config = config

    def index_files(
        self,
        files: list[Path],
        *,
        root: Path | None = None,
        max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS,
        chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
        max_file_size: int = 50_000,
    ) -> IndexStats:
        """Run the 3-stage pipeline on the given files.

        Args:
            files: List of file paths to index.
            root: Optional root for computing relative paths. If None, uses
                each file's absolute path as its identifier.
            max_chunk_chars: Maximum characters per chunk.
            chunk_overlap: Character overlap between consecutive chunks.
            max_file_size: Skip files larger than this (bytes).

        Returns:
            IndexStats with counts and timing.

        Raises:
            Exception: re-raises the first error recorded by a worker thread
                (stores are still flushed before raising).
        """
        if not files:
            return IndexStats()

        t0 = time.monotonic()

        # Small maxsize applies backpressure: chunking cannot run arbitrarily
        # far ahead of embedding, nor embedding ahead of indexing.
        embed_queue: queue.Queue = queue.Queue(maxsize=4)
        index_queue: queue.Queue = queue.Queue(maxsize=4)

        # Track errors from workers
        worker_errors: list[Exception] = []
        error_lock = threading.Lock()

        def _record_error(exc: Exception) -> None:
            with error_lock:
                worker_errors.append(exc)

        # --- Start workers ---
        # daemon=True so a crashed main thread does not hang on these workers.
        embed_thread = threading.Thread(
            target=self._embed_worker,
            args=(embed_queue, index_queue, _record_error),
            daemon=True,
            name="indexing-embed",
        )
        index_thread = threading.Thread(
            target=self._index_worker,
            args=(index_queue, _record_error),
            daemon=True,
            name="indexing-index",
        )
        embed_thread.start()
        index_thread.start()

        # --- Stage 1: chunk files (main thread) ---
        chunk_id = 0  # monotonically increasing id, unique across all files
        files_processed = 0
        chunks_created = 0

        for fpath in files:
            try:
                if fpath.stat().st_size > max_file_size:
                    continue
                text = fpath.read_text(encoding="utf-8", errors="replace")
            except Exception as exc:
                # Unreadable or vanished files are skipped, never fatal.
                logger.debug("Skipping %s: %s", fpath, exc)
                continue

            rel_path = str(fpath.relative_to(root)) if root else str(fpath)
            file_chunks = self._chunk_text(text, rel_path, max_chunk_chars, chunk_overlap)

            if not file_chunks:
                continue

            files_processed += 1

            # Assign sequential IDs and push batch to embed queue
            batch_ids = []
            batch_texts = []
            batch_paths = []
            for chunk_text, path in file_chunks:
                batch_ids.append(chunk_id)
                batch_texts.append(chunk_text)
                batch_paths.append(path)
                chunk_id += 1

            chunks_created += len(batch_ids)
            # Blocks when the queue is full (backpressure).
            embed_queue.put((batch_ids, batch_texts, batch_paths))

        # Signal embed worker: no more data
        embed_queue.put(_SENTINEL)

        # Wait for workers to finish
        # (embed worker forwards the sentinel to the index worker on exit).
        embed_thread.join()
        index_thread.join()

        # --- Final flush ---
        # Workers never call save(); it happens exactly once, here.
        self._binary_store.save()
        self._ann_index.save()

        duration = time.monotonic() - t0
        stats = IndexStats(
            files_processed=files_processed,
            chunks_created=chunks_created,
            duration_seconds=round(duration, 2),
        )

        logger.info(
            "Indexing complete: %d files, %d chunks in %.1fs",
            stats.files_processed,
            stats.chunks_created,
            stats.duration_seconds,
        )

        # Raise first worker error if any occurred
        if worker_errors:
            raise worker_errors[0]

        return stats

    # ------------------------------------------------------------------
    # Workers
    # ------------------------------------------------------------------

    def _embed_worker(
        self,
        in_q: queue.Queue,
        out_q: queue.Queue,
        on_error: callable,
    ) -> None:
        """Stage 2: Pull chunk batches, embed, push (ids, vecs, docs) to index queue."""
        try:
            while True:
                item = in_q.get()
                if item is _SENTINEL:
                    break

                batch_ids, batch_texts, batch_paths = item
                try:
                    vecs = self._embedder.embed_batch(batch_texts)
                    vec_array = np.array(vecs, dtype=np.float32)
                    id_array = np.array(batch_ids, dtype=np.int64)
                    out_q.put((id_array, vec_array, batch_texts, batch_paths))
                except Exception as exc:
                    # A failed batch is recorded and skipped; later batches
                    # still get processed.
                    logger.error("Embed worker error: %s", exc)
                    on_error(exc)
        finally:
            # Signal index worker: no more data
            # (finally guarantees the downstream worker always terminates).
            out_q.put(_SENTINEL)

    def _index_worker(
        self,
        in_q: queue.Queue,
        on_error: callable,
    ) -> None:
        """Stage 3: Pull (ids, vecs, texts, paths), write to stores."""
        while True:
            item = in_q.get()
            if item is _SENTINEL:
                break

            id_array, vec_array, texts, paths = item
            try:
                self._binary_store.add(id_array, vec_array)
                self._ann_index.add(id_array, vec_array)

                fts_docs = [
                    (int(id_array[i]), paths[i], texts[i])
                    for i in range(len(id_array))
                ]
                self._fts.add_documents(fts_docs)
            except Exception as exc:
                logger.error("Index worker error: %s", exc)
                on_error(exc)

    # ------------------------------------------------------------------
    # Chunking
    # ------------------------------------------------------------------

    @staticmethod
    def _chunk_text(
        text: str,
        path: str,
        max_chars: int,
        overlap: int,
    ) -> list[tuple[str, str]]:
        """Split file text into overlapping chunks.

        Chunks break on line boundaries; when a chunk is emitted, the last
        *overlap* characters carry over into the next chunk for context.

        Returns list of (chunk_text, path) tuples.
        """
        if not text.strip():
            return []

        chunks: list[tuple[str, str]] = []
        lines = text.splitlines(keepends=True)
        current: list[str] = []
        current_len = 0

        for line in lines:
            if current_len + len(line) > max_chars and current:
                chunk = "".join(current)
                chunks.append((chunk, path))
                # overlap: keep last N characters
                tail = "".join(current)[-overlap:]
                current = [tail] if tail else []
                current_len = len(tail)
            current.append(line)
            current_len += len(line)

        if current:
            chunks.append(("".join(current), path))

        return chunks
|
||||
5
codex-lens-v2/src/codexlens_search/rerank/__init__.py
Normal file
5
codex-lens-v2/src/codexlens_search/rerank/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .base import BaseReranker
|
||||
from .local import FastEmbedReranker
|
||||
from .api import APIReranker
|
||||
|
||||
__all__ = ["BaseReranker", "FastEmbedReranker", "APIReranker"]
|
||||
103
codex-lens-v2/src/codexlens_search/rerank/api.py
Normal file
103
codex-lens-v2/src/codexlens_search/rerank/api.py
Normal file
@@ -0,0 +1,103 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
import httpx
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from .base import BaseReranker
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class APIReranker(BaseReranker):
    """Reranker backed by a remote HTTP API (SiliconFlow/Cohere/Jina format).

    Documents are packed into token-budgeted batches, each batch is scored
    via POST {reranker_api_url}/rerank, and per-document scores are mapped
    back to their original positions.
    """

    def __init__(self, config: Config) -> None:
        self._config = config
        self._client = httpx.Client(
            headers={
                "Authorization": f"Bearer {config.reranker_api_key}",
                "Content-Type": "application/json",
            },
        )

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score each document against *query*.

        Returns a list of floats aligned with *documents*; any document the
        API does not return a score for keeps the default 0.0.
        """
        if not documents:
            return []
        max_tokens = self._config.reranker_api_max_tokens_per_batch
        batches = self._split_batches(documents, max_tokens)
        scores = [0.0] * len(documents)
        for batch in batches:
            batch_scores = self._call_api_with_retry(query, batch)
            for orig_idx, score in batch_scores.items():
                scores[orig_idx] = score
        return scores

    def _split_batches(
        self, documents: list[str], max_tokens: int
    ) -> list[list[tuple[int, str]]]:
        """Greedily pack (original_index, text) pairs into token-budgeted batches.

        Token counts are estimated as len(text) // 4 (rough chars-per-token
        heuristic). A single oversized document still forms its own batch.
        """
        batches: list[list[tuple[int, str]]] = []
        current_batch: list[tuple[int, str]] = []
        current_tokens = 0

        for idx, text in enumerate(documents):
            doc_tokens = len(text) // 4
            if current_tokens + doc_tokens > max_tokens and current_batch:
                batches.append(current_batch)
                current_batch = []
                current_tokens = 0
            current_batch.append((idx, text))
            current_tokens += doc_tokens

        if current_batch:
            batches.append(current_batch)

        return batches

    def _call_api_with_retry(
        self,
        query: str,
        docs: list[tuple[int, str]],
        max_retries: int = 3,
    ) -> dict[int, float]:
        """POST one batch to the rerank endpoint with exponential backoff.

        Retries on transport errors and on HTTP 429/503; any other HTTP error
        raises immediately via raise_for_status().

        Returns:
            Mapping of original document index -> relevance score.

        Raises:
            RuntimeError: when all attempts fail.
        """
        url = self._config.reranker_api_url.rstrip("/") + "/rerank"
        payload = {
            "model": self._config.reranker_api_model,
            "query": query,
            "documents": [t for _, t in docs],
        }

        last_exc: Exception | None = None
        for attempt in range(max_retries):
            try:
                response = self._client.post(url, json=payload)
            except Exception as exc:
                last_exc = exc
                time.sleep((2 ** attempt) * 0.5)  # exponential backoff
                continue

            if response.status_code in (429, 503):
                # Rate-limited / temporarily unavailable. Record the status as
                # the last failure so the final RuntimeError is informative
                # (previously it reported "Last error: None" in this path).
                last_exc = RuntimeError(f"HTTP {response.status_code}")
                logger.warning(
                    "API reranker returned HTTP %s (attempt %d/%d), retrying...",
                    response.status_code,
                    attempt + 1,
                    max_retries,
                )
                time.sleep((2 ** attempt) * 0.5)
                continue

            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            scores: dict[int, float] = {}
            for item in results:
                # The API returns indices local to this batch; map them back
                # to the caller's original document positions.
                local_idx = int(item["index"])
                orig_idx = docs[local_idx][0]
                scores[orig_idx] = float(item["relevance_score"])
            return scores

        raise RuntimeError(
            f"API reranker failed after {max_retries} attempts. Last error: {last_exc}"
        )
|
||||
8
codex-lens-v2/src/codexlens_search/rerank/base.py
Normal file
8
codex-lens-v2/src/codexlens_search/rerank/base.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from __future__ import annotations
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseReranker(ABC):
    """Abstract interface for rerankers that score query/document pairs."""

    @abstractmethod
    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score (query, doc) pairs. Returns list of floats same length as documents."""
|
||||
25
codex-lens-v2/src/codexlens_search/rerank/local.py
Normal file
25
codex-lens-v2/src/codexlens_search/rerank/local.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from codexlens_search.config import Config
|
||||
from .base import BaseReranker
|
||||
|
||||
|
||||
class FastEmbedReranker(BaseReranker):
    """Local reranker backed by fastembed TextCrossEncoder.

    The cross-encoder model is imported and instantiated lazily on the first
    scoring call.
    """

    def __init__(self, config: Config) -> None:
        self._config = config
        self._model = None  # loaded lazily by _load()

    def _load(self) -> None:
        """Import and instantiate the cross-encoder on first use."""
        if self._model is None:
            from fastembed.rerank.cross_encoder import TextCrossEncoder
            self._model = TextCrossEncoder(model_name=self._config.reranker_model)

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score each document against *query*.

        Returns a list of floats aligned with *documents*. Empty input
        returns early without loading the model, matching
        APIReranker.score_pairs' empty-input behavior.
        """
        if not documents:
            return []
        self._load()
        results = list(self._model.rerank(query, documents))
        # rerank() results carry an index; place each score at the document
        # position it refers to rather than assuming input order.
        scores = [0.0] * len(documents)
        for r in results:
            scores[r.index] = float(r.score)
        return scores
|
||||
8
codex-lens-v2/src/codexlens_search/search/__init__.py
Normal file
8
codex-lens-v2/src/codexlens_search/search/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
from .fts import FTSEngine
|
||||
from .fusion import reciprocal_rank_fusion, detect_query_intent, QueryIntent, DEFAULT_WEIGHTS
|
||||
from .pipeline import SearchPipeline, SearchResult
|
||||
|
||||
__all__ = [
|
||||
"FTSEngine", "reciprocal_rank_fusion", "detect_query_intent",
|
||||
"QueryIntent", "DEFAULT_WEIGHTS", "SearchPipeline", "SearchResult",
|
||||
]
|
||||
69
codex-lens-v2/src/codexlens_search/search/fts.py
Normal file
69
codex-lens-v2/src/codexlens_search/search/fts.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class FTSEngine:
|
||||
def __init__(self, db_path: str | Path) -> None:
|
||||
self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
|
||||
self._conn.execute(
|
||||
"CREATE VIRTUAL TABLE IF NOT EXISTS docs "
|
||||
"USING fts5(content, tokenize='porter unicode61')"
|
||||
)
|
||||
self._conn.execute(
|
||||
"CREATE TABLE IF NOT EXISTS docs_meta "
|
||||
"(id INTEGER PRIMARY KEY, path TEXT)"
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def add_documents(self, docs: list[tuple[int, str, str]]) -> None:
|
||||
"""Add documents in batch. docs: list of (id, path, content)."""
|
||||
if not docs:
|
||||
return
|
||||
self._conn.executemany(
|
||||
"INSERT OR REPLACE INTO docs_meta (id, path) VALUES (?, ?)",
|
||||
[(doc_id, path) for doc_id, path, content in docs],
|
||||
)
|
||||
self._conn.executemany(
|
||||
"INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
|
||||
[(doc_id, content) for doc_id, path, content in docs],
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def exact_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
|
||||
"""FTS5 MATCH query, return (id, bm25_score) sorted by score descending."""
|
||||
try:
|
||||
rows = self._conn.execute(
|
||||
"SELECT rowid, bm25(docs) AS score FROM docs "
|
||||
"WHERE docs MATCH ? ORDER BY score LIMIT ?",
|
||||
(query, top_k),
|
||||
).fetchall()
|
||||
except sqlite3.OperationalError:
|
||||
return []
|
||||
# bm25 in SQLite FTS5 returns negative values (lower = better match)
|
||||
# Negate so higher is better
|
||||
return [(int(row[0]), -float(row[1])) for row in rows]
|
||||
|
||||
def fuzzy_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
|
||||
"""Prefix search: each token + '*', return (id, score) sorted descending."""
|
||||
tokens = query.strip().split()
|
||||
if not tokens:
|
||||
return []
|
||||
prefix_query = " ".join(t + "*" for t in tokens)
|
||||
try:
|
||||
rows = self._conn.execute(
|
||||
"SELECT rowid, bm25(docs) AS score FROM docs "
|
||||
"WHERE docs MATCH ? ORDER BY score LIMIT ?",
|
||||
(prefix_query, top_k),
|
||||
).fetchall()
|
||||
except sqlite3.OperationalError:
|
||||
return []
|
||||
return [(int(row[0]), -float(row[1])) for row in rows]
|
||||
|
||||
def get_content(self, doc_id: int) -> str:
|
||||
"""Retrieve content for a doc_id."""
|
||||
row = self._conn.execute(
|
||||
"SELECT content FROM docs WHERE rowid = ?", (doc_id,)
|
||||
).fetchone()
|
||||
return row[0] if row else ""
|
||||
106
codex-lens-v2/src/codexlens_search/search/fusion.py
Normal file
106
codex-lens-v2/src/codexlens_search/search/fusion.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from enum import Enum
|
||||
|
||||
DEFAULT_WEIGHTS: dict[str, float] = {
|
||||
"exact": 0.25,
|
||||
"fuzzy": 0.10,
|
||||
"vector": 0.50,
|
||||
"graph": 0.15,
|
||||
}
|
||||
|
||||
_CODE_CAMEL_RE = re.compile(r"[a-z][A-Z]")
|
||||
_CODE_SNAKE_RE = re.compile(r"\b[a-z_]+_[a-z_]+\b")
|
||||
_CODE_SYMBOLS_RE = re.compile(r"[.\[\](){}]|->|::")
|
||||
_CODE_KEYWORDS_RE = re.compile(r"\b(import|def|class|return|from|async|await|lambda|yield)\b")
|
||||
_QUESTION_WORDS_RE = re.compile(r"\b(how|what|why|when|where|which|who|does|do|is|are|can|should)\b", re.IGNORECASE)
|
||||
|
||||
|
||||
class QueryIntent(Enum):
|
||||
CODE_SYMBOL = "code_symbol"
|
||||
NATURAL_LANGUAGE = "natural"
|
||||
MIXED = "mixed"
|
||||
|
||||
|
||||
def detect_query_intent(query: str) -> QueryIntent:
|
||||
"""Detect whether query is a code symbol, natural language, or mixed."""
|
||||
words = query.strip().split()
|
||||
word_count = len(words)
|
||||
|
||||
code_signals = 0
|
||||
natural_signals = 0
|
||||
|
||||
if _CODE_CAMEL_RE.search(query):
|
||||
code_signals += 2
|
||||
if _CODE_SNAKE_RE.search(query):
|
||||
code_signals += 2
|
||||
if _CODE_SYMBOLS_RE.search(query):
|
||||
code_signals += 2
|
||||
if _CODE_KEYWORDS_RE.search(query):
|
||||
code_signals += 2
|
||||
if "`" in query:
|
||||
code_signals += 1
|
||||
if word_count < 4:
|
||||
code_signals += 1
|
||||
|
||||
if _QUESTION_WORDS_RE.search(query):
|
||||
natural_signals += 2
|
||||
if word_count > 5:
|
||||
natural_signals += 2
|
||||
if code_signals == 0 and word_count >= 3:
|
||||
natural_signals += 1
|
||||
|
||||
if code_signals >= 2 and natural_signals == 0:
|
||||
return QueryIntent.CODE_SYMBOL
|
||||
if natural_signals >= 2 and code_signals == 0:
|
||||
return QueryIntent.NATURAL_LANGUAGE
|
||||
if code_signals >= 2 and natural_signals == 0:
|
||||
return QueryIntent.CODE_SYMBOL
|
||||
if natural_signals > code_signals:
|
||||
return QueryIntent.NATURAL_LANGUAGE
|
||||
if code_signals > natural_signals:
|
||||
return QueryIntent.CODE_SYMBOL
|
||||
return QueryIntent.MIXED
|
||||
|
||||
|
||||
def get_adaptive_weights(intent: QueryIntent, base: dict | None = None) -> dict[str, float]:
|
||||
"""Return weights adapted to query intent."""
|
||||
weights = dict(base or DEFAULT_WEIGHTS)
|
||||
if intent == QueryIntent.CODE_SYMBOL:
|
||||
weights["exact"] = 0.45
|
||||
weights["vector"] = 0.35
|
||||
elif intent == QueryIntent.NATURAL_LANGUAGE:
|
||||
weights["vector"] = 0.65
|
||||
weights["exact"] = 0.15
|
||||
# MIXED: use weights as-is
|
||||
return weights
|
||||
|
||||
|
||||
def reciprocal_rank_fusion(
|
||||
results: dict[str, list[tuple[int, float]]],
|
||||
weights: dict[str, float] | None = None,
|
||||
k: int = 60,
|
||||
) -> list[tuple[int, float]]:
|
||||
"""Fuse ranked result lists using Reciprocal Rank Fusion.
|
||||
|
||||
results: {source_name: [(doc_id, score), ...]} each list sorted desc by score.
|
||||
weights: weight per source (defaults to equal weight across all sources).
|
||||
k: RRF constant (default 60).
|
||||
Returns sorted list of (doc_id, fused_score) descending.
|
||||
"""
|
||||
if not results:
|
||||
return []
|
||||
|
||||
sources = list(results.keys())
|
||||
if weights is None:
|
||||
equal_w = 1.0 / len(sources)
|
||||
weights = {s: equal_w for s in sources}
|
||||
|
||||
scores: dict[int, float] = {}
|
||||
for source, ranked_list in results.items():
|
||||
w = weights.get(source, 0.0)
|
||||
for rank, (doc_id, _) in enumerate(ranked_list, start=1):
|
||||
scores[doc_id] = scores.get(doc_id, 0.0) + w * (1.0 / (k + rank))
|
||||
|
||||
return sorted(scores.items(), key=lambda x: x[1], reverse=True)
|
||||
163
codex-lens-v2/src/codexlens_search/search/pipeline.py
Normal file
163
codex-lens-v2/src/codexlens_search/search/pipeline.py
Normal file
@@ -0,0 +1,163 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..config import Config
|
||||
from ..core import ANNIndex, BinaryStore
|
||||
from ..embed import BaseEmbedder
|
||||
from ..rerank import BaseReranker
|
||||
from .fts import FTSEngine
|
||||
from .fusion import (
|
||||
DEFAULT_WEIGHTS,
|
||||
detect_query_intent,
|
||||
get_adaptive_weights,
|
||||
reciprocal_rank_fusion,
|
||||
)
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class SearchResult:
    """One ranked hit returned by SearchPipeline.search()."""

    # Chunk id (rowid in the FTS store).
    id: int
    # Source path recorded for the chunk; "" when no path is known.
    path: str
    # Reranker relevance score (higher is better).
    score: float
    # Leading excerpt of the chunk content (first 200 characters).
    snippet: str = ""
|
||||
|
||||
|
||||
class SearchPipeline:
    """Hybrid search orchestrator.

    Combines binary-coarse + ANN-fine vector retrieval with exact and fuzzy
    full-text retrieval, fuses the ranked lists with weighted RRF, reranks
    the top candidates, and returns the final top_k results.
    """

    def __init__(
        self,
        embedder: BaseEmbedder,
        binary_store: BinaryStore,
        ann_index: ANNIndex,
        reranker: BaseReranker,
        fts: FTSEngine,
        config: Config,
    ) -> None:
        self._embedder = embedder
        self._binary_store = binary_store
        self._ann_index = ann_index
        self._reranker = reranker
        self._fts = fts
        self._config = config

    # -- Helper: vector search (binary coarse + ANN fine) -----------------

    def _vector_search(
        self, query_vec: np.ndarray
    ) -> list[tuple[int, float]]:
        """Run binary coarse search then ANN fine search and intersect."""
        cfg = self._config

        # Binary coarse search -> candidate_ids set
        candidate_ids_list, _ = self._binary_store.coarse_search(
            query_vec, top_k=cfg.binary_top_k
        )
        candidate_ids = set(candidate_ids_list)

        # ANN fine search on full index, then intersect with binary candidates
        ann_ids, ann_scores = self._ann_index.fine_search(
            query_vec, top_k=cfg.ann_top_k
        )
        # Keep only results that appear in binary candidates (2-stage funnel)
        vector_results: list[tuple[int, float]] = [
            (int(doc_id), float(score))
            for doc_id, score in zip(ann_ids, ann_scores)
            if int(doc_id) in candidate_ids
        ]
        # Fall back to full ANN results if intersection is empty
        if not vector_results:
            vector_results = [
                (int(doc_id), float(score))
                for doc_id, score in zip(ann_ids, ann_scores)
            ]
        return vector_results

    # -- Helper: FTS search (exact + fuzzy) ------------------------------

    def _fts_search(
        self, query: str
    ) -> tuple[list[tuple[int, float]], list[tuple[int, float]]]:
        """Run exact and fuzzy full-text search."""
        cfg = self._config
        exact_results = self._fts.exact_search(query, top_k=cfg.fts_top_k)
        fuzzy_results = self._fts.fuzzy_search(query, top_k=cfg.fts_top_k)
        return exact_results, fuzzy_results

    # -- Main search entry point -----------------------------------------

    def search(self, query: str, top_k: int | None = None) -> list[SearchResult]:
        """Run the full hybrid search for *query*.

        Steps: intent detection -> query embedding -> parallel vector/FTS
        retrieval -> weighted RRF fusion -> rerank top 50 -> top_k results.
        A failing retrieval leg degrades to empty results rather than raising.
        """
        cfg = self._config
        final_top_k = top_k if top_k is not None else cfg.reranker_top_k

        # 1. Detect intent -> adaptive weights
        intent = detect_query_intent(query)
        weights = get_adaptive_weights(intent, cfg.fusion_weights)

        # 2. Embed query
        query_vec = self._embedder.embed_single(query)

        # 3. Parallel vector + FTS search
        vector_results: list[tuple[int, float]] = []
        exact_results: list[tuple[int, float]] = []
        fuzzy_results: list[tuple[int, float]] = []

        with ThreadPoolExecutor(max_workers=2) as pool:
            vec_future = pool.submit(self._vector_search, query_vec)
            fts_future = pool.submit(self._fts_search, query)

            # Collect vector results
            try:
                vector_results = vec_future.result()
            except Exception:
                _log.warning("Vector search failed, using empty results", exc_info=True)

            # Collect FTS results
            try:
                exact_results, fuzzy_results = fts_future.result()
            except Exception:
                _log.warning("FTS search failed, using empty results", exc_info=True)

        # 4. RRF fusion over whichever sources produced results
        fusion_input: dict[str, list[tuple[int, float]]] = {}
        if vector_results:
            fusion_input["vector"] = vector_results
        if exact_results:
            fusion_input["exact"] = exact_results
        if fuzzy_results:
            fusion_input["fuzzy"] = fuzzy_results

        if not fusion_input:
            return []

        fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k)

        # 5. Rerank top candidates (cap at 50 to bound reranker cost)
        rerank_ids = [doc_id for doc_id, _ in fused[:50]]
        contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids]
        rerank_scores = self._reranker.score_pairs(query, contents)
        # Reuse the content fetched for reranking when building snippets,
        # instead of issuing a second get_content() query per result.
        content_by_id = dict(zip(rerank_ids, contents))

        # 6. Sort by rerank score, build SearchResult list
        ranked = sorted(
            zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True
        )

        results: list[SearchResult] = []
        for doc_id, score in ranked[:final_top_k]:
            # NOTE(review): reaches into FTSEngine's private _conn for the
            # path lookup; consider adding a public accessor on FTSEngine.
            path = self._fts._conn.execute(
                "SELECT path FROM docs_meta WHERE id = ?", (doc_id,)
            ).fetchone()
            results.append(
                SearchResult(
                    id=doc_id,
                    path=path[0] if path else "",
                    score=float(score),
                    snippet=content_by_id.get(doc_id, "")[:200],
                )
            )
        return results
|
||||
Reference in New Issue
Block a user