mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-21 19:08:17 +08:00
chore: remove codex-lens-v2 from outer git tracking
codex-lens-v2 has its own git repo and publishes to PyPI independently. Remove from outer index and add to .gitignore to avoid tracking conflicts. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -161,3 +161,4 @@ codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model_q
|
|||||||
codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model_uint8.onnx
|
codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model_uint8.onnx
|
||||||
codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model.onnx
|
codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model.onnx
|
||||||
codex-lens/data/registry.db
|
codex-lens/data/registry.db
|
||||||
|
codex-lens-v2/
|
||||||
|
|||||||
33
codex-lens-v2/.gitignore
vendored
33
codex-lens-v2/.gitignore
vendored
@@ -1,33 +0,0 @@
|
|||||||
# Python
|
|
||||||
__pycache__/
|
|
||||||
*.py[cod]
|
|
||||||
*.egg-info/
|
|
||||||
dist/
|
|
||||||
build/
|
|
||||||
*.egg
|
|
||||||
|
|
||||||
# Virtual environments
|
|
||||||
.venv/
|
|
||||||
venv/
|
|
||||||
|
|
||||||
# IDE
|
|
||||||
.idea/
|
|
||||||
.vscode/
|
|
||||||
*.swp
|
|
||||||
|
|
||||||
# Testing
|
|
||||||
.pytest_cache/
|
|
||||||
.coverage
|
|
||||||
htmlcov/
|
|
||||||
|
|
||||||
# Index / cache
|
|
||||||
.codexlens/
|
|
||||||
.index_cache/
|
|
||||||
.ace-tool/
|
|
||||||
|
|
||||||
# Workflow (internal)
|
|
||||||
.workflow/
|
|
||||||
|
|
||||||
# OS
|
|
||||||
.DS_Store
|
|
||||||
Thumbs.db
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
MIT License
|
|
||||||
|
|
||||||
Copyright (c) 2026 codexlens-search contributors
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
|
||||||
copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
@@ -1,226 +0,0 @@
|
|||||||
# codexlens-search
|
|
||||||
|
|
||||||
Semantic code search engine with MCP server for Claude Code.
|
|
||||||
|
|
||||||
2-stage vector search + FTS + RRF fusion + reranking — install once, configure API keys, ready to use.
|
|
||||||
|
|
||||||
## Quick Start (Claude Code MCP)
|
|
||||||
|
|
||||||
Add to your project `.mcp.json`:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "uvx",
|
|
||||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
|
||||||
"env": {
|
|
||||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
|
||||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
|
||||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
|
||||||
"CODEXLENS_EMBED_DIM": "1536"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
That's it. Claude Code will auto-discover the tools: `index_project` → `search_code`.
|
|
||||||
|
|
||||||
## Install
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Standard install (includes vector search + API clients)
|
|
||||||
uv pip install codexlens-search
|
|
||||||
|
|
||||||
# With MCP server for Claude Code
|
|
||||||
uv pip install codexlens-search[mcp]
|
|
||||||
```
|
|
||||||
|
|
||||||
Optional extras:
|
|
||||||
|
|
||||||
| Extra | Description |
|
|
||||||
|-------|-------------|
|
|
||||||
| `mcp` | MCP server (`codexlens-mcp` command) |
|
|
||||||
| `gpu` | GPU-accelerated embedding (onnxruntime-gpu) |
|
|
||||||
| `faiss-cpu` | FAISS ANN backend |
|
|
||||||
| `watcher` | File watcher for auto-indexing |
|
|
||||||
|
|
||||||
## MCP Tools
|
|
||||||
|
|
||||||
| Tool | Description |
|
|
||||||
|------|-------------|
|
|
||||||
| `search_code` | Semantic search with hybrid fusion + reranking |
|
|
||||||
| `index_project` | Build or rebuild the search index |
|
|
||||||
| `index_status` | Show index statistics |
|
|
||||||
| `index_update` | Incremental sync (only changed files) |
|
|
||||||
| `find_files` | Glob file discovery |
|
|
||||||
| `list_models` | List models with cache status |
|
|
||||||
| `download_models` | Download local fastembed models |
|
|
||||||
|
|
||||||
## MCP Configuration Examples
|
|
||||||
|
|
||||||
### API Embedding Only (simplest)
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "uvx",
|
|
||||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
|
||||||
"env": {
|
|
||||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
|
||||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
|
||||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
|
||||||
"CODEXLENS_EMBED_DIM": "1536"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### API Embedding + API Reranker (best quality)
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "uvx",
|
|
||||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
|
||||||
"env": {
|
|
||||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
|
||||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
|
||||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
|
||||||
"CODEXLENS_EMBED_DIM": "1536",
|
|
||||||
"CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1",
|
|
||||||
"CODEXLENS_RERANKER_API_KEY": "${JINA_API_KEY}",
|
|
||||||
"CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Multi-Endpoint Load Balancing
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "uvx",
|
|
||||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
|
||||||
"env": {
|
|
||||||
"CODEXLENS_EMBED_API_ENDPOINTS": "https://api1.example.com/v1|sk-key1|model,https://api2.example.com/v1|sk-key2|model",
|
|
||||||
"CODEXLENS_EMBED_DIM": "1536"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Format: `url|key|model,url|key|model,...`
|
|
||||||
|
|
||||||
### Local Models (Offline, No API)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
uv pip install codexlens-search[mcp]
|
|
||||||
codexlens-search download-models
|
|
||||||
```
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "codexlens-mcp",
|
|
||||||
"env": {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Pre-installed (no uvx)
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "codexlens-mcp",
|
|
||||||
"env": {
|
|
||||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
|
||||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
|
||||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
|
||||||
"CODEXLENS_EMBED_DIM": "1536"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## CLI
|
|
||||||
|
|
||||||
```bash
|
|
||||||
codexlens-search --db-path .codexlens sync --root ./src
|
|
||||||
codexlens-search --db-path .codexlens search -q "auth handler" -k 10
|
|
||||||
codexlens-search --db-path .codexlens status
|
|
||||||
codexlens-search list-models
|
|
||||||
codexlens-search download-models
|
|
||||||
```
|
|
||||||
|
|
||||||
## Environment Variables
|
|
||||||
|
|
||||||
### Embedding
|
|
||||||
|
|
||||||
| Variable | Description | Example |
|
|
||||||
|----------|-------------|---------|
|
|
||||||
| `CODEXLENS_EMBED_API_URL` | Embedding API base URL | `https://api.openai.com/v1` |
|
|
||||||
| `CODEXLENS_EMBED_API_KEY` | API key | `sk-xxx` |
|
|
||||||
| `CODEXLENS_EMBED_API_MODEL` | Model name | `text-embedding-3-small` |
|
|
||||||
| `CODEXLENS_EMBED_API_ENDPOINTS` | Multi-endpoint: `url\|key\|model,...` | See above |
|
|
||||||
| `CODEXLENS_EMBED_DIM` | Vector dimension | `1536` |
|
|
||||||
|
|
||||||
### Reranker
|
|
||||||
|
|
||||||
| Variable | Description | Example |
|
|
||||||
|----------|-------------|---------|
|
|
||||||
| `CODEXLENS_RERANKER_API_URL` | Reranker API base URL | `https://api.jina.ai/v1` |
|
|
||||||
| `CODEXLENS_RERANKER_API_KEY` | API key | `jina-xxx` |
|
|
||||||
| `CODEXLENS_RERANKER_API_MODEL` | Model name | `jina-reranker-v2-base-multilingual` |
|
|
||||||
|
|
||||||
### Tuning
|
|
||||||
|
|
||||||
| Variable | Default | Description |
|
|
||||||
|----------|---------|-------------|
|
|
||||||
| `CODEXLENS_BINARY_TOP_K` | `200` | Binary coarse search candidates |
|
|
||||||
| `CODEXLENS_ANN_TOP_K` | `50` | ANN fine search candidates |
|
|
||||||
| `CODEXLENS_FTS_TOP_K` | `50` | FTS results per method |
|
|
||||||
| `CODEXLENS_FUSION_K` | `60` | RRF fusion k parameter |
|
|
||||||
| `CODEXLENS_RERANKER_TOP_K` | `20` | Results to rerank |
|
|
||||||
| `CODEXLENS_EMBED_BATCH_SIZE` | `32` | Max texts per API batch (auto-splits on 413) |
|
|
||||||
| `CODEXLENS_EMBED_MAX_TOKENS` | `8192` | Max tokens per text (truncate if exceeded, 0=no limit) |
|
|
||||||
| `CODEXLENS_INDEX_WORKERS` | `2` | Parallel indexing workers |
|
|
||||||
| `CODEXLENS_MAX_FILE_SIZE` | `1000000` | Max file size in bytes |
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
```
|
|
||||||
Query → [Embedder] → query vector
|
|
||||||
├→ [BinaryStore] → candidates (Hamming)
|
|
||||||
│ └→ [ANNIndex] → ranked IDs (cosine)
|
|
||||||
├→ [FTS exact] → exact matches
|
|
||||||
└→ [FTS fuzzy] → fuzzy matches
|
|
||||||
└→ [RRF Fusion] → merged ranking
|
|
||||||
└→ [Reranker] → final top-k
|
|
||||||
```
|
|
||||||
|
|
||||||
## Development
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git clone https://github.com/catlog22/codexlens-search.git
|
|
||||||
cd codexlens-search
|
|
||||||
uv pip install -e ".[dev]"
|
|
||||||
pytest
|
|
||||||
```
|
|
||||||
|
|
||||||
## License
|
|
||||||
|
|
||||||
MIT
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
import sys
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Ensure the local src directory takes precedence over any installed codexlens_search package
|
|
||||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src"))
|
|
||||||
@@ -1,63 +0,0 @@
|
|||||||
[build-system]
|
|
||||||
requires = ["hatchling"]
|
|
||||||
build-backend = "hatchling.build"
|
|
||||||
|
|
||||||
[project]
|
|
||||||
name = "codexlens-search"
|
|
||||||
version = "0.4.1"
|
|
||||||
description = "Lightweight semantic code search engine — 2-stage vector + FTS + RRF fusion + MCP server"
|
|
||||||
requires-python = ">=3.10"
|
|
||||||
dependencies = [
|
|
||||||
"hnswlib>=0.8.0",
|
|
||||||
"numpy>=1.26",
|
|
||||||
"fastembed>=0.4.0,<2.0",
|
|
||||||
"httpx>=0.25",
|
|
||||||
]
|
|
||||||
license = {text = "MIT"}
|
|
||||||
readme = "README.md"
|
|
||||||
authors = [
|
|
||||||
{name = "codexlens-search contributors"},
|
|
||||||
]
|
|
||||||
classifiers = [
|
|
||||||
"Programming Language :: Python :: 3",
|
|
||||||
"Programming Language :: Python :: 3.10",
|
|
||||||
"Programming Language :: Python :: 3.11",
|
|
||||||
"Programming Language :: Python :: 3.12",
|
|
||||||
"Programming Language :: Python :: 3.13",
|
|
||||||
"License :: OSI Approved :: MIT License",
|
|
||||||
"Topic :: Software Development :: Libraries",
|
|
||||||
"Topic :: Text Processing :: Indexing",
|
|
||||||
"Operating System :: OS Independent",
|
|
||||||
]
|
|
||||||
|
|
||||||
[project.urls]
|
|
||||||
Homepage = "https://github.com/catlog22/codexlens-search"
|
|
||||||
Repository = "https://github.com/catlog22/codexlens-search"
|
|
||||||
|
|
||||||
[project.optional-dependencies]
|
|
||||||
mcp = [
|
|
||||||
"mcp[cli]>=1.0.0",
|
|
||||||
]
|
|
||||||
gpu = [
|
|
||||||
"onnxruntime-gpu>=1.16",
|
|
||||||
]
|
|
||||||
faiss-cpu = [
|
|
||||||
"faiss-cpu>=1.7.4",
|
|
||||||
]
|
|
||||||
faiss-gpu = [
|
|
||||||
"faiss-gpu>=1.7.4",
|
|
||||||
]
|
|
||||||
watcher = [
|
|
||||||
"watchdog>=3.0",
|
|
||||||
]
|
|
||||||
dev = [
|
|
||||||
"pytest>=7.0",
|
|
||||||
"pytest-cov",
|
|
||||||
]
|
|
||||||
|
|
||||||
[project.scripts]
|
|
||||||
codexlens-search = "codexlens_search.bridge:main"
|
|
||||||
codexlens-mcp = "codexlens_search.mcp_server:main"
|
|
||||||
|
|
||||||
[tool.hatch.build.targets.wheel]
|
|
||||||
packages = ["src/codexlens_search"]
|
|
||||||
@@ -1,128 +0,0 @@
|
|||||||
"""
|
|
||||||
对 D:/Claude_dms3 仓库进行索引并测试搜索。
|
|
||||||
用法: python scripts/index_and_search.py
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# 确保 src 可被导入
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.factory import create_ann_index, create_binary_index
|
|
||||||
from codexlens_search.embed.local import FastEmbedEmbedder
|
|
||||||
from codexlens_search.indexing import IndexingPipeline
|
|
||||||
from codexlens_search.rerank.local import FastEmbedReranker
|
|
||||||
from codexlens_search.search.fts import FTSEngine
|
|
||||||
from codexlens_search.search.pipeline import SearchPipeline
|
|
||||||
|
|
||||||
# ─── 配置 ──────────────────────────────────────────────────────────────────
|
|
||||||
REPO_ROOT = Path("D:/Claude_dms3")
|
|
||||||
INDEX_DIR = Path("D:/Claude_dms3/codex-lens-v2/.index_cache")
|
|
||||||
EXTENSIONS = {".py", ".ts", ".js", ".md"}
|
|
||||||
MAX_FILE_SIZE = 50_000 # bytes
|
|
||||||
MAX_CHUNK_CHARS = 800 # 每个 chunk 的最大字符数
|
|
||||||
CHUNK_OVERLAP = 100
|
|
||||||
|
|
||||||
# ─── 文件收集 ───────────────────────────────────────────────────────────────
|
|
||||||
SKIP_DIRS = {
|
|
||||||
".git", "node_modules", "__pycache__", ".pytest_cache",
|
|
||||||
"dist", "build", ".venv", "venv", ".cache", ".index_cache",
|
|
||||||
"codex-lens-v2", # 不索引自身
|
|
||||||
}
|
|
||||||
|
|
||||||
def collect_files(root: Path) -> list[Path]:
|
|
||||||
files = []
|
|
||||||
for p in root.rglob("*"):
|
|
||||||
if any(part in SKIP_DIRS for part in p.parts):
|
|
||||||
continue
|
|
||||||
if p.is_file() and p.suffix in EXTENSIONS:
|
|
||||||
if p.stat().st_size <= MAX_FILE_SIZE:
|
|
||||||
files.append(p)
|
|
||||||
return files
|
|
||||||
|
|
||||||
# ─── 主流程 ─────────────────────────────────────────────────────────────────
|
|
||||||
def main():
|
|
||||||
INDEX_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# 1. 使用小 profile 加快速度
|
|
||||||
config = Config(
|
|
||||||
embed_model="BAAI/bge-small-en-v1.5",
|
|
||||||
embed_dim=384,
|
|
||||||
embed_batch_size=32,
|
|
||||||
hnsw_ef=100,
|
|
||||||
hnsw_M=16,
|
|
||||||
binary_top_k=100,
|
|
||||||
ann_top_k=30,
|
|
||||||
reranker_top_k=10,
|
|
||||||
)
|
|
||||||
|
|
||||||
print("=== codex-lens-v2 索引测试 ===\n")
|
|
||||||
|
|
||||||
# 2. 收集文件
|
|
||||||
print(f"[1/4] 扫描 {REPO_ROOT} ...")
|
|
||||||
files = collect_files(REPO_ROOT)
|
|
||||||
print(f" 找到 {len(files)} 个文件")
|
|
||||||
|
|
||||||
# 3. 初始化组件
|
|
||||||
print(f"\n[2/4] 加载嵌入模型 (bge-small-en-v1.5, dim=384) ...")
|
|
||||||
embedder = FastEmbedEmbedder(config)
|
|
||||||
binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config)
|
|
||||||
ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config)
|
|
||||||
fts = FTSEngine(":memory:") # 内存 FTS,不持久化
|
|
||||||
|
|
||||||
# 4. 使用 IndexingPipeline 并行索引 (chunk -> embed -> index)
|
|
||||||
print(f"[3/4] 并行索引 {len(files)} 个文件 ...")
|
|
||||||
pipeline = IndexingPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=binary_store,
|
|
||||||
ann_index=ann_index,
|
|
||||||
fts=fts,
|
|
||||||
config=config,
|
|
||||||
)
|
|
||||||
stats = pipeline.index_files(
|
|
||||||
files,
|
|
||||||
root=REPO_ROOT,
|
|
||||||
max_chunk_chars=MAX_CHUNK_CHARS,
|
|
||||||
chunk_overlap=CHUNK_OVERLAP,
|
|
||||||
max_file_size=MAX_FILE_SIZE,
|
|
||||||
)
|
|
||||||
print(f" 索引完成: {stats.files_processed} 文件, {stats.chunks_created} chunks ({stats.duration_seconds:.1f}s)")
|
|
||||||
|
|
||||||
# 5. 搜索测试
|
|
||||||
print(f"\n[4/4] 构建 SearchPipeline ...")
|
|
||||||
reranker = FastEmbedReranker(config)
|
|
||||||
pipeline = SearchPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=binary_store,
|
|
||||||
ann_index=ann_index,
|
|
||||||
reranker=reranker,
|
|
||||||
fts=fts,
|
|
||||||
config=config,
|
|
||||||
)
|
|
||||||
|
|
||||||
queries = [
|
|
||||||
"authentication middleware function",
|
|
||||||
"def embed_single",
|
|
||||||
"RRF fusion weights",
|
|
||||||
"fastembed TextCrossEncoder reranker",
|
|
||||||
"how to search code semantic",
|
|
||||||
]
|
|
||||||
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
for query in queries:
|
|
||||||
t0 = time.time()
|
|
||||||
results = pipeline.search(query, top_k=5)
|
|
||||||
elapsed = time.time() - t0
|
|
||||||
print(f"\nQuery: {query!r} ({elapsed*1000:.0f}ms)")
|
|
||||||
if results:
|
|
||||||
for r in results:
|
|
||||||
print(f" [{r.score:.3f}] {r.path}")
|
|
||||||
else:
|
|
||||||
print(" (无结果)")
|
|
||||||
print("=" * 60)
|
|
||||||
print("\n测试完成 ✓")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,181 +0,0 @@
|
|||||||
"""
|
|
||||||
Small-folder end-to-end test: index tests/ directory (~10 files) and verify
|
|
||||||
indexing pipeline + all search features work correctly.
|
|
||||||
|
|
||||||
Usage: python scripts/test_small_e2e.py
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.factory import create_ann_index, create_binary_index
|
|
||||||
from codexlens_search.embed.local import FastEmbedEmbedder
|
|
||||||
from codexlens_search.indexing import IndexingPipeline
|
|
||||||
from codexlens_search.rerank.local import FastEmbedReranker
|
|
||||||
from codexlens_search.search.fts import FTSEngine
|
|
||||||
from codexlens_search.search.pipeline import SearchPipeline
|
|
||||||
|
|
||||||
PROJECT = Path(__file__).parent.parent
|
|
||||||
TARGET_DIR = PROJECT / "src" / "codexlens_search" # ~21 .py files, small
|
|
||||||
INDEX_DIR = PROJECT / ".test_index_cache"
|
|
||||||
EXTENSIONS = {".py"}
|
|
||||||
|
|
||||||
passed = 0
|
|
||||||
failed = 0
|
|
||||||
|
|
||||||
|
|
||||||
def check(name: str, condition: bool, detail: str = ""):
|
|
||||||
global passed, failed
|
|
||||||
if condition:
|
|
||||||
passed += 1
|
|
||||||
print(f" [PASS] {name}")
|
|
||||||
else:
|
|
||||||
failed += 1
|
|
||||||
print(f" [FAIL] {name} — {detail}")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
global passed, failed
|
|
||||||
INDEX_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
config = Config(
|
|
||||||
embed_model="BAAI/bge-small-en-v1.5",
|
|
||||||
embed_dim=384,
|
|
||||||
embed_batch_size=32,
|
|
||||||
hnsw_ef=100,
|
|
||||||
hnsw_M=16,
|
|
||||||
binary_top_k=100,
|
|
||||||
ann_top_k=30,
|
|
||||||
reranker_model="Xenova/ms-marco-MiniLM-L-6-v2",
|
|
||||||
reranker_top_k=10,
|
|
||||||
)
|
|
||||||
|
|
||||||
files = [p for p in TARGET_DIR.rglob("*.py") if p.is_file()]
|
|
||||||
print(f"Target: {TARGET_DIR} ({len(files)} .py files)\n")
|
|
||||||
|
|
||||||
# ── 1. Test IndexingPipeline ──────────────────────────────
|
|
||||||
print("=== 1. IndexingPipeline (parallel) ===")
|
|
||||||
embedder = FastEmbedEmbedder(config)
|
|
||||||
binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config)
|
|
||||||
ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config)
|
|
||||||
fts = FTSEngine(":memory:")
|
|
||||||
|
|
||||||
t0 = time.time()
|
|
||||||
stats = IndexingPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=binary_store,
|
|
||||||
ann_index=ann_index,
|
|
||||||
fts=fts,
|
|
||||||
config=config,
|
|
||||||
).index_files(files, root=TARGET_DIR, max_chunk_chars=800, chunk_overlap=100)
|
|
||||||
elapsed = time.time() - t0
|
|
||||||
|
|
||||||
check("files_processed > 0", stats.files_processed > 0, f"got {stats.files_processed}")
|
|
||||||
check("chunks_created > 0", stats.chunks_created > 0, f"got {stats.chunks_created}")
|
|
||||||
check("indexing completed", elapsed < 120, f"took {elapsed:.1f}s")
|
|
||||||
print(f" Stats: {stats.files_processed} files, {stats.chunks_created} chunks, {elapsed:.1f}s\n")
|
|
||||||
|
|
||||||
# ── 2. Test BinaryStore (pre-allocated, coarse search) ────
|
|
||||||
print("=== 2. BinaryStore coarse search ===")
|
|
||||||
q_vec = embedder.embed_single("def search")
|
|
||||||
b_ids, b_dists = binary_store.coarse_search(q_vec, top_k=10)
|
|
||||||
check("binary returns results", len(b_ids) > 0, f"got {len(b_ids)}")
|
|
||||||
check("binary ids are ints", all(isinstance(int(i), int) for i in b_ids))
|
|
||||||
print(f" Top 5 binary IDs: {b_ids[:5]}\n")
|
|
||||||
|
|
||||||
# ── 3. Test ANNIndex (fine search) ────────────────────────
|
|
||||||
print("=== 3. ANNIndex fine search ===")
|
|
||||||
a_ids, a_dists = ann_index.fine_search(q_vec, top_k=10)
|
|
||||||
check("ann returns results", len(a_ids) > 0, f"got {len(a_ids)}")
|
|
||||||
check("ann scores are floats", all(isinstance(float(d), float) for d in a_dists))
|
|
||||||
print(f" Top 5 ANN IDs: {a_ids[:5]}\n")
|
|
||||||
|
|
||||||
# ── 4. Test FTSEngine (exact + fuzzy) ─────────────────────
|
|
||||||
print("=== 4. FTSEngine search ===")
|
|
||||||
exact = fts.exact_search("def search", top_k=5)
|
|
||||||
fuzzy = fts.fuzzy_search("embedd", top_k=5)
|
|
||||||
check("exact search returns results", len(exact) > 0, f"got {len(exact)}")
|
|
||||||
check("fuzzy search returns results", len(fuzzy) > 0, f"got {len(fuzzy)}")
|
|
||||||
print(f" Exact hits: {len(exact)}, Fuzzy hits: {len(fuzzy)}\n")
|
|
||||||
|
|
||||||
# ── 5. Test SearchPipeline (parallel FTS||vector + fusion + rerank) ──
|
|
||||||
print("=== 5. SearchPipeline (full pipeline) ===")
|
|
||||||
reranker = FastEmbedReranker(config)
|
|
||||||
search = SearchPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=binary_store,
|
|
||||||
ann_index=ann_index,
|
|
||||||
reranker=reranker,
|
|
||||||
fts=fts,
|
|
||||||
config=config,
|
|
||||||
)
|
|
||||||
|
|
||||||
queries = [
|
|
||||||
("def embed_single", "code symbol search"),
|
|
||||||
("search pipeline fusion", "natural language search"),
|
|
||||||
("Config dataclass", "exact match search"),
|
|
||||||
("binary store hamming", "domain-specific search"),
|
|
||||||
("", "empty query handling"),
|
|
||||||
]
|
|
||||||
|
|
||||||
for query, desc in queries:
|
|
||||||
t0 = time.time()
|
|
||||||
results = search.search(query, top_k=5)
|
|
||||||
ms = (time.time() - t0) * 1000
|
|
||||||
|
|
||||||
if query == "":
|
|
||||||
check(f"{desc}: no crash", isinstance(results, list))
|
|
||||||
else:
|
|
||||||
check(f"{desc}: returns results", len(results) > 0, f"'{query}' got 0 results")
|
|
||||||
if results:
|
|
||||||
check(f"{desc}: has scores", all(isinstance(r.score, (int, float)) for r in results))
|
|
||||||
check(f"{desc}: has paths", all(r.path for r in results))
|
|
||||||
check(f"{desc}: respects top_k", len(results) <= 5)
|
|
||||||
print(f" Top result: [{results[0].score:.3f}] {results[0].path}")
|
|
||||||
print(f" Latency: {ms:.0f}ms")
|
|
||||||
|
|
||||||
# ── 6. Test result quality (sanity) ───────────────────────
|
|
||||||
print("\n=== 6. Result quality sanity checks ===")
|
|
||||||
r1 = search.search("BinaryStore add coarse_search", top_k=5)
|
|
||||||
if r1:
|
|
||||||
paths = [r.path for r in r1]
|
|
||||||
check("BinaryStore query -> binary/core in results",
|
|
||||||
any("binary" in p or "core" in p for p in paths),
|
|
||||||
f"got paths: {paths}")
|
|
||||||
|
|
||||||
r2 = search.search("FTSEngine exact_search fuzzy_search", top_k=5)
|
|
||||||
if r2:
|
|
||||||
paths = [r.path for r in r2]
|
|
||||||
check("FTSEngine query -> fts/search in results",
|
|
||||||
any("fts" in p or "search" in p for p in paths),
|
|
||||||
f"got paths: {paths}")
|
|
||||||
|
|
||||||
r3 = search.search("IndexingPipeline parallel queue", top_k=3)
|
|
||||||
if r3:
|
|
||||||
paths = [r.path for r in r3]
|
|
||||||
check("Pipeline query -> pipeline in results",
|
|
||||||
any("pipeline" in p or "indexing" in p for p in paths),
|
|
||||||
f"got paths: {paths}")
|
|
||||||
|
|
||||||
# ── Summary ───────────────────────────────────────────────
|
|
||||||
print(f"\n{'=' * 50}")
|
|
||||||
print(f"Results: {passed} passed, {failed} failed, {passed + failed} total")
|
|
||||||
if failed == 0:
|
|
||||||
print("ALL TESTS PASSED")
|
|
||||||
else:
|
|
||||||
print(f"WARNING: {failed} test(s) failed")
|
|
||||||
print(f"{'=' * 50}")
|
|
||||||
|
|
||||||
# Cleanup
|
|
||||||
import shutil
|
|
||||||
shutil.rmtree(INDEX_DIR, ignore_errors=True)
|
|
||||||
|
|
||||||
return 0 if failed == 0 else 1
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
"""codexlens-search: Lightweight semantic code search engine.
|
|
||||||
|
|
||||||
Public API for consumers (e.g. codex-lens):
|
|
||||||
|
|
||||||
from codexlens_search import SearchPipeline, IndexingPipeline, Config
|
|
||||||
from codexlens_search.core import create_ann_index, create_binary_index
|
|
||||||
from codexlens_search.embed.local import FastEmbedEmbedder
|
|
||||||
from codexlens_search.rerank.api import APIReranker
|
|
||||||
"""
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.indexing import IndexingPipeline, IndexStats
|
|
||||||
from codexlens_search.search.pipeline import SearchPipeline, SearchResult
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"Config",
|
|
||||||
"IndexingPipeline",
|
|
||||||
"IndexStats",
|
|
||||||
"SearchPipeline",
|
|
||||||
"SearchResult",
|
|
||||||
]
|
|
||||||
@@ -1,676 +0,0 @@
|
|||||||
"""CLI bridge for ccw integration.
|
|
||||||
|
|
||||||
Argparse-based CLI with JSON output protocol.
|
|
||||||
Each subcommand outputs a single JSON object to stdout.
|
|
||||||
Watch command outputs JSONL (one JSON per line).
|
|
||||||
All errors are JSON {"error": string} to stdout with non-zero exit code.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import glob
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
log = logging.getLogger("codexlens_search.bridge")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _ensure_utf8_stdio() -> None:
|
|
||||||
"""Force UTF-8 encoding on stdout/stderr (Windows defaults to GBK/cp936)."""
|
|
||||||
if sys.platform == "win32":
|
|
||||||
for stream_name in ("stdout", "stderr"):
|
|
||||||
stream = getattr(sys, stream_name)
|
|
||||||
if hasattr(stream, "reconfigure"):
|
|
||||||
stream.reconfigure(encoding="utf-8", errors="replace")
|
|
||||||
|
|
||||||
|
|
||||||
def _json_output(data: dict | list) -> None:
|
|
||||||
"""Print JSON to stdout with flush."""
|
|
||||||
print(json.dumps(data, ensure_ascii=True), flush=True)
|
|
||||||
|
|
||||||
|
|
||||||
def _error_exit(message: str, code: int = 1) -> None:
|
|
||||||
"""Print JSON error to stdout and exit."""
|
|
||||||
_json_output({"error": message})
|
|
||||||
sys.exit(code)
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_db_path(args: argparse.Namespace) -> Path:
|
|
||||||
"""Return the --db-path as a resolved Path, creating parent dirs."""
|
|
||||||
db_path = Path(args.db_path).resolve()
|
|
||||||
db_path.mkdir(parents=True, exist_ok=True)
|
|
||||||
return db_path
|
|
||||||
|
|
||||||
|
|
||||||
def create_config_from_env(db_path: str | Path, **overrides: object) -> "Config":
|
|
||||||
"""Build Config from environment variables and optional overrides.
|
|
||||||
|
|
||||||
Used by both CLI bridge and MCP server.
|
|
||||||
"""
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
|
|
||||||
kwargs: dict = {}
|
|
||||||
# Apply explicit overrides first
|
|
||||||
for key in ("embed_model", "embed_api_url", "embed_api_key", "embed_api_model"):
|
|
||||||
if overrides.get(key):
|
|
||||||
kwargs[key] = overrides[key]
|
|
||||||
# Env vars as fallback
|
|
||||||
if "embed_api_url" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_URL"):
|
|
||||||
kwargs["embed_api_url"] = os.environ["CODEXLENS_EMBED_API_URL"]
|
|
||||||
if "embed_api_key" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_KEY"):
|
|
||||||
kwargs["embed_api_key"] = os.environ["CODEXLENS_EMBED_API_KEY"]
|
|
||||||
if "embed_api_model" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_MODEL"):
|
|
||||||
kwargs["embed_api_model"] = os.environ["CODEXLENS_EMBED_API_MODEL"]
|
|
||||||
# Multi-endpoint: CODEXLENS_EMBED_API_ENDPOINTS=url1|key1|model1,url2|key2|model2
|
|
||||||
endpoints_env = os.environ.get("CODEXLENS_EMBED_API_ENDPOINTS", "")
|
|
||||||
if endpoints_env:
|
|
||||||
endpoints = []
|
|
||||||
for entry in endpoints_env.split(","):
|
|
||||||
parts = entry.strip().split("|")
|
|
||||||
if len(parts) >= 2:
|
|
||||||
ep = {"url": parts[0], "key": parts[1]}
|
|
||||||
if len(parts) >= 3:
|
|
||||||
ep["model"] = parts[2]
|
|
||||||
endpoints.append(ep)
|
|
||||||
if endpoints:
|
|
||||||
kwargs["embed_api_endpoints"] = endpoints
|
|
||||||
# Embed dimension and concurrency from env
|
|
||||||
if os.environ.get("CODEXLENS_EMBED_DIM"):
|
|
||||||
kwargs["embed_dim"] = int(os.environ["CODEXLENS_EMBED_DIM"])
|
|
||||||
if os.environ.get("CODEXLENS_EMBED_BATCH_SIZE"):
|
|
||||||
kwargs["embed_batch_size"] = int(os.environ["CODEXLENS_EMBED_BATCH_SIZE"])
|
|
||||||
if os.environ.get("CODEXLENS_EMBED_API_CONCURRENCY"):
|
|
||||||
kwargs["embed_api_concurrency"] = int(os.environ["CODEXLENS_EMBED_API_CONCURRENCY"])
|
|
||||||
if os.environ.get("CODEXLENS_EMBED_API_MAX_TOKENS"):
|
|
||||||
kwargs["embed_api_max_tokens_per_batch"] = int(os.environ["CODEXLENS_EMBED_API_MAX_TOKENS"])
|
|
||||||
if os.environ.get("CODEXLENS_EMBED_MAX_TOKENS"):
|
|
||||||
kwargs["embed_max_tokens"] = int(os.environ["CODEXLENS_EMBED_MAX_TOKENS"])
|
|
||||||
# Reranker API env vars
|
|
||||||
if os.environ.get("CODEXLENS_RERANKER_API_URL"):
|
|
||||||
kwargs["reranker_api_url"] = os.environ["CODEXLENS_RERANKER_API_URL"]
|
|
||||||
if os.environ.get("CODEXLENS_RERANKER_API_KEY"):
|
|
||||||
kwargs["reranker_api_key"] = os.environ["CODEXLENS_RERANKER_API_KEY"]
|
|
||||||
if os.environ.get("CODEXLENS_RERANKER_API_MODEL"):
|
|
||||||
kwargs["reranker_api_model"] = os.environ["CODEXLENS_RERANKER_API_MODEL"]
|
|
||||||
# Search pipeline params from env
|
|
||||||
if os.environ.get("CODEXLENS_RERANKER_TOP_K"):
|
|
||||||
kwargs["reranker_top_k"] = int(os.environ["CODEXLENS_RERANKER_TOP_K"])
|
|
||||||
if os.environ.get("CODEXLENS_RERANKER_BATCH_SIZE"):
|
|
||||||
kwargs["reranker_batch_size"] = int(os.environ["CODEXLENS_RERANKER_BATCH_SIZE"])
|
|
||||||
if os.environ.get("CODEXLENS_BINARY_TOP_K"):
|
|
||||||
kwargs["binary_top_k"] = int(os.environ["CODEXLENS_BINARY_TOP_K"])
|
|
||||||
if os.environ.get("CODEXLENS_ANN_TOP_K"):
|
|
||||||
kwargs["ann_top_k"] = int(os.environ["CODEXLENS_ANN_TOP_K"])
|
|
||||||
if os.environ.get("CODEXLENS_FTS_TOP_K"):
|
|
||||||
kwargs["fts_top_k"] = int(os.environ["CODEXLENS_FTS_TOP_K"])
|
|
||||||
if os.environ.get("CODEXLENS_FUSION_K"):
|
|
||||||
kwargs["fusion_k"] = int(os.environ["CODEXLENS_FUSION_K"])
|
|
||||||
# Indexing params from env
|
|
||||||
if os.environ.get("CODEXLENS_CODE_AWARE_CHUNKING"):
|
|
||||||
kwargs["code_aware_chunking"] = os.environ["CODEXLENS_CODE_AWARE_CHUNKING"].lower() == "true"
|
|
||||||
if os.environ.get("CODEXLENS_INDEX_WORKERS"):
|
|
||||||
kwargs["index_workers"] = int(os.environ["CODEXLENS_INDEX_WORKERS"])
|
|
||||||
if os.environ.get("CODEXLENS_MAX_FILE_SIZE"):
|
|
||||||
kwargs["max_file_size_bytes"] = int(os.environ["CODEXLENS_MAX_FILE_SIZE"])
|
|
||||||
if os.environ.get("CODEXLENS_HNSW_EF"):
|
|
||||||
kwargs["hnsw_ef"] = int(os.environ["CODEXLENS_HNSW_EF"])
|
|
||||||
if os.environ.get("CODEXLENS_HNSW_M"):
|
|
||||||
kwargs["hnsw_M"] = int(os.environ["CODEXLENS_HNSW_M"])
|
|
||||||
# Tier config from env
|
|
||||||
if os.environ.get("CODEXLENS_TIER_HOT_HOURS"):
|
|
||||||
kwargs["tier_hot_hours"] = int(os.environ["CODEXLENS_TIER_HOT_HOURS"])
|
|
||||||
if os.environ.get("CODEXLENS_TIER_COLD_HOURS"):
|
|
||||||
kwargs["tier_cold_hours"] = int(os.environ["CODEXLENS_TIER_COLD_HOURS"])
|
|
||||||
# Search quality tier from env
|
|
||||||
if os.environ.get("CODEXLENS_SEARCH_QUALITY"):
|
|
||||||
kwargs["default_search_quality"] = os.environ["CODEXLENS_SEARCH_QUALITY"]
|
|
||||||
# Shard config from env
|
|
||||||
if os.environ.get("CODEXLENS_NUM_SHARDS"):
|
|
||||||
kwargs["num_shards"] = int(os.environ["CODEXLENS_NUM_SHARDS"])
|
|
||||||
if os.environ.get("CODEXLENS_MAX_LOADED_SHARDS"):
|
|
||||||
kwargs["max_loaded_shards"] = int(os.environ["CODEXLENS_MAX_LOADED_SHARDS"])
|
|
||||||
resolved = Path(db_path).resolve()
|
|
||||||
kwargs["metadata_db_path"] = str(resolved / "metadata.db")
|
|
||||||
return Config(**kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def _create_config(args: argparse.Namespace) -> "Config":
    """Build Config from CLI args (delegates to create_config_from_env)."""
    cli_override_keys = ("embed_model", "embed_api_url", "embed_api_key", "embed_api_model")
    # Only truthy CLI values become overrides, mirroring the original
    # hasattr-and-truthy guard per attribute.
    overrides: dict = {
        key: getattr(args, key)
        for key in cli_override_keys
        if getattr(args, key, None)
    }
    return create_config_from_env(args.db_path, **overrides)
|
|
||||||
|
|
||||||
|
|
||||||
def _create_embedder(config: "Config"):
    """Create embedder based on config, auto-detecting embed_dim from API."""
    if not config.embed_api_url:
        # No remote endpoint configured: use the local fastembed backend.
        from codexlens_search.embed.local import FastEmbedEmbedder
        return FastEmbedEmbedder(config)

    from codexlens_search.embed.api import APIEmbedder

    embedder = APIEmbedder(config)
    log.info("Using API embedder: %s", config.embed_api_url)

    # Probe the API for its true dimensionality only while embed_dim still
    # carries the library default (384); a user-specified dim is respected.
    if config.embed_dim == 384:
        detected_dim = embedder.embed_single("dimension probe").shape[0]
        if detected_dim != config.embed_dim:
            log.info("Auto-detected embed_dim=%d from API (was %d)", detected_dim, config.embed_dim)
            config.embed_dim = detected_dim

    return embedder
|
|
||||||
|
|
||||||
|
|
||||||
def _create_reranker(config: "Config"):
    """Create reranker based on config."""
    if not config.reranker_api_url:
        # Default: local fastembed cross-encoder reranker.
        from codexlens_search.rerank.local import FastEmbedReranker
        return FastEmbedReranker(config)

    from codexlens_search.rerank.api import APIReranker

    reranker = APIReranker(config)
    log.info("Using API reranker: %s", config.reranker_api_url)
    return reranker
|
|
||||||
|
|
||||||
|
|
||||||
def create_pipeline(
    db_path: str | Path,
    config: "Config | None" = None,
) -> tuple:
    """Construct pipeline components from db_path and config.

    Args:
        db_path: Directory that holds (or will hold) all index artifacts.
            It is created (including parents) if missing.
        config: Pre-built Config; when None it is derived from
            CODEXLENS_* environment variables via create_config_from_env().

    Returns (indexing_pipeline, search_pipeline, config).
    Used by both CLI bridge and MCP server.

    When config.num_shards > 1, returns a ShardManager-backed pipeline
    where indexing and search are delegated to the ShardManager.
    The returned tuple is (shard_manager, shard_manager, config) so that
    callers can use shard_manager.sync() and shard_manager.search().
    """
    # NOTE: the previous version imported codexlens_search.config.Config here
    # without ever using it (annotations are strings); the import is removed.
    if config is None:
        config = create_config_from_env(db_path)
    resolved = Path(db_path).resolve()
    resolved.mkdir(parents=True, exist_ok=True)

    embedder = _create_embedder(config)
    reranker = _create_reranker(config)

    # Sharded mode: delegate to ShardManager
    if config.num_shards > 1:
        from codexlens_search.core.shard_manager import ShardManager
        manager = ShardManager(
            num_shards=config.num_shards,
            db_path=resolved,
            config=config,
            embedder=embedder,
            reranker=reranker,
        )
        log.info(
            "Using ShardManager with %d shards (max_loaded=%d)",
            config.num_shards, config.max_loaded_shards,
        )
        # Same object serves as both indexing and search pipeline.
        return manager, manager, config

    # Single-shard mode: original behavior, no ShardManager overhead
    from codexlens_search.core.factory import create_ann_index, create_binary_index
    from codexlens_search.indexing.metadata import MetadataStore
    from codexlens_search.indexing.pipeline import IndexingPipeline
    from codexlens_search.search.fts import FTSEngine
    from codexlens_search.search.pipeline import SearchPipeline

    binary_store = create_binary_index(resolved, config.embed_dim, config)
    ann_index = create_ann_index(resolved, config.embed_dim, config)
    fts = FTSEngine(resolved / "fts.db")
    metadata = MetadataStore(resolved / "metadata.db")

    indexing = IndexingPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        fts=fts,
        config=config,
        metadata=metadata,
    )

    search = SearchPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        reranker=reranker,
        fts=fts,
        config=config,
        metadata_store=metadata,
    )

    return indexing, search, config
|
|
||||||
|
|
||||||
|
|
||||||
def _create_pipeline(
    args: argparse.Namespace,
) -> tuple:
    """CLI wrapper: construct pipeline from argparse args."""
    config = _create_config(args)
    db_dir = _resolve_db_path(args)
    return create_pipeline(db_dir, config)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Subcommand handlers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def cmd_init(args: argparse.Namespace) -> None:
    """Initialize an empty index at --db-path."""
    from codexlens_search.indexing.metadata import MetadataStore
    from codexlens_search.search.fts import FTSEngine

    db_path = _resolve_db_path(args)

    # Constructing the stores is sufficient: each creates its backing
    # database file on first instantiation.
    MetadataStore(db_path / "metadata.db")
    FTSEngine(db_path / "fts.db")

    _json_output({"status": "initialized", "db_path": str(db_path)})
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_search(args: argparse.Namespace) -> None:
    """Run search query, output JSON array of results."""
    _, search, _ = _create_pipeline(args)

    hits = search.search(args.query, top_k=args.top_k)

    payload = []
    for hit in hits:
        payload.append({
            "path": hit.path,
            "score": hit.score,
            "line": hit.line,
            "end_line": hit.end_line,
            "snippet": hit.snippet,
            "content": hit.content,
        })
    _json_output(payload)
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_index_file(args: argparse.Namespace) -> None:
    """Index a single file."""
    indexing, _, _ = _create_pipeline(args)

    target = Path(args.file).resolve()
    if not target.is_file():
        _error_exit(f"File not found: {target}")

    base = Path(args.root).resolve() if args.root else None

    stats = indexing.index_file(target, root=base)
    _json_output({
        "status": "indexed",
        "file": str(target),
        "files_processed": stats.files_processed,
        "chunks_created": stats.chunks_created,
        "duration_seconds": stats.duration_seconds,
    })
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_remove_file(args: argparse.Namespace) -> None:
    """Remove a file from the index."""
    indexing, _, _ = _create_pipeline(args)
    indexing.remove_file(args.file)
    _json_output({"status": "removed", "file": args.file})
|
|
||||||
|
|
||||||
|
|
||||||
# Directory names skipped by `sync` when the user supplies no --exclude
# overrides. One entry ("*.egg-info") is a glob pattern rather than a name.
DEFAULT_EXCLUDES = frozenset((
    "node_modules", ".git", "__pycache__", "dist", "build",
    ".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache",
    ".next", ".nuxt", "coverage", ".eggs", "*.egg-info", ".codexlens",
))
|
|
||||||
|
|
||||||
|
|
||||||
def should_exclude(path: Path, exclude_dirs: frozenset[str]) -> bool:
    """Return True if any component of *path* matches an exclude entry.

    Entries are compared exactly, except entries containing glob
    metacharacters (*, ?, [), which are matched with fnmatch. This makes
    patterns such as "*.egg-info" in DEFAULT_EXCLUDES actually effective —
    under the previous plain membership test they could never match a real
    directory name.

    Args:
        path: Path (typically relative to the sync root) to check.
        exclude_dirs: Exact names and/or glob patterns to exclude.
    """
    from fnmatch import fnmatch

    glob_patterns = [entry for entry in exclude_dirs
                     if any(ch in entry for ch in "*?[")]
    for part in path.parts:
        if part in exclude_dirs:
            return True
        if any(fnmatch(part, pattern) for pattern in glob_patterns):
            return True
    return False
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_sync(args: argparse.Namespace) -> None:
    """Sync index with files under --root matching --glob pattern."""
    indexing, _, _ = _create_pipeline(args)

    root = Path(args.root).resolve()
    if not root.is_dir():
        _error_exit(f"Root directory not found: {root}")

    excludes = frozenset(args.exclude) if args.exclude else DEFAULT_EXCLUDES
    pattern = args.glob or "**/*"

    # Collect regular files, filtering out excluded directory components
    # (exclusion is tested against the path relative to root).
    file_paths = []
    for candidate in root.glob(pattern):
        if not candidate.is_file():
            continue
        if should_exclude(candidate.relative_to(root), excludes):
            continue
        file_paths.append(candidate)

    log.debug("Sync: %d files after exclusion (root=%s, pattern=%s)", len(file_paths), root, pattern)

    stats = indexing.sync(file_paths, root=root)
    _json_output({
        "status": "synced",
        "root": str(root),
        "files_processed": stats.files_processed,
        "chunks_created": stats.chunks_created,
        "duration_seconds": stats.duration_seconds,
    })
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_watch(args: argparse.Namespace) -> None:
    """Watch --root for changes, output JSONL events.

    Emits one JSON object per filesystem event. Requires the optional
    ``watchdog`` dependency; exits with an error message when it is missing.
    """
    root = Path(args.root).resolve()
    if not root.is_dir():
        _error_exit(f"Root directory not found: {root}")

    debounce_ms = args.debounce_ms

    # watchdog is an optional dependency; fail with an actionable message.
    try:
        from watchdog.observers import Observer
        from watchdog.events import FileSystemEventHandler, FileSystemEvent
    except ImportError:
        _error_exit(
            "watchdog is required for watch mode. "
            "Install with: pip install watchdog"
        )

    class _JsonEventHandler(FileSystemEventHandler):
        """Emit JSONL for file events."""

        def _emit(self, event_type: str, path: str) -> None:
            # One JSON object per event so consumers can stream line-by-line.
            _json_output({
                "event": event_type,
                "path": path,
                "timestamp": time.time(),
            })

        def on_created(self, event: FileSystemEvent) -> None:
            # Directory events are suppressed; only files are reported.
            if not event.is_directory:
                self._emit("created", event.src_path)

        def on_modified(self, event: FileSystemEvent) -> None:
            if not event.is_directory:
                self._emit("modified", event.src_path)

        def on_deleted(self, event: FileSystemEvent) -> None:
            if not event.is_directory:
                self._emit("deleted", event.src_path)

        def on_moved(self, event: FileSystemEvent) -> None:
            if not event.is_directory:
                # For moves, report the destination path.
                self._emit("moved", event.dest_path)

    observer = Observer()
    observer.schedule(_JsonEventHandler(), str(root), recursive=True)
    observer.start()

    # Announce readiness so callers know the watcher is live.
    _json_output({
        "status": "watching",
        "root": str(root),
        "debounce_ms": debounce_ms,
    })

    try:
        # NOTE(review): debounce_ms is only used as the idle-sleep interval
        # here; events are not actually coalesced/debounced — confirm intent.
        while True:
            time.sleep(debounce_ms / 1000.0)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_download_models(args: argparse.Namespace) -> None:
    """Download embed + reranker models."""
    from codexlens_search import model_manager

    config = _create_config(args)

    # Fetch both models the pipeline depends on: embedding, then reranking.
    for name in (config.embed_model, config.reranker_model):
        model_manager.ensure_model(name, config)

    _json_output({
        "status": "downloaded",
        "embed_model": config.embed_model,
        "reranker_model": config.reranker_model,
    })
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_list_models(args: argparse.Namespace) -> None:
    """List known embed/reranker models with cache status."""
    from codexlens_search import model_manager

    config = _create_config(args)
    _json_output(model_manager.list_known_models(config))
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_download_model(args: argparse.Namespace) -> None:
    """Download a single model by name."""
    from codexlens_search import model_manager

    config = _create_config(args)
    name = args.model_name

    model_manager.ensure_model(name, config)

    # NOTE(review): verification reaches into model_manager's private helpers;
    # a public "is_cached" API would be cleaner.
    is_cached = model_manager._model_is_cached(
        name, model_manager._resolve_cache_dir(config)
    )
    _json_output({
        "status": "downloaded" if is_cached else "failed",
        "model": name,
    })
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_delete_model(args: argparse.Namespace) -> None:
    """Delete a model from cache."""
    from codexlens_search import model_manager

    config = _create_config(args)
    name = args.model_name

    removed = model_manager.delete_model(name, config)
    _json_output({
        "status": "deleted" if removed else "not_found",
        "model": name,
    })
|
|
||||||
|
|
||||||
|
|
||||||
def cmd_status(args: argparse.Namespace) -> None:
    """Report index statistics."""
    from codexlens_search.indexing.metadata import MetadataStore

    db_path = _resolve_db_path(args)
    meta_path = db_path / "metadata.db"

    if not meta_path.exists():
        _json_output({"status": "not_initialized", "db_path": str(db_path)})
        return

    store = MetadataStore(meta_path)
    tracked_files = store.get_all_files()
    tombstoned_ids = store.get_deleted_ids()
    highest_chunk = store.max_chunk_id()

    _json_output({
        "status": "ok",
        "db_path": str(db_path),
        "files_tracked": len(tracked_files),
        "max_chunk_id": highest_chunk,
        # Presumably chunk ids start at 0, so max+1 approximates the total;
        # a negative max means the index holds no chunks.
        "total_chunks_approx": highest_chunk + 1 if highest_chunk >= 0 else 0,
        "deleted_chunks": len(tombstoned_ids),
    })
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# CLI parser
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: global options plus one subparser per command."""
    parser = argparse.ArgumentParser(
        prog="codexlens-search",
        description="Lightweight semantic code search - CLI bridge",
    )
    # Global options apply to every subcommand.
    parser.add_argument(
        "--db-path",
        default=os.environ.get("CODEXLENS_DB_PATH", ".codexlens"),
        help="Path to index database directory (default: .codexlens or $CODEXLENS_DB_PATH)",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable debug logging to stderr",
    )

    # API embedding overrides (also read from CODEXLENS_EMBED_API_* env vars)
    parser.add_argument(
        "--embed-api-url",
        default="",
        help="Remote embedding API URL (OpenAI-compatible, e.g. https://api.openai.com/v1)",
    )
    parser.add_argument(
        "--embed-api-key",
        default="",
        help="API key for remote embedding",
    )
    parser.add_argument(
        "--embed-api-model",
        default="",
        help="Model name for remote embedding (e.g. text-embedding-3-small)",
    )

    # `dest="command"` makes the chosen subcommand name available to main().
    sub = parser.add_subparsers(dest="command")

    # init
    sub.add_parser("init", help="Initialize empty index")

    # search
    p_search = sub.add_parser("search", help="Search the index")
    p_search.add_argument("--query", "-q", required=True, help="Search query")
    p_search.add_argument("--top-k", "-k", type=int, default=10, help="Number of results")

    # index-file
    p_index = sub.add_parser("index-file", help="Index a single file")
    p_index.add_argument("--file", "-f", required=True, help="File path to index")
    p_index.add_argument("--root", "-r", help="Root directory for relative paths")

    # remove-file
    p_remove = sub.add_parser("remove-file", help="Remove a file from index")
    p_remove.add_argument("--file", "-f", required=True, help="Relative file path to remove")

    # sync
    p_sync = sub.add_parser("sync", help="Sync index with directory")
    p_sync.add_argument("--root", "-r", required=True, help="Root directory to sync")
    p_sync.add_argument("--glob", "-g", default="**/*", help="Glob pattern (default: **/*)")
    p_sync.add_argument(
        "--exclude", "-e", action="append", default=None,
        help="Directory names to exclude (repeatable). "
        "Defaults: node_modules, .git, __pycache__, dist, build, .venv, venv, .tox, .mypy_cache",
    )

    # watch
    p_watch = sub.add_parser("watch", help="Watch directory for changes (JSONL output)")
    p_watch.add_argument("--root", "-r", required=True, help="Root directory to watch")
    p_watch.add_argument("--debounce-ms", type=int, default=500, help="Debounce interval in ms")

    # download-models
    p_dl = sub.add_parser("download-models", help="Download embed + reranker models")
    p_dl.add_argument("--embed-model", help="Override embed model name")

    # list-models
    sub.add_parser("list-models", help="List known models with cache status")

    # download-model (single model by name)
    p_dl_single = sub.add_parser("download-model", help="Download a single model by name")
    p_dl_single.add_argument("model_name", help="HuggingFace model name (e.g. BAAI/bge-small-en-v1.5)")

    # delete-model
    p_del = sub.add_parser("delete-model", help="Delete a model from cache")
    p_del.add_argument("model_name", help="HuggingFace model name to delete")

    # status
    sub.add_parser("status", help="Report index statistics")

    return parser
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """CLI entry point."""
    _ensure_utf8_stdio()
    parser = _build_parser()
    args = parser.parse_args()

    # Configure logging: debug with logger names when --verbose, terse otherwise.
    verbose = args.verbose
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format="%(levelname)s %(name)s: %(message)s" if verbose else "%(levelname)s: %(message)s",
        stream=sys.stderr,
    )

    if not args.command:
        parser.print_help(sys.stderr)
        sys.exit(1)

    dispatch = {
        "init": cmd_init,
        "search": cmd_search,
        "index-file": cmd_index_file,
        "remove-file": cmd_remove_file,
        "sync": cmd_sync,
        "watch": cmd_watch,
        "download-models": cmd_download_models,
        "list-models": cmd_list_models,
        "download-model": cmd_download_model,
        "delete-model": cmd_delete_model,
        "status": cmd_status,
    }

    handler = dispatch.get(args.command)
    if handler is None:
        _error_exit(f"Unknown command: {args.command}")

    try:
        handler(args)
    except KeyboardInterrupt:
        sys.exit(130)  # conventional exit status for SIGINT
    except SystemExit:
        raise
    except Exception as exc:
        # Full traceback only at debug level; users get a terse error.
        log.debug("Command failed", exc_info=True)
        _error_exit(str(exc))
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: delegate to main() when executed directly.
if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,165 +0,0 @@
|
|||||||
from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import ClassVar
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Config:
    """Central configuration for the indexing and search pipelines.

    Every value has a default; CLI flags and CODEXLENS_* environment
    variables override them at construction time.
    """

    # Embedding
    embed_model: str = "BAAI/bge-small-en-v1.5"
    embed_dim: int = 384
    embed_batch_size: int = 32

    # API embedding (optional — overrides local fastembed when set)
    embed_api_url: str = ""  # e.g. "https://api.openai.com/v1"
    embed_api_key: str = ""
    embed_api_model: str = ""  # e.g. "text-embedding-3-small"
    # Multi-endpoint: list of {"url": "...", "key": "...", "model": "..."} dicts.
    # default_factory replaces the previous `= None  # type: ignore` hack;
    # __post_init__ still tolerates an explicit None for backward compatibility.
    embed_api_endpoints: list[dict[str, str]] = field(default_factory=list)
    embed_api_concurrency: int = 4
    embed_api_max_tokens_per_batch: int = 32768
    embed_max_tokens: int = 8192  # max tokens per single text (0 = no limit)

    # Model download / cache
    model_cache_dir: str = ""  # empty = fastembed default cache
    hf_mirror: str = ""  # HuggingFace mirror URL, e.g. "https://hf-mirror.com"

    # GPU / execution providers
    device: str = "auto"  # 'auto', 'cuda', 'cpu'
    embed_providers: list[str] | None = None  # explicit ONNX providers override

    # File filtering
    max_file_size_bytes: int = 1_000_000  # 1MB
    # None means "use the class default set"; resolved in __post_init__.
    exclude_extensions: frozenset[str] | None = None
    binary_detect_sample_bytes: int = 2048
    binary_null_threshold: float = 0.10  # >10% null bytes = binary
    generated_code_markers: tuple[str, ...] = ("@generated", "DO NOT EDIT", "auto-generated", "AUTO GENERATED")

    # Code-aware chunking
    code_aware_chunking: bool = True
    code_extensions: frozenset[str] = frozenset({
        ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".java", ".cpp", ".c",
        ".h", ".hpp", ".cs", ".rs", ".rb", ".php", ".scala", ".kt", ".swift",
        ".lua", ".sh", ".bash", ".zsh", ".ps1", ".vue", ".svelte",
    })

    # Backend selection: 'auto', 'faiss', 'hnswlib'
    ann_backend: str = "auto"
    binary_backend: str = "faiss"

    # Indexing pipeline
    index_workers: int = 2  # number of parallel indexing workers

    # HNSW index (ANNIndex)
    hnsw_ef: int = 150
    hnsw_M: int = 32
    hnsw_ef_construction: int = 200

    # Binary coarse search (BinaryStore)
    binary_top_k: int = 200

    # ANN fine search
    ann_top_k: int = 50

    # Reranker
    reranker_model: str = "Xenova/ms-marco-MiniLM-L-6-v2"
    reranker_top_k: int = 20
    reranker_batch_size: int = 32

    # API reranker (optional)
    reranker_api_url: str = ""
    reranker_api_key: str = ""
    reranker_api_model: str = ""
    reranker_api_max_tokens_per_batch: int = 2048

    # Metadata store
    metadata_db_path: str = ""  # empty = no metadata tracking

    # Data tiering (hot/warm/cold)
    tier_hot_hours: int = 24  # files accessed within this window are 'hot'
    tier_cold_hours: int = 168  # files not accessed for this long are 'cold'

    # Search quality tier: 'fast', 'balanced', 'thorough', 'auto'
    default_search_quality: str = "auto"

    # Shard partitioning
    num_shards: int = 1  # 1 = single partition (no sharding), >1 = hash-based sharding
    max_loaded_shards: int = 4  # LRU limit for loaded shards in ShardManager

    # FTS
    fts_top_k: int = 50

    # Fusion
    fusion_k: int = 60  # RRF k parameter
    fusion_weights: dict = field(default_factory=lambda: {
        "exact": 0.25,
        "fuzzy": 0.10,
        "vector": 0.50,
        "graph": 0.15,
    })

    # ClassVar so the dataclass machinery does NOT treat this constant as an
    # init field (previously it leaked into __init__/__eq__/__repr__).
    _DEFAULT_EXCLUDE_EXTENSIONS: ClassVar[frozenset[str]] = frozenset({
        # binaries / images
        ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".bmp", ".svg",
        ".zip", ".gz", ".tar", ".rar", ".7z", ".bz2",
        ".bin", ".exe", ".dll", ".so", ".dylib", ".a", ".o", ".obj",
        ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
        # build / generated
        ".min.js", ".min.css", ".map", ".lock",
        ".pyc", ".pyo", ".class", ".wasm",
        # data
        ".sqlite", ".db", ".npy", ".npz", ".pkl", ".pickle",
        ".parquet", ".arrow", ".feather",
        # media
        ".mp3", ".mp4", ".wav", ".avi", ".mov", ".flv",
        ".ttf", ".otf", ".woff", ".woff2", ".eot",
    })

    def __post_init__(self) -> None:
        """Resolve None sentinels to their real defaults.

        Plain attribute assignment suffices here — the dataclass is not
        frozen, so the previous object.__setattr__ calls were unnecessary.
        """
        if self.exclude_extensions is None:
            self.exclude_extensions = self._DEFAULT_EXCLUDE_EXTENSIONS
        if self.embed_api_endpoints is None:
            self.embed_api_endpoints = []

    def resolve_embed_providers(self) -> list[str]:
        """Return ONNX execution providers based on device config.

        Priority: explicit embed_providers > device setting > auto-detect.
        """
        if self.embed_providers is not None:
            return list(self.embed_providers)

        if self.device == "cuda":
            return ["CUDAExecutionProvider", "CPUExecutionProvider"]

        if self.device == "cpu":
            return ["CPUExecutionProvider"]

        # auto-detect: prefer the GPU when onnxruntime reports CUDA support
        try:
            import onnxruntime
            available = onnxruntime.get_available_providers()
            if "CUDAExecutionProvider" in available:
                log.info("CUDA detected via onnxruntime, using GPU for embedding")
                return ["CUDAExecutionProvider", "CPUExecutionProvider"]
        except ImportError:
            pass

        return ["CPUExecutionProvider"]

    @classmethod
    def defaults(cls) -> "Config":
        """Return a Config with all default values."""
        return cls()

    @classmethod
    def small(cls) -> "Config":
        """Smaller config for testing or small corpora."""
        return cls(
            hnsw_ef=50,
            hnsw_M=16,
            binary_top_k=50,
            ann_top_k=20,
            reranker_top_k=10,
        )
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
# Public surface of the core package: re-export the index base classes,
# the concrete implementations, and the backend-selecting factories.
from .base import BaseANNIndex, BaseBinaryIndex
from .binary import BinaryStore
from .factory import create_ann_index, create_binary_index
from .index import ANNIndex

# Names exported via `from codexlens_search.core import *`.
__all__ = [
    "BaseANNIndex",
    "BaseBinaryIndex",
    "ANNIndex",
    "BinaryStore",
    "create_ann_index",
    "create_binary_index",
]
|
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
class BaseANNIndex(ABC):
    """Abstract base class for approximate nearest neighbor indexes.

    Concrete backends implement ID-mapped add/search over float32 vectors
    plus disk persistence.
    """

    @abstractmethod
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors with corresponding IDs.

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """

    @abstractmethod
    def fine_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search for nearest neighbors.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; None presumably selects a backend
                default — confirm against implementations.

        Returns:
            (ids, distances) as numpy arrays
        """

    @abstractmethod
    def save(self) -> None:
        """Persist index to disk."""

    @abstractmethod
    def load(self) -> None:
        """Load index from disk."""

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of indexed items."""
|
|
||||||
|
|
||||||
|
|
||||||
class BaseBinaryIndex(ABC):
    """Abstract base class for binary vector indexes (Hamming distance).

    Implementations accept float32 vectors, quantize them to bits
    internally, and answer coarse Hamming-distance searches.
    """

    @abstractmethod
    def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
        """Add float32 vectors (will be binary-quantized internally).

        Args:
            ids: shape (N,) int64
            vectors: shape (N, dim) float32
        """

    @abstractmethod
    def coarse_search(
        self, query_vec: np.ndarray, top_k: int | None = None
    ) -> tuple[np.ndarray, np.ndarray]:
        """Search by Hamming distance.

        Args:
            query_vec: float32 vector of shape (dim,)
            top_k: number of results; None presumably selects a backend
                default — confirm against implementations.

        Returns:
            (ids, distances) sorted ascending by distance
        """

    @abstractmethod
    def save(self) -> None:
        """Persist store to disk."""

    @abstractmethod
    def load(self) -> None:
        """Load store from disk."""

    @abstractmethod
    def __len__(self) -> int:
        """Return the number of stored items."""
|
|
||||||
@@ -1,180 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import math
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.base import BaseBinaryIndex
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class BinaryStore(BaseBinaryIndex):
|
|
||||||
"""Persistent binary vector store using numpy memmap.
|
|
||||||
|
|
||||||
.. deprecated::
|
|
||||||
Prefer ``FAISSBinaryIndex`` for binary coarse search. This class is
|
|
||||||
retained as a numpy-only fallback for environments where FAISS is not
|
|
||||||
available. New code should use ``create_binary_index()`` from
|
|
||||||
``codexlens_search.core.factory`` which selects the best backend
|
|
||||||
automatically.
|
|
||||||
|
|
||||||
Stores binary-quantized float32 vectors as packed uint8 arrays on disk.
|
|
||||||
Supports fast coarse search via XOR + popcount Hamming distance.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, path: str | Path, dim: int, config: Config) -> None:
|
|
||||||
self._dir = Path(path)
|
|
||||||
self._dim = dim
|
|
||||||
self._config = config
|
|
||||||
self._packed_bytes = math.ceil(dim / 8)
|
|
||||||
|
|
||||||
self._bin_path = self._dir / "binary_store.bin"
|
|
||||||
self._ids_path = self._dir / "binary_store_ids.npy"
|
|
||||||
|
|
||||||
self._matrix: np.ndarray | None = None # shape (N, packed_bytes), uint8
|
|
||||||
self._ids: np.ndarray | None = None # shape (N,), int64
|
|
||||||
self._count: int = 0
|
|
||||||
|
|
||||||
if self._bin_path.exists() and self._ids_path.exists():
|
|
||||||
self.load()
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Internal helpers
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _quantize(self, vectors: np.ndarray) -> np.ndarray:
|
|
||||||
"""Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes)."""
|
|
||||||
binary = (vectors > 0).astype(np.uint8)
|
|
||||||
packed = np.packbits(binary, axis=1)
|
|
||||||
return packed
|
|
||||||
|
|
||||||
def _quantize_single(self, vec: np.ndarray) -> np.ndarray:
|
|
||||||
"""Convert a single float32 vector (dim,) to packed uint8 (packed_bytes,)."""
|
|
||||||
binary = (vec > 0).astype(np.uint8)
|
|
||||||
return np.packbits(binary)
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Public API
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _ensure_capacity(self, needed: int) -> None:
|
|
||||||
"""Grow pre-allocated matrix/ids arrays to fit *needed* total items."""
|
|
||||||
if self._matrix is not None and self._matrix.shape[0] >= needed:
|
|
||||||
return
|
|
||||||
|
|
||||||
new_cap = max(1024, needed)
|
|
||||||
# Double until large enough
|
|
||||||
if self._matrix is not None:
|
|
||||||
cur_cap = self._matrix.shape[0]
|
|
||||||
new_cap = max(cur_cap, 1024)
|
|
||||||
while new_cap < needed:
|
|
||||||
new_cap *= 2
|
|
||||||
|
|
||||||
new_matrix = np.zeros((new_cap, self._packed_bytes), dtype=np.uint8)
|
|
||||||
new_ids = np.zeros(new_cap, dtype=np.int64)
|
|
||||||
|
|
||||||
if self._matrix is not None and self._count > 0:
|
|
||||||
new_matrix[: self._count] = self._matrix[: self._count]
|
|
||||||
new_ids[: self._count] = self._ids[: self._count]
|
|
||||||
|
|
||||||
self._matrix = new_matrix
|
|
||||||
self._ids = new_ids
|
|
||||||
|
|
||||||
def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
|
|
||||||
"""Add float32 vectors and their ids.
|
|
||||||
|
|
||||||
Does NOT call save() internally -- callers must call save()
|
|
||||||
explicitly after batch indexing.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ids: shape (N,) int64
|
|
||||||
vectors: shape (N, dim) float32
|
|
||||||
"""
|
|
||||||
if len(ids) == 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
packed = self._quantize(vectors) # (N, packed_bytes)
|
|
||||||
n = len(ids)
|
|
||||||
|
|
||||||
self._ensure_capacity(self._count + n)
|
|
||||||
self._matrix[self._count : self._count + n] = packed
|
|
||||||
self._ids[self._count : self._count + n] = ids.astype(np.int64)
|
|
||||||
self._count += n
|
|
||||||
|
|
||||||
def coarse_search(
|
|
||||||
self, query_vec: np.ndarray, top_k: int | None = None
|
|
||||||
) -> tuple[np.ndarray, np.ndarray]:
|
|
||||||
"""Search by Hamming distance.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query_vec: float32 vector of shape (dim,)
|
|
||||||
top_k: number of results; defaults to config.binary_top_k
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
(ids, distances) sorted ascending by Hamming distance
|
|
||||||
"""
|
|
||||||
if self._matrix is None or self._count == 0:
|
|
||||||
return np.array([], dtype=np.int64), np.array([], dtype=np.int32)
|
|
||||||
|
|
||||||
k = top_k if top_k is not None else self._config.binary_top_k
|
|
||||||
k = min(k, self._count)
|
|
||||||
|
|
||||||
query_bin = self._quantize_single(query_vec) # (packed_bytes,)
|
|
||||||
|
|
||||||
# Slice to active region (matrix may be pre-allocated larger)
|
|
||||||
active_matrix = self._matrix[: self._count]
|
|
||||||
active_ids = self._ids[: self._count]
|
|
||||||
|
|
||||||
# XOR then popcount via unpackbits
|
|
||||||
xor = np.bitwise_xor(active_matrix, query_bin[np.newaxis, :]) # (N, packed_bytes)
|
|
||||||
dists = np.unpackbits(xor, axis=1).sum(axis=1).astype(np.int32) # (N,)
|
|
||||||
|
|
||||||
if k >= self._count:
|
|
||||||
order = np.argsort(dists)
|
|
||||||
else:
|
|
||||||
part = np.argpartition(dists, k)[:k]
|
|
||||||
order = part[np.argsort(dists[part])]
|
|
||||||
|
|
||||||
return active_ids[order], dists[order]
|
|
||||||
|
|
||||||
def save(self) -> None:
|
|
||||||
"""Flush binary store to disk."""
|
|
||||||
if self._matrix is None or self._count == 0:
|
|
||||||
return
|
|
||||||
self._dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
# Write only the occupied portion of the pre-allocated matrix
|
|
||||||
active_matrix = self._matrix[: self._count]
|
|
||||||
mm = np.memmap(
|
|
||||||
str(self._bin_path),
|
|
||||||
dtype=np.uint8,
|
|
||||||
mode="w+",
|
|
||||||
shape=active_matrix.shape,
|
|
||||||
)
|
|
||||||
mm[:] = active_matrix
|
|
||||||
mm.flush()
|
|
||||||
del mm
|
|
||||||
np.save(str(self._ids_path), self._ids[: self._count])
|
|
||||||
|
|
||||||
def load(self) -> None:
|
|
||||||
"""Reload binary store from disk."""
|
|
||||||
ids = np.load(str(self._ids_path))
|
|
||||||
n = len(ids)
|
|
||||||
if n == 0:
|
|
||||||
return
|
|
||||||
mm = np.memmap(
|
|
||||||
str(self._bin_path),
|
|
||||||
dtype=np.uint8,
|
|
||||||
mode="r",
|
|
||||||
shape=(n, self._packed_bytes),
|
|
||||||
)
|
|
||||||
self._matrix = np.array(mm) # copy into RAM for mutation support
|
|
||||||
del mm
|
|
||||||
self._ids = ids.astype(np.int64)
|
|
||||||
self._count = n
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
return self._count
|
|
||||||
@@ -1,141 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import warnings
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
try:
|
|
||||||
import faiss as _faiss # noqa: F401
|
|
||||||
_FAISS_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
_FAISS_AVAILABLE = False
|
|
||||||
|
|
||||||
try:
|
|
||||||
import hnswlib as _hnswlib # noqa: F401
|
|
||||||
_HNSWLIB_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
_HNSWLIB_AVAILABLE = False
|
|
||||||
|
|
||||||
|
|
||||||
def _has_faiss_gpu() -> bool:
|
|
||||||
"""Check whether faiss-gpu is available (has GPU resources)."""
|
|
||||||
if not _FAISS_AVAILABLE:
|
|
||||||
return False
|
|
||||||
try:
|
|
||||||
import faiss
|
|
||||||
res = faiss.StandardGpuResources() # noqa: F841
|
|
||||||
return True
|
|
||||||
except (AttributeError, RuntimeError):
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def create_ann_index(path: str | Path, dim: int, config: Config) -> BaseANNIndex:
|
|
||||||
"""Create an ANN index based on config.ann_backend.
|
|
||||||
|
|
||||||
Fallback chain for 'auto': faiss-gpu -> faiss-cpu -> hnswlib.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path: directory for index persistence
|
|
||||||
dim: vector dimensionality
|
|
||||||
config: project configuration
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A BaseANNIndex implementation
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ImportError: if no suitable backend is available
|
|
||||||
"""
|
|
||||||
backend = config.ann_backend
|
|
||||||
|
|
||||||
if backend == "faiss":
|
|
||||||
from codexlens_search.core.faiss_index import FAISSANNIndex
|
|
||||||
return FAISSANNIndex(path, dim, config)
|
|
||||||
|
|
||||||
if backend == "hnswlib":
|
|
||||||
from codexlens_search.core.index import ANNIndex
|
|
||||||
return ANNIndex(path, dim, config)
|
|
||||||
|
|
||||||
# auto: try faiss first, then hnswlib
|
|
||||||
if _FAISS_AVAILABLE:
|
|
||||||
from codexlens_search.core.faiss_index import FAISSANNIndex
|
|
||||||
gpu_tag = " (GPU available)" if _has_faiss_gpu() else " (CPU)"
|
|
||||||
logger.info("Auto-selected FAISS ANN backend%s", gpu_tag)
|
|
||||||
return FAISSANNIndex(path, dim, config)
|
|
||||||
|
|
||||||
if _HNSWLIB_AVAILABLE:
|
|
||||||
from codexlens_search.core.index import ANNIndex
|
|
||||||
logger.info("Auto-selected hnswlib ANN backend")
|
|
||||||
return ANNIndex(path, dim, config)
|
|
||||||
|
|
||||||
raise ImportError(
|
|
||||||
"No ANN backend available. Install faiss-cpu, faiss-gpu, or hnswlib."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def create_binary_index(
|
|
||||||
path: str | Path, dim: int, config: Config
|
|
||||||
) -> BaseBinaryIndex:
|
|
||||||
"""Create a binary index based on config.binary_backend.
|
|
||||||
|
|
||||||
Fallback chain for 'auto': faiss -> numpy BinaryStore.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path: directory for index persistence
|
|
||||||
dim: vector dimensionality
|
|
||||||
config: project configuration
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A BaseBinaryIndex implementation
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ImportError: if no suitable backend is available
|
|
||||||
"""
|
|
||||||
backend = config.binary_backend
|
|
||||||
|
|
||||||
if backend == "faiss":
|
|
||||||
if _FAISS_AVAILABLE:
|
|
||||||
from codexlens_search.core.faiss_index import FAISSBinaryIndex
|
|
||||||
return FAISSBinaryIndex(path, dim, config)
|
|
||||||
# FAISS explicitly requested but not installed: fall back with warning
|
|
||||||
from codexlens_search.core.binary import BinaryStore
|
|
||||||
warnings.warn(
|
|
||||||
"binary_backend='faiss' but FAISS is not installed. "
|
|
||||||
"Falling back to deprecated numpy BinaryStore. "
|
|
||||||
"Install faiss-cpu or faiss-gpu for the recommended binary backend.",
|
|
||||||
DeprecationWarning,
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
logger.warning(
|
|
||||||
"binary_backend='faiss' but FAISS not available, "
|
|
||||||
"falling back to deprecated numpy BinaryStore."
|
|
||||||
)
|
|
||||||
return BinaryStore(path, dim, config)
|
|
||||||
|
|
||||||
if backend == "hnswlib":
|
|
||||||
from codexlens_search.core.binary import BinaryStore
|
|
||||||
return BinaryStore(path, dim, config)
|
|
||||||
|
|
||||||
# auto: try faiss first, then numpy-based BinaryStore (deprecated fallback)
|
|
||||||
if _FAISS_AVAILABLE:
|
|
||||||
from codexlens_search.core.faiss_index import FAISSBinaryIndex
|
|
||||||
logger.info("Auto-selected FAISS binary backend")
|
|
||||||
return FAISSBinaryIndex(path, dim, config)
|
|
||||||
|
|
||||||
# numpy BinaryStore is always available (no extra deps)
|
|
||||||
from codexlens_search.core.binary import BinaryStore
|
|
||||||
warnings.warn(
|
|
||||||
"Falling back to numpy BinaryStore because FAISS is not installed. "
|
|
||||||
"BinaryStore is deprecated; install faiss-cpu or faiss-gpu for better performance.",
|
|
||||||
DeprecationWarning,
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
logger.warning(
|
|
||||||
"FAISS not available, falling back to deprecated numpy BinaryStore. "
|
|
||||||
"Install faiss-cpu or faiss-gpu for the recommended binary backend."
|
|
||||||
)
|
|
||||||
return BinaryStore(path, dim, config)
|
|
||||||
@@ -1,301 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import math
|
|
||||||
import threading
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
try:
|
|
||||||
import faiss
|
|
||||||
_FAISS_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
faiss = None # type: ignore[assignment]
|
|
||||||
_FAISS_AVAILABLE = False
|
|
||||||
|
|
||||||
|
|
||||||
def _try_gpu_index(index: "faiss.Index") -> "faiss.Index":
|
|
||||||
"""Transfer a FAISS index to GPU if faiss-gpu is available.
|
|
||||||
|
|
||||||
Returns the GPU index on success, or the original CPU index on failure.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
res = faiss.StandardGpuResources()
|
|
||||||
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
|
|
||||||
logger.info("FAISS index transferred to GPU 0")
|
|
||||||
return gpu_index
|
|
||||||
except (AttributeError, RuntimeError) as exc:
|
|
||||||
logger.debug("GPU transfer unavailable, staying on CPU: %s", exc)
|
|
||||||
return index
|
|
||||||
|
|
||||||
|
|
||||||
def _to_cpu_for_save(index: "faiss.Index") -> "faiss.Index":
|
|
||||||
"""Convert a GPU index back to CPU for serialization."""
|
|
||||||
try:
|
|
||||||
return faiss.index_gpu_to_cpu(index)
|
|
||||||
except (AttributeError, RuntimeError):
|
|
||||||
return index
|
|
||||||
|
|
||||||
|
|
||||||
class FAISSANNIndex(BaseANNIndex):
|
|
||||||
"""FAISS-based ANN index using IndexHNSWFlat with optional GPU.
|
|
||||||
|
|
||||||
Uses Inner Product space with L2-normalized vectors for cosine similarity.
|
|
||||||
Thread-safe via RLock.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, path: str | Path, dim: int, config: Config) -> None:
|
|
||||||
if not _FAISS_AVAILABLE:
|
|
||||||
raise ImportError(
|
|
||||||
"faiss is required. Install with: pip install faiss-cpu "
|
|
||||||
"or pip install faiss-gpu"
|
|
||||||
)
|
|
||||||
|
|
||||||
self._path = Path(path)
|
|
||||||
self._index_path = self._path / "faiss_ann.index"
|
|
||||||
self._dim = dim
|
|
||||||
self._config = config
|
|
||||||
self._lock = threading.RLock()
|
|
||||||
self._index: faiss.Index | None = None
|
|
||||||
|
|
||||||
def _ensure_loaded(self) -> None:
|
|
||||||
"""Load or initialize the index (caller holds lock)."""
|
|
||||||
if self._index is not None:
|
|
||||||
return
|
|
||||||
self.load()
|
|
||||||
|
|
||||||
def load(self) -> None:
|
|
||||||
"""Load index from disk or initialize a fresh one.
|
|
||||||
|
|
||||||
Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available,
|
|
||||||
falling back to regular read_index() on older faiss versions.
|
|
||||||
"""
|
|
||||||
with self._lock:
|
|
||||||
if self._index_path.exists():
|
|
||||||
try:
|
|
||||||
idx = faiss.read_index(
|
|
||||||
str(self._index_path), faiss.IO_FLAG_MMAP
|
|
||||||
)
|
|
||||||
except (AttributeError, RuntimeError, Exception) as exc:
|
|
||||||
logger.debug(
|
|
||||||
"MMAP load failed, falling back to regular read: %s",
|
|
||||||
exc,
|
|
||||||
)
|
|
||||||
idx = faiss.read_index(str(self._index_path))
|
|
||||||
logger.debug(
|
|
||||||
"Loaded FAISS ANN index from %s (%d items)",
|
|
||||||
self._index_path, idx.ntotal,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# HNSW with flat storage, M=32 by default
|
|
||||||
m = self._config.hnsw_M
|
|
||||||
idx = faiss.IndexHNSWFlat(self._dim, m, faiss.METRIC_INNER_PRODUCT)
|
|
||||||
idx.hnsw.efConstruction = self._config.hnsw_ef_construction
|
|
||||||
idx.hnsw.efSearch = self._config.hnsw_ef
|
|
||||||
logger.debug(
|
|
||||||
"Initialized fresh FAISS HNSW index (dim=%d, M=%d)",
|
|
||||||
self._dim, m,
|
|
||||||
)
|
|
||||||
self._index = _try_gpu_index(idx)
|
|
||||||
|
|
||||||
def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
|
|
||||||
"""Add L2-normalized float32 vectors.
|
|
||||||
|
|
||||||
Vectors are normalized before insertion so that Inner Product
|
|
||||||
distance equals cosine similarity.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ids: shape (N,) int64 -- currently unused by FAISS flat index
|
|
||||||
but kept for API compatibility. FAISS uses sequential IDs.
|
|
||||||
vectors: shape (N, dim) float32
|
|
||||||
"""
|
|
||||||
if len(ids) == 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
vecs = np.ascontiguousarray(vectors, dtype=np.float32)
|
|
||||||
# Normalize for cosine similarity via Inner Product
|
|
||||||
faiss.normalize_L2(vecs)
|
|
||||||
|
|
||||||
with self._lock:
|
|
||||||
self._ensure_loaded()
|
|
||||||
self._index.add(vecs)
|
|
||||||
|
|
||||||
def fine_search(
|
|
||||||
self, query_vec: np.ndarray, top_k: int | None = None
|
|
||||||
) -> tuple[np.ndarray, np.ndarray]:
|
|
||||||
"""Search for nearest neighbors.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query_vec: float32 vector of shape (dim,)
|
|
||||||
top_k: number of results; defaults to config.ann_top_k
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
(ids, distances) as numpy arrays. For IP space, higher = more
|
|
||||||
similar, but distances are returned as-is for consumer handling.
|
|
||||||
"""
|
|
||||||
k = top_k if top_k is not None else self._config.ann_top_k
|
|
||||||
|
|
||||||
with self._lock:
|
|
||||||
self._ensure_loaded()
|
|
||||||
|
|
||||||
count = self._index.ntotal
|
|
||||||
if count == 0:
|
|
||||||
return np.array([], dtype=np.int64), np.array([], dtype=np.float32)
|
|
||||||
|
|
||||||
k = min(k, count)
|
|
||||||
# Set efSearch for HNSW accuracy
|
|
||||||
try:
|
|
||||||
self._index.hnsw.efSearch = max(self._config.hnsw_ef, k)
|
|
||||||
except AttributeError:
|
|
||||||
pass # GPU index may not expose hnsw attribute directly
|
|
||||||
|
|
||||||
q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
|
|
||||||
faiss.normalize_L2(q)
|
|
||||||
distances, labels = self._index.search(q, k)
|
|
||||||
return labels[0].astype(np.int64), distances[0].astype(np.float32)
|
|
||||||
|
|
||||||
def save(self) -> None:
|
|
||||||
"""Save index to disk."""
|
|
||||||
with self._lock:
|
|
||||||
if self._index is None:
|
|
||||||
return
|
|
||||||
self._path.mkdir(parents=True, exist_ok=True)
|
|
||||||
cpu_index = _to_cpu_for_save(self._index)
|
|
||||||
faiss.write_index(cpu_index, str(self._index_path))
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
with self._lock:
|
|
||||||
if self._index is None:
|
|
||||||
return 0
|
|
||||||
return self._index.ntotal
|
|
||||||
|
|
||||||
|
|
||||||
class FAISSBinaryIndex(BaseBinaryIndex):
|
|
||||||
"""FAISS-based binary index using IndexBinaryFlat for Hamming distance.
|
|
||||||
|
|
||||||
Vectors are binary-quantized (sign bit) before insertion.
|
|
||||||
Thread-safe via RLock.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, path: str | Path, dim: int, config: Config) -> None:
|
|
||||||
if not _FAISS_AVAILABLE:
|
|
||||||
raise ImportError(
|
|
||||||
"faiss is required. Install with: pip install faiss-cpu "
|
|
||||||
"or pip install faiss-gpu"
|
|
||||||
)
|
|
||||||
|
|
||||||
self._path = Path(path)
|
|
||||||
self._index_path = self._path / "faiss_binary.index"
|
|
||||||
self._dim = dim
|
|
||||||
self._config = config
|
|
||||||
self._packed_bytes = math.ceil(dim / 8)
|
|
||||||
self._lock = threading.RLock()
|
|
||||||
self._index: faiss.IndexBinary | None = None
|
|
||||||
|
|
||||||
def _ensure_loaded(self) -> None:
|
|
||||||
if self._index is not None:
|
|
||||||
return
|
|
||||||
self.load()
|
|
||||||
|
|
||||||
def _quantize(self, vectors: np.ndarray) -> np.ndarray:
|
|
||||||
"""Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes)."""
|
|
||||||
binary = (vectors > 0).astype(np.uint8)
|
|
||||||
return np.packbits(binary, axis=1)
|
|
||||||
|
|
||||||
def _quantize_single(self, vec: np.ndarray) -> np.ndarray:
|
|
||||||
"""Convert a single float32 vector (dim,) to packed uint8 (1, packed_bytes)."""
|
|
||||||
binary = (vec > 0).astype(np.uint8)
|
|
||||||
return np.packbits(binary).reshape(1, -1)
|
|
||||||
|
|
||||||
def load(self) -> None:
|
|
||||||
"""Load binary index from disk or initialize a fresh one.
|
|
||||||
|
|
||||||
Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available,
|
|
||||||
falling back to regular read_index_binary() on older faiss versions.
|
|
||||||
"""
|
|
||||||
with self._lock:
|
|
||||||
if self._index_path.exists():
|
|
||||||
try:
|
|
||||||
idx = faiss.read_index_binary(
|
|
||||||
str(self._index_path), faiss.IO_FLAG_MMAP
|
|
||||||
)
|
|
||||||
except (AttributeError, RuntimeError, Exception) as exc:
|
|
||||||
logger.debug(
|
|
||||||
"MMAP load failed, falling back to regular read: %s",
|
|
||||||
exc,
|
|
||||||
)
|
|
||||||
idx = faiss.read_index_binary(str(self._index_path))
|
|
||||||
logger.debug(
|
|
||||||
"Loaded FAISS binary index from %s (%d items)",
|
|
||||||
self._index_path, idx.ntotal,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# IndexBinaryFlat takes dimension in bits
|
|
||||||
idx = faiss.IndexBinaryFlat(self._dim)
|
|
||||||
logger.debug(
|
|
||||||
"Initialized fresh FAISS binary index (dim_bits=%d)", self._dim,
|
|
||||||
)
|
|
||||||
self._index = idx
|
|
||||||
|
|
||||||
def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
|
|
||||||
"""Add float32 vectors (binary-quantized internally).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ids: shape (N,) int64 -- kept for API compatibility
|
|
||||||
vectors: shape (N, dim) float32
|
|
||||||
"""
|
|
||||||
if len(ids) == 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
packed = self._quantize(vectors)
|
|
||||||
packed = np.ascontiguousarray(packed, dtype=np.uint8)
|
|
||||||
|
|
||||||
with self._lock:
|
|
||||||
self._ensure_loaded()
|
|
||||||
self._index.add(packed)
|
|
||||||
|
|
||||||
def coarse_search(
|
|
||||||
self, query_vec: np.ndarray, top_k: int | None = None
|
|
||||||
) -> tuple[np.ndarray, np.ndarray]:
|
|
||||||
"""Search by Hamming distance.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query_vec: float32 vector of shape (dim,)
|
|
||||||
top_k: number of results; defaults to config.binary_top_k
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
(ids, distances) sorted ascending by Hamming distance
|
|
||||||
"""
|
|
||||||
with self._lock:
|
|
||||||
self._ensure_loaded()
|
|
||||||
|
|
||||||
if self._index.ntotal == 0:
|
|
||||||
return np.array([], dtype=np.int64), np.array([], dtype=np.int32)
|
|
||||||
|
|
||||||
k = top_k if top_k is not None else self._config.binary_top_k
|
|
||||||
k = min(k, self._index.ntotal)
|
|
||||||
|
|
||||||
q = self._quantize_single(query_vec)
|
|
||||||
q = np.ascontiguousarray(q, dtype=np.uint8)
|
|
||||||
distances, labels = self._index.search(q, k)
|
|
||||||
return labels[0].astype(np.int64), distances[0].astype(np.int32)
|
|
||||||
|
|
||||||
def save(self) -> None:
|
|
||||||
"""Save binary index to disk."""
|
|
||||||
with self._lock:
|
|
||||||
if self._index is None:
|
|
||||||
return
|
|
||||||
self._path.mkdir(parents=True, exist_ok=True)
|
|
||||||
faiss.write_index_binary(self._index, str(self._index_path))
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
with self._lock:
|
|
||||||
if self._index is None:
|
|
||||||
return 0
|
|
||||||
return self._index.ntotal
|
|
||||||
@@ -1,136 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.base import BaseANNIndex
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
try:
|
|
||||||
import hnswlib
|
|
||||||
_HNSWLIB_AVAILABLE = True
|
|
||||||
except ImportError:
|
|
||||||
_HNSWLIB_AVAILABLE = False
|
|
||||||
|
|
||||||
|
|
||||||
class ANNIndex(BaseANNIndex):
|
|
||||||
"""HNSW-based approximate nearest neighbor index.
|
|
||||||
|
|
||||||
Lazy-loads on first use, thread-safe via RLock.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, path: str | Path, dim: int, config: Config) -> None:
|
|
||||||
if not _HNSWLIB_AVAILABLE:
|
|
||||||
raise ImportError("hnswlib is required. Install with: pip install hnswlib")
|
|
||||||
|
|
||||||
self._path = Path(path)
|
|
||||||
self._hnsw_path = self._path / "ann_index.hnsw"
|
|
||||||
self._dim = dim
|
|
||||||
self._config = config
|
|
||||||
self._lock = threading.RLock()
|
|
||||||
self._index: hnswlib.Index | None = None
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Internal helpers
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _ensure_loaded(self) -> None:
|
|
||||||
"""Load or initialize the index (caller holds lock)."""
|
|
||||||
if self._index is not None:
|
|
||||||
return
|
|
||||||
self.load()
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Public API
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def load(self) -> None:
|
|
||||||
"""Load index from disk or initialize a fresh one."""
|
|
||||||
with self._lock:
|
|
||||||
idx = hnswlib.Index(space="cosine", dim=self._dim)
|
|
||||||
if self._hnsw_path.exists():
|
|
||||||
idx.load_index(str(self._hnsw_path), max_elements=0)
|
|
||||||
idx.set_ef(self._config.hnsw_ef)
|
|
||||||
logger.debug("Loaded HNSW index from %s (%d items)", self._hnsw_path, idx.get_current_count())
|
|
||||||
else:
|
|
||||||
idx.init_index(
|
|
||||||
max_elements=1000,
|
|
||||||
ef_construction=self._config.hnsw_ef_construction,
|
|
||||||
M=self._config.hnsw_M,
|
|
||||||
)
|
|
||||||
idx.set_ef(self._config.hnsw_ef)
|
|
||||||
logger.debug("Initialized fresh HNSW index (dim=%d)", self._dim)
|
|
||||||
self._index = idx
|
|
||||||
|
|
||||||
def add(self, ids: np.ndarray, vectors: np.ndarray) -> None:
|
|
||||||
"""Add float32 vectors.
|
|
||||||
|
|
||||||
Does NOT call save() internally -- callers must call save()
|
|
||||||
explicitly after batch indexing.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ids: shape (N,) int64
|
|
||||||
vectors: shape (N, dim) float32
|
|
||||||
"""
|
|
||||||
if len(ids) == 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
vecs = np.ascontiguousarray(vectors, dtype=np.float32)
|
|
||||||
|
|
||||||
with self._lock:
|
|
||||||
self._ensure_loaded()
|
|
||||||
# Expand capacity if needed
|
|
||||||
current = self._index.get_current_count()
|
|
||||||
max_el = self._index.get_max_elements()
|
|
||||||
needed = current + len(ids)
|
|
||||||
if needed > max_el:
|
|
||||||
new_cap = max(max_el * 2, needed + 100)
|
|
||||||
self._index.resize_index(new_cap)
|
|
||||||
self._index.add_items(vecs, ids.astype(np.int64))
|
|
||||||
|
|
||||||
def fine_search(
|
|
||||||
self, query_vec: np.ndarray, top_k: int | None = None
|
|
||||||
) -> tuple[np.ndarray, np.ndarray]:
|
|
||||||
"""Search for nearest neighbors.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query_vec: float32 vector of shape (dim,)
|
|
||||||
top_k: number of results; defaults to config.ann_top_k
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
(ids, distances) as numpy arrays
|
|
||||||
"""
|
|
||||||
k = top_k if top_k is not None else self._config.ann_top_k
|
|
||||||
|
|
||||||
with self._lock:
|
|
||||||
self._ensure_loaded()
|
|
||||||
|
|
||||||
count = self._index.get_current_count()
|
|
||||||
if count == 0:
|
|
||||||
return np.array([], dtype=np.int64), np.array([], dtype=np.float32)
|
|
||||||
|
|
||||||
k = min(k, count)
|
|
||||||
self._index.set_ef(max(self._config.hnsw_ef, k))
|
|
||||||
|
|
||||||
q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1)
|
|
||||||
labels, distances = self._index.knn_query(q, k=k)
|
|
||||||
return labels[0].astype(np.int64), distances[0].astype(np.float32)
|
|
||||||
|
|
||||||
def save(self) -> None:
|
|
||||||
"""Save index to disk (caller may or may not hold lock)."""
|
|
||||||
with self._lock:
|
|
||||||
if self._index is None:
|
|
||||||
return
|
|
||||||
self._path.mkdir(parents=True, exist_ok=True)
|
|
||||||
self._index.save_index(str(self._hnsw_path))
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
with self._lock:
|
|
||||||
if self._index is None:
|
|
||||||
return 0
|
|
||||||
return self._index.get_current_count()
|
|
||||||
@@ -1,178 +0,0 @@
|
|||||||
"""Single index partition (shard) that owns FTS, binary, ANN, and metadata stores."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex
|
|
||||||
from codexlens_search.embed.base import BaseEmbedder
|
|
||||||
from codexlens_search.indexing.metadata import MetadataStore
|
|
||||||
from codexlens_search.indexing.pipeline import IndexingPipeline, IndexStats
|
|
||||||
from codexlens_search.rerank import BaseReranker
|
|
||||||
from codexlens_search.search.fts import FTSEngine
|
|
||||||
from codexlens_search.search.pipeline import SearchPipeline, SearchResult
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class Shard:
|
|
||||||
"""A complete index partition with its own FTS, binary, ANN, and metadata stores.
|
|
||||||
|
|
||||||
Components are lazy-loaded on first access and can be explicitly unloaded
|
|
||||||
to release memory. The embedder and reranker are shared across shards
|
|
||||||
(passed in from ShardManager) since they are expensive to instantiate.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
shard_id: int,
|
|
||||||
db_path: str | Path,
|
|
||||||
config: Config,
|
|
||||||
) -> None:
|
|
||||||
self._shard_id = shard_id
|
|
||||||
self._shard_dir = Path(db_path).resolve() / f"shard_{shard_id}"
|
|
||||||
self._config = config
|
|
||||||
|
|
||||||
# Lazy-loaded components (created on _ensure_loaded)
|
|
||||||
self._fts: FTSEngine | None = None
|
|
||||||
self._binary_store: BaseBinaryIndex | None = None
|
|
||||||
self._ann_index: BaseANNIndex | None = None
|
|
||||||
self._metadata: MetadataStore | None = None
|
|
||||||
self._indexing: IndexingPipeline | None = None
|
|
||||||
self._search: SearchPipeline | None = None
|
|
||||||
self._loaded = False
|
|
||||||
|
|
||||||
@property
|
|
||||||
def shard_id(self) -> int:
|
|
||||||
return self._shard_id
|
|
||||||
|
|
||||||
@property
|
|
||||||
def is_loaded(self) -> bool:
|
|
||||||
return self._loaded
|
|
||||||
|
|
||||||
def _ensure_loaded(
|
|
||||||
self,
|
|
||||||
embedder: BaseEmbedder,
|
|
||||||
reranker: BaseReranker,
|
|
||||||
) -> None:
|
|
||||||
"""Lazy-create all per-shard components if not yet loaded."""
|
|
||||||
if self._loaded:
|
|
||||||
return
|
|
||||||
|
|
||||||
from codexlens_search.core.factory import create_ann_index, create_binary_index
|
|
||||||
|
|
||||||
self._shard_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
self._fts = FTSEngine(self._shard_dir / "fts.db")
|
|
||||||
self._binary_store = create_binary_index(
|
|
||||||
self._shard_dir, self._config.embed_dim, self._config
|
|
||||||
)
|
|
||||||
self._ann_index = create_ann_index(
|
|
||||||
self._shard_dir, self._config.embed_dim, self._config
|
|
||||||
)
|
|
||||||
self._metadata = MetadataStore(self._shard_dir / "metadata.db")
|
|
||||||
|
|
||||||
self._indexing = IndexingPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=self._binary_store,
|
|
||||||
ann_index=self._ann_index,
|
|
||||||
fts=self._fts,
|
|
||||||
config=self._config,
|
|
||||||
metadata=self._metadata,
|
|
||||||
)
|
|
||||||
|
|
||||||
self._search = SearchPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=self._binary_store,
|
|
||||||
ann_index=self._ann_index,
|
|
||||||
reranker=reranker,
|
|
||||||
fts=self._fts,
|
|
||||||
config=self._config,
|
|
||||||
metadata_store=self._metadata,
|
|
||||||
)
|
|
||||||
|
|
||||||
self._loaded = True
|
|
||||||
logger.debug("Shard %d loaded from %s", self._shard_id, self._shard_dir)
|
|
||||||
|
|
||||||
def unload(self) -> None:
|
|
||||||
"""Release memory by closing connections and dropping references."""
|
|
||||||
if not self._loaded:
|
|
||||||
return
|
|
||||||
|
|
||||||
if self._metadata is not None:
|
|
||||||
self._metadata.close()
|
|
||||||
|
|
||||||
self._fts = None
|
|
||||||
self._binary_store = None
|
|
||||||
self._ann_index = None
|
|
||||||
self._metadata = None
|
|
||||||
self._indexing = None
|
|
||||||
self._search = None
|
|
||||||
self._loaded = False
|
|
||||||
logger.debug("Shard %d unloaded", self._shard_id)
|
|
||||||
|
|
||||||
def load(
|
|
||||||
self,
|
|
||||||
embedder: BaseEmbedder,
|
|
||||||
reranker: BaseReranker,
|
|
||||||
) -> None:
|
|
||||||
"""Explicitly load shard components."""
|
|
||||||
self._ensure_loaded(embedder, reranker)
|
|
||||||
|
|
||||||
def save(self) -> None:
|
|
||||||
"""Persist binary and ANN indexes to disk."""
|
|
||||||
if not self._loaded:
|
|
||||||
return
|
|
||||||
if self._binary_store is not None:
|
|
||||||
self._binary_store.save()
|
|
||||||
if self._ann_index is not None:
|
|
||||||
self._ann_index.save()
|
|
||||||
|
|
||||||
def search(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
embedder: BaseEmbedder,
|
|
||||||
reranker: BaseReranker,
|
|
||||||
quality: str | None = None,
|
|
||||||
top_k: int | None = None,
|
|
||||||
) -> list[SearchResult]:
|
|
||||||
"""Search this shard's index.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query: Search query string.
|
|
||||||
embedder: Shared embedder instance.
|
|
||||||
reranker: Shared reranker instance.
|
|
||||||
quality: Search quality tier.
|
|
||||||
top_k: Maximum results to return.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of SearchResult from this shard.
|
|
||||||
"""
|
|
||||||
self._ensure_loaded(embedder, reranker)
|
|
||||||
assert self._search is not None
|
|
||||||
return self._search.search(query, top_k=top_k, quality=quality)
|
|
||||||
|
|
||||||
def sync(
|
|
||||||
self,
|
|
||||||
files: list[Path],
|
|
||||||
root: Path | None,
|
|
||||||
embedder: BaseEmbedder,
|
|
||||||
reranker: BaseReranker,
|
|
||||||
**kwargs: object,
|
|
||||||
) -> IndexStats:
|
|
||||||
"""Sync this shard's index with the given files.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
files: Files that belong to this shard.
|
|
||||||
root: Root directory for relative paths.
|
|
||||||
embedder: Shared embedder instance.
|
|
||||||
reranker: Shared reranker instance.
|
|
||||||
**kwargs: Forwarded to IndexingPipeline.sync().
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
IndexStats for this shard's sync operation.
|
|
||||||
"""
|
|
||||||
self._ensure_loaded(embedder, reranker)
|
|
||||||
assert self._indexing is not None
|
|
||||||
return self._indexing.sync(files, root=root, **kwargs)
|
|
||||||
@@ -1,250 +0,0 @@
|
|||||||
"""ShardManager: manages multiple Shard instances with LRU eviction."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
from collections import OrderedDict
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.shard import Shard
|
|
||||||
from codexlens_search.embed.base import BaseEmbedder
|
|
||||||
from codexlens_search.indexing.pipeline import IndexStats
|
|
||||||
from codexlens_search.rerank import BaseReranker
|
|
||||||
from codexlens_search.search.fusion import reciprocal_rank_fusion
|
|
||||||
from codexlens_search.search.pipeline import SearchResult
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class ShardManager:
|
|
||||||
"""Manages multiple Shard instances with hash-based file routing and LRU eviction.
|
|
||||||
|
|
||||||
Files are deterministically routed to shards via hash(path) % num_shards.
|
|
||||||
Search queries all shards in parallel and merges results via RRF fusion.
|
|
||||||
At most max_loaded_shards are kept in memory; least-recently-used shards
|
|
||||||
are unloaded when the limit is exceeded.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
num_shards: int,
|
|
||||||
db_path: str | Path,
|
|
||||||
config: Config,
|
|
||||||
embedder: BaseEmbedder,
|
|
||||||
reranker: BaseReranker,
|
|
||||||
) -> None:
|
|
||||||
if num_shards < 1:
|
|
||||||
raise ValueError("num_shards must be >= 1")
|
|
||||||
|
|
||||||
self._num_shards = num_shards
|
|
||||||
self._db_path = Path(db_path).resolve()
|
|
||||||
self._config = config
|
|
||||||
self._embedder = embedder
|
|
||||||
self._reranker = reranker
|
|
||||||
self._max_loaded = config.max_loaded_shards
|
|
||||||
|
|
||||||
# Create all Shard objects (lazy-loaded, no I/O yet)
|
|
||||||
self._shards: dict[int, Shard] = {
|
|
||||||
i: Shard(i, self._db_path, config)
|
|
||||||
for i in range(num_shards)
|
|
||||||
}
|
|
||||||
|
|
||||||
# LRU tracking: keys are shard_ids, most-recently-used at end
|
|
||||||
self._loaded_order: OrderedDict[int, None] = OrderedDict()
|
|
||||||
self._lru_lock = threading.Lock()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def num_shards(self) -> int:
|
|
||||||
return self._num_shards
|
|
||||||
|
|
||||||
def route_file(self, path: str) -> int:
|
|
||||||
"""Deterministically route a file path to a shard ID.
|
|
||||||
|
|
||||||
Uses hash(path) % num_shards for uniform distribution.
|
|
||||||
"""
|
|
||||||
return hash(path) % self._num_shards
|
|
||||||
|
|
||||||
def get_shard(self, shard_id: int) -> Shard:
|
|
||||||
"""Return the Shard instance for a given shard_id."""
|
|
||||||
if shard_id not in self._shards:
|
|
||||||
raise ValueError(
|
|
||||||
f"Invalid shard_id {shard_id}, valid range: 0-{self._num_shards - 1}"
|
|
||||||
)
|
|
||||||
return self._shards[shard_id]
|
|
||||||
|
|
||||||
def _ensure_loaded(self, shard_id: int) -> Shard:
|
|
||||||
"""Load a shard if needed, applying LRU eviction policy.
|
|
||||||
|
|
||||||
Thread-safe: protects OrderedDict mutations with a lock.
|
|
||||||
Returns the loaded Shard.
|
|
||||||
"""
|
|
||||||
shard = self._shards[shard_id]
|
|
||||||
|
|
||||||
with self._lru_lock:
|
|
||||||
# Mark as most-recently-used
|
|
||||||
if shard_id in self._loaded_order:
|
|
||||||
self._loaded_order.move_to_end(shard_id)
|
|
||||||
else:
|
|
||||||
self._loaded_order[shard_id] = None
|
|
||||||
|
|
||||||
# Load if not already loaded
|
|
||||||
if not shard.is_loaded:
|
|
||||||
shard.load(self._embedder, self._reranker)
|
|
||||||
|
|
||||||
# Evict LRU shards if over limit
|
|
||||||
while len(self._loaded_order) > self._max_loaded:
|
|
||||||
evict_id, _ = self._loaded_order.popitem(last=False)
|
|
||||||
evict_shard = self._shards[evict_id]
|
|
||||||
if evict_shard.is_loaded:
|
|
||||||
logger.info("LRU evicting shard %d", evict_id)
|
|
||||||
evict_shard.unload()
|
|
||||||
|
|
||||||
return shard
|
|
||||||
|
|
||||||
def sync(
|
|
||||||
self,
|
|
||||||
files: list[Path],
|
|
||||||
root: Path | None = None,
|
|
||||||
**kwargs: object,
|
|
||||||
) -> IndexStats:
|
|
||||||
"""Sync index with files, routing each file to its shard.
|
|
||||||
|
|
||||||
Groups files by shard via route_file(), then syncs each shard
|
|
||||||
with its subset of files.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
files: Current list of files to index.
|
|
||||||
root: Root directory for relative paths.
|
|
||||||
**kwargs: Forwarded to Shard.sync().
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Aggregated IndexStats across all shards.
|
|
||||||
"""
|
|
||||||
# Group files by shard
|
|
||||||
shard_files: dict[int, list[Path]] = {i: [] for i in range(self._num_shards)}
|
|
||||||
for fpath in files:
|
|
||||||
rel = str(fpath.relative_to(root)) if root else str(fpath)
|
|
||||||
shard_id = self.route_file(rel)
|
|
||||||
shard_files[shard_id].append(fpath)
|
|
||||||
|
|
||||||
total_files = 0
|
|
||||||
total_chunks = 0
|
|
||||||
total_duration = 0.0
|
|
||||||
|
|
||||||
for shard_id, shard_file_list in shard_files.items():
|
|
||||||
if not shard_file_list:
|
|
||||||
continue
|
|
||||||
self._ensure_loaded(shard_id)
|
|
||||||
shard = self._shards[shard_id]
|
|
||||||
stats = shard.sync(
|
|
||||||
shard_file_list,
|
|
||||||
root=root,
|
|
||||||
embedder=self._embedder,
|
|
||||||
reranker=self._reranker,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
total_files += stats.files_processed
|
|
||||||
total_chunks += stats.chunks_created
|
|
||||||
total_duration += stats.duration_seconds
|
|
||||||
|
|
||||||
return IndexStats(
|
|
||||||
files_processed=total_files,
|
|
||||||
chunks_created=total_chunks,
|
|
||||||
duration_seconds=round(total_duration, 2),
|
|
||||||
)
|
|
||||||
|
|
||||||
def search(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
quality: str | None = None,
|
|
||||||
top_k: int | None = None,
|
|
||||||
) -> list[SearchResult]:
|
|
||||||
"""Search all shards in parallel, merge results via RRF fusion.
|
|
||||||
|
|
||||||
Each shard returns its own ranked results. Cross-shard merging
|
|
||||||
uses reciprocal_rank_fusion with equal weights across shards.
|
|
||||||
Per-shard top_k is increased to compensate for cross-shard dilution.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
query: Search query string.
|
|
||||||
quality: Search quality tier.
|
|
||||||
top_k: Maximum final results to return.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Merged list of SearchResult ordered by relevance.
|
|
||||||
"""
|
|
||||||
cfg = self._config
|
|
||||||
final_top_k = top_k if top_k is not None else cfg.reranker_top_k
|
|
||||||
|
|
||||||
# Increase per-shard top_k to get enough candidates for cross-shard RRF
|
|
||||||
per_shard_top_k = max(final_top_k, final_top_k * 2)
|
|
||||||
|
|
||||||
# Load all shards for search
|
|
||||||
for shard_id in range(self._num_shards):
|
|
||||||
self._ensure_loaded(shard_id)
|
|
||||||
|
|
||||||
# Parallel search across shards
|
|
||||||
shard_results: dict[int, list[SearchResult]] = {}
|
|
||||||
|
|
||||||
def _search_shard(sid: int) -> tuple[int, list[SearchResult]]:
|
|
||||||
shard = self._shards[sid]
|
|
||||||
results = shard.search(
|
|
||||||
query,
|
|
||||||
embedder=self._embedder,
|
|
||||||
reranker=self._reranker,
|
|
||||||
quality=quality,
|
|
||||||
top_k=per_shard_top_k,
|
|
||||||
)
|
|
||||||
return sid, results
|
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=min(self._num_shards, 4)) as pool:
|
|
||||||
futures = [pool.submit(_search_shard, sid) for sid in range(self._num_shards)]
|
|
||||||
for future in futures:
|
|
||||||
try:
|
|
||||||
sid, results = future.result()
|
|
||||||
shard_results[sid] = results
|
|
||||||
except Exception:
|
|
||||||
logger.warning("Shard search failed", exc_info=True)
|
|
||||||
|
|
||||||
# If only one shard returned results, no merging needed
|
|
||||||
non_empty = {k: v for k, v in shard_results.items() if v}
|
|
||||||
if not non_empty:
|
|
||||||
return []
|
|
||||||
if len(non_empty) == 1:
|
|
||||||
results = list(non_empty.values())[0]
|
|
||||||
return results[:final_top_k]
|
|
||||||
|
|
||||||
# Cross-shard RRF merge
|
|
||||||
# Build ranked lists keyed by shard name, with (doc_id, score) tuples
|
|
||||||
# Use a global result map to look up SearchResult by a unique key
|
|
||||||
# Since doc_ids are shard-local, we need a composite key
|
|
||||||
rrf_input: dict[str, list[tuple[int, float]]] = {}
|
|
||||||
global_results: dict[int, SearchResult] = {}
|
|
||||||
global_id = 0
|
|
||||||
|
|
||||||
for sid, results in non_empty.items():
|
|
||||||
ranked: list[tuple[int, float]] = []
|
|
||||||
for r in results:
|
|
||||||
global_results[global_id] = r
|
|
||||||
ranked.append((global_id, r.score))
|
|
||||||
global_id += 1
|
|
||||||
rrf_input[f"shard_{sid}"] = ranked
|
|
||||||
|
|
||||||
fused = reciprocal_rank_fusion(rrf_input, k=cfg.fusion_k)
|
|
||||||
|
|
||||||
merged: list[SearchResult] = []
|
|
||||||
for gid, fused_score in fused[:final_top_k]:
|
|
||||||
result = global_results[gid]
|
|
||||||
merged.append(SearchResult(
|
|
||||||
id=result.id,
|
|
||||||
path=result.path,
|
|
||||||
score=fused_score,
|
|
||||||
snippet=result.snippet,
|
|
||||||
line=result.line,
|
|
||||||
end_line=result.end_line,
|
|
||||||
content=result.content,
|
|
||||||
))
|
|
||||||
|
|
||||||
return merged
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
from .base import BaseEmbedder
|
|
||||||
from .local import FastEmbedEmbedder, EMBED_PROFILES
|
|
||||||
from .api import APIEmbedder
|
|
||||||
|
|
||||||
__all__ = ["BaseEmbedder", "FastEmbedEmbedder", "APIEmbedder", "EMBED_PROFILES"]
|
|
||||||
@@ -1,263 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import itertools
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from ..config import Config
|
|
||||||
from .base import BaseEmbedder
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class _Endpoint:
|
|
||||||
"""A single API endpoint with its own client and rate-limit tracking."""
|
|
||||||
|
|
||||||
__slots__ = ("url", "key", "model", "client", "failures", "lock")
|
|
||||||
|
|
||||||
def __init__(self, url: str, key: str, model: str) -> None:
|
|
||||||
self.url = url.rstrip("/")
|
|
||||||
if not self.url.endswith("/embeddings"):
|
|
||||||
self.url += "/embeddings"
|
|
||||||
self.key = key
|
|
||||||
self.model = model
|
|
||||||
self.client = httpx.Client(
|
|
||||||
headers={
|
|
||||||
"Authorization": f"Bearer {key}",
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
},
|
|
||||||
timeout=60.0,
|
|
||||||
)
|
|
||||||
self.failures = 0
|
|
||||||
self.lock = threading.Lock()
|
|
||||||
|
|
||||||
|
|
||||||
class APIEmbedder(BaseEmbedder):
|
|
||||||
"""Embedder backed by remote HTTP API (OpenAI /v1/embeddings format).
|
|
||||||
|
|
||||||
Features:
|
|
||||||
- Token packing: packs small chunks into batches up to max_tokens_per_batch
|
|
||||||
- Multi-endpoint: round-robins across multiple (url, key) pairs
|
|
||||||
- Concurrent dispatch: parallel API calls via ThreadPoolExecutor
|
|
||||||
- Per-endpoint failure tracking and retry with backoff
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, config: Config) -> None:
|
|
||||||
self._config = config
|
|
||||||
self._endpoints = self._build_endpoints(config)
|
|
||||||
self._cycler = itertools.cycle(range(len(self._endpoints)))
|
|
||||||
self._cycler_lock = threading.Lock()
|
|
||||||
self._executor = ThreadPoolExecutor(
|
|
||||||
max_workers=min(config.embed_api_concurrency, len(self._endpoints) * 2),
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _build_endpoints(config: Config) -> list[_Endpoint]:
|
|
||||||
"""Build endpoint list from config. Supports both single and multi configs."""
|
|
||||||
endpoints: list[_Endpoint] = []
|
|
||||||
|
|
||||||
# Multi-endpoint config takes priority
|
|
||||||
if config.embed_api_endpoints:
|
|
||||||
for ep in config.embed_api_endpoints:
|
|
||||||
endpoints.append(_Endpoint(
|
|
||||||
url=ep.get("url", config.embed_api_url),
|
|
||||||
key=ep.get("key", config.embed_api_key),
|
|
||||||
model=ep.get("model", config.embed_api_model),
|
|
||||||
))
|
|
||||||
|
|
||||||
# Fallback: single endpoint from top-level config
|
|
||||||
if not endpoints and config.embed_api_url:
|
|
||||||
endpoints.append(_Endpoint(
|
|
||||||
url=config.embed_api_url,
|
|
||||||
key=config.embed_api_key,
|
|
||||||
model=config.embed_api_model,
|
|
||||||
))
|
|
||||||
|
|
||||||
if not endpoints:
|
|
||||||
raise ValueError("No API embedding endpoints configured")
|
|
||||||
|
|
||||||
return endpoints
|
|
||||||
|
|
||||||
def _next_endpoint(self) -> _Endpoint:
|
|
||||||
with self._cycler_lock:
|
|
||||||
idx = next(self._cycler)
|
|
||||||
return self._endpoints[idx]
|
|
||||||
|
|
||||||
# -- Token packing ------------------------------------------------
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _estimate_tokens(text: str) -> int:
|
|
||||||
"""Rough token estimate: ~4 chars per token for code."""
|
|
||||||
return max(1, len(text) // 4)
|
|
||||||
|
|
||||||
def _truncate_text(self, text: str) -> str:
|
|
||||||
"""Truncate text to embed_max_tokens if configured."""
|
|
||||||
max_tokens = self._config.embed_max_tokens
|
|
||||||
if max_tokens <= 0:
|
|
||||||
return text
|
|
||||||
max_chars = max_tokens * 4 # inverse of _estimate_tokens
|
|
||||||
if len(text) > max_chars:
|
|
||||||
return text[:max_chars]
|
|
||||||
return text
|
|
||||||
|
|
||||||
def _pack_batches(
|
|
||||||
self, texts: list[str]
|
|
||||||
) -> list[list[tuple[int, str]]]:
|
|
||||||
"""Pack texts into batches respecting max_tokens_per_batch.
|
|
||||||
|
|
||||||
Returns list of batches, each batch is list of (original_index, text).
|
|
||||||
Also respects embed_batch_size as max items per batch.
|
|
||||||
"""
|
|
||||||
max_tokens = self._config.embed_api_max_tokens_per_batch
|
|
||||||
max_items = self._config.embed_batch_size
|
|
||||||
batches: list[list[tuple[int, str]]] = []
|
|
||||||
current: list[tuple[int, str]] = []
|
|
||||||
current_tokens = 0
|
|
||||||
|
|
||||||
for i, text in enumerate(texts):
|
|
||||||
tokens = self._estimate_tokens(text)
|
|
||||||
# Start new batch if adding this text would exceed limits
|
|
||||||
if current and (
|
|
||||||
current_tokens + tokens > max_tokens
|
|
||||||
or len(current) >= max_items
|
|
||||||
):
|
|
||||||
batches.append(current)
|
|
||||||
current = []
|
|
||||||
current_tokens = 0
|
|
||||||
current.append((i, text))
|
|
||||||
current_tokens += tokens
|
|
||||||
|
|
||||||
if current:
|
|
||||||
batches.append(current)
|
|
||||||
|
|
||||||
return batches
|
|
||||||
|
|
||||||
# -- API call with retry ------------------------------------------
|
|
||||||
|
|
||||||
def _call_api(
|
|
||||||
self,
|
|
||||||
texts: list[str],
|
|
||||||
endpoint: _Endpoint,
|
|
||||||
max_retries: int = 3,
|
|
||||||
) -> list[np.ndarray]:
|
|
||||||
"""Call a single endpoint with retry logic."""
|
|
||||||
payload: dict = {"input": texts}
|
|
||||||
if endpoint.model:
|
|
||||||
payload["model"] = endpoint.model
|
|
||||||
|
|
||||||
last_exc: Exception | None = None
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
try:
|
|
||||||
response = endpoint.client.post(endpoint.url, json=payload)
|
|
||||||
except Exception as exc:
|
|
||||||
last_exc = exc
|
|
||||||
logger.warning(
|
|
||||||
"API embed %s failed (attempt %d/%d): %s",
|
|
||||||
endpoint.url, attempt + 1, max_retries, exc,
|
|
||||||
)
|
|
||||||
time.sleep((2 ** attempt) * 0.5)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if response.status_code in (429, 503):
|
|
||||||
logger.warning(
|
|
||||||
"API embed %s returned HTTP %s (attempt %d/%d), retrying...",
|
|
||||||
endpoint.url, response.status_code, attempt + 1, max_retries,
|
|
||||||
)
|
|
||||||
time.sleep((2 ** attempt) * 0.5)
|
|
||||||
continue
|
|
||||||
|
|
||||||
response.raise_for_status()
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
items = data.get("data", [])
|
|
||||||
items.sort(key=lambda x: x["index"])
|
|
||||||
vectors = [
|
|
||||||
np.array(item["embedding"], dtype=np.float32)
|
|
||||||
for item in items
|
|
||||||
]
|
|
||||||
|
|
||||||
# Reset failure counter on success
|
|
||||||
with endpoint.lock:
|
|
||||||
endpoint.failures = 0
|
|
||||||
|
|
||||||
return vectors
|
|
||||||
|
|
||||||
# Track failures
|
|
||||||
with endpoint.lock:
|
|
||||||
endpoint.failures += 1
|
|
||||||
|
|
||||||
raise RuntimeError(
|
|
||||||
f"API embed failed at {endpoint.url} after {max_retries} attempts. "
|
|
||||||
f"Last error: {last_exc}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# -- Public interface ---------------------------------------------
|
|
||||||
|
|
||||||
def embed_single(self, text: str) -> np.ndarray:
|
|
||||||
text = self._truncate_text(text)
|
|
||||||
endpoint = self._next_endpoint()
|
|
||||||
vecs = self._call_api([text], endpoint)
|
|
||||||
return vecs[0]
|
|
||||||
|
|
||||||
def _call_api_with_split(
|
|
||||||
self,
|
|
||||||
texts: list[str],
|
|
||||||
endpoint: "_Endpoint",
|
|
||||||
) -> list[np.ndarray]:
|
|
||||||
"""Call API with automatic batch splitting on 413 errors."""
|
|
||||||
try:
|
|
||||||
return self._call_api(texts, endpoint)
|
|
||||||
except Exception as exc:
|
|
||||||
if "413" in str(exc) and len(texts) > 1:
|
|
||||||
mid = len(texts) // 2
|
|
||||||
logger.info("413 received, splitting batch %d → %d + %d", len(texts), mid, len(texts) - mid)
|
|
||||||
left = self._call_api_with_split(texts[:mid], endpoint)
|
|
||||||
right = self._call_api_with_split(texts[mid:], endpoint)
|
|
||||||
return left + right
|
|
||||||
raise
|
|
||||||
|
|
||||||
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
|
|
||||||
if not texts:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# 0. Truncate texts exceeding model context
|
|
||||||
texts = [self._truncate_text(t) for t in texts]
|
|
||||||
|
|
||||||
# 1. Pack into token-aware batches
|
|
||||||
packed = self._pack_batches(texts)
|
|
||||||
|
|
||||||
if len(packed) == 1:
|
|
||||||
# Single batch — no concurrency overhead needed
|
|
||||||
batch_texts = [t for _, t in packed[0]]
|
|
||||||
batch_indices = [i for i, _ in packed[0]]
|
|
||||||
endpoint = self._next_endpoint()
|
|
||||||
vecs = self._call_api_with_split(batch_texts, endpoint)
|
|
||||||
results: dict[int, np.ndarray] = {}
|
|
||||||
for idx, vec in zip(batch_indices, vecs):
|
|
||||||
results[idx] = vec
|
|
||||||
return [results[i] for i in range(len(texts))]
|
|
||||||
|
|
||||||
# 2. Dispatch batches concurrently across endpoints
|
|
||||||
results: dict[int, np.ndarray] = {}
|
|
||||||
futures = []
|
|
||||||
batch_index_map: list[list[int]] = []
|
|
||||||
|
|
||||||
for batch in packed:
|
|
||||||
batch_texts = [t for _, t in batch]
|
|
||||||
batch_indices = [i for i, _ in batch]
|
|
||||||
endpoint = self._next_endpoint()
|
|
||||||
future = self._executor.submit(self._call_api_with_split, batch_texts, endpoint)
|
|
||||||
futures.append(future)
|
|
||||||
batch_index_map.append(batch_indices)
|
|
||||||
|
|
||||||
for future, indices in zip(futures, batch_index_map):
|
|
||||||
vecs = future.result() # propagates exceptions
|
|
||||||
for idx, vec in zip(indices, vecs):
|
|
||||||
results[idx] = vec
|
|
||||||
|
|
||||||
return [results[i] for i in range(len(texts))]
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
class BaseEmbedder(ABC):
|
|
||||||
@abstractmethod
|
|
||||||
def embed_single(self, text: str) -> np.ndarray:
|
|
||||||
"""Embed a single text, returns float32 ndarray shape (dim,)."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
|
|
||||||
"""Embed a list of texts, returns list of float32 ndarrays."""
|
|
||||||
@@ -1,60 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from ..config import Config
|
|
||||||
from .base import BaseEmbedder
|
|
||||||
|
|
||||||
EMBED_PROFILES = {
|
|
||||||
"small": "BAAI/bge-small-en-v1.5", # 384d
|
|
||||||
"base": "BAAI/bge-base-en-v1.5", # 768d
|
|
||||||
"large": "BAAI/bge-large-en-v1.5", # 1024d
|
|
||||||
"code": "jinaai/jina-embeddings-v2-base-code", # 768d
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class FastEmbedEmbedder(BaseEmbedder):
|
|
||||||
"""Embedder backed by fastembed.TextEmbedding with lazy model loading."""
|
|
||||||
|
|
||||||
def __init__(self, config: Config) -> None:
|
|
||||||
self._config = config
|
|
||||||
self._model = None
|
|
||||||
|
|
||||||
def _load(self) -> None:
|
|
||||||
"""Lazy-load the fastembed TextEmbedding model on first use."""
|
|
||||||
if self._model is not None:
|
|
||||||
return
|
|
||||||
from .. import model_manager
|
|
||||||
model_manager.ensure_model(self._config.embed_model, self._config)
|
|
||||||
|
|
||||||
from fastembed import TextEmbedding
|
|
||||||
providers = self._config.resolve_embed_providers()
|
|
||||||
cache_kwargs = model_manager.get_cache_kwargs(self._config)
|
|
||||||
try:
|
|
||||||
self._model = TextEmbedding(
|
|
||||||
model_name=self._config.embed_model,
|
|
||||||
providers=providers,
|
|
||||||
**cache_kwargs,
|
|
||||||
)
|
|
||||||
except TypeError:
|
|
||||||
self._model = TextEmbedding(
|
|
||||||
model_name=self._config.embed_model,
|
|
||||||
**cache_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
def embed_single(self, text: str) -> np.ndarray:
|
|
||||||
"""Embed a single text, returns float32 ndarray of shape (dim,)."""
|
|
||||||
self._load()
|
|
||||||
result = list(self._model.embed([text]))
|
|
||||||
return result[0].astype(np.float32)
|
|
||||||
|
|
||||||
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
|
|
||||||
"""Embed a list of texts in batches, returns list of float32 ndarrays."""
|
|
||||||
self._load()
|
|
||||||
batch_size = self._config.embed_batch_size
|
|
||||||
results: list[np.ndarray] = []
|
|
||||||
for start in range(0, len(texts), batch_size):
|
|
||||||
batch = texts[start : start + batch_size]
|
|
||||||
for vec in self._model.embed(batch):
|
|
||||||
results.append(vec.astype(np.float32))
|
|
||||||
return results
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from .metadata import MetadataStore
|
|
||||||
from .pipeline import IndexingPipeline, IndexStats
|
|
||||||
|
|
||||||
__all__ = ["IndexingPipeline", "IndexStats", "MetadataStore"]
|
|
||||||
@@ -1,300 +0,0 @@
|
|||||||
"""SQLite-backed metadata store for file-to-chunk mapping and tombstone tracking."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import sqlite3
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
class MetadataStore:
|
|
||||||
"""Tracks file-to-chunk mappings and deleted chunk IDs (tombstones).
|
|
||||||
|
|
||||||
Tables:
|
|
||||||
files - file_path (PK), content_hash, last_modified, file_size,
|
|
||||||
tier ('hot'/'warm'/'cold'), last_accessed (epoch float)
|
|
||||||
chunks - chunk_id (PK), file_path (FK CASCADE), chunk_hash
|
|
||||||
deleted_chunks - chunk_id (PK) for tombstone tracking
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, db_path: str | Path) -> None:
|
|
||||||
self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
|
|
||||||
self._conn.execute("PRAGMA foreign_keys = ON")
|
|
||||||
self._conn.execute("PRAGMA journal_mode = WAL")
|
|
||||||
self._create_tables()
|
|
||||||
self._migrate_size_column()
|
|
||||||
self._migrate_tier_columns()
|
|
||||||
|
|
||||||
def _create_tables(self) -> None:
|
|
||||||
self._conn.executescript("""
|
|
||||||
CREATE TABLE IF NOT EXISTS files (
|
|
||||||
file_path TEXT PRIMARY KEY,
|
|
||||||
content_hash TEXT NOT NULL,
|
|
||||||
last_modified REAL NOT NULL,
|
|
||||||
file_size INTEGER NOT NULL DEFAULT 0,
|
|
||||||
tier TEXT NOT NULL DEFAULT 'warm',
|
|
||||||
last_accessed REAL
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS chunks (
|
|
||||||
chunk_id INTEGER PRIMARY KEY,
|
|
||||||
file_path TEXT NOT NULL,
|
|
||||||
chunk_hash TEXT NOT NULL DEFAULT '',
|
|
||||||
FOREIGN KEY (file_path) REFERENCES files(file_path) ON DELETE CASCADE
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS deleted_chunks (
|
|
||||||
chunk_id INTEGER PRIMARY KEY
|
|
||||||
);
|
|
||||||
""")
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def _migrate_size_column(self) -> None:
|
|
||||||
"""Add file_size column if missing (for pre-existing DBs)."""
|
|
||||||
cols = {
|
|
||||||
row[1]
|
|
||||||
for row in self._conn.execute("PRAGMA table_info(files)").fetchall()
|
|
||||||
}
|
|
||||||
if "file_size" not in cols:
|
|
||||||
self._conn.execute(
|
|
||||||
"ALTER TABLE files ADD COLUMN file_size INTEGER NOT NULL DEFAULT 0"
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def _migrate_tier_columns(self) -> None:
|
|
||||||
"""Add tier and last_accessed columns if missing (for pre-existing DBs)."""
|
|
||||||
cols = {
|
|
||||||
row[1]
|
|
||||||
for row in self._conn.execute("PRAGMA table_info(files)").fetchall()
|
|
||||||
}
|
|
||||||
if "tier" not in cols:
|
|
||||||
self._conn.execute(
|
|
||||||
"ALTER TABLE files ADD COLUMN tier TEXT NOT NULL DEFAULT 'warm'"
|
|
||||||
)
|
|
||||||
if "last_accessed" not in cols:
|
|
||||||
self._conn.execute(
|
|
||||||
"ALTER TABLE files ADD COLUMN last_accessed REAL"
|
|
||||||
)
|
|
||||||
if "tier" not in cols or "last_accessed" not in cols:
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def register_file(
|
|
||||||
self,
|
|
||||||
file_path: str,
|
|
||||||
content_hash: str,
|
|
||||||
mtime: float,
|
|
||||||
file_size: int = 0,
|
|
||||||
) -> None:
|
|
||||||
"""Insert or update a file record."""
|
|
||||||
self._conn.execute(
|
|
||||||
"INSERT OR REPLACE INTO files "
|
|
||||||
"(file_path, content_hash, last_modified, file_size) "
|
|
||||||
"VALUES (?, ?, ?, ?)",
|
|
||||||
(file_path, content_hash, mtime, file_size),
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def register_chunks(
|
|
||||||
self, file_path: str, chunk_ids_and_hashes: list[tuple[int, str]]
|
|
||||||
) -> None:
|
|
||||||
"""Register chunk IDs belonging to a file.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
file_path: The owning file path (must already exist in files table).
|
|
||||||
chunk_ids_and_hashes: List of (chunk_id, chunk_hash) tuples.
|
|
||||||
"""
|
|
||||||
if not chunk_ids_and_hashes:
|
|
||||||
return
|
|
||||||
self._conn.executemany(
|
|
||||||
"INSERT OR REPLACE INTO chunks (chunk_id, file_path, chunk_hash) "
|
|
||||||
"VALUES (?, ?, ?)",
|
|
||||||
[(cid, file_path, chash) for cid, chash in chunk_ids_and_hashes],
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def mark_file_deleted(self, file_path: str) -> int:
|
|
||||||
"""Move all chunk IDs for a file to deleted_chunks, then remove the file.
|
|
||||||
|
|
||||||
Returns the number of chunks tombstoned.
|
|
||||||
"""
|
|
||||||
# Collect chunk IDs before CASCADE deletes them
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT chunk_id FROM chunks WHERE file_path = ?", (file_path,)
|
|
||||||
).fetchall()
|
|
||||||
|
|
||||||
if not rows:
|
|
||||||
# Still remove the file record if it exists
|
|
||||||
self._conn.execute(
|
|
||||||
"DELETE FROM files WHERE file_path = ?", (file_path,)
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
return 0
|
|
||||||
|
|
||||||
chunk_ids = [(r[0],) for r in rows]
|
|
||||||
self._conn.executemany(
|
|
||||||
"INSERT OR IGNORE INTO deleted_chunks (chunk_id) VALUES (?)",
|
|
||||||
chunk_ids,
|
|
||||||
)
|
|
||||||
# CASCADE deletes chunks rows automatically
|
|
||||||
self._conn.execute(
|
|
||||||
"DELETE FROM files WHERE file_path = ?", (file_path,)
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
return len(chunk_ids)
|
|
||||||
|
|
||||||
def get_deleted_ids(self) -> set[int]:
|
|
||||||
"""Return all tombstoned chunk IDs for search-time filtering."""
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT chunk_id FROM deleted_chunks"
|
|
||||||
).fetchall()
|
|
||||||
return {r[0] for r in rows}
|
|
||||||
|
|
||||||
def get_file_hash(self, file_path: str) -> str | None:
|
|
||||||
"""Return the stored content hash for a file, or None if not tracked."""
|
|
||||||
row = self._conn.execute(
|
|
||||||
"SELECT content_hash FROM files WHERE file_path = ?", (file_path,)
|
|
||||||
).fetchone()
|
|
||||||
return row[0] if row else None
|
|
||||||
|
|
||||||
def file_needs_update(self, file_path: str, content_hash: str) -> bool:
|
|
||||||
"""Check if a file needs re-indexing based on its content hash."""
|
|
||||||
stored = self.get_file_hash(file_path)
|
|
||||||
if stored is None:
|
|
||||||
return True # New file
|
|
||||||
return stored != content_hash
|
|
||||||
|
|
||||||
def file_needs_update_fast(
|
|
||||||
self, file_path: str, mtime: float, size: int
|
|
||||||
) -> bool:
|
|
||||||
"""Fast pre-check using mtime and file size (no content read needed).
|
|
||||||
|
|
||||||
Returns True if the file appears changed or is not yet tracked.
|
|
||||||
When mtime and size both match stored values, the file is assumed
|
|
||||||
unchanged (~1000x faster than content-hash comparison).
|
|
||||||
"""
|
|
||||||
row = self._conn.execute(
|
|
||||||
"SELECT last_modified, file_size FROM files WHERE file_path = ?",
|
|
||||||
(file_path,),
|
|
||||||
).fetchone()
|
|
||||||
if row is None:
|
|
||||||
return True # New file
|
|
||||||
stored_mtime, stored_size = row
|
|
||||||
return stored_mtime != mtime or stored_size != size
|
|
||||||
|
|
||||||
def compact_deleted(self) -> set[int]:
|
|
||||||
"""Return deleted IDs and clear the deleted_chunks table.
|
|
||||||
|
|
||||||
Call this after rebuilding the vector index to reclaim space.
|
|
||||||
"""
|
|
||||||
deleted = self.get_deleted_ids()
|
|
||||||
if deleted:
|
|
||||||
self._conn.execute("DELETE FROM deleted_chunks")
|
|
||||||
self._conn.commit()
|
|
||||||
return deleted
|
|
||||||
|
|
||||||
def get_chunk_ids_for_file(self, file_path: str) -> list[int]:
|
|
||||||
"""Return all chunk IDs belonging to a file."""
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT chunk_id FROM chunks WHERE file_path = ?", (file_path,)
|
|
||||||
).fetchall()
|
|
||||||
return [r[0] for r in rows]
|
|
||||||
|
|
||||||
def get_all_files(self) -> dict[str, str]:
|
|
||||||
"""Return all tracked files as {file_path: content_hash}."""
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT file_path, content_hash FROM files"
|
|
||||||
).fetchall()
|
|
||||||
return {r[0]: r[1] for r in rows}
|
|
||||||
|
|
||||||
def max_chunk_id(self) -> int:
|
|
||||||
"""Return the maximum chunk_id across chunks and deleted_chunks.
|
|
||||||
|
|
||||||
Returns -1 if no chunks exist, so that next_id = max_chunk_id() + 1
|
|
||||||
starts at 0 for an empty store.
|
|
||||||
"""
|
|
||||||
row = self._conn.execute(
|
|
||||||
"SELECT MAX(m) FROM ("
|
|
||||||
" SELECT MAX(chunk_id) AS m FROM chunks"
|
|
||||||
" UNION ALL"
|
|
||||||
" SELECT MAX(chunk_id) AS m FROM deleted_chunks"
|
|
||||||
")"
|
|
||||||
).fetchone()
|
|
||||||
return row[0] if row[0] is not None else -1
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# Tier management
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
|
|
||||||
def record_access(self, file_path: str) -> None:
|
|
||||||
"""Update last_accessed timestamp for a file."""
|
|
||||||
self._conn.execute(
|
|
||||||
"UPDATE files SET last_accessed = ? WHERE file_path = ?",
|
|
||||||
(time.time(), file_path),
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def record_access_batch(self, file_paths: list[str]) -> None:
|
|
||||||
"""Batch-update last_accessed timestamps for multiple files."""
|
|
||||||
if not file_paths:
|
|
||||||
return
|
|
||||||
now = time.time()
|
|
||||||
self._conn.executemany(
|
|
||||||
"UPDATE files SET last_accessed = ? WHERE file_path = ?",
|
|
||||||
[(now, fp) for fp in file_paths],
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def classify_tiers(
|
|
||||||
self, hot_threshold_hours: int = 24, cold_threshold_hours: int = 168
|
|
||||||
) -> None:
|
|
||||||
"""Reclassify all files into hot/warm/cold tiers based on last_accessed.
|
|
||||||
|
|
||||||
- hot: last_accessed within hot_threshold_hours
|
|
||||||
- cold: last_accessed older than cold_threshold_hours (or never accessed)
|
|
||||||
- warm: everything in between
|
|
||||||
"""
|
|
||||||
now = time.time()
|
|
||||||
hot_cutoff = now - hot_threshold_hours * 3600
|
|
||||||
cold_cutoff = now - cold_threshold_hours * 3600
|
|
||||||
|
|
||||||
# Hot: recently accessed
|
|
||||||
self._conn.execute(
|
|
||||||
"UPDATE files SET tier = 'hot' "
|
|
||||||
"WHERE last_accessed IS NOT NULL AND last_accessed >= ?",
|
|
||||||
(hot_cutoff,),
|
|
||||||
)
|
|
||||||
# Cold: not accessed for a long time, or never accessed
|
|
||||||
self._conn.execute(
|
|
||||||
"UPDATE files SET tier = 'cold' "
|
|
||||||
"WHERE last_accessed IS NULL "
|
|
||||||
"OR (last_accessed < ? AND last_accessed < ?)",
|
|
||||||
(cold_cutoff, hot_cutoff),
|
|
||||||
)
|
|
||||||
# Warm: between hot and cold cutoffs
|
|
||||||
self._conn.execute(
|
|
||||||
"UPDATE files SET tier = 'warm' "
|
|
||||||
"WHERE last_accessed IS NOT NULL "
|
|
||||||
"AND last_accessed >= ? AND last_accessed < ?",
|
|
||||||
(cold_cutoff, hot_cutoff),
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def get_files_by_tier(self, tier: str) -> list[str]:
|
|
||||||
"""Return file paths in the specified tier ('hot', 'warm', or 'cold')."""
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT file_path FROM files WHERE tier = ?", (tier,)
|
|
||||||
).fetchall()
|
|
||||||
return [r[0] for r in rows]
|
|
||||||
|
|
||||||
def get_cold_files(self) -> list[str]:
|
|
||||||
"""Return file paths in the 'cold' tier."""
|
|
||||||
return self.get_files_by_tier("cold")
|
|
||||||
|
|
||||||
def get_file_tier(self, file_path: str) -> str | None:
|
|
||||||
"""Return the tier for a specific file, or None if not tracked."""
|
|
||||||
row = self._conn.execute(
|
|
||||||
"SELECT tier FROM files WHERE file_path = ?", (file_path,)
|
|
||||||
).fetchone()
|
|
||||||
return row[0] if row else None
|
|
||||||
|
|
||||||
    def close(self) -> None:
        """Close the underlying SQLite connection; the store is unusable after."""
        self._conn.close()
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,524 +0,0 @@
|
|||||||
"""MCP server for codexlens-search.
|
|
||||||
|
|
||||||
Exposes semantic code search tools via FastMCP for Claude Code integration.
|
|
||||||
Run as: codexlens-mcp (entry point) or python -m codexlens_search.mcp_server
|
|
||||||
|
|
||||||
## .mcp.json Configuration Examples
|
|
||||||
|
|
||||||
### API embedding + API reranker (single endpoint):
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "codexlens-mcp",
|
|
||||||
"env": {
|
|
||||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
|
||||||
"CODEXLENS_EMBED_API_KEY": "sk-xxx",
|
|
||||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
|
||||||
"CODEXLENS_EMBED_DIM": "1536",
|
|
||||||
"CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1",
|
|
||||||
"CODEXLENS_RERANKER_API_KEY": "jina-xxx",
|
|
||||||
"CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
### API embedding (multi-endpoint load balancing):
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "codexlens-mcp",
|
|
||||||
"env": {
|
|
||||||
"CODEXLENS_EMBED_API_ENDPOINTS": "url1|key1|model1,url2|key2|model2",
|
|
||||||
"CODEXLENS_EMBED_DIM": "1536",
|
|
||||||
"CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1",
|
|
||||||
"CODEXLENS_RERANKER_API_KEY": "jina-xxx",
|
|
||||||
"CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
### Local fastembed model (no API, requires codexlens-search[semantic]):
|
|
||||||
{
|
|
||||||
"mcpServers": {
|
|
||||||
"codexlens": {
|
|
||||||
"command": "codexlens-mcp",
|
|
||||||
"env": {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Pre-download models via CLI: codexlens-search download-models
|
|
||||||
|
|
||||||
### Env vars reference:
|
|
||||||
Embedding: CODEXLENS_EMBED_API_URL, _KEY, _MODEL, _ENDPOINTS (multi), _DIM
|
|
||||||
Reranker: CODEXLENS_RERANKER_API_URL, _KEY, _MODEL
|
|
||||||
Tuning: CODEXLENS_BINARY_TOP_K, _ANN_TOP_K, _FTS_TOP_K, _FUSION_K,
|
|
||||||
CODEXLENS_RERANKER_TOP_K, _RERANKER_BATCH_SIZE
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from mcp.server.fastmcp import Context, FastMCP
|
|
||||||
|
|
||||||
from codexlens_search.bridge import (
|
|
||||||
DEFAULT_EXCLUDES,
|
|
||||||
create_config_from_env,
|
|
||||||
create_pipeline,
|
|
||||||
should_exclude,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Module-level logger for this MCP server.
log = logging.getLogger("codexlens_search.mcp_server")

# FastMCP application instance; the tools below register onto it via @mcp.tool().
mcp = FastMCP("codexlens-search")

# Pipeline cache: keyed by resolved project_path -> (indexing, search, config)
_pipelines: dict[str, tuple] = {}
# Guards _pipelines against concurrent tool invocations.
_lock = threading.Lock()
|
|
||||||
|
|
||||||
|
|
||||||
def _db_path_for_project(project_path: str) -> Path:
|
|
||||||
"""Return the index database path for a project."""
|
|
||||||
return Path(project_path).resolve() / ".codexlens"
|
|
||||||
|
|
||||||
|
|
||||||
def _get_pipelines(project_path: str) -> tuple:
    """Get or create cached (indexing_pipeline, search_pipeline, config) for a project."""
    key = str(Path(project_path).resolve())
    with _lock:
        cached = _pipelines.get(key)
        if cached is None:
            # First request for this project: build and memoize the pipelines.
            db_path = _db_path_for_project(key)
            cached = create_pipeline(db_path, create_config_from_env(db_path))
            _pipelines[key] = cached
    return cached
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Search tools
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
@mcp.tool()
def search_code(
    project_path: str, query: str, top_k: int = 10, quality: str = "auto"
) -> str:
    """Semantic code search with hybrid fusion (vector + FTS + reranking).

    Args:
        project_path: Absolute path to the project root directory.
        query: Natural language or code search query.
        top_k: Maximum number of results to return (default 10).
        quality: Search quality tier (default "auto"):
            - "fast": FTS-only + rerank (no embedding needed, fastest)
            - "balanced": FTS + binary coarse search + rerank
            - "thorough": Full 2-stage vector + FTS + reranking (best quality)
            - "auto": Uses "thorough" if vector index exists, else "fast"

    Returns:
        Search results as formatted text with file paths, line numbers, scores, and code snippets.
    """
    # Validate inputs up front; tool errors are reported as plain strings.
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    db_path = _db_path_for_project(project_path)
    if not (db_path / "metadata.db").exists():
        return f"Error: no index found at {db_path}. Run index_project first."

    valid_qualities = ("fast", "balanced", "thorough", "auto")
    if quality not in valid_qualities:
        return f"Error: invalid quality '{quality}'. Must be one of: {', '.join(valid_qualities)}"

    _, search, _ = _get_pipelines(project_path)
    results = search.search(query, top_k=top_k, quality=quality)

    if not results:
        return "No results found."

    # Render each result as a markdown section with a fenced code snippet.
    lines = []
    for i, r in enumerate(results, 1):
        lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
        lines.append(f"```\n{r.content}\n```")
        lines.append("")
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
def search_scope(
    project_path: str,
    query: str,
    scope_path: str,
    top_k: int = 10,
    quality: str = "auto",
) -> str:
    """Search within a specific directory scope of a project.

    Runs a normal search then filters results to only include files
    under the specified scope path.

    Args:
        project_path: Absolute path to the project root directory.
        query: Natural language or code search query.
        scope_path: Relative directory path to limit search scope (e.g. "src/auth").
        top_k: Maximum number of scoped results to return (default 10).
        quality: Search quality tier ("fast", "balanced", "thorough", "auto").

    Returns:
        Search results filtered to the scope path.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    db_path = _db_path_for_project(project_path)
    if not (db_path / "metadata.db").exists():
        return f"Error: no index found at {db_path}. Run index_project first."

    # Normalize scope path for prefix matching (forward slashes, no edge "/").
    scope = scope_path.replace("\\", "/").strip("/")

    _, search, _ = _get_pipelines(project_path)
    # Fetch more results than top_k to account for filtering
    all_results = search.search(query, top_k=top_k * 5, quality=quality)

    # Filter by scope path prefix; result paths may use either separator.
    scoped = [
        r for r in all_results
        if r.path.replace("\\", "/").startswith(scope + "/")
        or r.path.replace("\\", "/") == scope
    ]

    if not scoped:
        return f"No results found in scope '{scope_path}'."

    # Render the surviving top_k results as markdown sections.
    lines = []
    for i, r in enumerate(scoped[:top_k], 1):
        lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})")
        lines.append(f"```\n{r.content}\n```")
        lines.append("")
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Indexing tools
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
@mcp.tool()
async def index_project(
    project_path: str, glob_pattern: str = "**/*", force: bool = False,
    ctx: Context | None = None,
) -> str:
    """Build or rebuild the search index for a project.

    Args:
        project_path: Absolute path to the project root directory.
        glob_pattern: Glob pattern for files to index (default "**/*").
        force: If True, rebuild index from scratch even if it exists.
        ctx: Optional MCP context used for progress reporting.

    Returns:
        Indexing summary with file count, chunk count, and duration.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    if force:
        # Drop the cached pipeline so a fresh one (and index) gets created.
        with _lock:
            _pipelines.pop(str(root), None)

    indexing, _, _ = _get_pipelines(project_path)

    file_paths = [
        p for p in root.glob(glob_pattern)
        if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES)
    ]

    if ctx:
        await ctx.report_progress(0, len(file_paths), f"Scanning {len(file_paths)} files...")

    # Progress callback bridging sync pipeline → async MCP context.
    # Fix: get_running_loop() is the correct call inside a coroutine;
    # asyncio.get_event_loop() here is deprecated since Python 3.10.
    loop = asyncio.get_running_loop()

    def _progress(done: int, total: int) -> None:
        if ctx:
            asyncio.run_coroutine_threadsafe(
                ctx.report_progress(done, total, f"Indexed {done}/{total} files"),
                loop,
            )

    stats = indexing.sync(file_paths, root=root, progress_callback=_progress)

    if ctx:
        await ctx.report_progress(
            stats.files_processed, stats.files_processed,
            f"Done: {stats.files_processed} files, {stats.chunks_created} chunks"
        )

    return (
        f"Indexed {stats.files_processed} files, "
        f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s. "
        f"DB: {_db_path_for_project(project_path)}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
def index_status(project_path: str) -> str:
    """Show index statistics for a project.

    Args:
        project_path: Absolute path to the project root directory.

    Returns:
        Index statistics including file count, chunk count, and deleted chunks.
    """
    # Function-local import of the metadata store.
    from codexlens_search.indexing.metadata import MetadataStore

    db_path = _db_path_for_project(project_path)
    meta_path = db_path / "metadata.db"

    if not meta_path.exists():
        return f"No index found at {db_path}. Run index_project first."

    metadata = MetadataStore(meta_path)
    all_files = metadata.get_all_files()
    deleted_ids = metadata.get_deleted_ids()
    max_chunk = metadata.max_chunk_id()

    # max_chunk_id() returns -1 for an empty store, so +1 yields the count.
    total = max_chunk + 1 if max_chunk >= 0 else 0
    return (
        f"Index: {db_path}\n"
        f"Files tracked: {len(all_files)}\n"
        f"Total chunks: {total}\n"
        f"Deleted chunks: {len(deleted_ids)}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
async def index_update(
    project_path: str, glob_pattern: str = "**/*",
    ctx: Context | None = None,
) -> str:
    """Incrementally sync the index with current project files.

    Only re-indexes files that changed since last indexing.

    Args:
        project_path: Absolute path to the project root directory.
        glob_pattern: Glob pattern for files to sync (default "**/*").
        ctx: Optional MCP context used for progress reporting.

    Returns:
        Sync summary with processed file count and duration.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    indexing, _, _ = _get_pipelines(project_path)

    file_paths = [
        p for p in root.glob(glob_pattern)
        if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES)
    ]

    if ctx:
        await ctx.report_progress(0, len(file_paths), f"Scanning {len(file_paths)} files...")

    # Fix: get_running_loop() is the correct call inside a coroutine;
    # asyncio.get_event_loop() here is deprecated since Python 3.10.
    loop = asyncio.get_running_loop()

    def _progress(done: int, total: int) -> None:
        # Bridge the sync pipeline's callback into the async MCP context.
        if ctx:
            asyncio.run_coroutine_threadsafe(
                ctx.report_progress(done, total, f"Synced {done}/{total} files"),
                loop,
            )

    stats = indexing.sync(file_paths, root=root, progress_callback=_progress)
    return (
        f"Synced {stats.files_processed} files, "
        f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s."
    )
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
def index_scope(
    project_path: str,
    scope_path: str,
    glob_pattern: str = "**/*",
    tier: str = "full",
) -> str:
    """Index a specific directory scope within a project.

    Useful for quickly indexing a subdirectory (e.g. after editing files
    in a specific module) without re-indexing the entire project.

    Args:
        project_path: Absolute path to the project root directory.
        scope_path: Relative directory path to index (e.g. "src/auth").
        glob_pattern: Glob pattern for files within scope (default "**/*").
        tier: Indexing tier - "full" (default) runs full pipeline with
            embedding, "fts_only" indexes text only (faster, no vectors).

    Returns:
        Indexing summary for the scoped directory.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    scope_dir = root / scope_path
    if not scope_dir.is_dir():
        return f"Error: scope directory not found: {scope_dir}"

    valid_tiers = ("full", "fts_only")
    if tier not in valid_tiers:
        return f"Error: invalid tier '{tier}'. Must be one of: {', '.join(valid_tiers)}"

    indexing, _, _ = _get_pipelines(project_path)

    # Glob inside the scope, but exclusions are evaluated project-relative.
    file_paths = [
        p for p in scope_dir.glob(glob_pattern)
        if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES)
    ]

    if not file_paths:
        return f"No files found in {scope_path} matching '{glob_pattern}'."

    stats = indexing.sync(file_paths, root=root, tier=tier)
    tier_label = "FTS-only" if tier == "fts_only" else "full"
    return (
        f"Indexed {stats.files_processed} files ({tier_label}), "
        f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s. "
        f"Scope: {scope_path}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# File discovery
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
@mcp.tool()
def find_files(
    project_path: str, pattern: str = "**/*", max_results: int = 100
) -> str:
    """Find files in a project by glob pattern.

    Args:
        project_path: Absolute path to the project root directory.
        pattern: Glob pattern to match files (default "**/*").
        max_results: Maximum number of file paths to return (default 100).

    Returns:
        List of matching file paths (relative to project root), one per line.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    matches: list[str] = []
    for candidate in root.glob(pattern):
        if not candidate.is_file():
            continue
        rel = candidate.relative_to(root)
        if should_exclude(rel, DEFAULT_EXCLUDES):
            continue
        matches.append(str(rel))
        if len(matches) >= max_results:
            break  # cap output size for the MCP client

    if not matches:
        return "No files found matching the pattern."

    header = f"Found {len(matches)} files"
    if len(matches) >= max_results:
        header += f" (limited to {max_results})"
    return header + ":\n" + "\n".join(matches)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Model management tools
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
@mcp.tool()
def list_models() -> str:
    """List available embedding and reranker models with cache status.

    Shows which models are downloaded locally and ready for use.
    Models are needed when using local fastembed mode (no API URL configured).

    Returns:
        Table of models with name, type, and installed status.
    """
    # Fix: removed the unused function-local `Config` import.
    from codexlens_search import model_manager

    config = create_config_from_env(".")
    models = model_manager.list_known_models(config)

    if not models:
        return "No known models found."

    # Markdown table: one row per known model.
    lines = ["| Model | Type | Installed |", "| --- | --- | --- |"]
    for m in models:
        status = "Yes" if m["installed"] else "No"
        lines.append(f"| {m['name']} | {m['type']} | {status} |")

    # Show which embedding mode the current environment selects.
    lines.append("")
    if config.embed_api_url:
        lines.append(f"Mode: API embedding ({config.embed_api_url})")
    else:
        lines.append(f"Mode: Local fastembed (model: {config.embed_model})")
    return "\n".join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
def download_models(embed_model: str = "", reranker_model: str = "") -> str:
    """Download embedding and reranker models for local (fastembed) mode.

    Not needed when using API embedding (CODEXLENS_EMBED_API_URL is set).
    Downloads are cached — subsequent calls are no-ops if already downloaded.

    Args:
        embed_model: Embedding model name (default: BAAI/bge-small-en-v1.5).
        reranker_model: Reranker model name (default: Xenova/ms-marco-MiniLM-L-6-v2).

    Returns:
        Download status for each model.
    """
    # Fix: removed the unused function-local `Config` import.
    from codexlens_search import model_manager

    config = create_config_from_env(".")
    # Empty string means "keep the configured default".
    if embed_model:
        config.embed_model = embed_model
    if reranker_model:
        config.reranker_model = reranker_model

    results = []
    for name, kind in [
        (config.embed_model, "embedding"),
        (config.reranker_model, "reranker"),
    ]:
        try:
            model_manager.ensure_model(name, config)
            results.append(f"{kind}: {name} — ready")
        except Exception as e:
            # Best-effort: report the failure instead of crashing the tool.
            results.append(f"{kind}: {name} — failed: {e}")

    return "\n".join(results)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Entry point
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def main() -> None:
    """Entry point for codexlens-mcp command."""
    # Root-level logging setup, then hand control to the FastMCP server.
    log_format = "%(levelname)s %(name)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    mcp.run()
|
|
||||||
|
|
||||||
|
|
||||||
# Allow running as a module: python -m codexlens_search.mcp_server
if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,242 +0,0 @@
|
|||||||
"""Lightweight model download manager for fastembed models.
|
|
||||||
|
|
||||||
Handles HuggingFace mirror configuration and cache pre-population so that
|
|
||||||
fastembed can load models from local cache without network access.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from .config import Config
|
|
||||||
|
|
||||||
# Module-level logger for download / cache diagnostics.
log = logging.getLogger(__name__)

# Models that fastembed maps internally (HF repo may differ from model_name)
# File glob patterns worth fetching for each model kind (weights + configs).
_EMBED_MODEL_FILES = ["*.onnx", "*.json"]
_RERANK_MODEL_FILES = ["*.onnx", "*.json"]
|
|
||||||
|
|
||||||
|
|
||||||
def _resolve_cache_dir(config: Config) -> str | None:
|
|
||||||
"""Return cache_dir for fastembed, or None for default."""
|
|
||||||
return config.model_cache_dir or None
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_mirror(config: Config) -> None:
|
|
||||||
"""Set HF_ENDPOINT env var if mirror is configured."""
|
|
||||||
if config.hf_mirror:
|
|
||||||
os.environ["HF_ENDPOINT"] = config.hf_mirror
|
|
||||||
|
|
||||||
|
|
||||||
def _model_is_cached(model_name: str, cache_dir: str | None) -> bool:
    """Check if a model already exists in the fastembed/HF hub cache.

    Note: fastembed may remap model names internally (e.g. BAAI/bge-small-en-v1.5
    -> qdrant/bge-small-en-v1.5-onnx-q), so a partial-name scan is used as a
    fallback after the exact hub-layout lookup.
    """
    root = Path(cache_dir or _default_fastembed_cache())
    if not root.exists():
        return False

    # Exact match using the HF hub directory convention (models--org--name).
    exact_dir = root / ("models--" + model_name.replace("/", "--"))
    if _dir_has_onnx(exact_dir):
        return True

    # Fallback: any cache entry whose name contains the model's short name.
    needle = model_name.split("/")[-1].lower()
    return any(
        needle in entry.name.lower() and _dir_has_onnx(entry)
        for entry in root.iterdir()
    )
|
|
||||||
|
|
||||||
|
|
||||||
def _dir_has_onnx(model_dir: Path) -> bool:
|
|
||||||
"""Check if a model directory has at least one ONNX file in snapshots."""
|
|
||||||
snapshots = model_dir / "snapshots"
|
|
||||||
if not snapshots.exists():
|
|
||||||
return False
|
|
||||||
for snap in snapshots.iterdir():
|
|
||||||
if list(snap.rglob("*.onnx")):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _default_fastembed_cache() -> str:
|
|
||||||
"""Return fastembed's default cache directory."""
|
|
||||||
return os.path.join(os.environ.get("TMPDIR", os.path.join(
|
|
||||||
os.environ.get("LOCALAPPDATA", os.path.expanduser("~")),
|
|
||||||
)), "fastembed_cache")
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_model(model_name: str, config: Config) -> None:
    """Ensure a model is available in the local cache.

    If the model is already cached, this is a no-op.
    If not cached, attempts to download via huggingface_hub with mirror support.
    Download failures are logged as warnings, not raised (best-effort).

    Args:
        model_name: HuggingFace repo ID of the model (e.g. "BAAI/bge-small-en-v1.5").
        config: Active configuration providing cache dir and optional HF mirror.
    """
    cache_dir = _resolve_cache_dir(config)

    if _model_is_cached(model_name, cache_dir):
        log.debug("Model %s found in cache", model_name)
        return

    log.info("Model %s not in cache, downloading...", model_name)
    _apply_mirror(config)

    try:
        from huggingface_hub import snapshot_download

        # Only ONNX weights and JSON configs are fetched.
        kwargs: dict = {
            "repo_id": model_name,
            "allow_patterns": ["*.onnx", "*.json"],
        }
        if cache_dir:
            kwargs["cache_dir"] = cache_dir
        if config.hf_mirror:
            kwargs["endpoint"] = config.hf_mirror

        path = snapshot_download(**kwargs)
        log.info("Model %s downloaded to %s", model_name, path)

        # fastembed for some reranker models expects model.onnx but repo may
        # only have quantized variants. Create a symlink/copy if needed.
        _ensure_model_onnx(Path(path))

    except ImportError:
        log.warning(
            "huggingface_hub not installed. Cannot download models. "
            "Install with: pip install huggingface-hub"
        )
    except Exception as e:
        log.warning("Failed to download model %s: %s", model_name, e)
|
|
||||||
|
|
||||||
def _ensure_model_onnx(model_dir: Path) -> None:
|
|
||||||
"""If model.onnx is missing but a quantized variant exists, copy it."""
|
|
||||||
onnx_dir = model_dir / "onnx"
|
|
||||||
if not onnx_dir.exists():
|
|
||||||
onnx_dir = model_dir # some models put onnx at root
|
|
||||||
|
|
||||||
target = onnx_dir / "model.onnx"
|
|
||||||
if target.exists():
|
|
||||||
return
|
|
||||||
|
|
||||||
# Look for quantized alternatives
|
|
||||||
for name in ["model_quantized.onnx", "model_optimized.onnx",
|
|
||||||
"model_int8.onnx", "model_uint8.onnx"]:
|
|
||||||
candidate = onnx_dir / name
|
|
||||||
if candidate.exists():
|
|
||||||
import shutil
|
|
||||||
shutil.copy2(candidate, target)
|
|
||||||
log.info("Copied %s -> model.onnx", name)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
def list_known_models(config: Config) -> list[dict]:
    """Return info for known embed/reranker models with cache status.

    Checks config defaults plus common alternative models.
    Returns list of dicts with keys: name, type, installed, cache_path.
    """
    cache_dir = _resolve_cache_dir(config)
    base = cache_dir or _default_fastembed_cache()

    # Config defaults listed first so they win deduplication over the
    # common-alternative entries below them.
    catalog: list[tuple[str, list[str]]] = [
        (
            "embedding",
            [
                config.embed_model,
                "BAAI/bge-small-en-v1.5",
                "BAAI/bge-base-en-v1.5",
                "BAAI/bge-large-en-v1.5",
                "sentence-transformers/all-MiniLM-L6-v2",
            ],
        ),
        (
            "reranker",
            [
                config.reranker_model,
                "Xenova/ms-marco-MiniLM-L-6-v2",
                "BAAI/bge-reranker-base",
                "BAAI/bge-reranker-v2-m3",
            ],
        ),
    ]

    seen: set[str] = set()
    infos: list[dict] = []
    for model_type, names in catalog:
        for name in names:
            if name in seen:
                continue
            seen.add(name)
            cached_at = _find_model_cache_path(name, base)
            infos.append(
                {
                    "name": name,
                    "type": model_type,
                    "installed": cached_at is not None,
                    "cache_path": cached_at,
                }
            )
    return infos
|
|
||||||
|
|
||||||
def delete_model(model_name: str, config: Config) -> bool:
    """Remove a model from the HF/fastembed cache.

    Returns True if deleted, False if not found.
    """
    import shutil

    base = _resolve_cache_dir(config) or _default_fastembed_cache()
    target = _find_model_cache_path(model_name, base)

    if target is None:
        log.warning("Model %s not found in cache", model_name)
        return False

    # Remove the whole cached snapshot directory for this model.
    shutil.rmtree(target)
    log.info("Deleted model %s from %s", model_name, target)
    return True
|
|
||||||
|
|
||||||
def _find_model_cache_path(model_name: str, base: str) -> str | None:
    """Find the cache directory path for a model, or None if not cached."""
    root = Path(base)
    if not root.exists():
        return None

    # Exact match first: the HF cache layout is models--{org}--{name}.
    safe_name = model_name.replace("/", "--")
    exact = root / f"models--{safe_name}"
    if _dir_has_onnx(exact):
        return str(exact)

    # Partial match: fastembed remaps some model names.
    needle = model_name.split("/")[-1].lower()
    for entry in root.iterdir():
        if needle in entry.name.lower() and _dir_has_onnx(entry):
            return str(entry)

    return None
|
|
||||||
|
|
||||||
def get_cache_kwargs(config: Config) -> dict:
    """Return kwargs to pass to fastembed constructors for cache_dir."""
    cache_dir = _resolve_cache_dir(config)
    # Omit the key entirely when no explicit cache dir is configured so
    # fastembed falls back to its own default location.
    return {"cache_dir": cache_dir} if cache_dir else {}
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
from .base import BaseReranker
|
|
||||||
from .local import FastEmbedReranker
|
|
||||||
from .api import APIReranker
|
|
||||||
|
|
||||||
__all__ = ["BaseReranker", "FastEmbedReranker", "APIReranker"]
|
|
||||||
@@ -1,103 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from .base import BaseReranker
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class APIReranker(BaseReranker):
    """Reranker backed by a remote HTTP API (SiliconFlow/Cohere/Jina format)."""

    def __init__(self, config: Config) -> None:
        """Build a persistent httpx client with bearer-token auth headers."""
        self._config = config
        # One long-lived client so TCP connections are reused across batches.
        self._client = httpx.Client(
            headers={
                "Authorization": f"Bearer {config.reranker_api_key}",
                "Content-Type": "application/json",
            },
        )

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score each document against *query* via the remote rerank endpoint.

        Documents are packed into token-budgeted batches; per-batch scores
        are merged back into one list aligned with the input order.
        """
        if not documents:
            return []
        max_tokens = self._config.reranker_api_max_tokens_per_batch
        batches = self._split_batches(documents, max_tokens)
        # Default 0.0 so documents missing from an API response keep a score.
        scores = [0.0] * len(documents)
        for batch in batches:
            batch_scores = self._call_api_with_retry(query, batch)
            for orig_idx, score in batch_scores.items():
                scores[orig_idx] = score
        return scores

    def _split_batches(
        self, documents: list[str], max_tokens: int
    ) -> list[list[tuple[int, str]]]:
        """Greedily pack (original_index, text) pairs into batches.

        Token counts are approximated as len(text) // 4; a single oversized
        document still forms its own batch (the size check only triggers
        when the current batch is non-empty).
        """
        batches: list[list[tuple[int, str]]] = []
        current_batch: list[tuple[int, str]] = []
        current_tokens = 0

        for idx, text in enumerate(documents):
            doc_tokens = len(text) // 4  # rough chars-per-token heuristic
            if current_tokens + doc_tokens > max_tokens and current_batch:
                batches.append(current_batch)
                current_batch = []
                current_tokens = 0
            current_batch.append((idx, text))
            current_tokens += doc_tokens

        if current_batch:
            batches.append(current_batch)

        return batches

    def _call_api_with_retry(
        self,
        query: str,
        docs: list[tuple[int, str]],
        max_retries: int = 3,
    ) -> dict[int, float]:
        """POST one batch to the /rerank endpoint with exponential backoff.

        Retries on connection errors and HTTP 429/503. Returns a mapping of
        original document index -> relevance score. Raises RuntimeError when
        all attempts fail; other HTTP error codes propagate immediately via
        raise_for_status().
        """
        url = self._config.reranker_api_url.rstrip("/") + "/rerank"
        payload = {
            "model": self._config.reranker_api_model,
            "query": query,
            "documents": [t for _, t in docs],
        }

        last_exc: Exception | None = None
        for attempt in range(max_retries):
            try:
                response = self._client.post(url, json=payload)
            except Exception as exc:
                # Transport-level failure: remember it and back off.
                last_exc = exc
                time.sleep((2 ** attempt) * 0.5)
                continue

            if response.status_code in (429, 503):
                # Rate-limited / temporarily unavailable: back off and retry.
                logger.warning(
                    "API reranker returned HTTP %s (attempt %d/%d), retrying...",
                    response.status_code,
                    attempt + 1,
                    max_retries,
                )
                time.sleep((2 ** attempt) * 0.5)
                continue

            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            scores: dict[int, float] = {}
            for item in results:
                # The API returns indexes local to this batch; translate
                # back to the caller's original document positions.
                local_idx = int(item["index"])
                orig_idx = docs[local_idx][0]
                scores[orig_idx] = float(item["relevance_score"])
            return scores

        raise RuntimeError(
            f"API reranker failed after {max_retries} attempts. Last error: {last_exc}"
        )
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
|
|
||||||
|
|
||||||
class BaseReranker(ABC):
    """Interface for rerankers that score query/document relevance pairs."""

    @abstractmethod
    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score (query, doc) pairs. Returns list of floats same length as documents."""
|
|
||||||
@@ -1,39 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from .base import BaseReranker
|
|
||||||
|
|
||||||
|
|
||||||
class FastEmbedReranker(BaseReranker):
    """Local reranker backed by fastembed TextCrossEncoder."""

    def __init__(self, config: Config) -> None:
        self._config = config
        self._model = None  # constructed lazily on first score_pairs call

    def _load(self) -> None:
        """Ensure the model is downloaded and build the cross-encoder once."""
        if self._model is not None:
            return

        from .. import model_manager

        model_manager.ensure_model(self._config.reranker_model, self._config)

        from fastembed.rerank.cross_encoder import TextCrossEncoder

        cache_kwargs = model_manager.get_cache_kwargs(self._config)
        self._model = TextCrossEncoder(
            model_name=self._config.reranker_model,
            **cache_kwargs,
        )

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score documents against the query with the local cross-encoder."""
        self._load()
        raw = list(self._model.rerank(query, documents))
        if not raw:
            return [0.0] * len(documents)
        # fastembed may return list[float] or list[RerankResult] depending on version
        if isinstance(raw[0], (int, float)):
            return [float(value) for value in raw]
        # Older format: objects with .index and .score
        aligned = [0.0] * len(documents)
        for item in raw:
            aligned[item.index] = float(item.score)
        return aligned
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
from .fts import FTSEngine
|
|
||||||
from .fusion import reciprocal_rank_fusion, detect_query_intent, QueryIntent, DEFAULT_WEIGHTS
|
|
||||||
from .pipeline import SearchPipeline, SearchResult
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"FTSEngine", "reciprocal_rank_fusion", "detect_query_intent",
|
|
||||||
"QueryIntent", "DEFAULT_WEIGHTS", "SearchPipeline", "SearchResult",
|
|
||||||
]
|
|
||||||
@@ -1,133 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import sqlite3
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
class FTSEngine:
|
|
||||||
def __init__(self, db_path: str | Path) -> None:
|
|
||||||
self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
|
|
||||||
self._conn.execute(
|
|
||||||
"CREATE VIRTUAL TABLE IF NOT EXISTS docs "
|
|
||||||
"USING fts5(content, tokenize='porter unicode61')"
|
|
||||||
)
|
|
||||||
self._conn.execute(
|
|
||||||
"CREATE TABLE IF NOT EXISTS docs_meta "
|
|
||||||
"(id INTEGER PRIMARY KEY, path TEXT, "
|
|
||||||
"start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)"
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
self._migrate_line_columns()
|
|
||||||
|
|
||||||
def _migrate_line_columns(self) -> None:
|
|
||||||
"""Add start_line/end_line columns if missing (for pre-existing DBs)."""
|
|
||||||
cols = {
|
|
||||||
row[1]
|
|
||||||
for row in self._conn.execute("PRAGMA table_info(docs_meta)").fetchall()
|
|
||||||
}
|
|
||||||
for col in ("start_line", "end_line"):
|
|
||||||
if col not in cols:
|
|
||||||
self._conn.execute(
|
|
||||||
f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0"
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def add_documents(self, docs: list[tuple]) -> None:
|
|
||||||
"""Add documents in batch.
|
|
||||||
|
|
||||||
docs: list of (id, path, content) or (id, path, content, start_line, end_line).
|
|
||||||
"""
|
|
||||||
if not docs:
|
|
||||||
return
|
|
||||||
meta_rows = []
|
|
||||||
fts_rows = []
|
|
||||||
for doc in docs:
|
|
||||||
if len(doc) >= 5:
|
|
||||||
doc_id, path, content, sl, el = doc[0], doc[1], doc[2], doc[3], doc[4]
|
|
||||||
else:
|
|
||||||
doc_id, path, content = doc[0], doc[1], doc[2]
|
|
||||||
sl, el = 0, 0
|
|
||||||
meta_rows.append((doc_id, path, sl, el))
|
|
||||||
fts_rows.append((doc_id, content))
|
|
||||||
self._conn.executemany(
|
|
||||||
"INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) "
|
|
||||||
"VALUES (?, ?, ?, ?)",
|
|
||||||
meta_rows,
|
|
||||||
)
|
|
||||||
self._conn.executemany(
|
|
||||||
"INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)",
|
|
||||||
fts_rows,
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def exact_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
|
|
||||||
"""FTS5 MATCH query, return (id, bm25_score) sorted by score descending."""
|
|
||||||
try:
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT rowid, bm25(docs) AS score FROM docs "
|
|
||||||
"WHERE docs MATCH ? ORDER BY score LIMIT ?",
|
|
||||||
(query, top_k),
|
|
||||||
).fetchall()
|
|
||||||
except sqlite3.OperationalError:
|
|
||||||
return []
|
|
||||||
# bm25 in SQLite FTS5 returns negative values (lower = better match)
|
|
||||||
# Negate so higher is better
|
|
||||||
return [(int(row[0]), -float(row[1])) for row in rows]
|
|
||||||
|
|
||||||
def fuzzy_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]:
|
|
||||||
"""Prefix search: each token + '*', return (id, score) sorted descending."""
|
|
||||||
tokens = query.strip().split()
|
|
||||||
if not tokens:
|
|
||||||
return []
|
|
||||||
prefix_query = " ".join(t + "*" for t in tokens)
|
|
||||||
try:
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT rowid, bm25(docs) AS score FROM docs "
|
|
||||||
"WHERE docs MATCH ? ORDER BY score LIMIT ?",
|
|
||||||
(prefix_query, top_k),
|
|
||||||
).fetchall()
|
|
||||||
except sqlite3.OperationalError:
|
|
||||||
return []
|
|
||||||
return [(int(row[0]), -float(row[1])) for row in rows]
|
|
||||||
|
|
||||||
def get_content(self, doc_id: int) -> str:
|
|
||||||
"""Retrieve content for a doc_id."""
|
|
||||||
row = self._conn.execute(
|
|
||||||
"SELECT content FROM docs WHERE rowid = ?", (doc_id,)
|
|
||||||
).fetchone()
|
|
||||||
return row[0] if row else ""
|
|
||||||
|
|
||||||
def get_chunk_ids_by_path(self, path: str) -> list[int]:
|
|
||||||
"""Return all doc IDs associated with a given file path."""
|
|
||||||
rows = self._conn.execute(
|
|
||||||
"SELECT id FROM docs_meta WHERE path = ?", (path,)
|
|
||||||
).fetchall()
|
|
||||||
return [r[0] for r in rows]
|
|
||||||
|
|
||||||
def delete_by_path(self, path: str) -> int:
|
|
||||||
"""Delete all docs and docs_meta rows for a given file path.
|
|
||||||
|
|
||||||
Returns the number of deleted documents.
|
|
||||||
"""
|
|
||||||
ids = self.get_chunk_ids_by_path(path)
|
|
||||||
if not ids:
|
|
||||||
return 0
|
|
||||||
placeholders = ",".join("?" for _ in ids)
|
|
||||||
self._conn.execute(
|
|
||||||
f"DELETE FROM docs WHERE rowid IN ({placeholders})", ids
|
|
||||||
)
|
|
||||||
self._conn.execute(
|
|
||||||
f"DELETE FROM docs_meta WHERE id IN ({placeholders})", ids
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
return len(ids)
|
|
||||||
|
|
||||||
def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]:
|
|
||||||
"""Return (path, start_line, end_line) for a doc_id."""
|
|
||||||
row = self._conn.execute(
|
|
||||||
"SELECT path, start_line, end_line FROM docs_meta WHERE id = ?",
|
|
||||||
(doc_id,),
|
|
||||||
).fetchone()
|
|
||||||
if row:
|
|
||||||
return row[0], row[1] or 0, row[2] or 0
|
|
||||||
return "", 0, 0
|
|
||||||
@@ -1,106 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
# Default per-source fusion weights used by reciprocal_rank_fusion.
DEFAULT_WEIGHTS: dict[str, float] = {
    "exact": 0.25,
    "fuzzy": 0.10,
    "vector": 0.50,
    "graph": 0.15,
}

# Heuristic signals used by detect_query_intent below.
_CODE_CAMEL_RE = re.compile(r"[a-z][A-Z]")  # camelCase boundary
_CODE_SNAKE_RE = re.compile(r"\b[a-z_]+_[a-z_]+\b")  # snake_case identifier
_CODE_SYMBOLS_RE = re.compile(r"[.\[\](){}]|->|::")  # punctuation common in code
_CODE_KEYWORDS_RE = re.compile(r"\b(import|def|class|return|from|async|await|lambda|yield)\b")  # Python keywords
_QUESTION_WORDS_RE = re.compile(r"\b(how|what|why|when|where|which|who|does|do|is|are|can|should)\b", re.IGNORECASE)  # NL question markers
|
|
||||||
|
|
||||||
|
|
||||||
class QueryIntent(Enum):
    """Coarse classification of a search query's style."""

    CODE_SYMBOL = "code_symbol"
    NATURAL_LANGUAGE = "natural"
    MIXED = "mixed"
|
|
||||||
|
|
||||||
|
|
||||||
def detect_query_intent(query: str) -> QueryIntent:
    """Detect whether query is a code symbol, natural language, or mixed.

    Accumulates weighted "code-like" and "natural-language-like" signals,
    then classifies: strong one-sided evidence wins outright, otherwise the
    larger signal wins, and ties fall back to MIXED.
    """
    words = query.strip().split()
    word_count = len(words)

    code_signals = 0
    natural_signals = 0

    # Code-style evidence: identifier casing, code punctuation, keywords,
    # inline backticks, and very short queries.
    if _CODE_CAMEL_RE.search(query):
        code_signals += 2
    if _CODE_SNAKE_RE.search(query):
        code_signals += 2
    if _CODE_SYMBOLS_RE.search(query):
        code_signals += 2
    if _CODE_KEYWORDS_RE.search(query):
        code_signals += 2
    if "`" in query:
        code_signals += 1
    if word_count < 4:
        code_signals += 1

    # Natural-language evidence: question words and longer phrasing.
    if _QUESTION_WORDS_RE.search(query):
        natural_signals += 2
    if word_count > 5:
        natural_signals += 2
    if code_signals == 0 and word_count >= 3:
        natural_signals += 1

    # Strong one-sided evidence wins outright.
    # NOTE: the original had a second, unreachable copy of the CODE_SYMBOL
    # check here; it has been removed without changing behavior.
    if code_signals >= 2 and natural_signals == 0:
        return QueryIntent.CODE_SYMBOL
    if natural_signals >= 2 and code_signals == 0:
        return QueryIntent.NATURAL_LANGUAGE
    # Otherwise the stronger side wins; ties are MIXED.
    if natural_signals > code_signals:
        return QueryIntent.NATURAL_LANGUAGE
    if code_signals > natural_signals:
        return QueryIntent.CODE_SYMBOL
    return QueryIntent.MIXED
|
|
||||||
|
|
||||||
|
|
||||||
def get_adaptive_weights(intent: QueryIntent, base: dict | None = None) -> dict[str, float]:
    """Return weights adapted to query intent."""
    weights = dict(base or DEFAULT_WEIGHTS)
    overrides = {
        QueryIntent.CODE_SYMBOL: {"exact": 0.45, "vector": 0.35},
        QueryIntent.NATURAL_LANGUAGE: {"vector": 0.65, "exact": 0.15},
    }
    # MIXED (or any unknown intent) keeps the base weights untouched.
    weights.update(overrides.get(intent, {}))
    return weights
|
|
||||||
|
|
||||||
|
|
||||||
def reciprocal_rank_fusion(
|
|
||||||
results: dict[str, list[tuple[int, float]]],
|
|
||||||
weights: dict[str, float] | None = None,
|
|
||||||
k: int = 60,
|
|
||||||
) -> list[tuple[int, float]]:
|
|
||||||
"""Fuse ranked result lists using Reciprocal Rank Fusion.
|
|
||||||
|
|
||||||
results: {source_name: [(doc_id, score), ...]} each list sorted desc by score.
|
|
||||||
weights: weight per source (defaults to equal weight across all sources).
|
|
||||||
k: RRF constant (default 60).
|
|
||||||
Returns sorted list of (doc_id, fused_score) descending.
|
|
||||||
"""
|
|
||||||
if not results:
|
|
||||||
return []
|
|
||||||
|
|
||||||
sources = list(results.keys())
|
|
||||||
if weights is None:
|
|
||||||
equal_w = 1.0 / len(sources)
|
|
||||||
weights = {s: equal_w for s in sources}
|
|
||||||
|
|
||||||
scores: dict[int, float] = {}
|
|
||||||
for source, ranked_list in results.items():
|
|
||||||
w = weights.get(source, 0.0)
|
|
||||||
for rank, (doc_id, _) in enumerate(ranked_list, start=1):
|
|
||||||
scores[doc_id] = scores.get(doc_id, 0.0) + w * (1.0 / (k + rank))
|
|
||||||
|
|
||||||
return sorted(scores.items(), key=lambda x: x[1], reverse=True)
|
|
||||||
@@ -1,353 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from ..config import Config
|
|
||||||
from ..core.base import BaseANNIndex, BaseBinaryIndex
|
|
||||||
from ..embed import BaseEmbedder
|
|
||||||
from ..indexing.metadata import MetadataStore
|
|
||||||
from ..rerank import BaseReranker
|
|
||||||
from .fts import FTSEngine
|
|
||||||
from .fusion import (
|
|
||||||
DEFAULT_WEIGHTS,
|
|
||||||
detect_query_intent,
|
|
||||||
get_adaptive_weights,
|
|
||||||
reciprocal_rank_fusion,
|
|
||||||
)
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_VALID_QUALITIES = ("fast", "balanced", "thorough", "auto")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class SearchResult:
    """One search hit with location metadata and content."""

    # Chunk/document id shared by the FTS and vector indexes.
    id: int
    # Source file path as stored in docs_meta.
    path: str
    # Final relevance score (reranker score or fused RRF score).
    score: float
    # Leading slice of the chunk content for display.
    snippet: str = ""
    # Start line of the chunk in the file (0 when unknown).
    line: int = 0
    # End line of the chunk in the file (0 when unknown).
    end_line: int = 0
    # Full chunk content.
    content: str = ""
|
|
||||||
|
|
||||||
|
|
||||||
class SearchPipeline:
    """Hybrid search orchestrator: FTS + binary/ANN vector search + reranking.

    Per-source rankings are fused with reciprocal rank fusion, tombstoned
    chunks are filtered out, the top candidates are reranked, and file
    accesses are recorded for data-tier tracking.
    """

    def __init__(
        self,
        embedder: BaseEmbedder,
        binary_store: BaseBinaryIndex,
        ann_index: BaseANNIndex,
        reranker: BaseReranker,
        fts: FTSEngine,
        config: Config,
        metadata_store: MetadataStore | None = None,
    ) -> None:
        """Wire together the retrieval components; metadata_store is optional."""
        self._embedder = embedder
        self._binary_store = binary_store
        self._ann_index = ann_index
        self._reranker = reranker
        self._fts = fts
        self._config = config
        self._metadata_store = metadata_store

    # -- Helper: check if vector index has data ----------------------------

    def _has_vector_index(self) -> bool:
        """Check if the binary store has any indexed entries."""
        try:
            return len(self._binary_store) > 0
        except Exception:
            # A store that cannot report its size is treated as empty.
            return False

    # -- Helper: vector search (binary coarse + ANN fine) -----------------

    def _vector_search(
        self, query_vec: np.ndarray
    ) -> list[tuple[int, float]]:
        """Run binary coarse search then ANN fine search and intersect."""
        cfg = self._config

        # Binary coarse search -> candidate_ids set
        candidate_ids_list, _ = self._binary_store.coarse_search(
            query_vec, top_k=cfg.binary_top_k
        )
        candidate_ids = set(candidate_ids_list)

        # ANN fine search on full index, then intersect with binary candidates
        ann_ids, ann_scores = self._ann_index.fine_search(
            query_vec, top_k=cfg.ann_top_k
        )
        # Keep only results that appear in binary candidates (2-stage funnel)
        vector_results: list[tuple[int, float]] = [
            (int(doc_id), float(score))
            for doc_id, score in zip(ann_ids, ann_scores)
            if int(doc_id) in candidate_ids
        ]
        # Fall back to full ANN results if intersection is empty
        if not vector_results:
            vector_results = [
                (int(doc_id), float(score))
                for doc_id, score in zip(ann_ids, ann_scores)
            ]
        return vector_results

    # -- Helper: binary coarse search only --------------------------------

    def _binary_coarse_search(
        self, query_vec: np.ndarray
    ) -> list[tuple[int, float]]:
        """Run binary coarse search only (no ANN fine search)."""
        cfg = self._config
        candidate_ids, distances = self._binary_store.coarse_search(
            query_vec, top_k=cfg.binary_top_k
        )
        return [
            (int(doc_id), float(dist))
            for doc_id, dist in zip(candidate_ids, distances)
        ]

    # -- Helper: FTS search (exact + fuzzy) ------------------------------

    def _fts_search(
        self, query: str
    ) -> tuple[list[tuple[int, float]], list[tuple[int, float]]]:
        """Run exact and fuzzy full-text search."""
        cfg = self._config
        exact_results = self._fts.exact_search(query, top_k=cfg.fts_top_k)
        fuzzy_results = self._fts.fuzzy_search(query, top_k=cfg.fts_top_k)
        return exact_results, fuzzy_results

    # -- Helper: filter deleted IDs ---------------------------------------

    def _filter_deleted(
        self, fused: list[tuple[int, float]]
    ) -> list[tuple[int, float]]:
        """Remove tombstoned chunk IDs from results."""
        if self._metadata_store is not None:
            deleted_ids = self._metadata_store.get_deleted_ids()
            if deleted_ids:
                fused = [
                    (doc_id, score)
                    for doc_id, score in fused
                    if doc_id not in deleted_ids
                ]
        return fused

    # -- Helper: rerank and build results ---------------------------------

    def _rerank_and_build(
        self,
        query: str,
        fused: list[tuple[int, float]],
        final_top_k: int,
        use_reranker: bool = True,
    ) -> list[SearchResult]:
        """Rerank candidates (optionally) and build SearchResult list.

        When use_reranker is True, only the top 50 fused candidates are
        scored by the reranker; otherwise the fused ordering is kept.
        """
        if not fused:
            return []

        if use_reranker:
            rerank_ids = [doc_id for doc_id, _ in fused[:50]]
            contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids]
            rerank_scores = self._reranker.score_pairs(query, contents)
            ranked = sorted(
                zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True
            )
        else:
            ranked = fused

        results: list[SearchResult] = []
        for doc_id, score in ranked[:final_top_k]:
            path, start_line, end_line = self._fts.get_doc_meta(doc_id)
            full_content = self._fts.get_content(doc_id)
            results.append(
                SearchResult(
                    id=doc_id,
                    path=path,
                    score=float(score),
                    snippet=full_content[:200],
                    line=start_line,
                    end_line=end_line,
                    content=full_content,
                )
            )
        return results

    # -- Helper: record access for tier tracking --------------------------

    def _record_access(self, results: list[SearchResult]) -> None:
        """Record file access for data tier tracking."""
        if results and self._metadata_store is not None:
            unique_paths = list({r.path for r in results})
            try:
                self._metadata_store.record_access_batch(unique_paths)
            except Exception:
                # Tier tracking is best-effort; never fail the search for it.
                _log.debug("Failed to record access for tier tracking", exc_info=True)

    # -- Quality-routed search methods ------------------------------------

    def _search_fast(
        self, query: str, final_top_k: int
    ) -> list[SearchResult]:
        """FTS-only search with reranking. No embedding needed."""
        exact_results, fuzzy_results = self._fts_search(query)

        fusion_input: dict[str, list[tuple[int, float]]] = {}
        if exact_results:
            fusion_input["exact"] = exact_results
        if fuzzy_results:
            fusion_input["fuzzy"] = fuzzy_results

        if not fusion_input:
            return []

        fused = reciprocal_rank_fusion(
            fusion_input, weights={"exact": 0.7, "fuzzy": 0.3},
            k=self._config.fusion_k,
        )
        fused = self._filter_deleted(fused)
        return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)

    def _search_balanced(
        self, query: str, final_top_k: int
    ) -> list[SearchResult]:
        """FTS + binary coarse search with RRF fusion and reranking.

        Embeds the query for binary coarse search but skips ANN fine search.
        """
        intent = detect_query_intent(query)
        weights = get_adaptive_weights(intent, self._config.fusion_weights)

        query_vec = self._embedder.embed_single(query)

        # Parallel: binary coarse + FTS
        coarse_results: list[tuple[int, float]] = []
        exact_results: list[tuple[int, float]] = []
        fuzzy_results: list[tuple[int, float]] = []

        with ThreadPoolExecutor(max_workers=2) as pool:
            coarse_future = pool.submit(self._binary_coarse_search, query_vec)
            fts_future = pool.submit(self._fts_search, query)

            try:
                coarse_results = coarse_future.result()
            except Exception:
                _log.warning("Binary coarse search failed", exc_info=True)

            try:
                exact_results, fuzzy_results = fts_future.result()
            except Exception:
                _log.warning("FTS search failed", exc_info=True)

        fusion_input: dict[str, list[tuple[int, float]]] = {}
        if coarse_results:
            fusion_input["vector"] = coarse_results
        if exact_results:
            fusion_input["exact"] = exact_results
        if fuzzy_results:
            fusion_input["fuzzy"] = fuzzy_results

        if not fusion_input:
            return []

        fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=self._config.fusion_k)
        fused = self._filter_deleted(fused)
        return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)

    def _search_thorough(
        self, query: str, final_top_k: int
    ) -> list[SearchResult]:
        """Full 2-stage vector + FTS + reranking pipeline (original behavior)."""
        cfg = self._config

        intent = detect_query_intent(query)
        weights = get_adaptive_weights(intent, cfg.fusion_weights)

        query_vec = self._embedder.embed_single(query)

        # Parallel vector + FTS search
        vector_results: list[tuple[int, float]] = []
        exact_results: list[tuple[int, float]] = []
        fuzzy_results: list[tuple[int, float]] = []

        with ThreadPoolExecutor(max_workers=2) as pool:
            vec_future = pool.submit(self._vector_search, query_vec)
            fts_future = pool.submit(self._fts_search, query)

            try:
                vector_results = vec_future.result()
            except Exception:
                _log.warning("Vector search failed, using empty results", exc_info=True)

            try:
                exact_results, fuzzy_results = fts_future.result()
            except Exception:
                _log.warning("FTS search failed, using empty results", exc_info=True)

        fusion_input: dict[str, list[tuple[int, float]]] = {}
        if vector_results:
            fusion_input["vector"] = vector_results
        if exact_results:
            fusion_input["exact"] = exact_results
        if fuzzy_results:
            fusion_input["fuzzy"] = fuzzy_results

        if not fusion_input:
            return []

        fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k)
        fused = self._filter_deleted(fused)
        return self._rerank_and_build(query, fused, final_top_k, use_reranker=True)

    # -- Main search entry point -----------------------------------------

    def search(
        self,
        query: str,
        top_k: int | None = None,
        quality: str | None = None,
    ) -> list[SearchResult]:
        """Search with quality-based routing.

        Args:
            query: Search query string.
            top_k: Maximum results to return.
            quality: Search quality tier:
                - 'fast': FTS-only + rerank (no embedding, no vector search)
                - 'balanced': FTS + binary coarse + rerank (no ANN fine search)
                - 'thorough': Full 2-stage vector + FTS + reranking
                - 'auto': Selects 'thorough' if vectors exist, else 'fast'
                - None: Uses config.default_search_quality

        Returns:
            List of SearchResult ordered by relevance.
        """
        cfg = self._config
        final_top_k = top_k if top_k is not None else cfg.reranker_top_k

        # Resolve quality tier
        effective_quality = quality or cfg.default_search_quality
        if effective_quality not in _VALID_QUALITIES:
            _log.warning(
                "Invalid search quality '%s', falling back to 'auto'",
                effective_quality,
            )
            effective_quality = "auto"

        # Auto-detect: use thorough if vector index has data, else fast
        if effective_quality == "auto":
            effective_quality = "thorough" if self._has_vector_index() else "fast"

        if effective_quality == "fast":
            results = self._search_fast(query, final_top_k)
        elif effective_quality == "balanced":
            results = self._search_balanced(query, final_top_k)
        else:
            results = self._search_thorough(query, final_top_k)

        self._record_access(results)
        return results
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
"""File watcher and incremental indexer for codexlens-search.
|
|
||||||
|
|
||||||
Requires the ``watcher`` extra::
|
|
||||||
|
|
||||||
pip install codexlens-search[watcher]
|
|
||||||
"""
|
|
||||||
from codexlens_search.watcher.events import ChangeType, FileEvent, WatcherConfig
|
|
||||||
from codexlens_search.watcher.file_watcher import FileWatcher
|
|
||||||
from codexlens_search.watcher.incremental_indexer import IncrementalIndexer
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"ChangeType",
|
|
||||||
"FileEvent",
|
|
||||||
"FileWatcher",
|
|
||||||
"IncrementalIndexer",
|
|
||||||
"WatcherConfig",
|
|
||||||
]
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
"""Event types for file watcher."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import time
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from enum import Enum
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional, Set
|
|
||||||
|
|
||||||
|
|
||||||
class ChangeType(Enum):
    """Kind of file-system change reported by the watcher."""

    CREATED = "created"    # a new file appeared
    MODIFIED = "modified"  # an existing file's content changed
    DELETED = "deleted"    # the file was removed
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class FileEvent:
    """A single file-system change.

    Attributes:
        path: Path of the affected file.
        change_type: What happened to the file.
        timestamp: Unix time at which the event object was created.
    """

    path: Path
    change_type: ChangeType
    timestamp: float = field(default_factory=time.time)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class WatcherConfig:
    """Configuration for file watcher.

    Attributes:
        debounce_ms: Milliseconds of quiet time after the last event
            before the pending batch is flushed. Default 500ms for
            low-latency indexing.
        ignored_patterns: Directory/file name patterns to skip. Any
            path component matching one of these strings is ignored.
    """

    debounce_ms: int = 500
    ignored_patterns: Set[str] = field(
        default_factory=lambda: {
            ".git", ".svn", ".hg",                                   # version control
            ".venv", "venv", "env", "__pycache__", ".pytest_cache",  # Python
            ".mypy_cache", ".ruff_cache",
            "node_modules", "bower_components",                      # Node.js
            "dist", "build", "out", "target", "bin", "obj",          # build artifacts
            "coverage", "htmlcov",
            ".idea", ".vscode", ".vs",                               # IDE / editor
            ".cache", ".parcel-cache", ".turbo", ".next", ".nuxt",   # package / cache
            ".codexlens",
            "logs", "tmp", "temp",                                   # logs / temp
        }
    )
|
|
||||||
@@ -1,285 +0,0 @@
|
|||||||
"""File system watcher using watchdog library.
|
|
||||||
|
|
||||||
Ported from codex-lens v1 with simplifications:
|
|
||||||
- Removed v1-specific Config dependency (uses WatcherConfig directly)
|
|
||||||
- Removed MAX_QUEUE_SIZE (v2 processes immediately via debounce)
|
|
||||||
- Removed flush.signal file mechanism
|
|
||||||
- Added optional JSONL output mode for bridge CLI integration
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import sys
|
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Callable, Dict, List, Optional
|
|
||||||
|
|
||||||
from watchdog.events import FileSystemEventHandler
|
|
||||||
from watchdog.observers import Observer
|
|
||||||
|
|
||||||
from .events import ChangeType, FileEvent, WatcherConfig
|
|
||||||
from .incremental_indexer import IncrementalIndexer
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
# Priority used to deduplicate events for the same path within one
# debounce window: when a path fires multiple times, the change type
# with the larger number wins.
_EVENT_PRIORITY: Dict[ChangeType, int] = {
    ChangeType.CREATED: 1,
    ChangeType.MODIFIED: 2,
    ChangeType.DELETED: 3,
}
|
|
||||||
|
|
||||||
|
|
||||||
class _Handler(FileSystemEventHandler):
    """Adapter from raw watchdog callbacks to FileWatcher events.

    Directory events are dropped; a move is modeled as a DELETED
    event for the source path plus a CREATED event for the target.
    """

    def __init__(self, watcher: FileWatcher) -> None:
        super().__init__()
        self._watcher = watcher

    def _forward(self, raw_path: str, change: ChangeType) -> None:
        # Single funnel into the watcher's intake path.
        self._watcher._on_raw_event(raw_path, change)

    def on_created(self, event) -> None:
        if not event.is_directory:
            self._forward(event.src_path, ChangeType.CREATED)

    def on_modified(self, event) -> None:
        if not event.is_directory:
            self._forward(event.src_path, ChangeType.MODIFIED)

    def on_deleted(self, event) -> None:
        if not event.is_directory:
            self._forward(event.src_path, ChangeType.DELETED)

    def on_moved(self, event) -> None:
        if event.is_directory:
            return
        # Treat move as delete old + create new.
        self._forward(event.src_path, ChangeType.DELETED)
        self._forward(event.dest_path, ChangeType.CREATED)
|
|
||||||
|
|
||||||
|
|
||||||
class FileWatcher:
    """Recursive directory watcher with debounce and per-path dedup.

    Raw watchdog events are filtered against ``config.ignored_patterns``
    and collected into a pending map keyed by resolved path. Each new
    event restarts a timer; once *debounce_ms* pass with no further
    events the pending map is flushed and delivered to *on_changes* as
    one batch (one event per path, highest-priority change type wins).

    Example::

        def handle(events: list[FileEvent]) -> None:
            for e in events:
                print(e.change_type.value, e.path)

        watcher = FileWatcher(Path("."), WatcherConfig(), handle)
        watcher.start()
        watcher.wait()
    """

    def __init__(
        self,
        root_path: Path,
        config: WatcherConfig,
        on_changes: Callable[[List[FileEvent]], None],
    ) -> None:
        self.root_path = Path(root_path).resolve()
        self.config = config
        self.on_changes = on_changes

        self._observer: Optional[Observer] = None
        self._running = False
        self._stop_event = threading.Event()
        self._lock = threading.RLock()

        # Pending events keyed by resolved path.
        self._pending: Dict[Path, FileEvent] = {}
        self._pending_lock = threading.Lock()

        # True-debounce timer: restarted on every new event.
        self._flush_timer: Optional[threading.Timer] = None

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def _should_watch(self, path: Path) -> bool:
        """Return True unless some component of *path* is an ignored pattern."""
        components = path.parts
        return not any(pat in components for pat in self.config.ignored_patterns)

    # ------------------------------------------------------------------
    # Event intake (called from watchdog thread)
    # ------------------------------------------------------------------

    def _on_raw_event(self, raw_path: str, change_type: ChangeType) -> None:
        """Filter a raw watchdog event, record it, and (re)arm the timer."""
        path = Path(raw_path).resolve()
        if not self._should_watch(path):
            return

        event = FileEvent(path=path, change_type=change_type)

        with self._pending_lock:
            current = self._pending.get(path)
            # Keep the highest-priority change seen for this path.
            if current is None or _EVENT_PRIORITY[change_type] >= _EVENT_PRIORITY[current.change_type]:
                self._pending[path] = event

        # Restart the debounce timer on every incoming event.
        if self._flush_timer is not None:
            self._flush_timer.cancel()
        timer = threading.Timer(self.config.debounce_ms / 1000.0, self._flush)
        timer.daemon = True
        self._flush_timer = timer
        timer.start()

    # ------------------------------------------------------------------
    # Flush
    # ------------------------------------------------------------------

    def _flush(self) -> None:
        """Deliver all pending events as one deduplicated batch."""
        with self._pending_lock:
            if not self._pending:
                return
            batch = list(self._pending.values())
            self._pending.clear()
        self._flush_timer = None

        try:
            self.on_changes(batch)
        except Exception:
            logger.exception("Error in on_changes callback")

    def flush_now(self) -> None:
        """Flush pending events immediately, cancelling any armed timer."""
        with self._pending_lock:
            if self._flush_timer is not None:
                self._flush_timer.cancel()
                self._flush_timer = None
        self._flush()

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def start(self) -> None:
        """Start watching ``root_path`` recursively (non-blocking)."""
        with self._lock:
            if self._running:
                logger.warning("Watcher already running")
                return

            if not self.root_path.exists():
                raise ValueError(f"Root path does not exist: {self.root_path}")

            observer = Observer()
            observer.schedule(_Handler(self), str(self.root_path), recursive=True)
            self._observer = observer

            self._running = True
            self._stop_event.clear()
            observer.start()
            logger.info("Started watching: %s", self.root_path)

    def stop(self) -> None:
        """Stop watching and deliver any events still pending."""
        with self._lock:
            if not self._running:
                return

            self._running = False
            self._stop_event.set()

            with self._pending_lock:
                if self._flush_timer is not None:
                    self._flush_timer.cancel()
                    self._flush_timer = None

            if self._observer is not None:
                self._observer.stop()
                self._observer.join(timeout=5.0)
                self._observer = None

            # Deliver any remaining events.
            self._flush()
            logger.info("Stopped watching: %s", self.root_path)

    def wait(self) -> None:
        """Block until stopped (Ctrl+C or stop() from another thread)."""
        try:
            while self._running:
                self._stop_event.wait(timeout=1.0)
        except KeyboardInterrupt:
            logger.info("Received interrupt, stopping watcher...")
            self.stop()

    @property
    def is_running(self) -> bool:
        """True if the watcher is currently running."""
        return self._running

    # ------------------------------------------------------------------
    # JSONL output helper
    # ------------------------------------------------------------------

    @staticmethod
    def events_to_jsonl(events: List[FileEvent]) -> str:
        """Serialize a batch of events as newline-delimited JSON.

        Each line is a JSON object with keys ``path``, ``change_type``,
        ``timestamp``. Useful for bridge CLI integration.
        """
        return "\n".join(
            json.dumps(
                {
                    "path": str(evt.path),
                    "change_type": evt.change_type.value,
                    "timestamp": evt.timestamp,
                },
                ensure_ascii=False,
            )
            for evt in events
        )

    @staticmethod
    def jsonl_callback(events: List[FileEvent]) -> None:
        """*on_changes* callback that writes JSONL batches to stdout.

        Suitable when running in bridge/CLI mode::

            watcher = FileWatcher(root, config, FileWatcher.jsonl_callback)
        """
        payload = FileWatcher.events_to_jsonl(events)
        if payload:
            sys.stdout.write(payload + "\n")
            sys.stdout.flush()

    @classmethod
    def create_with_indexer(
        cls,
        root_path: Path,
        config: WatcherConfig,
        indexer: IncrementalIndexer,
    ) -> "FileWatcher":
        """Create a FileWatcher wired to an IncrementalIndexer's async path.

        Uses ``indexer.process_events_async()`` as the callback so that
        events are debounced and batched within the indexer before
        processing, preventing redundant per-file pipeline startups.

        Example::

            indexer = IncrementalIndexer(pipeline, root=root)
            watcher = FileWatcher.create_with_indexer(root, config, indexer)
            watcher.start()
        """
        return cls(root_path, config, indexer.process_events_async)
|
|
||||||
@@ -1,185 +0,0 @@
|
|||||||
"""Incremental indexer that processes FileEvents via IndexingPipeline.
|
|
||||||
|
|
||||||
Ported from codex-lens v1 with simplifications:
|
|
||||||
- Uses IndexingPipeline.index_file() / remove_file() directly
|
|
||||||
- No v1-specific Config, ParserFactory, DirIndexStore dependencies
|
|
||||||
- Per-file error isolation: one failure does not stop batch processing
|
|
||||||
- Debounce batching: process_events_async() buffers events and flushes
|
|
||||||
after a configurable window to prevent redundant per-file pipeline startups
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
from codexlens_search.indexing.pipeline import IndexingPipeline
|
|
||||||
|
|
||||||
from .events import ChangeType, FileEvent
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class BatchResult:
    """Aggregated outcome of processing one batch of file events."""

    files_indexed: int = 0   # files successfully (re)indexed
    files_removed: int = 0   # files dropped from the index
    chunks_created: int = 0  # chunks produced while indexing
    errors: List[str] = field(default_factory=list)  # per-file error messages

    @property
    def total_processed(self) -> int:
        """Number of files either indexed or removed."""
        return self.files_indexed + self.files_removed

    @property
    def has_errors(self) -> bool:
        """True when at least one file failed."""
        return bool(self.errors)
|
|
||||||
|
|
||||||
|
|
||||||
class IncrementalIndexer:
    """Routes file change events to IndexingPipeline operations.

    CREATED / MODIFIED events call ``pipeline.index_file()``; DELETED
    events call ``pipeline.remove_file()``. Every file is handled in
    isolation so a single failure cannot abort the rest of the batch.

    Example::

        indexer = IncrementalIndexer(pipeline, root=Path("/project"))
        result = indexer.process_events([
            FileEvent(Path("src/main.py"), ChangeType.MODIFIED),
        ])
        print(f"Indexed {result.files_indexed}, removed {result.files_removed}")
    """

    def __init__(
        self,
        pipeline: IndexingPipeline,
        *,
        root: Optional[Path] = None,
        debounce_window_ms: int = 500,
    ) -> None:
        """Initialize the incremental indexer.

        Args:
            pipeline: The indexing pipeline with metadata store configured.
            root: Optional project root for computing relative paths.
                If None, absolute paths are used as identifiers.
            debounce_window_ms: Milliseconds to buffer events before
                flushing in process_events_async(). Default 500ms.
        """
        self._pipeline = pipeline
        self._root = root
        self._debounce_window_ms = debounce_window_ms
        self._event_buffer: List[FileEvent] = []
        self._buffer_lock = threading.Lock()
        self._flush_timer: Optional[threading.Timer] = None

    def process_events(self, events: List[FileEvent]) -> BatchResult:
        """Process a batch of file events with per-file error isolation.

        Args:
            events: List of file events to process.

        Returns:
            BatchResult with per-batch statistics.
        """
        result = BatchResult()

        for event in events:
            try:
                if event.change_type in (ChangeType.CREATED, ChangeType.MODIFIED):
                    self._handle_index(event, result)
                elif event.change_type == ChangeType.DELETED:
                    self._handle_remove(event, result)
            except Exception as exc:
                # One bad file must not stop the rest of the batch.
                message = (
                    f"Error processing {event.path} "
                    f"({event.change_type.value}): "
                    f"{type(exc).__name__}: {exc}"
                )
                logger.error(message)
                result.errors.append(message)

        if result.total_processed > 0:
            logger.info(
                "Batch complete: %d indexed, %d removed, %d errors",
                result.files_indexed,
                result.files_removed,
                len(result.errors),
            )

        return result

    def process_events_async(self, events: List[FileEvent]) -> None:
        """Buffer events and flush after the debounce window expires.

        Non-blocking: events accumulate in an internal buffer and a
        timer restarted on every call flushes the whole buffer through
        process_events() once *debounce_window_ms* pass with no new
        events.

        Args:
            events: List of file events to buffer.
        """
        with self._buffer_lock:
            self._event_buffer.extend(events)

        # Restart the debounce timer on every call (true debounce).
        if self._flush_timer is not None:
            self._flush_timer.cancel()
        timer = threading.Timer(
            self._debounce_window_ms / 1000.0,
            self._flush_buffer,
        )
        timer.daemon = True
        self._flush_timer = timer
        timer.start()

    def _flush_buffer(self) -> None:
        """Drain the buffer, dedup per path (last event wins), and process."""
        with self._buffer_lock:
            if not self._event_buffer:
                return
            pending = list(self._event_buffer)
            self._event_buffer.clear()
        self._flush_timer = None

        # Deduplicate: keep the last event per path.
        latest: dict[Path, FileEvent] = {}
        for event in pending:
            latest[event.path] = event
        deduped = list(latest.values())

        logger.debug(
            "Flushing debounce buffer: %d events (%d after dedup)",
            len(pending), len(deduped),
        )
        self.process_events(deduped)

    def _handle_index(self, event: FileEvent, result: BatchResult) -> None:
        """Index a created or modified file via the pipeline."""
        stats = self._pipeline.index_file(
            event.path,
            root=self._root,
            # Force re-index on modification so stale chunks are replaced.
            force=(event.change_type == ChangeType.MODIFIED),
        )
        if stats.files_processed > 0:
            result.files_indexed += 1
            result.chunks_created += stats.chunks_created

    def _handle_remove(self, event: FileEvent, result: BatchResult) -> None:
        """Remove a deleted file from the index, keyed by its identifier."""
        identifier = (
            str(event.path.relative_to(self._root))
            if self._root
            else str(event.path)
        )
        self._pipeline.remove_file(identifier)
        result.files_removed += 1
|
|
||||||
@@ -1,108 +0,0 @@
|
|||||||
import pytest
|
|
||||||
import numpy as np
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core import ANNIndex, BinaryStore
|
|
||||||
from codexlens_search.embed.base import BaseEmbedder
|
|
||||||
from codexlens_search.rerank.base import BaseReranker
|
|
||||||
from codexlens_search.search.fts import FTSEngine
|
|
||||||
from codexlens_search.search.pipeline import SearchPipeline
|
|
||||||
|
|
||||||
# Test corpus: 20 code snippets, each a (doc_id, path, content) triple.
TEST_DOCS = [
    (0, "auth.py", "def authenticate(user, password): return check_hash(password, user.hash)"),
    (1, "auth.py", "def authorize(user, permission): return permission in user.roles"),
    (2, "models.py", "class User: def __init__(self, name, email): self.name = name; self.email = email"),
    (3, "models.py", "class Session: token = None; expires_at = None"),
    (4, "middleware.py", "def auth_middleware(request): token = request.headers.get('Authorization')"),
    (5, "utils.py", "def hash_password(password): import bcrypt; return bcrypt.hashpw(password)"),
    (6, "config.py", "DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///db.sqlite3')"),
    (7, "search.py", "def search_users(query): return User.objects.filter(name__icontains=query)"),
    (8, "api.py", "def get_user(request, user_id): user = User.objects.get(id=user_id)"),
    (9, "api.py", "def create_user(request): data = request.json(); user = User(**data)"),
    (10, "tests.py", "def test_authenticate(): assert authenticate('admin', 'pass') is not None"),
    (11, "tests.py", "def test_search(): results = search_users('alice'); assert len(results) > 0"),
    (12, "router.py", "app.route('/users', methods=['GET'])(list_users)"),
    (13, "router.py", "app.route('/login', methods=['POST'])(login_handler)"),
    (14, "db.py", "def get_connection(): return sqlite3.connect(DATABASE_URL)"),
    (15, "cache.py", "def cache_get(key): return redis_client.get(key)"),
    (16, "cache.py", "def cache_set(key, value, ttl=3600): redis_client.setex(key, ttl, value)"),
    (17, "errors.py", "class AuthError(Exception): status_code = 401"),
    (18, "errors.py", "class NotFoundError(Exception): status_code = 404"),
    (19, "validators.py", "def validate_email(email): return '@' in email and '.' in email.split('@')[1]"),
]

# Small embedding dimensionality keeps the test suite fast.
DIM = 32


def make_stable_vec(doc_id: int, dim: int = DIM) -> np.ndarray:
    """Generate a deterministic unit-norm float32 vector for *doc_id*."""
    gen = np.random.default_rng(seed=doc_id)
    vec = gen.standard_normal(dim).astype(np.float32)
    # Normalize in place so the dtype stays float32.
    vec /= np.linalg.norm(vec)
    return vec
|
|
||||||
|
|
||||||
|
|
||||||
class MockEmbedder(BaseEmbedder):
    """Embedder stub: stable unit vectors derived from a text hash."""

    def embed_single(self, text: str) -> np.ndarray:
        # hash() of a str is stable within one interpreter run, which
        # is all these tests require.
        gen = np.random.default_rng(seed=hash(text) % (2**31))
        vec = gen.standard_normal(DIM).astype(np.float32)
        vec /= np.linalg.norm(vec)
        return vec

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        return [self.embed_single(t) for t in texts]

    def embed(self, texts: list[str]) -> list[np.ndarray]:
        """Called by SearchPipeline as self._embedder.embed([query])[0]."""
        return self.embed_batch(texts)
|
|
||||||
|
|
||||||
|
|
||||||
class MockReranker(BaseReranker):
    """Reranker stub: fractional keyword overlap between query and doc."""

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        query_words = set(query.lower().split())
        # Guard against an empty query producing a zero division.
        denom = max(len(query_words), 1)
        return [
            float(len(query_words & set(doc.lower().split()))) / denom
            for doc in documents
        ]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def config():
    # hnsw_ef=50, hnsw_M=16, binary_top_k=50, ann_top_k=20, rerank_top_k=10
    return Config.small()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def search_pipeline(tmp_path, config):
    """Build a full SearchPipeline with the 20 test docs indexed."""
    embedder = MockEmbedder()
    binary_store = BinaryStore(tmp_path / "binary", dim=DIM, config=config)
    ann_index = ANNIndex(tmp_path / "ann.hnsw", dim=DIM, config=config)
    fts = FTSEngine(tmp_path / "fts.db")
    reranker = MockReranker()

    # Bulk-index every test document into all three stores.
    doc_ids = np.array([doc[0] for doc in TEST_DOCS], dtype=np.int64)
    doc_vecs = np.array(
        [embedder.embed_single(doc[2]) for doc in TEST_DOCS], dtype=np.float32
    )

    binary_store.add(doc_ids, doc_vecs)
    ann_index.add(doc_ids, doc_vecs)
    fts.add_documents(TEST_DOCS)

    return SearchPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        reranker=reranker,
        fts=fts,
        config=config,
    )
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
"""Integration tests for SearchPipeline using real components and mock embedder/reranker."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
|
|
||||||
def test_vector_search_returns_results(search_pipeline):
    """A semantic query yields at least one result with float scores."""
    hits = search_pipeline.search("authentication middleware")
    assert hits
    assert all(isinstance(hit.score, float) for hit in hits)
|
|
||||||
|
|
||||||
|
|
||||||
def test_exact_keyword_search(search_pipeline):
    """Docs 0 and 10 both contain "authenticate"; at least one must surface."""
    hits = search_pipeline.search("authenticate")
    assert hits
    hit_ids = {hit.id for hit in hits}
    assert hit_ids & {0, 10}, f"Expected doc 0 or 10 in results, got {hit_ids}"
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_top_k_limit(search_pipeline):
    """top_k caps the number of returned results."""
    assert len(search_pipeline.search("user", top_k=5)) <= 5
|
|
||||||
|
|
||||||
|
|
||||||
def test_search_result_fields_populated(search_pipeline):
    """Every result carries a non-negative id/score and a string path."""
    hits = search_pipeline.search("password")
    assert hits
    for hit in hits:
        assert hit.id >= 0
        assert hit.score >= 0
        assert isinstance(hit.path, str)
|
|
||||||
|
|
||||||
|
|
||||||
def test_empty_query_handled(search_pipeline):
    """An empty query must not raise; it simply returns a list."""
    assert isinstance(search_pipeline.search(""), list)
|
|
||||||
|
|
||||||
|
|
||||||
def test_different_queries_give_different_results(search_pipeline):
    """Unrelated queries should rank documents differently."""
    auth_ids = [hit.id for hit in search_pipeline.search("authenticate user")]
    cache_ids = [hit.id for hit in search_pipeline.search("cache redis")]
    # Equality is tolerated only in the degenerate empty case.
    assert auth_ids != cache_ids or not auth_ids
|
|
||||||
@@ -1,165 +0,0 @@
|
|||||||
"""Unit tests for bridge.py CLI — argparse parsing, JSON protocol, error handling."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.bridge import (
|
|
||||||
DEFAULT_EXCLUDES,
|
|
||||||
_build_parser,
|
|
||||||
_json_output,
|
|
||||||
_error_exit,
|
|
||||||
should_exclude,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Parser construction
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestParser:
    """Argparse surface of the bridge CLI."""

    @pytest.fixture(autouse=True)
    def _parser(self):
        self.parser = _build_parser()

    def test_all_subcommands_exist(self):
        """Every subcommand parses with its minimal required arguments."""
        minimal_args = {
            "search": ["search", "--query", "test"],
            "index-file": ["index-file", "--file", "x.py"],
            "remove-file": ["remove-file", "--file", "x.py"],
            "sync": ["sync", "--root", "/tmp"],
            "watch": ["watch", "--root", "/tmp"],
        }
        expected = {
            "init", "search", "index-file", "remove-file",
            "sync", "watch", "download-models", "status",
        }
        for cmd in expected:
            parsed = self.parser.parse_args(minimal_args.get(cmd, [cmd]))
            assert parsed.command == cmd

    def test_global_db_path_default(self):
        assert self.parser.parse_args(["status"]).db_path  # has a default

    def test_global_db_path_override(self):
        parsed = self.parser.parse_args(["--db-path", "/custom/path", "status"])
        assert parsed.db_path == "/custom/path"

    def test_search_args(self):
        parsed = self.parser.parse_args(["search", "-q", "hello", "-k", "5"])
        assert parsed.query == "hello"
        assert parsed.top_k == 5

    def test_search_default_top_k(self):
        assert self.parser.parse_args(["search", "--query", "test"]).top_k == 10

    def test_sync_glob_default(self):
        assert self.parser.parse_args(["sync", "--root", "/tmp"]).glob == "**/*"

    def test_watch_debounce_default(self):
        assert self.parser.parse_args(["watch", "--root", "/tmp"]).debounce_ms == 500

    def test_no_command_returns_none(self):
        assert self.parser.parse_args([]).command is None

    def test_default_excludes_include_codexlens(self):
        assert ".codexlens" in DEFAULT_EXCLUDES

    def test_should_exclude_codexlens_directory(self):
        assert should_exclude(Path(".codexlens") / "metadata.db", DEFAULT_EXCLUDES) is True
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# JSON output helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestJsonHelpers:
|
|
||||||
def test_json_output(self, capsys):
|
|
||||||
_json_output({"key": "value"})
|
|
||||||
out = capsys.readouterr().out.strip()
|
|
||||||
parsed = json.loads(out)
|
|
||||||
assert parsed == {"key": "value"}
|
|
||||||
|
|
||||||
def test_json_output_list(self, capsys):
|
|
||||||
_json_output([1, 2, 3])
|
|
||||||
out = capsys.readouterr().out.strip()
|
|
||||||
assert json.loads(out) == [1, 2, 3]
|
|
||||||
|
|
||||||
def test_json_output_unicode(self, capsys):
|
|
||||||
_json_output({"msg": "中文测试"})
|
|
||||||
out = capsys.readouterr().out.strip()
|
|
||||||
parsed = json.loads(out)
|
|
||||||
assert parsed["msg"] == "中文测试"
|
|
||||||
|
|
||||||
def test_error_exit(self):
|
|
||||||
with pytest.raises(SystemExit) as exc_info:
|
|
||||||
_error_exit("something broke")
|
|
||||||
assert exc_info.value.code == 1
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# cmd_init (lightweight, no model loading)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestCmdInit:
|
|
||||||
def test_init_creates_databases(self, tmp_path):
|
|
||||||
"""Init should create metadata.db and fts.db."""
|
|
||||||
from codexlens_search.bridge import cmd_init
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
db_path = str(tmp_path / "test_idx")
|
|
||||||
args = argparse.Namespace(db_path=db_path, verbose=False)
|
|
||||||
cmd_init(args)
|
|
||||||
|
|
||||||
assert (Path(db_path) / "metadata.db").exists()
|
|
||||||
assert (Path(db_path) / "fts.db").exists()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# cmd_status (lightweight, no model loading)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestCmdStatus:
|
|
||||||
def test_status_not_initialized(self, tmp_path, capsys):
|
|
||||||
from codexlens_search.bridge import cmd_status
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
db_path = str(tmp_path / "empty_idx")
|
|
||||||
Path(db_path).mkdir()
|
|
||||||
args = argparse.Namespace(db_path=db_path, verbose=False)
|
|
||||||
cmd_status(args)
|
|
||||||
|
|
||||||
out = json.loads(capsys.readouterr().out.strip())
|
|
||||||
assert out["status"] == "not_initialized"
|
|
||||||
|
|
||||||
def test_status_after_init(self, tmp_path, capsys):
|
|
||||||
from codexlens_search.bridge import cmd_init, cmd_status
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
db_path = str(tmp_path / "idx")
|
|
||||||
args = argparse.Namespace(db_path=db_path, verbose=False)
|
|
||||||
cmd_init(args)
|
|
||||||
|
|
||||||
# Re-capture after init output
|
|
||||||
capsys.readouterr()
|
|
||||||
|
|
||||||
cmd_status(args)
|
|
||||||
out = json.loads(capsys.readouterr().out.strip())
|
|
||||||
assert out["status"] == "ok"
|
|
||||||
assert out["files_tracked"] == 0
|
|
||||||
assert out["deleted_chunks"] == 0
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
from codexlens_search.config import Config
|
|
||||||
|
|
||||||
|
|
||||||
def test_config_instantiates_no_args():
|
|
||||||
cfg = Config()
|
|
||||||
assert cfg is not None
|
|
||||||
|
|
||||||
|
|
||||||
def test_defaults_hnsw_ef():
|
|
||||||
cfg = Config.defaults()
|
|
||||||
assert cfg.hnsw_ef == 150
|
|
||||||
|
|
||||||
|
|
||||||
def test_defaults_hnsw_M():
|
|
||||||
cfg = Config.defaults()
|
|
||||||
assert cfg.hnsw_M == 32
|
|
||||||
|
|
||||||
|
|
||||||
def test_small_hnsw_ef():
|
|
||||||
cfg = Config.small()
|
|
||||||
assert cfg.hnsw_ef == 50
|
|
||||||
|
|
||||||
|
|
||||||
def test_custom_instantiation():
|
|
||||||
cfg = Config(hnsw_ef=100)
|
|
||||||
assert cfg.hnsw_ef == 100
|
|
||||||
|
|
||||||
|
|
||||||
def test_fusion_weights_keys():
|
|
||||||
cfg = Config()
|
|
||||||
assert set(cfg.fusion_weights.keys()) == {"exact", "fuzzy", "vector", "graph"}
|
|
||||||
@@ -1,136 +0,0 @@
|
|||||||
"""Unit tests for BinaryStore and ANNIndex (no fastembed required)."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import concurrent.futures
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core import ANNIndex, BinaryStore
|
|
||||||
|
|
||||||
|
|
||||||
DIM = 32
|
|
||||||
RNG = np.random.default_rng(42)
|
|
||||||
|
|
||||||
|
|
||||||
def make_vectors(n: int, dim: int = DIM) -> np.ndarray:
|
|
||||||
return RNG.standard_normal((n, dim)).astype(np.float32)
|
|
||||||
|
|
||||||
|
|
||||||
def make_ids(n: int, start: int = 0) -> np.ndarray:
|
|
||||||
return np.arange(start, start + n, dtype=np.int64)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# BinaryStore tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestBinaryStore:
|
|
||||||
def test_binary_store_add_and_search(self, tmp_path: Path) -> None:
|
|
||||||
cfg = Config.small()
|
|
||||||
store = BinaryStore(tmp_path, DIM, cfg)
|
|
||||||
vecs = make_vectors(10)
|
|
||||||
ids = make_ids(10)
|
|
||||||
store.add(ids, vecs)
|
|
||||||
|
|
||||||
assert len(store) == 10
|
|
||||||
|
|
||||||
top_k = 5
|
|
||||||
ret_ids, ret_dists = store.coarse_search(vecs[0], top_k=top_k)
|
|
||||||
assert ret_ids.shape == (top_k,)
|
|
||||||
assert ret_dists.shape == (top_k,)
|
|
||||||
# distances are non-negative integers
|
|
||||||
assert (ret_dists >= 0).all()
|
|
||||||
|
|
||||||
def test_binary_hamming_correctness(self, tmp_path: Path) -> None:
|
|
||||||
cfg = Config.small()
|
|
||||||
store = BinaryStore(tmp_path, DIM, cfg)
|
|
||||||
vecs = make_vectors(20)
|
|
||||||
ids = make_ids(20)
|
|
||||||
store.add(ids, vecs)
|
|
||||||
|
|
||||||
# Query with the exact stored vector; it must be the top-1 result
|
|
||||||
query = vecs[7]
|
|
||||||
ret_ids, ret_dists = store.coarse_search(query, top_k=1)
|
|
||||||
assert ret_ids[0] == 7
|
|
||||||
assert ret_dists[0] == 0 # Hamming distance to itself is 0
|
|
||||||
|
|
||||||
def test_binary_store_persist(self, tmp_path: Path) -> None:
|
|
||||||
cfg = Config.small()
|
|
||||||
store = BinaryStore(tmp_path, DIM, cfg)
|
|
||||||
vecs = make_vectors(15)
|
|
||||||
ids = make_ids(15)
|
|
||||||
store.add(ids, vecs)
|
|
||||||
store.save()
|
|
||||||
|
|
||||||
# Load into a fresh instance
|
|
||||||
store2 = BinaryStore(tmp_path, DIM, cfg)
|
|
||||||
assert len(store2) == 15
|
|
||||||
|
|
||||||
query = vecs[3]
|
|
||||||
ret_ids, ret_dists = store2.coarse_search(query, top_k=1)
|
|
||||||
assert ret_ids[0] == 3
|
|
||||||
assert ret_dists[0] == 0
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# ANNIndex tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestANNIndex:
|
|
||||||
def test_ann_index_add_and_search(self, tmp_path: Path) -> None:
|
|
||||||
cfg = Config.small()
|
|
||||||
idx = ANNIndex(tmp_path, DIM, cfg)
|
|
||||||
vecs = make_vectors(50)
|
|
||||||
ids = make_ids(50)
|
|
||||||
idx.add(ids, vecs)
|
|
||||||
|
|
||||||
assert len(idx) == 50
|
|
||||||
|
|
||||||
ret_ids, ret_dists = idx.fine_search(vecs[0], top_k=5)
|
|
||||||
assert len(ret_ids) == 5
|
|
||||||
assert len(ret_dists) == 5
|
|
||||||
|
|
||||||
def test_ann_index_thread_safety(self, tmp_path: Path) -> None:
|
|
||||||
cfg = Config.small()
|
|
||||||
idx = ANNIndex(tmp_path, DIM, cfg)
|
|
||||||
vecs = make_vectors(50)
|
|
||||||
ids = make_ids(50)
|
|
||||||
idx.add(ids, vecs)
|
|
||||||
|
|
||||||
query = vecs[0]
|
|
||||||
errors: list[Exception] = []
|
|
||||||
|
|
||||||
def search() -> None:
|
|
||||||
try:
|
|
||||||
idx.fine_search(query, top_k=3)
|
|
||||||
except Exception as exc:
|
|
||||||
errors.append(exc)
|
|
||||||
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
|
|
||||||
futures = [pool.submit(search) for _ in range(5)]
|
|
||||||
concurrent.futures.wait(futures)
|
|
||||||
|
|
||||||
assert errors == [], f"Thread safety errors: {errors}"
|
|
||||||
|
|
||||||
def test_ann_index_save_load(self, tmp_path: Path) -> None:
|
|
||||||
cfg = Config.small()
|
|
||||||
idx = ANNIndex(tmp_path, DIM, cfg)
|
|
||||||
vecs = make_vectors(30)
|
|
||||||
ids = make_ids(30)
|
|
||||||
idx.add(ids, vecs)
|
|
||||||
idx.save()
|
|
||||||
|
|
||||||
# Load into a fresh instance
|
|
||||||
idx2 = ANNIndex(tmp_path, DIM, cfg)
|
|
||||||
idx2.load()
|
|
||||||
assert len(idx2) == 30
|
|
||||||
|
|
||||||
ret_ids, ret_dists = idx2.fine_search(vecs[10], top_k=1)
|
|
||||||
assert len(ret_ids) == 1
|
|
||||||
assert ret_ids[0] == 10
|
|
||||||
@@ -1,258 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import types
|
|
||||||
import unittest
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def _make_fastembed_mock():
|
|
||||||
"""Build a minimal fastembed stub so imports succeed without the real package."""
|
|
||||||
fastembed_mod = types.ModuleType("fastembed")
|
|
||||||
fastembed_mod.TextEmbedding = MagicMock()
|
|
||||||
sys.modules.setdefault("fastembed", fastembed_mod)
|
|
||||||
return fastembed_mod
|
|
||||||
|
|
||||||
|
|
||||||
_make_fastembed_mock()
|
|
||||||
|
|
||||||
from codexlens_search.config import Config # noqa: E402
|
|
||||||
from codexlens_search.embed.base import BaseEmbedder # noqa: E402
|
|
||||||
from codexlens_search.embed.local import EMBED_PROFILES, FastEmbedEmbedder # noqa: E402
|
|
||||||
from codexlens_search.embed.api import APIEmbedder # noqa: E402
|
|
||||||
|
|
||||||
|
|
||||||
class TestEmbedSingle(unittest.TestCase):
|
|
||||||
def test_embed_single_returns_float32_ndarray(self):
|
|
||||||
config = Config()
|
|
||||||
embedder = FastEmbedEmbedder(config)
|
|
||||||
|
|
||||||
mock_model = MagicMock()
|
|
||||||
mock_model.embed.return_value = iter([np.ones(384, dtype=np.float64)])
|
|
||||||
|
|
||||||
# Inject mock model directly to bypass lazy load (no real fastembed needed)
|
|
||||||
embedder._model = mock_model
|
|
||||||
result = embedder.embed_single("hello world")
|
|
||||||
|
|
||||||
self.assertIsInstance(result, np.ndarray)
|
|
||||||
self.assertEqual(result.dtype, np.float32)
|
|
||||||
self.assertEqual(result.shape, (384,))
|
|
||||||
|
|
||||||
|
|
||||||
class TestEmbedBatch(unittest.TestCase):
|
|
||||||
def test_embed_batch_returns_list(self):
|
|
||||||
config = Config()
|
|
||||||
embedder = FastEmbedEmbedder(config)
|
|
||||||
|
|
||||||
vecs = [np.ones(384, dtype=np.float64) * i for i in range(3)]
|
|
||||||
mock_model = MagicMock()
|
|
||||||
mock_model.embed.return_value = iter(vecs)
|
|
||||||
|
|
||||||
embedder._model = mock_model
|
|
||||||
result = embedder.embed_batch(["a", "b", "c"])
|
|
||||||
|
|
||||||
self.assertIsInstance(result, list)
|
|
||||||
self.assertEqual(len(result), 3)
|
|
||||||
for arr in result:
|
|
||||||
self.assertIsInstance(arr, np.ndarray)
|
|
||||||
self.assertEqual(arr.dtype, np.float32)
|
|
||||||
|
|
||||||
|
|
||||||
class TestEmbedProfiles(unittest.TestCase):
|
|
||||||
def test_embed_profiles_all_have_valid_keys(self):
|
|
||||||
expected_keys = {"small", "base", "large", "code"}
|
|
||||||
self.assertEqual(set(EMBED_PROFILES.keys()), expected_keys)
|
|
||||||
|
|
||||||
def test_embed_profiles_model_ids_non_empty(self):
|
|
||||||
for key, model_id in EMBED_PROFILES.items():
|
|
||||||
self.assertIsInstance(model_id, str, msg=f"{key} model id should be str")
|
|
||||||
self.assertTrue(len(model_id) > 0, msg=f"{key} model id should be non-empty")
|
|
||||||
|
|
||||||
|
|
||||||
class TestBaseEmbedderAbstract(unittest.TestCase):
|
|
||||||
def test_base_embedder_is_abstract(self):
|
|
||||||
with self.assertRaises(TypeError):
|
|
||||||
BaseEmbedder() # type: ignore[abstract]
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# APIEmbedder
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _make_api_config(**overrides) -> Config:
|
|
||||||
defaults = dict(
|
|
||||||
embed_api_url="https://api.example.com/v1",
|
|
||||||
embed_api_key="test-key",
|
|
||||||
embed_api_model="text-embedding-3-small",
|
|
||||||
embed_dim=384,
|
|
||||||
embed_batch_size=2,
|
|
||||||
embed_api_max_tokens_per_batch=8192,
|
|
||||||
embed_api_concurrency=2,
|
|
||||||
)
|
|
||||||
defaults.update(overrides)
|
|
||||||
return Config(**defaults)
|
|
||||||
|
|
||||||
|
|
||||||
def _mock_200(count=1, dim=384):
|
|
||||||
r = MagicMock()
|
|
||||||
r.status_code = 200
|
|
||||||
r.json.return_value = {
|
|
||||||
"data": [{"index": j, "embedding": [0.1 * (j + 1)] * dim} for j in range(count)]
|
|
||||||
}
|
|
||||||
r.raise_for_status = MagicMock()
|
|
||||||
return r
|
|
||||||
|
|
||||||
|
|
||||||
class TestAPIEmbedderSingle(unittest.TestCase):
|
|
||||||
def test_embed_single_returns_float32(self):
|
|
||||||
config = _make_api_config()
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
|
||||||
mock_client = MagicMock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
mock_client.post.return_value = _mock_200(1, 384)
|
|
||||||
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
result = embedder.embed_single("hello")
|
|
||||||
|
|
||||||
self.assertIsInstance(result, np.ndarray)
|
|
||||||
self.assertEqual(result.dtype, np.float32)
|
|
||||||
self.assertEqual(result.shape, (384,))
|
|
||||||
|
|
||||||
|
|
||||||
class TestAPIEmbedderBatch(unittest.TestCase):
|
|
||||||
def test_embed_batch_splits_by_batch_size(self):
|
|
||||||
config = _make_api_config(embed_batch_size=2)
|
|
||||||
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
|
||||||
mock_client = MagicMock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
mock_client.post.side_effect = [_mock_200(2, 384), _mock_200(1, 384)]
|
|
||||||
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
result = embedder.embed_batch(["a", "b", "c"])
|
|
||||||
|
|
||||||
self.assertEqual(len(result), 3)
|
|
||||||
for arr in result:
|
|
||||||
self.assertIsInstance(arr, np.ndarray)
|
|
||||||
self.assertEqual(arr.dtype, np.float32)
|
|
||||||
|
|
||||||
def test_embed_batch_empty_returns_empty(self):
|
|
||||||
config = _make_api_config()
|
|
||||||
with patch("httpx.Client"):
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
result = embedder.embed_batch([])
|
|
||||||
self.assertEqual(result, [])
|
|
||||||
|
|
||||||
|
|
||||||
class TestAPIEmbedderRetry(unittest.TestCase):
|
|
||||||
def test_retry_on_429(self):
|
|
||||||
config = _make_api_config()
|
|
||||||
mock_429 = MagicMock()
|
|
||||||
mock_429.status_code = 429
|
|
||||||
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
|
||||||
mock_client = MagicMock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
mock_client.post.side_effect = [mock_429, _mock_200(1, 384)]
|
|
||||||
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
ep = embedder._endpoints[0]
|
|
||||||
with patch("time.sleep"):
|
|
||||||
result = embedder._call_api(["test"], ep)
|
|
||||||
|
|
||||||
self.assertEqual(len(result), 1)
|
|
||||||
self.assertEqual(mock_client.post.call_count, 2)
|
|
||||||
|
|
||||||
def test_raises_after_max_retries(self):
|
|
||||||
config = _make_api_config()
|
|
||||||
mock_429 = MagicMock()
|
|
||||||
mock_429.status_code = 429
|
|
||||||
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
|
||||||
mock_client = MagicMock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
mock_client.post.return_value = mock_429
|
|
||||||
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
ep = embedder._endpoints[0]
|
|
||||||
with patch("time.sleep"):
|
|
||||||
with self.assertRaises(RuntimeError):
|
|
||||||
embedder._call_api(["test"], ep, max_retries=2)
|
|
||||||
|
|
||||||
|
|
||||||
class TestAPIEmbedderTokenPacking(unittest.TestCase):
|
|
||||||
def test_packs_small_texts_together(self):
|
|
||||||
config = _make_api_config(
|
|
||||||
embed_batch_size=100,
|
|
||||||
embed_api_max_tokens_per_batch=100, # ~400 chars
|
|
||||||
)
|
|
||||||
with patch("httpx.Client"):
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
|
|
||||||
# 5 texts of 80 chars each (~20 tokens) -> 100 tokens = 1 batch at limit
|
|
||||||
texts = ["x" * 80] * 5
|
|
||||||
batches = embedder._pack_batches(texts)
|
|
||||||
# Should pack as many as fit under 100 tokens
|
|
||||||
self.assertTrue(len(batches) >= 1)
|
|
||||||
total_items = sum(len(b) for b in batches)
|
|
||||||
self.assertEqual(total_items, 5)
|
|
||||||
|
|
||||||
def test_large_text_gets_own_batch(self):
|
|
||||||
config = _make_api_config(
|
|
||||||
embed_batch_size=100,
|
|
||||||
embed_api_max_tokens_per_batch=50, # ~200 chars
|
|
||||||
)
|
|
||||||
with patch("httpx.Client"):
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
|
|
||||||
# Mix of small and large texts
|
|
||||||
texts = ["small" * 10, "x" * 800, "tiny"]
|
|
||||||
batches = embedder._pack_batches(texts)
|
|
||||||
# Large text (200 tokens) exceeds 50 limit, should be separate
|
|
||||||
self.assertTrue(len(batches) >= 2)
|
|
||||||
|
|
||||||
|
|
||||||
class TestAPIEmbedderMultiEndpoint(unittest.TestCase):
|
|
||||||
def test_multi_endpoint_config(self):
|
|
||||||
config = _make_api_config(
|
|
||||||
embed_api_endpoints=[
|
|
||||||
{"url": "https://ep1.example.com/v1", "key": "k1", "model": "m1"},
|
|
||||||
{"url": "https://ep2.example.com/v1", "key": "k2", "model": "m2"},
|
|
||||||
]
|
|
||||||
)
|
|
||||||
with patch("httpx.Client"):
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
self.assertEqual(len(embedder._endpoints), 2)
|
|
||||||
self.assertTrue(embedder._endpoints[0].url.endswith("/embeddings"))
|
|
||||||
self.assertTrue(embedder._endpoints[1].url.endswith("/embeddings"))
|
|
||||||
|
|
||||||
def test_single_endpoint_fallback(self):
|
|
||||||
config = _make_api_config() # no embed_api_endpoints
|
|
||||||
with patch("httpx.Client"):
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
self.assertEqual(len(embedder._endpoints), 1)
|
|
||||||
|
|
||||||
|
|
||||||
class TestAPIEmbedderUrlNormalization(unittest.TestCase):
|
|
||||||
def test_appends_embeddings_path(self):
|
|
||||||
config = _make_api_config(embed_api_url="https://api.example.com/v1")
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
|
||||||
mock_client = MagicMock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
mock_client.post.return_value = _mock_200(1, 384)
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
ep = embedder._endpoints[0]
|
|
||||||
self.assertTrue(ep.url.endswith("/embeddings"))
|
|
||||||
|
|
||||||
def test_does_not_double_append(self):
|
|
||||||
config = _make_api_config(embed_api_url="https://api.example.com/v1/embeddings")
|
|
||||||
with patch("httpx.Client"):
|
|
||||||
embedder = APIEmbedder(config)
|
|
||||||
ep = embedder._endpoints[0]
|
|
||||||
self.assertFalse(ep.url.endswith("/embeddings/embeddings"))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
"""Unit tests for FTSEngine delete_by_path and get_chunk_ids_by_path."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.search.fts import FTSEngine
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def fts(tmp_path):
|
|
||||||
return FTSEngine(str(tmp_path / "fts.db"))
|
|
||||||
|
|
||||||
|
|
||||||
class TestGetChunkIdsByPath:
|
|
||||||
def test_empty(self, fts):
|
|
||||||
assert fts.get_chunk_ids_by_path("a.py") == []
|
|
||||||
|
|
||||||
def test_returns_matching_ids(self, fts):
|
|
||||||
fts.add_documents([
|
|
||||||
(0, "a.py", "hello world"),
|
|
||||||
(1, "a.py", "foo bar"),
|
|
||||||
(2, "b.py", "other content"),
|
|
||||||
])
|
|
||||||
ids = fts.get_chunk_ids_by_path("a.py")
|
|
||||||
assert sorted(ids) == [0, 1]
|
|
||||||
|
|
||||||
def test_no_match(self, fts):
|
|
||||||
fts.add_documents([(0, "a.py", "content")])
|
|
||||||
assert fts.get_chunk_ids_by_path("b.py") == []
|
|
||||||
|
|
||||||
|
|
||||||
class TestDeleteByPath:
|
|
||||||
def test_deletes_docs_and_meta(self, fts):
|
|
||||||
fts.add_documents([
|
|
||||||
(0, "target.py", "to be deleted"),
|
|
||||||
(1, "target.py", "also deleted"),
|
|
||||||
(2, "keep.py", "keep this"),
|
|
||||||
])
|
|
||||||
count = fts.delete_by_path("target.py")
|
|
||||||
assert count == 2
|
|
||||||
|
|
||||||
# target.py gone from both tables
|
|
||||||
assert fts.get_chunk_ids_by_path("target.py") == []
|
|
||||||
assert fts.get_content(0) == ""
|
|
||||||
assert fts.get_content(1) == ""
|
|
||||||
|
|
||||||
# keep.py still there
|
|
||||||
assert fts.get_chunk_ids_by_path("keep.py") == [2]
|
|
||||||
assert fts.get_content(2) == "keep this"
|
|
||||||
|
|
||||||
def test_delete_nonexistent_path(self, fts):
|
|
||||||
count = fts.delete_by_path("nonexistent.py")
|
|
||||||
assert count == 0
|
|
||||||
|
|
||||||
def test_delete_then_search(self, fts):
|
|
||||||
fts.add_documents([
|
|
||||||
(0, "a.py", "unique searchable content"),
|
|
||||||
(1, "b.py", "different content here"),
|
|
||||||
])
|
|
||||||
fts.delete_by_path("a.py")
|
|
||||||
results = fts.exact_search("unique searchable")
|
|
||||||
assert len(results) == 0
|
|
||||||
|
|
||||||
results = fts.exact_search("different")
|
|
||||||
assert len(results) == 1
|
|
||||||
assert results[0][0] == 1
|
|
||||||
@@ -1,388 +0,0 @@
|
|||||||
"""Unit tests for IndexingPipeline incremental API (index_file, remove_file, sync, compact)."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.core.binary import BinaryStore
|
|
||||||
from codexlens_search.core.index import ANNIndex
|
|
||||||
from codexlens_search.embed.base import BaseEmbedder
|
|
||||||
from codexlens_search.indexing.metadata import MetadataStore
|
|
||||||
from codexlens_search.indexing.pipeline import IndexingPipeline, IndexStats
|
|
||||||
from codexlens_search.search.fts import FTSEngine
|
|
||||||
|
|
||||||
|
|
||||||
DIM = 32
|
|
||||||
|
|
||||||
|
|
||||||
class FakeEmbedder(BaseEmbedder):
|
|
||||||
"""Deterministic embedder for testing."""
|
|
||||||
|
|
||||||
def __init__(self) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def embed_single(self, text: str) -> np.ndarray:
|
|
||||||
rng = np.random.default_rng(hash(text) % (2**31))
|
|
||||||
return rng.standard_normal(DIM).astype(np.float32)
|
|
||||||
|
|
||||||
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
|
|
||||||
return [self.embed_single(t) for t in texts]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def workspace(tmp_path: Path):
|
|
||||||
"""Create workspace with stores, metadata, and pipeline."""
|
|
||||||
cfg = Config.small()
|
|
||||||
# Override embed_dim to match our test dim
|
|
||||||
cfg.embed_dim = DIM
|
|
||||||
|
|
||||||
store_dir = tmp_path / "stores"
|
|
||||||
store_dir.mkdir()
|
|
||||||
|
|
||||||
binary_store = BinaryStore(store_dir, DIM, cfg)
|
|
||||||
ann_index = ANNIndex(store_dir, DIM, cfg)
|
|
||||||
fts = FTSEngine(str(store_dir / "fts.db"))
|
|
||||||
metadata = MetadataStore(str(store_dir / "metadata.db"))
|
|
||||||
embedder = FakeEmbedder()
|
|
||||||
|
|
||||||
pipeline = IndexingPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=binary_store,
|
|
||||||
ann_index=ann_index,
|
|
||||||
fts=fts,
|
|
||||||
config=cfg,
|
|
||||||
metadata=metadata,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create sample source files
|
|
||||||
src_dir = tmp_path / "src"
|
|
||||||
src_dir.mkdir()
|
|
||||||
|
|
||||||
return {
|
|
||||||
"pipeline": pipeline,
|
|
||||||
"metadata": metadata,
|
|
||||||
"binary_store": binary_store,
|
|
||||||
"ann_index": ann_index,
|
|
||||||
"fts": fts,
|
|
||||||
"src_dir": src_dir,
|
|
||||||
"store_dir": store_dir,
|
|
||||||
"config": cfg,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _write_file(src_dir: Path, name: str, content: str) -> Path:
|
|
||||||
"""Write a file and return its path."""
|
|
||||||
p = src_dir / name
|
|
||||||
p.write_text(content, encoding="utf-8")
|
|
||||||
return p
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# MetadataStore helper method tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestMetadataHelpers:
|
|
||||||
def test_get_all_files_empty(self, workspace):
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
assert meta.get_all_files() == {}
|
|
||||||
|
|
||||||
def test_get_all_files_after_register(self, workspace):
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
meta.register_file("a.py", "hash_a", 1000.0)
|
|
||||||
meta.register_file("b.py", "hash_b", 2000.0)
|
|
||||||
result = meta.get_all_files()
|
|
||||||
assert result == {"a.py": "hash_a", "b.py": "hash_b"}
|
|
||||||
|
|
||||||
def test_max_chunk_id_empty(self, workspace):
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
assert meta.max_chunk_id() == -1
|
|
||||||
|
|
||||||
def test_max_chunk_id_with_chunks(self, workspace):
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
meta.register_file("a.py", "hash_a", 1000.0)
|
|
||||||
meta.register_chunks("a.py", [(0, "h0"), (1, "h1"), (5, "h5")])
|
|
||||||
assert meta.max_chunk_id() == 5
|
|
||||||
|
|
||||||
def test_max_chunk_id_includes_deleted(self, workspace):
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
meta.register_file("a.py", "hash_a", 1000.0)
|
|
||||||
meta.register_chunks("a.py", [(0, "h0"), (3, "h3")])
|
|
||||||
meta.mark_file_deleted("a.py")
|
|
||||||
# Chunks moved to deleted_chunks, max should still be 3
|
|
||||||
assert meta.max_chunk_id() == 3
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# index_file tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestIndexFile:
|
|
||||||
def test_index_file_basic(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "hello.py", "print('hello world')\n")
|
|
||||||
stats = pipeline.index_file(f, root=src_dir)
|
|
||||||
|
|
||||||
assert stats.files_processed == 1
|
|
||||||
assert stats.chunks_created >= 1
|
|
||||||
assert meta.get_file_hash("hello.py") is not None
|
|
||||||
assert len(meta.get_chunk_ids_for_file("hello.py")) >= 1
|
|
||||||
|
|
||||||
def test_index_file_skips_unchanged(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "same.py", "x = 1\n")
|
|
||||||
stats1 = pipeline.index_file(f, root=src_dir)
|
|
||||||
assert stats1.files_processed == 1
|
|
||||||
|
|
||||||
stats2 = pipeline.index_file(f, root=src_dir)
|
|
||||||
assert stats2.files_processed == 0
|
|
||||||
assert stats2.chunks_created == 0
|
|
||||||
|
|
||||||
def test_index_file_force_reindex(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "force.py", "x = 1\n")
|
|
||||||
pipeline.index_file(f, root=src_dir)
|
|
||||||
|
|
||||||
stats = pipeline.index_file(f, root=src_dir, force=True)
|
|
||||||
assert stats.files_processed == 1
|
|
||||||
assert stats.chunks_created >= 1
|
|
||||||
|
|
||||||
def test_index_file_updates_changed_file(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "changing.py", "version = 1\n")
|
|
||||||
pipeline.index_file(f, root=src_dir)
|
|
||||||
old_chunks = meta.get_chunk_ids_for_file("changing.py")
|
|
||||||
|
|
||||||
# Modify file
|
|
||||||
f.write_text("version = 2\nmore code\n", encoding="utf-8")
|
|
||||||
stats = pipeline.index_file(f, root=src_dir)
|
|
||||||
assert stats.files_processed == 1
|
|
||||||
|
|
||||||
new_chunks = meta.get_chunk_ids_for_file("changing.py")
|
|
||||||
# Old chunks should have been tombstoned, new ones assigned
|
|
||||||
assert set(old_chunks) != set(new_chunks)
|
|
||||||
|
|
||||||
def test_index_file_registers_in_metadata(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
fts = workspace["fts"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "meta_test.py", "def foo(): pass\n")
|
|
||||||
pipeline.index_file(f, root=src_dir)
|
|
||||||
|
|
||||||
# MetadataStore has file registered
|
|
||||||
assert meta.get_file_hash("meta_test.py") is not None
|
|
||||||
chunk_ids = meta.get_chunk_ids_for_file("meta_test.py")
|
|
||||||
assert len(chunk_ids) >= 1
|
|
||||||
|
|
||||||
# FTS has the content
|
|
||||||
fts_ids = fts.get_chunk_ids_by_path("meta_test.py")
|
|
||||||
assert len(fts_ids) >= 1
|
|
||||||
|
|
||||||
def test_index_file_no_metadata_raises(self, workspace):
|
|
||||||
cfg = workspace["config"]
|
|
||||||
pipeline_no_meta = IndexingPipeline(
|
|
||||||
embedder=FakeEmbedder(),
|
|
||||||
binary_store=workspace["binary_store"],
|
|
||||||
ann_index=workspace["ann_index"],
|
|
||||||
fts=workspace["fts"],
|
|
||||||
config=cfg,
|
|
||||||
)
|
|
||||||
f = _write_file(workspace["src_dir"], "no_meta.py", "x = 1\n")
|
|
||||||
with pytest.raises(RuntimeError, match="MetadataStore is required"):
|
|
||||||
pipeline_no_meta.index_file(f)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# remove_file tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestRemoveFile:
|
|
||||||
def test_remove_file_tombstones_and_fts(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
fts = workspace["fts"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "to_remove.py", "data = [1, 2, 3]\n")
|
|
||||||
pipeline.index_file(f, root=src_dir)
|
|
||||||
|
|
||||||
chunk_ids = meta.get_chunk_ids_for_file("to_remove.py")
|
|
||||||
assert len(chunk_ids) >= 1
|
|
||||||
|
|
||||||
pipeline.remove_file("to_remove.py")
|
|
||||||
|
|
||||||
# File should be gone from metadata
|
|
||||||
assert meta.get_file_hash("to_remove.py") is None
|
|
||||||
assert meta.get_chunk_ids_for_file("to_remove.py") == []
|
|
||||||
|
|
||||||
# Chunks should be in deleted_chunks
|
|
||||||
deleted = meta.get_deleted_ids()
|
|
||||||
for cid in chunk_ids:
|
|
||||||
assert cid in deleted
|
|
||||||
|
|
||||||
# FTS should be cleared
|
|
||||||
assert fts.get_chunk_ids_by_path("to_remove.py") == []
|
|
||||||
|
|
||||||
def test_remove_nonexistent_file(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
# Should not raise
|
|
||||||
pipeline.remove_file("nonexistent.py")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# sync tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestSync:
|
|
||||||
def test_sync_indexes_new_files(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f1 = _write_file(src_dir, "a.py", "a = 1\n")
|
|
||||||
f2 = _write_file(src_dir, "b.py", "b = 2\n")
|
|
||||||
|
|
||||||
stats = pipeline.sync([f1, f2], root=src_dir)
|
|
||||||
assert stats.files_processed == 2
|
|
||||||
assert meta.get_file_hash("a.py") is not None
|
|
||||||
assert meta.get_file_hash("b.py") is not None
|
|
||||||
|
|
||||||
def test_sync_removes_missing_files(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f1 = _write_file(src_dir, "keep.py", "keep = True\n")
|
|
||||||
f2 = _write_file(src_dir, "remove.py", "remove = True\n")
|
|
||||||
|
|
||||||
pipeline.sync([f1, f2], root=src_dir)
|
|
||||||
assert meta.get_file_hash("remove.py") is not None
|
|
||||||
|
|
||||||
# Sync with only f1 -- f2 should be removed
|
|
||||||
stats = pipeline.sync([f1], root=src_dir)
|
|
||||||
assert meta.get_file_hash("remove.py") is None
|
|
||||||
deleted = meta.get_deleted_ids()
|
|
||||||
assert len(deleted) > 0
|
|
||||||
|
|
||||||
def test_sync_detects_changed_files(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "mutable.py", "v1\n")
|
|
||||||
pipeline.sync([f], root=src_dir)
|
|
||||||
old_hash = meta.get_file_hash("mutable.py")
|
|
||||||
|
|
||||||
f.write_text("v2\n", encoding="utf-8")
|
|
||||||
stats = pipeline.sync([f], root=src_dir)
|
|
||||||
assert stats.files_processed == 1
|
|
||||||
new_hash = meta.get_file_hash("mutable.py")
|
|
||||||
assert old_hash != new_hash
|
|
||||||
|
|
||||||
def test_sync_skips_unchanged(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "stable.py", "stable = True\n")
|
|
||||||
pipeline.sync([f], root=src_dir)
|
|
||||||
|
|
||||||
# Second sync with same file, unchanged
|
|
||||||
stats = pipeline.sync([f], root=src_dir)
|
|
||||||
assert stats.files_processed == 0
|
|
||||||
assert stats.chunks_created == 0
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# compact tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestCompact:
|
|
||||||
def test_compact_removes_tombstoned_from_binary_store(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
binary_store = workspace["binary_store"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f1 = _write_file(src_dir, "alive.py", "alive = True\n")
|
|
||||||
f2 = _write_file(src_dir, "dead.py", "dead = True\n")
|
|
||||||
|
|
||||||
pipeline.index_file(f1, root=src_dir)
|
|
||||||
pipeline.index_file(f2, root=src_dir)
|
|
||||||
|
|
||||||
count_before = binary_store._count
|
|
||||||
assert count_before >= 2
|
|
||||||
|
|
||||||
pipeline.remove_file("dead.py")
|
|
||||||
pipeline.compact()
|
|
||||||
|
|
||||||
# BinaryStore should have fewer entries
|
|
||||||
assert binary_store._count < count_before
|
|
||||||
# deleted_chunks should be cleared
|
|
||||||
assert meta.get_deleted_ids() == set()
|
|
||||||
|
|
||||||
def test_compact_noop_when_no_deletions(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
meta = workspace["metadata"]
|
|
||||||
binary_store = workspace["binary_store"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f = _write_file(src_dir, "solo.py", "solo = True\n")
|
|
||||||
pipeline.index_file(f, root=src_dir)
|
|
||||||
count_before = binary_store._count
|
|
||||||
|
|
||||||
pipeline.compact()
|
|
||||||
assert binary_store._count == count_before
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Backward compatibility: existing batch API still works
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
class TestBatchAPIUnchanged:
|
|
||||||
def test_index_files_still_works(self, workspace):
|
|
||||||
pipeline = workspace["pipeline"]
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
|
|
||||||
f1 = _write_file(src_dir, "batch1.py", "batch1 = 1\n")
|
|
||||||
f2 = _write_file(src_dir, "batch2.py", "batch2 = 2\n")
|
|
||||||
|
|
||||||
stats = pipeline.index_files([f1, f2], root=src_dir)
|
|
||||||
assert stats.files_processed == 2
|
|
||||||
assert stats.chunks_created >= 2
|
|
||||||
|
|
||||||
def test_index_files_works_without_metadata(self, workspace):
|
|
||||||
"""Batch API should work even without MetadataStore."""
|
|
||||||
cfg = workspace["config"]
|
|
||||||
pipeline_no_meta = IndexingPipeline(
|
|
||||||
embedder=FakeEmbedder(),
|
|
||||||
binary_store=BinaryStore(workspace["store_dir"] / "no_meta", DIM, cfg),
|
|
||||||
ann_index=ANNIndex(workspace["store_dir"] / "no_meta", DIM, cfg),
|
|
||||||
fts=FTSEngine(str(workspace["store_dir"] / "no_meta_fts.db")),
|
|
||||||
config=cfg,
|
|
||||||
)
|
|
||||||
src_dir = workspace["src_dir"]
|
|
||||||
f = _write_file(src_dir, "no_meta_batch.py", "x = 1\n")
|
|
||||||
stats = pipeline_no_meta.index_files([f], root=src_dir)
|
|
||||||
assert stats.files_processed == 1
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
"""Unit tests for MetadataStore — SQLite file-to-chunk mapping + tombstone tracking."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.indexing.metadata import MetadataStore
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def store(tmp_path):
|
|
||||||
"""Create a fresh MetadataStore backed by a temp db."""
|
|
||||||
return MetadataStore(str(tmp_path / "meta.db"))
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Table creation
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestTableCreation:
|
|
||||||
def test_creates_three_tables(self, store):
|
|
||||||
"""MetadataStore should create files, chunks, deleted_chunks tables."""
|
|
||||||
tables = store._conn.execute(
|
|
||||||
"SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
|
|
||||||
).fetchall()
|
|
||||||
names = {r[0] for r in tables}
|
|
||||||
assert "files" in names
|
|
||||||
assert "chunks" in names
|
|
||||||
assert "deleted_chunks" in names
|
|
||||||
|
|
||||||
def test_foreign_keys_enabled(self, store):
|
|
||||||
"""PRAGMA foreign_keys must be ON."""
|
|
||||||
row = store._conn.execute("PRAGMA foreign_keys").fetchone()
|
|
||||||
assert row[0] == 1
|
|
||||||
|
|
||||||
def test_wal_mode(self, store):
|
|
||||||
"""journal_mode should be WAL for concurrency."""
|
|
||||||
row = store._conn.execute("PRAGMA journal_mode").fetchone()
|
|
||||||
assert row[0].lower() == "wal"
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# register_file
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestRegisterFile:
|
|
||||||
def test_register_and_retrieve(self, store):
|
|
||||||
store.register_file("src/main.py", "abc123", 1000.0)
|
|
||||||
assert store.get_file_hash("src/main.py") == "abc123"
|
|
||||||
|
|
||||||
def test_register_updates_existing(self, store):
|
|
||||||
store.register_file("a.py", "hash1", 1000.0)
|
|
||||||
store.register_file("a.py", "hash2", 2000.0)
|
|
||||||
assert store.get_file_hash("a.py") == "hash2"
|
|
||||||
|
|
||||||
def test_get_file_hash_returns_none_for_unknown(self, store):
|
|
||||||
assert store.get_file_hash("nonexistent.py") is None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# register_chunks
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestRegisterChunks:
|
|
||||||
def test_register_and_retrieve_chunks(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(0, "c0"), (1, "c1"), (2, "c2")])
|
|
||||||
ids = store.get_chunk_ids_for_file("a.py")
|
|
||||||
assert sorted(ids) == [0, 1, 2]
|
|
||||||
|
|
||||||
def test_empty_chunks_list(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [])
|
|
||||||
assert store.get_chunk_ids_for_file("a.py") == []
|
|
||||||
|
|
||||||
def test_chunks_for_unknown_file(self, store):
|
|
||||||
assert store.get_chunk_ids_for_file("unknown.py") == []
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# mark_file_deleted
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestMarkFileDeleted:
|
|
||||||
def test_tombstones_chunks(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(10, "c10"), (11, "c11")])
|
|
||||||
count = store.mark_file_deleted("a.py")
|
|
||||||
assert count == 2
|
|
||||||
assert store.get_deleted_ids() == {10, 11}
|
|
||||||
|
|
||||||
def test_file_removed_after_delete(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(0, "c0")])
|
|
||||||
store.mark_file_deleted("a.py")
|
|
||||||
assert store.get_file_hash("a.py") is None
|
|
||||||
|
|
||||||
def test_chunks_cascaded_after_delete(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(0, "c0")])
|
|
||||||
store.mark_file_deleted("a.py")
|
|
||||||
assert store.get_chunk_ids_for_file("a.py") == []
|
|
||||||
|
|
||||||
def test_delete_nonexistent_file(self, store):
|
|
||||||
count = store.mark_file_deleted("nonexistent.py")
|
|
||||||
assert count == 0
|
|
||||||
|
|
||||||
def test_delete_file_without_chunks(self, store):
|
|
||||||
store.register_file("empty.py", "h", 1.0)
|
|
||||||
count = store.mark_file_deleted("empty.py")
|
|
||||||
assert count == 0
|
|
||||||
assert store.get_file_hash("empty.py") is None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# file_needs_update
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestFileNeedsUpdate:
|
|
||||||
def test_new_file_needs_update(self, store):
|
|
||||||
assert store.file_needs_update("new.py", "any_hash") is True
|
|
||||||
|
|
||||||
def test_unchanged_file(self, store):
|
|
||||||
store.register_file("a.py", "same_hash", 1.0)
|
|
||||||
assert store.file_needs_update("a.py", "same_hash") is False
|
|
||||||
|
|
||||||
def test_changed_file(self, store):
|
|
||||||
store.register_file("a.py", "old_hash", 1.0)
|
|
||||||
assert store.file_needs_update("a.py", "new_hash") is True
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# get_deleted_ids / compact_deleted
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestDeletedIdsAndCompact:
|
|
||||||
def test_empty_deleted_ids(self, store):
|
|
||||||
assert store.get_deleted_ids() == set()
|
|
||||||
|
|
||||||
def test_compact_returns_and_clears(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(5, "c5"), (6, "c6")])
|
|
||||||
store.mark_file_deleted("a.py")
|
|
||||||
|
|
||||||
deleted = store.compact_deleted()
|
|
||||||
assert deleted == {5, 6}
|
|
||||||
assert store.get_deleted_ids() == set()
|
|
||||||
|
|
||||||
def test_compact_noop_when_empty(self, store):
|
|
||||||
deleted = store.compact_deleted()
|
|
||||||
assert deleted == set()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# get_all_files / max_chunk_id
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestHelpers:
|
|
||||||
def test_get_all_files(self, store):
|
|
||||||
store.register_file("a.py", "h1", 1.0)
|
|
||||||
store.register_file("b.py", "h2", 2.0)
|
|
||||||
assert store.get_all_files() == {"a.py": "h1", "b.py": "h2"}
|
|
||||||
|
|
||||||
def test_max_chunk_id_empty(self, store):
|
|
||||||
assert store.max_chunk_id() == -1
|
|
||||||
|
|
||||||
def test_max_chunk_id_active(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(0, "c"), (5, "c"), (3, "c")])
|
|
||||||
assert store.max_chunk_id() == 5
|
|
||||||
|
|
||||||
def test_max_chunk_id_includes_deleted(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(10, "c")])
|
|
||||||
store.mark_file_deleted("a.py")
|
|
||||||
assert store.max_chunk_id() == 10
|
|
||||||
|
|
||||||
def test_max_chunk_id_mixed(self, store):
|
|
||||||
store.register_file("a.py", "h", 1.0)
|
|
||||||
store.register_chunks("a.py", [(3, "c")])
|
|
||||||
store.register_file("b.py", "h2", 1.0)
|
|
||||||
store.register_chunks("b.py", [(7, "c")])
|
|
||||||
store.mark_file_deleted("a.py")
|
|
||||||
# deleted has 3, active has 7
|
|
||||||
assert store.max_chunk_id() == 7
|
|
||||||
@@ -1,179 +0,0 @@
|
|||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import types
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
from codexlens_search.rerank.base import BaseReranker
|
|
||||||
from codexlens_search.rerank.local import FastEmbedReranker
|
|
||||||
from codexlens_search.rerank.api import APIReranker
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# BaseReranker
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def test_base_reranker_is_abstract():
|
|
||||||
with pytest.raises(TypeError):
|
|
||||||
BaseReranker() # type: ignore[abstract]
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# FastEmbedReranker
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _make_rerank_result(index: int, score: float) -> object:
|
|
||||||
obj = types.SimpleNamespace(index=index, score=score)
|
|
||||||
return obj
|
|
||||||
|
|
||||||
|
|
||||||
def test_local_reranker_score_pairs_length():
|
|
||||||
config = Config()
|
|
||||||
reranker = FastEmbedReranker(config)
|
|
||||||
|
|
||||||
mock_results = [
|
|
||||||
_make_rerank_result(0, 0.9),
|
|
||||||
_make_rerank_result(1, 0.5),
|
|
||||||
_make_rerank_result(2, 0.1),
|
|
||||||
]
|
|
||||||
|
|
||||||
mock_model = MagicMock()
|
|
||||||
mock_model.rerank.return_value = iter(mock_results)
|
|
||||||
reranker._model = mock_model
|
|
||||||
|
|
||||||
docs = ["doc0", "doc1", "doc2"]
|
|
||||||
scores = reranker.score_pairs("query", docs)
|
|
||||||
|
|
||||||
assert len(scores) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_local_reranker_preserves_order():
|
|
||||||
config = Config()
|
|
||||||
reranker = FastEmbedReranker(config)
|
|
||||||
|
|
||||||
# rerank returns results in reverse order (index 2, 1, 0)
|
|
||||||
mock_results = [
|
|
||||||
_make_rerank_result(2, 0.1),
|
|
||||||
_make_rerank_result(1, 0.5),
|
|
||||||
_make_rerank_result(0, 0.9),
|
|
||||||
]
|
|
||||||
|
|
||||||
mock_model = MagicMock()
|
|
||||||
mock_model.rerank.return_value = iter(mock_results)
|
|
||||||
reranker._model = mock_model
|
|
||||||
|
|
||||||
docs = ["doc0", "doc1", "doc2"]
|
|
||||||
scores = reranker.score_pairs("query", docs)
|
|
||||||
|
|
||||||
assert scores[0] == pytest.approx(0.9)
|
|
||||||
assert scores[1] == pytest.approx(0.5)
|
|
||||||
assert scores[2] == pytest.approx(0.1)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# APIReranker
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _make_config(max_tokens_per_batch: int = 512) -> Config:
|
|
||||||
return Config(
|
|
||||||
reranker_api_url="https://api.example.com",
|
|
||||||
reranker_api_key="test-key",
|
|
||||||
reranker_api_model="test-model",
|
|
||||||
reranker_api_max_tokens_per_batch=max_tokens_per_batch,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_api_reranker_batch_splitting():
|
|
||||||
config = _make_config(max_tokens_per_batch=512)
|
|
||||||
|
|
||||||
with patch("httpx.Client"):
|
|
||||||
reranker = APIReranker(config)
|
|
||||||
|
|
||||||
# 10 docs, each ~200 tokens (800 chars)
|
|
||||||
docs = ["x" * 800] * 10
|
|
||||||
batches = reranker._split_batches(docs, max_tokens=512)
|
|
||||||
|
|
||||||
# Each doc is 200 tokens; batches should have at most 2 docs (200+200=400 <= 512, 400+200=600 > 512)
|
|
||||||
assert len(batches) > 1
|
|
||||||
for batch in batches:
|
|
||||||
total = sum(len(text) // 4 for _, text in batch)
|
|
||||||
assert total <= 512 or len(batch) == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_api_reranker_retry_on_429():
|
|
||||||
config = _make_config()
|
|
||||||
|
|
||||||
mock_429 = MagicMock()
|
|
||||||
mock_429.status_code = 429
|
|
||||||
|
|
||||||
mock_200 = MagicMock()
|
|
||||||
mock_200.status_code = 200
|
|
||||||
mock_200.json.return_value = {
|
|
||||||
"results": [
|
|
||||||
{"index": 0, "relevance_score": 0.8},
|
|
||||||
{"index": 1, "relevance_score": 0.3},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
mock_200.raise_for_status = MagicMock()
|
|
||||||
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
|
||||||
mock_client = MagicMock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
mock_client.post.side_effect = [mock_429, mock_429, mock_200]
|
|
||||||
|
|
||||||
reranker = APIReranker(config)
|
|
||||||
|
|
||||||
with patch("time.sleep"):
|
|
||||||
result = reranker._call_api_with_retry(
|
|
||||||
"query",
|
|
||||||
[(0, "doc0"), (1, "doc1")],
|
|
||||||
max_retries=3,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert mock_client.post.call_count == 3
|
|
||||||
assert 0 in result
|
|
||||||
assert 1 in result
|
|
||||||
|
|
||||||
|
|
||||||
def test_api_reranker_merge_batches():
|
|
||||||
config = _make_config(max_tokens_per_batch=100)
|
|
||||||
|
|
||||||
# 4 docs of 25 tokens each (100 chars); each batch holds at most 4 docs
|
|
||||||
# Use smaller docs to force 2 batches: 2 docs per batch (50 tokens each = 200 chars)
|
|
||||||
docs = ["x" * 200] * 4 # 50 tokens each; 50+50=100 <= 100, 100+50=150 > 100 -> 2 per batch
|
|
||||||
|
|
||||||
batch0_response = MagicMock()
|
|
||||||
batch0_response.status_code = 200
|
|
||||||
batch0_response.json.return_value = {
|
|
||||||
"results": [
|
|
||||||
{"index": 0, "relevance_score": 0.9},
|
|
||||||
{"index": 1, "relevance_score": 0.8},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
batch0_response.raise_for_status = MagicMock()
|
|
||||||
|
|
||||||
batch1_response = MagicMock()
|
|
||||||
batch1_response.status_code = 200
|
|
||||||
batch1_response.json.return_value = {
|
|
||||||
"results": [
|
|
||||||
{"index": 0, "relevance_score": 0.7},
|
|
||||||
{"index": 1, "relevance_score": 0.6},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
batch1_response.raise_for_status = MagicMock()
|
|
||||||
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
|
||||||
mock_client = MagicMock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
mock_client.post.side_effect = [batch0_response, batch1_response]
|
|
||||||
|
|
||||||
reranker = APIReranker(config)
|
|
||||||
|
|
||||||
with patch("time.sleep"):
|
|
||||||
scores = reranker.score_pairs("query", docs)
|
|
||||||
|
|
||||||
assert len(scores) == 4
|
|
||||||
# All original indices should have scores
|
|
||||||
assert all(s > 0 for s in scores)
|
|
||||||
@@ -1,156 +0,0 @@
|
|||||||
"""Unit tests for search layer: FTSEngine, fusion, and SearchPipeline."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.search.fts import FTSEngine
|
|
||||||
from codexlens_search.search.fusion import (
|
|
||||||
DEFAULT_WEIGHTS,
|
|
||||||
QueryIntent,
|
|
||||||
detect_query_intent,
|
|
||||||
get_adaptive_weights,
|
|
||||||
reciprocal_rank_fusion,
|
|
||||||
)
|
|
||||||
from codexlens_search.search.pipeline import SearchPipeline, SearchResult
|
|
||||||
from codexlens_search.config import Config
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def make_fts(docs: list[tuple[int, str, str]] | None = None) -> FTSEngine:
|
|
||||||
"""Create an in-memory FTSEngine and optionally add documents."""
|
|
||||||
engine = FTSEngine(":memory:")
|
|
||||||
if docs:
|
|
||||||
engine.add_documents(docs)
|
|
||||||
return engine
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# FTSEngine tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def test_fts_add_and_exact_search():
|
|
||||||
docs = [
|
|
||||||
(1, "a.py", "def authenticate user password login"),
|
|
||||||
(2, "b.py", "connect to database with credentials"),
|
|
||||||
(3, "c.py", "render template html response"),
|
|
||||||
]
|
|
||||||
engine = make_fts(docs)
|
|
||||||
results = engine.exact_search("authenticate", top_k=10)
|
|
||||||
ids = [r[0] for r in results]
|
|
||||||
assert 1 in ids, "doc 1 should match 'authenticate'"
|
|
||||||
assert 2 not in ids or results[0][0] == 1 # doc 1 must rank higher
|
|
||||||
|
|
||||||
|
|
||||||
def test_fts_fuzzy_search_prefix():
|
|
||||||
docs = [
|
|
||||||
(10, "auth.py", "authentication token refresh"),
|
|
||||||
(11, "db.py", "database connection pool"),
|
|
||||||
(12, "ui.py", "render button click handler"),
|
|
||||||
]
|
|
||||||
engine = make_fts(docs)
|
|
||||||
# Prefix 'auth' should match 'authentication' in doc 10
|
|
||||||
results = engine.fuzzy_search("auth", top_k=10)
|
|
||||||
ids = [r[0] for r in results]
|
|
||||||
assert 10 in ids, "prefix 'auth' should match doc 10 with 'authentication'"
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# RRF fusion tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def test_rrf_fusion_ordering():
|
|
||||||
"""When two sources agree on top-1, it should rank first in fused result."""
|
|
||||||
source_a = [(1, 0.9), (2, 0.5), (3, 0.2)]
|
|
||||||
source_b = [(1, 0.8), (3, 0.6), (2, 0.1)]
|
|
||||||
fused = reciprocal_rank_fusion({"a": source_a, "b": source_b})
|
|
||||||
assert fused[0][0] == 1, "doc 1 agreed top by both sources must rank first"
|
|
||||||
|
|
||||||
|
|
||||||
def test_rrf_equal_weight_default():
|
|
||||||
"""Calling with None weights should use DEFAULT_WEIGHTS shape (not crash)."""
|
|
||||||
source_exact = [(5, 1.0), (6, 0.8)]
|
|
||||||
source_vector = [(6, 0.9), (5, 0.7)]
|
|
||||||
# Should not raise and should return results
|
|
||||||
fused = reciprocal_rank_fusion(
|
|
||||||
{"exact": source_exact, "vector": source_vector},
|
|
||||||
weights=None,
|
|
||||||
)
|
|
||||||
assert len(fused) == 2
|
|
||||||
ids = [r[0] for r in fused]
|
|
||||||
assert 5 in ids and 6 in ids
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# detect_query_intent tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def test_detect_intent_code_symbol():
|
|
||||||
assert detect_query_intent("def authenticate()") == QueryIntent.CODE_SYMBOL
|
|
||||||
|
|
||||||
|
|
||||||
def test_detect_intent_natural():
|
|
||||||
assert detect_query_intent("how do I authenticate users") == QueryIntent.NATURAL_LANGUAGE
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# SearchPipeline tests
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _make_pipeline(fts: FTSEngine, top_k: int = 5) -> SearchPipeline:
|
|
||||||
"""Build a SearchPipeline with mocked heavy components."""
|
|
||||||
cfg = Config.small()
|
|
||||||
cfg.reranker_top_k = top_k
|
|
||||||
|
|
||||||
embedder = MagicMock()
|
|
||||||
embedder.embed.return_value = [[0.1] * cfg.embed_dim]
|
|
||||||
|
|
||||||
binary_store = MagicMock()
|
|
||||||
binary_store.coarse_search.return_value = ([1, 2, 3], None)
|
|
||||||
|
|
||||||
ann_index = MagicMock()
|
|
||||||
ann_index.fine_search.return_value = ([1, 2, 3], [0.9, 0.8, 0.7])
|
|
||||||
|
|
||||||
reranker = MagicMock()
|
|
||||||
# Return a score for each content string passed
|
|
||||||
reranker.score_pairs.side_effect = lambda q, contents: [0.9 - i * 0.1 for i in range(len(contents))]
|
|
||||||
|
|
||||||
return SearchPipeline(
|
|
||||||
embedder=embedder,
|
|
||||||
binary_store=binary_store,
|
|
||||||
ann_index=ann_index,
|
|
||||||
reranker=reranker,
|
|
||||||
fts=fts,
|
|
||||||
config=cfg,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_search_returns_results():
|
|
||||||
docs = [
|
|
||||||
(1, "a.py", "test content alpha"),
|
|
||||||
(2, "b.py", "test content beta"),
|
|
||||||
(3, "c.py", "test content gamma"),
|
|
||||||
]
|
|
||||||
fts = make_fts(docs)
|
|
||||||
pipeline = _make_pipeline(fts)
|
|
||||||
results = pipeline.search("test")
|
|
||||||
assert len(results) > 0
|
|
||||||
assert all(isinstance(r, SearchResult) for r in results)
|
|
||||||
|
|
||||||
|
|
||||||
def test_pipeline_top_k_limit():
|
|
||||||
docs = [
|
|
||||||
(1, "a.py", "hello world one"),
|
|
||||||
(2, "b.py", "hello world two"),
|
|
||||||
(3, "c.py", "hello world three"),
|
|
||||||
(4, "d.py", "hello world four"),
|
|
||||||
(5, "e.py", "hello world five"),
|
|
||||||
]
|
|
||||||
fts = make_fts(docs)
|
|
||||||
pipeline = _make_pipeline(fts, top_k=2)
|
|
||||||
results = pipeline.search("hello", top_k=2)
|
|
||||||
assert len(results) <= 2, "pipeline must respect top_k limit"
|
|
||||||
@@ -1,271 +0,0 @@
|
|||||||
"""Unit tests for watcher module — events, FileWatcher debounce/dedup, IncrementalIndexer."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from codexlens_search.watcher.events import ChangeType, FileEvent, WatcherConfig
|
|
||||||
from codexlens_search.watcher.incremental_indexer import BatchResult, IncrementalIndexer
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# ChangeType enum
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestChangeType:
|
|
||||||
def test_values(self):
|
|
||||||
assert ChangeType.CREATED.value == "created"
|
|
||||||
assert ChangeType.MODIFIED.value == "modified"
|
|
||||||
assert ChangeType.DELETED.value == "deleted"
|
|
||||||
|
|
||||||
def test_all_members(self):
|
|
||||||
assert len(ChangeType) == 3
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# FileEvent
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestFileEvent:
|
|
||||||
def test_creation(self):
|
|
||||||
e = FileEvent(path=Path("a.py"), change_type=ChangeType.CREATED)
|
|
||||||
assert e.path == Path("a.py")
|
|
||||||
assert e.change_type == ChangeType.CREATED
|
|
||||||
assert isinstance(e.timestamp, float)
|
|
||||||
|
|
||||||
def test_custom_timestamp(self):
|
|
||||||
e = FileEvent(path=Path("b.py"), change_type=ChangeType.DELETED, timestamp=42.0)
|
|
||||||
assert e.timestamp == 42.0
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# WatcherConfig
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestWatcherConfig:
|
|
||||||
def test_defaults(self):
|
|
||||||
cfg = WatcherConfig()
|
|
||||||
assert cfg.debounce_ms == 500
|
|
||||||
assert ".git" in cfg.ignored_patterns
|
|
||||||
assert "__pycache__" in cfg.ignored_patterns
|
|
||||||
assert "node_modules" in cfg.ignored_patterns
|
|
||||||
assert ".codexlens" in cfg.ignored_patterns
|
|
||||||
|
|
||||||
def test_custom(self):
|
|
||||||
cfg = WatcherConfig(debounce_ms=1000, ignored_patterns={".custom"})
|
|
||||||
assert cfg.debounce_ms == 1000
|
|
||||||
assert cfg.ignored_patterns == {".custom"}
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# BatchResult
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestBatchResult:
    """Unit tests for the BatchResult accumulator."""

    def test_defaults(self):
        """A fresh result starts with zero counters and an empty error list."""
        result = BatchResult()
        assert result.files_indexed == 0
        assert result.files_removed == 0
        assert result.chunks_created == 0
        assert result.errors == []

    def test_total_processed(self):
        """total_processed is the sum of indexed and removed files."""
        result = BatchResult(files_indexed=3, files_removed=2)
        assert result.total_processed == 5

    def test_has_errors(self):
        """has_errors flips to True once any error message is recorded."""
        result = BatchResult()
        assert result.has_errors is False
        result.errors.append("oops")
        assert result.has_errors is True
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# IncrementalIndexer — event routing
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestIncrementalIndexer:
    """Verify that file events are routed to the correct pipeline operations."""

    @pytest.fixture
    def mock_pipeline(self):
        """Pipeline stub whose index_file reports one file and three chunks."""
        pipeline = MagicMock()
        pipeline.index_file.return_value = MagicMock(
            files_processed=1, chunks_created=3
        )
        return pipeline

    @staticmethod
    def _force_flag(pipeline):
        """Return the 'force' keyword passed to the most recent index_file call."""
        call = pipeline.index_file.call_args
        return call.kwargs.get("force", call[1].get("force"))

    def test_routes_created_to_index_file(self, mock_pipeline):
        """CREATED events index the file without forcing a re-index."""
        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        result = indexer.process_events(
            [FileEvent(Path("/project/src/new.py"), ChangeType.CREATED)]
        )
        assert result.files_indexed == 1
        mock_pipeline.index_file.assert_called_once()
        # CREATED should NOT use force=True
        assert self._force_flag(mock_pipeline) is False

    def test_routes_modified_to_index_file_with_force(self, mock_pipeline):
        """MODIFIED events re-index the file with force=True."""
        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        result = indexer.process_events(
            [FileEvent(Path("/project/src/changed.py"), ChangeType.MODIFIED)]
        )
        assert result.files_indexed == 1
        assert self._force_flag(mock_pipeline) is True

    def test_routes_deleted_to_remove_file(self, mock_pipeline, tmp_path):
        """DELETED events call remove_file with a root-relative path."""
        root = tmp_path / "project"
        root.mkdir()
        indexer = IncrementalIndexer(mock_pipeline, root=root)
        result = indexer.process_events(
            [FileEvent(root / "src" / "old.py", ChangeType.DELETED)]
        )
        assert result.files_removed == 1
        # On Windows relative_to produces backslashes, normalize before comparing.
        removed_path = mock_pipeline.remove_file.call_args[0][0]
        assert removed_path.replace("\\", "/") == "src/old.py"

    def test_batch_with_mixed_events(self, mock_pipeline):
        """A batch with all three change types updates both counters."""
        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        mixed = [
            FileEvent(Path("/project/a.py"), ChangeType.CREATED),
            FileEvent(Path("/project/b.py"), ChangeType.MODIFIED),
            FileEvent(Path("/project/c.py"), ChangeType.DELETED),
        ]
        result = indexer.process_events(mixed)
        assert result.files_indexed == 2
        assert result.files_removed == 1
        assert result.total_processed == 3

    def test_error_isolation(self, mock_pipeline):
        """One file failure should not stop processing of others."""
        calls = {"n": 0}

        def fail_first(*args, **kwargs):
            calls["n"] += 1
            if calls["n"] == 1:
                raise RuntimeError("disk error")
            return MagicMock(files_processed=1, chunks_created=1)

        mock_pipeline.index_file.side_effect = fail_first

        indexer = IncrementalIndexer(mock_pipeline, root=Path("/project"))
        result = indexer.process_events(
            [
                FileEvent(Path("/project/fail.py"), ChangeType.CREATED),
                FileEvent(Path("/project/ok.py"), ChangeType.CREATED),
            ]
        )

        assert result.files_indexed == 1  # second succeeded
        assert len(result.errors) == 1  # first failed
        assert "disk error" in result.errors[0]

    def test_empty_events(self, mock_pipeline):
        """An empty batch touches nothing on the pipeline."""
        indexer = IncrementalIndexer(mock_pipeline)
        result = indexer.process_events([])
        assert result.total_processed == 0
        mock_pipeline.index_file.assert_not_called()
        mock_pipeline.remove_file.assert_not_called()
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# FileWatcher — debounce and dedup logic (unit-level, no actual FS)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
class TestFileWatcherLogic:
    """Test FileWatcher internals without starting a real watchdog Observer."""

    @pytest.fixture
    def watcher_parts(self):
        """Create a FileWatcher wired to a list collector; no observer started."""
        # Import here since watchdog is optional
        from codexlens_search.watcher.file_watcher import FileWatcher, _EVENT_PRIORITY

        collected = []
        watcher = FileWatcher(
            Path("."),
            WatcherConfig(debounce_ms=100),
            collected.extend,
        )
        return watcher, collected, _EVENT_PRIORITY

    def test_event_priority_ordering(self, watcher_parts):
        """DELETED outranks MODIFIED, which outranks CREATED."""
        _, _, priority = watcher_parts
        assert (
            priority[ChangeType.DELETED]
            > priority[ChangeType.MODIFIED]
            > priority[ChangeType.CREATED]
        )

    def test_dedup_keeps_higher_priority(self, watcher_parts, tmp_path):
        """A later, higher-priority event replaces the pending one for a path."""
        watcher, collected, _ = watcher_parts
        target = str(tmp_path / "a.py")
        for change in (ChangeType.CREATED, ChangeType.DELETED):
            watcher._on_raw_event(target, change)

        watcher.flush_now()

        assert [e.change_type for e in collected] == [ChangeType.DELETED]

    def test_dedup_does_not_downgrade(self, watcher_parts, tmp_path):
        """A later, lower-priority event cannot replace a pending higher one."""
        watcher, collected, _ = watcher_parts
        target = str(tmp_path / "b.py")
        for change in (ChangeType.DELETED, ChangeType.CREATED):
            watcher._on_raw_event(target, change)

        watcher.flush_now()
        # CREATED (priority 1) < DELETED (priority 3), so DELETED stays
        assert [e.change_type for e in collected] == [ChangeType.DELETED]

    def test_multiple_files_kept(self, watcher_parts, tmp_path):
        """Events for distinct paths are all retained through the flush."""
        watcher, collected, _ = watcher_parts
        changes = {
            "a.py": ChangeType.CREATED,
            "b.py": ChangeType.MODIFIED,
            "c.py": ChangeType.DELETED,
        }
        for name, change in changes.items():
            watcher._on_raw_event(str(tmp_path / name), change)

        watcher.flush_now()
        assert len(collected) == 3
        assert len({str(e.path) for e in collected}) == 3

    def test_flush_clears_pending(self, watcher_parts, tmp_path):
        """A second flush with no new events delivers nothing."""
        watcher, collected, _ = watcher_parts
        watcher._on_raw_event(str(tmp_path / "a.py"), ChangeType.CREATED)
        watcher.flush_now()
        assert len(collected) == 1

        collected.clear()
        watcher.flush_now()
        assert len(collected) == 0

    def test_should_watch_filters_ignored(self, watcher_parts):
        """Paths under ignored directories are filtered; normal sources pass."""
        watcher, _, _ = watcher_parts
        assert watcher._should_watch(Path("/project/src/main.py")) is True
        for ignored in (
            Path("/project/.git/config"),
            Path("/project/node_modules/foo.js"),
            Path("/project/__pycache__/mod.pyc"),
        ):
            assert watcher._should_watch(ignored) is False

    def test_jsonl_serialization(self):
        """events_to_jsonl emits one JSON object per line with type + timestamp."""
        import json

        from codexlens_search.watcher.file_watcher import FileWatcher

        output = FileWatcher.events_to_jsonl(
            [
                FileEvent(Path("/tmp/a.py"), ChangeType.CREATED, 1000.0),
                FileEvent(Path("/tmp/b.py"), ChangeType.DELETED, 2000.0),
            ]
        )
        records = [json.loads(line) for line in output.strip().split("\n")]
        assert len(records) == 2

        assert records[0]["change_type"] == "created"
        assert records[0]["timestamp"] == 1000.0

        assert records[1]["change_type"] == "deleted"
|
|
||||||
Reference in New Issue
Block a user