diff --git a/.gitignore b/.gitignore index dd10c194..2ce4cdaa 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model_q codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model_uint8.onnx codex-lens/.cache/huggingface/models/Xenova--ms-marco-MiniLM-L-6-v2/onnx/model.onnx codex-lens/data/registry.db +codex-lens-v2/ diff --git a/codex-lens-v2/.gitignore b/codex-lens-v2/.gitignore deleted file mode 100644 index 7cd674bd..00000000 --- a/codex-lens-v2/.gitignore +++ /dev/null @@ -1,33 +0,0 @@ -# Python -__pycache__/ -*.py[cod] -*.egg-info/ -dist/ -build/ -*.egg - -# Virtual environments -.venv/ -venv/ - -# IDE -.idea/ -.vscode/ -*.swp - -# Testing -.pytest_cache/ -.coverage -htmlcov/ - -# Index / cache -.codexlens/ -.index_cache/ -.ace-tool/ - -# Workflow (internal) -.workflow/ - -# OS -.DS_Store -Thumbs.db diff --git a/codex-lens-v2/LICENSE b/codex-lens-v2/LICENSE deleted file mode 100644 index b6226a54..00000000 --- a/codex-lens-v2/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2026 codexlens-search contributors - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/codex-lens-v2/README.md b/codex-lens-v2/README.md deleted file mode 100644 index 23c52d16..00000000 --- a/codex-lens-v2/README.md +++ /dev/null @@ -1,226 +0,0 @@ -# codexlens-search - -Semantic code search engine with MCP server for Claude Code. - -2-stage vector search + FTS + RRF fusion + reranking — install once, configure API keys, ready to use. - -## Quick Start (Claude Code MCP) - -Add to your project `.mcp.json`: - -```json -{ - "mcpServers": { - "codexlens": { - "command": "uvx", - "args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"], - "env": { - "CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1", - "CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}", - "CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small", - "CODEXLENS_EMBED_DIM": "1536" - } - } - } -} -``` - -That's it. Claude Code will auto-discover the tools: `index_project` → `search_code`. - -## Install - -```bash -# Standard install (includes vector search + API clients) -uv pip install codexlens-search - -# With MCP server for Claude Code -uv pip install codexlens-search[mcp] -``` - -Optional extras: - -| Extra | Description | -|-------|-------------| -| `mcp` | MCP server (`codexlens-mcp` command) | -| `gpu` | GPU-accelerated embedding (onnxruntime-gpu) | -| `faiss-cpu` | FAISS ANN backend | -| `watcher` | File watcher for auto-indexing | - -## MCP Tools - -| Tool | Description | -|------|-------------| -| `search_code` | Semantic search with hybrid fusion + reranking | -| `index_project` | Build or rebuild the search index | -| `index_status` | Show index statistics | -| `index_update` | Incremental sync (only changed files) | -| `find_files` | Glob file discovery | -| `list_models` | List models with cache status | -| `download_models` | Download local fastembed models | - -## MCP Configuration Examples - -### API Embedding Only (simplest) - -```json -{ - "mcpServers": { - "codexlens": { - "command": "uvx", - "args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"], - "env": { - "CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1", - "CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}", - "CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small", - "CODEXLENS_EMBED_DIM": "1536" - } - } - } -} -``` - -### API Embedding + API Reranker (best quality) - -```json -{ - "mcpServers": { - "codexlens": { - "command": "uvx", - "args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"], - "env": { - "CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1", - "CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}", - "CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small", - "CODEXLENS_EMBED_DIM": "1536", - "CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1", - "CODEXLENS_RERANKER_API_KEY": "${JINA_API_KEY}", - "CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual" - } - } - } -} -``` - -### Multi-Endpoint Load Balancing - -```json -{ - "mcpServers": { - "codexlens": { - "command": "uvx", - "args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"], - "env": { - "CODEXLENS_EMBED_API_ENDPOINTS": "https://api1.example.com/v1|sk-key1|model,https://api2.example.com/v1|sk-key2|model", - "CODEXLENS_EMBED_DIM": "1536" - } - } - } -} -``` - -Format: `url|key|model,url|key|model,...` - -### Local Models (Offline, No API) - -```bash -uv pip install codexlens-search[mcp] -codexlens-search download-models -``` - -```json -{ - "mcpServers": { - "codexlens": { - "command": "codexlens-mcp", - "env": {} - } - } -} -``` - -### Pre-installed (no uvx) - -```json -{ - "mcpServers": { - "codexlens": { - "command": "codexlens-mcp", - "env": { - "CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1", - "CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}", - "CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small", - "CODEXLENS_EMBED_DIM": "1536" - } - } - } -} -``` - -## CLI - -```bash -codexlens-search --db-path .codexlens sync --root ./src -codexlens-search --db-path .codexlens search -q "auth handler" -k 10 -codexlens-search --db-path .codexlens status -codexlens-search list-models -codexlens-search download-models -``` - -## Environment Variables - -### Embedding - -| Variable | Description | Example | -|----------|-------------|---------| -| `CODEXLENS_EMBED_API_URL` | Embedding API base URL | `https://api.openai.com/v1` | -| `CODEXLENS_EMBED_API_KEY` | API key | `sk-xxx` | -| `CODEXLENS_EMBED_API_MODEL` | Model name | `text-embedding-3-small` | -| `CODEXLENS_EMBED_API_ENDPOINTS` | Multi-endpoint: `url\|key\|model,...` | See above | -| `CODEXLENS_EMBED_DIM` | Vector dimension | `1536` | - -### Reranker - -| Variable | Description | Example | -|----------|-------------|---------| -| `CODEXLENS_RERANKER_API_URL` | Reranker API base URL | `https://api.jina.ai/v1` | -| `CODEXLENS_RERANKER_API_KEY` | API key | `jina-xxx` | -| `CODEXLENS_RERANKER_API_MODEL` | Model name | `jina-reranker-v2-base-multilingual` | - -### Tuning - -| Variable | Default | Description | -|----------|---------|-------------| -| `CODEXLENS_BINARY_TOP_K` | `200` | Binary coarse search candidates | -| `CODEXLENS_ANN_TOP_K` | `50` | ANN fine search candidates | -| `CODEXLENS_FTS_TOP_K` | `50` | FTS results per method | -| `CODEXLENS_FUSION_K` | `60` | RRF fusion k parameter | -| `CODEXLENS_RERANKER_TOP_K` | `20` | Results to rerank | -| `CODEXLENS_EMBED_BATCH_SIZE` | `32` | Max texts per API batch (auto-splits on 413) | -| `CODEXLENS_EMBED_MAX_TOKENS` | `8192` | Max tokens per text (truncate if exceeded, 0=no limit) | -| `CODEXLENS_INDEX_WORKERS` | `2` | Parallel indexing workers | -| `CODEXLENS_MAX_FILE_SIZE` | `1000000` | Max file size in bytes | - -## Architecture - -``` -Query → [Embedder] → query vector - ├→ [BinaryStore] → candidates (Hamming) - │ └→ [ANNIndex] → ranked IDs (cosine) - ├→ [FTS exact] → exact matches - └→ [FTS fuzzy] → fuzzy matches - └→ [RRF Fusion] → merged ranking - └→ [Reranker] → final top-k -``` - -## Development - -```bash -git clone https://github.com/catlog22/codexlens-search.git -cd codexlens-search -uv pip install -e ".[dev]" -pytest -``` - -## License - -MIT diff --git a/codex-lens-v2/conftest.py b/codex-lens-v2/conftest.py deleted file mode 100644 index 9753a55a..00000000 --- a/codex-lens-v2/conftest.py +++ /dev/null @@ -1,5 +0,0 @@ -import sys -import os - -# Ensure the local src directory takes precedence over any installed codexlens_search package -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "src")) diff --git a/codex-lens-v2/pyproject.toml b/codex-lens-v2/pyproject.toml deleted file mode 100644 index 9e10e7ea..00000000 --- a/codex-lens-v2/pyproject.toml +++ /dev/null @@ -1,63 +0,0 @@ -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[project] -name = "codexlens-search" -version = "0.4.1" -description = "Lightweight semantic code search engine — 2-stage vector + FTS + RRF fusion + MCP server" -requires-python = ">=3.10" -dependencies = [ - "hnswlib>=0.8.0", - "numpy>=1.26", - "fastembed>=0.4.0,<2.0", - "httpx>=0.25", -] -license = {text = "MIT"} -readme = "README.md" -authors = [ - {name = "codexlens-search contributors"}, -] -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "License :: OSI Approved :: MIT License", - "Topic :: Software Development :: Libraries", - "Topic :: Text Processing :: Indexing", - "Operating System :: OS Independent", -] - -[project.urls] -Homepage = "https://github.com/catlog22/codexlens-search" -Repository = "https://github.com/catlog22/codexlens-search" - -[project.optional-dependencies] -mcp = [ - "mcp[cli]>=1.0.0", -] -gpu = [ - "onnxruntime-gpu>=1.16", -] -faiss-cpu = [ - "faiss-cpu>=1.7.4", -] -faiss-gpu = [ - "faiss-gpu>=1.7.4", -] -watcher = [ - "watchdog>=3.0", -] -dev = [ - "pytest>=7.0", - "pytest-cov", -] - -[project.scripts] -codexlens-search = "codexlens_search.bridge:main" -codexlens-mcp = "codexlens_search.mcp_server:main" - -[tool.hatch.build.targets.wheel] -packages = ["src/codexlens_search"] diff --git a/codex-lens-v2/scripts/index_and_search.py b/codex-lens-v2/scripts/index_and_search.py deleted file mode 100644 index d20c1ef4..00000000 --- a/codex-lens-v2/scripts/index_and_search.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -对 D:/Claude_dms3 仓库进行索引并测试搜索。 -用法: python scripts/index_and_search.py -""" -import sys -import time -from pathlib import Path - -# 确保 src 可被导入 -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from codexlens_search.config import Config -from codexlens_search.core.factory import create_ann_index, create_binary_index -from codexlens_search.embed.local import FastEmbedEmbedder -from codexlens_search.indexing import IndexingPipeline -from codexlens_search.rerank.local import FastEmbedReranker -from codexlens_search.search.fts import FTSEngine -from codexlens_search.search.pipeline import SearchPipeline - -# ─── 配置 ────────────────────────────────────────────────────────────────── -REPO_ROOT = Path("D:/Claude_dms3") -INDEX_DIR = Path("D:/Claude_dms3/codex-lens-v2/.index_cache") -EXTENSIONS = {".py", ".ts", ".js", ".md"} -MAX_FILE_SIZE = 50_000 # bytes -MAX_CHUNK_CHARS = 800 # 每个 chunk 的最大字符数 -CHUNK_OVERLAP = 100 - -# ─── 文件收集 ─────────────────────────────────────────────────────────────── -SKIP_DIRS = { - ".git", "node_modules", "__pycache__", ".pytest_cache", - "dist", "build", ".venv", "venv", ".cache", ".index_cache", - "codex-lens-v2", # 不索引自身 -} - -def collect_files(root: Path) -> list[Path]: - files = [] - for p in root.rglob("*"): - if any(part in SKIP_DIRS for part in p.parts): - continue - if p.is_file() and p.suffix in EXTENSIONS: - if p.stat().st_size <= MAX_FILE_SIZE: - files.append(p) - return files - -# ─── 主流程 ───────────────────────────────────────────────────────────────── -def main(): - INDEX_DIR.mkdir(parents=True, exist_ok=True) - - # 1. 使用小 profile 加快速度 - config = Config( - embed_model="BAAI/bge-small-en-v1.5", - embed_dim=384, - embed_batch_size=32, - hnsw_ef=100, - hnsw_M=16, - binary_top_k=100, - ann_top_k=30, - reranker_top_k=10, - ) - - print("=== codex-lens-v2 索引测试 ===\n") - - # 2. 收集文件 - print(f"[1/4] 扫描 {REPO_ROOT} ...") - files = collect_files(REPO_ROOT) - print(f" 找到 {len(files)} 个文件") - - # 3. 初始化组件 - print(f"\n[2/4] 加载嵌入模型 (bge-small-en-v1.5, dim=384) ...") - embedder = FastEmbedEmbedder(config) - binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config) - ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config) - fts = FTSEngine(":memory:") # 内存 FTS,不持久化 - - # 4. 使用 IndexingPipeline 并行索引 (chunk -> embed -> index) - print(f"[3/4] 并行索引 {len(files)} 个文件 ...") - pipeline = IndexingPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - fts=fts, - config=config, - ) - stats = pipeline.index_files( - files, - root=REPO_ROOT, - max_chunk_chars=MAX_CHUNK_CHARS, - chunk_overlap=CHUNK_OVERLAP, - max_file_size=MAX_FILE_SIZE, - ) - print(f" 索引完成: {stats.files_processed} 文件, {stats.chunks_created} chunks ({stats.duration_seconds:.1f}s)") - - # 5. 搜索测试 - print(f"\n[4/4] 构建 SearchPipeline ...") - reranker = FastEmbedReranker(config) - pipeline = SearchPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - reranker=reranker, - fts=fts, - config=config, - ) - - queries = [ - "authentication middleware function", - "def embed_single", - "RRF fusion weights", - "fastembed TextCrossEncoder reranker", - "how to search code semantic", - ] - - print("\n" + "=" * 60) - for query in queries: - t0 = time.time() - results = pipeline.search(query, top_k=5) - elapsed = time.time() - t0 - print(f"\nQuery: {query!r} ({elapsed*1000:.0f}ms)") - if results: - for r in results: - print(f" [{r.score:.3f}] {r.path}") - else: - print(" (无结果)") - print("=" * 60) - print("\n测试完成 ✓") - -if __name__ == "__main__": - main() diff --git a/codex-lens-v2/scripts/test_small_e2e.py b/codex-lens-v2/scripts/test_small_e2e.py deleted file mode 100644 index c9236817..00000000 --- a/codex-lens-v2/scripts/test_small_e2e.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Small-folder end-to-end test: index tests/ directory (~10 files) and verify -indexing pipeline + all search features work correctly. - -Usage: python scripts/test_small_e2e.py -""" -import sys -import time -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -import numpy as np -from codexlens_search.config import Config -from codexlens_search.core.factory import create_ann_index, create_binary_index -from codexlens_search.embed.local import FastEmbedEmbedder -from codexlens_search.indexing import IndexingPipeline -from codexlens_search.rerank.local import FastEmbedReranker -from codexlens_search.search.fts import FTSEngine -from codexlens_search.search.pipeline import SearchPipeline - -PROJECT = Path(__file__).parent.parent -TARGET_DIR = PROJECT / "src" / "codexlens_search" # ~21 .py files, small -INDEX_DIR = PROJECT / ".test_index_cache" -EXTENSIONS = {".py"} - -passed = 0 -failed = 0 - - -def check(name: str, condition: bool, detail: str = ""): - global passed, failed - if condition: - passed += 1 - print(f" [PASS] {name}") - else: - failed += 1 - print(f" [FAIL] {name} — {detail}") - - -def main(): - global passed, failed - INDEX_DIR.mkdir(parents=True, exist_ok=True) - - config = Config( - embed_model="BAAI/bge-small-en-v1.5", - embed_dim=384, - embed_batch_size=32, - hnsw_ef=100, - hnsw_M=16, - binary_top_k=100, - ann_top_k=30, - reranker_model="Xenova/ms-marco-MiniLM-L-6-v2", - reranker_top_k=10, - ) - - files = [p for p in TARGET_DIR.rglob("*.py") if p.is_file()] - print(f"Target: {TARGET_DIR} ({len(files)} .py files)\n") - - # ── 1. Test IndexingPipeline ────────────────────────────── - print("=== 1. IndexingPipeline (parallel) ===") - embedder = FastEmbedEmbedder(config) - binary_store = create_binary_index(INDEX_DIR, config.embed_dim, config) - ann_index = create_ann_index(INDEX_DIR, config.embed_dim, config) - fts = FTSEngine(":memory:") - - t0 = time.time() - stats = IndexingPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - fts=fts, - config=config, - ).index_files(files, root=TARGET_DIR, max_chunk_chars=800, chunk_overlap=100) - elapsed = time.time() - t0 - - check("files_processed > 0", stats.files_processed > 0, f"got {stats.files_processed}") - check("chunks_created > 0", stats.chunks_created > 0, f"got {stats.chunks_created}") - check("indexing completed", elapsed < 120, f"took {elapsed:.1f}s") - print(f" Stats: {stats.files_processed} files, {stats.chunks_created} chunks, {elapsed:.1f}s\n") - - # ── 2. Test BinaryStore (pre-allocated, coarse search) ──── - print("=== 2. BinaryStore coarse search ===") - q_vec = embedder.embed_single("def search") - b_ids, b_dists = binary_store.coarse_search(q_vec, top_k=10) - check("binary returns results", len(b_ids) > 0, f"got {len(b_ids)}") - check("binary ids are ints", all(isinstance(int(i), int) for i in b_ids)) - print(f" Top 5 binary IDs: {b_ids[:5]}\n") - - # ── 3. Test ANNIndex (fine search) ──────────────────────── - print("=== 3. ANNIndex fine search ===") - a_ids, a_dists = ann_index.fine_search(q_vec, top_k=10) - check("ann returns results", len(a_ids) > 0, f"got {len(a_ids)}") - check("ann scores are floats", all(isinstance(float(d), float) for d in a_dists)) - print(f" Top 5 ANN IDs: {a_ids[:5]}\n") - - # ── 4. Test FTSEngine (exact + fuzzy) ───────────────────── - print("=== 4. FTSEngine search ===") - exact = fts.exact_search("def search", top_k=5) - fuzzy = fts.fuzzy_search("embedd", top_k=5) - check("exact search returns results", len(exact) > 0, f"got {len(exact)}") - check("fuzzy search returns results", len(fuzzy) > 0, f"got {len(fuzzy)}") - print(f" Exact hits: {len(exact)}, Fuzzy hits: {len(fuzzy)}\n") - - # ── 5. Test SearchPipeline (parallel FTS||vector + fusion + rerank) ── - print("=== 5. SearchPipeline (full pipeline) ===") - reranker = FastEmbedReranker(config) - search = SearchPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - reranker=reranker, - fts=fts, - config=config, - ) - - queries = [ - ("def embed_single", "code symbol search"), - ("search pipeline fusion", "natural language search"), - ("Config dataclass", "exact match search"), - ("binary store hamming", "domain-specific search"), - ("", "empty query handling"), - ] - - for query, desc in queries: - t0 = time.time() - results = search.search(query, top_k=5) - ms = (time.time() - t0) * 1000 - - if query == "": - check(f"{desc}: no crash", isinstance(results, list)) - else: - check(f"{desc}: returns results", len(results) > 0, f"'{query}' got 0 results") - if results: - check(f"{desc}: has scores", all(isinstance(r.score, (int, float)) for r in results)) - check(f"{desc}: has paths", all(r.path for r in results)) - check(f"{desc}: respects top_k", len(results) <= 5) - print(f" Top result: [{results[0].score:.3f}] {results[0].path}") - print(f" Latency: {ms:.0f}ms") - - # ── 6. Test result quality (sanity) ─────────────────────── - print("\n=== 6. Result quality sanity checks ===") - r1 = search.search("BinaryStore add coarse_search", top_k=5) - if r1: - paths = [r.path for r in r1] - check("BinaryStore query -> binary/core in results", - any("binary" in p or "core" in p for p in paths), - f"got paths: {paths}") - - r2 = search.search("FTSEngine exact_search fuzzy_search", top_k=5) - if r2: - paths = [r.path for r in r2] - check("FTSEngine query -> fts/search in results", - any("fts" in p or "search" in p for p in paths), - f"got paths: {paths}") - - r3 = search.search("IndexingPipeline parallel queue", top_k=3) - if r3: - paths = [r.path for r in r3] - check("Pipeline query -> pipeline in results", - any("pipeline" in p or "indexing" in p for p in paths), - f"got paths: {paths}") - - # ── Summary ─────────────────────────────────────────────── - print(f"\n{'=' * 50}") - print(f"Results: {passed} passed, {failed} failed, {passed + failed} total") - if failed == 0: - print("ALL TESTS PASSED") - else: - print(f"WARNING: {failed} test(s) failed") - print(f"{'=' * 50}") - - # Cleanup - import shutil - shutil.rmtree(INDEX_DIR, ignore_errors=True) - - return 0 if failed == 0 else 1 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/codex-lens-v2/src/codexlens_search/__init__.py b/codex-lens-v2/src/codexlens_search/__init__.py deleted file mode 100644 index 979f92db..00000000 --- a/codex-lens-v2/src/codexlens_search/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -"""codexlens-search: Lightweight semantic code search engine. - -Public API for consumers (e.g. codex-lens): - - from codexlens_search import SearchPipeline, IndexingPipeline, Config - from codexlens_search.core import create_ann_index, create_binary_index - from codexlens_search.embed.local import FastEmbedEmbedder - from codexlens_search.rerank.api import APIReranker -""" -from codexlens_search.config import Config -from codexlens_search.indexing import IndexingPipeline, IndexStats -from codexlens_search.search.pipeline import SearchPipeline, SearchResult - -__all__ = [ - "Config", - "IndexingPipeline", - "IndexStats", - "SearchPipeline", - "SearchResult", -] diff --git a/codex-lens-v2/src/codexlens_search/bridge.py b/codex-lens-v2/src/codexlens_search/bridge.py deleted file mode 100644 index 0590b694..00000000 --- a/codex-lens-v2/src/codexlens_search/bridge.py +++ /dev/null @@ -1,676 +0,0 @@ -"""CLI bridge for ccw integration. - -Argparse-based CLI with JSON output protocol. -Each subcommand outputs a single JSON object to stdout. -Watch command outputs JSONL (one JSON per line). -All errors are JSON {"error": string} to stdout with non-zero exit code. -""" -from __future__ import annotations - -import argparse -import glob -import json -import logging -import os -import sys -import time -from pathlib import Path - -log = logging.getLogger("codexlens_search.bridge") - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def _ensure_utf8_stdio() -> None: - """Force UTF-8 encoding on stdout/stderr (Windows defaults to GBK/cp936).""" - if sys.platform == "win32": - for stream_name in ("stdout", "stderr"): - stream = getattr(sys, stream_name) - if hasattr(stream, "reconfigure"): - stream.reconfigure(encoding="utf-8", errors="replace") - - -def _json_output(data: dict | list) -> None: - """Print JSON to stdout with flush.""" - print(json.dumps(data, ensure_ascii=True), flush=True) - - -def _error_exit(message: str, code: int = 1) -> None: - """Print JSON error to stdout and exit.""" - _json_output({"error": message}) - sys.exit(code) - - -def _resolve_db_path(args: argparse.Namespace) -> Path: - """Return the --db-path as a resolved Path, creating parent dirs.""" - db_path = Path(args.db_path).resolve() - db_path.mkdir(parents=True, exist_ok=True) - return db_path - - -def create_config_from_env(db_path: str | Path, **overrides: object) -> "Config": - """Build Config from environment variables and optional overrides. - - Used by both CLI bridge and MCP server. - """ - from codexlens_search.config import Config - - kwargs: dict = {} - # Apply explicit overrides first - for key in ("embed_model", "embed_api_url", "embed_api_key", "embed_api_model"): - if overrides.get(key): - kwargs[key] = overrides[key] - # Env vars as fallback - if "embed_api_url" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_URL"): - kwargs["embed_api_url"] = os.environ["CODEXLENS_EMBED_API_URL"] - if "embed_api_key" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_KEY"): - kwargs["embed_api_key"] = os.environ["CODEXLENS_EMBED_API_KEY"] - if "embed_api_model" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_MODEL"): - kwargs["embed_api_model"] = os.environ["CODEXLENS_EMBED_API_MODEL"] - # Multi-endpoint: CODEXLENS_EMBED_API_ENDPOINTS=url1|key1|model1,url2|key2|model2 - endpoints_env = os.environ.get("CODEXLENS_EMBED_API_ENDPOINTS", "") - if endpoints_env: - endpoints = [] - for entry in endpoints_env.split(","): - parts = entry.strip().split("|") - if len(parts) >= 2: - ep = {"url": parts[0], "key": parts[1]} - if len(parts) >= 3: - ep["model"] = parts[2] - endpoints.append(ep) - if endpoints: - kwargs["embed_api_endpoints"] = endpoints - # Embed dimension and concurrency from env - if os.environ.get("CODEXLENS_EMBED_DIM"): - kwargs["embed_dim"] = int(os.environ["CODEXLENS_EMBED_DIM"]) - if os.environ.get("CODEXLENS_EMBED_BATCH_SIZE"): - kwargs["embed_batch_size"] = int(os.environ["CODEXLENS_EMBED_BATCH_SIZE"]) - if os.environ.get("CODEXLENS_EMBED_API_CONCURRENCY"): - kwargs["embed_api_concurrency"] = int(os.environ["CODEXLENS_EMBED_API_CONCURRENCY"]) - if os.environ.get("CODEXLENS_EMBED_API_MAX_TOKENS"): - kwargs["embed_api_max_tokens_per_batch"] = int(os.environ["CODEXLENS_EMBED_API_MAX_TOKENS"]) - if os.environ.get("CODEXLENS_EMBED_MAX_TOKENS"): - kwargs["embed_max_tokens"] = int(os.environ["CODEXLENS_EMBED_MAX_TOKENS"]) - # Reranker API env vars - if os.environ.get("CODEXLENS_RERANKER_API_URL"): - kwargs["reranker_api_url"] = os.environ["CODEXLENS_RERANKER_API_URL"] - if os.environ.get("CODEXLENS_RERANKER_API_KEY"): - kwargs["reranker_api_key"] = os.environ["CODEXLENS_RERANKER_API_KEY"] - if os.environ.get("CODEXLENS_RERANKER_API_MODEL"): - kwargs["reranker_api_model"] = os.environ["CODEXLENS_RERANKER_API_MODEL"] - # Search pipeline params from env - if os.environ.get("CODEXLENS_RERANKER_TOP_K"): - kwargs["reranker_top_k"] = int(os.environ["CODEXLENS_RERANKER_TOP_K"]) - if os.environ.get("CODEXLENS_RERANKER_BATCH_SIZE"): - kwargs["reranker_batch_size"] = int(os.environ["CODEXLENS_RERANKER_BATCH_SIZE"]) - if os.environ.get("CODEXLENS_BINARY_TOP_K"): - kwargs["binary_top_k"] = int(os.environ["CODEXLENS_BINARY_TOP_K"]) - if os.environ.get("CODEXLENS_ANN_TOP_K"): - kwargs["ann_top_k"] = int(os.environ["CODEXLENS_ANN_TOP_K"]) - if os.environ.get("CODEXLENS_FTS_TOP_K"): - kwargs["fts_top_k"] = int(os.environ["CODEXLENS_FTS_TOP_K"]) - if os.environ.get("CODEXLENS_FUSION_K"): - kwargs["fusion_k"] = int(os.environ["CODEXLENS_FUSION_K"]) - # Indexing params from env - if os.environ.get("CODEXLENS_CODE_AWARE_CHUNKING"): - kwargs["code_aware_chunking"] = os.environ["CODEXLENS_CODE_AWARE_CHUNKING"].lower() == "true" - if os.environ.get("CODEXLENS_INDEX_WORKERS"): - kwargs["index_workers"] = int(os.environ["CODEXLENS_INDEX_WORKERS"]) - if os.environ.get("CODEXLENS_MAX_FILE_SIZE"): - kwargs["max_file_size_bytes"] = int(os.environ["CODEXLENS_MAX_FILE_SIZE"]) - if os.environ.get("CODEXLENS_HNSW_EF"): - kwargs["hnsw_ef"] = int(os.environ["CODEXLENS_HNSW_EF"]) - if os.environ.get("CODEXLENS_HNSW_M"): - kwargs["hnsw_M"] = int(os.environ["CODEXLENS_HNSW_M"]) - # Tier config from env - if os.environ.get("CODEXLENS_TIER_HOT_HOURS"): - kwargs["tier_hot_hours"] = int(os.environ["CODEXLENS_TIER_HOT_HOURS"]) - if os.environ.get("CODEXLENS_TIER_COLD_HOURS"): - kwargs["tier_cold_hours"] = int(os.environ["CODEXLENS_TIER_COLD_HOURS"]) - # Search quality tier from env - if os.environ.get("CODEXLENS_SEARCH_QUALITY"): - kwargs["default_search_quality"] = os.environ["CODEXLENS_SEARCH_QUALITY"] - # Shard config from env - if os.environ.get("CODEXLENS_NUM_SHARDS"): - kwargs["num_shards"] = int(os.environ["CODEXLENS_NUM_SHARDS"]) - if os.environ.get("CODEXLENS_MAX_LOADED_SHARDS"): - kwargs["max_loaded_shards"] = int(os.environ["CODEXLENS_MAX_LOADED_SHARDS"]) - resolved = Path(db_path).resolve() - kwargs["metadata_db_path"] = str(resolved / "metadata.db") - return Config(**kwargs) - - -def _create_config(args: argparse.Namespace) -> "Config": - """Build Config from CLI args (delegates to create_config_from_env).""" - overrides: dict = {} - if hasattr(args, "embed_model") and args.embed_model: - overrides["embed_model"] = args.embed_model - if hasattr(args, "embed_api_url") and args.embed_api_url: - overrides["embed_api_url"] = args.embed_api_url - if hasattr(args, "embed_api_key") and args.embed_api_key: - overrides["embed_api_key"] = args.embed_api_key - if hasattr(args, "embed_api_model") and args.embed_api_model: - overrides["embed_api_model"] = args.embed_api_model - return create_config_from_env(args.db_path, **overrides) - - -def _create_embedder(config: "Config"): - """Create embedder based on config, auto-detecting embed_dim from API.""" - if config.embed_api_url: - from codexlens_search.embed.api import APIEmbedder - embedder = APIEmbedder(config) - log.info("Using API embedder: %s", config.embed_api_url) - # Auto-detect embed_dim from API if still at default - if config.embed_dim == 384: - probe_vec = embedder.embed_single("dimension probe") - detected_dim = probe_vec.shape[0] - if detected_dim != config.embed_dim: - log.info("Auto-detected embed_dim=%d from API (was %d)", detected_dim, config.embed_dim) - config.embed_dim = detected_dim - else: - from codexlens_search.embed.local import FastEmbedEmbedder - embedder = FastEmbedEmbedder(config) - return embedder - - -def _create_reranker(config: "Config"): - """Create reranker based on config.""" - if config.reranker_api_url: - from codexlens_search.rerank.api import APIReranker - reranker = APIReranker(config) - log.info("Using API reranker: %s", config.reranker_api_url) - else: - from codexlens_search.rerank.local import FastEmbedReranker - reranker = FastEmbedReranker(config) - return reranker - - -def create_pipeline( - db_path: str | Path, - config: "Config | None" = None, -) -> tuple: - """Construct pipeline components from db_path and config. - - Returns (indexing_pipeline, search_pipeline, config). - Used by both CLI bridge and MCP server. - - When config.num_shards > 1, returns a ShardManager-backed pipeline - where indexing and search are delegated to the ShardManager. - The returned tuple is (shard_manager, shard_manager, config) so that - callers can use shard_manager.sync() and shard_manager.search(). - """ - from codexlens_search.config import Config - - if config is None: - config = create_config_from_env(db_path) - resolved = Path(db_path).resolve() - resolved.mkdir(parents=True, exist_ok=True) - - embedder = _create_embedder(config) - reranker = _create_reranker(config) - - # Sharded mode: delegate to ShardManager - if config.num_shards > 1: - from codexlens_search.core.shard_manager import ShardManager - manager = ShardManager( - num_shards=config.num_shards, - db_path=resolved, - config=config, - embedder=embedder, - reranker=reranker, - ) - log.info( - "Using ShardManager with %d shards (max_loaded=%d)", - config.num_shards, config.max_loaded_shards, - ) - return manager, manager, config - - # Single-shard mode: original behavior, no ShardManager overhead - from codexlens_search.core.factory import create_ann_index, create_binary_index - from codexlens_search.indexing.metadata import MetadataStore - from codexlens_search.indexing.pipeline import IndexingPipeline - from codexlens_search.search.fts import FTSEngine - from codexlens_search.search.pipeline import SearchPipeline - - binary_store = create_binary_index(resolved, config.embed_dim, config) - ann_index = create_ann_index(resolved, config.embed_dim, config) - fts = FTSEngine(resolved / "fts.db") - metadata = MetadataStore(resolved / "metadata.db") - - indexing = IndexingPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - fts=fts, - config=config, - metadata=metadata, - ) - - search = SearchPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - reranker=reranker, - fts=fts, - config=config, - metadata_store=metadata, - ) - - return indexing, search, config - - -def _create_pipeline( - args: argparse.Namespace, -) -> tuple: - """CLI wrapper: construct pipeline from argparse args.""" - config = _create_config(args) - db_path = _resolve_db_path(args) - return create_pipeline(db_path, config) - - -# --------------------------------------------------------------------------- -# Subcommand handlers -# --------------------------------------------------------------------------- - -def cmd_init(args: argparse.Namespace) -> None: - """Initialize an empty index at --db-path.""" - from codexlens_search.indexing.metadata import MetadataStore - from codexlens_search.search.fts import FTSEngine - - db_path = _resolve_db_path(args) - - # Create empty stores - just touch the metadata and FTS databases - MetadataStore(db_path / "metadata.db") - FTSEngine(db_path / "fts.db") - - _json_output({ - "status": "initialized", - "db_path": str(db_path), - }) - - -def cmd_search(args: argparse.Namespace) -> None: - """Run search query, output JSON array of results.""" - _, search, _ = _create_pipeline(args) - - results = search.search(args.query, top_k=args.top_k) - _json_output([ - { - "path": r.path, - "score": r.score, - "line": r.line, - "end_line": r.end_line, - "snippet": r.snippet, - "content": r.content, - } - for r in results - ]) - - -def cmd_index_file(args: argparse.Namespace) -> None: - """Index a single file.""" - indexing, _, _ = _create_pipeline(args) - - file_path = Path(args.file).resolve() - if not file_path.is_file(): - _error_exit(f"File not found: {file_path}") - - root = Path(args.root).resolve() if args.root else None - - stats = indexing.index_file(file_path, root=root) - _json_output({ - "status": "indexed", - "file": str(file_path), - "files_processed": stats.files_processed, - "chunks_created": stats.chunks_created, - "duration_seconds": stats.duration_seconds, - }) - - -def cmd_remove_file(args: argparse.Namespace) -> None: - """Remove a file from the index.""" - indexing, _, _ = _create_pipeline(args) - - indexing.remove_file(args.file) - _json_output({ - "status": "removed", - "file": args.file, - }) - - -DEFAULT_EXCLUDES = frozenset({ - "node_modules", ".git", "__pycache__", "dist", "build", - ".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache", - ".next", ".nuxt", "coverage", ".eggs", "*.egg-info", ".codexlens", -}) - - -def should_exclude(path: Path, exclude_dirs: frozenset[str]) -> bool: - """Check if any path component matches an exclude pattern.""" - parts = path.parts - return any(part in exclude_dirs for part in parts) - - -def cmd_sync(args: argparse.Namespace) -> None: - """Sync index with files under --root matching --glob pattern.""" - indexing, _, _ = _create_pipeline(args) - - root = Path(args.root).resolve() - if not root.is_dir(): - _error_exit(f"Root directory not found: {root}") - - exclude_dirs = frozenset(args.exclude) if args.exclude else DEFAULT_EXCLUDES - pattern = args.glob or "**/*" - file_paths = [ - p for p in root.glob(pattern) - if p.is_file() and not should_exclude(p.relative_to(root), exclude_dirs) - ] - - log.debug("Sync: %d files after exclusion (root=%s, pattern=%s)", len(file_paths), root, pattern) - - stats = indexing.sync(file_paths, root=root) - _json_output({ - "status": "synced", - "root": str(root), - "files_processed": stats.files_processed, - "chunks_created": stats.chunks_created, - "duration_seconds": stats.duration_seconds, - }) - - -def cmd_watch(args: argparse.Namespace) -> None: - """Watch --root for changes, output JSONL events.""" - root = Path(args.root).resolve() - if not root.is_dir(): - _error_exit(f"Root directory not found: {root}") - - debounce_ms = args.debounce_ms - - try: - from watchdog.observers import Observer - from watchdog.events import FileSystemEventHandler, FileSystemEvent - except ImportError: - _error_exit( - "watchdog is required for watch mode. " - "Install with: pip install watchdog" - ) - - class _JsonEventHandler(FileSystemEventHandler): - """Emit JSONL for file events.""" - - def _emit(self, event_type: str, path: str) -> None: - _json_output({ - "event": event_type, - "path": path, - "timestamp": time.time(), - }) - - def on_created(self, event: FileSystemEvent) -> None: - if not event.is_directory: - self._emit("created", event.src_path) - - def on_modified(self, event: FileSystemEvent) -> None: - if not event.is_directory: - self._emit("modified", event.src_path) - - def on_deleted(self, event: FileSystemEvent) -> None: - if not event.is_directory: - self._emit("deleted", event.src_path) - - def on_moved(self, event: FileSystemEvent) -> None: - if not event.is_directory: - self._emit("moved", event.dest_path) - - observer = Observer() - observer.schedule(_JsonEventHandler(), str(root), recursive=True) - observer.start() - - _json_output({ - "status": "watching", - "root": str(root), - "debounce_ms": debounce_ms, - }) - - try: - while True: - time.sleep(debounce_ms / 1000.0) - except KeyboardInterrupt: - observer.stop() - observer.join() - - -def cmd_download_models(args: argparse.Namespace) -> None: - """Download embed + reranker models.""" - from codexlens_search import model_manager - - config = _create_config(args) - - model_manager.ensure_model(config.embed_model, config) - model_manager.ensure_model(config.reranker_model, config) - - _json_output({ - "status": "downloaded", - "embed_model": config.embed_model, - "reranker_model": config.reranker_model, - }) - - -def cmd_list_models(args: argparse.Namespace) -> None: - """List known embed/reranker models with cache status.""" - from codexlens_search import model_manager - - config = _create_config(args) - models = model_manager.list_known_models(config) - _json_output(models) - - -def cmd_download_model(args: argparse.Namespace) -> None: - """Download a single model by name.""" - from codexlens_search import model_manager - - config = _create_config(args) - model_name = args.model_name - - model_manager.ensure_model(model_name, config) - - cached = model_manager._model_is_cached( - model_name, model_manager._resolve_cache_dir(config) - ) - _json_output({ - "status": "downloaded" if cached else "failed", - "model": model_name, - }) - - -def cmd_delete_model(args: argparse.Namespace) -> None: - """Delete a model from cache.""" - from codexlens_search import model_manager - - config = _create_config(args) - model_name = args.model_name - - deleted = model_manager.delete_model(model_name, config) - _json_output({ - "status": "deleted" if deleted else "not_found", - "model": model_name, - }) - - -def cmd_status(args: argparse.Namespace) -> None: - """Report index statistics.""" - from codexlens_search.indexing.metadata import MetadataStore - - db_path = _resolve_db_path(args) - meta_path = db_path / "metadata.db" - - if not meta_path.exists(): - _json_output({ - "status": "not_initialized", - "db_path": str(db_path), - }) - return - - metadata = MetadataStore(meta_path) - all_files = metadata.get_all_files() - deleted_ids = metadata.get_deleted_ids() - max_chunk = metadata.max_chunk_id() - - _json_output({ - "status": "ok", - "db_path": str(db_path), - "files_tracked": len(all_files), - "max_chunk_id": max_chunk, - "total_chunks_approx": max_chunk + 1 if max_chunk >= 0 else 0, - "deleted_chunks": len(deleted_ids), - }) - - -# --------------------------------------------------------------------------- -# CLI parser -# --------------------------------------------------------------------------- - -def _build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - prog="codexlens-search", - description="Lightweight semantic code search - CLI bridge", - ) - parser.add_argument( - "--db-path", - default=os.environ.get("CODEXLENS_DB_PATH", ".codexlens"), - help="Path to index database directory (default: .codexlens or $CODEXLENS_DB_PATH)", - ) - parser.add_argument( - "--verbose", "-v", - action="store_true", - help="Enable debug logging to stderr", - ) - - # API embedding overrides (also read from CODEXLENS_EMBED_API_* env vars) - parser.add_argument( - "--embed-api-url", - default="", - help="Remote embedding API URL (OpenAI-compatible, e.g. https://api.openai.com/v1)", - ) - parser.add_argument( - "--embed-api-key", - default="", - help="API key for remote embedding", - ) - parser.add_argument( - "--embed-api-model", - default="", - help="Model name for remote embedding (e.g. text-embedding-3-small)", - ) - - sub = parser.add_subparsers(dest="command") - - # init - sub.add_parser("init", help="Initialize empty index") - - # search - p_search = sub.add_parser("search", help="Search the index") - p_search.add_argument("--query", "-q", required=True, help="Search query") - p_search.add_argument("--top-k", "-k", type=int, default=10, help="Number of results") - - # index-file - p_index = sub.add_parser("index-file", help="Index a single file") - p_index.add_argument("--file", "-f", required=True, help="File path to index") - p_index.add_argument("--root", "-r", help="Root directory for relative paths") - - # remove-file - p_remove = sub.add_parser("remove-file", help="Remove a file from index") - p_remove.add_argument("--file", "-f", required=True, help="Relative file path to remove") - - # sync - p_sync = sub.add_parser("sync", help="Sync index with directory") - p_sync.add_argument("--root", "-r", required=True, help="Root directory to sync") - p_sync.add_argument("--glob", "-g", default="**/*", help="Glob pattern (default: **/*)") - p_sync.add_argument( - "--exclude", "-e", action="append", default=None, - help="Directory names to exclude (repeatable). " - "Defaults: node_modules, .git, __pycache__, dist, build, .venv, venv, .tox, .mypy_cache", - ) - - # watch - p_watch = sub.add_parser("watch", help="Watch directory for changes (JSONL output)") - p_watch.add_argument("--root", "-r", required=True, help="Root directory to watch") - p_watch.add_argument("--debounce-ms", type=int, default=500, help="Debounce interval in ms") - - # download-models - p_dl = sub.add_parser("download-models", help="Download embed + reranker models") - p_dl.add_argument("--embed-model", help="Override embed model name") - - # list-models - sub.add_parser("list-models", help="List known models with cache status") - - # download-model (single model by name) - p_dl_single = sub.add_parser("download-model", help="Download a single model by name") - p_dl_single.add_argument("model_name", help="HuggingFace model name (e.g. BAAI/bge-small-en-v1.5)") - - # delete-model - p_del = sub.add_parser("delete-model", help="Delete a model from cache") - p_del.add_argument("model_name", help="HuggingFace model name to delete") - - # status - sub.add_parser("status", help="Report index statistics") - - return parser - - -def main() -> None: - """CLI entry point.""" - _ensure_utf8_stdio() - parser = _build_parser() - args = parser.parse_args() - - # Configure logging - if args.verbose: - logging.basicConfig( - level=logging.DEBUG, - format="%(levelname)s %(name)s: %(message)s", - stream=sys.stderr, - ) - else: - logging.basicConfig( - level=logging.WARNING, - format="%(levelname)s: %(message)s", - stream=sys.stderr, - ) - - if not args.command: - parser.print_help(sys.stderr) - sys.exit(1) - - dispatch = { - "init": cmd_init, - "search": cmd_search, - "index-file": cmd_index_file, - "remove-file": cmd_remove_file, - "sync": cmd_sync, - "watch": cmd_watch, - "download-models": cmd_download_models, - "list-models": cmd_list_models, - "download-model": cmd_download_model, - "delete-model": cmd_delete_model, - "status": cmd_status, - } - - handler = dispatch.get(args.command) - if handler is None: - _error_exit(f"Unknown command: {args.command}") - - try: - handler(args) - except KeyboardInterrupt: - sys.exit(130) - except SystemExit: - raise - except Exception as exc: - log.debug("Command failed", exc_info=True) - _error_exit(str(exc)) - - -if __name__ == "__main__": - main() diff --git a/codex-lens-v2/src/codexlens_search/config.py b/codex-lens-v2/src/codexlens_search/config.py deleted file mode 100644 index f56df7e4..00000000 --- a/codex-lens-v2/src/codexlens_search/config.py +++ /dev/null @@ -1,165 +0,0 @@ -from __future__ import annotations -import logging -from dataclasses import dataclass, field - -log = logging.getLogger(__name__) - - -@dataclass -class Config: - # Embedding - embed_model: str = "BAAI/bge-small-en-v1.5" - embed_dim: int = 384 - embed_batch_size: int = 32 - - # API embedding (optional — overrides local fastembed when set) - embed_api_url: str = "" # e.g. "https://api.openai.com/v1" - embed_api_key: str = "" - embed_api_model: str = "" # e.g. "text-embedding-3-small" - # Multi-endpoint: list of {"url": "...", "key": "...", "model": "..."} dicts - embed_api_endpoints: list[dict[str, str]] = None # type: ignore[assignment] - embed_api_concurrency: int = 4 - embed_api_max_tokens_per_batch: int = 32768 - embed_max_tokens: int = 8192 # max tokens per single text (0 = no limit) - - # Model download / cache - model_cache_dir: str = "" # empty = fastembed default cache - hf_mirror: str = "" # HuggingFace mirror URL, e.g. "https://hf-mirror.com" - - # GPU / execution providers - device: str = "auto" # 'auto', 'cuda', 'cpu' - embed_providers: list[str] | None = None # explicit ONNX providers override - - # File filtering - max_file_size_bytes: int = 1_000_000 # 1MB - exclude_extensions: frozenset[str] = None # type: ignore[assignment] # set in __post_init__ - binary_detect_sample_bytes: int = 2048 - binary_null_threshold: float = 0.10 # >10% null bytes = binary - generated_code_markers: tuple[str, ...] = ("@generated", "DO NOT EDIT", "auto-generated", "AUTO GENERATED") - - # Code-aware chunking - code_aware_chunking: bool = True - code_extensions: frozenset[str] = frozenset({ - ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".java", ".cpp", ".c", - ".h", ".hpp", ".cs", ".rs", ".rb", ".php", ".scala", ".kt", ".swift", - ".lua", ".sh", ".bash", ".zsh", ".ps1", ".vue", ".svelte", - }) - - # Backend selection: 'auto', 'faiss', 'hnswlib' - ann_backend: str = "auto" - binary_backend: str = "faiss" - - # Indexing pipeline - index_workers: int = 2 # number of parallel indexing workers - - # HNSW index (ANNIndex) - hnsw_ef: int = 150 - hnsw_M: int = 32 - hnsw_ef_construction: int = 200 - - # Binary coarse search (BinaryStore) - binary_top_k: int = 200 - - # ANN fine search - ann_top_k: int = 50 - - # Reranker - reranker_model: str = "Xenova/ms-marco-MiniLM-L-6-v2" - reranker_top_k: int = 20 - reranker_batch_size: int = 32 - - # API reranker (optional) - reranker_api_url: str = "" - reranker_api_key: str = "" - reranker_api_model: str = "" - reranker_api_max_tokens_per_batch: int = 2048 - - # Metadata store - metadata_db_path: str = "" # empty = no metadata tracking - - # Data tiering (hot/warm/cold) - tier_hot_hours: int = 24 # files accessed within this window are 'hot' - tier_cold_hours: int = 168 # files not accessed for this long are 'cold' - - # Search quality tier: 'fast', 'balanced', 'thorough', 'auto' - default_search_quality: str = "auto" - - # Shard partitioning - num_shards: int = 1 # 1 = single partition (no sharding), >1 = hash-based sharding - max_loaded_shards: int = 4 # LRU limit for loaded shards in ShardManager - - # FTS - fts_top_k: int = 50 - - # Fusion - fusion_k: int = 60 # RRF k parameter - fusion_weights: dict = field(default_factory=lambda: { - "exact": 0.25, - "fuzzy": 0.10, - "vector": 0.50, - "graph": 0.15, - }) - - _DEFAULT_EXCLUDE_EXTENSIONS: frozenset[str] = frozenset({ - # binaries / images - ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".bmp", ".svg", - ".zip", ".gz", ".tar", ".rar", ".7z", ".bz2", - ".bin", ".exe", ".dll", ".so", ".dylib", ".a", ".o", ".obj", - ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", - # build / generated - ".min.js", ".min.css", ".map", ".lock", - ".pyc", ".pyo", ".class", ".wasm", - # data - ".sqlite", ".db", ".npy", ".npz", ".pkl", ".pickle", - ".parquet", ".arrow", ".feather", - # media - ".mp3", ".mp4", ".wav", ".avi", ".mov", ".flv", - ".ttf", ".otf", ".woff", ".woff2", ".eot", - }) - - def __post_init__(self) -> None: - if self.exclude_extensions is None: - object.__setattr__(self, "exclude_extensions", self._DEFAULT_EXCLUDE_EXTENSIONS) - if self.embed_api_endpoints is None: - object.__setattr__(self, "embed_api_endpoints", []) - - def resolve_embed_providers(self) -> list[str]: - """Return ONNX execution providers based on device config. - - Priority: explicit embed_providers > device setting > auto-detect. - """ - if self.embed_providers is not None: - return list(self.embed_providers) - - if self.device == "cuda": - return ["CUDAExecutionProvider", "CPUExecutionProvider"] - - if self.device == "cpu": - return ["CPUExecutionProvider"] - - # auto-detect - try: - import onnxruntime - available = onnxruntime.get_available_providers() - if "CUDAExecutionProvider" in available: - log.info("CUDA detected via onnxruntime, using GPU for embedding") - return ["CUDAExecutionProvider", "CPUExecutionProvider"] - except ImportError: - pass - - return ["CPUExecutionProvider"] - - @classmethod - def defaults(cls) -> "Config": - return cls() - - @classmethod - def small(cls) -> "Config": - """Smaller config for testing or small corpora.""" - return cls( - hnsw_ef=50, - hnsw_M=16, - binary_top_k=50, - ann_top_k=20, - reranker_top_k=10, - ) diff --git a/codex-lens-v2/src/codexlens_search/core/__init__.py b/codex-lens-v2/src/codexlens_search/core/__init__.py deleted file mode 100644 index fde43df6..00000000 --- a/codex-lens-v2/src/codexlens_search/core/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .base import BaseANNIndex, BaseBinaryIndex -from .binary import BinaryStore -from .factory import create_ann_index, create_binary_index -from .index import ANNIndex - -__all__ = [ - "BaseANNIndex", - "BaseBinaryIndex", - "ANNIndex", - "BinaryStore", - "create_ann_index", - "create_binary_index", -] diff --git a/codex-lens-v2/src/codexlens_search/core/base.py b/codex-lens-v2/src/codexlens_search/core/base.py deleted file mode 100644 index 20820347..00000000 --- a/codex-lens-v2/src/codexlens_search/core/base.py +++ /dev/null @@ -1,83 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod - -import numpy as np - - -class BaseANNIndex(ABC): - """Abstract base class for approximate nearest neighbor indexes.""" - - @abstractmethod - def add(self, ids: np.ndarray, vectors: np.ndarray) -> None: - """Add float32 vectors with corresponding IDs. - - Args: - ids: shape (N,) int64 - vectors: shape (N, dim) float32 - """ - - @abstractmethod - def fine_search( - self, query_vec: np.ndarray, top_k: int | None = None - ) -> tuple[np.ndarray, np.ndarray]: - """Search for nearest neighbors. - - Args: - query_vec: float32 vector of shape (dim,) - top_k: number of results - - Returns: - (ids, distances) as numpy arrays - """ - - @abstractmethod - def save(self) -> None: - """Persist index to disk.""" - - @abstractmethod - def load(self) -> None: - """Load index from disk.""" - - @abstractmethod - def __len__(self) -> int: - """Return the number of indexed items.""" - - -class BaseBinaryIndex(ABC): - """Abstract base class for binary vector indexes (Hamming distance).""" - - @abstractmethod - def add(self, ids: np.ndarray, vectors: np.ndarray) -> None: - """Add float32 vectors (will be binary-quantized internally). - - Args: - ids: shape (N,) int64 - vectors: shape (N, dim) float32 - """ - - @abstractmethod - def coarse_search( - self, query_vec: np.ndarray, top_k: int | None = None - ) -> tuple[np.ndarray, np.ndarray]: - """Search by Hamming distance. - - Args: - query_vec: float32 vector of shape (dim,) - top_k: number of results - - Returns: - (ids, distances) sorted ascending by distance - """ - - @abstractmethod - def save(self) -> None: - """Persist store to disk.""" - - @abstractmethod - def load(self) -> None: - """Load store from disk.""" - - @abstractmethod - def __len__(self) -> int: - """Return the number of stored items.""" diff --git a/codex-lens-v2/src/codexlens_search/core/binary.py b/codex-lens-v2/src/codexlens_search/core/binary.py deleted file mode 100644 index fe94997f..00000000 --- a/codex-lens-v2/src/codexlens_search/core/binary.py +++ /dev/null @@ -1,180 +0,0 @@ -from __future__ import annotations - -import logging -import math -from pathlib import Path - -import numpy as np - -from codexlens_search.config import Config -from codexlens_search.core.base import BaseBinaryIndex - -logger = logging.getLogger(__name__) - - -class BinaryStore(BaseBinaryIndex): - """Persistent binary vector store using numpy memmap. - - .. deprecated:: - Prefer ``FAISSBinaryIndex`` for binary coarse search. This class is - retained as a numpy-only fallback for environments where FAISS is not - available. New code should use ``create_binary_index()`` from - ``codexlens_search.core.factory`` which selects the best backend - automatically. - - Stores binary-quantized float32 vectors as packed uint8 arrays on disk. - Supports fast coarse search via XOR + popcount Hamming distance. - """ - - def __init__(self, path: str | Path, dim: int, config: Config) -> None: - self._dir = Path(path) - self._dim = dim - self._config = config - self._packed_bytes = math.ceil(dim / 8) - - self._bin_path = self._dir / "binary_store.bin" - self._ids_path = self._dir / "binary_store_ids.npy" - - self._matrix: np.ndarray | None = None # shape (N, packed_bytes), uint8 - self._ids: np.ndarray | None = None # shape (N,), int64 - self._count: int = 0 - - if self._bin_path.exists() and self._ids_path.exists(): - self.load() - - # ------------------------------------------------------------------ - # Internal helpers - # ------------------------------------------------------------------ - - def _quantize(self, vectors: np.ndarray) -> np.ndarray: - """Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes).""" - binary = (vectors > 0).astype(np.uint8) - packed = np.packbits(binary, axis=1) - return packed - - def _quantize_single(self, vec: np.ndarray) -> np.ndarray: - """Convert a single float32 vector (dim,) to packed uint8 (packed_bytes,).""" - binary = (vec > 0).astype(np.uint8) - return np.packbits(binary) - - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - def _ensure_capacity(self, needed: int) -> None: - """Grow pre-allocated matrix/ids arrays to fit *needed* total items.""" - if self._matrix is not None and self._matrix.shape[0] >= needed: - return - - new_cap = max(1024, needed) - # Double until large enough - if self._matrix is not None: - cur_cap = self._matrix.shape[0] - new_cap = max(cur_cap, 1024) - while new_cap < needed: - new_cap *= 2 - - new_matrix = np.zeros((new_cap, self._packed_bytes), dtype=np.uint8) - new_ids = np.zeros(new_cap, dtype=np.int64) - - if self._matrix is not None and self._count > 0: - new_matrix[: self._count] = self._matrix[: self._count] - new_ids[: self._count] = self._ids[: self._count] - - self._matrix = new_matrix - self._ids = new_ids - - def add(self, ids: np.ndarray, vectors: np.ndarray) -> None: - """Add float32 vectors and their ids. - - Does NOT call save() internally -- callers must call save() - explicitly after batch indexing. - - Args: - ids: shape (N,) int64 - vectors: shape (N, dim) float32 - """ - if len(ids) == 0: - return - - packed = self._quantize(vectors) # (N, packed_bytes) - n = len(ids) - - self._ensure_capacity(self._count + n) - self._matrix[self._count : self._count + n] = packed - self._ids[self._count : self._count + n] = ids.astype(np.int64) - self._count += n - - def coarse_search( - self, query_vec: np.ndarray, top_k: int | None = None - ) -> tuple[np.ndarray, np.ndarray]: - """Search by Hamming distance. - - Args: - query_vec: float32 vector of shape (dim,) - top_k: number of results; defaults to config.binary_top_k - - Returns: - (ids, distances) sorted ascending by Hamming distance - """ - if self._matrix is None or self._count == 0: - return np.array([], dtype=np.int64), np.array([], dtype=np.int32) - - k = top_k if top_k is not None else self._config.binary_top_k - k = min(k, self._count) - - query_bin = self._quantize_single(query_vec) # (packed_bytes,) - - # Slice to active region (matrix may be pre-allocated larger) - active_matrix = self._matrix[: self._count] - active_ids = self._ids[: self._count] - - # XOR then popcount via unpackbits - xor = np.bitwise_xor(active_matrix, query_bin[np.newaxis, :]) # (N, packed_bytes) - dists = np.unpackbits(xor, axis=1).sum(axis=1).astype(np.int32) # (N,) - - if k >= self._count: - order = np.argsort(dists) - else: - part = np.argpartition(dists, k)[:k] - order = part[np.argsort(dists[part])] - - return active_ids[order], dists[order] - - def save(self) -> None: - """Flush binary store to disk.""" - if self._matrix is None or self._count == 0: - return - self._dir.mkdir(parents=True, exist_ok=True) - # Write only the occupied portion of the pre-allocated matrix - active_matrix = self._matrix[: self._count] - mm = np.memmap( - str(self._bin_path), - dtype=np.uint8, - mode="w+", - shape=active_matrix.shape, - ) - mm[:] = active_matrix - mm.flush() - del mm - np.save(str(self._ids_path), self._ids[: self._count]) - - def load(self) -> None: - """Reload binary store from disk.""" - ids = np.load(str(self._ids_path)) - n = len(ids) - if n == 0: - return - mm = np.memmap( - str(self._bin_path), - dtype=np.uint8, - mode="r", - shape=(n, self._packed_bytes), - ) - self._matrix = np.array(mm) # copy into RAM for mutation support - del mm - self._ids = ids.astype(np.int64) - self._count = n - - def __len__(self) -> int: - return self._count diff --git a/codex-lens-v2/src/codexlens_search/core/factory.py b/codex-lens-v2/src/codexlens_search/core/factory.py deleted file mode 100644 index 6ad952bb..00000000 --- a/codex-lens-v2/src/codexlens_search/core/factory.py +++ /dev/null @@ -1,141 +0,0 @@ -from __future__ import annotations - -import logging -import warnings -from pathlib import Path - -from codexlens_search.config import Config -from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex - -logger = logging.getLogger(__name__) - -try: - import faiss as _faiss # noqa: F401 - _FAISS_AVAILABLE = True -except ImportError: - _FAISS_AVAILABLE = False - -try: - import hnswlib as _hnswlib # noqa: F401 - _HNSWLIB_AVAILABLE = True -except ImportError: - _HNSWLIB_AVAILABLE = False - - -def _has_faiss_gpu() -> bool: - """Check whether faiss-gpu is available (has GPU resources).""" - if not _FAISS_AVAILABLE: - return False - try: - import faiss - res = faiss.StandardGpuResources() # noqa: F841 - return True - except (AttributeError, RuntimeError): - return False - - -def create_ann_index(path: str | Path, dim: int, config: Config) -> BaseANNIndex: - """Create an ANN index based on config.ann_backend. - - Fallback chain for 'auto': faiss-gpu -> faiss-cpu -> hnswlib. - - Args: - path: directory for index persistence - dim: vector dimensionality - config: project configuration - - Returns: - A BaseANNIndex implementation - - Raises: - ImportError: if no suitable backend is available - """ - backend = config.ann_backend - - if backend == "faiss": - from codexlens_search.core.faiss_index import FAISSANNIndex - return FAISSANNIndex(path, dim, config) - - if backend == "hnswlib": - from codexlens_search.core.index import ANNIndex - return ANNIndex(path, dim, config) - - # auto: try faiss first, then hnswlib - if _FAISS_AVAILABLE: - from codexlens_search.core.faiss_index import FAISSANNIndex - gpu_tag = " (GPU available)" if _has_faiss_gpu() else " (CPU)" - logger.info("Auto-selected FAISS ANN backend%s", gpu_tag) - return FAISSANNIndex(path, dim, config) - - if _HNSWLIB_AVAILABLE: - from codexlens_search.core.index import ANNIndex - logger.info("Auto-selected hnswlib ANN backend") - return ANNIndex(path, dim, config) - - raise ImportError( - "No ANN backend available. Install faiss-cpu, faiss-gpu, or hnswlib." - ) - - -def create_binary_index( - path: str | Path, dim: int, config: Config -) -> BaseBinaryIndex: - """Create a binary index based on config.binary_backend. - - Fallback chain for 'auto': faiss -> numpy BinaryStore. - - Args: - path: directory for index persistence - dim: vector dimensionality - config: project configuration - - Returns: - A BaseBinaryIndex implementation - - Raises: - ImportError: if no suitable backend is available - """ - backend = config.binary_backend - - if backend == "faiss": - if _FAISS_AVAILABLE: - from codexlens_search.core.faiss_index import FAISSBinaryIndex - return FAISSBinaryIndex(path, dim, config) - # FAISS explicitly requested but not installed: fall back with warning - from codexlens_search.core.binary import BinaryStore - warnings.warn( - "binary_backend='faiss' but FAISS is not installed. " - "Falling back to deprecated numpy BinaryStore. " - "Install faiss-cpu or faiss-gpu for the recommended binary backend.", - DeprecationWarning, - stacklevel=2, - ) - logger.warning( - "binary_backend='faiss' but FAISS not available, " - "falling back to deprecated numpy BinaryStore." - ) - return BinaryStore(path, dim, config) - - if backend == "hnswlib": - from codexlens_search.core.binary import BinaryStore - return BinaryStore(path, dim, config) - - # auto: try faiss first, then numpy-based BinaryStore (deprecated fallback) - if _FAISS_AVAILABLE: - from codexlens_search.core.faiss_index import FAISSBinaryIndex - logger.info("Auto-selected FAISS binary backend") - return FAISSBinaryIndex(path, dim, config) - - # numpy BinaryStore is always available (no extra deps) - from codexlens_search.core.binary import BinaryStore - warnings.warn( - "Falling back to numpy BinaryStore because FAISS is not installed. " - "BinaryStore is deprecated; install faiss-cpu or faiss-gpu for better performance.", - DeprecationWarning, - stacklevel=2, - ) - logger.warning( - "FAISS not available, falling back to deprecated numpy BinaryStore. " - "Install faiss-cpu or faiss-gpu for the recommended binary backend." - ) - return BinaryStore(path, dim, config) diff --git a/codex-lens-v2/src/codexlens_search/core/faiss_index.py b/codex-lens-v2/src/codexlens_search/core/faiss_index.py deleted file mode 100644 index 6298de51..00000000 --- a/codex-lens-v2/src/codexlens_search/core/faiss_index.py +++ /dev/null @@ -1,301 +0,0 @@ -from __future__ import annotations - -import logging -import math -import threading -from pathlib import Path - -import numpy as np - -from codexlens_search.config import Config -from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex - -logger = logging.getLogger(__name__) - -try: - import faiss - _FAISS_AVAILABLE = True -except ImportError: - faiss = None # type: ignore[assignment] - _FAISS_AVAILABLE = False - - -def _try_gpu_index(index: "faiss.Index") -> "faiss.Index": - """Transfer a FAISS index to GPU if faiss-gpu is available. - - Returns the GPU index on success, or the original CPU index on failure. - """ - try: - res = faiss.StandardGpuResources() - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) - logger.info("FAISS index transferred to GPU 0") - return gpu_index - except (AttributeError, RuntimeError) as exc: - logger.debug("GPU transfer unavailable, staying on CPU: %s", exc) - return index - - -def _to_cpu_for_save(index: "faiss.Index") -> "faiss.Index": - """Convert a GPU index back to CPU for serialization.""" - try: - return faiss.index_gpu_to_cpu(index) - except (AttributeError, RuntimeError): - return index - - -class FAISSANNIndex(BaseANNIndex): - """FAISS-based ANN index using IndexHNSWFlat with optional GPU. - - Uses Inner Product space with L2-normalized vectors for cosine similarity. - Thread-safe via RLock. - """ - - def __init__(self, path: str | Path, dim: int, config: Config) -> None: - if not _FAISS_AVAILABLE: - raise ImportError( - "faiss is required. Install with: pip install faiss-cpu " - "or pip install faiss-gpu" - ) - - self._path = Path(path) - self._index_path = self._path / "faiss_ann.index" - self._dim = dim - self._config = config - self._lock = threading.RLock() - self._index: faiss.Index | None = None - - def _ensure_loaded(self) -> None: - """Load or initialize the index (caller holds lock).""" - if self._index is not None: - return - self.load() - - def load(self) -> None: - """Load index from disk or initialize a fresh one. - - Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available, - falling back to regular read_index() on older faiss versions. - """ - with self._lock: - if self._index_path.exists(): - try: - idx = faiss.read_index( - str(self._index_path), faiss.IO_FLAG_MMAP - ) - except (AttributeError, RuntimeError, Exception) as exc: - logger.debug( - "MMAP load failed, falling back to regular read: %s", - exc, - ) - idx = faiss.read_index(str(self._index_path)) - logger.debug( - "Loaded FAISS ANN index from %s (%d items)", - self._index_path, idx.ntotal, - ) - else: - # HNSW with flat storage, M=32 by default - m = self._config.hnsw_M - idx = faiss.IndexHNSWFlat(self._dim, m, faiss.METRIC_INNER_PRODUCT) - idx.hnsw.efConstruction = self._config.hnsw_ef_construction - idx.hnsw.efSearch = self._config.hnsw_ef - logger.debug( - "Initialized fresh FAISS HNSW index (dim=%d, M=%d)", - self._dim, m, - ) - self._index = _try_gpu_index(idx) - - def add(self, ids: np.ndarray, vectors: np.ndarray) -> None: - """Add L2-normalized float32 vectors. - - Vectors are normalized before insertion so that Inner Product - distance equals cosine similarity. - - Args: - ids: shape (N,) int64 -- currently unused by FAISS flat index - but kept for API compatibility. FAISS uses sequential IDs. - vectors: shape (N, dim) float32 - """ - if len(ids) == 0: - return - - vecs = np.ascontiguousarray(vectors, dtype=np.float32) - # Normalize for cosine similarity via Inner Product - faiss.normalize_L2(vecs) - - with self._lock: - self._ensure_loaded() - self._index.add(vecs) - - def fine_search( - self, query_vec: np.ndarray, top_k: int | None = None - ) -> tuple[np.ndarray, np.ndarray]: - """Search for nearest neighbors. - - Args: - query_vec: float32 vector of shape (dim,) - top_k: number of results; defaults to config.ann_top_k - - Returns: - (ids, distances) as numpy arrays. For IP space, higher = more - similar, but distances are returned as-is for consumer handling. - """ - k = top_k if top_k is not None else self._config.ann_top_k - - with self._lock: - self._ensure_loaded() - - count = self._index.ntotal - if count == 0: - return np.array([], dtype=np.int64), np.array([], dtype=np.float32) - - k = min(k, count) - # Set efSearch for HNSW accuracy - try: - self._index.hnsw.efSearch = max(self._config.hnsw_ef, k) - except AttributeError: - pass # GPU index may not expose hnsw attribute directly - - q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1) - faiss.normalize_L2(q) - distances, labels = self._index.search(q, k) - return labels[0].astype(np.int64), distances[0].astype(np.float32) - - def save(self) -> None: - """Save index to disk.""" - with self._lock: - if self._index is None: - return - self._path.mkdir(parents=True, exist_ok=True) - cpu_index = _to_cpu_for_save(self._index) - faiss.write_index(cpu_index, str(self._index_path)) - - def __len__(self) -> int: - with self._lock: - if self._index is None: - return 0 - return self._index.ntotal - - -class FAISSBinaryIndex(BaseBinaryIndex): - """FAISS-based binary index using IndexBinaryFlat for Hamming distance. - - Vectors are binary-quantized (sign bit) before insertion. - Thread-safe via RLock. - """ - - def __init__(self, path: str | Path, dim: int, config: Config) -> None: - if not _FAISS_AVAILABLE: - raise ImportError( - "faiss is required. Install with: pip install faiss-cpu " - "or pip install faiss-gpu" - ) - - self._path = Path(path) - self._index_path = self._path / "faiss_binary.index" - self._dim = dim - self._config = config - self._packed_bytes = math.ceil(dim / 8) - self._lock = threading.RLock() - self._index: faiss.IndexBinary | None = None - - def _ensure_loaded(self) -> None: - if self._index is not None: - return - self.load() - - def _quantize(self, vectors: np.ndarray) -> np.ndarray: - """Convert float32 vectors (N, dim) to packed uint8 (N, packed_bytes).""" - binary = (vectors > 0).astype(np.uint8) - return np.packbits(binary, axis=1) - - def _quantize_single(self, vec: np.ndarray) -> np.ndarray: - """Convert a single float32 vector (dim,) to packed uint8 (1, packed_bytes).""" - binary = (vec > 0).astype(np.uint8) - return np.packbits(binary).reshape(1, -1) - - def load(self) -> None: - """Load binary index from disk or initialize a fresh one. - - Uses IO_FLAG_MMAP for zero-copy memory-mapped loading when available, - falling back to regular read_index_binary() on older faiss versions. - """ - with self._lock: - if self._index_path.exists(): - try: - idx = faiss.read_index_binary( - str(self._index_path), faiss.IO_FLAG_MMAP - ) - except (AttributeError, RuntimeError, Exception) as exc: - logger.debug( - "MMAP load failed, falling back to regular read: %s", - exc, - ) - idx = faiss.read_index_binary(str(self._index_path)) - logger.debug( - "Loaded FAISS binary index from %s (%d items)", - self._index_path, idx.ntotal, - ) - else: - # IndexBinaryFlat takes dimension in bits - idx = faiss.IndexBinaryFlat(self._dim) - logger.debug( - "Initialized fresh FAISS binary index (dim_bits=%d)", self._dim, - ) - self._index = idx - - def add(self, ids: np.ndarray, vectors: np.ndarray) -> None: - """Add float32 vectors (binary-quantized internally). - - Args: - ids: shape (N,) int64 -- kept for API compatibility - vectors: shape (N, dim) float32 - """ - if len(ids) == 0: - return - - packed = self._quantize(vectors) - packed = np.ascontiguousarray(packed, dtype=np.uint8) - - with self._lock: - self._ensure_loaded() - self._index.add(packed) - - def coarse_search( - self, query_vec: np.ndarray, top_k: int | None = None - ) -> tuple[np.ndarray, np.ndarray]: - """Search by Hamming distance. - - Args: - query_vec: float32 vector of shape (dim,) - top_k: number of results; defaults to config.binary_top_k - - Returns: - (ids, distances) sorted ascending by Hamming distance - """ - with self._lock: - self._ensure_loaded() - - if self._index.ntotal == 0: - return np.array([], dtype=np.int64), np.array([], dtype=np.int32) - - k = top_k if top_k is not None else self._config.binary_top_k - k = min(k, self._index.ntotal) - - q = self._quantize_single(query_vec) - q = np.ascontiguousarray(q, dtype=np.uint8) - distances, labels = self._index.search(q, k) - return labels[0].astype(np.int64), distances[0].astype(np.int32) - - def save(self) -> None: - """Save binary index to disk.""" - with self._lock: - if self._index is None: - return - self._path.mkdir(parents=True, exist_ok=True) - faiss.write_index_binary(self._index, str(self._index_path)) - - def __len__(self) -> int: - with self._lock: - if self._index is None: - return 0 - return self._index.ntotal diff --git a/codex-lens-v2/src/codexlens_search/core/index.py b/codex-lens-v2/src/codexlens_search/core/index.py deleted file mode 100644 index 1a7e4e93..00000000 --- a/codex-lens-v2/src/codexlens_search/core/index.py +++ /dev/null @@ -1,136 +0,0 @@ -from __future__ import annotations - -import logging -import threading -from pathlib import Path - -import numpy as np - -from codexlens_search.config import Config -from codexlens_search.core.base import BaseANNIndex - -logger = logging.getLogger(__name__) - -try: - import hnswlib - _HNSWLIB_AVAILABLE = True -except ImportError: - _HNSWLIB_AVAILABLE = False - - -class ANNIndex(BaseANNIndex): - """HNSW-based approximate nearest neighbor index. - - Lazy-loads on first use, thread-safe via RLock. - """ - - def __init__(self, path: str | Path, dim: int, config: Config) -> None: - if not _HNSWLIB_AVAILABLE: - raise ImportError("hnswlib is required. Install with: pip install hnswlib") - - self._path = Path(path) - self._hnsw_path = self._path / "ann_index.hnsw" - self._dim = dim - self._config = config - self._lock = threading.RLock() - self._index: hnswlib.Index | None = None - - # ------------------------------------------------------------------ - # Internal helpers - # ------------------------------------------------------------------ - - def _ensure_loaded(self) -> None: - """Load or initialize the index (caller holds lock).""" - if self._index is not None: - return - self.load() - - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - - def load(self) -> None: - """Load index from disk or initialize a fresh one.""" - with self._lock: - idx = hnswlib.Index(space="cosine", dim=self._dim) - if self._hnsw_path.exists(): - idx.load_index(str(self._hnsw_path), max_elements=0) - idx.set_ef(self._config.hnsw_ef) - logger.debug("Loaded HNSW index from %s (%d items)", self._hnsw_path, idx.get_current_count()) - else: - idx.init_index( - max_elements=1000, - ef_construction=self._config.hnsw_ef_construction, - M=self._config.hnsw_M, - ) - idx.set_ef(self._config.hnsw_ef) - logger.debug("Initialized fresh HNSW index (dim=%d)", self._dim) - self._index = idx - - def add(self, ids: np.ndarray, vectors: np.ndarray) -> None: - """Add float32 vectors. - - Does NOT call save() internally -- callers must call save() - explicitly after batch indexing. - - Args: - ids: shape (N,) int64 - vectors: shape (N, dim) float32 - """ - if len(ids) == 0: - return - - vecs = np.ascontiguousarray(vectors, dtype=np.float32) - - with self._lock: - self._ensure_loaded() - # Expand capacity if needed - current = self._index.get_current_count() - max_el = self._index.get_max_elements() - needed = current + len(ids) - if needed > max_el: - new_cap = max(max_el * 2, needed + 100) - self._index.resize_index(new_cap) - self._index.add_items(vecs, ids.astype(np.int64)) - - def fine_search( - self, query_vec: np.ndarray, top_k: int | None = None - ) -> tuple[np.ndarray, np.ndarray]: - """Search for nearest neighbors. - - Args: - query_vec: float32 vector of shape (dim,) - top_k: number of results; defaults to config.ann_top_k - - Returns: - (ids, distances) as numpy arrays - """ - k = top_k if top_k is not None else self._config.ann_top_k - - with self._lock: - self._ensure_loaded() - - count = self._index.get_current_count() - if count == 0: - return np.array([], dtype=np.int64), np.array([], dtype=np.float32) - - k = min(k, count) - self._index.set_ef(max(self._config.hnsw_ef, k)) - - q = np.ascontiguousarray(query_vec, dtype=np.float32).reshape(1, -1) - labels, distances = self._index.knn_query(q, k=k) - return labels[0].astype(np.int64), distances[0].astype(np.float32) - - def save(self) -> None: - """Save index to disk (caller may or may not hold lock).""" - with self._lock: - if self._index is None: - return - self._path.mkdir(parents=True, exist_ok=True) - self._index.save_index(str(self._hnsw_path)) - - def __len__(self) -> int: - with self._lock: - if self._index is None: - return 0 - return self._index.get_current_count() diff --git a/codex-lens-v2/src/codexlens_search/core/shard.py b/codex-lens-v2/src/codexlens_search/core/shard.py deleted file mode 100644 index 2f3527df..00000000 --- a/codex-lens-v2/src/codexlens_search/core/shard.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Single index partition (shard) that owns FTS, binary, ANN, and metadata stores.""" -from __future__ import annotations - -import logging -from pathlib import Path - -from codexlens_search.config import Config -from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex -from codexlens_search.embed.base import BaseEmbedder -from codexlens_search.indexing.metadata import MetadataStore -from codexlens_search.indexing.pipeline import IndexingPipeline, IndexStats -from codexlens_search.rerank import BaseReranker -from codexlens_search.search.fts import FTSEngine -from codexlens_search.search.pipeline import SearchPipeline, SearchResult - -logger = logging.getLogger(__name__) - - -class Shard: - """A complete index partition with its own FTS, binary, ANN, and metadata stores. - - Components are lazy-loaded on first access and can be explicitly unloaded - to release memory. The embedder and reranker are shared across shards - (passed in from ShardManager) since they are expensive to instantiate. - """ - - def __init__( - self, - shard_id: int, - db_path: str | Path, - config: Config, - ) -> None: - self._shard_id = shard_id - self._shard_dir = Path(db_path).resolve() / f"shard_{shard_id}" - self._config = config - - # Lazy-loaded components (created on _ensure_loaded) - self._fts: FTSEngine | None = None - self._binary_store: BaseBinaryIndex | None = None - self._ann_index: BaseANNIndex | None = None - self._metadata: MetadataStore | None = None - self._indexing: IndexingPipeline | None = None - self._search: SearchPipeline | None = None - self._loaded = False - - @property - def shard_id(self) -> int: - return self._shard_id - - @property - def is_loaded(self) -> bool: - return self._loaded - - def _ensure_loaded( - self, - embedder: BaseEmbedder, - reranker: BaseReranker, - ) -> None: - """Lazy-create all per-shard components if not yet loaded.""" - if self._loaded: - return - - from codexlens_search.core.factory import create_ann_index, create_binary_index - - self._shard_dir.mkdir(parents=True, exist_ok=True) - - self._fts = FTSEngine(self._shard_dir / "fts.db") - self._binary_store = create_binary_index( - self._shard_dir, self._config.embed_dim, self._config - ) - self._ann_index = create_ann_index( - self._shard_dir, self._config.embed_dim, self._config - ) - self._metadata = MetadataStore(self._shard_dir / "metadata.db") - - self._indexing = IndexingPipeline( - embedder=embedder, - binary_store=self._binary_store, - ann_index=self._ann_index, - fts=self._fts, - config=self._config, - metadata=self._metadata, - ) - - self._search = SearchPipeline( - embedder=embedder, - binary_store=self._binary_store, - ann_index=self._ann_index, - reranker=reranker, - fts=self._fts, - config=self._config, - metadata_store=self._metadata, - ) - - self._loaded = True - logger.debug("Shard %d loaded from %s", self._shard_id, self._shard_dir) - - def unload(self) -> None: - """Release memory by closing connections and dropping references.""" - if not self._loaded: - return - - if self._metadata is not None: - self._metadata.close() - - self._fts = None - self._binary_store = None - self._ann_index = None - self._metadata = None - self._indexing = None - self._search = None - self._loaded = False - logger.debug("Shard %d unloaded", self._shard_id) - - def load( - self, - embedder: BaseEmbedder, - reranker: BaseReranker, - ) -> None: - """Explicitly load shard components.""" - self._ensure_loaded(embedder, reranker) - - def save(self) -> None: - """Persist binary and ANN indexes to disk.""" - if not self._loaded: - return - if self._binary_store is not None: - self._binary_store.save() - if self._ann_index is not None: - self._ann_index.save() - - def search( - self, - query: str, - embedder: BaseEmbedder, - reranker: BaseReranker, - quality: str | None = None, - top_k: int | None = None, - ) -> list[SearchResult]: - """Search this shard's index. - - Args: - query: Search query string. - embedder: Shared embedder instance. - reranker: Shared reranker instance. - quality: Search quality tier. - top_k: Maximum results to return. - - Returns: - List of SearchResult from this shard. - """ - self._ensure_loaded(embedder, reranker) - assert self._search is not None - return self._search.search(query, top_k=top_k, quality=quality) - - def sync( - self, - files: list[Path], - root: Path | None, - embedder: BaseEmbedder, - reranker: BaseReranker, - **kwargs: object, - ) -> IndexStats: - """Sync this shard's index with the given files. - - Args: - files: Files that belong to this shard. - root: Root directory for relative paths. - embedder: Shared embedder instance. - reranker: Shared reranker instance. - **kwargs: Forwarded to IndexingPipeline.sync(). - - Returns: - IndexStats for this shard's sync operation. - """ - self._ensure_loaded(embedder, reranker) - assert self._indexing is not None - return self._indexing.sync(files, root=root, **kwargs) diff --git a/codex-lens-v2/src/codexlens_search/core/shard_manager.py b/codex-lens-v2/src/codexlens_search/core/shard_manager.py deleted file mode 100644 index 639aa3f2..00000000 --- a/codex-lens-v2/src/codexlens_search/core/shard_manager.py +++ /dev/null @@ -1,250 +0,0 @@ -"""ShardManager: manages multiple Shard instances with LRU eviction.""" -from __future__ import annotations - -import logging -import threading -from collections import OrderedDict -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path - -from codexlens_search.config import Config -from codexlens_search.core.shard import Shard -from codexlens_search.embed.base import BaseEmbedder -from codexlens_search.indexing.pipeline import IndexStats -from codexlens_search.rerank import BaseReranker -from codexlens_search.search.fusion import reciprocal_rank_fusion -from codexlens_search.search.pipeline import SearchResult - -logger = logging.getLogger(__name__) - - -class ShardManager: - """Manages multiple Shard instances with hash-based file routing and LRU eviction. - - Files are deterministically routed to shards via hash(path) % num_shards. - Search queries all shards in parallel and merges results via RRF fusion. - At most max_loaded_shards are kept in memory; least-recently-used shards - are unloaded when the limit is exceeded. - """ - - def __init__( - self, - num_shards: int, - db_path: str | Path, - config: Config, - embedder: BaseEmbedder, - reranker: BaseReranker, - ) -> None: - if num_shards < 1: - raise ValueError("num_shards must be >= 1") - - self._num_shards = num_shards - self._db_path = Path(db_path).resolve() - self._config = config - self._embedder = embedder - self._reranker = reranker - self._max_loaded = config.max_loaded_shards - - # Create all Shard objects (lazy-loaded, no I/O yet) - self._shards: dict[int, Shard] = { - i: Shard(i, self._db_path, config) - for i in range(num_shards) - } - - # LRU tracking: keys are shard_ids, most-recently-used at end - self._loaded_order: OrderedDict[int, None] = OrderedDict() - self._lru_lock = threading.Lock() - - @property - def num_shards(self) -> int: - return self._num_shards - - def route_file(self, path: str) -> int: - """Deterministically route a file path to a shard ID. - - Uses hash(path) % num_shards for uniform distribution. - """ - return hash(path) % self._num_shards - - def get_shard(self, shard_id: int) -> Shard: - """Return the Shard instance for a given shard_id.""" - if shard_id not in self._shards: - raise ValueError( - f"Invalid shard_id {shard_id}, valid range: 0-{self._num_shards - 1}" - ) - return self._shards[shard_id] - - def _ensure_loaded(self, shard_id: int) -> Shard: - """Load a shard if needed, applying LRU eviction policy. - - Thread-safe: protects OrderedDict mutations with a lock. - Returns the loaded Shard. - """ - shard = self._shards[shard_id] - - with self._lru_lock: - # Mark as most-recently-used - if shard_id in self._loaded_order: - self._loaded_order.move_to_end(shard_id) - else: - self._loaded_order[shard_id] = None - - # Load if not already loaded - if not shard.is_loaded: - shard.load(self._embedder, self._reranker) - - # Evict LRU shards if over limit - while len(self._loaded_order) > self._max_loaded: - evict_id, _ = self._loaded_order.popitem(last=False) - evict_shard = self._shards[evict_id] - if evict_shard.is_loaded: - logger.info("LRU evicting shard %d", evict_id) - evict_shard.unload() - - return shard - - def sync( - self, - files: list[Path], - root: Path | None = None, - **kwargs: object, - ) -> IndexStats: - """Sync index with files, routing each file to its shard. - - Groups files by shard via route_file(), then syncs each shard - with its subset of files. - - Args: - files: Current list of files to index. - root: Root directory for relative paths. - **kwargs: Forwarded to Shard.sync(). - - Returns: - Aggregated IndexStats across all shards. - """ - # Group files by shard - shard_files: dict[int, list[Path]] = {i: [] for i in range(self._num_shards)} - for fpath in files: - rel = str(fpath.relative_to(root)) if root else str(fpath) - shard_id = self.route_file(rel) - shard_files[shard_id].append(fpath) - - total_files = 0 - total_chunks = 0 - total_duration = 0.0 - - for shard_id, shard_file_list in shard_files.items(): - if not shard_file_list: - continue - self._ensure_loaded(shard_id) - shard = self._shards[shard_id] - stats = shard.sync( - shard_file_list, - root=root, - embedder=self._embedder, - reranker=self._reranker, - **kwargs, - ) - total_files += stats.files_processed - total_chunks += stats.chunks_created - total_duration += stats.duration_seconds - - return IndexStats( - files_processed=total_files, - chunks_created=total_chunks, - duration_seconds=round(total_duration, 2), - ) - - def search( - self, - query: str, - quality: str | None = None, - top_k: int | None = None, - ) -> list[SearchResult]: - """Search all shards in parallel, merge results via RRF fusion. - - Each shard returns its own ranked results. Cross-shard merging - uses reciprocal_rank_fusion with equal weights across shards. - Per-shard top_k is increased to compensate for cross-shard dilution. - - Args: - query: Search query string. - quality: Search quality tier. - top_k: Maximum final results to return. - - Returns: - Merged list of SearchResult ordered by relevance. - """ - cfg = self._config - final_top_k = top_k if top_k is not None else cfg.reranker_top_k - - # Increase per-shard top_k to get enough candidates for cross-shard RRF - per_shard_top_k = max(final_top_k, final_top_k * 2) - - # Load all shards for search - for shard_id in range(self._num_shards): - self._ensure_loaded(shard_id) - - # Parallel search across shards - shard_results: dict[int, list[SearchResult]] = {} - - def _search_shard(sid: int) -> tuple[int, list[SearchResult]]: - shard = self._shards[sid] - results = shard.search( - query, - embedder=self._embedder, - reranker=self._reranker, - quality=quality, - top_k=per_shard_top_k, - ) - return sid, results - - with ThreadPoolExecutor(max_workers=min(self._num_shards, 4)) as pool: - futures = [pool.submit(_search_shard, sid) for sid in range(self._num_shards)] - for future in futures: - try: - sid, results = future.result() - shard_results[sid] = results - except Exception: - logger.warning("Shard search failed", exc_info=True) - - # If only one shard returned results, no merging needed - non_empty = {k: v for k, v in shard_results.items() if v} - if not non_empty: - return [] - if len(non_empty) == 1: - results = list(non_empty.values())[0] - return results[:final_top_k] - - # Cross-shard RRF merge - # Build ranked lists keyed by shard name, with (doc_id, score) tuples - # Use a global result map to look up SearchResult by a unique key - # Since doc_ids are shard-local, we need a composite key - rrf_input: dict[str, list[tuple[int, float]]] = {} - global_results: dict[int, SearchResult] = {} - global_id = 0 - - for sid, results in non_empty.items(): - ranked: list[tuple[int, float]] = [] - for r in results: - global_results[global_id] = r - ranked.append((global_id, r.score)) - global_id += 1 - rrf_input[f"shard_{sid}"] = ranked - - fused = reciprocal_rank_fusion(rrf_input, k=cfg.fusion_k) - - merged: list[SearchResult] = [] - for gid, fused_score in fused[:final_top_k]: - result = global_results[gid] - merged.append(SearchResult( - id=result.id, - path=result.path, - score=fused_score, - snippet=result.snippet, - line=result.line, - end_line=result.end_line, - content=result.content, - )) - - return merged diff --git a/codex-lens-v2/src/codexlens_search/embed/__init__.py b/codex-lens-v2/src/codexlens_search/embed/__init__.py deleted file mode 100644 index f6c9608a..00000000 --- a/codex-lens-v2/src/codexlens_search/embed/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .base import BaseEmbedder -from .local import FastEmbedEmbedder, EMBED_PROFILES -from .api import APIEmbedder - -__all__ = ["BaseEmbedder", "FastEmbedEmbedder", "APIEmbedder", "EMBED_PROFILES"] diff --git a/codex-lens-v2/src/codexlens_search/embed/api.py b/codex-lens-v2/src/codexlens_search/embed/api.py deleted file mode 100644 index a716fcd2..00000000 --- a/codex-lens-v2/src/codexlens_search/embed/api.py +++ /dev/null @@ -1,263 +0,0 @@ -from __future__ import annotations - -import itertools -import logging -import threading -import time -from concurrent.futures import ThreadPoolExecutor, as_completed - -import httpx -import numpy as np - -from ..config import Config -from .base import BaseEmbedder - -logger = logging.getLogger(__name__) - - -class _Endpoint: - """A single API endpoint with its own client and rate-limit tracking.""" - - __slots__ = ("url", "key", "model", "client", "failures", "lock") - - def __init__(self, url: str, key: str, model: str) -> None: - self.url = url.rstrip("/") - if not self.url.endswith("/embeddings"): - self.url += "/embeddings" - self.key = key - self.model = model - self.client = httpx.Client( - headers={ - "Authorization": f"Bearer {key}", - "Content-Type": "application/json", - }, - timeout=60.0, - ) - self.failures = 0 - self.lock = threading.Lock() - - -class APIEmbedder(BaseEmbedder): - """Embedder backed by remote HTTP API (OpenAI /v1/embeddings format). - - Features: - - Token packing: packs small chunks into batches up to max_tokens_per_batch - - Multi-endpoint: round-robins across multiple (url, key) pairs - - Concurrent dispatch: parallel API calls via ThreadPoolExecutor - - Per-endpoint failure tracking and retry with backoff - """ - - def __init__(self, config: Config) -> None: - self._config = config - self._endpoints = self._build_endpoints(config) - self._cycler = itertools.cycle(range(len(self._endpoints))) - self._cycler_lock = threading.Lock() - self._executor = ThreadPoolExecutor( - max_workers=min(config.embed_api_concurrency, len(self._endpoints) * 2), - ) - - @staticmethod - def _build_endpoints(config: Config) -> list[_Endpoint]: - """Build endpoint list from config. Supports both single and multi configs.""" - endpoints: list[_Endpoint] = [] - - # Multi-endpoint config takes priority - if config.embed_api_endpoints: - for ep in config.embed_api_endpoints: - endpoints.append(_Endpoint( - url=ep.get("url", config.embed_api_url), - key=ep.get("key", config.embed_api_key), - model=ep.get("model", config.embed_api_model), - )) - - # Fallback: single endpoint from top-level config - if not endpoints and config.embed_api_url: - endpoints.append(_Endpoint( - url=config.embed_api_url, - key=config.embed_api_key, - model=config.embed_api_model, - )) - - if not endpoints: - raise ValueError("No API embedding endpoints configured") - - return endpoints - - def _next_endpoint(self) -> _Endpoint: - with self._cycler_lock: - idx = next(self._cycler) - return self._endpoints[idx] - - # -- Token packing ------------------------------------------------ - - @staticmethod - def _estimate_tokens(text: str) -> int: - """Rough token estimate: ~4 chars per token for code.""" - return max(1, len(text) // 4) - - def _truncate_text(self, text: str) -> str: - """Truncate text to embed_max_tokens if configured.""" - max_tokens = self._config.embed_max_tokens - if max_tokens <= 0: - return text - max_chars = max_tokens * 4 # inverse of _estimate_tokens - if len(text) > max_chars: - return text[:max_chars] - return text - - def _pack_batches( - self, texts: list[str] - ) -> list[list[tuple[int, str]]]: - """Pack texts into batches respecting max_tokens_per_batch. - - Returns list of batches, each batch is list of (original_index, text). - Also respects embed_batch_size as max items per batch. - """ - max_tokens = self._config.embed_api_max_tokens_per_batch - max_items = self._config.embed_batch_size - batches: list[list[tuple[int, str]]] = [] - current: list[tuple[int, str]] = [] - current_tokens = 0 - - for i, text in enumerate(texts): - tokens = self._estimate_tokens(text) - # Start new batch if adding this text would exceed limits - if current and ( - current_tokens + tokens > max_tokens - or len(current) >= max_items - ): - batches.append(current) - current = [] - current_tokens = 0 - current.append((i, text)) - current_tokens += tokens - - if current: - batches.append(current) - - return batches - - # -- API call with retry ------------------------------------------ - - def _call_api( - self, - texts: list[str], - endpoint: _Endpoint, - max_retries: int = 3, - ) -> list[np.ndarray]: - """Call a single endpoint with retry logic.""" - payload: dict = {"input": texts} - if endpoint.model: - payload["model"] = endpoint.model - - last_exc: Exception | None = None - for attempt in range(max_retries): - try: - response = endpoint.client.post(endpoint.url, json=payload) - except Exception as exc: - last_exc = exc - logger.warning( - "API embed %s failed (attempt %d/%d): %s", - endpoint.url, attempt + 1, max_retries, exc, - ) - time.sleep((2 ** attempt) * 0.5) - continue - - if response.status_code in (429, 503): - logger.warning( - "API embed %s returned HTTP %s (attempt %d/%d), retrying...", - endpoint.url, response.status_code, attempt + 1, max_retries, - ) - time.sleep((2 ** attempt) * 0.5) - continue - - response.raise_for_status() - data = response.json() - - items = data.get("data", []) - items.sort(key=lambda x: x["index"]) - vectors = [ - np.array(item["embedding"], dtype=np.float32) - for item in items - ] - - # Reset failure counter on success - with endpoint.lock: - endpoint.failures = 0 - - return vectors - - # Track failures - with endpoint.lock: - endpoint.failures += 1 - - raise RuntimeError( - f"API embed failed at {endpoint.url} after {max_retries} attempts. " - f"Last error: {last_exc}" - ) - - # -- Public interface --------------------------------------------- - - def embed_single(self, text: str) -> np.ndarray: - text = self._truncate_text(text) - endpoint = self._next_endpoint() - vecs = self._call_api([text], endpoint) - return vecs[0] - - def _call_api_with_split( - self, - texts: list[str], - endpoint: "_Endpoint", - ) -> list[np.ndarray]: - """Call API with automatic batch splitting on 413 errors.""" - try: - return self._call_api(texts, endpoint) - except Exception as exc: - if "413" in str(exc) and len(texts) > 1: - mid = len(texts) // 2 - logger.info("413 received, splitting batch %d → %d + %d", len(texts), mid, len(texts) - mid) - left = self._call_api_with_split(texts[:mid], endpoint) - right = self._call_api_with_split(texts[mid:], endpoint) - return left + right - raise - - def embed_batch(self, texts: list[str]) -> list[np.ndarray]: - if not texts: - return [] - - # 0. Truncate texts exceeding model context - texts = [self._truncate_text(t) for t in texts] - - # 1. Pack into token-aware batches - packed = self._pack_batches(texts) - - if len(packed) == 1: - # Single batch — no concurrency overhead needed - batch_texts = [t for _, t in packed[0]] - batch_indices = [i for i, _ in packed[0]] - endpoint = self._next_endpoint() - vecs = self._call_api_with_split(batch_texts, endpoint) - results: dict[int, np.ndarray] = {} - for idx, vec in zip(batch_indices, vecs): - results[idx] = vec - return [results[i] for i in range(len(texts))] - - # 2. Dispatch batches concurrently across endpoints - results: dict[int, np.ndarray] = {} - futures = [] - batch_index_map: list[list[int]] = [] - - for batch in packed: - batch_texts = [t for _, t in batch] - batch_indices = [i for i, _ in batch] - endpoint = self._next_endpoint() - future = self._executor.submit(self._call_api_with_split, batch_texts, endpoint) - futures.append(future) - batch_index_map.append(batch_indices) - - for future, indices in zip(futures, batch_index_map): - vecs = future.result() # propagates exceptions - for idx, vec in zip(indices, vecs): - results[idx] = vec - - return [results[i] for i in range(len(texts))] diff --git a/codex-lens-v2/src/codexlens_search/embed/base.py b/codex-lens-v2/src/codexlens_search/embed/base.py deleted file mode 100644 index 7e78e75f..00000000 --- a/codex-lens-v2/src/codexlens_search/embed/base.py +++ /dev/null @@ -1,13 +0,0 @@ -from __future__ import annotations -from abc import ABC, abstractmethod -import numpy as np - - -class BaseEmbedder(ABC): - @abstractmethod - def embed_single(self, text: str) -> np.ndarray: - """Embed a single text, returns float32 ndarray shape (dim,).""" - - @abstractmethod - def embed_batch(self, texts: list[str]) -> list[np.ndarray]: - """Embed a list of texts, returns list of float32 ndarrays.""" diff --git a/codex-lens-v2/src/codexlens_search/embed/local.py b/codex-lens-v2/src/codexlens_search/embed/local.py deleted file mode 100644 index b61413e3..00000000 --- a/codex-lens-v2/src/codexlens_search/embed/local.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import annotations - -import numpy as np - -from ..config import Config -from .base import BaseEmbedder - -EMBED_PROFILES = { - "small": "BAAI/bge-small-en-v1.5", # 384d - "base": "BAAI/bge-base-en-v1.5", # 768d - "large": "BAAI/bge-large-en-v1.5", # 1024d - "code": "jinaai/jina-embeddings-v2-base-code", # 768d -} - - -class FastEmbedEmbedder(BaseEmbedder): - """Embedder backed by fastembed.TextEmbedding with lazy model loading.""" - - def __init__(self, config: Config) -> None: - self._config = config - self._model = None - - def _load(self) -> None: - """Lazy-load the fastembed TextEmbedding model on first use.""" - if self._model is not None: - return - from .. import model_manager - model_manager.ensure_model(self._config.embed_model, self._config) - - from fastembed import TextEmbedding - providers = self._config.resolve_embed_providers() - cache_kwargs = model_manager.get_cache_kwargs(self._config) - try: - self._model = TextEmbedding( - model_name=self._config.embed_model, - providers=providers, - **cache_kwargs, - ) - except TypeError: - self._model = TextEmbedding( - model_name=self._config.embed_model, - **cache_kwargs, - ) - - def embed_single(self, text: str) -> np.ndarray: - """Embed a single text, returns float32 ndarray of shape (dim,).""" - self._load() - result = list(self._model.embed([text])) - return result[0].astype(np.float32) - - def embed_batch(self, texts: list[str]) -> list[np.ndarray]: - """Embed a list of texts in batches, returns list of float32 ndarrays.""" - self._load() - batch_size = self._config.embed_batch_size - results: list[np.ndarray] = [] - for start in range(0, len(texts), batch_size): - batch = texts[start : start + batch_size] - for vec in self._model.embed(batch): - results.append(vec.astype(np.float32)) - return results diff --git a/codex-lens-v2/src/codexlens_search/indexing/__init__.py b/codex-lens-v2/src/codexlens_search/indexing/__init__.py deleted file mode 100644 index cf1f4727..00000000 --- a/codex-lens-v2/src/codexlens_search/indexing/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import annotations - -from .metadata import MetadataStore -from .pipeline import IndexingPipeline, IndexStats - -__all__ = ["IndexingPipeline", "IndexStats", "MetadataStore"] diff --git a/codex-lens-v2/src/codexlens_search/indexing/metadata.py b/codex-lens-v2/src/codexlens_search/indexing/metadata.py deleted file mode 100644 index c10b4b96..00000000 --- a/codex-lens-v2/src/codexlens_search/indexing/metadata.py +++ /dev/null @@ -1,300 +0,0 @@ -"""SQLite-backed metadata store for file-to-chunk mapping and tombstone tracking.""" -from __future__ import annotations - -import sqlite3 -import time -from pathlib import Path - - -class MetadataStore: - """Tracks file-to-chunk mappings and deleted chunk IDs (tombstones). - - Tables: - files - file_path (PK), content_hash, last_modified, file_size, - tier ('hot'/'warm'/'cold'), last_accessed (epoch float) - chunks - chunk_id (PK), file_path (FK CASCADE), chunk_hash - deleted_chunks - chunk_id (PK) for tombstone tracking - """ - - def __init__(self, db_path: str | Path) -> None: - self._conn = sqlite3.connect(str(db_path), check_same_thread=False) - self._conn.execute("PRAGMA foreign_keys = ON") - self._conn.execute("PRAGMA journal_mode = WAL") - self._create_tables() - self._migrate_size_column() - self._migrate_tier_columns() - - def _create_tables(self) -> None: - self._conn.executescript(""" - CREATE TABLE IF NOT EXISTS files ( - file_path TEXT PRIMARY KEY, - content_hash TEXT NOT NULL, - last_modified REAL NOT NULL, - file_size INTEGER NOT NULL DEFAULT 0, - tier TEXT NOT NULL DEFAULT 'warm', - last_accessed REAL - ); - - CREATE TABLE IF NOT EXISTS chunks ( - chunk_id INTEGER PRIMARY KEY, - file_path TEXT NOT NULL, - chunk_hash TEXT NOT NULL DEFAULT '', - FOREIGN KEY (file_path) REFERENCES files(file_path) ON DELETE CASCADE - ); - - CREATE TABLE IF NOT EXISTS deleted_chunks ( - chunk_id INTEGER PRIMARY KEY - ); - """) - self._conn.commit() - - def _migrate_size_column(self) -> None: - """Add file_size column if missing (for pre-existing DBs).""" - cols = { - row[1] - for row in self._conn.execute("PRAGMA table_info(files)").fetchall() - } - if "file_size" not in cols: - self._conn.execute( - "ALTER TABLE files ADD COLUMN file_size INTEGER NOT NULL DEFAULT 0" - ) - self._conn.commit() - - def _migrate_tier_columns(self) -> None: - """Add tier and last_accessed columns if missing (for pre-existing DBs).""" - cols = { - row[1] - for row in self._conn.execute("PRAGMA table_info(files)").fetchall() - } - if "tier" not in cols: - self._conn.execute( - "ALTER TABLE files ADD COLUMN tier TEXT NOT NULL DEFAULT 'warm'" - ) - if "last_accessed" not in cols: - self._conn.execute( - "ALTER TABLE files ADD COLUMN last_accessed REAL" - ) - if "tier" not in cols or "last_accessed" not in cols: - self._conn.commit() - - def register_file( - self, - file_path: str, - content_hash: str, - mtime: float, - file_size: int = 0, - ) -> None: - """Insert or update a file record.""" - self._conn.execute( - "INSERT OR REPLACE INTO files " - "(file_path, content_hash, last_modified, file_size) " - "VALUES (?, ?, ?, ?)", - (file_path, content_hash, mtime, file_size), - ) - self._conn.commit() - - def register_chunks( - self, file_path: str, chunk_ids_and_hashes: list[tuple[int, str]] - ) -> None: - """Register chunk IDs belonging to a file. - - Args: - file_path: The owning file path (must already exist in files table). - chunk_ids_and_hashes: List of (chunk_id, chunk_hash) tuples. - """ - if not chunk_ids_and_hashes: - return - self._conn.executemany( - "INSERT OR REPLACE INTO chunks (chunk_id, file_path, chunk_hash) " - "VALUES (?, ?, ?)", - [(cid, file_path, chash) for cid, chash in chunk_ids_and_hashes], - ) - self._conn.commit() - - def mark_file_deleted(self, file_path: str) -> int: - """Move all chunk IDs for a file to deleted_chunks, then remove the file. - - Returns the number of chunks tombstoned. - """ - # Collect chunk IDs before CASCADE deletes them - rows = self._conn.execute( - "SELECT chunk_id FROM chunks WHERE file_path = ?", (file_path,) - ).fetchall() - - if not rows: - # Still remove the file record if it exists - self._conn.execute( - "DELETE FROM files WHERE file_path = ?", (file_path,) - ) - self._conn.commit() - return 0 - - chunk_ids = [(r[0],) for r in rows] - self._conn.executemany( - "INSERT OR IGNORE INTO deleted_chunks (chunk_id) VALUES (?)", - chunk_ids, - ) - # CASCADE deletes chunks rows automatically - self._conn.execute( - "DELETE FROM files WHERE file_path = ?", (file_path,) - ) - self._conn.commit() - return len(chunk_ids) - - def get_deleted_ids(self) -> set[int]: - """Return all tombstoned chunk IDs for search-time filtering.""" - rows = self._conn.execute( - "SELECT chunk_id FROM deleted_chunks" - ).fetchall() - return {r[0] for r in rows} - - def get_file_hash(self, file_path: str) -> str | None: - """Return the stored content hash for a file, or None if not tracked.""" - row = self._conn.execute( - "SELECT content_hash FROM files WHERE file_path = ?", (file_path,) - ).fetchone() - return row[0] if row else None - - def file_needs_update(self, file_path: str, content_hash: str) -> bool: - """Check if a file needs re-indexing based on its content hash.""" - stored = self.get_file_hash(file_path) - if stored is None: - return True # New file - return stored != content_hash - - def file_needs_update_fast( - self, file_path: str, mtime: float, size: int - ) -> bool: - """Fast pre-check using mtime and file size (no content read needed). - - Returns True if the file appears changed or is not yet tracked. - When mtime and size both match stored values, the file is assumed - unchanged (~1000x faster than content-hash comparison). - """ - row = self._conn.execute( - "SELECT last_modified, file_size FROM files WHERE file_path = ?", - (file_path,), - ).fetchone() - if row is None: - return True # New file - stored_mtime, stored_size = row - return stored_mtime != mtime or stored_size != size - - def compact_deleted(self) -> set[int]: - """Return deleted IDs and clear the deleted_chunks table. - - Call this after rebuilding the vector index to reclaim space. - """ - deleted = self.get_deleted_ids() - if deleted: - self._conn.execute("DELETE FROM deleted_chunks") - self._conn.commit() - return deleted - - def get_chunk_ids_for_file(self, file_path: str) -> list[int]: - """Return all chunk IDs belonging to a file.""" - rows = self._conn.execute( - "SELECT chunk_id FROM chunks WHERE file_path = ?", (file_path,) - ).fetchall() - return [r[0] for r in rows] - - def get_all_files(self) -> dict[str, str]: - """Return all tracked files as {file_path: content_hash}.""" - rows = self._conn.execute( - "SELECT file_path, content_hash FROM files" - ).fetchall() - return {r[0]: r[1] for r in rows} - - def max_chunk_id(self) -> int: - """Return the maximum chunk_id across chunks and deleted_chunks. - - Returns -1 if no chunks exist, so that next_id = max_chunk_id() + 1 - starts at 0 for an empty store. - """ - row = self._conn.execute( - "SELECT MAX(m) FROM (" - " SELECT MAX(chunk_id) AS m FROM chunks" - " UNION ALL" - " SELECT MAX(chunk_id) AS m FROM deleted_chunks" - ")" - ).fetchone() - return row[0] if row[0] is not None else -1 - - # ------------------------------------------------------------------ - # Tier management - # ------------------------------------------------------------------ - - def record_access(self, file_path: str) -> None: - """Update last_accessed timestamp for a file.""" - self._conn.execute( - "UPDATE files SET last_accessed = ? WHERE file_path = ?", - (time.time(), file_path), - ) - self._conn.commit() - - def record_access_batch(self, file_paths: list[str]) -> None: - """Batch-update last_accessed timestamps for multiple files.""" - if not file_paths: - return - now = time.time() - self._conn.executemany( - "UPDATE files SET last_accessed = ? WHERE file_path = ?", - [(now, fp) for fp in file_paths], - ) - self._conn.commit() - - def classify_tiers( - self, hot_threshold_hours: int = 24, cold_threshold_hours: int = 168 - ) -> None: - """Reclassify all files into hot/warm/cold tiers based on last_accessed. - - - hot: last_accessed within hot_threshold_hours - - cold: last_accessed older than cold_threshold_hours (or never accessed) - - warm: everything in between - """ - now = time.time() - hot_cutoff = now - hot_threshold_hours * 3600 - cold_cutoff = now - cold_threshold_hours * 3600 - - # Hot: recently accessed - self._conn.execute( - "UPDATE files SET tier = 'hot' " - "WHERE last_accessed IS NOT NULL AND last_accessed >= ?", - (hot_cutoff,), - ) - # Cold: not accessed for a long time, or never accessed - self._conn.execute( - "UPDATE files SET tier = 'cold' " - "WHERE last_accessed IS NULL " - "OR (last_accessed < ? AND last_accessed < ?)", - (cold_cutoff, hot_cutoff), - ) - # Warm: between hot and cold cutoffs - self._conn.execute( - "UPDATE files SET tier = 'warm' " - "WHERE last_accessed IS NOT NULL " - "AND last_accessed >= ? AND last_accessed < ?", - (cold_cutoff, hot_cutoff), - ) - self._conn.commit() - - def get_files_by_tier(self, tier: str) -> list[str]: - """Return file paths in the specified tier ('hot', 'warm', or 'cold').""" - rows = self._conn.execute( - "SELECT file_path FROM files WHERE tier = ?", (tier,) - ).fetchall() - return [r[0] for r in rows] - - def get_cold_files(self) -> list[str]: - """Return file paths in the 'cold' tier.""" - return self.get_files_by_tier("cold") - - def get_file_tier(self, file_path: str) -> str | None: - """Return the tier for a specific file, or None if not tracked.""" - row = self._conn.execute( - "SELECT tier FROM files WHERE file_path = ?", (file_path,) - ).fetchone() - return row[0] if row else None - - def close(self) -> None: - self._conn.close() diff --git a/codex-lens-v2/src/codexlens_search/indexing/pipeline.py b/codex-lens-v2/src/codexlens_search/indexing/pipeline.py deleted file mode 100644 index 64172c23..00000000 --- a/codex-lens-v2/src/codexlens_search/indexing/pipeline.py +++ /dev/null @@ -1,1020 +0,0 @@ -"""Three-stage parallel indexing pipeline: chunk -> embed -> index. - -Uses threading.Thread with queue.Queue for producer-consumer handoff. -The GIL is acceptable because embedding (onnxruntime) releases it in C extensions. -""" -from __future__ import annotations - -import hashlib -import logging -import queue -import re -import threading -import time -from dataclasses import dataclass -from pathlib import Path - -import numpy as np - -from codexlens_search.config import Config -from codexlens_search.core.base import BaseANNIndex, BaseBinaryIndex -from codexlens_search.embed.base import BaseEmbedder -from codexlens_search.indexing.metadata import MetadataStore -from codexlens_search.search.fts import FTSEngine - -logger = logging.getLogger(__name__) - -# Sentinel value to signal worker shutdown -_SENTINEL = None - -# Defaults for chunking (can be overridden via index_files kwargs) -_DEFAULT_MAX_CHUNK_CHARS = 800 -_DEFAULT_CHUNK_OVERLAP = 100 - - -def is_file_excluded(file_path: Path, config: Config) -> str | None: - """Check if a file should be excluded from indexing. - - Returns exclusion reason string, or None if file should be indexed. - """ - # Extension check - suffix = file_path.suffix.lower() - # Handle compound extensions like .min.js - name_lower = file_path.name.lower() - for ext in config.exclude_extensions: - if name_lower.endswith(ext): - return f"excluded extension: {ext}" - - # File size check - try: - size = file_path.stat().st_size - except OSError: - return "cannot stat file" - if size > config.max_file_size_bytes: - return f"exceeds max size ({size} > {config.max_file_size_bytes})" - if size == 0: - return "empty file" - - # Binary detection: sample first N bytes - try: - with open(file_path, "rb") as f: - sample = f.read(config.binary_detect_sample_bytes) - except OSError: - return "cannot read file" - if sample: - null_ratio = sample.count(b"\x00") / len(sample) - if null_ratio > config.binary_null_threshold: - return f"binary file (null ratio: {null_ratio:.2%})" - - # Generated code markers (check first 1KB of text) - try: - head = file_path.read_text(encoding="utf-8", errors="replace")[:1024] - except OSError: - return None # can't check, let it through - for marker in config.generated_code_markers: - if marker in head: - return f"generated code marker: {marker}" - - return None - - -@dataclass -class IndexStats: - """Statistics returned after indexing completes.""" - files_processed: int = 0 - chunks_created: int = 0 - duration_seconds: float = 0.0 - - -class IndexingPipeline: - """Parallel 3-stage indexing pipeline with queue-based handoff. - - Stage 1 (main thread): Read files, chunk text, push to embed_queue. - Stage 2 (embed worker): Pull text batches, call embed_batch(), push vectors to index_queue. - Stage 3 (index worker): Pull vectors+ids, call BinaryStore.add(), ANNIndex.add(), FTS.add_documents(). - - After all stages complete, save() is called on BinaryStore and ANNIndex exactly once. - """ - - def __init__( - self, - embedder: BaseEmbedder, - binary_store: BaseBinaryIndex, - ann_index: BaseANNIndex, - fts: FTSEngine, - config: Config, - metadata: MetadataStore | None = None, - ) -> None: - self._embedder = embedder - self._binary_store = binary_store - self._ann_index = ann_index - self._fts = fts - self._config = config - self._metadata = metadata - - def index_files( - self, - files: list[Path], - *, - root: Path | None = None, - max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS, - chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP, - max_file_size: int = 50_000, - ) -> IndexStats: - """Run the 3-stage pipeline on the given files. - - Args: - files: List of file paths to index. - root: Optional root for computing relative paths. If None, uses - each file's absolute path as its identifier. - max_chunk_chars: Maximum characters per chunk. - chunk_overlap: Character overlap between consecutive chunks. - max_file_size: Skip files larger than this (bytes). - - Returns: - IndexStats with counts and timing. - """ - if not files: - return IndexStats() - - t0 = time.monotonic() - - embed_queue: queue.Queue = queue.Queue(maxsize=4) - index_queue: queue.Queue = queue.Queue(maxsize=4) - - # Track errors from workers - worker_errors: list[Exception] = [] - error_lock = threading.Lock() - - def _record_error(exc: Exception) -> None: - with error_lock: - worker_errors.append(exc) - - # --- Start workers --- - embed_thread = threading.Thread( - target=self._embed_worker, - args=(embed_queue, index_queue, _record_error), - daemon=True, - name="indexing-embed", - ) - index_thread = threading.Thread( - target=self._index_worker, - args=(index_queue, _record_error), - daemon=True, - name="indexing-index", - ) - embed_thread.start() - index_thread.start() - - # --- Stage 1: chunk files (main thread) --- - chunk_id = 0 - files_processed = 0 - chunks_created = 0 - - for fpath in files: - # Noise file filter - exclude_reason = is_file_excluded(fpath, self._config) - if exclude_reason: - logger.debug("Skipping %s: %s", fpath, exclude_reason) - continue - try: - text = fpath.read_text(encoding="utf-8", errors="replace") - except Exception as exc: - logger.debug("Skipping %s: %s", fpath, exc) - continue - - rel_path = str(fpath.relative_to(root)) if root else str(fpath) - file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap) - - if not file_chunks: - continue - - files_processed += 1 - - # Assign sequential IDs and push batch to embed queue - batch_ids = [] - batch_texts = [] - batch_paths = [] - batch_lines: list[tuple[int, int]] = [] - for chunk_text, path, sl, el in file_chunks: - batch_ids.append(chunk_id) - batch_texts.append(chunk_text) - batch_paths.append(path) - batch_lines.append((sl, el)) - chunk_id += 1 - - chunks_created += len(batch_ids) - embed_queue.put((batch_ids, batch_texts, batch_paths, batch_lines)) - - # Signal embed worker: no more data - embed_queue.put(_SENTINEL) - - # Wait for workers to finish - embed_thread.join() - index_thread.join() - - # --- Final flush --- - self._binary_store.save() - self._ann_index.save() - - duration = time.monotonic() - t0 - stats = IndexStats( - files_processed=files_processed, - chunks_created=chunks_created, - duration_seconds=round(duration, 2), - ) - - logger.info( - "Indexing complete: %d files, %d chunks in %.1fs", - stats.files_processed, - stats.chunks_created, - stats.duration_seconds, - ) - - # Raise first worker error if any occurred - if worker_errors: - raise worker_errors[0] - - return stats - - # ------------------------------------------------------------------ - # Workers - # ------------------------------------------------------------------ - - def _embed_worker( - self, - in_q: queue.Queue, - out_q: queue.Queue, - on_error: callable, - ) -> None: - """Stage 2: Pull chunk batches, embed, push (ids, vecs, docs) to index queue.""" - try: - while True: - item = in_q.get() - if item is _SENTINEL: - break - - batch_ids, batch_texts, batch_paths, batch_lines = item - try: - vecs = self._embedder.embed_batch(batch_texts) - vec_array = np.array(vecs, dtype=np.float32) - id_array = np.array(batch_ids, dtype=np.int64) - out_q.put((id_array, vec_array, batch_texts, batch_paths, batch_lines)) - except Exception as exc: - logger.error("Embed worker error: %s", exc) - on_error(exc) - finally: - # Signal index worker: no more data - out_q.put(_SENTINEL) - - def _index_worker( - self, - in_q: queue.Queue, - on_error: callable, - ) -> None: - """Stage 3: Pull (ids, vecs, texts, paths, lines), write to stores.""" - while True: - item = in_q.get() - if item is _SENTINEL: - break - - id_array, vec_array, texts, paths, line_ranges = item - try: - self._binary_store.add(id_array, vec_array) - self._ann_index.add(id_array, vec_array) - - fts_docs = [ - (int(id_array[i]), paths[i], texts[i], - line_ranges[i][0], line_ranges[i][1]) - for i in range(len(id_array)) - ] - self._fts.add_documents(fts_docs) - except Exception as exc: - logger.error("Index worker error: %s", exc) - on_error(exc) - - # ------------------------------------------------------------------ - # Chunking - # ------------------------------------------------------------------ - - @staticmethod - def _chunk_text( - text: str, - path: str, - max_chars: int, - overlap: int, - ) -> list[tuple[str, str, int, int]]: - """Split file text into overlapping chunks. - - Returns list of (chunk_text, path, start_line, end_line) tuples. - Line numbers are 1-based. - """ - if not text.strip(): - return [] - - chunks: list[tuple[str, str, int, int]] = [] - lines = text.splitlines(keepends=True) - current: list[str] = [] - current_len = 0 - chunk_start_line = 1 # 1-based - lines_consumed = 0 - - for line in lines: - lines_consumed += 1 - if current_len + len(line) > max_chars and current: - chunk = "".join(current) - end_line = lines_consumed - 1 - chunks.append((chunk, path, chunk_start_line, end_line)) - # overlap: keep last N characters - tail = chunk[-overlap:] if overlap else "" - tail_newlines = tail.count("\n") - chunk_start_line = max(1, end_line - tail_newlines + 1) - current = [tail] if tail else [] - current_len = len(tail) - current.append(line) - current_len += len(line) - - if current: - chunks.append(("".join(current), path, chunk_start_line, lines_consumed)) - - return chunks - - # Pattern matching top-level definitions across languages - _CODE_BOUNDARY_RE = re.compile( - r"^(?:" - r"(?:export\s+)?(?:async\s+)?(?:def|class|function)\s+" # Python/JS/TS - r"|(?:pub\s+)?(?:fn|struct|impl|enum|trait|mod)\s+" # Rust - r"|(?:func|type)\s+" # Go - r"|(?:public|private|protected|internal)?\s*(?:static\s+)?(?:class|interface|enum|record)\s+" # Java/C# - r"|(?:namespace|template)\s+" # C++ - r")", - re.MULTILINE, - ) - - def _chunk_code( - self, - text: str, - path: str, - max_chars: int, - overlap: int, - ) -> list[tuple[str, str, int, int]]: - """Split code at function/class boundaries with fallback to _chunk_text. - - Strategy: - 1. Find all top-level definition boundaries via regex. - 2. Split text into segments at those boundaries. - 3. Merge small adjacent segments up to max_chars. - 4. If a segment exceeds max_chars, fall back to _chunk_text for that segment. - """ - lines = text.splitlines(keepends=True) - if not lines: - return [] - - # Find boundary line numbers (0-based) - boundaries: list[int] = [0] # always start at line 0 - for i, line in enumerate(lines): - if i == 0: - continue - # Only match lines with no or minimal indentation (top-level) - stripped = line.lstrip() - indent = len(line) - len(stripped) - if indent <= 4 and self._CODE_BOUNDARY_RE.match(stripped): - boundaries.append(i) - - if len(boundaries) <= 1: - # No boundaries found, fall back to text chunking - return self._chunk_text(text, path, max_chars, overlap) - - # Build raw segments between boundaries - raw_segments: list[tuple[int, int]] = [] # (start_line, end_line) 0-based - for idx in range(len(boundaries)): - start = boundaries[idx] - end = boundaries[idx + 1] if idx + 1 < len(boundaries) else len(lines) - raw_segments.append((start, end)) - - # Merge small adjacent segments up to max_chars - merged: list[tuple[int, int]] = [] - cur_start, cur_end = raw_segments[0] - cur_len = sum(len(lines[i]) for i in range(cur_start, cur_end)) - - for seg_start, seg_end in raw_segments[1:]: - seg_len = sum(len(lines[i]) for i in range(seg_start, seg_end)) - if cur_len + seg_len <= max_chars: - cur_end = seg_end - cur_len += seg_len - else: - merged.append((cur_start, cur_end)) - cur_start, cur_end = seg_start, seg_end - cur_len = seg_len - merged.append((cur_start, cur_end)) - - # Build chunks, falling back to _chunk_text for oversized segments - chunks: list[tuple[str, str, int, int]] = [] - for seg_start, seg_end in merged: - seg_text = "".join(lines[seg_start:seg_end]) - if len(seg_text) > max_chars: - # Oversized: sub-chunk with text splitter - sub_chunks = self._chunk_text(seg_text, path, max_chars, overlap) - # Adjust line numbers relative to segment start - for chunk_text, p, sl, el in sub_chunks: - chunks.append((chunk_text, p, sl + seg_start, el + seg_start)) - else: - chunks.append((seg_text, path, seg_start + 1, seg_end)) - - return chunks - - def _smart_chunk( - self, - text: str, - path: str, - max_chars: int, - overlap: int, - ) -> list[tuple[str, str, int, int]]: - """Choose chunking strategy based on file type and config.""" - if self._config.code_aware_chunking: - suffix = Path(path).suffix.lower() - if suffix in self._config.code_extensions: - result = self._chunk_code(text, path, max_chars, overlap) - if result: - return result - return self._chunk_text(text, path, max_chars, overlap) - - # ------------------------------------------------------------------ - # Incremental API - # ------------------------------------------------------------------ - - @staticmethod - def _content_hash(text: str) -> str: - """Compute SHA-256 hex digest of file content.""" - return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() - - def _require_metadata(self) -> MetadataStore: - """Return metadata store or raise if not configured.""" - if self._metadata is None: - raise RuntimeError( - "MetadataStore is required for incremental indexing. " - "Pass metadata= to IndexingPipeline.__init__." - ) - return self._metadata - - def _next_chunk_id(self) -> int: - """Return the next available chunk ID from MetadataStore.""" - meta = self._require_metadata() - return meta.max_chunk_id() + 1 - - def index_files_fts_only( - self, - files: list[Path], - *, - root: Path | None = None, - max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS, - chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP, - ) -> IndexStats: - """Index files into FTS5 only, without embedding or vector indexing. - - Chunks files using the same logic as the full pipeline, then inserts - directly into FTS. No embedding computation, no binary/ANN store writes. - - Args: - files: List of file paths to index. - root: Optional root for computing relative paths. - max_chunk_chars: Maximum characters per chunk. - chunk_overlap: Character overlap between consecutive chunks. - - Returns: - IndexStats with counts and timing. - """ - if not files: - return IndexStats() - - meta = self._require_metadata() - t0 = time.monotonic() - chunk_id = self._next_chunk_id() - files_processed = 0 - chunks_created = 0 - - for fpath in files: - exclude_reason = is_file_excluded(fpath, self._config) - if exclude_reason: - logger.debug("Skipping %s: %s", fpath, exclude_reason) - continue - try: - text = fpath.read_text(encoding="utf-8", errors="replace") - except Exception as exc: - logger.debug("Skipping %s: %s", fpath, exc) - continue - - rel_path = str(fpath.relative_to(root)) if root else str(fpath) - content_hash = self._content_hash(text) - - # Skip unchanged files - if not meta.file_needs_update(rel_path, content_hash): - continue - - # Remove old FTS data if file was previously indexed - if meta.get_file_hash(rel_path) is not None: - meta.mark_file_deleted(rel_path) - self._fts.delete_by_path(rel_path) - - file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap) - if not file_chunks: - st = fpath.stat() - meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size) - continue - - files_processed += 1 - fts_docs = [] - chunk_id_hashes = [] - for chunk_text, path, sl, el in file_chunks: - fts_docs.append((chunk_id, path, chunk_text, sl, el)) - chunk_id_hashes.append((chunk_id, self._content_hash(chunk_text))) - chunk_id += 1 - - self._fts.add_documents(fts_docs) - chunks_created += len(fts_docs) - - # Register metadata - st = fpath.stat() - meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size) - meta.register_chunks(rel_path, chunk_id_hashes) - - duration = time.monotonic() - t0 - stats = IndexStats( - files_processed=files_processed, - chunks_created=chunks_created, - duration_seconds=round(duration, 2), - ) - logger.info( - "FTS-only indexing complete: %d files, %d chunks in %.1fs", - stats.files_processed, stats.chunks_created, stats.duration_seconds, - ) - return stats - - def index_file( - self, - file_path: Path, - *, - root: Path | None = None, - force: bool = False, - max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS, - chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP, - max_file_size: int = 50_000, - ) -> IndexStats: - """Index a single file incrementally. - - Skips files that have not changed (same content_hash) unless - *force* is True. - - Args: - file_path: Path to the file to index. - root: Optional root for computing relative path identifiers. - force: Re-index even if content hash has not changed. - max_chunk_chars: Maximum characters per chunk. - chunk_overlap: Character overlap between consecutive chunks. - max_file_size: Skip files larger than this (bytes). - - Returns: - IndexStats with counts and timing. - """ - meta = self._require_metadata() - t0 = time.monotonic() - - # Noise file filter - exclude_reason = is_file_excluded(file_path, self._config) - if exclude_reason: - logger.debug("Skipping %s: %s", file_path, exclude_reason) - return IndexStats(duration_seconds=round(time.monotonic() - t0, 2)) - - # Read file - try: - text = file_path.read_text(encoding="utf-8", errors="replace") - except Exception as exc: - logger.debug("Skipping %s: %s", file_path, exc) - return IndexStats(duration_seconds=round(time.monotonic() - t0, 2)) - - content_hash = self._content_hash(text) - rel_path = str(file_path.relative_to(root)) if root else str(file_path) - - # Check if update is needed - if not force and not meta.file_needs_update(rel_path, content_hash): - logger.debug("Skipping %s: unchanged", rel_path) - return IndexStats(duration_seconds=round(time.monotonic() - t0, 2)) - - # If file was previously indexed, remove old data first - if meta.get_file_hash(rel_path) is not None: - meta.mark_file_deleted(rel_path) - self._fts.delete_by_path(rel_path) - - # Chunk - file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap) - if not file_chunks: - # Register file with no chunks - st = file_path.stat() - meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size) - return IndexStats( - files_processed=1, - duration_seconds=round(time.monotonic() - t0, 2), - ) - - # Assign chunk IDs - start_id = self._next_chunk_id() - batch_ids = [] - batch_texts = [] - batch_paths = [] - batch_lines: list[tuple[int, int]] = [] - for i, (chunk_text, path, sl, el) in enumerate(file_chunks): - batch_ids.append(start_id + i) - batch_texts.append(chunk_text) - batch_paths.append(path) - batch_lines.append((sl, el)) - - # Embed synchronously - vecs = self._embedder.embed_batch(batch_texts) - vec_array = np.array(vecs, dtype=np.float32) - id_array = np.array(batch_ids, dtype=np.int64) - - # Index: write to stores - self._binary_store.add(id_array, vec_array) - self._ann_index.add(id_array, vec_array) - fts_docs = [ - (batch_ids[i], batch_paths[i], batch_texts[i], - batch_lines[i][0], batch_lines[i][1]) - for i in range(len(batch_ids)) - ] - self._fts.add_documents(fts_docs) - - # Register in metadata - st = file_path.stat() - meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size) - chunk_id_hashes = [ - (batch_ids[i], self._content_hash(batch_texts[i])) - for i in range(len(batch_ids)) - ] - meta.register_chunks(rel_path, chunk_id_hashes) - - # Flush stores - self._binary_store.save() - self._ann_index.save() - - duration = time.monotonic() - t0 - stats = IndexStats( - files_processed=1, - chunks_created=len(batch_ids), - duration_seconds=round(duration, 2), - ) - logger.info( - "Indexed file %s: %d chunks in %.2fs", - rel_path, stats.chunks_created, stats.duration_seconds, - ) - return stats - - def remove_file(self, file_path: str) -> None: - """Mark a file as deleted via tombstone strategy. - - Marks all chunk IDs for the file in MetadataStore.deleted_chunks - and removes the file's FTS entries. - - Args: - file_path: The relative path identifier of the file to remove. - """ - meta = self._require_metadata() - count = meta.mark_file_deleted(file_path) - fts_count = self._fts.delete_by_path(file_path) - logger.info( - "Removed file %s: %d chunks tombstoned, %d FTS entries deleted", - file_path, count, fts_count, - ) - - def sync( - self, - file_paths: list[Path], - *, - root: Path | None = None, - max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS, - chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP, - max_file_size: int = 50_000, - progress_callback: callable | None = None, - tier: str = "full", - ) -> IndexStats: - """Reconcile index state against a current file list. - - Identifies files that are new, changed, or removed and processes - each accordingly. - - Args: - file_paths: Current list of files that should be indexed. - root: Optional root for computing relative path identifiers. - max_chunk_chars: Maximum characters per chunk. - chunk_overlap: Character overlap between consecutive chunks. - max_file_size: Skip files larger than this (bytes). - tier: Indexing tier - 'full' (default) runs the full pipeline - with embedding, 'fts_only' runs FTS-only indexing without - embedding or vector stores. - - Returns: - Aggregated IndexStats for all operations. - """ - meta = self._require_metadata() - t0 = time.monotonic() - - # Build set of current relative paths - current_rel_paths: dict[str, Path] = {} - for fpath in file_paths: - rel = str(fpath.relative_to(root)) if root else str(fpath) - current_rel_paths[rel] = fpath - - # Get known files from metadata - known_files = meta.get_all_files() # {rel_path: content_hash} - - # Detect removed files - removed = set(known_files.keys()) - set(current_rel_paths.keys()) - for rel in removed: - self.remove_file(rel) - - # Collect files needing update using 4-level detection: - # Level 1: set diff (removed files) - handled above - # Level 2: mtime + size fast pre-check via stat() - # Level 3: content hash only when mtime/size mismatch - files_to_index: list[Path] = [] - for rel, fpath in current_rel_paths.items(): - # Level 2: stat-based fast check - try: - st = fpath.stat() - except OSError: - continue - if not meta.file_needs_update_fast(rel, st.st_mtime, st.st_size): - # mtime + size match stored values -> skip (no read needed) - continue - - # Level 3: mtime/size changed -> verify with content hash - try: - text = fpath.read_text(encoding="utf-8", errors="replace") - except Exception: - continue - content_hash = self._content_hash(text) - if not meta.file_needs_update(rel, content_hash): - # Content unchanged despite mtime/size change -> update metadata only - meta.register_file(rel, content_hash, st.st_mtime, st.st_size) - continue - - # File genuinely changed -> remove old data and queue for re-index - if meta.get_file_hash(rel) is not None: - meta.mark_file_deleted(rel) - self._fts.delete_by_path(rel) - files_to_index.append(fpath) - - # Sort files by data tier priority: hot first, then warm, then cold - if files_to_index: - _tier_priority = {"hot": 0, "warm": 1, "cold": 2} - def _tier_sort_key(fp: Path) -> int: - rel = str(fp.relative_to(root)) if root else str(fp) - t = meta.get_file_tier(rel) - return _tier_priority.get(t or "warm", 1) - files_to_index.sort(key=_tier_sort_key) - - # Reclassify data tiers after sync detection - meta.classify_tiers( - self._config.tier_hot_hours, self._config.tier_cold_hours - ) - - # Batch index via parallel pipeline or FTS-only - if files_to_index: - if tier == "fts_only": - batch_stats = self.index_files_fts_only( - files_to_index, - root=root, - max_chunk_chars=max_chunk_chars, - chunk_overlap=chunk_overlap, - ) - else: - # Full pipeline with embedding - start_id = self._next_chunk_id() - batch_stats = self._index_files_with_metadata( - files_to_index, - root=root, - max_chunk_chars=max_chunk_chars, - chunk_overlap=chunk_overlap, - start_chunk_id=start_id, - progress_callback=progress_callback, - ) - total_files = batch_stats.files_processed - total_chunks = batch_stats.chunks_created - else: - total_files = 0 - total_chunks = 0 - - duration = time.monotonic() - t0 - result = IndexStats( - files_processed=total_files, - chunks_created=total_chunks, - duration_seconds=round(duration, 2), - ) - logger.info( - "Sync complete: %d files indexed, %d chunks created, " - "%d files removed in %.1fs", - result.files_processed, result.chunks_created, - len(removed), result.duration_seconds, - ) - return result - - def _index_files_with_metadata( - self, - files: list[Path], - *, - root: Path | None = None, - max_chunk_chars: int = _DEFAULT_MAX_CHUNK_CHARS, - chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP, - start_chunk_id: int = 0, - progress_callback: callable | None = None, - ) -> IndexStats: - """Batch index files using the parallel pipeline, registering metadata. - - Like index_files() but also registers each file and its chunks - in the MetadataStore for incremental tracking. - - Args: - files: Files to index. - root: Root for relative paths. - max_chunk_chars: Max chars per chunk. - chunk_overlap: Overlap between chunks. - start_chunk_id: Starting chunk ID. - progress_callback: Optional callback(files_done, total_files) for progress. - """ - meta = self._require_metadata() - if not files: - return IndexStats() - - t0 = time.monotonic() - - embed_queue: queue.Queue = queue.Queue(maxsize=4) - index_queue: queue.Queue = queue.Queue(maxsize=4) - - worker_errors: list[Exception] = [] - error_lock = threading.Lock() - - def _record_error(exc: Exception) -> None: - with error_lock: - worker_errors.append(exc) - - embed_thread = threading.Thread( - target=self._embed_worker, - args=(embed_queue, index_queue, _record_error), - daemon=True, name="sync-embed", - ) - index_thread = threading.Thread( - target=self._index_worker, - args=(index_queue, _record_error), - daemon=True, name="sync-index", - ) - embed_thread.start() - index_thread.start() - - chunk_id = start_chunk_id - files_processed = 0 - chunks_created = 0 - total_files = len(files) - - # Cross-file chunk accumulator for optimal API batch utilization - max_batch_items = self._config.embed_batch_size - max_batch_tokens = self._config.embed_api_max_tokens_per_batch - buf_ids: list[int] = [] - buf_texts: list[str] = [] - buf_paths: list[str] = [] - buf_lines: list[tuple[int, int]] = [] - buf_tokens = 0 - - def _flush_buffer() -> None: - nonlocal buf_ids, buf_texts, buf_paths, buf_lines, buf_tokens - if buf_ids: - embed_queue.put((list(buf_ids), list(buf_texts), list(buf_paths), list(buf_lines))) - buf_ids.clear() - buf_texts.clear() - buf_paths.clear() - buf_lines.clear() - buf_tokens = 0 - - for fpath in files: - exclude_reason = is_file_excluded(fpath, self._config) - if exclude_reason: - logger.debug("Skipping %s: %s", fpath, exclude_reason) - if progress_callback: - progress_callback(files_processed, total_files) - continue - try: - text = fpath.read_text(encoding="utf-8", errors="replace") - except Exception as exc: - logger.debug("Skipping %s: %s", fpath, exc) - if progress_callback: - progress_callback(files_processed, total_files) - continue - - rel_path = str(fpath.relative_to(root)) if root else str(fpath) - content_hash = self._content_hash(text) - file_chunks = self._smart_chunk(text, rel_path, max_chunk_chars, chunk_overlap) - - if not file_chunks: - st = fpath.stat() - meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size) - continue - - files_processed += 1 - file_chunk_ids = [] - for chunk_text, path, sl, el in file_chunks: - chunk_tokens = max(1, len(chunk_text) // 4) - # Flush if adding this chunk would exceed batch limits - if buf_ids and ( - len(buf_ids) >= max_batch_items - or buf_tokens + chunk_tokens > max_batch_tokens - ): - _flush_buffer() - - buf_ids.append(chunk_id) - buf_texts.append(chunk_text) - buf_paths.append(path) - buf_lines.append((sl, el)) - buf_tokens += chunk_tokens - file_chunk_ids.append((chunk_id, chunk_text)) - chunk_id += 1 - - chunks_created += len(file_chunk_ids) - - # Register metadata per file - st = fpath.stat() - meta.register_file(rel_path, content_hash, st.st_mtime, st.st_size) - chunk_id_hashes = [ - (cid, self._content_hash(ct)) for cid, ct in file_chunk_ids - ] - meta.register_chunks(rel_path, chunk_id_hashes) - - if progress_callback: - progress_callback(files_processed, total_files) - - # Final flush for remaining chunks - _flush_buffer() - - embed_queue.put(_SENTINEL) - embed_thread.join() - index_thread.join() - - self._binary_store.save() - self._ann_index.save() - - duration = time.monotonic() - t0 - - if worker_errors: - raise worker_errors[0] - - return IndexStats( - files_processed=files_processed, - chunks_created=chunks_created, - duration_seconds=round(duration, 2), - ) - - def compact(self) -> None: - """Rebuild indexes excluding tombstoned chunk IDs. - - Reads all deleted IDs from MetadataStore, rebuilds BinaryStore - and ANNIndex without those entries, then clears the - deleted_chunks table. - """ - meta = self._require_metadata() - deleted_ids = meta.compact_deleted() - if not deleted_ids: - logger.debug("Compact: no deleted IDs, nothing to do") - return - - logger.info("Compact: rebuilding indexes, excluding %d deleted IDs", len(deleted_ids)) - - # Rebuild BinaryStore: read current data, filter, replace - if self._binary_store._count > 0: - active_ids = self._binary_store._ids[: self._binary_store._count] - active_matrix = self._binary_store._matrix[: self._binary_store._count] - mask = ~np.isin(active_ids, list(deleted_ids)) - kept_ids = active_ids[mask] - kept_matrix = active_matrix[mask] - # Reset store - self._binary_store._count = 0 - self._binary_store._matrix = None - self._binary_store._ids = None - if len(kept_ids) > 0: - self._binary_store._ensure_capacity(len(kept_ids)) - self._binary_store._matrix[: len(kept_ids)] = kept_matrix - self._binary_store._ids[: len(kept_ids)] = kept_ids - self._binary_store._count = len(kept_ids) - self._binary_store.save() - - # Rebuild ANNIndex: must reconstruct from scratch since HNSW - # does not support deletion. We re-initialize and re-add kept items. - # Note: we need the float32 vectors, but BinaryStore only has quantized. - # ANNIndex (hnswlib) supports mark_deleted, but compact means full rebuild. - # Since we don't have original float vectors cached, we rely on the fact - # that ANNIndex.mark_deleted is not available in all hnswlib versions. - # Instead, we reinitialize the index and let future searches filter via - # deleted_ids at query time. The BinaryStore is already compacted above. - # For a full ANN rebuild, the caller should re-run index_files() on all - # files after compact. - logger.info( - "Compact: BinaryStore rebuilt (%d entries kept). " - "Note: ANNIndex retains stale entries; run full re-index for clean ANN state.", - self._binary_store._count, - ) diff --git a/codex-lens-v2/src/codexlens_search/mcp_server.py b/codex-lens-v2/src/codexlens_search/mcp_server.py deleted file mode 100644 index b5448960..00000000 --- a/codex-lens-v2/src/codexlens_search/mcp_server.py +++ /dev/null @@ -1,524 +0,0 @@ -"""MCP server for codexlens-search. - -Exposes semantic code search tools via FastMCP for Claude Code integration. -Run as: codexlens-mcp (entry point) or python -m codexlens_search.mcp_server - -## .mcp.json Configuration Examples - -### API embedding + API reranker (single endpoint): -{ - "mcpServers": { - "codexlens": { - "command": "codexlens-mcp", - "env": { - "CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1", - "CODEXLENS_EMBED_API_KEY": "sk-xxx", - "CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small", - "CODEXLENS_EMBED_DIM": "1536", - "CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1", - "CODEXLENS_RERANKER_API_KEY": "jina-xxx", - "CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual" - } - } - } -} - -### API embedding (multi-endpoint load balancing): -{ - "mcpServers": { - "codexlens": { - "command": "codexlens-mcp", - "env": { - "CODEXLENS_EMBED_API_ENDPOINTS": "url1|key1|model1,url2|key2|model2", - "CODEXLENS_EMBED_DIM": "1536", - "CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1", - "CODEXLENS_RERANKER_API_KEY": "jina-xxx", - "CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual" - } - } - } -} - -### Local fastembed model (no API, requires codexlens-search[semantic]): -{ - "mcpServers": { - "codexlens": { - "command": "codexlens-mcp", - "env": {} - } - } -} -Pre-download models via CLI: codexlens-search download-models - -### Env vars reference: -Embedding: CODEXLENS_EMBED_API_URL, _KEY, _MODEL, _ENDPOINTS (multi), _DIM -Reranker: CODEXLENS_RERANKER_API_URL, _KEY, _MODEL -Tuning: CODEXLENS_BINARY_TOP_K, _ANN_TOP_K, _FTS_TOP_K, _FUSION_K, - CODEXLENS_RERANKER_TOP_K, _RERANKER_BATCH_SIZE -""" -from __future__ import annotations - -import asyncio -import logging -import threading -from pathlib import Path - -from mcp.server.fastmcp import Context, FastMCP - -from codexlens_search.bridge import ( - DEFAULT_EXCLUDES, - create_config_from_env, - create_pipeline, - should_exclude, -) - -log = logging.getLogger("codexlens_search.mcp_server") - -mcp = FastMCP("codexlens-search") - -# Pipeline cache: keyed by resolved project_path -> (indexing, search, config) -_pipelines: dict[str, tuple] = {} -_lock = threading.Lock() - - -def _db_path_for_project(project_path: str) -> Path: - """Return the index database path for a project.""" - return Path(project_path).resolve() / ".codexlens" - - -def _get_pipelines(project_path: str) -> tuple: - """Get or create cached (indexing_pipeline, search_pipeline, config) for a project.""" - resolved = str(Path(project_path).resolve()) - with _lock: - if resolved not in _pipelines: - db_path = _db_path_for_project(resolved) - config = create_config_from_env(db_path) - _pipelines[resolved] = create_pipeline(db_path, config) - return _pipelines[resolved] - - -# --------------------------------------------------------------------------- -# Search tools -# --------------------------------------------------------------------------- - -@mcp.tool() -def search_code( - project_path: str, query: str, top_k: int = 10, quality: str = "auto" -) -> str: - """Semantic code search with hybrid fusion (vector + FTS + reranking). - - Args: - project_path: Absolute path to the project root directory. - query: Natural language or code search query. - top_k: Maximum number of results to return (default 10). - quality: Search quality tier (default "auto"): - - "fast": FTS-only + rerank (no embedding needed, fastest) - - "balanced": FTS + binary coarse search + rerank - - "thorough": Full 2-stage vector + FTS + reranking (best quality) - - "auto": Uses "thorough" if vector index exists, else "fast" - - Returns: - Search results as formatted text with file paths, line numbers, scores, and code snippets. - """ - root = Path(project_path).resolve() - if not root.is_dir(): - return f"Error: project path not found: {root}" - - db_path = _db_path_for_project(project_path) - if not (db_path / "metadata.db").exists(): - return f"Error: no index found at {db_path}. Run index_project first." - - valid_qualities = ("fast", "balanced", "thorough", "auto") - if quality not in valid_qualities: - return f"Error: invalid quality '{quality}'. Must be one of: {', '.join(valid_qualities)}" - - _, search, _ = _get_pipelines(project_path) - results = search.search(query, top_k=top_k, quality=quality) - - if not results: - return "No results found." - - lines = [] - for i, r in enumerate(results, 1): - lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})") - lines.append(f"```\n{r.content}\n```") - lines.append("") - return "\n".join(lines) - - -@mcp.tool() -def search_scope( - project_path: str, - query: str, - scope_path: str, - top_k: int = 10, - quality: str = "auto", -) -> str: - """Search within a specific directory scope of a project. - - Runs a normal search then filters results to only include files - under the specified scope path. - - Args: - project_path: Absolute path to the project root directory. - query: Natural language or code search query. - scope_path: Relative directory path to limit search scope (e.g. "src/auth"). - top_k: Maximum number of scoped results to return (default 10). - quality: Search quality tier ("fast", "balanced", "thorough", "auto"). - - Returns: - Search results filtered to the scope path. - """ - root = Path(project_path).resolve() - if not root.is_dir(): - return f"Error: project path not found: {root}" - - db_path = _db_path_for_project(project_path) - if not (db_path / "metadata.db").exists(): - return f"Error: no index found at {db_path}. Run index_project first." - - # Normalize scope path for prefix matching - scope = scope_path.replace("\\", "/").strip("/") - - _, search, _ = _get_pipelines(project_path) - # Fetch more results than top_k to account for filtering - all_results = search.search(query, top_k=top_k * 5, quality=quality) - - # Filter by scope path prefix - scoped = [ - r for r in all_results - if r.path.replace("\\", "/").startswith(scope + "/") - or r.path.replace("\\", "/") == scope - ] - - if not scoped: - return f"No results found in scope '{scope_path}'." - - lines = [] - for i, r in enumerate(scoped[:top_k], 1): - lines.append(f"## Result {i} -- {r.path} (L{r.line}-{r.end_line}, score: {r.score:.4f})") - lines.append(f"```\n{r.content}\n```") - lines.append("") - return "\n".join(lines) - - -# --------------------------------------------------------------------------- -# Indexing tools -# --------------------------------------------------------------------------- - -@mcp.tool() -async def index_project( - project_path: str, glob_pattern: str = "**/*", force: bool = False, - ctx: Context | None = None, -) -> str: - """Build or rebuild the search index for a project. - - Args: - project_path: Absolute path to the project root directory. - glob_pattern: Glob pattern for files to index (default "**/*"). - force: If True, rebuild index from scratch even if it exists. - - Returns: - Indexing summary with file count, chunk count, and duration. - """ - root = Path(project_path).resolve() - if not root.is_dir(): - return f"Error: project path not found: {root}" - - if force: - with _lock: - _pipelines.pop(str(root), None) - - indexing, _, _ = _get_pipelines(project_path) - - file_paths = [ - p for p in root.glob(glob_pattern) - if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES) - ] - - if ctx: - await ctx.report_progress(0, len(file_paths), f"Scanning {len(file_paths)} files...") - - # Progress callback bridging sync pipeline → async MCP context - loop = asyncio.get_event_loop() - - def _progress(done: int, total: int) -> None: - if ctx: - asyncio.run_coroutine_threadsafe( - ctx.report_progress(done, total, f"Indexed {done}/{total} files"), - loop, - ) - - stats = indexing.sync(file_paths, root=root, progress_callback=_progress) - - if ctx: - await ctx.report_progress( - stats.files_processed, stats.files_processed, - f"Done: {stats.files_processed} files, {stats.chunks_created} chunks" - ) - - return ( - f"Indexed {stats.files_processed} files, " - f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s. " - f"DB: {_db_path_for_project(project_path)}" - ) - - -@mcp.tool() -def index_status(project_path: str) -> str: - """Show index statistics for a project. - - Args: - project_path: Absolute path to the project root directory. - - Returns: - Index statistics including file count, chunk count, and deleted chunks. - """ - from codexlens_search.indexing.metadata import MetadataStore - - db_path = _db_path_for_project(project_path) - meta_path = db_path / "metadata.db" - - if not meta_path.exists(): - return f"No index found at {db_path}. Run index_project first." - - metadata = MetadataStore(meta_path) - all_files = metadata.get_all_files() - deleted_ids = metadata.get_deleted_ids() - max_chunk = metadata.max_chunk_id() - - total = max_chunk + 1 if max_chunk >= 0 else 0 - return ( - f"Index: {db_path}\n" - f"Files tracked: {len(all_files)}\n" - f"Total chunks: {total}\n" - f"Deleted chunks: {len(deleted_ids)}" - ) - - -@mcp.tool() -async def index_update( - project_path: str, glob_pattern: str = "**/*", - ctx: Context | None = None, -) -> str: - """Incrementally sync the index with current project files. - - Only re-indexes files that changed since last indexing. - - Args: - project_path: Absolute path to the project root directory. - glob_pattern: Glob pattern for files to sync (default "**/*"). - - Returns: - Sync summary with processed file count and duration. - """ - root = Path(project_path).resolve() - if not root.is_dir(): - return f"Error: project path not found: {root}" - - indexing, _, _ = _get_pipelines(project_path) - - file_paths = [ - p for p in root.glob(glob_pattern) - if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES) - ] - - if ctx: - await ctx.report_progress(0, len(file_paths), f"Scanning {len(file_paths)} files...") - - loop = asyncio.get_event_loop() - - def _progress(done: int, total: int) -> None: - if ctx: - asyncio.run_coroutine_threadsafe( - ctx.report_progress(done, total, f"Synced {done}/{total} files"), - loop, - ) - - stats = indexing.sync(file_paths, root=root, progress_callback=_progress) - return ( - f"Synced {stats.files_processed} files, " - f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s." - ) - - -@mcp.tool() -def index_scope( - project_path: str, - scope_path: str, - glob_pattern: str = "**/*", - tier: str = "full", -) -> str: - """Index a specific directory scope within a project. - - Useful for quickly indexing a subdirectory (e.g. after editing files - in a specific module) without re-indexing the entire project. - - Args: - project_path: Absolute path to the project root directory. - scope_path: Relative directory path to index (e.g. "src/auth"). - glob_pattern: Glob pattern for files within scope (default "**/*"). - tier: Indexing tier - "full" (default) runs full pipeline with - embedding, "fts_only" indexes text only (faster, no vectors). - - Returns: - Indexing summary for the scoped directory. - """ - root = Path(project_path).resolve() - if not root.is_dir(): - return f"Error: project path not found: {root}" - - scope_dir = root / scope_path - if not scope_dir.is_dir(): - return f"Error: scope directory not found: {scope_dir}" - - valid_tiers = ("full", "fts_only") - if tier not in valid_tiers: - return f"Error: invalid tier '{tier}'. Must be one of: {', '.join(valid_tiers)}" - - indexing, _, _ = _get_pipelines(project_path) - - file_paths = [ - p for p in scope_dir.glob(glob_pattern) - if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES) - ] - - if not file_paths: - return f"No files found in {scope_path} matching '{glob_pattern}'." - - stats = indexing.sync(file_paths, root=root, tier=tier) - tier_label = "FTS-only" if tier == "fts_only" else "full" - return ( - f"Indexed {stats.files_processed} files ({tier_label}), " - f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s. " - f"Scope: {scope_path}" - ) - - -# --------------------------------------------------------------------------- -# File discovery -# --------------------------------------------------------------------------- - -@mcp.tool() -def find_files( - project_path: str, pattern: str = "**/*", max_results: int = 100 -) -> str: - """Find files in a project by glob pattern. - - Args: - project_path: Absolute path to the project root directory. - pattern: Glob pattern to match files (default "**/*"). - max_results: Maximum number of file paths to return (default 100). - - Returns: - List of matching file paths (relative to project root), one per line. - """ - root = Path(project_path).resolve() - if not root.is_dir(): - return f"Error: project path not found: {root}" - - matches = [] - for p in root.glob(pattern): - if p.is_file() and not should_exclude(p.relative_to(root), DEFAULT_EXCLUDES): - matches.append(str(p.relative_to(root))) - if len(matches) >= max_results: - break - - if not matches: - return "No files found matching the pattern." - - header = f"Found {len(matches)} files" - if len(matches) >= max_results: - header += f" (limited to {max_results})" - return header + ":\n" + "\n".join(matches) - - -# --------------------------------------------------------------------------- -# Model management tools -# --------------------------------------------------------------------------- - -@mcp.tool() -def list_models() -> str: - """List available embedding and reranker models with cache status. - - Shows which models are downloaded locally and ready for use. - Models are needed when using local fastembed mode (no API URL configured). - - Returns: - Table of models with name, type, and installed status. - """ - from codexlens_search import model_manager - from codexlens_search.config import Config - - config = create_config_from_env(".") - models = model_manager.list_known_models(config) - - if not models: - return "No known models found." - - lines = ["| Model | Type | Installed |", "| --- | --- | --- |"] - for m in models: - status = "Yes" if m["installed"] else "No" - lines.append(f"| {m['name']} | {m['type']} | {status} |") - - # Show current config - lines.append("") - if config.embed_api_url: - lines.append(f"Mode: API embedding ({config.embed_api_url})") - else: - lines.append(f"Mode: Local fastembed (model: {config.embed_model})") - return "\n".join(lines) - - -@mcp.tool() -def download_models(embed_model: str = "", reranker_model: str = "") -> str: - """Download embedding and reranker models for local (fastembed) mode. - - Not needed when using API embedding (CODEXLENS_EMBED_API_URL is set). - Downloads are cached — subsequent calls are no-ops if already downloaded. - - Args: - embed_model: Embedding model name (default: BAAI/bge-small-en-v1.5). - reranker_model: Reranker model name (default: Xenova/ms-marco-MiniLM-L-6-v2). - - Returns: - Download status for each model. - """ - from codexlens_search import model_manager - from codexlens_search.config import Config - - config = create_config_from_env(".") - if embed_model: - config.embed_model = embed_model - if reranker_model: - config.reranker_model = reranker_model - - results = [] - for name, kind in [ - (config.embed_model, "embedding"), - (config.reranker_model, "reranker"), - ]: - try: - model_manager.ensure_model(name, config) - results.append(f"{kind}: {name} — ready") - except Exception as e: - results.append(f"{kind}: {name} — failed: {e}") - - return "\n".join(results) - - -# --------------------------------------------------------------------------- -# Entry point -# --------------------------------------------------------------------------- - -def main() -> None: - """Entry point for codexlens-mcp command.""" - logging.basicConfig( - level=logging.INFO, - format="%(levelname)s %(name)s: %(message)s", - ) - mcp.run() - - -if __name__ == "__main__": - main() diff --git a/codex-lens-v2/src/codexlens_search/model_manager.py b/codex-lens-v2/src/codexlens_search/model_manager.py deleted file mode 100644 index 5476c701..00000000 --- a/codex-lens-v2/src/codexlens_search/model_manager.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Lightweight model download manager for fastembed models. - -Handles HuggingFace mirror configuration and cache pre-population so that -fastembed can load models from local cache without network access. -""" -from __future__ import annotations - -import logging -import os -from pathlib import Path - -from .config import Config - -log = logging.getLogger(__name__) - -# Models that fastembed maps internally (HF repo may differ from model_name) -_EMBED_MODEL_FILES = ["*.onnx", "*.json"] -_RERANK_MODEL_FILES = ["*.onnx", "*.json"] - - -def _resolve_cache_dir(config: Config) -> str | None: - """Return cache_dir for fastembed, or None for default.""" - return config.model_cache_dir or None - - -def _apply_mirror(config: Config) -> None: - """Set HF_ENDPOINT env var if mirror is configured.""" - if config.hf_mirror: - os.environ["HF_ENDPOINT"] = config.hf_mirror - - -def _model_is_cached(model_name: str, cache_dir: str | None) -> bool: - """Check if a model already exists in the fastembed/HF hub cache. - - Note: fastembed may remap model names internally (e.g. BAAI/bge-small-en-v1.5 - -> qdrant/bge-small-en-v1.5-onnx-q), so we also search by partial name match. - """ - base = cache_dir or _default_fastembed_cache() - base_path = Path(base) - if not base_path.exists(): - return False - - # Exact match first - safe_name = model_name.replace("/", "--") - model_dir = base_path / f"models--{safe_name}" - if _dir_has_onnx(model_dir): - return True - - # Partial match: fastembed remaps some model names - short_name = model_name.split("/")[-1].lower() - for d in base_path.iterdir(): - if short_name in d.name.lower() and _dir_has_onnx(d): - return True - - return False - - -def _dir_has_onnx(model_dir: Path) -> bool: - """Check if a model directory has at least one ONNX file in snapshots.""" - snapshots = model_dir / "snapshots" - if not snapshots.exists(): - return False - for snap in snapshots.iterdir(): - if list(snap.rglob("*.onnx")): - return True - return False - - -def _default_fastembed_cache() -> str: - """Return fastembed's default cache directory.""" - return os.path.join(os.environ.get("TMPDIR", os.path.join( - os.environ.get("LOCALAPPDATA", os.path.expanduser("~")), - )), "fastembed_cache") - - -def ensure_model(model_name: str, config: Config) -> None: - """Ensure a model is available in the local cache. - - If the model is already cached, this is a no-op. - If not cached, attempts to download via huggingface_hub with mirror support. - """ - cache_dir = _resolve_cache_dir(config) - - if _model_is_cached(model_name, cache_dir): - log.debug("Model %s found in cache", model_name) - return - - log.info("Model %s not in cache, downloading...", model_name) - _apply_mirror(config) - - try: - from huggingface_hub import snapshot_download - - kwargs: dict = { - "repo_id": model_name, - "allow_patterns": ["*.onnx", "*.json"], - } - if cache_dir: - kwargs["cache_dir"] = cache_dir - if config.hf_mirror: - kwargs["endpoint"] = config.hf_mirror - - path = snapshot_download(**kwargs) - log.info("Model %s downloaded to %s", model_name, path) - - # fastembed for some reranker models expects model.onnx but repo may - # only have quantized variants. Create a symlink/copy if needed. - _ensure_model_onnx(Path(path)) - - except ImportError: - log.warning( - "huggingface_hub not installed. Cannot download models. " - "Install with: pip install huggingface-hub" - ) - except Exception as e: - log.warning("Failed to download model %s: %s", model_name, e) - - -def _ensure_model_onnx(model_dir: Path) -> None: - """If model.onnx is missing but a quantized variant exists, copy it.""" - onnx_dir = model_dir / "onnx" - if not onnx_dir.exists(): - onnx_dir = model_dir # some models put onnx at root - - target = onnx_dir / "model.onnx" - if target.exists(): - return - - # Look for quantized alternatives - for name in ["model_quantized.onnx", "model_optimized.onnx", - "model_int8.onnx", "model_uint8.onnx"]: - candidate = onnx_dir / name - if candidate.exists(): - import shutil - shutil.copy2(candidate, target) - log.info("Copied %s -> model.onnx", name) - return - - -def list_known_models(config: Config) -> list[dict]: - """Return info for known embed/reranker models with cache status. - - Checks config defaults plus common alternative models. - Returns list of dicts with keys: name, type, installed, cache_path. - """ - cache_dir = _resolve_cache_dir(config) - base = cache_dir or _default_fastembed_cache() - - # Known embedding models - embed_models = [ - config.embed_model, - "BAAI/bge-small-en-v1.5", - "BAAI/bge-base-en-v1.5", - "BAAI/bge-large-en-v1.5", - "sentence-transformers/all-MiniLM-L6-v2", - ] - - # Known reranker models - reranker_models = [ - config.reranker_model, - "Xenova/ms-marco-MiniLM-L-6-v2", - "BAAI/bge-reranker-base", - "BAAI/bge-reranker-v2-m3", - ] - - seen: set[str] = set() - results: list[dict] = [] - - for name in embed_models: - if name in seen: - continue - seen.add(name) - cache_path = _find_model_cache_path(name, base) - results.append({ - "name": name, - "type": "embedding", - "installed": cache_path is not None, - "cache_path": cache_path, - }) - - for name in reranker_models: - if name in seen: - continue - seen.add(name) - cache_path = _find_model_cache_path(name, base) - results.append({ - "name": name, - "type": "reranker", - "installed": cache_path is not None, - "cache_path": cache_path, - }) - - return results - - -def delete_model(model_name: str, config: Config) -> bool: - """Remove a model from the HF/fastembed cache. - - Returns True if deleted, False if not found. - """ - import shutil - - cache_dir = _resolve_cache_dir(config) - base = cache_dir or _default_fastembed_cache() - cache_path = _find_model_cache_path(model_name, base) - - if cache_path is None: - log.warning("Model %s not found in cache", model_name) - return False - - shutil.rmtree(cache_path) - log.info("Deleted model %s from %s", model_name, cache_path) - return True - - -def _find_model_cache_path(model_name: str, base: str) -> str | None: - """Find the cache directory path for a model, or None if not cached.""" - base_path = Path(base) - if not base_path.exists(): - return None - - # Exact match first - safe_name = model_name.replace("/", "--") - model_dir = base_path / f"models--{safe_name}" - if _dir_has_onnx(model_dir): - return str(model_dir) - - # Partial match: fastembed remaps some model names - short_name = model_name.split("/")[-1].lower() - for d in base_path.iterdir(): - if short_name in d.name.lower() and _dir_has_onnx(d): - return str(d) - - return None - - -def get_cache_kwargs(config: Config) -> dict: - """Return kwargs to pass to fastembed constructors for cache_dir.""" - cache_dir = _resolve_cache_dir(config) - if cache_dir: - return {"cache_dir": cache_dir} - return {} diff --git a/codex-lens-v2/src/codexlens_search/rerank/__init__.py b/codex-lens-v2/src/codexlens_search/rerank/__init__.py deleted file mode 100644 index 2e2832fd..00000000 --- a/codex-lens-v2/src/codexlens_search/rerank/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .base import BaseReranker -from .local import FastEmbedReranker -from .api import APIReranker - -__all__ = ["BaseReranker", "FastEmbedReranker", "APIReranker"] diff --git a/codex-lens-v2/src/codexlens_search/rerank/api.py b/codex-lens-v2/src/codexlens_search/rerank/api.py deleted file mode 100644 index 8633a9ea..00000000 --- a/codex-lens-v2/src/codexlens_search/rerank/api.py +++ /dev/null @@ -1,103 +0,0 @@ -from __future__ import annotations - -import logging -import time - -import httpx - -from codexlens_search.config import Config -from .base import BaseReranker - -logger = logging.getLogger(__name__) - - -class APIReranker(BaseReranker): - """Reranker backed by a remote HTTP API (SiliconFlow/Cohere/Jina format).""" - - def __init__(self, config: Config) -> None: - self._config = config - self._client = httpx.Client( - headers={ - "Authorization": f"Bearer {config.reranker_api_key}", - "Content-Type": "application/json", - }, - ) - - def score_pairs(self, query: str, documents: list[str]) -> list[float]: - if not documents: - return [] - max_tokens = self._config.reranker_api_max_tokens_per_batch - batches = self._split_batches(documents, max_tokens) - scores = [0.0] * len(documents) - for batch in batches: - batch_scores = self._call_api_with_retry(query, batch) - for orig_idx, score in batch_scores.items(): - scores[orig_idx] = score - return scores - - def _split_batches( - self, documents: list[str], max_tokens: int - ) -> list[list[tuple[int, str]]]: - batches: list[list[tuple[int, str]]] = [] - current_batch: list[tuple[int, str]] = [] - current_tokens = 0 - - for idx, text in enumerate(documents): - doc_tokens = len(text) // 4 - if current_tokens + doc_tokens > max_tokens and current_batch: - batches.append(current_batch) - current_batch = [] - current_tokens = 0 - current_batch.append((idx, text)) - current_tokens += doc_tokens - - if current_batch: - batches.append(current_batch) - - return batches - - def _call_api_with_retry( - self, - query: str, - docs: list[tuple[int, str]], - max_retries: int = 3, - ) -> dict[int, float]: - url = self._config.reranker_api_url.rstrip("/") + "/rerank" - payload = { - "model": self._config.reranker_api_model, - "query": query, - "documents": [t for _, t in docs], - } - - last_exc: Exception | None = None - for attempt in range(max_retries): - try: - response = self._client.post(url, json=payload) - except Exception as exc: - last_exc = exc - time.sleep((2 ** attempt) * 0.5) - continue - - if response.status_code in (429, 503): - logger.warning( - "API reranker returned HTTP %s (attempt %d/%d), retrying...", - response.status_code, - attempt + 1, - max_retries, - ) - time.sleep((2 ** attempt) * 0.5) - continue - - response.raise_for_status() - data = response.json() - results = data.get("results", []) - scores: dict[int, float] = {} - for item in results: - local_idx = int(item["index"]) - orig_idx = docs[local_idx][0] - scores[orig_idx] = float(item["relevance_score"]) - return scores - - raise RuntimeError( - f"API reranker failed after {max_retries} attempts. Last error: {last_exc}" - ) diff --git a/codex-lens-v2/src/codexlens_search/rerank/base.py b/codex-lens-v2/src/codexlens_search/rerank/base.py deleted file mode 100644 index 5edaf6db..00000000 --- a/codex-lens-v2/src/codexlens_search/rerank/base.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations -from abc import ABC, abstractmethod - - -class BaseReranker(ABC): - @abstractmethod - def score_pairs(self, query: str, documents: list[str]) -> list[float]: - """Score (query, doc) pairs. Returns list of floats same length as documents.""" diff --git a/codex-lens-v2/src/codexlens_search/rerank/local.py b/codex-lens-v2/src/codexlens_search/rerank/local.py deleted file mode 100644 index 0e50eaf2..00000000 --- a/codex-lens-v2/src/codexlens_search/rerank/local.py +++ /dev/null @@ -1,39 +0,0 @@ -from __future__ import annotations - -from codexlens_search.config import Config -from .base import BaseReranker - - -class FastEmbedReranker(BaseReranker): - """Local reranker backed by fastembed TextCrossEncoder.""" - - def __init__(self, config: Config) -> None: - self._config = config - self._model = None - - def _load(self) -> None: - if self._model is None: - from .. import model_manager - model_manager.ensure_model(self._config.reranker_model, self._config) - - from fastembed.rerank.cross_encoder import TextCrossEncoder - cache_kwargs = model_manager.get_cache_kwargs(self._config) - self._model = TextCrossEncoder( - model_name=self._config.reranker_model, - **cache_kwargs, - ) - - def score_pairs(self, query: str, documents: list[str]) -> list[float]: - self._load() - results = list(self._model.rerank(query, documents)) - if not results: - return [0.0] * len(documents) - # fastembed may return list[float] or list[RerankResult] depending on version - first = results[0] - if isinstance(first, (int, float)): - return [float(s) for s in results] - # Older format: objects with .index and .score - scores = [0.0] * len(documents) - for r in results: - scores[r.index] = float(r.score) - return scores diff --git a/codex-lens-v2/src/codexlens_search/search/__init__.py b/codex-lens-v2/src/codexlens_search/search/__init__.py deleted file mode 100644 index 749b94b9..00000000 --- a/codex-lens-v2/src/codexlens_search/search/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .fts import FTSEngine -from .fusion import reciprocal_rank_fusion, detect_query_intent, QueryIntent, DEFAULT_WEIGHTS -from .pipeline import SearchPipeline, SearchResult - -__all__ = [ - "FTSEngine", "reciprocal_rank_fusion", "detect_query_intent", - "QueryIntent", "DEFAULT_WEIGHTS", "SearchPipeline", "SearchResult", -] diff --git a/codex-lens-v2/src/codexlens_search/search/fts.py b/codex-lens-v2/src/codexlens_search/search/fts.py deleted file mode 100644 index eb4b27fc..00000000 --- a/codex-lens-v2/src/codexlens_search/search/fts.py +++ /dev/null @@ -1,133 +0,0 @@ -from __future__ import annotations - -import sqlite3 -from pathlib import Path - - -class FTSEngine: - def __init__(self, db_path: str | Path) -> None: - self._conn = sqlite3.connect(str(db_path), check_same_thread=False) - self._conn.execute( - "CREATE VIRTUAL TABLE IF NOT EXISTS docs " - "USING fts5(content, tokenize='porter unicode61')" - ) - self._conn.execute( - "CREATE TABLE IF NOT EXISTS docs_meta " - "(id INTEGER PRIMARY KEY, path TEXT, " - "start_line INTEGER DEFAULT 0, end_line INTEGER DEFAULT 0)" - ) - self._conn.commit() - self._migrate_line_columns() - - def _migrate_line_columns(self) -> None: - """Add start_line/end_line columns if missing (for pre-existing DBs).""" - cols = { - row[1] - for row in self._conn.execute("PRAGMA table_info(docs_meta)").fetchall() - } - for col in ("start_line", "end_line"): - if col not in cols: - self._conn.execute( - f"ALTER TABLE docs_meta ADD COLUMN {col} INTEGER DEFAULT 0" - ) - self._conn.commit() - - def add_documents(self, docs: list[tuple]) -> None: - """Add documents in batch. - - docs: list of (id, path, content) or (id, path, content, start_line, end_line). - """ - if not docs: - return - meta_rows = [] - fts_rows = [] - for doc in docs: - if len(doc) >= 5: - doc_id, path, content, sl, el = doc[0], doc[1], doc[2], doc[3], doc[4] - else: - doc_id, path, content = doc[0], doc[1], doc[2] - sl, el = 0, 0 - meta_rows.append((doc_id, path, sl, el)) - fts_rows.append((doc_id, content)) - self._conn.executemany( - "INSERT OR REPLACE INTO docs_meta (id, path, start_line, end_line) " - "VALUES (?, ?, ?, ?)", - meta_rows, - ) - self._conn.executemany( - "INSERT OR REPLACE INTO docs (rowid, content) VALUES (?, ?)", - fts_rows, - ) - self._conn.commit() - - def exact_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]: - """FTS5 MATCH query, return (id, bm25_score) sorted by score descending.""" - try: - rows = self._conn.execute( - "SELECT rowid, bm25(docs) AS score FROM docs " - "WHERE docs MATCH ? ORDER BY score LIMIT ?", - (query, top_k), - ).fetchall() - except sqlite3.OperationalError: - return [] - # bm25 in SQLite FTS5 returns negative values (lower = better match) - # Negate so higher is better - return [(int(row[0]), -float(row[1])) for row in rows] - - def fuzzy_search(self, query: str, top_k: int = 50) -> list[tuple[int, float]]: - """Prefix search: each token + '*', return (id, score) sorted descending.""" - tokens = query.strip().split() - if not tokens: - return [] - prefix_query = " ".join(t + "*" for t in tokens) - try: - rows = self._conn.execute( - "SELECT rowid, bm25(docs) AS score FROM docs " - "WHERE docs MATCH ? ORDER BY score LIMIT ?", - (prefix_query, top_k), - ).fetchall() - except sqlite3.OperationalError: - return [] - return [(int(row[0]), -float(row[1])) for row in rows] - - def get_content(self, doc_id: int) -> str: - """Retrieve content for a doc_id.""" - row = self._conn.execute( - "SELECT content FROM docs WHERE rowid = ?", (doc_id,) - ).fetchone() - return row[0] if row else "" - - def get_chunk_ids_by_path(self, path: str) -> list[int]: - """Return all doc IDs associated with a given file path.""" - rows = self._conn.execute( - "SELECT id FROM docs_meta WHERE path = ?", (path,) - ).fetchall() - return [r[0] for r in rows] - - def delete_by_path(self, path: str) -> int: - """Delete all docs and docs_meta rows for a given file path. - - Returns the number of deleted documents. - """ - ids = self.get_chunk_ids_by_path(path) - if not ids: - return 0 - placeholders = ",".join("?" for _ in ids) - self._conn.execute( - f"DELETE FROM docs WHERE rowid IN ({placeholders})", ids - ) - self._conn.execute( - f"DELETE FROM docs_meta WHERE id IN ({placeholders})", ids - ) - self._conn.commit() - return len(ids) - - def get_doc_meta(self, doc_id: int) -> tuple[str, int, int]: - """Return (path, start_line, end_line) for a doc_id.""" - row = self._conn.execute( - "SELECT path, start_line, end_line FROM docs_meta WHERE id = ?", - (doc_id,), - ).fetchone() - if row: - return row[0], row[1] or 0, row[2] or 0 - return "", 0, 0 diff --git a/codex-lens-v2/src/codexlens_search/search/fusion.py b/codex-lens-v2/src/codexlens_search/search/fusion.py deleted file mode 100644 index a51d7534..00000000 --- a/codex-lens-v2/src/codexlens_search/search/fusion.py +++ /dev/null @@ -1,106 +0,0 @@ -from __future__ import annotations - -import re -from enum import Enum - -DEFAULT_WEIGHTS: dict[str, float] = { - "exact": 0.25, - "fuzzy": 0.10, - "vector": 0.50, - "graph": 0.15, -} - -_CODE_CAMEL_RE = re.compile(r"[a-z][A-Z]") -_CODE_SNAKE_RE = re.compile(r"\b[a-z_]+_[a-z_]+\b") -_CODE_SYMBOLS_RE = re.compile(r"[.\[\](){}]|->|::") -_CODE_KEYWORDS_RE = re.compile(r"\b(import|def|class|return|from|async|await|lambda|yield)\b") -_QUESTION_WORDS_RE = re.compile(r"\b(how|what|why|when|where|which|who|does|do|is|are|can|should)\b", re.IGNORECASE) - - -class QueryIntent(Enum): - CODE_SYMBOL = "code_symbol" - NATURAL_LANGUAGE = "natural" - MIXED = "mixed" - - -def detect_query_intent(query: str) -> QueryIntent: - """Detect whether query is a code symbol, natural language, or mixed.""" - words = query.strip().split() - word_count = len(words) - - code_signals = 0 - natural_signals = 0 - - if _CODE_CAMEL_RE.search(query): - code_signals += 2 - if _CODE_SNAKE_RE.search(query): - code_signals += 2 - if _CODE_SYMBOLS_RE.search(query): - code_signals += 2 - if _CODE_KEYWORDS_RE.search(query): - code_signals += 2 - if "`" in query: - code_signals += 1 - if word_count < 4: - code_signals += 1 - - if _QUESTION_WORDS_RE.search(query): - natural_signals += 2 - if word_count > 5: - natural_signals += 2 - if code_signals == 0 and word_count >= 3: - natural_signals += 1 - - if code_signals >= 2 and natural_signals == 0: - return QueryIntent.CODE_SYMBOL - if natural_signals >= 2 and code_signals == 0: - return QueryIntent.NATURAL_LANGUAGE - if code_signals >= 2 and natural_signals == 0: - return QueryIntent.CODE_SYMBOL - if natural_signals > code_signals: - return QueryIntent.NATURAL_LANGUAGE - if code_signals > natural_signals: - return QueryIntent.CODE_SYMBOL - return QueryIntent.MIXED - - -def get_adaptive_weights(intent: QueryIntent, base: dict | None = None) -> dict[str, float]: - """Return weights adapted to query intent.""" - weights = dict(base or DEFAULT_WEIGHTS) - if intent == QueryIntent.CODE_SYMBOL: - weights["exact"] = 0.45 - weights["vector"] = 0.35 - elif intent == QueryIntent.NATURAL_LANGUAGE: - weights["vector"] = 0.65 - weights["exact"] = 0.15 - # MIXED: use weights as-is - return weights - - -def reciprocal_rank_fusion( - results: dict[str, list[tuple[int, float]]], - weights: dict[str, float] | None = None, - k: int = 60, -) -> list[tuple[int, float]]: - """Fuse ranked result lists using Reciprocal Rank Fusion. - - results: {source_name: [(doc_id, score), ...]} each list sorted desc by score. - weights: weight per source (defaults to equal weight across all sources). - k: RRF constant (default 60). - Returns sorted list of (doc_id, fused_score) descending. - """ - if not results: - return [] - - sources = list(results.keys()) - if weights is None: - equal_w = 1.0 / len(sources) - weights = {s: equal_w for s in sources} - - scores: dict[int, float] = {} - for source, ranked_list in results.items(): - w = weights.get(source, 0.0) - for rank, (doc_id, _) in enumerate(ranked_list, start=1): - scores[doc_id] = scores.get(doc_id, 0.0) + w * (1.0 / (k + rank)) - - return sorted(scores.items(), key=lambda x: x[1], reverse=True) diff --git a/codex-lens-v2/src/codexlens_search/search/pipeline.py b/codex-lens-v2/src/codexlens_search/search/pipeline.py deleted file mode 100644 index 0331a11f..00000000 --- a/codex-lens-v2/src/codexlens_search/search/pipeline.py +++ /dev/null @@ -1,353 +0,0 @@ -from __future__ import annotations - -import logging -from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass - -import numpy as np - -from ..config import Config -from ..core.base import BaseANNIndex, BaseBinaryIndex -from ..embed import BaseEmbedder -from ..indexing.metadata import MetadataStore -from ..rerank import BaseReranker -from .fts import FTSEngine -from .fusion import ( - DEFAULT_WEIGHTS, - detect_query_intent, - get_adaptive_weights, - reciprocal_rank_fusion, -) - -_log = logging.getLogger(__name__) - -_VALID_QUALITIES = ("fast", "balanced", "thorough", "auto") - - -@dataclass -class SearchResult: - id: int - path: str - score: float - snippet: str = "" - line: int = 0 - end_line: int = 0 - content: str = "" - - -class SearchPipeline: - def __init__( - self, - embedder: BaseEmbedder, - binary_store: BaseBinaryIndex, - ann_index: BaseANNIndex, - reranker: BaseReranker, - fts: FTSEngine, - config: Config, - metadata_store: MetadataStore | None = None, - ) -> None: - self._embedder = embedder - self._binary_store = binary_store - self._ann_index = ann_index - self._reranker = reranker - self._fts = fts - self._config = config - self._metadata_store = metadata_store - - # -- Helper: check if vector index has data ---------------------------- - - def _has_vector_index(self) -> bool: - """Check if the binary store has any indexed entries.""" - try: - return len(self._binary_store) > 0 - except Exception: - return False - - # -- Helper: vector search (binary coarse + ANN fine) ----------------- - - def _vector_search( - self, query_vec: np.ndarray - ) -> list[tuple[int, float]]: - """Run binary coarse search then ANN fine search and intersect.""" - cfg = self._config - - # Binary coarse search -> candidate_ids set - candidate_ids_list, _ = self._binary_store.coarse_search( - query_vec, top_k=cfg.binary_top_k - ) - candidate_ids = set(candidate_ids_list) - - # ANN fine search on full index, then intersect with binary candidates - ann_ids, ann_scores = self._ann_index.fine_search( - query_vec, top_k=cfg.ann_top_k - ) - # Keep only results that appear in binary candidates (2-stage funnel) - vector_results: list[tuple[int, float]] = [ - (int(doc_id), float(score)) - for doc_id, score in zip(ann_ids, ann_scores) - if int(doc_id) in candidate_ids - ] - # Fall back to full ANN results if intersection is empty - if not vector_results: - vector_results = [ - (int(doc_id), float(score)) - for doc_id, score in zip(ann_ids, ann_scores) - ] - return vector_results - - # -- Helper: binary coarse search only -------------------------------- - - def _binary_coarse_search( - self, query_vec: np.ndarray - ) -> list[tuple[int, float]]: - """Run binary coarse search only (no ANN fine search).""" - cfg = self._config - candidate_ids, distances = self._binary_store.coarse_search( - query_vec, top_k=cfg.binary_top_k - ) - return [ - (int(doc_id), float(dist)) - for doc_id, dist in zip(candidate_ids, distances) - ] - - # -- Helper: FTS search (exact + fuzzy) ------------------------------ - - def _fts_search( - self, query: str - ) -> tuple[list[tuple[int, float]], list[tuple[int, float]]]: - """Run exact and fuzzy full-text search.""" - cfg = self._config - exact_results = self._fts.exact_search(query, top_k=cfg.fts_top_k) - fuzzy_results = self._fts.fuzzy_search(query, top_k=cfg.fts_top_k) - return exact_results, fuzzy_results - - # -- Helper: filter deleted IDs --------------------------------------- - - def _filter_deleted( - self, fused: list[tuple[int, float]] - ) -> list[tuple[int, float]]: - """Remove tombstoned chunk IDs from results.""" - if self._metadata_store is not None: - deleted_ids = self._metadata_store.get_deleted_ids() - if deleted_ids: - fused = [ - (doc_id, score) - for doc_id, score in fused - if doc_id not in deleted_ids - ] - return fused - - # -- Helper: rerank and build results --------------------------------- - - def _rerank_and_build( - self, - query: str, - fused: list[tuple[int, float]], - final_top_k: int, - use_reranker: bool = True, - ) -> list[SearchResult]: - """Rerank candidates (optionally) and build SearchResult list.""" - if not fused: - return [] - - if use_reranker: - rerank_ids = [doc_id for doc_id, _ in fused[:50]] - contents = [self._fts.get_content(doc_id) for doc_id in rerank_ids] - rerank_scores = self._reranker.score_pairs(query, contents) - ranked = sorted( - zip(rerank_ids, rerank_scores), key=lambda x: x[1], reverse=True - ) - else: - ranked = fused - - results: list[SearchResult] = [] - for doc_id, score in ranked[:final_top_k]: - path, start_line, end_line = self._fts.get_doc_meta(doc_id) - full_content = self._fts.get_content(doc_id) - results.append( - SearchResult( - id=doc_id, - path=path, - score=float(score), - snippet=full_content[:200], - line=start_line, - end_line=end_line, - content=full_content, - ) - ) - return results - - # -- Helper: record access for tier tracking -------------------------- - - def _record_access(self, results: list[SearchResult]) -> None: - """Record file access for data tier tracking.""" - if results and self._metadata_store is not None: - unique_paths = list({r.path for r in results}) - try: - self._metadata_store.record_access_batch(unique_paths) - except Exception: - _log.debug("Failed to record access for tier tracking", exc_info=True) - - # -- Quality-routed search methods ------------------------------------ - - def _search_fast( - self, query: str, final_top_k: int - ) -> list[SearchResult]: - """FTS-only search with reranking. No embedding needed.""" - exact_results, fuzzy_results = self._fts_search(query) - - fusion_input: dict[str, list[tuple[int, float]]] = {} - if exact_results: - fusion_input["exact"] = exact_results - if fuzzy_results: - fusion_input["fuzzy"] = fuzzy_results - - if not fusion_input: - return [] - - fused = reciprocal_rank_fusion( - fusion_input, weights={"exact": 0.7, "fuzzy": 0.3}, - k=self._config.fusion_k, - ) - fused = self._filter_deleted(fused) - return self._rerank_and_build(query, fused, final_top_k, use_reranker=True) - - def _search_balanced( - self, query: str, final_top_k: int - ) -> list[SearchResult]: - """FTS + binary coarse search with RRF fusion and reranking. - - Embeds the query for binary coarse search but skips ANN fine search. - """ - intent = detect_query_intent(query) - weights = get_adaptive_weights(intent, self._config.fusion_weights) - - query_vec = self._embedder.embed_single(query) - - # Parallel: binary coarse + FTS - coarse_results: list[tuple[int, float]] = [] - exact_results: list[tuple[int, float]] = [] - fuzzy_results: list[tuple[int, float]] = [] - - with ThreadPoolExecutor(max_workers=2) as pool: - coarse_future = pool.submit(self._binary_coarse_search, query_vec) - fts_future = pool.submit(self._fts_search, query) - - try: - coarse_results = coarse_future.result() - except Exception: - _log.warning("Binary coarse search failed", exc_info=True) - - try: - exact_results, fuzzy_results = fts_future.result() - except Exception: - _log.warning("FTS search failed", exc_info=True) - - fusion_input: dict[str, list[tuple[int, float]]] = {} - if coarse_results: - fusion_input["vector"] = coarse_results - if exact_results: - fusion_input["exact"] = exact_results - if fuzzy_results: - fusion_input["fuzzy"] = fuzzy_results - - if not fusion_input: - return [] - - fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=self._config.fusion_k) - fused = self._filter_deleted(fused) - return self._rerank_and_build(query, fused, final_top_k, use_reranker=True) - - def _search_thorough( - self, query: str, final_top_k: int - ) -> list[SearchResult]: - """Full 2-stage vector + FTS + reranking pipeline (original behavior).""" - cfg = self._config - - intent = detect_query_intent(query) - weights = get_adaptive_weights(intent, cfg.fusion_weights) - - query_vec = self._embedder.embed_single(query) - - # Parallel vector + FTS search - vector_results: list[tuple[int, float]] = [] - exact_results: list[tuple[int, float]] = [] - fuzzy_results: list[tuple[int, float]] = [] - - with ThreadPoolExecutor(max_workers=2) as pool: - vec_future = pool.submit(self._vector_search, query_vec) - fts_future = pool.submit(self._fts_search, query) - - try: - vector_results = vec_future.result() - except Exception: - _log.warning("Vector search failed, using empty results", exc_info=True) - - try: - exact_results, fuzzy_results = fts_future.result() - except Exception: - _log.warning("FTS search failed, using empty results", exc_info=True) - - fusion_input: dict[str, list[tuple[int, float]]] = {} - if vector_results: - fusion_input["vector"] = vector_results - if exact_results: - fusion_input["exact"] = exact_results - if fuzzy_results: - fusion_input["fuzzy"] = fuzzy_results - - if not fusion_input: - return [] - - fused = reciprocal_rank_fusion(fusion_input, weights=weights, k=cfg.fusion_k) - fused = self._filter_deleted(fused) - return self._rerank_and_build(query, fused, final_top_k, use_reranker=True) - - # -- Main search entry point ----------------------------------------- - - def search( - self, - query: str, - top_k: int | None = None, - quality: str | None = None, - ) -> list[SearchResult]: - """Search with quality-based routing. - - Args: - query: Search query string. - top_k: Maximum results to return. - quality: Search quality tier: - - 'fast': FTS-only + rerank (no embedding, no vector search) - - 'balanced': FTS + binary coarse + rerank (no ANN fine search) - - 'thorough': Full 2-stage vector + FTS + reranking - - 'auto': Selects 'thorough' if vectors exist, else 'fast' - - None: Uses config.default_search_quality - - Returns: - List of SearchResult ordered by relevance. - """ - cfg = self._config - final_top_k = top_k if top_k is not None else cfg.reranker_top_k - - # Resolve quality tier - effective_quality = quality or cfg.default_search_quality - if effective_quality not in _VALID_QUALITIES: - _log.warning( - "Invalid search quality '%s', falling back to 'auto'", - effective_quality, - ) - effective_quality = "auto" - - # Auto-detect: use thorough if vector index has data, else fast - if effective_quality == "auto": - effective_quality = "thorough" if self._has_vector_index() else "fast" - - if effective_quality == "fast": - results = self._search_fast(query, final_top_k) - elif effective_quality == "balanced": - results = self._search_balanced(query, final_top_k) - else: - results = self._search_thorough(query, final_top_k) - - self._record_access(results) - return results diff --git a/codex-lens-v2/src/codexlens_search/watcher/__init__.py b/codex-lens-v2/src/codexlens_search/watcher/__init__.py deleted file mode 100644 index 94cd3919..00000000 --- a/codex-lens-v2/src/codexlens_search/watcher/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -"""File watcher and incremental indexer for codexlens-search. - -Requires the ``watcher`` extra:: - - pip install codexlens-search[watcher] -""" -from codexlens_search.watcher.events import ChangeType, FileEvent, WatcherConfig -from codexlens_search.watcher.file_watcher import FileWatcher -from codexlens_search.watcher.incremental_indexer import IncrementalIndexer - -__all__ = [ - "ChangeType", - "FileEvent", - "FileWatcher", - "IncrementalIndexer", - "WatcherConfig", -] diff --git a/codex-lens-v2/src/codexlens_search/watcher/events.py b/codex-lens-v2/src/codexlens_search/watcher/events.py deleted file mode 100644 index f69eb9c6..00000000 --- a/codex-lens-v2/src/codexlens_search/watcher/events.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Event types for file watcher.""" -from __future__ import annotations - -import time -from dataclasses import dataclass, field -from enum import Enum -from pathlib import Path -from typing import Optional, Set - - -class ChangeType(Enum): - """Type of file system change.""" - - CREATED = "created" - MODIFIED = "modified" - DELETED = "deleted" - - -@dataclass -class FileEvent: - """A file system change event.""" - - path: Path - change_type: ChangeType - timestamp: float = field(default_factory=time.time) - - -@dataclass -class WatcherConfig: - """Configuration for file watcher. - - Attributes: - debounce_ms: Milliseconds to wait after the last event before - flushing the batch. Default 500ms for low-latency indexing. - ignored_patterns: Directory/file name patterns to skip. Any - path component matching one of these strings is ignored. - """ - - debounce_ms: int = 500 - ignored_patterns: Set[str] = field(default_factory=lambda: { - # Version control - ".git", ".svn", ".hg", - # Python - ".venv", "venv", "env", "__pycache__", ".pytest_cache", - ".mypy_cache", ".ruff_cache", - # Node.js - "node_modules", "bower_components", - # Build artifacts - "dist", "build", "out", "target", "bin", "obj", - "coverage", "htmlcov", - # IDE / Editor - ".idea", ".vscode", ".vs", - # Package / cache - ".cache", ".parcel-cache", ".turbo", ".next", ".nuxt", ".codexlens", - # Logs / temp - "logs", "tmp", "temp", - }) diff --git a/codex-lens-v2/src/codexlens_search/watcher/file_watcher.py b/codex-lens-v2/src/codexlens_search/watcher/file_watcher.py deleted file mode 100644 index 20289373..00000000 --- a/codex-lens-v2/src/codexlens_search/watcher/file_watcher.py +++ /dev/null @@ -1,285 +0,0 @@ -"""File system watcher using watchdog library. - -Ported from codex-lens v1 with simplifications: -- Removed v1-specific Config dependency (uses WatcherConfig directly) -- Removed MAX_QUEUE_SIZE (v2 processes immediately via debounce) -- Removed flush.signal file mechanism -- Added optional JSONL output mode for bridge CLI integration -""" -from __future__ import annotations - -import json -import logging -import sys -import threading -import time -from pathlib import Path -from typing import Callable, Dict, List, Optional - -from watchdog.events import FileSystemEventHandler -from watchdog.observers import Observer - -from .events import ChangeType, FileEvent, WatcherConfig -from .incremental_indexer import IncrementalIndexer - -logger = logging.getLogger(__name__) - - -# Event priority for deduplication: higher wins when same file appears -# multiple times within one debounce window. -_EVENT_PRIORITY: Dict[ChangeType, int] = { - ChangeType.CREATED: 1, - ChangeType.MODIFIED: 2, - ChangeType.DELETED: 3, -} - - -class _Handler(FileSystemEventHandler): - """Internal watchdog handler that converts events to FileEvent.""" - - def __init__(self, watcher: FileWatcher) -> None: - super().__init__() - self._watcher = watcher - - def on_created(self, event) -> None: - if not event.is_directory: - self._watcher._on_raw_event(event.src_path, ChangeType.CREATED) - - def on_modified(self, event) -> None: - if not event.is_directory: - self._watcher._on_raw_event(event.src_path, ChangeType.MODIFIED) - - def on_deleted(self, event) -> None: - if not event.is_directory: - self._watcher._on_raw_event(event.src_path, ChangeType.DELETED) - - def on_moved(self, event) -> None: - if event.is_directory: - return - # Treat move as delete old + create new - self._watcher._on_raw_event(event.src_path, ChangeType.DELETED) - self._watcher._on_raw_event(event.dest_path, ChangeType.CREATED) - - -class FileWatcher: - """File system watcher with debounce and event deduplication. - - Monitors a directory recursively using watchdog. Raw events are - collected into a queue. After *debounce_ms* of silence the queue - is flushed: events are deduplicated per-path (keeping the highest - priority change type) and delivered via *on_changes*. - - Example:: - - def handle(events: list[FileEvent]) -> None: - for e in events: - print(e.change_type.value, e.path) - - watcher = FileWatcher(Path("."), WatcherConfig(), handle) - watcher.start() - watcher.wait() - """ - - def __init__( - self, - root_path: Path, - config: WatcherConfig, - on_changes: Callable[[List[FileEvent]], None], - ) -> None: - self.root_path = Path(root_path).resolve() - self.config = config - self.on_changes = on_changes - - self._observer: Optional[Observer] = None - self._running = False - self._stop_event = threading.Event() - self._lock = threading.RLock() - - # Pending events keyed by resolved path - self._pending: Dict[Path, FileEvent] = {} - self._pending_lock = threading.Lock() - - # True-debounce timer: resets on every new event - self._flush_timer: Optional[threading.Timer] = None - - # ------------------------------------------------------------------ - # Filtering - # ------------------------------------------------------------------ - - def _should_watch(self, path: Path) -> bool: - """Return True if *path* should not be ignored.""" - parts = path.parts - for pattern in self.config.ignored_patterns: - if pattern in parts: - return False - return True - - # ------------------------------------------------------------------ - # Event intake (called from watchdog thread) - # ------------------------------------------------------------------ - - def _on_raw_event(self, raw_path: str, change_type: ChangeType) -> None: - """Accept a raw watchdog event, filter, and queue with debounce.""" - path = Path(raw_path).resolve() - - if not self._should_watch(path): - return - - event = FileEvent(path=path, change_type=change_type) - - with self._pending_lock: - existing = self._pending.get(path) - if existing is None or _EVENT_PRIORITY[change_type] >= _EVENT_PRIORITY[existing.change_type]: - self._pending[path] = event - - # Cancel previous timer and start a new one (true debounce) - if self._flush_timer is not None: - self._flush_timer.cancel() - - self._flush_timer = threading.Timer( - self.config.debounce_ms / 1000.0, - self._flush, - ) - self._flush_timer.daemon = True - self._flush_timer.start() - - # ------------------------------------------------------------------ - # Flush - # ------------------------------------------------------------------ - - def _flush(self) -> None: - """Deduplicate and deliver pending events.""" - with self._pending_lock: - if not self._pending: - return - events = list(self._pending.values()) - self._pending.clear() - self._flush_timer = None - - try: - self.on_changes(events) - except Exception: - logger.exception("Error in on_changes callback") - - def flush_now(self) -> None: - """Immediately flush pending events (manual trigger).""" - with self._pending_lock: - if self._flush_timer is not None: - self._flush_timer.cancel() - self._flush_timer = None - self._flush() - - # ------------------------------------------------------------------ - # Lifecycle - # ------------------------------------------------------------------ - - def start(self) -> None: - """Start watching the directory (non-blocking).""" - with self._lock: - if self._running: - logger.warning("Watcher already running") - return - - if not self.root_path.exists(): - raise ValueError(f"Root path does not exist: {self.root_path}") - - self._observer = Observer() - handler = _Handler(self) - self._observer.schedule(handler, str(self.root_path), recursive=True) - - self._running = True - self._stop_event.clear() - self._observer.start() - logger.info("Started watching: %s", self.root_path) - - def stop(self) -> None: - """Stop watching and flush remaining events.""" - with self._lock: - if not self._running: - return - - self._running = False - self._stop_event.set() - - with self._pending_lock: - if self._flush_timer is not None: - self._flush_timer.cancel() - self._flush_timer = None - - if self._observer is not None: - self._observer.stop() - self._observer.join(timeout=5.0) - self._observer = None - - # Deliver any remaining events - self._flush() - logger.info("Stopped watching: %s", self.root_path) - - def wait(self) -> None: - """Block until stopped (Ctrl+C or stop() from another thread).""" - try: - while self._running: - self._stop_event.wait(timeout=1.0) - except KeyboardInterrupt: - logger.info("Received interrupt, stopping watcher...") - self.stop() - - @property - def is_running(self) -> bool: - """True if the watcher is currently running.""" - return self._running - - # ------------------------------------------------------------------ - # JSONL output helper - # ------------------------------------------------------------------ - - @staticmethod - def events_to_jsonl(events: List[FileEvent]) -> str: - """Serialize a batch of events as newline-delimited JSON. - - Each line is a JSON object with keys: ``path``, ``change_type``, - ``timestamp``. Useful for bridge CLI integration. - """ - lines: list[str] = [] - for evt in events: - obj = { - "path": str(evt.path), - "change_type": evt.change_type.value, - "timestamp": evt.timestamp, - } - lines.append(json.dumps(obj, ensure_ascii=False)) - return "\n".join(lines) - - @staticmethod - def jsonl_callback(events: List[FileEvent]) -> None: - """Callback that writes JSONL to stdout. - - Suitable as *on_changes* when running in bridge/CLI mode:: - - watcher = FileWatcher(root, config, FileWatcher.jsonl_callback) - """ - output = FileWatcher.events_to_jsonl(events) - if output: - sys.stdout.write(output + "\n") - sys.stdout.flush() - - @classmethod - def create_with_indexer( - cls, - root_path: Path, - config: WatcherConfig, - indexer: IncrementalIndexer, - ) -> "FileWatcher": - """Create a FileWatcher wired to an IncrementalIndexer's async path. - - Uses ``indexer.process_events_async()`` as the callback so that - events are debounced and batched within the indexer before - processing, preventing redundant per-file pipeline startups. - - Example:: - - indexer = IncrementalIndexer(pipeline, root=root) - watcher = FileWatcher.create_with_indexer(root, config, indexer) - watcher.start() - """ - return cls(root_path, config, indexer.process_events_async) diff --git a/codex-lens-v2/src/codexlens_search/watcher/incremental_indexer.py b/codex-lens-v2/src/codexlens_search/watcher/incremental_indexer.py deleted file mode 100644 index 159d891c..00000000 --- a/codex-lens-v2/src/codexlens_search/watcher/incremental_indexer.py +++ /dev/null @@ -1,185 +0,0 @@ -"""Incremental indexer that processes FileEvents via IndexingPipeline. - -Ported from codex-lens v1 with simplifications: -- Uses IndexingPipeline.index_file() / remove_file() directly -- No v1-specific Config, ParserFactory, DirIndexStore dependencies -- Per-file error isolation: one failure does not stop batch processing -- Debounce batching: process_events_async() buffers events and flushes - after a configurable window to prevent redundant per-file pipeline startups -""" -from __future__ import annotations - -import logging -import threading -from dataclasses import dataclass, field -from pathlib import Path -from typing import List, Optional - -from codexlens_search.indexing.pipeline import IndexingPipeline - -from .events import ChangeType, FileEvent - -logger = logging.getLogger(__name__) - - -@dataclass -class BatchResult: - """Result of processing a batch of file events.""" - - files_indexed: int = 0 - files_removed: int = 0 - chunks_created: int = 0 - errors: List[str] = field(default_factory=list) - - @property - def total_processed(self) -> int: - return self.files_indexed + self.files_removed - - @property - def has_errors(self) -> bool: - return len(self.errors) > 0 - - -class IncrementalIndexer: - """Routes file change events to IndexingPipeline operations. - - CREATED / MODIFIED events call ``pipeline.index_file()``. - DELETED events call ``pipeline.remove_file()``. - - Each file is processed in isolation so that a single failure - does not prevent the rest of the batch from being indexed. - - Example:: - - indexer = IncrementalIndexer(pipeline, root=Path("/project")) - result = indexer.process_events([ - FileEvent(Path("src/main.py"), ChangeType.MODIFIED), - ]) - print(f"Indexed {result.files_indexed}, removed {result.files_removed}") - """ - - def __init__( - self, - pipeline: IndexingPipeline, - *, - root: Optional[Path] = None, - debounce_window_ms: int = 500, - ) -> None: - """Initialize the incremental indexer. - - Args: - pipeline: The indexing pipeline with metadata store configured. - root: Optional project root for computing relative paths. - If None, absolute paths are used as identifiers. - debounce_window_ms: Milliseconds to buffer events before flushing - in process_events_async(). Default 500ms. - """ - self._pipeline = pipeline - self._root = root - self._debounce_window_ms = debounce_window_ms - self._event_buffer: List[FileEvent] = [] - self._buffer_lock = threading.Lock() - self._flush_timer: Optional[threading.Timer] = None - - def process_events(self, events: List[FileEvent]) -> BatchResult: - """Process a batch of file events with per-file error isolation. - - Args: - events: List of file events to process. - - Returns: - BatchResult with per-batch statistics. - """ - result = BatchResult() - - for event in events: - try: - if event.change_type in (ChangeType.CREATED, ChangeType.MODIFIED): - self._handle_index(event, result) - elif event.change_type == ChangeType.DELETED: - self._handle_remove(event, result) - except Exception as exc: - error_msg = ( - f"Error processing {event.path} " - f"({event.change_type.value}): " - f"{type(exc).__name__}: {exc}" - ) - logger.error(error_msg) - result.errors.append(error_msg) - - if result.total_processed > 0: - logger.info( - "Batch complete: %d indexed, %d removed, %d errors", - result.files_indexed, - result.files_removed, - len(result.errors), - ) - - return result - - def process_events_async(self, events: List[FileEvent]) -> None: - """Buffer events and flush after the debounce window expires. - - Non-blocking: events are accumulated in an internal buffer. - When no new events arrive within *debounce_window_ms*, the buffer - is flushed and all accumulated events are processed as a single - batch via process_events(). - - Args: - events: List of file events to buffer. - """ - with self._buffer_lock: - self._event_buffer.extend(events) - - # Cancel previous timer and start a new one (true debounce) - if self._flush_timer is not None: - self._flush_timer.cancel() - - self._flush_timer = threading.Timer( - self._debounce_window_ms / 1000.0, - self._flush_buffer, - ) - self._flush_timer.daemon = True - self._flush_timer.start() - - def _flush_buffer(self) -> None: - """Flush the event buffer and process all accumulated events.""" - with self._buffer_lock: - if not self._event_buffer: - return - events = list(self._event_buffer) - self._event_buffer.clear() - self._flush_timer = None - - # Deduplicate: keep the last event per path - seen: dict[Path, FileEvent] = {} - for event in events: - seen[event.path] = event - deduped = list(seen.values()) - - logger.debug( - "Flushing debounce buffer: %d events (%d after dedup)", - len(events), len(deduped), - ) - self.process_events(deduped) - - def _handle_index(self, event: FileEvent, result: BatchResult) -> None: - """Index a created or modified file.""" - stats = self._pipeline.index_file( - event.path, - root=self._root, - force=(event.change_type == ChangeType.MODIFIED), - ) - if stats.files_processed > 0: - result.files_indexed += 1 - result.chunks_created += stats.chunks_created - - def _handle_remove(self, event: FileEvent, result: BatchResult) -> None: - """Remove a deleted file from the index.""" - rel_path = ( - str(event.path.relative_to(self._root)) - if self._root - else str(event.path) - ) - self._pipeline.remove_file(rel_path) - result.files_removed += 1 diff --git a/codex-lens-v2/tests/__init__.py b/codex-lens-v2/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/codex-lens-v2/tests/integration/__init__.py b/codex-lens-v2/tests/integration/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/codex-lens-v2/tests/integration/conftest.py b/codex-lens-v2/tests/integration/conftest.py deleted file mode 100644 index 9c6012e8..00000000 --- a/codex-lens-v2/tests/integration/conftest.py +++ /dev/null @@ -1,108 +0,0 @@ -import pytest -import numpy as np -import tempfile -from pathlib import Path - -from codexlens_search.config import Config -from codexlens_search.core import ANNIndex, BinaryStore -from codexlens_search.embed.base import BaseEmbedder -from codexlens_search.rerank.base import BaseReranker -from codexlens_search.search.fts import FTSEngine -from codexlens_search.search.pipeline import SearchPipeline - -# Test documents: 20 code snippets with id, path, content -TEST_DOCS = [ - (0, "auth.py", "def authenticate(user, password): return check_hash(password, user.hash)"), - (1, "auth.py", "def authorize(user, permission): return permission in user.roles"), - (2, "models.py", "class User: def __init__(self, name, email): self.name = name; self.email = email"), - (3, "models.py", "class Session: token = None; expires_at = None"), - (4, "middleware.py", "def auth_middleware(request): token = request.headers.get('Authorization')"), - (5, "utils.py", "def hash_password(password): import bcrypt; return bcrypt.hashpw(password)"), - (6, "config.py", "DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///db.sqlite3')"), - (7, "search.py", "def search_users(query): return User.objects.filter(name__icontains=query)"), - (8, "api.py", "def get_user(request, user_id): user = User.objects.get(id=user_id)"), - (9, "api.py", "def create_user(request): data = request.json(); user = User(**data)"), - (10, "tests.py", "def test_authenticate(): assert authenticate('admin', 'pass') is not None"), - (11, "tests.py", "def test_search(): results = search_users('alice'); assert len(results) > 0"), - (12, "router.py", "app.route('/users', methods=['GET'])(list_users)"), - (13, "router.py", "app.route('/login', methods=['POST'])(login_handler)"), - (14, "db.py", "def get_connection(): return sqlite3.connect(DATABASE_URL)"), - (15, "cache.py", "def cache_get(key): return redis_client.get(key)"), - (16, "cache.py", "def cache_set(key, value, ttl=3600): redis_client.setex(key, ttl, value)"), - (17, "errors.py", "class AuthError(Exception): status_code = 401"), - (18, "errors.py", "class NotFoundError(Exception): status_code = 404"), - (19, "validators.py", "def validate_email(email): return '@' in email and '.' in email.split('@')[1]"), -] - -DIM = 32 # Use small dim for fast tests - - -def make_stable_vec(doc_id: int, dim: int = DIM) -> np.ndarray: - """Generate a deterministic float32 vector for a given doc_id.""" - rng = np.random.default_rng(seed=doc_id) - vec = rng.standard_normal(dim).astype(np.float32) - vec /= np.linalg.norm(vec) - return vec - - -class MockEmbedder(BaseEmbedder): - """Returns stable deterministic vectors based on content hash.""" - - def embed_single(self, text: str) -> np.ndarray: - seed = hash(text) % (2**31) - rng = np.random.default_rng(seed=seed) - vec = rng.standard_normal(DIM).astype(np.float32) - vec /= np.linalg.norm(vec) - return vec - - def embed_batch(self, texts: list[str]) -> list[np.ndarray]: - return [self.embed_single(t) for t in texts] - - def embed(self, texts: list[str]) -> list[np.ndarray]: - """Called by SearchPipeline as self._embedder.embed([query])[0].""" - return self.embed_batch(texts) - - -class MockReranker(BaseReranker): - """Returns score based on simple keyword overlap.""" - - def score_pairs(self, query: str, documents: list[str]) -> list[float]: - query_words = set(query.lower().split()) - scores = [] - for doc in documents: - doc_words = set(doc.lower().split()) - overlap = len(query_words & doc_words) - scores.append(float(overlap) / max(len(query_words), 1)) - return scores - - -@pytest.fixture -def config(): - return Config.small() # hnsw_ef=50, hnsw_M=16, binary_top_k=50, ann_top_k=20, rerank_top_k=10 - - -@pytest.fixture -def search_pipeline(tmp_path, config): - """Build a full SearchPipeline with 20 test docs indexed.""" - embedder = MockEmbedder() - binary_store = BinaryStore(tmp_path / "binary", dim=DIM, config=config) - ann_index = ANNIndex(tmp_path / "ann.hnsw", dim=DIM, config=config) - fts = FTSEngine(tmp_path / "fts.db") - reranker = MockReranker() - - # Index all test docs - ids = np.array([d[0] for d in TEST_DOCS], dtype=np.int64) - vectors = np.array([embedder.embed_single(d[2]) for d in TEST_DOCS], dtype=np.float32) - - binary_store.add(ids, vectors) - ann_index.add(ids, vectors) - fts.add_documents(TEST_DOCS) - - return SearchPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - reranker=reranker, - fts=fts, - config=config, - ) diff --git a/codex-lens-v2/tests/integration/test_search_pipeline.py b/codex-lens-v2/tests/integration/test_search_pipeline.py deleted file mode 100644 index 6f59a612..00000000 --- a/codex-lens-v2/tests/integration/test_search_pipeline.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Integration tests for SearchPipeline using real components and mock embedder/reranker.""" -from __future__ import annotations - - -def test_vector_search_returns_results(search_pipeline): - results = search_pipeline.search("authentication middleware") - assert len(results) > 0 - assert all(isinstance(r.score, float) for r in results) - - -def test_exact_keyword_search(search_pipeline): - results = search_pipeline.search("authenticate") - assert len(results) > 0 - result_ids = {r.id for r in results} - # Doc 0 and 10 both contain "authenticate" - assert result_ids & {0, 10}, f"Expected doc 0 or 10 in results, got {result_ids}" - - -def test_pipeline_top_k_limit(search_pipeline): - results = search_pipeline.search("user", top_k=5) - assert len(results) <= 5 - - -def test_search_result_fields_populated(search_pipeline): - results = search_pipeline.search("password") - assert len(results) > 0 - for r in results: - assert r.id >= 0 - assert r.score >= 0 - assert isinstance(r.path, str) - - -def test_empty_query_handled(search_pipeline): - results = search_pipeline.search("") - assert isinstance(results, list) # no exception - - -def test_different_queries_give_different_results(search_pipeline): - r1 = search_pipeline.search("authenticate user") - r2 = search_pipeline.search("cache redis") - # Results should differ (different top IDs or scores), unless both are empty - ids1 = [r.id for r in r1] - ids2 = [r.id for r in r2] - assert ids1 != ids2 or len(r1) == 0 diff --git a/codex-lens-v2/tests/unit/__init__.py b/codex-lens-v2/tests/unit/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/codex-lens-v2/tests/unit/test_bridge.py b/codex-lens-v2/tests/unit/test_bridge.py deleted file mode 100644 index 5d3be3ba..00000000 --- a/codex-lens-v2/tests/unit/test_bridge.py +++ /dev/null @@ -1,165 +0,0 @@ -"""Unit tests for bridge.py CLI — argparse parsing, JSON protocol, error handling.""" -from __future__ import annotations - -import json -import subprocess -import sys -from pathlib import Path -from unittest.mock import patch - -import pytest - -from codexlens_search.bridge import ( - DEFAULT_EXCLUDES, - _build_parser, - _json_output, - _error_exit, - should_exclude, -) - - -# --------------------------------------------------------------------------- -# Parser construction -# --------------------------------------------------------------------------- - -class TestParser: - @pytest.fixture(autouse=True) - def _parser(self): - self.parser = _build_parser() - - def test_all_subcommands_exist(self): - expected = { - "init", "search", "index-file", "remove-file", - "sync", "watch", "download-models", "status", - } - # parse each subcommand with minimal required args to verify it exists - for cmd in expected: - if cmd == "search": - args = self.parser.parse_args(["search", "--query", "test"]) - elif cmd == "index-file": - args = self.parser.parse_args(["index-file", "--file", "x.py"]) - elif cmd == "remove-file": - args = self.parser.parse_args(["remove-file", "--file", "x.py"]) - elif cmd == "sync": - args = self.parser.parse_args(["sync", "--root", "/tmp"]) - elif cmd == "watch": - args = self.parser.parse_args(["watch", "--root", "/tmp"]) - else: - args = self.parser.parse_args([cmd]) - assert args.command == cmd - - def test_global_db_path_default(self): - args = self.parser.parse_args(["status"]) - assert args.db_path # has a default - - def test_global_db_path_override(self): - args = self.parser.parse_args(["--db-path", "/custom/path", "status"]) - assert args.db_path == "/custom/path" - - def test_search_args(self): - args = self.parser.parse_args(["search", "-q", "hello", "-k", "5"]) - assert args.query == "hello" - assert args.top_k == 5 - - def test_search_default_top_k(self): - args = self.parser.parse_args(["search", "--query", "test"]) - assert args.top_k == 10 - - def test_sync_glob_default(self): - args = self.parser.parse_args(["sync", "--root", "/tmp"]) - assert args.glob == "**/*" - - def test_watch_debounce_default(self): - args = self.parser.parse_args(["watch", "--root", "/tmp"]) - assert args.debounce_ms == 500 - - def test_no_command_returns_none(self): - args = self.parser.parse_args([]) - assert args.command is None - - def test_default_excludes_include_codexlens(self): - assert ".codexlens" in DEFAULT_EXCLUDES - - def test_should_exclude_codexlens_directory(self): - assert should_exclude(Path(".codexlens") / "metadata.db", DEFAULT_EXCLUDES) is True - - -# --------------------------------------------------------------------------- -# JSON output helpers -# --------------------------------------------------------------------------- - -class TestJsonHelpers: - def test_json_output(self, capsys): - _json_output({"key": "value"}) - out = capsys.readouterr().out.strip() - parsed = json.loads(out) - assert parsed == {"key": "value"} - - def test_json_output_list(self, capsys): - _json_output([1, 2, 3]) - out = capsys.readouterr().out.strip() - assert json.loads(out) == [1, 2, 3] - - def test_json_output_unicode(self, capsys): - _json_output({"msg": "中文测试"}) - out = capsys.readouterr().out.strip() - parsed = json.loads(out) - assert parsed["msg"] == "中文测试" - - def test_error_exit(self): - with pytest.raises(SystemExit) as exc_info: - _error_exit("something broke") - assert exc_info.value.code == 1 - - -# --------------------------------------------------------------------------- -# cmd_init (lightweight, no model loading) -# --------------------------------------------------------------------------- - -class TestCmdInit: - def test_init_creates_databases(self, tmp_path): - """Init should create metadata.db and fts.db.""" - from codexlens_search.bridge import cmd_init - import argparse - - db_path = str(tmp_path / "test_idx") - args = argparse.Namespace(db_path=db_path, verbose=False) - cmd_init(args) - - assert (Path(db_path) / "metadata.db").exists() - assert (Path(db_path) / "fts.db").exists() - - -# --------------------------------------------------------------------------- -# cmd_status (lightweight, no model loading) -# --------------------------------------------------------------------------- - -class TestCmdStatus: - def test_status_not_initialized(self, tmp_path, capsys): - from codexlens_search.bridge import cmd_status - import argparse - - db_path = str(tmp_path / "empty_idx") - Path(db_path).mkdir() - args = argparse.Namespace(db_path=db_path, verbose=False) - cmd_status(args) - - out = json.loads(capsys.readouterr().out.strip()) - assert out["status"] == "not_initialized" - - def test_status_after_init(self, tmp_path, capsys): - from codexlens_search.bridge import cmd_init, cmd_status - import argparse - - db_path = str(tmp_path / "idx") - args = argparse.Namespace(db_path=db_path, verbose=False) - cmd_init(args) - - # Re-capture after init output - capsys.readouterr() - - cmd_status(args) - out = json.loads(capsys.readouterr().out.strip()) - assert out["status"] == "ok" - assert out["files_tracked"] == 0 - assert out["deleted_chunks"] == 0 diff --git a/codex-lens-v2/tests/unit/test_config.py b/codex-lens-v2/tests/unit/test_config.py deleted file mode 100644 index e9e4b056..00000000 --- a/codex-lens-v2/tests/unit/test_config.py +++ /dev/null @@ -1,31 +0,0 @@ -from codexlens_search.config import Config - - -def test_config_instantiates_no_args(): - cfg = Config() - assert cfg is not None - - -def test_defaults_hnsw_ef(): - cfg = Config.defaults() - assert cfg.hnsw_ef == 150 - - -def test_defaults_hnsw_M(): - cfg = Config.defaults() - assert cfg.hnsw_M == 32 - - -def test_small_hnsw_ef(): - cfg = Config.small() - assert cfg.hnsw_ef == 50 - - -def test_custom_instantiation(): - cfg = Config(hnsw_ef=100) - assert cfg.hnsw_ef == 100 - - -def test_fusion_weights_keys(): - cfg = Config() - assert set(cfg.fusion_weights.keys()) == {"exact", "fuzzy", "vector", "graph"} diff --git a/codex-lens-v2/tests/unit/test_core.py b/codex-lens-v2/tests/unit/test_core.py deleted file mode 100644 index 41834446..00000000 --- a/codex-lens-v2/tests/unit/test_core.py +++ /dev/null @@ -1,136 +0,0 @@ -"""Unit tests for BinaryStore and ANNIndex (no fastembed required).""" -from __future__ import annotations - -import concurrent.futures -import tempfile -from pathlib import Path - -import numpy as np -import pytest - -from codexlens_search.config import Config -from codexlens_search.core import ANNIndex, BinaryStore - - -DIM = 32 -RNG = np.random.default_rng(42) - - -def make_vectors(n: int, dim: int = DIM) -> np.ndarray: - return RNG.standard_normal((n, dim)).astype(np.float32) - - -def make_ids(n: int, start: int = 0) -> np.ndarray: - return np.arange(start, start + n, dtype=np.int64) - - -# --------------------------------------------------------------------------- -# BinaryStore tests -# --------------------------------------------------------------------------- - - -class TestBinaryStore: - def test_binary_store_add_and_search(self, tmp_path: Path) -> None: - cfg = Config.small() - store = BinaryStore(tmp_path, DIM, cfg) - vecs = make_vectors(10) - ids = make_ids(10) - store.add(ids, vecs) - - assert len(store) == 10 - - top_k = 5 - ret_ids, ret_dists = store.coarse_search(vecs[0], top_k=top_k) - assert ret_ids.shape == (top_k,) - assert ret_dists.shape == (top_k,) - # distances are non-negative integers - assert (ret_dists >= 0).all() - - def test_binary_hamming_correctness(self, tmp_path: Path) -> None: - cfg = Config.small() - store = BinaryStore(tmp_path, DIM, cfg) - vecs = make_vectors(20) - ids = make_ids(20) - store.add(ids, vecs) - - # Query with the exact stored vector; it must be the top-1 result - query = vecs[7] - ret_ids, ret_dists = store.coarse_search(query, top_k=1) - assert ret_ids[0] == 7 - assert ret_dists[0] == 0 # Hamming distance to itself is 0 - - def test_binary_store_persist(self, tmp_path: Path) -> None: - cfg = Config.small() - store = BinaryStore(tmp_path, DIM, cfg) - vecs = make_vectors(15) - ids = make_ids(15) - store.add(ids, vecs) - store.save() - - # Load into a fresh instance - store2 = BinaryStore(tmp_path, DIM, cfg) - assert len(store2) == 15 - - query = vecs[3] - ret_ids, ret_dists = store2.coarse_search(query, top_k=1) - assert ret_ids[0] == 3 - assert ret_dists[0] == 0 - - -# --------------------------------------------------------------------------- -# ANNIndex tests -# --------------------------------------------------------------------------- - - -class TestANNIndex: - def test_ann_index_add_and_search(self, tmp_path: Path) -> None: - cfg = Config.small() - idx = ANNIndex(tmp_path, DIM, cfg) - vecs = make_vectors(50) - ids = make_ids(50) - idx.add(ids, vecs) - - assert len(idx) == 50 - - ret_ids, ret_dists = idx.fine_search(vecs[0], top_k=5) - assert len(ret_ids) == 5 - assert len(ret_dists) == 5 - - def test_ann_index_thread_safety(self, tmp_path: Path) -> None: - cfg = Config.small() - idx = ANNIndex(tmp_path, DIM, cfg) - vecs = make_vectors(50) - ids = make_ids(50) - idx.add(ids, vecs) - - query = vecs[0] - errors: list[Exception] = [] - - def search() -> None: - try: - idx.fine_search(query, top_k=3) - except Exception as exc: - errors.append(exc) - - with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool: - futures = [pool.submit(search) for _ in range(5)] - concurrent.futures.wait(futures) - - assert errors == [], f"Thread safety errors: {errors}" - - def test_ann_index_save_load(self, tmp_path: Path) -> None: - cfg = Config.small() - idx = ANNIndex(tmp_path, DIM, cfg) - vecs = make_vectors(30) - ids = make_ids(30) - idx.add(ids, vecs) - idx.save() - - # Load into a fresh instance - idx2 = ANNIndex(tmp_path, DIM, cfg) - idx2.load() - assert len(idx2) == 30 - - ret_ids, ret_dists = idx2.fine_search(vecs[10], top_k=1) - assert len(ret_ids) == 1 - assert ret_ids[0] == 10 diff --git a/codex-lens-v2/tests/unit/test_embed.py b/codex-lens-v2/tests/unit/test_embed.py deleted file mode 100644 index 645a0a6a..00000000 --- a/codex-lens-v2/tests/unit/test_embed.py +++ /dev/null @@ -1,258 +0,0 @@ -from __future__ import annotations - -import sys -import types -import unittest -from unittest.mock import MagicMock, patch - -import numpy as np - - -def _make_fastembed_mock(): - """Build a minimal fastembed stub so imports succeed without the real package.""" - fastembed_mod = types.ModuleType("fastembed") - fastembed_mod.TextEmbedding = MagicMock() - sys.modules.setdefault("fastembed", fastembed_mod) - return fastembed_mod - - -_make_fastembed_mock() - -from codexlens_search.config import Config # noqa: E402 -from codexlens_search.embed.base import BaseEmbedder # noqa: E402 -from codexlens_search.embed.local import EMBED_PROFILES, FastEmbedEmbedder # noqa: E402 -from codexlens_search.embed.api import APIEmbedder # noqa: E402 - - -class TestEmbedSingle(unittest.TestCase): - def test_embed_single_returns_float32_ndarray(self): - config = Config() - embedder = FastEmbedEmbedder(config) - - mock_model = MagicMock() - mock_model.embed.return_value = iter([np.ones(384, dtype=np.float64)]) - - # Inject mock model directly to bypass lazy load (no real fastembed needed) - embedder._model = mock_model - result = embedder.embed_single("hello world") - - self.assertIsInstance(result, np.ndarray) - self.assertEqual(result.dtype, np.float32) - self.assertEqual(result.shape, (384,)) - - -class TestEmbedBatch(unittest.TestCase): - def test_embed_batch_returns_list(self): - config = Config() - embedder = FastEmbedEmbedder(config) - - vecs = [np.ones(384, dtype=np.float64) * i for i in range(3)] - mock_model = MagicMock() - mock_model.embed.return_value = iter(vecs) - - embedder._model = mock_model - result = embedder.embed_batch(["a", "b", "c"]) - - self.assertIsInstance(result, list) - self.assertEqual(len(result), 3) - for arr in result: - self.assertIsInstance(arr, np.ndarray) - self.assertEqual(arr.dtype, np.float32) - - -class TestEmbedProfiles(unittest.TestCase): - def test_embed_profiles_all_have_valid_keys(self): - expected_keys = {"small", "base", "large", "code"} - self.assertEqual(set(EMBED_PROFILES.keys()), expected_keys) - - def test_embed_profiles_model_ids_non_empty(self): - for key, model_id in EMBED_PROFILES.items(): - self.assertIsInstance(model_id, str, msg=f"{key} model id should be str") - self.assertTrue(len(model_id) > 0, msg=f"{key} model id should be non-empty") - - -class TestBaseEmbedderAbstract(unittest.TestCase): - def test_base_embedder_is_abstract(self): - with self.assertRaises(TypeError): - BaseEmbedder() # type: ignore[abstract] - - -# --------------------------------------------------------------------------- -# APIEmbedder -# --------------------------------------------------------------------------- - -def _make_api_config(**overrides) -> Config: - defaults = dict( - embed_api_url="https://api.example.com/v1", - embed_api_key="test-key", - embed_api_model="text-embedding-3-small", - embed_dim=384, - embed_batch_size=2, - embed_api_max_tokens_per_batch=8192, - embed_api_concurrency=2, - ) - defaults.update(overrides) - return Config(**defaults) - - -def _mock_200(count=1, dim=384): - r = MagicMock() - r.status_code = 200 - r.json.return_value = { - "data": [{"index": j, "embedding": [0.1 * (j + 1)] * dim} for j in range(count)] - } - r.raise_for_status = MagicMock() - return r - - -class TestAPIEmbedderSingle(unittest.TestCase): - def test_embed_single_returns_float32(self): - config = _make_api_config() - with patch("httpx.Client") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_client.post.return_value = _mock_200(1, 384) - - embedder = APIEmbedder(config) - result = embedder.embed_single("hello") - - self.assertIsInstance(result, np.ndarray) - self.assertEqual(result.dtype, np.float32) - self.assertEqual(result.shape, (384,)) - - -class TestAPIEmbedderBatch(unittest.TestCase): - def test_embed_batch_splits_by_batch_size(self): - config = _make_api_config(embed_batch_size=2) - - with patch("httpx.Client") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_client.post.side_effect = [_mock_200(2, 384), _mock_200(1, 384)] - - embedder = APIEmbedder(config) - result = embedder.embed_batch(["a", "b", "c"]) - - self.assertEqual(len(result), 3) - for arr in result: - self.assertIsInstance(arr, np.ndarray) - self.assertEqual(arr.dtype, np.float32) - - def test_embed_batch_empty_returns_empty(self): - config = _make_api_config() - with patch("httpx.Client"): - embedder = APIEmbedder(config) - result = embedder.embed_batch([]) - self.assertEqual(result, []) - - -class TestAPIEmbedderRetry(unittest.TestCase): - def test_retry_on_429(self): - config = _make_api_config() - mock_429 = MagicMock() - mock_429.status_code = 429 - - with patch("httpx.Client") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_client.post.side_effect = [mock_429, _mock_200(1, 384)] - - embedder = APIEmbedder(config) - ep = embedder._endpoints[0] - with patch("time.sleep"): - result = embedder._call_api(["test"], ep) - - self.assertEqual(len(result), 1) - self.assertEqual(mock_client.post.call_count, 2) - - def test_raises_after_max_retries(self): - config = _make_api_config() - mock_429 = MagicMock() - mock_429.status_code = 429 - - with patch("httpx.Client") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_client.post.return_value = mock_429 - - embedder = APIEmbedder(config) - ep = embedder._endpoints[0] - with patch("time.sleep"): - with self.assertRaises(RuntimeError): - embedder._call_api(["test"], ep, max_retries=2) - - -class TestAPIEmbedderTokenPacking(unittest.TestCase): - def test_packs_small_texts_together(self): - config = _make_api_config( - embed_batch_size=100, - embed_api_max_tokens_per_batch=100, # ~400 chars - ) - with patch("httpx.Client"): - embedder = APIEmbedder(config) - - # 5 texts of 80 chars each (~20 tokens) -> 100 tokens = 1 batch at limit - texts = ["x" * 80] * 5 - batches = embedder._pack_batches(texts) - # Should pack as many as fit under 100 tokens - self.assertTrue(len(batches) >= 1) - total_items = sum(len(b) for b in batches) - self.assertEqual(total_items, 5) - - def test_large_text_gets_own_batch(self): - config = _make_api_config( - embed_batch_size=100, - embed_api_max_tokens_per_batch=50, # ~200 chars - ) - with patch("httpx.Client"): - embedder = APIEmbedder(config) - - # Mix of small and large texts - texts = ["small" * 10, "x" * 800, "tiny"] - batches = embedder._pack_batches(texts) - # Large text (200 tokens) exceeds 50 limit, should be separate - self.assertTrue(len(batches) >= 2) - - -class TestAPIEmbedderMultiEndpoint(unittest.TestCase): - def test_multi_endpoint_config(self): - config = _make_api_config( - embed_api_endpoints=[ - {"url": "https://ep1.example.com/v1", "key": "k1", "model": "m1"}, - {"url": "https://ep2.example.com/v1", "key": "k2", "model": "m2"}, - ] - ) - with patch("httpx.Client"): - embedder = APIEmbedder(config) - self.assertEqual(len(embedder._endpoints), 2) - self.assertTrue(embedder._endpoints[0].url.endswith("/embeddings")) - self.assertTrue(embedder._endpoints[1].url.endswith("/embeddings")) - - def test_single_endpoint_fallback(self): - config = _make_api_config() # no embed_api_endpoints - with patch("httpx.Client"): - embedder = APIEmbedder(config) - self.assertEqual(len(embedder._endpoints), 1) - - -class TestAPIEmbedderUrlNormalization(unittest.TestCase): - def test_appends_embeddings_path(self): - config = _make_api_config(embed_api_url="https://api.example.com/v1") - with patch("httpx.Client") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_client.post.return_value = _mock_200(1, 384) - embedder = APIEmbedder(config) - ep = embedder._endpoints[0] - self.assertTrue(ep.url.endswith("/embeddings")) - - def test_does_not_double_append(self): - config = _make_api_config(embed_api_url="https://api.example.com/v1/embeddings") - with patch("httpx.Client"): - embedder = APIEmbedder(config) - ep = embedder._endpoints[0] - self.assertFalse(ep.url.endswith("/embeddings/embeddings")) - - -if __name__ == "__main__": - unittest.main() diff --git a/codex-lens-v2/tests/unit/test_fts_delete.py b/codex-lens-v2/tests/unit/test_fts_delete.py deleted file mode 100644 index 97ce0471..00000000 --- a/codex-lens-v2/tests/unit/test_fts_delete.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Unit tests for FTSEngine delete_by_path and get_chunk_ids_by_path.""" -from __future__ import annotations - -import pytest - -from codexlens_search.search.fts import FTSEngine - - -@pytest.fixture -def fts(tmp_path): - return FTSEngine(str(tmp_path / "fts.db")) - - -class TestGetChunkIdsByPath: - def test_empty(self, fts): - assert fts.get_chunk_ids_by_path("a.py") == [] - - def test_returns_matching_ids(self, fts): - fts.add_documents([ - (0, "a.py", "hello world"), - (1, "a.py", "foo bar"), - (2, "b.py", "other content"), - ]) - ids = fts.get_chunk_ids_by_path("a.py") - assert sorted(ids) == [0, 1] - - def test_no_match(self, fts): - fts.add_documents([(0, "a.py", "content")]) - assert fts.get_chunk_ids_by_path("b.py") == [] - - -class TestDeleteByPath: - def test_deletes_docs_and_meta(self, fts): - fts.add_documents([ - (0, "target.py", "to be deleted"), - (1, "target.py", "also deleted"), - (2, "keep.py", "keep this"), - ]) - count = fts.delete_by_path("target.py") - assert count == 2 - - # target.py gone from both tables - assert fts.get_chunk_ids_by_path("target.py") == [] - assert fts.get_content(0) == "" - assert fts.get_content(1) == "" - - # keep.py still there - assert fts.get_chunk_ids_by_path("keep.py") == [2] - assert fts.get_content(2) == "keep this" - - def test_delete_nonexistent_path(self, fts): - count = fts.delete_by_path("nonexistent.py") - assert count == 0 - - def test_delete_then_search(self, fts): - fts.add_documents([ - (0, "a.py", "unique searchable content"), - (1, "b.py", "different content here"), - ]) - fts.delete_by_path("a.py") - results = fts.exact_search("unique searchable") - assert len(results) == 0 - - results = fts.exact_search("different") - assert len(results) == 1 - assert results[0][0] == 1 diff --git a/codex-lens-v2/tests/unit/test_incremental.py b/codex-lens-v2/tests/unit/test_incremental.py deleted file mode 100644 index 8627831f..00000000 --- a/codex-lens-v2/tests/unit/test_incremental.py +++ /dev/null @@ -1,388 +0,0 @@ -"""Unit tests for IndexingPipeline incremental API (index_file, remove_file, sync, compact).""" -from __future__ import annotations - -import tempfile -from pathlib import Path -from unittest.mock import MagicMock - -import numpy as np -import pytest - -from codexlens_search.config import Config -from codexlens_search.core.binary import BinaryStore -from codexlens_search.core.index import ANNIndex -from codexlens_search.embed.base import BaseEmbedder -from codexlens_search.indexing.metadata import MetadataStore -from codexlens_search.indexing.pipeline import IndexingPipeline, IndexStats -from codexlens_search.search.fts import FTSEngine - - -DIM = 32 - - -class FakeEmbedder(BaseEmbedder): - """Deterministic embedder for testing.""" - - def __init__(self) -> None: - pass - - def embed_single(self, text: str) -> np.ndarray: - rng = np.random.default_rng(hash(text) % (2**31)) - return rng.standard_normal(DIM).astype(np.float32) - - def embed_batch(self, texts: list[str]) -> list[np.ndarray]: - return [self.embed_single(t) for t in texts] - - -@pytest.fixture -def workspace(tmp_path: Path): - """Create workspace with stores, metadata, and pipeline.""" - cfg = Config.small() - # Override embed_dim to match our test dim - cfg.embed_dim = DIM - - store_dir = tmp_path / "stores" - store_dir.mkdir() - - binary_store = BinaryStore(store_dir, DIM, cfg) - ann_index = ANNIndex(store_dir, DIM, cfg) - fts = FTSEngine(str(store_dir / "fts.db")) - metadata = MetadataStore(str(store_dir / "metadata.db")) - embedder = FakeEmbedder() - - pipeline = IndexingPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - fts=fts, - config=cfg, - metadata=metadata, - ) - - # Create sample source files - src_dir = tmp_path / "src" - src_dir.mkdir() - - return { - "pipeline": pipeline, - "metadata": metadata, - "binary_store": binary_store, - "ann_index": ann_index, - "fts": fts, - "src_dir": src_dir, - "store_dir": store_dir, - "config": cfg, - } - - -def _write_file(src_dir: Path, name: str, content: str) -> Path: - """Write a file and return its path.""" - p = src_dir / name - p.write_text(content, encoding="utf-8") - return p - - -# --------------------------------------------------------------------------- -# MetadataStore helper method tests -# --------------------------------------------------------------------------- - - -class TestMetadataHelpers: - def test_get_all_files_empty(self, workspace): - meta = workspace["metadata"] - assert meta.get_all_files() == {} - - def test_get_all_files_after_register(self, workspace): - meta = workspace["metadata"] - meta.register_file("a.py", "hash_a", 1000.0) - meta.register_file("b.py", "hash_b", 2000.0) - result = meta.get_all_files() - assert result == {"a.py": "hash_a", "b.py": "hash_b"} - - def test_max_chunk_id_empty(self, workspace): - meta = workspace["metadata"] - assert meta.max_chunk_id() == -1 - - def test_max_chunk_id_with_chunks(self, workspace): - meta = workspace["metadata"] - meta.register_file("a.py", "hash_a", 1000.0) - meta.register_chunks("a.py", [(0, "h0"), (1, "h1"), (5, "h5")]) - assert meta.max_chunk_id() == 5 - - def test_max_chunk_id_includes_deleted(self, workspace): - meta = workspace["metadata"] - meta.register_file("a.py", "hash_a", 1000.0) - meta.register_chunks("a.py", [(0, "h0"), (3, "h3")]) - meta.mark_file_deleted("a.py") - # Chunks moved to deleted_chunks, max should still be 3 - assert meta.max_chunk_id() == 3 - - -# --------------------------------------------------------------------------- -# index_file tests -# --------------------------------------------------------------------------- - - -class TestIndexFile: - def test_index_file_basic(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "hello.py", "print('hello world')\n") - stats = pipeline.index_file(f, root=src_dir) - - assert stats.files_processed == 1 - assert stats.chunks_created >= 1 - assert meta.get_file_hash("hello.py") is not None - assert len(meta.get_chunk_ids_for_file("hello.py")) >= 1 - - def test_index_file_skips_unchanged(self, workspace): - pipeline = workspace["pipeline"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "same.py", "x = 1\n") - stats1 = pipeline.index_file(f, root=src_dir) - assert stats1.files_processed == 1 - - stats2 = pipeline.index_file(f, root=src_dir) - assert stats2.files_processed == 0 - assert stats2.chunks_created == 0 - - def test_index_file_force_reindex(self, workspace): - pipeline = workspace["pipeline"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "force.py", "x = 1\n") - pipeline.index_file(f, root=src_dir) - - stats = pipeline.index_file(f, root=src_dir, force=True) - assert stats.files_processed == 1 - assert stats.chunks_created >= 1 - - def test_index_file_updates_changed_file(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "changing.py", "version = 1\n") - pipeline.index_file(f, root=src_dir) - old_chunks = meta.get_chunk_ids_for_file("changing.py") - - # Modify file - f.write_text("version = 2\nmore code\n", encoding="utf-8") - stats = pipeline.index_file(f, root=src_dir) - assert stats.files_processed == 1 - - new_chunks = meta.get_chunk_ids_for_file("changing.py") - # Old chunks should have been tombstoned, new ones assigned - assert set(old_chunks) != set(new_chunks) - - def test_index_file_registers_in_metadata(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - fts = workspace["fts"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "meta_test.py", "def foo(): pass\n") - pipeline.index_file(f, root=src_dir) - - # MetadataStore has file registered - assert meta.get_file_hash("meta_test.py") is not None - chunk_ids = meta.get_chunk_ids_for_file("meta_test.py") - assert len(chunk_ids) >= 1 - - # FTS has the content - fts_ids = fts.get_chunk_ids_by_path("meta_test.py") - assert len(fts_ids) >= 1 - - def test_index_file_no_metadata_raises(self, workspace): - cfg = workspace["config"] - pipeline_no_meta = IndexingPipeline( - embedder=FakeEmbedder(), - binary_store=workspace["binary_store"], - ann_index=workspace["ann_index"], - fts=workspace["fts"], - config=cfg, - ) - f = _write_file(workspace["src_dir"], "no_meta.py", "x = 1\n") - with pytest.raises(RuntimeError, match="MetadataStore is required"): - pipeline_no_meta.index_file(f) - - -# --------------------------------------------------------------------------- -# remove_file tests -# --------------------------------------------------------------------------- - - -class TestRemoveFile: - def test_remove_file_tombstones_and_fts(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - fts = workspace["fts"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "to_remove.py", "data = [1, 2, 3]\n") - pipeline.index_file(f, root=src_dir) - - chunk_ids = meta.get_chunk_ids_for_file("to_remove.py") - assert len(chunk_ids) >= 1 - - pipeline.remove_file("to_remove.py") - - # File should be gone from metadata - assert meta.get_file_hash("to_remove.py") is None - assert meta.get_chunk_ids_for_file("to_remove.py") == [] - - # Chunks should be in deleted_chunks - deleted = meta.get_deleted_ids() - for cid in chunk_ids: - assert cid in deleted - - # FTS should be cleared - assert fts.get_chunk_ids_by_path("to_remove.py") == [] - - def test_remove_nonexistent_file(self, workspace): - pipeline = workspace["pipeline"] - # Should not raise - pipeline.remove_file("nonexistent.py") - - -# --------------------------------------------------------------------------- -# sync tests -# --------------------------------------------------------------------------- - - -class TestSync: - def test_sync_indexes_new_files(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - src_dir = workspace["src_dir"] - - f1 = _write_file(src_dir, "a.py", "a = 1\n") - f2 = _write_file(src_dir, "b.py", "b = 2\n") - - stats = pipeline.sync([f1, f2], root=src_dir) - assert stats.files_processed == 2 - assert meta.get_file_hash("a.py") is not None - assert meta.get_file_hash("b.py") is not None - - def test_sync_removes_missing_files(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - src_dir = workspace["src_dir"] - - f1 = _write_file(src_dir, "keep.py", "keep = True\n") - f2 = _write_file(src_dir, "remove.py", "remove = True\n") - - pipeline.sync([f1, f2], root=src_dir) - assert meta.get_file_hash("remove.py") is not None - - # Sync with only f1 -- f2 should be removed - stats = pipeline.sync([f1], root=src_dir) - assert meta.get_file_hash("remove.py") is None - deleted = meta.get_deleted_ids() - assert len(deleted) > 0 - - def test_sync_detects_changed_files(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "mutable.py", "v1\n") - pipeline.sync([f], root=src_dir) - old_hash = meta.get_file_hash("mutable.py") - - f.write_text("v2\n", encoding="utf-8") - stats = pipeline.sync([f], root=src_dir) - assert stats.files_processed == 1 - new_hash = meta.get_file_hash("mutable.py") - assert old_hash != new_hash - - def test_sync_skips_unchanged(self, workspace): - pipeline = workspace["pipeline"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "stable.py", "stable = True\n") - pipeline.sync([f], root=src_dir) - - # Second sync with same file, unchanged - stats = pipeline.sync([f], root=src_dir) - assert stats.files_processed == 0 - assert stats.chunks_created == 0 - - -# --------------------------------------------------------------------------- -# compact tests -# --------------------------------------------------------------------------- - - -class TestCompact: - def test_compact_removes_tombstoned_from_binary_store(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - binary_store = workspace["binary_store"] - src_dir = workspace["src_dir"] - - f1 = _write_file(src_dir, "alive.py", "alive = True\n") - f2 = _write_file(src_dir, "dead.py", "dead = True\n") - - pipeline.index_file(f1, root=src_dir) - pipeline.index_file(f2, root=src_dir) - - count_before = binary_store._count - assert count_before >= 2 - - pipeline.remove_file("dead.py") - pipeline.compact() - - # BinaryStore should have fewer entries - assert binary_store._count < count_before - # deleted_chunks should be cleared - assert meta.get_deleted_ids() == set() - - def test_compact_noop_when_no_deletions(self, workspace): - pipeline = workspace["pipeline"] - meta = workspace["metadata"] - binary_store = workspace["binary_store"] - src_dir = workspace["src_dir"] - - f = _write_file(src_dir, "solo.py", "solo = True\n") - pipeline.index_file(f, root=src_dir) - count_before = binary_store._count - - pipeline.compact() - assert binary_store._count == count_before - - -# --------------------------------------------------------------------------- -# Backward compatibility: existing batch API still works -# --------------------------------------------------------------------------- - - -class TestBatchAPIUnchanged: - def test_index_files_still_works(self, workspace): - pipeline = workspace["pipeline"] - src_dir = workspace["src_dir"] - - f1 = _write_file(src_dir, "batch1.py", "batch1 = 1\n") - f2 = _write_file(src_dir, "batch2.py", "batch2 = 2\n") - - stats = pipeline.index_files([f1, f2], root=src_dir) - assert stats.files_processed == 2 - assert stats.chunks_created >= 2 - - def test_index_files_works_without_metadata(self, workspace): - """Batch API should work even without MetadataStore.""" - cfg = workspace["config"] - pipeline_no_meta = IndexingPipeline( - embedder=FakeEmbedder(), - binary_store=BinaryStore(workspace["store_dir"] / "no_meta", DIM, cfg), - ann_index=ANNIndex(workspace["store_dir"] / "no_meta", DIM, cfg), - fts=FTSEngine(str(workspace["store_dir"] / "no_meta_fts.db")), - config=cfg, - ) - src_dir = workspace["src_dir"] - f = _write_file(src_dir, "no_meta_batch.py", "x = 1\n") - stats = pipeline_no_meta.index_files([f], root=src_dir) - assert stats.files_processed == 1 diff --git a/codex-lens-v2/tests/unit/test_metadata_store.py b/codex-lens-v2/tests/unit/test_metadata_store.py deleted file mode 100644 index bd48f79d..00000000 --- a/codex-lens-v2/tests/unit/test_metadata_store.py +++ /dev/null @@ -1,184 +0,0 @@ -"""Unit tests for MetadataStore — SQLite file-to-chunk mapping + tombstone tracking.""" -from __future__ import annotations - -import pytest - -from codexlens_search.indexing.metadata import MetadataStore - - -@pytest.fixture -def store(tmp_path): - """Create a fresh MetadataStore backed by a temp db.""" - return MetadataStore(str(tmp_path / "meta.db")) - - -# --------------------------------------------------------------------------- -# Table creation -# --------------------------------------------------------------------------- - -class TestTableCreation: - def test_creates_three_tables(self, store): - """MetadataStore should create files, chunks, deleted_chunks tables.""" - tables = store._conn.execute( - "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" - ).fetchall() - names = {r[0] for r in tables} - assert "files" in names - assert "chunks" in names - assert "deleted_chunks" in names - - def test_foreign_keys_enabled(self, store): - """PRAGMA foreign_keys must be ON.""" - row = store._conn.execute("PRAGMA foreign_keys").fetchone() - assert row[0] == 1 - - def test_wal_mode(self, store): - """journal_mode should be WAL for concurrency.""" - row = store._conn.execute("PRAGMA journal_mode").fetchone() - assert row[0].lower() == "wal" - - -# --------------------------------------------------------------------------- -# register_file -# --------------------------------------------------------------------------- - -class TestRegisterFile: - def test_register_and_retrieve(self, store): - store.register_file("src/main.py", "abc123", 1000.0) - assert store.get_file_hash("src/main.py") == "abc123" - - def test_register_updates_existing(self, store): - store.register_file("a.py", "hash1", 1000.0) - store.register_file("a.py", "hash2", 2000.0) - assert store.get_file_hash("a.py") == "hash2" - - def test_get_file_hash_returns_none_for_unknown(self, store): - assert store.get_file_hash("nonexistent.py") is None - - -# --------------------------------------------------------------------------- -# register_chunks -# --------------------------------------------------------------------------- - -class TestRegisterChunks: - def test_register_and_retrieve_chunks(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(0, "c0"), (1, "c1"), (2, "c2")]) - ids = store.get_chunk_ids_for_file("a.py") - assert sorted(ids) == [0, 1, 2] - - def test_empty_chunks_list(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", []) - assert store.get_chunk_ids_for_file("a.py") == [] - - def test_chunks_for_unknown_file(self, store): - assert store.get_chunk_ids_for_file("unknown.py") == [] - - -# --------------------------------------------------------------------------- -# mark_file_deleted -# --------------------------------------------------------------------------- - -class TestMarkFileDeleted: - def test_tombstones_chunks(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(10, "c10"), (11, "c11")]) - count = store.mark_file_deleted("a.py") - assert count == 2 - assert store.get_deleted_ids() == {10, 11} - - def test_file_removed_after_delete(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(0, "c0")]) - store.mark_file_deleted("a.py") - assert store.get_file_hash("a.py") is None - - def test_chunks_cascaded_after_delete(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(0, "c0")]) - store.mark_file_deleted("a.py") - assert store.get_chunk_ids_for_file("a.py") == [] - - def test_delete_nonexistent_file(self, store): - count = store.mark_file_deleted("nonexistent.py") - assert count == 0 - - def test_delete_file_without_chunks(self, store): - store.register_file("empty.py", "h", 1.0) - count = store.mark_file_deleted("empty.py") - assert count == 0 - assert store.get_file_hash("empty.py") is None - - -# --------------------------------------------------------------------------- -# file_needs_update -# --------------------------------------------------------------------------- - -class TestFileNeedsUpdate: - def test_new_file_needs_update(self, store): - assert store.file_needs_update("new.py", "any_hash") is True - - def test_unchanged_file(self, store): - store.register_file("a.py", "same_hash", 1.0) - assert store.file_needs_update("a.py", "same_hash") is False - - def test_changed_file(self, store): - store.register_file("a.py", "old_hash", 1.0) - assert store.file_needs_update("a.py", "new_hash") is True - - -# --------------------------------------------------------------------------- -# get_deleted_ids / compact_deleted -# --------------------------------------------------------------------------- - -class TestDeletedIdsAndCompact: - def test_empty_deleted_ids(self, store): - assert store.get_deleted_ids() == set() - - def test_compact_returns_and_clears(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(5, "c5"), (6, "c6")]) - store.mark_file_deleted("a.py") - - deleted = store.compact_deleted() - assert deleted == {5, 6} - assert store.get_deleted_ids() == set() - - def test_compact_noop_when_empty(self, store): - deleted = store.compact_deleted() - assert deleted == set() - - -# --------------------------------------------------------------------------- -# get_all_files / max_chunk_id -# --------------------------------------------------------------------------- - -class TestHelpers: - def test_get_all_files(self, store): - store.register_file("a.py", "h1", 1.0) - store.register_file("b.py", "h2", 2.0) - assert store.get_all_files() == {"a.py": "h1", "b.py": "h2"} - - def test_max_chunk_id_empty(self, store): - assert store.max_chunk_id() == -1 - - def test_max_chunk_id_active(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(0, "c"), (5, "c"), (3, "c")]) - assert store.max_chunk_id() == 5 - - def test_max_chunk_id_includes_deleted(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(10, "c")]) - store.mark_file_deleted("a.py") - assert store.max_chunk_id() == 10 - - def test_max_chunk_id_mixed(self, store): - store.register_file("a.py", "h", 1.0) - store.register_chunks("a.py", [(3, "c")]) - store.register_file("b.py", "h2", 1.0) - store.register_chunks("b.py", [(7, "c")]) - store.mark_file_deleted("a.py") - # deleted has 3, active has 7 - assert store.max_chunk_id() == 7 diff --git a/codex-lens-v2/tests/unit/test_rerank.py b/codex-lens-v2/tests/unit/test_rerank.py deleted file mode 100644 index b75073fb..00000000 --- a/codex-lens-v2/tests/unit/test_rerank.py +++ /dev/null @@ -1,179 +0,0 @@ -from __future__ import annotations - -import types -from unittest.mock import MagicMock, patch - -import pytest - -from codexlens_search.config import Config -from codexlens_search.rerank.base import BaseReranker -from codexlens_search.rerank.local import FastEmbedReranker -from codexlens_search.rerank.api import APIReranker - - -# --------------------------------------------------------------------------- -# BaseReranker -# --------------------------------------------------------------------------- - -def test_base_reranker_is_abstract(): - with pytest.raises(TypeError): - BaseReranker() # type: ignore[abstract] - - -# --------------------------------------------------------------------------- -# FastEmbedReranker -# --------------------------------------------------------------------------- - -def _make_rerank_result(index: int, score: float) -> object: - obj = types.SimpleNamespace(index=index, score=score) - return obj - - -def test_local_reranker_score_pairs_length(): - config = Config() - reranker = FastEmbedReranker(config) - - mock_results = [ - _make_rerank_result(0, 0.9), - _make_rerank_result(1, 0.5), - _make_rerank_result(2, 0.1), - ] - - mock_model = MagicMock() - mock_model.rerank.return_value = iter(mock_results) - reranker._model = mock_model - - docs = ["doc0", "doc1", "doc2"] - scores = reranker.score_pairs("query", docs) - - assert len(scores) == 3 - - -def test_local_reranker_preserves_order(): - config = Config() - reranker = FastEmbedReranker(config) - - # rerank returns results in reverse order (index 2, 1, 0) - mock_results = [ - _make_rerank_result(2, 0.1), - _make_rerank_result(1, 0.5), - _make_rerank_result(0, 0.9), - ] - - mock_model = MagicMock() - mock_model.rerank.return_value = iter(mock_results) - reranker._model = mock_model - - docs = ["doc0", "doc1", "doc2"] - scores = reranker.score_pairs("query", docs) - - assert scores[0] == pytest.approx(0.9) - assert scores[1] == pytest.approx(0.5) - assert scores[2] == pytest.approx(0.1) - - -# --------------------------------------------------------------------------- -# APIReranker -# --------------------------------------------------------------------------- - -def _make_config(max_tokens_per_batch: int = 512) -> Config: - return Config( - reranker_api_url="https://api.example.com", - reranker_api_key="test-key", - reranker_api_model="test-model", - reranker_api_max_tokens_per_batch=max_tokens_per_batch, - ) - - -def test_api_reranker_batch_splitting(): - config = _make_config(max_tokens_per_batch=512) - - with patch("httpx.Client"): - reranker = APIReranker(config) - - # 10 docs, each ~200 tokens (800 chars) - docs = ["x" * 800] * 10 - batches = reranker._split_batches(docs, max_tokens=512) - - # Each doc is 200 tokens; batches should have at most 2 docs (200+200=400 <= 512, 400+200=600 > 512) - assert len(batches) > 1 - for batch in batches: - total = sum(len(text) // 4 for _, text in batch) - assert total <= 512 or len(batch) == 1 - - -def test_api_reranker_retry_on_429(): - config = _make_config() - - mock_429 = MagicMock() - mock_429.status_code = 429 - - mock_200 = MagicMock() - mock_200.status_code = 200 - mock_200.json.return_value = { - "results": [ - {"index": 0, "relevance_score": 0.8}, - {"index": 1, "relevance_score": 0.3}, - ] - } - mock_200.raise_for_status = MagicMock() - - with patch("httpx.Client") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_client.post.side_effect = [mock_429, mock_429, mock_200] - - reranker = APIReranker(config) - - with patch("time.sleep"): - result = reranker._call_api_with_retry( - "query", - [(0, "doc0"), (1, "doc1")], - max_retries=3, - ) - - assert mock_client.post.call_count == 3 - assert 0 in result - assert 1 in result - - -def test_api_reranker_merge_batches(): - config = _make_config(max_tokens_per_batch=100) - - # 4 docs of 25 tokens each (100 chars); each batch holds at most 4 docs - # Use smaller docs to force 2 batches: 2 docs per batch (50 tokens each = 200 chars) - docs = ["x" * 200] * 4 # 50 tokens each; 50+50=100 <= 100, 100+50=150 > 100 -> 2 per batch - - batch0_response = MagicMock() - batch0_response.status_code = 200 - batch0_response.json.return_value = { - "results": [ - {"index": 0, "relevance_score": 0.9}, - {"index": 1, "relevance_score": 0.8}, - ] - } - batch0_response.raise_for_status = MagicMock() - - batch1_response = MagicMock() - batch1_response.status_code = 200 - batch1_response.json.return_value = { - "results": [ - {"index": 0, "relevance_score": 0.7}, - {"index": 1, "relevance_score": 0.6}, - ] - } - batch1_response.raise_for_status = MagicMock() - - with patch("httpx.Client") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_client.post.side_effect = [batch0_response, batch1_response] - - reranker = APIReranker(config) - - with patch("time.sleep"): - scores = reranker.score_pairs("query", docs) - - assert len(scores) == 4 - # All original indices should have scores - assert all(s > 0 for s in scores) diff --git a/codex-lens-v2/tests/unit/test_search.py b/codex-lens-v2/tests/unit/test_search.py deleted file mode 100644 index 5544665c..00000000 --- a/codex-lens-v2/tests/unit/test_search.py +++ /dev/null @@ -1,156 +0,0 @@ -"""Unit tests for search layer: FTSEngine, fusion, and SearchPipeline.""" -from __future__ import annotations - -from unittest.mock import MagicMock - -import pytest - -from codexlens_search.search.fts import FTSEngine -from codexlens_search.search.fusion import ( - DEFAULT_WEIGHTS, - QueryIntent, - detect_query_intent, - get_adaptive_weights, - reciprocal_rank_fusion, -) -from codexlens_search.search.pipeline import SearchPipeline, SearchResult -from codexlens_search.config import Config - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -def make_fts(docs: list[tuple[int, str, str]] | None = None) -> FTSEngine: - """Create an in-memory FTSEngine and optionally add documents.""" - engine = FTSEngine(":memory:") - if docs: - engine.add_documents(docs) - return engine - - -# --------------------------------------------------------------------------- -# FTSEngine tests -# --------------------------------------------------------------------------- - -def test_fts_add_and_exact_search(): - docs = [ - (1, "a.py", "def authenticate user password login"), - (2, "b.py", "connect to database with credentials"), - (3, "c.py", "render template html response"), - ] - engine = make_fts(docs) - results = engine.exact_search("authenticate", top_k=10) - ids = [r[0] for r in results] - assert 1 in ids, "doc 1 should match 'authenticate'" - assert 2 not in ids or results[0][0] == 1 # doc 1 must rank higher - - -def test_fts_fuzzy_search_prefix(): - docs = [ - (10, "auth.py", "authentication token refresh"), - (11, "db.py", "database connection pool"), - (12, "ui.py", "render button click handler"), - ] - engine = make_fts(docs) - # Prefix 'auth' should match 'authentication' in doc 10 - results = engine.fuzzy_search("auth", top_k=10) - ids = [r[0] for r in results] - assert 10 in ids, "prefix 'auth' should match doc 10 with 'authentication'" - - -# --------------------------------------------------------------------------- -# RRF fusion tests -# --------------------------------------------------------------------------- - -def test_rrf_fusion_ordering(): - """When two sources agree on top-1, it should rank first in fused result.""" - source_a = [(1, 0.9), (2, 0.5), (3, 0.2)] - source_b = [(1, 0.8), (3, 0.6), (2, 0.1)] - fused = reciprocal_rank_fusion({"a": source_a, "b": source_b}) - assert fused[0][0] == 1, "doc 1 agreed top by both sources must rank first" - - -def test_rrf_equal_weight_default(): - """Calling with None weights should use DEFAULT_WEIGHTS shape (not crash).""" - source_exact = [(5, 1.0), (6, 0.8)] - source_vector = [(6, 0.9), (5, 0.7)] - # Should not raise and should return results - fused = reciprocal_rank_fusion( - {"exact": source_exact, "vector": source_vector}, - weights=None, - ) - assert len(fused) == 2 - ids = [r[0] for r in fused] - assert 5 in ids and 6 in ids - - -# --------------------------------------------------------------------------- -# detect_query_intent tests -# --------------------------------------------------------------------------- - -def test_detect_intent_code_symbol(): - assert detect_query_intent("def authenticate()") == QueryIntent.CODE_SYMBOL - - -def test_detect_intent_natural(): - assert detect_query_intent("how do I authenticate users") == QueryIntent.NATURAL_LANGUAGE - - -# --------------------------------------------------------------------------- -# SearchPipeline tests -# --------------------------------------------------------------------------- - -def _make_pipeline(fts: FTSEngine, top_k: int = 5) -> SearchPipeline: - """Build a SearchPipeline with mocked heavy components.""" - cfg = Config.small() - cfg.reranker_top_k = top_k - - embedder = MagicMock() - embedder.embed.return_value = [[0.1] * cfg.embed_dim] - - binary_store = MagicMock() - binary_store.coarse_search.return_value = ([1, 2, 3], None) - - ann_index = MagicMock() - ann_index.fine_search.return_value = ([1, 2, 3], [0.9, 0.8, 0.7]) - - reranker = MagicMock() - # Return a score for each content string passed - reranker.score_pairs.side_effect = lambda q, contents: [0.9 - i * 0.1 for i in range(len(contents))] - - return SearchPipeline( - embedder=embedder, - binary_store=binary_store, - ann_index=ann_index, - reranker=reranker, - fts=fts, - config=cfg, - ) - - -def test_pipeline_search_returns_results(): - docs = [ - (1, "a.py", "test content alpha"), - (2, "b.py", "test content beta"), - (3, "c.py", "test content gamma"), - ] - fts = make_fts(docs) - pipeline = _make_pipeline(fts) - results = pipeline.search("test") - assert len(results) > 0 - assert all(isinstance(r, SearchResult) for r in results) - - -def test_pipeline_top_k_limit(): - docs = [ - (1, "a.py", "hello world one"), - (2, "b.py", "hello world two"), - (3, "c.py", "hello world three"), - (4, "d.py", "hello world four"), - (5, "e.py", "hello world five"), - ] - fts = make_fts(docs) - pipeline = _make_pipeline(fts, top_k=2) - results = pipeline.search("hello", top_k=2) - assert len(results) <= 2, "pipeline must respect top_k limit" diff --git a/codex-lens-v2/tests/unit/test_watcher.py b/codex-lens-v2/tests/unit/test_watcher.py deleted file mode 100644 index 201148f5..00000000 --- a/codex-lens-v2/tests/unit/test_watcher.py +++ /dev/null @@ -1,271 +0,0 @@ -"""Unit tests for watcher module — events, FileWatcher debounce/dedup, IncrementalIndexer.""" -from __future__ import annotations - -import time -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - -from codexlens_search.watcher.events import ChangeType, FileEvent, WatcherConfig -from codexlens_search.watcher.incremental_indexer import BatchResult, IncrementalIndexer - - -# --------------------------------------------------------------------------- -# ChangeType enum -# --------------------------------------------------------------------------- - -class TestChangeType: - def test_values(self): - assert ChangeType.CREATED.value == "created" - assert ChangeType.MODIFIED.value == "modified" - assert ChangeType.DELETED.value == "deleted" - - def test_all_members(self): - assert len(ChangeType) == 3 - - -# --------------------------------------------------------------------------- -# FileEvent -# --------------------------------------------------------------------------- - -class TestFileEvent: - def test_creation(self): - e = FileEvent(path=Path("a.py"), change_type=ChangeType.CREATED) - assert e.path == Path("a.py") - assert e.change_type == ChangeType.CREATED - assert isinstance(e.timestamp, float) - - def test_custom_timestamp(self): - e = FileEvent(path=Path("b.py"), change_type=ChangeType.DELETED, timestamp=42.0) - assert e.timestamp == 42.0 - - -# --------------------------------------------------------------------------- -# WatcherConfig -# --------------------------------------------------------------------------- - -class TestWatcherConfig: - def test_defaults(self): - cfg = WatcherConfig() - assert cfg.debounce_ms == 500 - assert ".git" in cfg.ignored_patterns - assert "__pycache__" in cfg.ignored_patterns - assert "node_modules" in cfg.ignored_patterns - assert ".codexlens" in cfg.ignored_patterns - - def test_custom(self): - cfg = WatcherConfig(debounce_ms=1000, ignored_patterns={".custom"}) - assert cfg.debounce_ms == 1000 - assert cfg.ignored_patterns == {".custom"} - - -# --------------------------------------------------------------------------- -# BatchResult -# --------------------------------------------------------------------------- - -class TestBatchResult: - def test_defaults(self): - r = BatchResult() - assert r.files_indexed == 0 - assert r.files_removed == 0 - assert r.chunks_created == 0 - assert r.errors == [] - - def test_total_processed(self): - r = BatchResult(files_indexed=3, files_removed=2) - assert r.total_processed == 5 - - def test_has_errors(self): - r = BatchResult() - assert r.has_errors is False - r.errors.append("oops") - assert r.has_errors is True - - -# --------------------------------------------------------------------------- -# IncrementalIndexer — event routing -# --------------------------------------------------------------------------- - -class TestIncrementalIndexer: - @pytest.fixture - def mock_pipeline(self): - pipeline = MagicMock() - pipeline.index_file.return_value = MagicMock( - files_processed=1, chunks_created=3 - ) - return pipeline - - def test_routes_created_to_index_file(self, mock_pipeline): - indexer = IncrementalIndexer(mock_pipeline, root=Path("/project")) - events = [ - FileEvent(Path("/project/src/new.py"), ChangeType.CREATED), - ] - result = indexer.process_events(events) - assert result.files_indexed == 1 - mock_pipeline.index_file.assert_called_once() - # CREATED should NOT use force=True - call_kwargs = mock_pipeline.index_file.call_args - assert call_kwargs.kwargs.get("force", call_kwargs[1].get("force")) is False - - def test_routes_modified_to_index_file_with_force(self, mock_pipeline): - indexer = IncrementalIndexer(mock_pipeline, root=Path("/project")) - events = [ - FileEvent(Path("/project/src/changed.py"), ChangeType.MODIFIED), - ] - result = indexer.process_events(events) - assert result.files_indexed == 1 - call_kwargs = mock_pipeline.index_file.call_args - assert call_kwargs.kwargs.get("force", call_kwargs[1].get("force")) is True - - def test_routes_deleted_to_remove_file(self, mock_pipeline, tmp_path): - root = tmp_path / "project" - root.mkdir() - indexer = IncrementalIndexer(mock_pipeline, root=root) - events = [ - FileEvent(root / "src" / "old.py", ChangeType.DELETED), - ] - result = indexer.process_events(events) - assert result.files_removed == 1 - # On Windows relative_to produces backslashes, normalize - actual_arg = mock_pipeline.remove_file.call_args[0][0] - assert actual_arg.replace("\\", "/") == "src/old.py" - - def test_batch_with_mixed_events(self, mock_pipeline): - indexer = IncrementalIndexer(mock_pipeline, root=Path("/project")) - events = [ - FileEvent(Path("/project/a.py"), ChangeType.CREATED), - FileEvent(Path("/project/b.py"), ChangeType.MODIFIED), - FileEvent(Path("/project/c.py"), ChangeType.DELETED), - ] - result = indexer.process_events(events) - assert result.files_indexed == 2 - assert result.files_removed == 1 - assert result.total_processed == 3 - - def test_error_isolation(self, mock_pipeline): - """One file failure should not stop processing of others.""" - call_count = [0] - - def side_effect(*args, **kwargs): - call_count[0] += 1 - if call_count[0] == 1: - raise RuntimeError("disk error") - return MagicMock(files_processed=1, chunks_created=1) - - mock_pipeline.index_file.side_effect = side_effect - - indexer = IncrementalIndexer(mock_pipeline, root=Path("/project")) - events = [ - FileEvent(Path("/project/fail.py"), ChangeType.CREATED), - FileEvent(Path("/project/ok.py"), ChangeType.CREATED), - ] - result = indexer.process_events(events) - - assert result.files_indexed == 1 # second succeeded - assert len(result.errors) == 1 # first failed - assert "disk error" in result.errors[0] - - def test_empty_events(self, mock_pipeline): - indexer = IncrementalIndexer(mock_pipeline) - result = indexer.process_events([]) - assert result.total_processed == 0 - mock_pipeline.index_file.assert_not_called() - mock_pipeline.remove_file.assert_not_called() - - -# --------------------------------------------------------------------------- -# FileWatcher — debounce and dedup logic (unit-level, no actual FS) -# --------------------------------------------------------------------------- - -class TestFileWatcherLogic: - """Test FileWatcher internals without starting a real watchdog Observer.""" - - @pytest.fixture - def watcher_parts(self): - """Create a FileWatcher with mocked observer, capture callbacks.""" - # Import here since watchdog is optional - from codexlens_search.watcher.file_watcher import FileWatcher, _EVENT_PRIORITY - - collected = [] - - def on_changes(events): - collected.extend(events) - - cfg = WatcherConfig(debounce_ms=100) - watcher = FileWatcher(Path("."), cfg, on_changes) - return watcher, collected, _EVENT_PRIORITY - - def test_event_priority_ordering(self, watcher_parts): - _, _, priority = watcher_parts - assert priority[ChangeType.DELETED] > priority[ChangeType.MODIFIED] - assert priority[ChangeType.MODIFIED] > priority[ChangeType.CREATED] - - def test_dedup_keeps_higher_priority(self, watcher_parts, tmp_path): - watcher, collected, _ = watcher_parts - f = str(tmp_path / "a.py") - watcher._on_raw_event(f, ChangeType.CREATED) - watcher._on_raw_event(f, ChangeType.DELETED) - - watcher.flush_now() - - assert len(collected) == 1 - assert collected[0].change_type == ChangeType.DELETED - - def test_dedup_does_not_downgrade(self, watcher_parts, tmp_path): - watcher, collected, _ = watcher_parts - f = str(tmp_path / "b.py") - watcher._on_raw_event(f, ChangeType.DELETED) - watcher._on_raw_event(f, ChangeType.CREATED) - - watcher.flush_now() - assert len(collected) == 1 - # CREATED (priority 1) < DELETED (priority 3), so DELETED stays - assert collected[0].change_type == ChangeType.DELETED - - def test_multiple_files_kept(self, watcher_parts, tmp_path): - watcher, collected, _ = watcher_parts - watcher._on_raw_event(str(tmp_path / "a.py"), ChangeType.CREATED) - watcher._on_raw_event(str(tmp_path / "b.py"), ChangeType.MODIFIED) - watcher._on_raw_event(str(tmp_path / "c.py"), ChangeType.DELETED) - - watcher.flush_now() - assert len(collected) == 3 - paths = {str(e.path) for e in collected} - assert len(paths) == 3 - - def test_flush_clears_pending(self, watcher_parts, tmp_path): - watcher, collected, _ = watcher_parts - watcher._on_raw_event(str(tmp_path / "a.py"), ChangeType.CREATED) - watcher.flush_now() - assert len(collected) == 1 - - collected.clear() - watcher.flush_now() - assert len(collected) == 0 - - def test_should_watch_filters_ignored(self, watcher_parts): - watcher, _, _ = watcher_parts - assert watcher._should_watch(Path("/project/src/main.py")) is True - assert watcher._should_watch(Path("/project/.git/config")) is False - assert watcher._should_watch(Path("/project/node_modules/foo.js")) is False - assert watcher._should_watch(Path("/project/__pycache__/mod.pyc")) is False - - def test_jsonl_serialization(self): - from codexlens_search.watcher.file_watcher import FileWatcher - import json - - events = [ - FileEvent(Path("/tmp/a.py"), ChangeType.CREATED, 1000.0), - FileEvent(Path("/tmp/b.py"), ChangeType.DELETED, 2000.0), - ] - output = FileWatcher.events_to_jsonl(events) - lines = output.strip().split("\n") - assert len(lines) == 2 - - obj1 = json.loads(lines[0]) - assert obj1["change_type"] == "created" - assert obj1["timestamp"] == 1000.0 - - obj2 = json.loads(lines[1]) - assert obj2["change_type"] == "deleted"