mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-18 18:48:48 +08:00
feat: add MCP server for semantic code search with FastMCP integration
This commit is contained in:
32
codex-lens-v2/.gitignore
vendored
32
codex-lens-v2/.gitignore
vendored
@@ -1 +1,33 @@
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
*.egg
|
||||
|
||||
# Virtual environments
|
||||
.venv/
|
||||
venv/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
|
||||
# Testing
|
||||
.pytest_cache/
|
||||
.coverage
|
||||
htmlcov/
|
||||
|
||||
# Index / cache
|
||||
.codexlens/
|
||||
.index_cache/
|
||||
.ace-tool/
|
||||
|
||||
# Workflow (internal)
|
||||
.workflow/
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
@@ -1,143 +1,221 @@
|
||||
# codexlens-search
|
||||
|
||||
Lightweight semantic code search engine with 2-stage vector search, full-text search, and Reciprocal Rank Fusion.
|
||||
Semantic code search engine with MCP server for Claude Code.
|
||||
|
||||
## Overview
|
||||
2-stage vector search + FTS + RRF fusion + reranking — install once, configure API keys, ready to use.
|
||||
|
||||
codexlens-search provides fast, accurate code search through a multi-stage retrieval pipeline:
|
||||
## Quick Start (Claude Code MCP)
|
||||
|
||||
1. **Binary coarse search** - Hamming-distance filtering narrows candidates quickly
|
||||
2. **ANN fine search** - HNSW or FAISS refines the candidate set with float vectors
|
||||
3. **Full-text search** - SQLite FTS5 handles exact and fuzzy keyword matching
|
||||
4. **RRF fusion** - Reciprocal Rank Fusion merges vector and text results
|
||||
5. **Reranking** - Optional cross-encoder or API-based reranker for final ordering
|
||||
Add to your project `.mcp.json`:
|
||||
|
||||
The core library has **zero required dependencies**. Install optional extras to enable semantic search, GPU acceleration, or FAISS backends.
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "uvx",
|
||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
||||
"env": {
|
||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
||||
"CODEXLENS_EMBED_DIM": "1536"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Installation
|
||||
That's it. Claude Code will auto-discover the tools: `index_project` → `search_code`.
|
||||
|
||||
## Install
|
||||
|
||||
```bash
|
||||
# Core only (FTS search, no vector search)
|
||||
# Standard install (includes vector search + API clients)
|
||||
pip install codexlens-search
|
||||
|
||||
# With semantic search (recommended)
|
||||
pip install codexlens-search[semantic]
|
||||
|
||||
# Semantic search + GPU acceleration
|
||||
pip install codexlens-search[semantic-gpu]
|
||||
|
||||
# With FAISS backend (CPU)
|
||||
pip install codexlens-search[faiss-cpu]
|
||||
|
||||
# With API-based reranker
|
||||
pip install codexlens-search[reranker-api]
|
||||
|
||||
# Everything (semantic + GPU + FAISS + reranker)
|
||||
pip install codexlens-search[semantic-gpu,faiss-gpu,reranker-api]
|
||||
# With MCP server for Claude Code
|
||||
pip install codexlens-search[mcp]
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
Optional extras for advanced use:
|
||||
|
||||
```python
|
||||
from codexlens_search import Config, IndexingPipeline, SearchPipeline
|
||||
from codexlens_search.core import create_ann_index, create_binary_index
|
||||
from codexlens_search.embed.local import FastEmbedEmbedder
|
||||
from codexlens_search.rerank.local import LocalReranker
|
||||
from codexlens_search.search.fts import FTSEngine
|
||||
| Extra | Description |
|
||||
|-------|-------------|
|
||||
| `mcp` | MCP server (`codexlens-mcp` command) |
|
||||
| `gpu` | GPU-accelerated embedding (onnxruntime-gpu) |
|
||||
| `faiss-cpu` | FAISS ANN backend |
|
||||
| `watcher` | File watcher for auto-indexing |
|
||||
|
||||
# 1. Configure
|
||||
config = Config(embed_model="BAAI/bge-small-en-v1.5", embed_dim=384)
|
||||
## MCP Tools
|
||||
|
||||
# 2. Create components
|
||||
embedder = FastEmbedEmbedder(config)
|
||||
binary_store = create_binary_index(config, db_path="index/binary.db")
|
||||
ann_index = create_ann_index(config, index_path="index/ann.bin")
|
||||
fts = FTSEngine("index/fts.db")
|
||||
reranker = LocalReranker()
|
||||
| Tool | Description |
|
||||
|------|-------------|
|
||||
| `search_code` | Semantic search with hybrid fusion + reranking |
|
||||
| `index_project` | Build or rebuild the search index |
|
||||
| `index_status` | Show index statistics |
|
||||
| `index_update` | Incremental sync (only changed files) |
|
||||
| `find_files` | Glob file discovery |
|
||||
| `list_models` | List models with cache status |
|
||||
| `download_models` | Download local fastembed models |
|
||||
|
||||
# 3. Index files
|
||||
indexer = IndexingPipeline(embedder, binary_store, ann_index, fts, config)
|
||||
stats = indexer.index_directory("./src")
|
||||
print(f"Indexed {stats.files_processed} files, {stats.chunks_created} chunks")
|
||||
## MCP Configuration Examples
|
||||
|
||||
# 4. Search
|
||||
pipeline = SearchPipeline(embedder, binary_store, ann_index, reranker, fts, config)
|
||||
results = pipeline.search("authentication handler", top_k=10)
|
||||
for r in results:
|
||||
print(f" {r.path} (score={r.score:.3f})")
|
||||
### API Embedding Only (simplest)
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "uvx",
|
||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
||||
"env": {
|
||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
||||
"CODEXLENS_EMBED_DIM": "1536"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Extras
|
||||
### API Embedding + API Reranker (best quality)
|
||||
|
||||
| Extra | Dependencies | Description |
|
||||
|-------|-------------|-------------|
|
||||
| `semantic` | hnswlib, numpy, fastembed | Vector search with local embeddings |
|
||||
| `gpu` | onnxruntime-gpu | GPU-accelerated embedding inference |
|
||||
| `semantic-gpu` | semantic + gpu combined | Vector search with GPU acceleration |
|
||||
| `faiss-cpu` | faiss-cpu | FAISS ANN backend (CPU) |
|
||||
| `faiss-gpu` | faiss-gpu | FAISS ANN backend (GPU) |
|
||||
| `reranker-api` | httpx | Remote reranker API client |
|
||||
| `dev` | pytest, pytest-cov | Development and testing |
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "uvx",
|
||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
||||
"env": {
|
||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
||||
"CODEXLENS_EMBED_DIM": "1536",
|
||||
"CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1",
|
||||
"CODEXLENS_RERANKER_API_KEY": "${JINA_API_KEY}",
|
||||
"CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Multi-Endpoint Load Balancing
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "uvx",
|
||||
"args": ["--from", "codexlens-search[mcp]", "codexlens-mcp"],
|
||||
"env": {
|
||||
"CODEXLENS_EMBED_API_ENDPOINTS": "https://api1.example.com/v1|sk-key1|model,https://api2.example.com/v1|sk-key2|model",
|
||||
"CODEXLENS_EMBED_DIM": "1536"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Format: `url|key|model,url|key|model,...`
|
||||
|
||||
### Local Models (Offline, No API)
|
||||
|
||||
```bash
|
||||
pip install codexlens-search[mcp]
|
||||
codexlens-search download-models
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "codexlens-mcp",
|
||||
"env": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Pre-installed (no uvx)
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "codexlens-mcp",
|
||||
"env": {
|
||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
||||
"CODEXLENS_EMBED_API_KEY": "${OPENAI_API_KEY}",
|
||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
||||
"CODEXLENS_EMBED_DIM": "1536"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## CLI
|
||||
|
||||
```bash
|
||||
codexlens-search --db-path .codexlens sync --root ./src
|
||||
codexlens-search --db-path .codexlens search -q "auth handler" -k 10
|
||||
codexlens-search --db-path .codexlens status
|
||||
codexlens-search list-models
|
||||
codexlens-search download-models
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
### Embedding
|
||||
|
||||
| Variable | Description | Example |
|
||||
|----------|-------------|---------|
|
||||
| `CODEXLENS_EMBED_API_URL` | Embedding API base URL | `https://api.openai.com/v1` |
|
||||
| `CODEXLENS_EMBED_API_KEY` | API key | `sk-xxx` |
|
||||
| `CODEXLENS_EMBED_API_MODEL` | Model name | `text-embedding-3-small` |
|
||||
| `CODEXLENS_EMBED_API_ENDPOINTS` | Multi-endpoint: `url\|key\|model,...` | See above |
|
||||
| `CODEXLENS_EMBED_DIM` | Vector dimension | `1536` |
|
||||
|
||||
### Reranker
|
||||
|
||||
| Variable | Description | Example |
|
||||
|----------|-------------|---------|
|
||||
| `CODEXLENS_RERANKER_API_URL` | Reranker API base URL | `https://api.jina.ai/v1` |
|
||||
| `CODEXLENS_RERANKER_API_KEY` | API key | `jina-xxx` |
|
||||
| `CODEXLENS_RERANKER_API_MODEL` | Model name | `jina-reranker-v2-base-multilingual` |
|
||||
|
||||
### Tuning
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `CODEXLENS_BINARY_TOP_K` | `200` | Binary coarse search candidates |
|
||||
| `CODEXLENS_ANN_TOP_K` | `50` | ANN fine search candidates |
|
||||
| `CODEXLENS_FTS_TOP_K` | `50` | FTS results per method |
|
||||
| `CODEXLENS_FUSION_K` | `60` | RRF fusion k parameter |
|
||||
| `CODEXLENS_RERANKER_TOP_K` | `20` | Results to rerank |
|
||||
| `CODEXLENS_INDEX_WORKERS` | `2` | Parallel indexing workers |
|
||||
| `CODEXLENS_MAX_FILE_SIZE` | `1000000` | Max file size in bytes |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Query
|
||||
|
|
||||
v
|
||||
[Embedder] --> query vector
|
||||
|
|
||||
+---> [BinaryStore.coarse_search] --> candidate IDs (Hamming distance)
|
||||
| |
|
||||
| v
|
||||
+---> [ANNIndex.fine_search] ------> ranked IDs (cosine/L2)
|
||||
| |
|
||||
| v (intersect)
|
||||
| vector_results
|
||||
|
|
||||
+---> [FTSEngine.exact_search] ----> exact text matches
|
||||
+---> [FTSEngine.fuzzy_search] ----> fuzzy text matches
|
||||
|
|
||||
v
|
||||
[RRF Fusion] --> merged ranking (adaptive weights by query intent)
|
||||
|
|
||||
v
|
||||
[Reranker] --> final top-k results
|
||||
```
|
||||
|
||||
### Key Design Decisions
|
||||
|
||||
- **2-stage vector search**: Binary coarse search (fast Hamming distance on binarized vectors) filters candidates before the more expensive ANN search. This keeps memory usage low and search fast even on large corpora.
|
||||
- **Parallel retrieval**: Vector search and FTS run concurrently via ThreadPoolExecutor.
|
||||
- **Adaptive fusion weights**: Query intent detection adjusts RRF weights between vector and text signals.
|
||||
- **Backend abstraction**: ANN index supports both hnswlib and FAISS backends via a factory function.
|
||||
- **Zero core dependencies**: The base package requires only Python 3.10+. All heavy dependencies are optional.
|
||||
|
||||
## Configuration
|
||||
|
||||
The `Config` dataclass controls all pipeline parameters:
|
||||
|
||||
```python
|
||||
from codexlens_search import Config
|
||||
|
||||
config = Config(
|
||||
embed_model="BAAI/bge-small-en-v1.5", # embedding model name
|
||||
embed_dim=384, # embedding dimension
|
||||
embed_batch_size=64, # batch size for embedding
|
||||
ann_backend="auto", # 'auto', 'faiss', 'hnswlib'
|
||||
binary_top_k=200, # binary coarse search candidates
|
||||
ann_top_k=50, # ANN fine search candidates
|
||||
fts_top_k=50, # FTS results per method
|
||||
device="auto", # 'auto', 'cuda', 'cpu'
|
||||
)
|
||||
Query → [Embedder] → query vector
|
||||
├→ [BinaryStore] → candidates (Hamming)
|
||||
│ └→ [ANNIndex] → ranked IDs (cosine)
|
||||
├→ [FTS exact] → exact matches
|
||||
└→ [FTS fuzzy] → fuzzy matches
|
||||
└→ [RRF Fusion] → merged ranking
|
||||
└→ [Reranker] → final top-k
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
git clone https://github.com/nicepkg/codexlens-search.git
|
||||
git clone https://github.com/catlog22/codexlens-search.git
|
||||
cd codexlens-search
|
||||
pip install -e ".[dev,semantic]"
|
||||
pip install -e ".[dev]"
|
||||
pytest
|
||||
```
|
||||
|
||||
|
||||
Binary file not shown.
BIN
codex-lens-v2/dist/codexlens_search-0.2.0.tar.gz
vendored
BIN
codex-lens-v2/dist/codexlens_search-0.2.0.tar.gz
vendored
Binary file not shown.
@@ -4,10 +4,15 @@ build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "codexlens-search"
|
||||
version = "0.2.0"
|
||||
description = "Lightweight semantic code search engine — 2-stage vector + FTS + RRF fusion"
|
||||
version = "0.3.0"
|
||||
description = "Lightweight semantic code search engine — 2-stage vector + FTS + RRF fusion + MCP server"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
dependencies = [
|
||||
"hnswlib>=0.8.0",
|
||||
"numpy>=1.26",
|
||||
"fastembed>=0.4.0,<2.0",
|
||||
"httpx>=0.25",
|
||||
]
|
||||
license = {text = "MIT"}
|
||||
readme = "README.md"
|
||||
authors = [
|
||||
@@ -26,14 +31,12 @@ classifiers = [
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/nicepkg/codexlens-search"
|
||||
Repository = "https://github.com/nicepkg/codexlens-search"
|
||||
Homepage = "https://github.com/catlog22/codexlens-search"
|
||||
Repository = "https://github.com/catlog22/codexlens-search"
|
||||
|
||||
[project.optional-dependencies]
|
||||
semantic = [
|
||||
"hnswlib>=0.8.0",
|
||||
"numpy>=1.26",
|
||||
"fastembed>=0.4.0,<2.0",
|
||||
mcp = [
|
||||
"mcp[cli]>=1.0.0",
|
||||
]
|
||||
gpu = [
|
||||
"onnxruntime-gpu>=1.16",
|
||||
@@ -44,21 +47,9 @@ faiss-cpu = [
|
||||
faiss-gpu = [
|
||||
"faiss-gpu>=1.7.4",
|
||||
]
|
||||
embed-api = [
|
||||
"httpx>=0.25",
|
||||
]
|
||||
reranker-api = [
|
||||
"httpx>=0.25",
|
||||
]
|
||||
watcher = [
|
||||
"watchdog>=3.0",
|
||||
]
|
||||
semantic-gpu = [
|
||||
"hnswlib>=0.8.0",
|
||||
"numpy>=1.26",
|
||||
"fastembed>=0.4.0,<2.0",
|
||||
"onnxruntime-gpu>=1.16",
|
||||
]
|
||||
dev = [
|
||||
"pytest>=7.0",
|
||||
"pytest-cov",
|
||||
@@ -66,6 +57,7 @@ dev = [
|
||||
|
||||
[project.scripts]
|
||||
codexlens-search = "codexlens_search.bridge:main"
|
||||
codexlens-mcp = "codexlens_search.mcp_server:main"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/codexlens_search"]
|
||||
|
||||
@@ -50,21 +50,19 @@ def _resolve_db_path(args: argparse.Namespace) -> Path:
|
||||
return db_path
|
||||
|
||||
|
||||
def _create_config(args: argparse.Namespace) -> "Config":
|
||||
"""Build Config from CLI args."""
|
||||
def create_config_from_env(db_path: str | Path, **overrides: object) -> "Config":
|
||||
"""Build Config from environment variables and optional overrides.
|
||||
|
||||
Used by both CLI bridge and MCP server.
|
||||
"""
|
||||
from codexlens_search.config import Config
|
||||
|
||||
kwargs: dict = {}
|
||||
if hasattr(args, "embed_model") and args.embed_model:
|
||||
kwargs["embed_model"] = args.embed_model
|
||||
# API embedding overrides
|
||||
if hasattr(args, "embed_api_url") and args.embed_api_url:
|
||||
kwargs["embed_api_url"] = args.embed_api_url
|
||||
if hasattr(args, "embed_api_key") and args.embed_api_key:
|
||||
kwargs["embed_api_key"] = args.embed_api_key
|
||||
if hasattr(args, "embed_api_model") and args.embed_api_model:
|
||||
kwargs["embed_api_model"] = args.embed_api_model
|
||||
# Also check env vars as fallback
|
||||
# Apply explicit overrides first
|
||||
for key in ("embed_model", "embed_api_url", "embed_api_key", "embed_api_model"):
|
||||
if overrides.get(key):
|
||||
kwargs[key] = overrides[key]
|
||||
# Env vars as fallback
|
||||
if "embed_api_url" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_URL"):
|
||||
kwargs["embed_api_url"] = os.environ["CODEXLENS_EMBED_API_URL"]
|
||||
if "embed_api_key" not in kwargs and os.environ.get("CODEXLENS_EMBED_API_KEY"):
|
||||
@@ -124,18 +122,33 @@ def _create_config(args: argparse.Namespace) -> "Config":
|
||||
kwargs["hnsw_ef"] = int(os.environ["CODEXLENS_HNSW_EF"])
|
||||
if os.environ.get("CODEXLENS_HNSW_M"):
|
||||
kwargs["hnsw_M"] = int(os.environ["CODEXLENS_HNSW_M"])
|
||||
db_path = Path(args.db_path).resolve()
|
||||
kwargs["metadata_db_path"] = str(db_path / "metadata.db")
|
||||
resolved = Path(db_path).resolve()
|
||||
kwargs["metadata_db_path"] = str(resolved / "metadata.db")
|
||||
return Config(**kwargs)
|
||||
|
||||
|
||||
def _create_pipeline(
|
||||
args: argparse.Namespace,
|
||||
def _create_config(args: argparse.Namespace) -> "Config":
    """Translate parsed CLI arguments into a Config.

    Only the embedding-related options are forwarded, and only when they are
    present on ``args`` and truthy; everything else is left to
    ``create_config_from_env``.
    """
    forwarded = ("embed_model", "embed_api_url", "embed_api_key", "embed_api_model")
    overrides: dict = {
        name: value
        for name in forwarded
        # Missing attributes and empty/None values are both skipped.
        if (value := getattr(args, name, None))
    }
    return create_config_from_env(args.db_path, **overrides)
|
||||
|
||||
|
||||
def create_pipeline(
|
||||
db_path: str | Path,
|
||||
config: "Config | None" = None,
|
||||
) -> tuple:
|
||||
"""Lazily construct pipeline components from CLI args.
|
||||
"""Construct pipeline components from db_path and config.
|
||||
|
||||
Returns (indexing_pipeline, search_pipeline, config).
|
||||
Only loads embedder/reranker models when needed.
|
||||
Used by both CLI bridge and MCP server.
|
||||
"""
|
||||
from codexlens_search.config import Config
|
||||
from codexlens_search.core.factory import create_ann_index, create_binary_index
|
||||
@@ -144,8 +157,10 @@ def _create_pipeline(
|
||||
from codexlens_search.search.fts import FTSEngine
|
||||
from codexlens_search.search.pipeline import SearchPipeline
|
||||
|
||||
config = _create_config(args)
|
||||
db_path = _resolve_db_path(args)
|
||||
if config is None:
|
||||
config = create_config_from_env(db_path)
|
||||
resolved = Path(db_path).resolve()
|
||||
resolved.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Select embedder: API if configured, otherwise local fastembed
|
||||
if config.embed_api_url:
|
||||
@@ -163,10 +178,10 @@ def _create_pipeline(
|
||||
from codexlens_search.embed.local import FastEmbedEmbedder
|
||||
embedder = FastEmbedEmbedder(config)
|
||||
|
||||
binary_store = create_binary_index(db_path, config.embed_dim, config)
|
||||
ann_index = create_ann_index(db_path, config.embed_dim, config)
|
||||
fts = FTSEngine(db_path / "fts.db")
|
||||
metadata = MetadataStore(db_path / "metadata.db")
|
||||
binary_store = create_binary_index(resolved, config.embed_dim, config)
|
||||
ann_index = create_ann_index(resolved, config.embed_dim, config)
|
||||
fts = FTSEngine(resolved / "fts.db")
|
||||
metadata = MetadataStore(resolved / "metadata.db")
|
||||
|
||||
# Select reranker: API if configured, otherwise local fastembed
|
||||
if config.reranker_api_url:
|
||||
@@ -199,6 +214,15 @@ def _create_pipeline(
|
||||
return indexing, search, config
|
||||
|
||||
|
||||
def _create_pipeline(args: argparse.Namespace) -> tuple:
    """Build the pipeline for the CLI from parsed argparse arguments.

    Resolves the config first and the database path second, then hands both
    to ``create_pipeline`` and returns its result unchanged.
    """
    cli_config = _create_config(args)
    index_dir = _resolve_db_path(args)
    return create_pipeline(index_dir, cli_config)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Subcommand handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -269,14 +293,14 @@ def cmd_remove_file(args: argparse.Namespace) -> None:
|
||||
})
|
||||
|
||||
|
||||
_DEFAULT_EXCLUDES = frozenset({
|
||||
DEFAULT_EXCLUDES = frozenset({
|
||||
"node_modules", ".git", "__pycache__", "dist", "build",
|
||||
".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache",
|
||||
".next", ".nuxt", "coverage", ".eggs", "*.egg-info",
|
||||
})
|
||||
|
||||
|
||||
def _should_exclude(path: Path, exclude_dirs: frozenset[str]) -> bool:
|
||||
def should_exclude(path: Path, exclude_dirs: frozenset[str]) -> bool:
    """Return True when any component of *path* matches an exclude entry.

    Args:
        path: Path to test. Callers pass it relative to the scan root so that
            only components inside the project are considered.
        exclude_dirs: Names to exclude. Entries containing glob
            metacharacters (``*``, ``?``, ``[``) are matched as shell-style
            patterns, so ``"*.egg-info"`` excludes ``"pkg.egg-info"``; every
            other entry must equal a component exactly.

    Returns:
        True if at least one path component matches an entry, else False.
    """
    from fnmatch import fnmatchcase

    parts = path.parts
    # Fast path: exact names (the common case) are a plain set lookup.
    if any(part in exclude_dirs for part in parts):
        return True

    # Plain membership can never match glob entries such as "*.egg-info",
    # so those are checked separately. fnmatchcase keeps the comparison
    # case-sensitive, like the exact lookup above.
    wildcard_patterns = [
        entry for entry in exclude_dirs if any(ch in entry for ch in "*?[")
    ]
    return any(
        fnmatchcase(part, pattern)
        for part in parts
        for pattern in wildcard_patterns
    )
|
||||
@@ -290,11 +314,11 @@ def cmd_sync(args: argparse.Namespace) -> None:
|
||||
if not root.is_dir():
|
||||
_error_exit(f"Root directory not found: {root}")
|
||||
|
||||
exclude_dirs = frozenset(args.exclude) if args.exclude else _DEFAULT_EXCLUDES
|
||||
exclude_dirs = frozenset(args.exclude) if args.exclude else DEFAULT_EXCLUDES
|
||||
pattern = args.glob or "**/*"
|
||||
file_paths = [
|
||||
p for p in root.glob(pattern)
|
||||
if p.is_file() and not _should_exclude(p.relative_to(root), exclude_dirs)
|
||||
if p.is_file() and not should_exclude(p.relative_to(root), exclude_dirs)
|
||||
]
|
||||
|
||||
log.debug("Sync: %d files after exclusion (root=%s, pattern=%s)", len(file_paths), root, pattern)
|
||||
|
||||
367
codex-lens-v2/src/codexlens_search/mcp_server.py
Normal file
367
codex-lens-v2/src/codexlens_search/mcp_server.py
Normal file
@@ -0,0 +1,367 @@
|
||||
"""MCP server for codexlens-search.
|
||||
|
||||
Exposes semantic code search tools via FastMCP for Claude Code integration.
|
||||
Run as: codexlens-mcp (entry point) or python -m codexlens_search.mcp_server
|
||||
|
||||
## .mcp.json Configuration Examples
|
||||
|
||||
### API embedding + API reranker (single endpoint):
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "codexlens-mcp",
|
||||
"env": {
|
||||
"CODEXLENS_EMBED_API_URL": "https://api.openai.com/v1",
|
||||
"CODEXLENS_EMBED_API_KEY": "sk-xxx",
|
||||
"CODEXLENS_EMBED_API_MODEL": "text-embedding-3-small",
|
||||
"CODEXLENS_EMBED_DIM": "1536",
|
||||
"CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1",
|
||||
"CODEXLENS_RERANKER_API_KEY": "jina-xxx",
|
||||
"CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
### API embedding (multi-endpoint load balancing):
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "codexlens-mcp",
|
||||
"env": {
|
||||
"CODEXLENS_EMBED_API_ENDPOINTS": "url1|key1|model1,url2|key2|model2",
|
||||
"CODEXLENS_EMBED_DIM": "1536",
|
||||
"CODEXLENS_RERANKER_API_URL": "https://api.jina.ai/v1",
|
||||
"CODEXLENS_RERANKER_API_KEY": "jina-xxx",
|
||||
"CODEXLENS_RERANKER_API_MODEL": "jina-reranker-v2-base-multilingual"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
### Local fastembed model (no API; fastembed is installed with the base package):
|
||||
{
|
||||
"mcpServers": {
|
||||
"codexlens": {
|
||||
"command": "codexlens-mcp",
|
||||
"env": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
Pre-download models via CLI: codexlens-search download-models
|
||||
|
||||
### Env vars reference:
|
||||
Embedding: CODEXLENS_EMBED_API_URL, _KEY, _MODEL, _ENDPOINTS (multi), _DIM
|
||||
Reranker: CODEXLENS_RERANKER_API_URL, _KEY, _MODEL
|
||||
Tuning: CODEXLENS_BINARY_TOP_K, _ANN_TOP_K, _FTS_TOP_K, _FUSION_K,
|
||||
CODEXLENS_RERANKER_TOP_K, _RERANKER_BATCH_SIZE
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
from mcp.server.fastmcp import FastMCP
|
||||
|
||||
from codexlens_search.bridge import (
|
||||
DEFAULT_EXCLUDES,
|
||||
create_config_from_env,
|
||||
create_pipeline,
|
||||
should_exclude,
|
||||
)
|
||||
|
||||
log = logging.getLogger("codexlens_search.mcp_server")
|
||||
|
||||
mcp = FastMCP("codexlens-search")
|
||||
|
||||
# Pipeline cache: keyed by resolved project_path -> (indexing, search, config)
|
||||
_pipelines: dict[str, tuple] = {}
|
||||
_lock = threading.Lock()
|
||||
|
||||
|
||||
def _db_path_for_project(project_path: str) -> Path:
|
||||
"""Return the index database path for a project."""
|
||||
return Path(project_path).resolve() / ".codexlens"
|
||||
|
||||
|
||||
def _get_pipelines(project_path: str) -> tuple:
    """Return the cached pipeline tuple for a project, building it on first use.

    The cache key is the resolved project path. The lookup and the
    construction both happen while holding the module lock.
    """
    cache_key = str(Path(project_path).resolve())
    with _lock:
        try:
            return _pipelines[cache_key]
        except KeyError:
            pass
        index_dir = _db_path_for_project(cache_key)
        project_config = create_config_from_env(index_dir)
        bundle = create_pipeline(index_dir, project_config)
        _pipelines[cache_key] = bundle
        return bundle
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Search tools
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@mcp.tool()
def search_code(project_path: str, query: str, top_k: int = 10) -> str:
    """Semantic code search with hybrid fusion (vector + FTS + reranking).

    Args:
        project_path: Absolute path to the project root directory.
        query: Natural language or code search query.
        top_k: Maximum number of results to return (default 10).

    Returns:
        Search results as formatted text with file paths, line numbers, scores, and code snippets.
    """
    # NOTE(review): the docstring is kept verbatim because it is presumably
    # surfaced to MCP clients as the tool description — confirm before rewording.
    project_root = Path(project_path).resolve()
    if not project_root.is_dir():
        return f"Error: project path not found: {project_root}"

    index_dir = _db_path_for_project(project_path)
    if not (index_dir / "metadata.db").exists():
        return f"Error: no index found at {index_dir}. Run index_project first."

    _indexer, searcher, _config = _get_pipelines(project_path)
    hits = searcher.search(query, top_k=top_k)
    if not hits:
        return "No results found."

    # Each hit becomes a heading, a fenced snippet, and a blank separator line.
    sections = []
    for rank, hit in enumerate(hits, 1):
        sections.extend((
            f"## Result {rank} — {hit.path} (L{hit.line}-{hit.end_line}, score: {hit.score:.4f})",
            f"```\n{hit.content}\n```",
            "",
        ))
    return "\n".join(sections)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Indexing tools
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@mcp.tool()
def index_project(
    project_path: str, glob_pattern: str = "**/*", force: bool = False
) -> str:
    """Build or rebuild the search index for a project.

    Args:
        project_path: Absolute path to the project root directory.
        glob_pattern: Glob pattern for files to index (default "**/*").
        force: If True, rebuild index from scratch even if it exists.

    Returns:
        Indexing summary with file count, chunk count, and duration.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    if force:
        # NOTE(review): this only evicts the cached pipeline objects so they
        # are re-created below; nothing here deletes the on-disk index.
        # Confirm that this is sufficient for a "from scratch" rebuild.
        with _lock:
            _pipelines.pop(str(root), None)

    indexing, _, _ = _get_pipelines(project_path)

    # The index is stored inside the project itself, so the glob would
    # otherwise hand the index's own database files to the indexer.
    index_dir = _db_path_for_project(project_path)
    excludes = DEFAULT_EXCLUDES | {index_dir.name}

    file_paths = [
        p for p in root.glob(glob_pattern)
        if p.is_file() and not should_exclude(p.relative_to(root), excludes)
    ]

    stats = indexing.sync(file_paths, root=root)
    return (
        f"Indexed {stats.files_processed} files, "
        f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s. "
        f"DB: {index_dir}"
    )
|
||||
|
||||
|
||||
@mcp.tool()
def index_status(project_path: str) -> str:
    """Show index statistics for a project.

    Args:
        project_path: Absolute path to the project root directory.

    Returns:
        Index statistics including file count, chunk count, and deleted chunks.
    """
    from codexlens_search.indexing.metadata import MetadataStore

    index_dir = _db_path_for_project(project_path)
    metadata_file = index_dir / "metadata.db"
    if not metadata_file.exists():
        return f"No index found at {index_dir}. Run index_project first."

    store = MetadataStore(metadata_file)
    tracked_files = store.get_all_files()
    removed_ids = store.get_deleted_ids()
    highest_chunk_id = store.max_chunk_id()

    # The total is derived from the highest chunk id; a negative id is
    # reported as zero chunks.
    chunk_total = 0 if highest_chunk_id < 0 else highest_chunk_id + 1
    report = [
        f"Index: {index_dir}",
        f"Files tracked: {len(tracked_files)}",
        f"Total chunks: {chunk_total}",
        f"Deleted chunks: {len(removed_ids)}",
    ]
    return "\n".join(report)
|
||||
|
||||
|
||||
@mcp.tool()
def index_update(project_path: str, glob_pattern: str = "**/*") -> str:
    """Incrementally sync the index with current project files.

    Only re-indexes files that changed since last indexing.

    Args:
        project_path: Absolute path to the project root directory.
        glob_pattern: Glob pattern for files to sync (default "**/*").

    Returns:
        Sync summary with processed file count and duration.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"

    indexing, _, _ = _get_pipelines(project_path)

    # The index is stored inside the project itself, so the glob would
    # otherwise hand the index's own database files to the indexer.
    excludes = DEFAULT_EXCLUDES | {_db_path_for_project(project_path).name}

    file_paths = [
        p for p in root.glob(glob_pattern)
        if p.is_file() and not should_exclude(p.relative_to(root), excludes)
    ]

    stats = indexing.sync(file_paths, root=root)
    return (
        f"Synced {stats.files_processed} files, "
        f"{stats.chunks_created} chunks in {stats.duration_seconds:.1f}s."
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File discovery
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@mcp.tool()
def find_files(
    project_path: str, pattern: str = "**/*", max_results: int = 100
) -> str:
    """Find files in a project by glob pattern.

    Args:
        project_path: Absolute path to the project root directory.
        pattern: Glob pattern to match files (default "**/*").
        max_results: Maximum number of file paths to return (default 100).

    Returns:
        List of matching file paths (relative to project root), one per line.
    """
    root = Path(project_path).resolve()
    if not root.is_dir():
        return f"Error: project path not found: {root}"
    if max_results < 1:
        # Previously a non-positive limit still returned one file.
        return "Error: max_results must be at least 1."

    matches: list[str] = []
    truncated = False
    for p in root.glob(pattern):
        if not p.is_file():
            continue
        relative = p.relative_to(root)
        if should_exclude(relative, DEFAULT_EXCLUDES):
            continue
        if len(matches) >= max_results:
            # One more match exists beyond the limit, so the list really is
            # truncated (as opposed to containing exactly max_results files).
            truncated = True
            break
        matches.append(str(relative))

    if not matches:
        return "No files found matching the pattern."

    header = f"Found {len(matches)} files"
    if truncated:
        header += f" (limited to {max_results})"
    return header + ":\n" + "\n".join(matches)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model management tools
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@mcp.tool()
def list_models() -> str:
    """List available embedding and reranker models with cache status.

    Shows which models are downloaded locally and ready for use.
    Models are needed when using local fastembed mode (no API URL configured).

    Returns:
        Table of models with name, type, and installed status.
    """
    from codexlens_search import model_manager

    # No project is involved here; "." only gives the config a database path.
    config = create_config_from_env(".")
    models = model_manager.list_known_models(config)

    if not models:
        return "No known models found."

    lines = ["| Model | Type | Installed |", "| --- | --- | --- |"]
    for m in models:
        status = "Yes" if m["installed"] else "No"
        lines.append(f"| {m['name']} | {m['type']} | {status} |")

    # Show which embedding mode the current environment selects.
    lines.append("")
    if config.embed_api_url:
        lines.append(f"Mode: API embedding ({config.embed_api_url})")
    else:
        lines.append(f"Mode: Local fastembed (model: {config.embed_model})")
    return "\n".join(lines)
|
||||
|
||||
|
||||
@mcp.tool()
def download_models(embed_model: str = "", reranker_model: str = "") -> str:
    """Download embedding and reranker models for local (fastembed) mode.

    Not needed when using API embedding (CODEXLENS_EMBED_API_URL is set).
    Downloads are cached — subsequent calls are no-ops if already downloaded.

    Args:
        embed_model: Embedding model name (default: BAAI/bge-small-en-v1.5).
        reranker_model: Reranker model name (default: Xenova/ms-marco-MiniLM-L-6-v2).

    Returns:
        Download status for each model.
    """
    from codexlens_search import model_manager

    # No project is involved here; "." only gives the config a database path.
    config = create_config_from_env(".")
    # Empty strings mean "keep whatever the config already selects".
    if embed_model:
        config.embed_model = embed_model
    if reranker_model:
        config.reranker_model = reranker_model

    results = []
    for name, kind in [
        (config.embed_model, "embedding"),
        (config.reranker_model, "reranker"),
    ]:
        try:
            model_manager.ensure_model(name, config)
            results.append(f"{kind}: {name} — ready")
        except Exception as e:
            # Deliberately broad: a failure for one model is reported in the
            # result text and must not prevent the other model from downloading.
            results.append(f"{kind}: {name} — failed: {e}")

    return "\n".join(results)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """Set up INFO-level logging, then run the FastMCP server."""
    log_format = "%(levelname)s %(name)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    mcp.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user