mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-18 18:48:48 +08:00
Rename the v2 search engine package to `codexlens-search` (import as `codexlens_search`) so it can be installed independently and consumed by the original codex-lens as a dependency. This avoids package path conflicts, since both packages previously used `src/codexlens/`.

Changes:
- Rename src/codexlens/ -> src/codexlens_search/
- Update pyproject.toml: name=codexlens-search, version=0.2.0
- Update all imports across source, tests, and scripts
- Add public API exports in __init__.py (Config, SearchPipeline, IndexingPipeline, SearchResult, IndexStats)

37/37 tests pass. No functional changes.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
109 lines
4.6 KiB
Python
109 lines
4.6 KiB
Python
import tempfile
import zlib
from pathlib import Path

import numpy as np
import pytest

from codexlens_search.config import Config
from codexlens_search.core import ANNIndex, BinaryStore
from codexlens_search.embed.base import BaseEmbedder
from codexlens_search.rerank.base import BaseReranker
from codexlens_search.search.fts import FTSEngine
from codexlens_search.search.pipeline import SearchPipeline
|
|
|
|
# Shared corpus for the tests: 20 short code snippets, one tuple per document
# laid out as (doc_id, file path, snippet text).
TEST_DOCS: list[tuple[int, str, str]] = [
    (0, "auth.py", "def authenticate(user, password): return check_hash(password, user.hash)"),
    (1, "auth.py", "def authorize(user, permission): return permission in user.roles"),
    (2, "models.py", "class User: def __init__(self, name, email): self.name = name; self.email = email"),
    (3, "models.py", "class Session: token = None; expires_at = None"),
    (4, "middleware.py", "def auth_middleware(request): token = request.headers.get('Authorization')"),
    (5, "utils.py", "def hash_password(password): import bcrypt; return bcrypt.hashpw(password)"),
    (6, "config.py", "DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///db.sqlite3')"),
    (7, "search.py", "def search_users(query): return User.objects.filter(name__icontains=query)"),
    (8, "api.py", "def get_user(request, user_id): user = User.objects.get(id=user_id)"),
    (9, "api.py", "def create_user(request): data = request.json(); user = User(**data)"),
    (10, "tests.py", "def test_authenticate(): assert authenticate('admin', 'pass') is not None"),
    (11, "tests.py", "def test_search(): results = search_users('alice'); assert len(results) > 0"),
    (12, "router.py", "app.route('/users', methods=['GET'])(list_users)"),
    (13, "router.py", "app.route('/login', methods=['POST'])(login_handler)"),
    (14, "db.py", "def get_connection(): return sqlite3.connect(DATABASE_URL)"),
    (15, "cache.py", "def cache_get(key): return redis_client.get(key)"),
    (16, "cache.py", "def cache_set(key, value, ttl=3600): redis_client.setex(key, ttl, value)"),
    (17, "errors.py", "class AuthError(Exception): status_code = 401"),
    (18, "errors.py", "class NotFoundError(Exception): status_code = 404"),
    (19, "validators.py", "def validate_email(email): return '@' in email and '.' in email.split('@')[1]"),
]
|
|
|
|
DIM = 32  # Use small dim for fast tests


def make_stable_vec(doc_id: int, dim: int = DIM) -> np.ndarray:
    """Return a reproducible unit-length float32 vector for ``doc_id``.

    A NumPy generator seeded with ``doc_id`` draws ``dim`` standard-normal
    samples, which are cast to float32 and scaled to L2 norm 1. The same
    ``doc_id`` and ``dim`` therefore always give the same vector.
    """
    generator = np.random.default_rng(seed=doc_id)
    raw = generator.standard_normal(dim).astype(np.float32)
    return raw / np.linalg.norm(raw)
|
|
|
|
|
|
class MockEmbedder(BaseEmbedder):
    """Embedder test double that maps each text to a fixed unit-length vector.

    The vector is drawn from a NumPy generator seeded with the CRC-32 checksum
    of the text, so the same text yields the same vector both within one
    process and across separate test runs. The built-in ``hash()`` is avoided
    on purpose: ``str`` hashes are salted per interpreter process (see
    ``PYTHONHASHSEED``), which would make the vectors change from run to run.
    """

    def embed_single(self, text: str) -> np.ndarray:
        """Return a deterministic, L2-normalised float32 vector of length ``DIM``."""
        # surrogatepass keeps this total over every str, as hash() was.
        seed = zlib.crc32(text.encode("utf-8", "surrogatepass"))
        rng = np.random.default_rng(seed=seed)
        vec = rng.standard_normal(DIM).astype(np.float32)
        vec /= np.linalg.norm(vec)
        return vec

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed each text in ``texts`` independently, preserving order."""
        return [self.embed_single(t) for t in texts]

    def embed(self, texts: list[str]) -> list[np.ndarray]:
        """Called by SearchPipeline as self._embedder.embed([query])[0]."""
        return self.embed_batch(texts)
|
|
|
|
|
|
class MockReranker(BaseReranker):
    """Reranker test double that scores documents by shared words."""

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Return, for each document, the fraction of query words it contains.

        Query and documents are lower-cased and split on whitespace. The
        denominator is clamped to 1, so an empty query scores 0.0 for every
        document instead of dividing by zero.
        """
        wanted = set(query.lower().split())
        denominator = max(len(wanted), 1)
        return [
            float(len(wanted & set(text.lower().split()))) / denominator
            for text in documents
        ]
|
|
|
|
|
|
@pytest.fixture
def config():
    """Provide the preset returned by ``Config.small()``."""
    # Preset values as noted by the original author:
    # hnsw_ef=50, hnsw_M=16, binary_top_k=50, ann_top_k=20, rerank_top_k=10
    small_config = Config.small()
    return small_config
|
|
|
|
|
|
@pytest.fixture
def search_pipeline(tmp_path, config):
    """Return a SearchPipeline with every TEST_DOCS entry already indexed.

    All on-disk stores are created under pytest's ``tmp_path``. Each
    document's snippet text is embedded with ``MockEmbedder``; the resulting
    vectors go into both the binary store and the ANN index, and the raw
    document tuples go into the FTS engine.
    """
    mock_embedder = MockEmbedder()
    store = BinaryStore(tmp_path / "binary", dim=DIM, config=config)
    ann = ANNIndex(tmp_path / "ann.hnsw", dim=DIM, config=config)
    fts_engine = FTSEngine(tmp_path / "fts.db")
    mock_reranker = MockReranker()

    # Split the corpus into columns; the path column is not needed here.
    id_column, _path_column, content_column = zip(*TEST_DOCS)
    doc_ids = np.array(id_column, dtype=np.int64)
    doc_vectors = np.array(
        [mock_embedder.embed_single(content) for content in content_column],
        dtype=np.float32,
    )

    store.add(doc_ids, doc_vectors)
    ann.add(doc_ids, doc_vectors)
    fts_engine.add_documents(TEST_DOCS)

    pipeline = SearchPipeline(
        embedder=mock_embedder,
        binary_store=store,
        ann_index=ann,
        reranker=mock_reranker,
        fts=fts_engine,
        config=config,
    )
    return pipeline
|