"""Shared fixtures and mocks for SearchPipeline tests.

Provides a small corpus of code snippets, a deterministic mock embedder,
a keyword-overlap mock reranker, and a fixture that builds a fully indexed
``SearchPipeline`` under pytest's ``tmp_path``.
"""

import tempfile
import zlib
from pathlib import Path

import numpy as np
import pytest

from codexlens_search.config import Config
from codexlens_search.core import ANNIndex, BinaryStore
from codexlens_search.embed.base import BaseEmbedder
from codexlens_search.rerank.base import BaseReranker
from codexlens_search.search.fts import FTSEngine
from codexlens_search.search.pipeline import SearchPipeline

# Test documents: 20 code snippets with id, path, content
TEST_DOCS: list[tuple[int, str, str]] = [
    (0, "auth.py", "def authenticate(user, password): return check_hash(password, user.hash)"),
    (1, "auth.py", "def authorize(user, permission): return permission in user.roles"),
    (2, "models.py", "class User: def __init__(self, name, email): self.name = name; self.email = email"),
    (3, "models.py", "class Session: token = None; expires_at = None"),
    (4, "middleware.py", "def auth_middleware(request): token = request.headers.get('Authorization')"),
    (5, "utils.py", "def hash_password(password): import bcrypt; return bcrypt.hashpw(password)"),
    (6, "config.py", "DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///db.sqlite3')"),
    (7, "search.py", "def search_users(query): return User.objects.filter(name__icontains=query)"),
    (8, "api.py", "def get_user(request, user_id): user = User.objects.get(id=user_id)"),
    (9, "api.py", "def create_user(request): data = request.json(); user = User(**data)"),
    (10, "tests.py", "def test_authenticate(): assert authenticate('admin', 'pass') is not None"),
    (11, "tests.py", "def test_search(): results = search_users('alice'); assert len(results) > 0"),
    (12, "router.py", "app.route('/users', methods=['GET'])(list_users)"),
    (13, "router.py", "app.route('/login', methods=['POST'])(login_handler)"),
    (14, "db.py", "def get_connection(): return sqlite3.connect(DATABASE_URL)"),
    (15, "cache.py", "def cache_get(key): return redis_client.get(key)"),
    (16, "cache.py", "def cache_set(key, value, ttl=3600): redis_client.setex(key, ttl, value)"),
    (17, "errors.py", "class AuthError(Exception): status_code = 401"),
    (18, "errors.py", "class NotFoundError(Exception): status_code = 404"),
    (19, "validators.py", "def validate_email(email): return '@' in email and '.' in email.split('@')[1]"),
]

DIM = 32  # Use small dim for fast tests


def _unit_vector(seed: int, dim: int) -> np.ndarray:
    """Return a float32 vector of length *dim*, L2-normalised, drawn from *seed*."""
    rng = np.random.default_rng(seed=seed)
    vec = rng.standard_normal(dim).astype(np.float32)
    vec /= np.linalg.norm(vec)
    return vec


def make_stable_vec(doc_id: int, dim: int = DIM) -> np.ndarray:
    """Generate a deterministic float32 vector for a given doc_id."""
    return _unit_vector(doc_id, dim)


class MockEmbedder(BaseEmbedder):
    """Returns stable deterministic vectors based on content hash."""

    def embed_single(self, text: str) -> np.ndarray:
        """Embed *text* as a unit vector of length ``DIM`` seeded by its CRC32."""
        # The builtin hash() of a str is salted per interpreter process, so it
        # would yield different vectors on every test run. CRC32 of the UTF-8
        # bytes depends only on the text itself.
        seed = zlib.crc32(text.encode("utf-8"))
        return _unit_vector(seed, DIM)

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed each text independently via ``embed_single``."""
        return [self.embed_single(t) for t in texts]

    def embed(self, texts: list[str]) -> list[np.ndarray]:
        """Called by SearchPipeline as self._embedder.embed([query])[0]."""
        return self.embed_batch(texts)


class MockReranker(BaseReranker):
    """Returns score based on simple keyword overlap."""

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score each document by the fraction of query words it contains.

        Words are lower-cased, whitespace-separated tokens. An empty query
        scores 0.0 for every document (the denominator is clamped to 1).
        """
        query_words = set(query.lower().split())
        denominator = max(len(query_words), 1)  # avoid division by zero
        scores = []
        for doc in documents:
            doc_words = set(doc.lower().split())
            overlap = len(query_words & doc_words)
            scores.append(float(overlap) / denominator)
        return scores


@pytest.fixture
def config():
    """Return the small configuration preset used by these tests."""
    return Config.small()  # hnsw_ef=50, hnsw_M=16, binary_top_k=50, ann_top_k=20, rerank_top_k=10


@pytest.fixture
def search_pipeline(tmp_path, config):
    """Build a full SearchPipeline with 20 test docs indexed."""
    embedder = MockEmbedder()
    binary_store = BinaryStore(tmp_path / "binary", dim=DIM, config=config)
    ann_index = ANNIndex(tmp_path / "ann.hnsw", dim=DIM, config=config)
    fts = FTSEngine(tmp_path / "fts.db")
    reranker = MockReranker()

    # Index all test docs: the same ids/vectors go into both vector stores,
    # and the raw (id, path, content) tuples go into the full-text engine.
    ids = np.array([d[0] for d in TEST_DOCS], dtype=np.int64)
    vectors = np.array(
        [embedder.embed_single(d[2]) for d in TEST_DOCS], dtype=np.float32
    )
    binary_store.add(ids, vectors)
    ann_index.add(ids, vectors)
    fts.add_documents(TEST_DOCS)

    return SearchPipeline(
        embedder=embedder,
        binary_store=binary_store,
        ann_index=ann_index,
        reranker=reranker,
        fts=fts,
        config=config,
    )