# Claude-Code-Workflow/codex-lens-v2/tests/integration/conftest.py

import hashlib
import tempfile
from pathlib import Path

import numpy as np
import pytest

from codexlens.config import Config
from codexlens.core import ANNIndex, BinaryStore
from codexlens.embed.base import BaseEmbedder
from codexlens.rerank.base import BaseReranker
from codexlens.search.fts import FTSEngine
from codexlens.search.pipeline import SearchPipeline

# Test corpus: 20 short code snippets, each a (doc_id, path, content) tuple.
# doc_id values are the consecutive integers 0..19; several snippets share a
# path. The search_pipeline fixture below embeds the `content` field of every
# tuple for the vector indexes and hands the whole list to
# FTSEngine.add_documents().
TEST_DOCS = [
    (0, "auth.py", "def authenticate(user, password): return check_hash(password, user.hash)"),
    (1, "auth.py", "def authorize(user, permission): return permission in user.roles"),
    (2, "models.py", "class User: def __init__(self, name, email): self.name = name; self.email = email"),
    (3, "models.py", "class Session: token = None; expires_at = None"),
    (4, "middleware.py", "def auth_middleware(request): token = request.headers.get('Authorization')"),
    (5, "utils.py", "def hash_password(password): import bcrypt; return bcrypt.hashpw(password)"),
    (6, "config.py", "DATABASE_URL = os.environ.get('DATABASE_URL', 'sqlite:///db.sqlite3')"),
    (7, "search.py", "def search_users(query): return User.objects.filter(name__icontains=query)"),
    (8, "api.py", "def get_user(request, user_id): user = User.objects.get(id=user_id)"),
    (9, "api.py", "def create_user(request): data = request.json(); user = User(**data)"),
    (10, "tests.py", "def test_authenticate(): assert authenticate('admin', 'pass') is not None"),
    (11, "tests.py", "def test_search(): results = search_users('alice'); assert len(results) > 0"),
    (12, "router.py", "app.route('/users', methods=['GET'])(list_users)"),
    (13, "router.py", "app.route('/login', methods=['POST'])(login_handler)"),
    (14, "db.py", "def get_connection(): return sqlite3.connect(DATABASE_URL)"),
    (15, "cache.py", "def cache_get(key): return redis_client.get(key)"),
    (16, "cache.py", "def cache_set(key, value, ttl=3600): redis_client.setex(key, ttl, value)"),
    (17, "errors.py", "class AuthError(Exception): status_code = 401"),
    (18, "errors.py", "class NotFoundError(Exception): status_code = 404"),
    (19, "validators.py", "def validate_email(email): return '@' in email and '.' in email.split('@')[1]"),
]

# Vector dimensionality used by the mock embedder and passed as `dim` to both
# BinaryStore and ANNIndex in the fixture below; kept small for fast tests.
DIM = 32


def make_stable_vec(doc_id: int, dim: int = DIM) -> np.ndarray:
    """Return a reproducible, L2-normalised float32 vector for ``doc_id``.

    Args:
        doc_id: Used directly as the seed of the NumPy random generator, so
            equal ids always produce equal vectors.
        dim: Number of components in the returned vector.

    Returns:
        A 1-D ``float32`` array of length ``dim`` scaled to unit Euclidean norm.
    """
    generator = np.random.default_rng(doc_id)
    raw = generator.standard_normal(dim).astype(np.float32)
    # Scale the standard-normal sample onto the unit sphere.
    return raw / np.linalg.norm(raw)


class MockEmbedder(BaseEmbedder):
    """Test embedder that maps each text to a reproducible unit vector.

    The vector is sampled from a NumPy generator whose seed is derived from a
    SHA-256 digest of the text, so a given text produces the same vector in
    every interpreter process. The builtin ``hash()`` is deliberately avoided:
    for ``str`` it is salted per process (see ``PYTHONHASHSEED``), which would
    make the vectors, and any ranking computed from them, vary between test
    runs.
    """

    def embed_single(self, text: str) -> np.ndarray:
        """Return the deterministic embedding of ``text``.

        Args:
            text: Arbitrary string; identical strings yield identical vectors.

        Returns:
            A 1-D ``float32`` array of length ``DIM`` with unit Euclidean norm.
        """
        # "surrogatepass" keeps this total over every ``str`` value, matching
        # the builtin hash() which also accepts lone surrogates.
        digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
        # 64 bits of the digest are plenty to keep distinct texts on distinct seeds.
        seed = int.from_bytes(digest[:8], "big")
        rng = np.random.default_rng(seed=seed)
        vec = rng.standard_normal(DIM).astype(np.float32)
        vec /= np.linalg.norm(vec)
        return vec

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed each text independently, preserving input order."""
        return [self.embed_single(t) for t in texts]

    def embed(self, texts: list[str]) -> list[np.ndarray]:
        """Alias for :meth:`embed_batch`.

        Called by SearchPipeline as ``self._embedder.embed([query])[0]``.
        """
        return self.embed_batch(texts)


class MockReranker(BaseReranker):
    """Test reranker that scores documents by keyword overlap with the query."""

    def score_pairs(self, query: str, documents: list[str]) -> list[float]:
        """Score every document against ``query``.

        Both query and documents are lower-cased and split on whitespace. A
        document's score is the number of distinct query words it contains,
        divided by the number of distinct query words (or by 1 when the query
        has no words).

        Args:
            query: The search query.
            documents: Candidate document texts.

        Returns:
            One float per document, in the same order as ``documents``.
        """
        wanted = set(query.lower().split())
        # Guard against division by zero for an empty / whitespace-only query.
        denominator = max(len(wanted), 1)
        return [
            float(len(wanted & set(text.lower().split()))) / denominator
            for text in documents
        ]


@pytest.fixture
def config():
    """Provide the ``Config.small()`` preset to tests.

    NOTE(review): the original author's note lists the preset values as
    hnsw_ef=50, hnsw_M=16, binary_top_k=50, ann_top_k=20, rerank_top_k=10;
    confirm against ``Config.small`` itself.
    """
    small_config = Config.small()
    return small_config


@pytest.fixture
def search_pipeline(tmp_path, config):
    """Assemble a SearchPipeline whose stores hold every entry of TEST_DOCS.

    All on-disk artefacts (binary store, ANN index, FTS database) are created
    under pytest's per-test ``tmp_path``. Vectors come from ``MockEmbedder``
    and reranking from ``MockReranker``.
    """
    mock_embedder = MockEmbedder()
    store = BinaryStore(tmp_path / "binary", dim=DIM, config=config)
    hnsw = ANNIndex(tmp_path / "ann.hnsw", dim=DIM, config=config)
    fts_engine = FTSEngine(tmp_path / "fts.db")
    mock_reranker = MockReranker()

    # Derive parallel id / vector arrays from the (doc_id, path, content) tuples.
    doc_ids = np.array(
        [doc_id for doc_id, _path, _content in TEST_DOCS],
        dtype=np.int64,
    )
    doc_vectors = np.array(
        [mock_embedder.embed_single(content) for _doc_id, _path, content in TEST_DOCS],
        dtype=np.float32,
    )

    # Populate both vector indexes and the full-text engine with the corpus.
    store.add(doc_ids, doc_vectors)
    hnsw.add(doc_ids, doc_vectors)
    fts_engine.add_documents(TEST_DOCS)

    pipeline = SearchPipeline(
        embedder=mock_embedder,
        binary_store=store,
        ann_index=hnsw,
        reranker=mock_reranker,
        fts=fts_engine,
        config=config,
    )
    return pipeline