import logging import os import sqlite3 import tempfile from pathlib import Path from unittest.mock import MagicMock import pytest from codexlens.config import ( BINARY_VECTORS_MMAP_NAME, Config, VECTORS_HNSW_NAME, VECTORS_META_DB_NAME, ) from codexlens.entities import SearchResult, Symbol import codexlens.search.chain_search as chain_search_module from codexlens.search.chain_search import ( ChainSearchEngine, ChainSearchResult, SearchOptions, SearchStats, ) from codexlens.storage.global_index import GlobalSymbolIndex from codexlens.storage.path_mapper import PathMapper from codexlens.storage.registry import RegistryStore @pytest.fixture() def temp_paths(): tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True) root = Path(tmpdir.name) yield root try: tmpdir.cleanup() except (PermissionError, OSError): pass def test_symbol_filtering_handles_path_failures(monkeypatch: pytest.MonkeyPatch, caplog, temp_paths: Path) -> None: project_root = temp_paths / "project" (project_root / "src").mkdir(parents=True, exist_ok=True) index_root = temp_paths / "indexes" mapper = PathMapper(index_root=index_root) index_db_path = mapper.source_to_index_db(project_root) index_db_path.parent.mkdir(parents=True, exist_ok=True) index_db_path.write_text("", encoding="utf-8") # existence is enough for _find_start_index registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() project_info = registry.register_project(project_root, mapper.source_to_index_dir(project_root)) global_db_path = project_info.index_root / GlobalSymbolIndex.DEFAULT_DB_NAME global_index = GlobalSymbolIndex(global_db_path, project_id=project_info.id) global_index.initialize() valid_file = project_root / "src" / "auth.py" valid_sym = Symbol(name="AuthManager", kind="class", range=(1, 2), file=str(valid_file)) bad_null = Symbol(name="BadNull", kind="class", range=(1, 2), file="bad\0path.py") bad_relative = Symbol(name="BadRelative", kind="class", range=(1, 2), file="relative/path.py") candidates = [valid_sym, bad_null, bad_relative] if os.name == "nt": root_drive, _ = os.path.splitdrive(str(project_root.resolve())) other_drive = "C:" if root_drive.lower() != "c:" else "D:" candidates.append( Symbol(name="CrossDrive", kind="class", range=(1, 2), file=f"{other_drive}\\other\\file.py") ) def fake_search(self, name: str, kind=None, limit: int = 20, prefix_mode: bool = False): return candidates monkeypatch.setattr(GlobalSymbolIndex, "search", fake_search) config = Config(data_dir=temp_paths / "data", global_symbol_index_enabled=True) engine = ChainSearchEngine(registry, mapper, config=config) engine._search_symbols_parallel = MagicMock(side_effect=AssertionError("should not traverse chain")) caplog.set_level(logging.DEBUG, logger="codexlens.search.chain_search") symbols = engine.search_symbols( "Auth", project_root, options=SearchOptions(depth=5, total_limit=10), ) assert [s.name for s in symbols] == ["AuthManager"] assert "BadNull" in caplog.text assert "BadRelative" in caplog.text if os.name == "nt": assert "CrossDrive" in caplog.text def test_cascade_search_strategy_routing(temp_paths: Path) -> None: """Test cascade_search() routes to correct strategy implementation.""" from unittest.mock import patch from codexlens.search.chain_search import ChainSearchResult, SearchStats registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data") engine = ChainSearchEngine(registry, mapper, config=config) source_path = temp_paths / "src" # Test strategy='staged' routing with patch.object(engine, "staged_cascade_search") as mock_staged: mock_staged.return_value = ChainSearchResult( query="query", results=[], symbols=[], stats=SearchStats() ) engine.cascade_search("query", source_path, strategy="staged") mock_staged.assert_called_once() # Test strategy='binary' routing with patch.object(engine, "binary_cascade_search") as mock_binary: mock_binary.return_value = ChainSearchResult( query="query", results=[], symbols=[], stats=SearchStats() ) engine.cascade_search("query", source_path, strategy="binary") mock_binary.assert_called_once() # Test strategy='binary_rerank' routing with patch.object(engine, "binary_rerank_cascade_search") as mock_br: mock_br.return_value = ChainSearchResult( query="query", results=[], symbols=[], stats=SearchStats() ) engine.cascade_search("query", source_path, strategy="binary_rerank") mock_br.assert_called_once() # Test strategy='dense_rerank' routing with patch.object(engine, "dense_rerank_cascade_search") as mock_dr: mock_dr.return_value = ChainSearchResult( query="query", results=[], symbols=[], stats=SearchStats() ) engine.cascade_search("query", source_path, strategy="dense_rerank") mock_dr.assert_called_once() # Test default routing (no strategy specified) - defaults to binary with patch.object(engine, "binary_cascade_search") as mock_default: mock_default.return_value = ChainSearchResult( query="query", results=[], symbols=[], stats=SearchStats() ) engine.cascade_search("query", source_path) mock_default.assert_called_once() def test_cascade_search_invalid_strategy(temp_paths: Path) -> None: """Test cascade_search() defaults to 'binary' for invalid strategy.""" from unittest.mock import patch from codexlens.search.chain_search import ChainSearchResult, SearchStats registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data") engine = ChainSearchEngine(registry, mapper, config=config) source_path = temp_paths / "src" # Invalid strategy should default to binary with patch.object(engine, "binary_cascade_search") as mock_binary: mock_binary.return_value = ChainSearchResult( query="query", results=[], symbols=[], stats=SearchStats() ) engine.cascade_search("query", source_path, strategy="invalid_strategy") mock_binary.assert_called_once() def test_vector_warmup_uses_embedding_config(monkeypatch: pytest.MonkeyPatch, temp_paths: Path) -> None: calls: list[dict[str, object]] = [] def fake_get_embedder(**kwargs: object) -> object: calls.append(dict(kwargs)) return object() import codexlens.semantic.factory as factory monkeypatch.setattr(factory, "get_embedder", fake_get_embedder) registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config( data_dir=temp_paths / "data", embedding_backend="fastembed", embedding_model="fast", embedding_use_gpu=False, ) engine = ChainSearchEngine(registry, mapper, config=config) monkeypatch.setattr(engine, "_get_executor", lambda _workers: MagicMock()) engine._search_parallel([], "query", SearchOptions(enable_vector=True)) assert calls == [ { "backend": "fastembed", "profile": "fast", "use_gpu": False, } ] def test_search_single_index_passes_config_to_hybrid_engine( monkeypatch: pytest.MonkeyPatch, temp_paths: Path ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_backend="fastembed", embedding_model="code") engine = ChainSearchEngine(registry, mapper, config=config) index_path = temp_paths / "indexes" / "project" / "_index.db" index_path.parent.mkdir(parents=True, exist_ok=True) index_path.write_bytes(b"\x00" * 128) captured: dict[str, object] = {} class FakeHybridSearchEngine: def __init__(self, *, weights=None, config=None): captured["weights"] = weights captured["config"] = config def search(self, *_args, **_kwargs): return [SearchResult(path="src/app.py", score=0.9, excerpt="hit")] monkeypatch.setattr(chain_search_module, "HybridSearchEngine", FakeHybridSearchEngine) results = engine._search_single_index( index_path, "auth flow", limit=5, hybrid_mode=True, enable_vector=True, hybrid_weights={"vector": 1.0}, ) assert captured["config"] is config assert captured["weights"] == {"vector": 1.0} assert len(results) == 1 assert results[0].path == "src/app.py" def test_search_parallel_reuses_shared_hybrid_engine( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: from concurrent.futures import Future registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data") engine = ChainSearchEngine(registry, mapper, config=config) index_root = temp_paths / "indexes" / "project" index_a = index_root / "src" / "_index.db" index_b = index_root / "tests" / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_bytes(b"\x00" * 128) index_b.write_bytes(b"\x00" * 128) created_engines: list[object] = [] search_calls: list[tuple[object, Path]] = [] class FakeHybridSearchEngine: def __init__(self, *, weights=None, config=None): self.weights = weights self.config = config created_engines.append(self) def search(self, index_path, *_args, **_kwargs): search_calls.append((self, index_path)) return [SearchResult(path=str(index_path), score=0.9, excerpt="hit")] class ImmediateExecutor: def submit(self, fn, *args): future: Future = Future() try: future.set_result(fn(*args)) except Exception as exc: future.set_exception(exc) return future monkeypatch.setattr(chain_search_module, "HybridSearchEngine", FakeHybridSearchEngine) monkeypatch.setattr(engine, "_get_executor", lambda _workers: ImmediateExecutor()) results, stats = engine._search_parallel( [index_a, index_b], "auth flow", SearchOptions( hybrid_mode=True, enable_vector=True, limit_per_dir=5, hybrid_weights={"vector": 1.0}, ), ) assert stats.errors == [] assert len(created_engines) == 1 assert [path for _, path in search_calls] == [index_a, index_b] assert all(shared is created_engines[0] for shared, _ in search_calls) assert len(results) == 2 def test_search_injects_feature_query_anchors_into_merge( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data") engine = ChainSearchEngine(registry, mapper, config=config) source_path = temp_paths / "project" start_index = temp_paths / "indexes" / "project" / "_index.db" start_index.parent.mkdir(parents=True, exist_ok=True) start_index.write_text("", encoding="utf-8") feature_path = str(source_path / "src" / "tools" / "smart-search.ts") platform_path = str(source_path / "src" / "utils" / "path-resolver.ts") anchor_result = SearchResult( path=feature_path, score=8.0, excerpt="smart search anchor", metadata={"feature_query_hint": "smart search"}, ) monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: start_index) monkeypatch.setattr( engine, "_collect_index_paths", lambda _start_index, _options: [start_index], ) monkeypatch.setattr( engine, "_search_parallel", lambda *_args, **_kwargs: ( [ SearchResult( path=platform_path, score=0.9, excerpt="platform hit", ) ], SearchStats(), ), ) monkeypatch.setattr(engine, "_search_symbols_parallel", lambda *_args, **_kwargs: []) collected_queries: list[str] = [] monkeypatch.setattr( engine, "_collect_query_feature_anchor_results", lambda query, *_args, **_kwargs: ( collected_queries.append(query), [anchor_result], )[1], ) result = engine.search( "parse CodexLens JSON output strip ANSI smart_search", source_path, options=SearchOptions( total_limit=5, hybrid_mode=True, enable_fuzzy=False, enable_vector=True, ), ) assert collected_queries == ["parse CodexLens JSON output strip ANSI smart_search"] result_by_path = {item.path: item for item in result.results} assert feature_path in result_by_path assert platform_path in result_by_path assert result_by_path[feature_path].metadata["feature_query_anchor"] is True assert result_by_path[feature_path].metadata["feature_query_hint"] == "smart search" def test_group_index_paths_by_dense_root(temp_paths: Path) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=temp_paths / "data")) dense_root_a = temp_paths / "indexes" / "project-a" dense_root_b = temp_paths / "indexes" / "project-b" orphan_root = temp_paths / "indexes" / "orphan" / "pkg" dense_root_a.mkdir(parents=True, exist_ok=True) dense_root_b.mkdir(parents=True, exist_ok=True) orphan_root.mkdir(parents=True, exist_ok=True) (dense_root_a / VECTORS_HNSW_NAME).write_bytes(b"a") (dense_root_b / VECTORS_HNSW_NAME).write_bytes(b"b") index_a = dense_root_a / "src" / "_index.db" index_b = dense_root_b / "tests" / "_index.db" orphan_index = orphan_root / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_text("", encoding="utf-8") index_b.write_text("", encoding="utf-8") orphan_index.write_text("", encoding="utf-8") roots, ungrouped = engine._group_index_paths_by_dense_root( [index_a, orphan_index, index_b] ) assert roots == [dense_root_a, dense_root_b] assert ungrouped == [orphan_index] assert engine._find_nearest_dense_hnsw_root(index_a.parent) == dense_root_a assert engine._find_nearest_dense_hnsw_root(orphan_index.parent) is None def test_stage1_binary_search_merges_multiple_centralized_roots( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import numpy as np registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) root_a = temp_paths / "indexes" / "project-a" root_b = temp_paths / "indexes" / "project-b" for root in (root_a, root_b): root.mkdir(parents=True, exist_ok=True) (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") (root / VECTORS_META_DB_NAME).write_bytes(b"meta") index_a = root_a / "src" / "_index.db" index_b = root_b / "src" / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_text("", encoding="utf-8") index_b.write_text("", encoding="utf-8") class FakeBinarySearcher: def __init__(self, root: Path) -> None: self.root = root self.backend = "fastembed" self.model = None self.model_profile = "code" def search(self, _query_dense, top_k: int): return [(1, 8)] if self.root == root_a else [(2, 16)] class FakeEmbedder: def embed_to_numpy(self, _queries): return np.ones((1, 4), dtype=np.float32) class FakeVectorMetadataStore: def __init__(self, path: Path) -> None: self.path = Path(path) def get_chunks_by_ids(self, chunk_ids): return [ { "id": chunk_id, "file_path": str(self.path.parent / f"file{chunk_id}.py"), "content": f"chunk {chunk_id}", "metadata": "{\"start_line\": 1, \"end_line\": 2}", "category": "code", } for chunk_id in chunk_ids ] import codexlens.semantic.embedder as embedder_module from codexlens.search.chain_search import SearchStats monkeypatch.setattr( engine, "_get_centralized_binary_searcher", lambda root: FakeBinarySearcher(root), ) monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) coarse_results, stage2_root = engine._stage1_binary_search( "binary query", [index_a, index_b], coarse_k=5, stats=SearchStats(), index_root=index_a.parent, ) assert stage2_root is None assert len(coarse_results) == 2 assert {Path(result.path).name for result in coarse_results} == {"file1.py", "file2.py"} def test_stage1_binary_search_keeps_duplicate_chunk_ids_isolated_per_root( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import numpy as np registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) root_a = temp_paths / "indexes" / "project-a" root_b = temp_paths / "indexes" / "project-b" for root in (root_a, root_b): root.mkdir(parents=True, exist_ok=True) (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") (root / VECTORS_META_DB_NAME).write_bytes(b"meta") index_a = root_a / "src" / "_index.db" index_b = root_b / "src" / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_text("", encoding="utf-8") index_b.write_text("", encoding="utf-8") class FakeBinarySearcher: def __init__(self, root: Path) -> None: self.root = root self.backend = "fastembed" self.model = None self.model_profile = "code" def search(self, _query_dense, top_k: int): return [(1, 8)] if self.root == root_a else [(1, 16)] class FakeEmbedder: def embed_to_numpy(self, _queries): return np.ones((1, 4), dtype=np.float32) class FakeVectorMetadataStore: def __init__(self, path: Path) -> None: self.path = Path(path) def get_chunks_by_ids(self, chunk_ids): return [ { "id": chunk_id, "file_path": str(self.path.parent / f"{self.path.parent.name}-file{chunk_id}.py"), "content": f"chunk {self.path.parent.name}-{chunk_id}", "metadata": "{\"start_line\": 1, \"end_line\": 2}", "category": "code", } for chunk_id in chunk_ids ] import codexlens.semantic.embedder as embedder_module from codexlens.search.chain_search import SearchStats monkeypatch.setattr( engine, "_get_centralized_binary_searcher", lambda root: FakeBinarySearcher(root), ) monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) coarse_results, stage2_root = engine._stage1_binary_search( "binary query", [index_a, index_b], coarse_k=5, stats=SearchStats(), index_root=index_a.parent, ) assert stage2_root is None scores_by_name = {Path(result.path).name: result.score for result in coarse_results} assert scores_by_name["project-a-file1.py"] == pytest.approx(1.0 - (8.0 / 256.0)) assert scores_by_name["project-b-file1.py"] == pytest.approx(1.0 - (16.0 / 256.0)) def test_collect_index_paths_includes_nested_registered_project_roots( temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=temp_paths / "data")) workspace_root = temp_paths / "workspace" child_root = workspace_root / "packages" / "child" ignored_root = workspace_root / "dist" / "generated" workspace_index = mapper.source_to_index_db(workspace_root) child_index = mapper.source_to_index_db(child_root) ignored_index = mapper.source_to_index_db(ignored_root) for index_path in (workspace_index, child_index, ignored_index): index_path.parent.mkdir(parents=True, exist_ok=True) index_path.write_text("", encoding="utf-8") workspace_project = registry.register_project( workspace_root, mapper.source_to_index_dir(workspace_root), ) child_project = registry.register_project( child_root, mapper.source_to_index_dir(child_root), ) ignored_project = registry.register_project( ignored_root, mapper.source_to_index_dir(ignored_root), ) registry.register_dir( workspace_project.id, workspace_root, workspace_index, depth=0, ) registry.register_dir( child_project.id, child_root, child_index, depth=0, ) registry.register_dir( ignored_project.id, ignored_root, ignored_index, depth=0, ) collected = engine._collect_index_paths(workspace_index, depth=-1) assert collected == [workspace_index, child_index] def test_collect_index_paths_respects_depth_for_nested_registered_project_roots( temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=temp_paths / "data")) workspace_root = temp_paths / "workspace" direct_child_root = workspace_root / "apps" deep_child_root = workspace_root / "packages" / "deep" / "child" workspace_index = mapper.source_to_index_db(workspace_root) direct_child_index = mapper.source_to_index_db(direct_child_root) deep_child_index = mapper.source_to_index_db(deep_child_root) for index_path in (workspace_index, direct_child_index, deep_child_index): index_path.parent.mkdir(parents=True, exist_ok=True) index_path.write_text("", encoding="utf-8") workspace_project = registry.register_project( workspace_root, mapper.source_to_index_dir(workspace_root), ) direct_child_project = registry.register_project( direct_child_root, mapper.source_to_index_dir(direct_child_root), ) deep_child_project = registry.register_project( deep_child_root, mapper.source_to_index_dir(deep_child_root), ) registry.register_dir(workspace_project.id, workspace_root, workspace_index, depth=0) registry.register_dir( direct_child_project.id, direct_child_root, direct_child_index, depth=0, ) registry.register_dir( deep_child_project.id, deep_child_root, deep_child_index, depth=0, ) collected = engine._collect_index_paths(workspace_index, depth=1) assert collected == [workspace_index, direct_child_index] def test_binary_rerank_cascade_search_merges_multiple_centralized_roots( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import numpy as np registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) root_a = temp_paths / "indexes" / "project-a" root_b = temp_paths / "indexes" / "project-b" for root in (root_a, root_b): root.mkdir(parents=True, exist_ok=True) (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") (root / VECTORS_META_DB_NAME).write_bytes(b"meta") index_a = root_a / "src" / "_index.db" index_b = root_b / "src" / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_text("", encoding="utf-8") index_b.write_text("", encoding="utf-8") class FakeBinarySearcher: def __init__(self, root: Path) -> None: self.root = root self.backend = "fastembed" self.model = None self.model_profile = "code" def search(self, _query_dense, top_k: int): return [(1, 8)] if self.root == root_a else [(2, 16)] class FakeEmbedder: def embed_to_numpy(self, _queries): return np.ones((1, 4), dtype=np.float32) class FakeVectorMetadataStore: def __init__(self, path: Path) -> None: self.path = Path(path) def get_chunks_by_ids(self, chunk_ids): return [ { "chunk_id": chunk_id, "file_path": str(self.path.parent / f"file{chunk_id}.py"), "content": f"chunk {chunk_id}", "metadata": "{}", "category": "code", } for chunk_id in chunk_ids ] import codexlens.semantic.embedder as embedder_module monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_a) monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_a, index_b]) monkeypatch.setattr( engine, "_get_centralized_binary_searcher", lambda root: FakeBinarySearcher(root), ) monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) monkeypatch.setattr(engine, "_cross_encoder_rerank", lambda _query, results, top_k: results[:top_k]) monkeypatch.setattr(engine, "search", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback"))) result = engine.binary_rerank_cascade_search( "binary query", index_a.parent, k=5, coarse_k=5, ) assert len(result.results) == 2 assert {Path(item.path).name for item in result.results} == {"file1.py", "file2.py"} def test_dense_rerank_cascade_search_overfetches_and_applies_path_penalties( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import numpy as np import codexlens.semantic.ann_index as ann_index_module registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config( data_dir=temp_paths / "data", embedding_use_gpu=False, reranker_top_k=3, test_file_penalty=0.35, generated_file_penalty=0.35, ) engine = ChainSearchEngine(registry, mapper, config=config) dense_root = temp_paths / "indexes" / "project" dense_root.mkdir(parents=True, exist_ok=True) (dense_root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") meta_db_path = dense_root / VECTORS_META_DB_NAME conn = sqlite3.connect(meta_db_path) conn.execute( """ CREATE TABLE chunk_metadata ( chunk_id INTEGER PRIMARY KEY, file_path TEXT NOT NULL, content TEXT NOT NULL, start_line INTEGER, end_line INTEGER ) """ ) conn.executemany( """ INSERT INTO chunk_metadata (chunk_id, file_path, content, start_line, end_line) VALUES (?, ?, ?, ?, ?) """, [ ( 1, "project/tests/test_auth.py", "def test_auth_flow():\n pass", 1, 2, ), ( 2, "project/src/auth.py", "def auth_flow():\n return True", 1, 2, ), ( 3, "project/dist/bundle.js", "function authFlow(){return true;}", 1, 1, ), ], ) conn.commit() conn.close() index_path = dense_root / "src" / "_index.db" index_path.parent.mkdir(parents=True, exist_ok=True) index_path.write_text("", encoding="utf-8") class FakeANNIndex: def __init__(self, root: Path, dim: int) -> None: self.root = root self.dim = dim @classmethod def create_central(cls, *, index_root: Path, dim: int): return cls(index_root, dim) def load(self) -> bool: return True def count(self) -> int: return 3 def search(self, _query_dense, top_k: int): ids = [1, 2, 3][:top_k] distances = [0.01, 0.02, 0.03][:top_k] return ids, distances rerank_calls: list[int] = [] def fake_cross_encoder(_query: str, results: list[SearchResult], top_k: int): rerank_calls.append(top_k) return results[:top_k] monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_path) monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_path]) monkeypatch.setattr(engine, "_embed_dense_query", lambda *_args, **_kwargs: np.ones(4, dtype=np.float32)) monkeypatch.setattr(engine, "_cross_encoder_rerank", fake_cross_encoder) monkeypatch.setattr( engine, "search", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback")), ) monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) result = engine.dense_rerank_cascade_search( "auth", index_path.parent, k=1, coarse_k=3, ) assert rerank_calls == [3] assert len(result.results) == 1 assert result.results[0].path.endswith("src\\auth.py") or result.results[0].path.endswith("src/auth.py") assert result.results[0].metadata == {} def test_collect_query_feature_anchor_results_uses_explicit_file_hints( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) recorded_queries: list[str] = [] def fake_search(query: str, _source_path: Path, options: SearchOptions | None = None): recorded_queries.append(query) return ChainSearchResult( query=query, results=[ SearchResult( path="/repo/src/tools/smart-search.ts", score=8.7, excerpt="smart search path anchor", ), SearchResult( path="/repo/src/tools/codex-lens-lsp.ts", score=7.4, excerpt="platform term overlap", ), ], symbols=[], stats=SearchStats(), ) monkeypatch.setattr(engine, "search", fake_search) anchors = engine._collect_query_feature_anchor_results( "parse CodexLens JSON output strip ANSI smart_search", temp_paths, SearchOptions(), limit=4, ) assert recorded_queries == ["smart search"] assert [Path(result.path).name for result in anchors] == ["smart-search.ts"] assert anchors[0].metadata["feature_query_anchor"] is True assert anchors[0].metadata["feature_query_hint_tokens"] == ["smart", "search"] def test_collect_query_feature_anchor_results_falls_back_to_full_lexical_query( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) recorded_calls: list[tuple[str, bool]] = [] full_query = "EMBEDDING_BACKEND and RERANKER_BACKEND environment variables" def fake_search(query: str, _source_path: Path, options: SearchOptions | None = None): recorded_calls.append((query, bool(options.inject_feature_anchors) if options else True)) if query == full_query: return ChainSearchResult( query=query, results=[ SearchResult( path="/repo/src/codexlens/env_config.py", score=8.5, excerpt="ENV vars", ), SearchResult( path="/repo/src/codexlens/config.py", score=8.1, excerpt="backend config", ), ], symbols=[], stats=SearchStats(), ) return ChainSearchResult( query=query, results=[ SearchResult( path="/repo/src/codexlens/env_config.py", score=7.0, excerpt="hint candidate", ) ], symbols=[], stats=SearchStats(), ) monkeypatch.setattr(engine, "search", fake_search) anchors = engine._collect_query_feature_anchor_results( full_query, temp_paths, SearchOptions(), limit=2, ) assert recorded_calls == [ ("embedding backend", False), ("reranker backend", False), (full_query, False), ] assert [Path(result.path).name for result in anchors] == ["env_config.py", "config.py"] assert anchors[0].metadata["feature_query_seed_kind"] == "lexical_query" assert anchors[0].metadata["feature_query_hint"] == full_query def test_stage3_cluster_prune_preserves_feature_query_anchors(temp_paths: Path) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) config.staged_clustering_strategy = "score" engine = ChainSearchEngine(registry, mapper, config=config) anchor = SearchResult( path="/repo/src/tools/smart-search.ts", score=0.02, excerpt="parse JSON output and strip ANSI", metadata={ "feature_query_anchor": True, "feature_query_hint": "smart search", "feature_query_hint_tokens": ["smart", "search"], }, ) others = [ SearchResult( path=f"/repo/src/feature-{index}.ts", score=0.9 - (0.05 * index), excerpt="generic feature implementation", ) for index in range(6) ] clustered = engine._stage3_cluster_prune( [anchor, *others], target_count=4, query="parse CodexLens JSON output strip ANSI smart_search", ) assert len(clustered) == 4 assert any(Path(result.path).name == "smart-search.ts" for result in clustered) def test_dense_rerank_cascade_search_interleaves_mixed_embedding_groups( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import numpy as np import codexlens.semantic.ann_index as ann_index_module registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) root_a = temp_paths / "indexes" / "project-a" root_b = temp_paths / "indexes" / "project-b" for root in (root_a, root_b): root.mkdir(parents=True, exist_ok=True) (root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") for meta_db_path, rows in ( ( root_a / VECTORS_META_DB_NAME, [ (1, str(root_a / "src" / "a.py"), "def a():\n return 1", 1, 2), (3, str(root_a / "src" / "a2.py"), "def a2():\n return 2", 1, 2), ], ), ( root_b / VECTORS_META_DB_NAME, [ (2, str(root_b / "src" / "b.py"), "def b():\n return 3", 1, 2), ], ), ): conn = sqlite3.connect(meta_db_path) conn.execute( """ CREATE TABLE chunk_metadata ( chunk_id INTEGER PRIMARY KEY, file_path TEXT NOT NULL, content TEXT NOT NULL, start_line INTEGER, end_line INTEGER ) """ ) conn.executemany( """ INSERT INTO chunk_metadata (chunk_id, file_path, content, start_line, end_line) VALUES (?, ?, ?, ?, ?) """, rows, ) conn.commit() conn.close() index_a = root_a / "src" / "_index.db" index_b = root_b / "src" / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_text("", encoding="utf-8") index_b.write_text("", encoding="utf-8") class FakeANNIndex: def __init__(self, index_path: Path, dim: int) -> None: source = Path(index_path) self.root = source if source.name != "_index.db" else source.parent self.dim = dim @classmethod def create_central(cls, *, index_root: Path, dim: int): return cls(index_root, dim) def load(self) -> bool: return True def count(self) -> int: return 2 if self.root == root_a else 1 def search(self, _query_dense, top_k: int): if self.root == root_a: return [1, 3][:top_k], [0.01, 0.011][:top_k] return [2][:top_k], [0.02][:top_k] monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_a) monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_a, index_b]) monkeypatch.setattr( engine, "_resolve_dense_embedding_settings", lambda *, index_root: ( ("fastembed", "code", False) if Path(index_root) == root_a else ("litellm", "qwen3-embedding-sf", False) ), ) monkeypatch.setattr( engine, "_embed_dense_query", lambda _query, *, index_root=None, query_cache=None: ( np.ones(4, dtype=np.float32) if Path(index_root) == root_a else np.ones(8, dtype=np.float32) ), ) monkeypatch.setattr(engine, "_cross_encoder_rerank", lambda _query, results, top_k: results[:top_k]) monkeypatch.setattr( engine, "search", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback")), ) monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) result = engine.dense_rerank_cascade_search( "route query", index_a.parent, k=2, coarse_k=2, ) assert [Path(item.path).name for item in result.results] == ["a.py", "b.py"] def test_dense_rerank_cascade_search_reuses_cached_dense_indexes( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import numpy as np import codexlens.semantic.ann_index as ann_index_module registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) dense_root = temp_paths / "indexes" / "project" dense_root.mkdir(parents=True, exist_ok=True) (dense_root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") meta_db_path = dense_root / VECTORS_META_DB_NAME conn = sqlite3.connect(meta_db_path) conn.execute( """ CREATE TABLE chunk_metadata ( chunk_id INTEGER PRIMARY KEY, file_path TEXT NOT NULL, content TEXT NOT NULL, start_line INTEGER, end_line INTEGER ) """ ) conn.execute( "INSERT INTO chunk_metadata (chunk_id, file_path, content, start_line, end_line) VALUES (?, ?, ?, ?, ?)", (1, str((temp_paths / "src" / "impl.py").resolve()), "def impl():\n return 1", 1, 2), ) conn.commit() conn.close() index_path = dense_root / "src" / "_index.db" index_path.parent.mkdir(parents=True, exist_ok=True) index_path.write_text("", encoding="utf-8") create_calls: list[tuple[Path, int]] = [] class FakeANNIndex: def __init__(self, root: Path, dim: int) -> None: self.root = root self.dim = dim @classmethod def create_central(cls, *, index_root: Path, dim: int): create_calls.append((Path(index_root), int(dim))) return cls(index_root, dim) def load(self) -> bool: return True def count(self) -> int: return 1 def search(self, _query_dense, top_k: int): return [1][:top_k], [0.01][:top_k] monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_path) monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_path]) monkeypatch.setattr(engine, "_embed_dense_query", lambda *_args, **_kwargs: np.ones(4, dtype=np.float32)) monkeypatch.setattr(engine, "_cross_encoder_rerank", lambda _query, results, top_k: results[:top_k]) monkeypatch.setattr( engine, "search", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback")), ) monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) first = engine.dense_rerank_cascade_search("route query", index_path.parent, k=1, coarse_k=1) second = engine.dense_rerank_cascade_search("route query", index_path.parent, k=1, coarse_k=1) assert len(first.results) == 1 assert len(second.results) == 1 assert create_calls == [(dense_root, 4)] def test_dense_rerank_cascade_search_short_circuits_lexical_priority_queries( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data") engine = ChainSearchEngine(registry, mapper, config=config) expected = ChainSearchResult( query="embedding backend fastembed local litellm api config", results=[SearchResult(path="src/config.py", score=0.9, excerpt="embedding_backend = ...")], symbols=[], stats=SearchStats(dirs_searched=3, files_matched=1, time_ms=12.5), ) search_calls: list[tuple[str, Path, SearchOptions | None]] = [] def fake_search(query: str, source_path: Path, options: SearchOptions | None = None): search_calls.append((query, source_path, options)) return expected monkeypatch.setattr(engine, "search", fake_search) monkeypatch.setattr( engine, "_find_start_index", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("dense path should not run")), ) monkeypatch.setattr( engine, "_embed_dense_query", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("dense query should not run")), ) monkeypatch.setattr( engine, "_cross_encoder_rerank", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("rerank should not run")), ) options = SearchOptions( depth=2, max_workers=3, limit_per_dir=4, total_limit=7, include_symbols=True, files_only=False, code_only=True, exclude_extensions=["md"], inject_feature_anchors=False, ) result = engine.dense_rerank_cascade_search( "embedding backend fastembed local litellm api config", temp_paths / "workspace", k=5, coarse_k=50, options=options, ) assert result is not expected assert result.results == expected.results assert result.related_results == expected.related_results assert result.symbols == [] assert result.stats == expected.stats assert len(search_calls) == 1 called_query, called_source_path, lexical_options = search_calls[0] assert called_query == "embedding backend fastembed local litellm api config" assert called_source_path == temp_paths / "workspace" assert lexical_options is not None assert lexical_options.depth == 2 assert lexical_options.max_workers == 3 assert lexical_options.limit_per_dir == 10 assert lexical_options.total_limit == 20 assert lexical_options.include_symbols is False assert lexical_options.enable_vector is False assert lexical_options.hybrid_mode is False assert lexical_options.enable_cascade is False assert lexical_options.code_only is True assert lexical_options.exclude_extensions == ["md"] assert lexical_options.inject_feature_anchors is False def test_cross_encoder_rerank_reuses_cached_reranker_instance( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config( data_dir=temp_paths / "data", enable_cross_encoder_rerank=True, reranker_backend="onnx", reranker_use_gpu=False, ) engine = ChainSearchEngine(registry, mapper, config=config) calls: dict[str, object] = {"check": [], "get": []} class DummyReranker: def score_pairs(self, pairs, batch_size=32): _ = batch_size return [1.0 for _ in pairs] def fake_check_reranker_available(backend: str): calls["check"].append(backend) return True, None def fake_get_reranker(*, backend: str, model_name=None, device=None, **kwargs): calls["get"].append( { "backend": backend, "model_name": model_name, "device": device, "kwargs": kwargs, } ) return DummyReranker() monkeypatch.setattr( "codexlens.semantic.reranker.check_reranker_available", fake_check_reranker_available, ) monkeypatch.setattr( "codexlens.semantic.reranker.get_reranker", fake_get_reranker, ) results = [ SearchResult(path=str((temp_paths / f"file_{idx}.py").resolve()), score=1.0 / (idx + 1), excerpt=f"def fn_{idx}(): pass") for idx in range(3) ] first = engine._cross_encoder_rerank("find function", results, top_k=2) second = engine._cross_encoder_rerank("find function", results, top_k=2) assert len(first) == len(second) == len(results) assert calls["check"] == ["onnx"] assert len(calls["get"]) == 1 get_call = calls["get"][0] assert isinstance(get_call, dict) assert get_call["backend"] == "onnx" assert get_call["kwargs"]["use_gpu"] is False def test_collect_binary_coarse_candidates_interleaves_mixed_dense_fallback_groups( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import numpy as np import codexlens.semantic.ann_index as ann_index_module registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) root_a = temp_paths / "indexes" / "project-a" root_b = temp_paths / "indexes" / "project-b" for root in (root_a, root_b): root.mkdir(parents=True, exist_ok=True) (root / VECTORS_HNSW_NAME).write_bytes(b"hnsw") index_a = root_a / "src" / "_index.db" index_b = root_b / "src" / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_text("", encoding="utf-8") index_b.write_text("", encoding="utf-8") class FakeANNIndex: def __init__(self, index_path: Path, dim: int) -> None: source = Path(index_path) self.root = source if source.name != "_index.db" else source.parent self.dim = dim @classmethod def create_central(cls, *, index_root: Path, dim: int): return cls(index_root, dim) def load(self) -> bool: return True def count(self) -> int: return 2 if self.root == root_a else 1 def search(self, _query_dense, top_k: int): if self.root == root_a: return [1, 3][:top_k], [0.01, 0.011][:top_k] return [2][:top_k], [0.02][:top_k] monkeypatch.setattr( engine, "_resolve_dense_embedding_settings", lambda *, index_root: ( ("fastembed", "code", False) if Path(index_root) == root_a else ("litellm", "qwen3-embedding-sf", False) ), ) monkeypatch.setattr( engine, "_embed_dense_query", lambda _query, *, index_root=None, query_cache=None: ( np.ones(4, dtype=np.float32) if Path(index_root) == root_a else np.ones(8, dtype=np.float32) ), ) monkeypatch.setattr(ann_index_module, "ANNIndex", FakeANNIndex) coarse_candidates, used_centralized, using_dense_fallback, stage2_index_root = ( engine._collect_binary_coarse_candidates( "route query", [index_a, index_b], coarse_k=2, stats=SearchStats(), index_root=index_a.parent, allow_dense_fallback=True, ) ) assert used_centralized is False assert using_dense_fallback is True assert stage2_index_root is None assert coarse_candidates == [ (1, 0.01, root_a), (2, 0.02, root_b), ] def test_cross_encoder_rerank_deduplicates_duplicate_paths_before_reranking( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) captured: dict[str, object] = {} monkeypatch.setattr( "codexlens.semantic.reranker.check_reranker_available", lambda _backend: (True, None), ) monkeypatch.setattr( "codexlens.semantic.reranker.get_reranker", lambda **_kwargs: object(), ) def fake_cross_encoder_rerank( *, query: str, results: list[SearchResult], reranker, top_k: int = 50, batch_size: int = 32, chunk_type_weights=None, test_file_penalty: float = 0.0, ) -> list[SearchResult]: captured["query"] = query captured["paths"] = [item.path for item in results] captured["scores"] = [float(item.score) for item in results] captured["top_k"] = top_k captured["batch_size"] = batch_size captured["chunk_type_weights"] = chunk_type_weights captured["test_file_penalty"] = test_file_penalty _ = reranker return results[:top_k] monkeypatch.setattr( "codexlens.search.ranking.cross_encoder_rerank", fake_cross_encoder_rerank, ) reranked = engine._cross_encoder_rerank( "semantic auth query", [ SearchResult(path="/repo/src/router.py", score=0.91, excerpt="chunk 1"), SearchResult(path="/repo/src/router.py", score=0.42, excerpt="chunk 2"), SearchResult(path="/repo/src/config.py", score=0.73, excerpt="chunk 3"), ], top_k=5, ) assert captured["query"] == "semantic auth query" assert captured["paths"] == ["/repo/src/router.py", "/repo/src/config.py"] assert captured["scores"] == pytest.approx([0.91, 0.73]) assert captured["top_k"] == 5 assert len(reranked) == 2 def test_binary_cascade_search_merges_multiple_centralized_roots( monkeypatch: pytest.MonkeyPatch, temp_paths: Path, ) -> None: import sqlite3 import numpy as np registry = RegistryStore(db_path=temp_paths / "registry.db") registry.initialize() mapper = PathMapper(index_root=temp_paths / "indexes") config = Config(data_dir=temp_paths / "data", embedding_use_gpu=False) engine = ChainSearchEngine(registry, mapper, config=config) root_a = temp_paths / "indexes" / "project-a" root_b = temp_paths / "indexes" / "project-b" source_db_a = root_a / "source-a.db" source_db_b = root_b / "source-b.db" for root, source_db, chunk_id in ((root_a, source_db_a, 1), (root_b, source_db_b, 2)): root.mkdir(parents=True, exist_ok=True) (root / BINARY_VECTORS_MMAP_NAME).write_bytes(b"binary") (root / VECTORS_META_DB_NAME).write_bytes(b"meta") conn = sqlite3.connect(source_db) conn.execute("CREATE TABLE semantic_chunks (id INTEGER PRIMARY KEY, embedding_dense BLOB)") conn.execute( "INSERT INTO semantic_chunks (id, embedding_dense) VALUES (?, ?)", (chunk_id, np.ones(4, dtype=np.float32).tobytes()), ) conn.commit() conn.close() index_a = root_a / "src" / "_index.db" index_b = root_b / "src" / "_index.db" index_a.parent.mkdir(parents=True, exist_ok=True) index_b.parent.mkdir(parents=True, exist_ok=True) index_a.write_text("", encoding="utf-8") index_b.write_text("", encoding="utf-8") class FakeBinarySearcher: def __init__(self, root: Path) -> None: self.root = root self.backend = "fastembed" self.model = None self.model_profile = "code" def search(self, _query_dense, top_k: int): return [(1, 8)] if self.root == root_a else [(2, 16)] class FakeEmbedder: def embed_to_numpy(self, _queries): return np.ones((1, 4), dtype=np.float32) class FakeVectorMetadataStore: def __init__(self, path: Path) -> None: self.path = Path(path) def get_chunks_by_ids(self, chunk_ids): source_db = source_db_a if self.path.parent == root_a else source_db_b return [ { "chunk_id": chunk_id, "file_path": str(self.path.parent / f"file{chunk_id}.py"), "content": f"chunk {chunk_id}", "source_index_db": str(source_db), } for chunk_id in chunk_ids ] import codexlens.semantic.embedder as embedder_module monkeypatch.setattr(engine, "_find_start_index", lambda _source_path: index_a) monkeypatch.setattr(engine, "_collect_index_paths", lambda _start_index, _depth: [index_a, index_b]) monkeypatch.setattr( engine, "_get_centralized_binary_searcher", lambda root: FakeBinarySearcher(root), ) monkeypatch.setattr(embedder_module, "get_embedder", lambda **_kwargs: FakeEmbedder()) monkeypatch.setattr(chain_search_module, "VectorMetadataStore", FakeVectorMetadataStore) monkeypatch.setattr( engine, "_embed_dense_query", lambda _query, *, index_root=None, query_cache=None: np.ones(4, dtype=np.float32), ) monkeypatch.setattr(engine, "search", lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("unexpected fallback"))) result = engine.binary_cascade_search( "binary query", index_a.parent, k=5, coarse_k=5, ) assert len(result.results) == 2 assert {Path(item.path).name for item in result.results} == {"file1.py", "file2.py"}