Add tests and implement functionality for staged cascade search and LSP expansion

- Introduced a new JSON file for verbose output of the Codex Lens search results.
- Added unit tests for binary search functionality in `test_stage1_binary_search_uses_chunk_lines.py`.
- Implemented regression tests for staged cascade Stage 2 expansion depth in `test_staged_cascade_lsp_depth.py`.
- Created unit tests for staged cascade Stage 2 realtime LSP graph expansion in `test_staged_cascade_realtime_lsp.py`.
- Enhanced the ChainSearchEngine to respect configuration settings for staged LSP depth and improve search accuracy.
This commit is contained in:
catlog22
2026-02-08 21:54:42 +08:00
parent 166211dcd4
commit b9b2932f50
20 changed files with 1882 additions and 283 deletions

View File

@@ -0,0 +1,65 @@
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock, patch
from codexlens.config import VECTORS_META_DB_NAME, Config
from codexlens.search.chain_search import ChainSearchEngine, SearchStats
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
def test_stage1_binary_search_prefers_chunk_start_line(tmp_path: Path) -> None:
    """Stage 1 results must carry the matched chunk's own start/end lines."""
    registry = RegistryStore(db_path=tmp_path / "registry.db")
    registry.initialize()
    mapper = PathMapper(index_root=tmp_path / "indexes")
    engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=tmp_path / "data"))
    try:
        # Minimal on-disk layout: an index db plus an (empty) vectors meta db.
        index_root = tmp_path / "fake_index_root"
        index_root.mkdir(parents=True, exist_ok=True)
        index_db = index_root / "_index.db"
        index_db.write_text("", encoding="utf-8")
        (index_root / VECTORS_META_DB_NAME).write_text("", encoding="utf-8")

        class _StubBinarySearcher:
            """Always reports chunk 123 at a fixed distance."""

            def search(self, query_dense, top_k: int):
                del query_dense, top_k  # unused by the stub
                return [(123, 10)]

        class _StubEmbedder:
            """Produces a trivial one-dimensional embedding."""

            def embed_to_numpy(self, texts):
                del texts  # unused by the stub
                return [[0.0]]

        meta_store = MagicMock()
        meta_store.get_chunks_by_ids.return_value = [
            {
                "chunk_id": 123,
                "file_path": str(tmp_path / "a.py"),
                "content": "def a():\n return 1\n",
                "start_line": 12,
                "end_line": 14,
                "metadata": {},
                "category": "code",
            }
        ]

        searcher_patch = patch.object(
            engine, "_get_centralized_binary_searcher", return_value=_StubBinarySearcher()
        )
        store_patch = patch(
            "codexlens.search.chain_search.VectorMetadataStore", return_value=meta_store
        )
        embedder_patch = patch(
            "codexlens.semantic.embedder.Embedder", return_value=_StubEmbedder()
        )
        with searcher_patch, store_patch, embedder_patch:
            coarse_results, returned_root = engine._stage1_binary_search(
                "a", [index_db], coarse_k=1, stats=SearchStats()
            )

        assert returned_root == index_root
        assert len(coarse_results) == 1
        # Lines come from the chunk metadata, not from any re-derived position.
        assert coarse_results[0].start_line == 12
        assert coarse_results[0].end_line == 14
    finally:
        engine.close()

View File

@@ -0,0 +1,168 @@
"""Regression tests for staged cascade Stage 2 expansion depth.
Staged cascade is documented as:
coarse (binary) → LSP/graph expansion → clustering → optional rerank
This test ensures Stage 2 respects Config.staged_lsp_depth (not unrelated
graph_expansion_depth settings).
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
from codexlens.config import Config
from codexlens.entities import CodeRelationship, RelationshipType, SearchResult, Symbol
from codexlens.search.chain_search import ChainSearchEngine
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.index_tree import _compute_graph_neighbors
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
@pytest.fixture()
def temp_paths() -> Path:
    """Yield a temporary root directory, tolerating teardown failures.

    ``ignore_cleanup_errors=True`` plus the explicit ``except OSError`` keeps
    teardown from failing when files are still held open (common on Windows
    while SQLite handles linger).
    """
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    root = Path(tmpdir.name)
    yield root
    try:
        tmpdir.cleanup()
    except OSError:
        # PermissionError is a subclass of OSError, so one clause covers both.
        pass
def _create_index_with_neighbors(root: Path) -> tuple[PathMapper, Path, Path, str]:
    """Build a tiny indexed project whose call graph is a -> b -> c.

    Returns the mapper, the project root, the path of ``a.py``, and its content.
    """
    project_root = root / "project"
    project_root.mkdir(parents=True, exist_ok=True)
    mapper = PathMapper(index_root=root / "indexes")
    index_db_path = mapper.source_to_index_db(project_root)
    index_db_path.parent.mkdir(parents=True, exist_ok=True)

    # Use 3 files so staged_cascade_search's final "deduplicate by path" step
    # doesn't collapse all expanded symbols into a single file result.
    content_a = "\n".join(["def a():", " b()", ""])
    content_b = "\n".join(["def b():", " c()", ""])
    content_c = "\n".join(["def c():", " return 1", ""])
    file_a = project_root / "a.py"
    file_b = project_root / "b.py"
    file_c = project_root / "c.py"
    for path, text in ((file_a, content_a), (file_b, content_b), (file_c, content_c)):
        path.write_text(text, encoding="utf-8")

    def _call(src: str, dst: str, src_file: Path, dst_file: Path) -> CodeRelationship:
        """One CALL edge from *src* (line 2 of its file) to *dst*."""
        return CodeRelationship(
            source_symbol=src,
            target_symbol=dst,
            relationship_type=RelationshipType.CALL,
            source_file=str(src_file),
            target_file=str(dst_file),
            source_line=2,
        )

    store = DirIndexStore(index_db_path, config=Config(data_dir=root / "data"))
    store.initialize()
    # Insertion order matters for determinism: a, then b, then c.
    entries = [
        (file_a, content_a, "a", [_call("a", "b", file_a, file_b)]),
        (file_b, content_b, "b", [_call("b", "c", file_b, file_c)]),
        (file_c, content_c, "c", []),
    ]
    for path, text, sym, rels in entries:
        store.add_file(
            name=path.name,
            full_path=path,
            content=text,
            language="python",
            symbols=[Symbol(name=sym, kind="function", range=(1, 2), file=str(path))],
            relationships=rels,
        )
    _compute_graph_neighbors(store)
    store.close()
    return mapper, project_root, file_a, content_a
def test_staged_cascade_stage2_uses_staged_lsp_depth(temp_paths: Path) -> None:
    """With staged_lsp_depth=1, Stage 2 expansion stops at 1-hop neighbors."""
    mapper, project_root, file_path, content = _create_index_with_neighbors(temp_paths)
    index_db_path = mapper.source_to_index_db(project_root)
    registry = RegistryStore(db_path=temp_paths / "registry.db")
    registry.initialize()
    # Intentionally conflicting depths: staged_lsp_depth should win for staged cascade.
    config = Config(
        data_dir=temp_paths / "data",
        staged_lsp_depth=1,
        graph_expansion_depth=2,
        enable_staged_rerank=False,
        staged_clustering_strategy="noop",
    )
    engine = ChainSearchEngine(registry, mapper, config=config)
    try:
        base = SearchResult(
            path=str(file_path.resolve()),
            score=1.0,
            excerpt="",
            content=content,
            start_line=1,
            end_line=2,
            symbol_name="a",
            symbol_kind="function",
        )
        numpy_patch = patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True)
        start_patch = patch.object(engine, "_find_start_index", return_value=index_db_path)
        collect_patch = patch.object(
            engine, "_collect_index_paths", return_value=[index_db_path]
        )
        # Bypass binary vector infrastructure; Stage 1 output is sufficient for Stage 2 behavior.
        stage1_patch = patch.object(
            engine,
            "_stage1_binary_search",
            return_value=([base], index_db_path.parent),
        )
        with numpy_patch, start_patch, collect_patch, stage1_patch:
            result = engine.staged_cascade_search(
                query="test",
                source_path=project_root,
                k=3,
                coarse_k=10,
            )
        found = {r.symbol_name for r in result.results if r.symbol_name}
        # Direct callee "b" is one hop away and must be present.
        assert "b" in found
        # With staged_lsp_depth=1, Stage 2 should NOT include 2-hop neighbor "c".
        assert "c" not in found
    finally:
        engine.close()

View File

@@ -0,0 +1,98 @@
"""Unit tests for staged cascade Stage 2 realtime LSP graph expansion.
These tests mock out the live LSP components (LspBridge + LspGraphBuilder)
so they can run without external language servers installed.
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
from codexlens.config import Config
from codexlens.entities import SearchResult
from codexlens.hybrid_search.data_structures import CodeAssociationGraph, CodeSymbolNode, Range
from codexlens.search.chain_search import ChainSearchEngine
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
class _DummyBridge:
    """Minimal async stand-in for LspBridge.

    Usable as an async context manager; always reports an empty symbol list.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Accept and discard whatever arguments the real bridge would take.
        del args, kwargs

    async def get_document_symbols(self, file_path: str):
        del file_path  # the stub ignores the target file
        return []

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        return None
def test_stage2_realtime_mode_expands_and_combines(tmp_path: Path) -> None:
    """Realtime Stage 2 merges coarse hits with live LSP graph neighbors."""
    registry = RegistryStore(db_path=tmp_path / "registry.db")
    registry.initialize()
    mapper = PathMapper(index_root=tmp_path / "indexes")
    config = Config(
        data_dir=tmp_path / "data",
        staged_stage2_mode="realtime",
        staged_lsp_depth=1,
        staged_realtime_lsp_timeout_s=1.0,
        staged_realtime_lsp_max_nodes=10,
        staged_realtime_lsp_warmup_s=0.0,
    )
    engine = ChainSearchEngine(registry, mapper, config=config)
    try:
        seed_path = str(tmp_path / "a.py")
        coarse = [
            SearchResult(
                path=seed_path,
                score=1.0,
                excerpt="def a(): pass",
                content="def a():\n pass\n",
                symbol_name="a",
                symbol_kind="function",
                start_line=1,
                end_line=2,
            )
        ]

        # Pre-built association graph: the seed symbol plus one related node.
        graph = CodeAssociationGraph()
        seed_id = f"{seed_path}:a:1"
        graph.nodes[seed_id] = CodeSymbolNode(
            id=seed_id,
            name="a",
            kind="function",
            file_path=seed_path,
            range=Range(start_line=1, start_character=1, end_line=2, end_character=1),
        )
        other_path = str(tmp_path / "b.py")
        related_id = f"{other_path}:b:1"
        graph.nodes[related_id] = CodeSymbolNode(
            id=related_id,
            name="b",
            kind="function",
            file_path=other_path,
            range=Range(start_line=1, start_character=1, end_line=1, end_character=1),
            raw_code="def b():\n return 1\n",
        )

        stub_builder = MagicMock()
        stub_builder.build_from_seeds = AsyncMock(return_value=graph)
        bridge_patch = patch("codexlens.lsp.LspBridge", _DummyBridge)
        builder_patch = patch("codexlens.lsp.LspGraphBuilder", return_value=stub_builder)
        with bridge_patch, builder_patch as mock_builder:
            # Avoid needing a real index_to_source mapping
            engine.mapper.index_to_source = MagicMock(return_value=tmp_path)
            expanded = engine._stage2_lsp_expand(coarse, index_root=tmp_path / "fake_index_root")

        assert mock_builder.call_args is not None
        assert mock_builder.call_args.kwargs.get("resolve_symbols") is False
        names = {r.symbol_name for r in expanded if r.symbol_name}
        assert "a" in names
        assert "b" in names
    finally:
        engine.close()

View File

@@ -760,6 +760,24 @@ class TestLocationParsing:
assert loc.line == 1
assert loc.character == 1
def test_location_from_file_uri_windows_percent_encoded_drive(self):
    """Parse Location from percent-encoded Windows drive URIs (pyright-style)."""
    from codexlens.lsp.lsp_bridge import Location

    payload = {
        "uri": "file:///d%3A/Claude_dms3/codex-lens/src/codexlens/api/semantic.py",
        "range": {
            "start": {"line": 18, "character": 3},
            "end": {"line": 18, "character": 10},
        },
    }
    loc = Location.from_lsp_response(payload)
    # The %3A in the drive letter must be decoded back to ":".
    assert loc.file_path == "d:/Claude_dms3/codex-lens/src/codexlens/api/semantic.py"
    # LSP positions are 0-based; Location exposes 1-based line/character.
    assert loc.line == 19
    assert loc.character == 4
def test_location_from_direct_fields(self):
"""Parse Location from direct field format."""
from codexlens.lsp.lsp_bridge import Location