Add tests and implement functionality for staged cascade search and LSP expansion

- Introduced a new JSON file for verbose output of the Codex Lens search results.
- Added unit tests for binary search functionality in `test_stage1_binary_search_uses_chunk_lines.py`.
- Implemented regression tests for staged cascade Stage 2 expansion depth in `test_staged_cascade_lsp_depth.py`.
- Created unit tests for staged cascade Stage 2 realtime LSP graph expansion in `test_staged_cascade_realtime_lsp.py`.
- Enhanced the ChainSearchEngine to respect configuration settings for staged LSP depth and improve search accuracy.
This commit is contained in:
catlog22
2026-02-08 21:54:42 +08:00
parent 166211dcd4
commit b9b2932f50
20 changed files with 1882 additions and 283 deletions

View File

@@ -0,0 +1,65 @@
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock, patch
from codexlens.config import VECTORS_META_DB_NAME, Config
from codexlens.search.chain_search import ChainSearchEngine, SearchStats
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
def test_stage1_binary_search_prefers_chunk_start_line(tmp_path: Path) -> None:
    """Stage 1 results must carry the matched chunk's own start/end lines."""
    registry = RegistryStore(db_path=tmp_path / "registry.db")
    registry.initialize()
    mapper = PathMapper(index_root=tmp_path / "indexes")
    engine = ChainSearchEngine(registry, mapper, config=Config(data_dir=tmp_path / "data"))
    try:
        # Minimal on-disk layout: an index db plus an (empty) vectors meta db.
        index_root = tmp_path / "fake_index_root"
        index_root.mkdir(parents=True, exist_ok=True)
        index_db = index_root / "_index.db"
        index_db.write_text("", encoding="utf-8")
        (index_root / VECTORS_META_DB_NAME).write_text("", encoding="utf-8")

        class _StubBinarySearcher:
            """Always reports chunk 123 at a fixed distance."""

            def search(self, query_dense, top_k: int):
                del query_dense, top_k  # unused by the stub
                return [(123, 10)]

        class _StubEmbedder:
            """Produces a trivial one-dimensional embedding."""

            def embed_to_numpy(self, texts):
                del texts  # unused by the stub
                return [[0.0]]

        meta_store = MagicMock()
        meta_store.get_chunks_by_ids.return_value = [
            {
                "chunk_id": 123,
                "file_path": str(tmp_path / "a.py"),
                "content": "def a():\n return 1\n",
                "start_line": 12,
                "end_line": 14,
                "metadata": {},
                "category": "code",
            }
        ]

        searcher_patch = patch.object(
            engine, "_get_centralized_binary_searcher", return_value=_StubBinarySearcher()
        )
        store_patch = patch(
            "codexlens.search.chain_search.VectorMetadataStore", return_value=meta_store
        )
        embedder_patch = patch(
            "codexlens.semantic.embedder.Embedder", return_value=_StubEmbedder()
        )
        with searcher_patch, store_patch, embedder_patch:
            coarse_results, returned_root = engine._stage1_binary_search(
                "a", [index_db], coarse_k=1, stats=SearchStats()
            )

        assert returned_root == index_root
        assert len(coarse_results) == 1
        # Lines come from the chunk metadata, not from any re-derived position.
        assert coarse_results[0].start_line == 12
        assert coarse_results[0].end_line == 14
    finally:
        engine.close()

View File

@@ -0,0 +1,168 @@
"""Regression tests for staged cascade Stage 2 expansion depth.
Staged cascade is documented as:
coarse (binary) → LSP/graph expansion → clustering → optional rerank
This test ensures Stage 2 respects Config.staged_lsp_depth (not unrelated
graph_expansion_depth settings).
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
from codexlens.config import Config
from codexlens.entities import CodeRelationship, RelationshipType, SearchResult, Symbol
from codexlens.search.chain_search import ChainSearchEngine
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.index_tree import _compute_graph_neighbors
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
@pytest.fixture()
def temp_paths() -> Path:
    """Yield a temporary root directory, tolerating teardown failures.

    ``ignore_cleanup_errors=True`` plus the explicit ``except OSError`` keeps
    teardown from failing when files are still held open (common on Windows
    while SQLite handles linger).
    """
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    root = Path(tmpdir.name)
    yield root
    try:
        tmpdir.cleanup()
    except OSError:
        # PermissionError is a subclass of OSError, so one clause covers both.
        pass
def _create_index_with_neighbors(root: Path) -> tuple[PathMapper, Path, Path, str]:
    """Build a tiny indexed project whose call graph is a -> b -> c.

    Returns the mapper, the project root, the path of ``a.py``, and its content.
    """
    project_root = root / "project"
    project_root.mkdir(parents=True, exist_ok=True)
    mapper = PathMapper(index_root=root / "indexes")
    index_db_path = mapper.source_to_index_db(project_root)
    index_db_path.parent.mkdir(parents=True, exist_ok=True)

    # Use 3 files so staged_cascade_search's final "deduplicate by path" step
    # doesn't collapse all expanded symbols into a single file result.
    content_a = "\n".join(["def a():", " b()", ""])
    content_b = "\n".join(["def b():", " c()", ""])
    content_c = "\n".join(["def c():", " return 1", ""])
    file_a = project_root / "a.py"
    file_b = project_root / "b.py"
    file_c = project_root / "c.py"
    for path, text in ((file_a, content_a), (file_b, content_b), (file_c, content_c)):
        path.write_text(text, encoding="utf-8")

    def _call(src: str, dst: str, src_file: Path, dst_file: Path) -> CodeRelationship:
        """One CALL edge from *src* (line 2 of its file) to *dst*."""
        return CodeRelationship(
            source_symbol=src,
            target_symbol=dst,
            relationship_type=RelationshipType.CALL,
            source_file=str(src_file),
            target_file=str(dst_file),
            source_line=2,
        )

    store = DirIndexStore(index_db_path, config=Config(data_dir=root / "data"))
    store.initialize()
    # Insertion order matters for determinism: a, then b, then c.
    entries = [
        (file_a, content_a, "a", [_call("a", "b", file_a, file_b)]),
        (file_b, content_b, "b", [_call("b", "c", file_b, file_c)]),
        (file_c, content_c, "c", []),
    ]
    for path, text, sym, rels in entries:
        store.add_file(
            name=path.name,
            full_path=path,
            content=text,
            language="python",
            symbols=[Symbol(name=sym, kind="function", range=(1, 2), file=str(path))],
            relationships=rels,
        )
    _compute_graph_neighbors(store)
    store.close()
    return mapper, project_root, file_a, content_a
def test_staged_cascade_stage2_uses_staged_lsp_depth(temp_paths: Path) -> None:
    """With staged_lsp_depth=1, Stage 2 expansion stops at 1-hop neighbors."""
    mapper, project_root, file_path, content = _create_index_with_neighbors(temp_paths)
    index_db_path = mapper.source_to_index_db(project_root)
    registry = RegistryStore(db_path=temp_paths / "registry.db")
    registry.initialize()
    # Intentionally conflicting depths: staged_lsp_depth should win for staged cascade.
    config = Config(
        data_dir=temp_paths / "data",
        staged_lsp_depth=1,
        graph_expansion_depth=2,
        enable_staged_rerank=False,
        staged_clustering_strategy="noop",
    )
    engine = ChainSearchEngine(registry, mapper, config=config)
    try:
        base = SearchResult(
            path=str(file_path.resolve()),
            score=1.0,
            excerpt="",
            content=content,
            start_line=1,
            end_line=2,
            symbol_name="a",
            symbol_kind="function",
        )
        numpy_patch = patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True)
        start_patch = patch.object(engine, "_find_start_index", return_value=index_db_path)
        collect_patch = patch.object(
            engine, "_collect_index_paths", return_value=[index_db_path]
        )
        # Bypass binary vector infrastructure; Stage 1 output is sufficient for Stage 2 behavior.
        stage1_patch = patch.object(
            engine,
            "_stage1_binary_search",
            return_value=([base], index_db_path.parent),
        )
        with numpy_patch, start_patch, collect_patch, stage1_patch:
            result = engine.staged_cascade_search(
                query="test",
                source_path=project_root,
                k=3,
                coarse_k=10,
            )
        found = {r.symbol_name for r in result.results if r.symbol_name}
        # Direct callee "b" is one hop away and must be present.
        assert "b" in found
        # With staged_lsp_depth=1, Stage 2 should NOT include 2-hop neighbor "c".
        assert "c" not in found
    finally:
        engine.close()

View File

@@ -0,0 +1,98 @@
"""Unit tests for staged cascade Stage 2 realtime LSP graph expansion.
These tests mock out the live LSP components (LspBridge + LspGraphBuilder)
so they can run without external language servers installed.
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
from codexlens.config import Config
from codexlens.entities import SearchResult
from codexlens.hybrid_search.data_structures import CodeAssociationGraph, CodeSymbolNode, Range
from codexlens.search.chain_search import ChainSearchEngine
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
class _DummyBridge:
    """Minimal async stand-in for LspBridge.

    Usable as an async context manager; always reports an empty symbol list.
    """

    def __init__(self, *args, **kwargs) -> None:
        # Accept and discard whatever arguments the real bridge would take.
        del args, kwargs

    async def get_document_symbols(self, file_path: str):
        del file_path  # the stub ignores the target file
        return []

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        return None
def test_stage2_realtime_mode_expands_and_combines(tmp_path: Path) -> None:
    """Realtime Stage 2 merges coarse hits with live LSP graph neighbors."""
    registry = RegistryStore(db_path=tmp_path / "registry.db")
    registry.initialize()
    mapper = PathMapper(index_root=tmp_path / "indexes")
    config = Config(
        data_dir=tmp_path / "data",
        staged_stage2_mode="realtime",
        staged_lsp_depth=1,
        staged_realtime_lsp_timeout_s=1.0,
        staged_realtime_lsp_max_nodes=10,
        staged_realtime_lsp_warmup_s=0.0,
    )
    engine = ChainSearchEngine(registry, mapper, config=config)
    try:
        seed_path = str(tmp_path / "a.py")
        coarse = [
            SearchResult(
                path=seed_path,
                score=1.0,
                excerpt="def a(): pass",
                content="def a():\n pass\n",
                symbol_name="a",
                symbol_kind="function",
                start_line=1,
                end_line=2,
            )
        ]

        # Pre-built association graph: the seed symbol plus one related node.
        graph = CodeAssociationGraph()
        seed_id = f"{seed_path}:a:1"
        graph.nodes[seed_id] = CodeSymbolNode(
            id=seed_id,
            name="a",
            kind="function",
            file_path=seed_path,
            range=Range(start_line=1, start_character=1, end_line=2, end_character=1),
        )
        other_path = str(tmp_path / "b.py")
        related_id = f"{other_path}:b:1"
        graph.nodes[related_id] = CodeSymbolNode(
            id=related_id,
            name="b",
            kind="function",
            file_path=other_path,
            range=Range(start_line=1, start_character=1, end_line=1, end_character=1),
            raw_code="def b():\n return 1\n",
        )

        stub_builder = MagicMock()
        stub_builder.build_from_seeds = AsyncMock(return_value=graph)
        bridge_patch = patch("codexlens.lsp.LspBridge", _DummyBridge)
        builder_patch = patch("codexlens.lsp.LspGraphBuilder", return_value=stub_builder)
        with bridge_patch, builder_patch as mock_builder:
            # Avoid needing a real index_to_source mapping
            engine.mapper.index_to_source = MagicMock(return_value=tmp_path)
            expanded = engine._stage2_lsp_expand(coarse, index_root=tmp_path / "fake_index_root")

        assert mock_builder.call_args is not None
        assert mock_builder.call_args.kwargs.get("resolve_symbols") is False
        names = {r.symbol_name for r in expanded if r.symbol_name}
        assert "a" in names
        assert "b" in names
    finally:
        engine.close()

View File

@@ -760,6 +760,24 @@ class TestLocationParsing:
assert loc.line == 1
assert loc.character == 1
def test_location_from_file_uri_windows_percent_encoded_drive(self):
    """Parse Location from percent-encoded Windows drive URIs (pyright-style)."""
    from codexlens.lsp.lsp_bridge import Location

    payload = {
        "uri": "file:///d%3A/Claude_dms3/codex-lens/src/codexlens/api/semantic.py",
        "range": {
            "start": {"line": 18, "character": 3},
            "end": {"line": 18, "character": 10},
        },
    }
    loc = Location.from_lsp_response(payload)
    # The %3A in the drive letter must be decoded back to ":".
    assert loc.file_path == "d:/Claude_dms3/codex-lens/src/codexlens/api/semantic.py"
    # LSP positions are 0-based; Location exposes 1-based line/character.
    assert loc.line == 19
    assert loc.character == 4
def test_location_from_direct_fields(self):
"""Parse Location from direct field format."""
from codexlens.lsp.lsp_bridge import Location