Refactor code structure for improved readability and maintainability

This commit is contained in:
catlog22
2026-02-08 13:47:59 +08:00
parent 54c3234d84
commit 0a04660c80
99 changed files with 4820 additions and 413 deletions

View File

@@ -0,0 +1,264 @@
"""Integration tests for semantic.py API - fusion strategy routing and result transform.
Tests cover:
- _execute_search: Strategy routing for rrf, binary, staged, hybrid (compat), dense_rerank
- _transform_results: Score extraction and kind filtering
"""
from __future__ import annotations
from pathlib import Path
from typing import List, Optional
from unittest.mock import MagicMock, Mock, patch
import pytest
from codexlens.api.models import SemanticResult
from codexlens.api.semantic import _execute_search, _transform_results
from codexlens.entities import SearchResult
from codexlens.search.chain_search import (
ChainSearchEngine,
ChainSearchResult,
SearchOptions,
SearchStats,
)
# =============================================================================
# Test Fixtures
# =============================================================================
@pytest.fixture
def mock_engine():
    """Provide a ChainSearchEngine stand-in whose methods are all mocks."""
    return MagicMock(spec=ChainSearchEngine)
@pytest.fixture
def mock_chain_result():
    """Build a ChainSearchResult holding two representative hits."""
    hits = [
        SearchResult(
            path="auth.py",
            score=0.9,
            excerpt="def authenticate(user):",
            symbol_name="authenticate",
            symbol_kind="function",
            start_line=10,
            end_line=20,
        ),
        SearchResult(
            path="login.py",
            score=0.7,
            excerpt="class LoginHandler:",
            symbol_name="LoginHandler",
            symbol_kind="class",
            start_line=5,
            end_line=50,
        ),
    ]
    return ChainSearchResult(
        query="test query",
        results=hits,
        symbols=[],
        stats=SearchStats(),
    )
@pytest.fixture
def mock_options():
    """Provide SearchOptions with hybrid, vector, and fuzzy search all enabled."""
    opts = SearchOptions(
        enable_fuzzy=True,
        enable_vector=True,
        hybrid_mode=True,
    )
    return opts
# =============================================================================
# Tests: _execute_search strategy routing
# =============================================================================
class TestExecuteSearchStrategyRouting:
    """Tests for _execute_search() fusion strategy routing."""

    @staticmethod
    def _run(engine, strategy, options):
        """Invoke _execute_search with the arguments shared by every test here."""
        return _execute_search(
            engine=engine,
            query="test",
            source_path=Path("/project"),
            fusion_strategy=strategy,
            options=options,
            limit=20,
        )

    def test_fusion_strategy_rrf(self, mock_engine, mock_chain_result, mock_options):
        """Default 'rrf' strategy should call engine.search()."""
        mock_engine.search.return_value = mock_chain_result
        outcome = self._run(mock_engine, "rrf", mock_options)
        mock_engine.search.assert_called_once()
        assert isinstance(outcome, ChainSearchResult)

    def test_fusion_strategy_binary(self, mock_engine, mock_chain_result, mock_options):
        """'binary' strategy should call engine.binary_cascade_search()."""
        mock_engine.binary_cascade_search.return_value = mock_chain_result
        self._run(mock_engine, "binary", mock_options)
        mock_engine.binary_cascade_search.assert_called_once()
        # k comes straight from limit; coarse_k is limit * 5.
        kwargs = mock_engine.binary_cascade_search.call_args[1]
        assert kwargs["k"] == 20
        assert kwargs["coarse_k"] == 100

    def test_fusion_strategy_staged(self, mock_engine, mock_chain_result, mock_options):
        """'staged' strategy should call engine.staged_cascade_search()."""
        mock_engine.staged_cascade_search.return_value = mock_chain_result
        self._run(mock_engine, "staged", mock_options)
        mock_engine.staged_cascade_search.assert_called_once()

    def test_fusion_strategy_hybrid_compat(
        self, mock_engine, mock_chain_result, mock_options
    ):
        """'hybrid' strategy should map to binary_rerank_cascade_search (backward compat)."""
        mock_engine.binary_rerank_cascade_search.return_value = mock_chain_result
        self._run(mock_engine, "hybrid", mock_options)
        mock_engine.binary_rerank_cascade_search.assert_called_once()

    def test_fusion_strategy_dense_rerank(
        self, mock_engine, mock_chain_result, mock_options
    ):
        """'dense_rerank' strategy should call engine.search() (default fallback)."""
        # dense_rerank has no dedicated branch in _execute_search, so it
        # lands on the default path, which delegates to engine.search().
        mock_engine.search.return_value = mock_chain_result
        self._run(mock_engine, "dense_rerank", mock_options)
        mock_engine.search.assert_called_once()
# =============================================================================
# Tests: _transform_results
# =============================================================================
class TestTransformResults:
    """Tests for _transform_results()."""

    @staticmethod
    def _transform(results, query, kind_filter=None):
        """Call _transform_results with the weight set shared by these tests."""
        return _transform_results(
            results=results,
            mode="fusion",
            vector_weight=0.5,
            structural_weight=0.3,
            keyword_weight=0.2,
            kind_filter=kind_filter,
            include_match_reason=False,
            query=query,
        )

    def test_transform_results_basic(self):
        """_transform_results should convert SearchResult to SemanticResult."""
        raw = [
            SearchResult(
                path="auth.py",
                score=0.9,
                excerpt="def authenticate(user):",
                symbol_name="authenticate",
                symbol_kind="function",
                start_line=10,
                end_line=20,
            ),
            SearchResult(
                path="models.py",
                score=0.7,
                excerpt="class UserModel:",
                symbol_name="UserModel",
                symbol_kind="class",
                start_line=1,
                end_line=30,
            ),
        ]
        transformed = self._transform(raw, query="authentication")
        assert len(transformed) == 2
        assert all(isinstance(item, SemanticResult) for item in transformed)
        # Spot-check the top-ranked entry.
        top = transformed[0]
        assert top.fusion_score == 0.9
        assert top.symbol_name == "authenticate"
        assert top.kind == "function"
        assert top.file_path == "auth.py"
        assert top.line == 10
        # Ordering must be by fusion_score, highest first.
        scores = [item.fusion_score for item in transformed]
        assert scores == sorted(scores, reverse=True)

    def test_transform_results_kind_filter(self):
        """_transform_results should filter by kind when kind_filter is set."""
        raw = [
            SearchResult(
                path="auth.py",
                score=0.9,
                excerpt="def auth():",
                symbol_name="auth",
                symbol_kind="function",
            ),
            SearchResult(
                path="models.py",
                score=0.8,
                excerpt="class User:",
                symbol_name="User",
                symbol_kind="class",
            ),
        ]
        # Restrict output to function symbols only.
        transformed = self._transform(raw, query="test", kind_filter=["function"])
        assert len(transformed) == 1
        assert transformed[0].kind == "function"

View File

@@ -0,0 +1,161 @@
"""Unit tests for BinarySearcher - binary vector search using Hamming distance.
Tests cover:
- load: mmap file loading, DB fallback, no data scenario
- search: basic search, top_k limit, empty index
"""
from __future__ import annotations
import json
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch, mock_open
import numpy as np
import pytest
from codexlens.search.binary_searcher import BinarySearcher
# =============================================================================
# Test Fixtures
# =============================================================================
@pytest.fixture
def temp_paths():
    """Yield a temporary directory Path; tolerate cleanup failures."""
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    yield Path(tmpdir.name)
    try:
        tmpdir.cleanup()
    except OSError:
        # PermissionError is an OSError subclass; open mmap handles on
        # Windows can keep files locked, so a failed cleanup is non-fatal.
        pass
@pytest.fixture
def binary_mmap_setup(temp_paths):
    """Write a binary-vector mmap file plus its JSON metadata sidecar."""
    count, width = 10, 32  # 10 vectors of 256 bits (32 bytes) each
    generator = np.random.default_rng(42)
    matrix = generator.integers(0, 256, size=(count, width), dtype=np.uint8)
    ids = list(range(100, 100 + count))
    # Raw vector bytes go into the mmap file...
    mmap_path = temp_paths / "_binary_vectors.mmap"
    matrix.tofile(str(mmap_path))
    # ...and shape/id bookkeeping goes into the .meta.json sidecar.
    meta_path = mmap_path.with_suffix(".meta.json")
    meta_path.write_text(json.dumps({"shape": [count, width], "chunk_ids": ids}))
    return temp_paths, matrix, ids
# =============================================================================
# Tests: load
# =============================================================================
class TestBinarySearcherLoad:
    """Tests for BinarySearcher.load()."""

    def test_load_mmap(self, binary_mmap_setup):
        """Memory-mapped file loading should succeed and mark is_memmap."""
        root, _matrix, ids = binary_mmap_setup
        searcher = BinarySearcher(root)
        assert searcher.load() is True
        assert searcher._loaded is True
        assert searcher.is_memmap is True
        assert searcher.vector_count == len(ids)

    def test_load_db_fallback(self, temp_paths):
        """Should fall back to DB loading when no mmap file exists."""
        searcher = BinarySearcher(temp_paths)
        # No mmap file exists under temp_paths, so load() must consult the DB.
        with patch.object(searcher, "_load_from_db", return_value=True) as db_stub:
            assert searcher.load() is True
        db_stub.assert_called_once()

    def test_load_no_data(self, temp_paths):
        """Should return False when neither mmap nor DB data available."""
        searcher = BinarySearcher(temp_paths)
        with patch.object(searcher, "_load_from_db", return_value=False):
            assert searcher.load() is False
        assert searcher._loaded is False
# =============================================================================
# Tests: search
# =============================================================================
class TestBinarySearcherSearch:
    """Tests for BinarySearcher.search()."""

    def test_search_basic(self, binary_mmap_setup):
        """Basic search should return (chunk_id, distance) tuples."""
        root, _matrix, ids = binary_mmap_setup
        searcher = BinarySearcher(root)
        searcher.load()
        # A 256-d float query vector; the searcher binarizes it internally.
        query = np.random.default_rng(99).standard_normal(256).astype(np.float32)
        hits = searcher.search(query, top_k=5)
        assert len(hits) == 5
        for chunk_id, distance in hits:
            assert isinstance(chunk_id, int)
            assert isinstance(distance, int)
            assert chunk_id in ids

    def test_search_top_k(self, binary_mmap_setup):
        """Search should respect top_k limit."""
        root, _matrix, _ids = binary_mmap_setup
        searcher = BinarySearcher(root)
        searcher.load()
        query = np.random.default_rng(42).standard_normal(256).astype(np.float32)
        few = searcher.search(query, top_k=3)
        many = searcher.search(query, top_k=7)
        assert len(few) == 3
        assert len(many) == 7
        # Hamming distances must come back in ascending order.
        distances = [d for _, d in few]
        assert distances == sorted(distances)

    def test_search_empty_index(self, temp_paths):
        """Search on empty/unloaded index should return empty list."""
        searcher = BinarySearcher(temp_paths)
        query = np.zeros(256, dtype=np.float32)
        # Force load() to report no data; search must degrade to [].
        with patch.object(searcher, "load", return_value=False):
            assert searcher.search(query, top_k=5) == []

View File

@@ -0,0 +1,392 @@
"""Integration tests for chain_search.py cascade strategies.
Tests cover:
- binary_cascade_search: Full pipeline and numpy-unavailable fallback
- binary_rerank_cascade_search: Pipeline and fallback
- dense_rerank_cascade_search: Pipeline and fallback
- cascade_search: Router dispatching to correct strategy methods
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from typing import List
from unittest.mock import MagicMock, Mock, patch
import pytest
from codexlens.config import Config
from codexlens.entities import SearchResult
from codexlens.search.chain_search import (
ChainSearchEngine,
ChainSearchResult,
SearchOptions,
SearchStats,
)
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
# =============================================================================
# Test Fixtures
# =============================================================================
@pytest.fixture
def temp_paths():
    """Yield a temporary directory Path; tolerate cleanup failures."""
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    yield Path(tmpdir.name)
    try:
        tmpdir.cleanup()
    except OSError:
        # PermissionError is an OSError subclass; lingering DB handles on
        # Windows can block deletion, so a failed cleanup is non-fatal.
        pass
@pytest.fixture
def mock_registry(temp_paths: Path):
    """Create and initialize a registry store backed by a temp database."""
    store = RegistryStore(db_path=temp_paths / "registry.db")
    store.initialize()
    return store
@pytest.fixture
def mock_mapper(temp_paths: Path):
    """Create a path mapper rooted under the temp directory."""
    mapper = PathMapper(index_root=temp_paths / "indexes")
    return mapper
@pytest.fixture
def mock_config():
    """Create a Config mock carrying the cascade-related settings."""
    config = MagicMock(spec=Config)
    settings = {
        "cascade_coarse_k": 100,
        "cascade_fine_k": 10,
        "cascade_strategy": "binary",
        "enable_staged_rerank": False,
        "staged_clustering_strategy": "auto",
        "staged_clustering_min_size": 3,
        "graph_expansion_depth": 2,
    }
    for attr, value in settings.items():
        setattr(config, attr, value)
    return config
@pytest.fixture
def sample_search_results() -> List[SearchResult]:
    """Create sample search results for testing."""
    rows = [
        ("a.py", 0.9, "def auth():"),
        ("b.py", 0.8, "class User:"),
        ("c.py", 0.7, "def login():"),
    ]
    return [SearchResult(path=p, score=s, excerpt=e) for p, s, e in rows]
# =============================================================================
# Tests: binary_cascade_search
# =============================================================================
class TestBinaryCascadeSearch:
    """Tests for binary_cascade_search()."""

    def test_binary_cascade_full_pipeline(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """binary_cascade_search should execute full binary+dense pipeline."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        # Point index discovery at a fake on-disk index path.
        with patch.object(engine, "_find_start_index") as mock_find:
            mock_find.return_value = temp_paths / "index" / "_index.db"
            with patch.object(engine, "_collect_index_paths") as mock_collect:
                mock_collect.return_value = [temp_paths / "index" / "_index.db"]
                # Mock the embedding backend imports
                with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True):
                    with patch.dict("sys.modules", {
                        "codexlens.indexing.embedding": MagicMock(),
                        "codexlens.semantic.ann_index": MagicMock(),
                    }):
                        # Mock _get_or_create_binary_index
                        with patch.object(
                            engine, "_get_or_create_binary_index"
                        ) as mock_bin:
                            mock_index = MagicMock()
                            mock_index.count.return_value = 10
                            # (ids, distances) pair for the coarse binary stage.
                            mock_index.search.return_value = ([1, 2], [10, 20])
                            mock_bin.return_value = mock_index
                            # The search should fall back to standard on import issues
                            with patch.object(engine, "search") as mock_search:
                                mock_search.return_value = ChainSearchResult(
                                    query="test",
                                    results=[SearchResult(path="a.py", score=0.9, excerpt="a")],
                                    symbols=[],
                                    stats=SearchStats(),
                                )
                                result = engine.binary_cascade_search(
                                    "test query", temp_paths / "src",
                                    k=10, coarse_k=100,
                                )
                                # Whichever internal path runs, a
                                # ChainSearchResult must come back.
                                assert isinstance(result, ChainSearchResult)

    def test_binary_cascade_numpy_unavailable(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """binary_cascade_search should fall back to standard search when numpy unavailable."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        # With numpy flagged unavailable the binary cascade cannot run at all.
        with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", False):
            with patch.object(engine, "search") as mock_search:
                mock_search.return_value = ChainSearchResult(
                    query="test",
                    results=[],
                    symbols=[],
                    stats=SearchStats(),
                )
                result = engine.binary_cascade_search(
                    "query", temp_paths / "src",
                )
                # The fallback must delegate to the standard search path.
                mock_search.assert_called_once()
                assert isinstance(result, ChainSearchResult)
# =============================================================================
# Tests: binary_rerank_cascade_search
# =============================================================================
class TestBinaryRerankCascadeSearch:
    """Tests for binary_rerank_cascade_search()."""

    def test_binary_rerank_cascade_pipeline(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """binary_rerank_cascade_search should execute binary+cross-encoder pipeline."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True):
            # Point index discovery at a fake on-disk index path.
            with patch.object(engine, "_find_start_index") as mock_find:
                mock_find.return_value = temp_paths / "index" / "_index.db"
                with patch.object(engine, "_collect_index_paths") as mock_collect:
                    mock_collect.return_value = [temp_paths / "index" / "_index.db"]
                    # Mock BinaryEmbeddingBackend import
                    with patch.dict("sys.modules", {
                        "codexlens.indexing.embedding": MagicMock(),
                    }):
                        with patch.object(engine, "search") as mock_search:
                            mock_search.return_value = ChainSearchResult(
                                query="test",
                                results=[SearchResult(path="a.py", score=0.9, excerpt="a")],
                                symbols=[],
                                stats=SearchStats(),
                            )
                            result = engine.binary_rerank_cascade_search(
                                "test query", temp_paths / "src",
                                k=10, coarse_k=100,
                            )
                            # Whichever internal path runs, a
                            # ChainSearchResult must come back.
                            assert isinstance(result, ChainSearchResult)

    def test_binary_rerank_fallback(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """binary_rerank_cascade_search should fall back when numpy unavailable."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        # With numpy flagged unavailable the rerank cascade cannot run.
        with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", False):
            with patch.object(engine, "search") as mock_search:
                mock_search.return_value = ChainSearchResult(
                    query="test",
                    results=[],
                    symbols=[],
                    stats=SearchStats(),
                )
                result = engine.binary_rerank_cascade_search(
                    "query", temp_paths / "src",
                )
                # The fallback must delegate to the standard search path.
                mock_search.assert_called_once()
# =============================================================================
# Tests: dense_rerank_cascade_search
# =============================================================================
class TestDenseRerankCascadeSearch:
    """Tests for dense_rerank_cascade_search()."""

    def test_dense_rerank_cascade_pipeline(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """dense_rerank_cascade_search should execute dense+cross-encoder pipeline."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        index_db = temp_paths / "index" / "_index.db"
        stub_result = ChainSearchResult(
            query="test",
            results=[SearchResult(path="a.py", score=0.9, excerpt="a")],
            symbols=[],
            stats=SearchStats(),
        )
        # Patch index discovery and the standard search path in one go.
        with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", True), \
             patch.object(engine, "_find_start_index", return_value=index_db), \
             patch.object(engine, "_collect_index_paths", return_value=[index_db]), \
             patch.object(engine, "search", return_value=stub_result):
            outcome = engine.dense_rerank_cascade_search(
                "test query", temp_paths / "src",
                k=10, coarse_k=100,
            )
        assert isinstance(outcome, ChainSearchResult)

    def test_dense_rerank_fallback(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """dense_rerank_cascade_search should fall back when numpy unavailable."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        empty = ChainSearchResult(
            query="test", results=[], symbols=[], stats=SearchStats()
        )
        # With numpy flagged unavailable the cascade must delegate to search().
        with patch("codexlens.search.chain_search.NUMPY_AVAILABLE", False), \
             patch.object(engine, "search", return_value=empty) as search_stub:
            engine.dense_rerank_cascade_search("query", temp_paths / "src")
        search_stub.assert_called_once()
# =============================================================================
# Tests: cascade_search (unified router)
# =============================================================================
class TestCascadeRouter:
    """Tests for cascade_search() strategy routing."""

    @staticmethod
    def _empty_result():
        """Build an empty ChainSearchResult for stubbing strategy methods."""
        return ChainSearchResult(
            query="test", results=[], symbols=[], stats=SearchStats()
        )

    def _assert_routes(self, engine, temp_paths, strategy, target):
        """Assert cascade_search(strategy=...) dispatches to the named method."""
        with patch.object(engine, target) as stub:
            stub.return_value = self._empty_result()
            engine.cascade_search("query", temp_paths / "src", strategy=strategy)
        stub.assert_called_once()

    def test_cascade_router_binary(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """cascade_search with strategy='binary' should route to binary_cascade_search."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        self._assert_routes(engine, temp_paths, "binary", "binary_cascade_search")

    def test_cascade_router_binary_rerank(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """cascade_search with strategy='binary_rerank' should route correctly."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        self._assert_routes(
            engine, temp_paths, "binary_rerank", "binary_rerank_cascade_search"
        )

    def test_cascade_router_dense_rerank(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """cascade_search with strategy='dense_rerank' should route correctly."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        self._assert_routes(
            engine, temp_paths, "dense_rerank", "dense_rerank_cascade_search"
        )

    def test_cascade_router_staged(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """cascade_search with strategy='staged' should route to staged_cascade_search."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        self._assert_routes(engine, temp_paths, "staged", "staged_cascade_search")

    def test_cascade_router_config_default(
        self, mock_registry, mock_mapper, temp_paths
    ):
        """cascade_search with no strategy param should use config cascade_strategy."""
        config = MagicMock(spec=Config)
        config.cascade_strategy = "binary_rerank"
        config.cascade_coarse_k = 100
        config.cascade_fine_k = 10
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=config)
        with patch.object(engine, "binary_rerank_cascade_search") as stub:
            stub.return_value = self._empty_result()
            # No strategy argument: the engine reads config.cascade_strategy.
            engine.cascade_search("query", temp_paths / "src")
        stub.assert_called_once()

    def test_cascade_router_invalid_fallback(
        self, mock_registry, mock_mapper, mock_config, temp_paths
    ):
        """cascade_search with invalid strategy should default to 'binary'."""
        engine = ChainSearchEngine(mock_registry, mock_mapper, config=mock_config)
        self._assert_routes(
            engine, temp_paths, "nonexistent", "binary_cascade_search"
        )

View File

@@ -0,0 +1,96 @@
"""Unit tests for Config cascade settings validation.
Tests cover:
- Default cascade_strategy value
- Valid cascade strategies accepted by load_settings
- Invalid cascade strategy fallback behavior
- Staged cascade config defaults
"""
from __future__ import annotations
import json
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
from codexlens.config import Config
# =============================================================================
# Test Fixtures
# =============================================================================
@pytest.fixture
def temp_config_dir():
    """Yield a temporary data_dir Path; tolerate cleanup failures."""
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    yield Path(tmpdir.name)
    try:
        tmpdir.cleanup()
    except OSError:
        # PermissionError is an OSError subclass; a failed cleanup is non-fatal.
        pass
# =============================================================================
# Tests: cascade config defaults
# =============================================================================
class TestConfigCascadeDefaults:
    """Tests for Config cascade-related defaults and load_settings()."""

    @staticmethod
    def _write_settings(config, strategy):
        """Persist a settings file selecting the given cascade strategy."""
        path = config.settings_path
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w", encoding="utf-8") as handle:
            json.dump({"cascade": {"strategy": strategy}}, handle)

    def test_default_cascade_strategy(self, temp_config_dir):
        """Default cascade_strategy should be 'binary'."""
        assert Config(data_dir=temp_config_dir).cascade_strategy == "binary"

    def test_valid_cascade_strategies(self, temp_config_dir):
        """load_settings should accept all valid cascade strategies."""
        for strategy in ("binary", "binary_rerank", "dense_rerank", "staged"):
            config = Config(data_dir=temp_config_dir)
            self._write_settings(config, strategy)
            # Env overrides are suppressed so only the file value applies.
            with patch.object(config, "_apply_env_overrides"):
                config.load_settings()
            assert config.cascade_strategy == strategy, (
                f"Strategy '{strategy}' should be accepted"
            )

    def test_invalid_cascade_strategy_fallback(self, temp_config_dir):
        """Invalid cascade strategy should keep default (not crash)."""
        config = Config(data_dir=temp_config_dir)
        self._write_settings(config, "invalid_strategy")
        with patch.object(config, "_apply_env_overrides"):
            config.load_settings()
        # The unknown value is ignored; the default remains in force.
        assert config.cascade_strategy == "binary"

    def test_staged_config_defaults(self, temp_config_dir):
        """Staged cascade settings should have correct defaults."""
        config = Config(data_dir=temp_config_dir)
        assert config.staged_coarse_k == 200
        assert config.staged_lsp_depth == 2
        assert config.staged_clustering_strategy == "auto"
        assert config.staged_clustering_min_size == 3
        assert config.enable_staged_rerank is True
        assert config.cascade_coarse_k == 100
        assert config.cascade_fine_k == 10

View File

@@ -0,0 +1,367 @@
"""Unit tests for HybridSearchEngine - parallel search and RRF fusion.
Tests cover:
- search: exact only, fuzzy enabled, vector enabled, pure vector mode
- search: RRF fusion, empty query, no results, reranking, category filtering
- _search_parallel: parallel backend execution
- _search_lsp_graph: LSP graph expansion with seeds, vector-to-FTS fallback
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from typing import Dict, List
from unittest.mock import MagicMock, Mock, patch, PropertyMock
import pytest
from codexlens.config import Config
from codexlens.entities import SearchResult
from codexlens.search.hybrid_search import HybridSearchEngine
# =============================================================================
# Test Fixtures
# =============================================================================
@pytest.fixture
def temp_paths():
    """Yield a temp directory containing a non-empty stub index file."""
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    root = Path(tmpdir.name)
    # A zero-length index would trip the engine's empty-file guard.
    (root / "_index.db").write_bytes(b"\x00" * 100)
    yield root
    try:
        tmpdir.cleanup()
    except OSError:
        # PermissionError is an OSError subclass; a failed cleanup is non-fatal.
        pass
@pytest.fixture
def mock_config():
    """Create a Config mock carrying the hybrid-search settings."""
    config = MagicMock(spec=Config)
    settings = {
        "embedding_use_gpu": False,
        "enable_reranking": False,
        "enable_cross_encoder_rerank": False,
        "symbol_boost_factor": 1.5,
        "fusion_method": "rrf",
        "rrf_k": 60,
        "enable_category_filter": True,
    }
    for attr, value in settings.items():
        setattr(config, attr, value)
    return config
@pytest.fixture
def sample_results() -> List[SearchResult]:
    """Create sample search results: one function hit and one class hit."""
    rows = [
        ("auth.py", 0.9, "def authenticate(user):", "authenticate", "function"),
        ("login.py", 0.7, "class LoginHandler:", "LoginHandler", "class"),
    ]
    return [
        SearchResult(
            path=path,
            score=score,
            excerpt=excerpt,
            symbol_name=name,
            symbol_kind=kind,
        )
        for path, score, excerpt, name, kind in rows
    ]
# =============================================================================
# Tests: search with different backends
# =============================================================================
class TestHybridSearchBackends:
    """Tests for HybridSearchEngine.search() backend configurations."""

    @staticmethod
    def _requested_backends(parallel_stub):
        """Return the backend list handed to _search_parallel (3rd positional arg)."""
        return parallel_stub.call_args[0][2]

    def test_search_exact_only(self, temp_paths, mock_config):
        """Search with only exact FTS backend."""
        engine = HybridSearchEngine(config=mock_config)
        index_path = temp_paths / "_index.db"
        with patch.object(engine, "_search_parallel") as parallel_stub:
            parallel_stub.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="result")],
            }
            found = engine.search(
                index_path, "test query",
                enable_fuzzy=False, enable_vector=False,
            )
        assert len(found) == 1
        backends = self._requested_backends(parallel_stub)
        assert "exact" in backends
        assert "fuzzy" not in backends
        assert "vector" not in backends

    def test_search_fuzzy_enabled(self, temp_paths, mock_config):
        """Search with exact + fuzzy backends."""
        engine = HybridSearchEngine(config=mock_config)
        index_path = temp_paths / "_index.db"
        with patch.object(engine, "_search_parallel") as parallel_stub:
            parallel_stub.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="exact")],
                "fuzzy": [SearchResult(path="b.py", score=8.0, excerpt="fuzzy")],
            }
            found = engine.search(
                index_path, "test_query",
                enable_fuzzy=True, enable_vector=False,
            )
        assert len(found) >= 1
        backends = self._requested_backends(parallel_stub)
        assert "exact" in backends
        assert "fuzzy" in backends

    def test_search_vector_enabled(self, temp_paths, mock_config):
        """Search with exact + fuzzy + vector backends."""
        engine = HybridSearchEngine(config=mock_config)
        index_path = temp_paths / "_index.db"
        with patch.object(engine, "_search_parallel") as parallel_stub:
            parallel_stub.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="exact")],
                "vector": [SearchResult(path="c.py", score=0.85, excerpt="vector")],
            }
            engine.search(
                index_path, "test_query",
                enable_fuzzy=False, enable_vector=True,
            )
        backends = self._requested_backends(parallel_stub)
        assert "exact" in backends
        assert "vector" in backends

    def test_search_pure_vector(self, temp_paths, mock_config):
        """Pure vector mode should only use vector backend."""
        engine = HybridSearchEngine(config=mock_config)
        mock_config.enable_category_filter = False
        index_path = temp_paths / "_index.db"
        with patch.object(engine, "_search_parallel") as parallel_stub:
            parallel_stub.return_value = {
                "vector": [SearchResult(path="a.py", score=0.9, excerpt="vec")],
            }
            engine.search(
                index_path, "semantic query",
                enable_vector=True, pure_vector=True,
            )
        backends = self._requested_backends(parallel_stub)
        assert "vector" in backends
        assert "exact" not in backends
# =============================================================================
# Tests: search fusion and post-processing
# =============================================================================
class TestHybridSearchFusion:
    """Tests for RRF fusion, empty query, no results, reranking, filtering."""

    def test_search_rrf_fusion(self, temp_paths, mock_config):
        """RRF fusion should merge hits arriving from several backends."""
        fusion_engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        backend_hits = {
            "exact": [
                SearchResult(path="a.py", score=10.0, excerpt="exact a"),
                SearchResult(path="b.py", score=5.0, excerpt="exact b"),
            ],
            "vector": [
                SearchResult(path="b.py", score=0.9, excerpt="vector b"),
                SearchResult(path="c.py", score=0.8, excerpt="vector c"),
            ],
        }
        with patch.object(fusion_engine, "_search_parallel") as parallel_mock:
            parallel_mock.return_value = backend_hits
            fused = fusion_engine.search(
                db_path,
                "test",
                enable_fuzzy=False,
                enable_vector=True,
            )
            # b.py is ranked by both sources, so it must survive fusion.
            assert any(hit.path == "b.py" for hit in fused)

    def test_search_empty_query(self, temp_paths, mock_config):
        """An empty query string is tolerated and simply yields no results."""
        engine_under_test = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        with patch.object(engine_under_test, "_search_parallel") as parallel_mock:
            parallel_mock.return_value = {"exact": []}
            assert engine_under_test.search(db_path, "", enable_fuzzy=False) == []

    def test_search_no_results(self, temp_paths, mock_config):
        """Every backend coming back empty must produce an empty result list."""
        engine_under_test = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        with patch.object(engine_under_test, "_search_parallel") as parallel_mock:
            parallel_mock.return_value = {"exact": [], "fuzzy": []}
            assert engine_under_test.search(db_path, "nonexistent") == []

    def test_search_reranking(self, temp_paths, mock_config):
        """When the config enables reranking, rerank_results must be invoked."""
        mock_config.enable_reranking = True
        mock_config.enable_cross_encoder_rerank = False
        mock_config.reranking_top_k = 50
        rerank_engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        fake_embedder = MagicMock()
        fake_embedder.embed_single.return_value = [0.1] * 128
        fake_embedder.embed.return_value = [[0.1] * 128]
        rerank_engine.embedder = fake_embedder
        with patch.object(rerank_engine, "_search_parallel") as parallel_mock, \
                patch("codexlens.search.hybrid_search.rerank_results") as rerank_mock:
            parallel_mock.return_value = {
                "exact": [SearchResult(path="a.py", score=10.0, excerpt="code")],
            }
            rerank_mock.return_value = [
                SearchResult(path="a.py", score=0.85, excerpt="code"),
            ]
            rerank_engine.search(db_path, "query", enable_fuzzy=False)
            rerank_mock.assert_called_once()

    def test_search_category_filtering(self, temp_paths, mock_config):
        """Intent-based category filtering keeps code hits for keyword queries."""
        mock_config.enable_category_filter = True
        filtering_engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        with patch.object(filtering_engine, "_search_parallel") as parallel_mock:
            parallel_mock.return_value = {
                "exact": [
                    SearchResult(path="auth.py", score=10.0, excerpt="def auth"),
                    SearchResult(path="README.md", score=8.0, excerpt="docs"),
                ],
            }
            # An identifier-shaped query triggers KEYWORD intent: code files
            # survive while documentation files are filtered out.
            hits = filtering_engine.search(
                db_path,
                "AuthManager",
                enable_fuzzy=False,
            )
            assert "auth.py" in [hit.path for hit in hits]
# =============================================================================
# Tests: _search_parallel
# =============================================================================
class TestSearchParallel:
    """Tests for _search_parallel() parallel backend execution."""

    def test_search_parallel_backends(self, temp_paths, mock_config):
        """Every backend requested via the flags map must be executed once."""
        parallel_engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        with patch.object(parallel_engine, "_search_exact") as exact_mock, \
                patch.object(parallel_engine, "_search_fuzzy") as fuzzy_mock:
            exact_mock.return_value = [
                SearchResult(path="a.py", score=10.0, excerpt="exact"),
            ]
            fuzzy_mock.return_value = [
                SearchResult(path="b.py", score=8.0, excerpt="fuzzy"),
            ]
            by_backend = parallel_engine._search_parallel(
                db_path,
                "query",
                backends={"exact": True, "fuzzy": True},
                limit=10,
            )
            # Each requested backend appears in the result map and ran once.
            assert "exact" in by_backend
            assert "fuzzy" in by_backend
            exact_mock.assert_called_once()
            fuzzy_mock.assert_called_once()
# =============================================================================
# Tests: _search_lsp_graph
# =============================================================================
class TestSearchLspGraph:
    """Tests for _search_lsp_graph() LSP graph expansion."""

    def test_search_lsp_graph(self, temp_paths, mock_config):
        """Without LSP support the graph search degrades to an empty list."""
        graph_engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        # HAS_LSP False short-circuits _search_lsp_graph to [].
        with patch("codexlens.search.hybrid_search.HAS_LSP", False):
            assert graph_engine._search_lsp_graph(db_path, "auth function", limit=5) == []

    def test_lsp_fallback_vector_to_fts(self, temp_paths, mock_config):
        """With no vector seeds available, FTS must be queried as a fallback."""
        graph_engine = HybridSearchEngine(config=mock_config)
        db_path = temp_paths / "_index.db"
        fts_seed = SearchResult(
            path="auth.py",
            score=10.0,
            excerpt="def auth():",
            symbol_name="auth",
            start_line=1,
            end_line=5,
        )
        with patch("codexlens.search.hybrid_search.HAS_LSP", True), \
                patch.object(graph_engine, "_search_vector", return_value=[]), \
                patch.object(graph_engine, "_search_exact") as exact_mock:
            exact_mock.return_value = [fts_seed]
            # The LSP bridge itself may be unavailable (import or async
            # failure); by the time expansion fails, the FTS fallback has
            # already been exercised, which is what this test verifies.
            try:
                graph_engine._search_lsp_graph(db_path, "auth", limit=5)
            except Exception:
                pass  # LSP deps may not be available, but FTS fallback was attempted
            exact_mock.assert_called_once()

View File

@@ -0,0 +1,330 @@
"""Unit tests for ranking.py - RRF weights, intent detection, score fusion, and filtering.
Tests cover:
- detect_query_intent: CamelCase/underscore -> KEYWORD, natural language -> SEMANTIC, mixed
- adjust_weights_by_intent: Weight adjustments per intent type
- get_rrf_weights: Composite of detect + adjust
- reciprocal_rank_fusion: Single/multi source, empty, weight normalization
- simple_weighted_fusion: Basic fusion and empty input
- apply_symbol_boost: Symbol match boost and no-match scenario
- filter_results_by_category: KEYWORD -> code only, SEMANTIC -> docs priority
- group_similar_results: Group results by score proximity
- normalize_weights: All-zero weights edge case
"""
from __future__ import annotations
import math
from typing import Dict, List
from unittest.mock import MagicMock
import pytest
from codexlens.entities import SearchResult
from codexlens.search.ranking import (
DEFAULT_WEIGHTS,
QueryIntent,
adjust_weights_by_intent,
apply_symbol_boost,
detect_query_intent,
filter_results_by_category,
get_rrf_weights,
group_similar_results,
normalize_weights,
reciprocal_rank_fusion,
simple_weighted_fusion,
)
# =============================================================================
# Helpers
# =============================================================================
def _make_result(
    path: str = "a.py",
    score: float = 0.5,
    excerpt: str = "def foo():",
    symbol_name: str | None = None,
    symbol_kind: str | None = None,
    start_line: int | None = None,
    end_line: int | None = None,
) -> SearchResult:
    """Build a SearchResult pre-filled with convenient test defaults."""
    fields = {
        "path": path,
        "score": score,
        "excerpt": excerpt,
        "symbol_name": symbol_name,
        "symbol_kind": symbol_kind,
        "start_line": start_line,
        "end_line": end_line,
    }
    return SearchResult(**fields)
# =============================================================================
# Tests: detect_query_intent
# =============================================================================
class TestDetectQueryIntent:
    """Tests for detect_query_intent()."""

    def test_detect_keyword_intent(self):
        """Identifier-style queries (CamelCase, snake_case, ::) map to KEYWORD."""
        for query in ("MyClassName", "my_function_name", "foo::bar"):
            assert detect_query_intent(query) == QueryIntent.KEYWORD

    def test_detect_semantic_intent(self):
        """Plain natural-language questions map to SEMANTIC."""
        assert detect_query_intent("how to authenticate users safely?") == QueryIntent.SEMANTIC
        assert detect_query_intent("explain the login process") == QueryIntent.SEMANTIC

    def test_detect_mixed_intent(self):
        """A query mixing an identifier with question words maps to MIXED."""
        # "my_function" is a code token while "how" is a natural-language cue.
        assert detect_query_intent("how does my_function work") == QueryIntent.MIXED

    def test_detect_empty_query(self):
        """Blank input falls back to the MIXED default."""
        for blank in ("", " "):
            assert detect_query_intent(blank) == QueryIntent.MIXED
# =============================================================================
# Tests: adjust_weights_by_intent
# =============================================================================
class TestAdjustWeightsByIntent:
    """Tests for adjust_weights_by_intent()."""

    def test_adjust_keyword_weights(self):
        """KEYWORD intent should boost exact and reduce vector."""
        raw = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6}
        tuned = adjust_weights_by_intent(QueryIntent.KEYWORD, raw)
        # Target distribution: exact 0.5 / fuzzy 0.1 / vector 0.4.
        expected = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}
        for source, value in expected.items():
            assert tuned[source] == pytest.approx(value, abs=0.01)

    def test_adjust_semantic_weights(self):
        """SEMANTIC intent should boost vector and reduce exact."""
        raw = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6}
        tuned = adjust_weights_by_intent(QueryIntent.SEMANTIC, raw)
        # Target distribution: exact 0.2 / fuzzy 0.1 / vector 0.7.
        expected = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}
        for source, value in expected.items():
            assert tuned[source] == pytest.approx(value, abs=0.01)

    def test_adjust_mixed_weights(self):
        """MIXED intent should return normalized base_weights."""
        raw = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6}
        tuned = adjust_weights_by_intent(QueryIntent.MIXED, raw)
        # Normalization keeps the total at 1.0 with proportions intact.
        assert sum(tuned.values()) == pytest.approx(1.0, abs=0.01)
        assert tuned["exact"] == pytest.approx(0.3, abs=0.01)
# =============================================================================
# Tests: get_rrf_weights
# =============================================================================
class TestGetRrfWeights:
    """Tests for get_rrf_weights() composite function."""

    def test_get_rrf_weights_composite(self):
        """get_rrf_weights chains intent detection with weight adjustment."""
        base_weights = {"exact": 0.3, "fuzzy": 0.1, "vector": 0.6}
        # An identifier-shaped query resolves to KEYWORD, which boosts exact.
        adjusted = get_rrf_weights("MyClassName", base_weights)
        assert adjusted["exact"] > adjusted["fuzzy"]
# =============================================================================
# Tests: reciprocal_rank_fusion
# =============================================================================
class TestReciprocalRankFusion:
    """Tests for reciprocal_rank_fusion()."""

    def test_rrf_single_source(self):
        """A lone source still yields a ranked fused list."""
        source_map = {
            "exact": [
                _make_result(path="a.py", score=10.0),
                _make_result(path="b.py", score=5.0),
            ]
        }
        fused = reciprocal_rank_fusion(source_map)
        assert len(fused) == 2
        # Rank-1 a.py must come first and carry the larger fused score.
        assert fused[0].path == "a.py"
        assert fused[0].score > fused[1].score

    def test_rrf_multi_source(self):
        """Rankings from several sources are combined into one ordering."""
        source_map = {
            "exact": [
                _make_result(path="a.py", score=10.0),
                _make_result(path="b.py", score=5.0),
            ],
            "vector": [
                _make_result(path="b.py", score=0.9),
                _make_result(path="c.py", score=0.8),
            ],
        }
        fused = reciprocal_rank_fusion(source_map, weights={"exact": 0.5, "vector": 0.5})
        # b.py is ranked by both sources, so it should top the fused list.
        assert len(fused) == 3
        assert fused[0].path == "b.py"
        assert fused[0].metadata["fusion_method"] == "rrf"

    def test_rrf_empty_results(self):
        """An empty source map fuses to an empty list."""
        assert reciprocal_rank_fusion({}) == []

    def test_rrf_weight_normalization(self):
        """Weights that do not sum to 1.0 are normalized automatically."""
        source_map = {"exact": [_make_result(path="a.py", score=10.0)]}
        fused = reciprocal_rank_fusion(source_map, weights={"exact": 2.0})
        assert len(fused) == 1
        # A positive fused score shows the oversized weight was handled.
        assert fused[0].score > 0
# =============================================================================
# Tests: simple_weighted_fusion
# =============================================================================
class TestSimpleWeightedFusion:
    """Tests for simple_weighted_fusion()."""

    def test_weighted_fusion_basic(self):
        """Scores for the same path are combined across sources."""
        source_map = {
            "exact": [_make_result(path="a.py", score=10.0)],
            "vector": [_make_result(path="a.py", score=0.8)],
        }
        fused = simple_weighted_fusion(source_map, weights={"exact": 0.5, "vector": 0.5})
        assert len(fused) == 1
        merged = fused[0]
        assert merged.path == "a.py"
        assert merged.metadata["fusion_method"] == "simple_weighted"
        assert merged.score > 0

    def test_weighted_fusion_empty(self):
        """No sources means no fused output."""
        assert simple_weighted_fusion({}) == []
# =============================================================================
# Tests: apply_symbol_boost
# =============================================================================
class TestApplySymbolBoost:
    """Tests for apply_symbol_boost()."""

    def test_symbol_boost_applied(self):
        """A result carrying a symbol_name gets multiplied by the factor."""
        candidates = [
            _make_result(path="a.py", score=0.5, symbol_name="authenticate"),
            _make_result(path="b.py", score=0.6),
        ]
        boosted = apply_symbol_boost(candidates, boost_factor=1.5)
        # The symbol-bearing hit is scaled: 0.5 * 1.5 == 0.75.
        with_symbol = next(r for r in boosted if r.path == "a.py")
        assert with_symbol.score == pytest.approx(0.75, abs=0.01)
        assert with_symbol.metadata.get("boosted") is True

    def test_symbol_boost_no_match(self):
        """A result without a symbol_name keeps its original score."""
        boosted = apply_symbol_boost([_make_result(path="a.py", score=0.5)], boost_factor=1.5)
        assert boosted[0].score == pytest.approx(0.5, abs=0.01)
        assert boosted[0].metadata.get("boosted") is not True
# =============================================================================
# Tests: filter_results_by_category
# =============================================================================
class TestFilterResultsByCategory:
    """Tests for filter_results_by_category()."""

    def test_filter_keyword_code_only(self):
        """KEYWORD intent drops documentation files entirely."""
        mixed_hits = [
            _make_result(path="main.py", score=0.9),
            _make_result(path="README.md", score=0.8),
            _make_result(path="utils.ts", score=0.7),
        ]
        kept = filter_results_by_category(mixed_hits, QueryIntent.KEYWORD)
        kept_paths = [hit.path for hit in kept]
        assert "README.md" not in kept_paths
        assert "main.py" in kept_paths
        assert "utils.ts" in kept_paths

    def test_filter_semantic_docs_first(self):
        """SEMANTIC intent reorders documentation ahead of code."""
        mixed_hits = [
            _make_result(path="main.py", score=0.9),
            _make_result(path="README.md", score=0.8),
        ]
        kept = filter_results_by_category(mixed_hits, QueryIntent.SEMANTIC, allow_mixed=True)
        # The documentation hit must be promoted to the front.
        assert kept[0].path == "README.md"
# =============================================================================
# Tests: group_similar_results
# =============================================================================
class TestGroupSimilarResults:
    """Tests for group_similar_results()."""

    def test_group_similar_results(self):
        """Hits sharing an excerpt within the score window collapse together."""
        hits = [
            _make_result(path="a.py", score=0.50, excerpt="def foo():"),
            _make_result(path="b.py", score=0.50, excerpt="def foo():"),
            _make_result(path="c.py", score=0.30, excerpt="def bar():"),
        ]
        collapsed = group_similar_results(hits, score_threshold_abs=0.01)
        # a.py absorbs b.py; c.py stays separate -> two groups total.
        assert len(collapsed) == 2
        primary = next(r for r in collapsed if r.path == "a.py")
        assert len(primary.additional_locations) == 1
        assert primary.additional_locations[0].path == "b.py"
# =============================================================================
# Tests: normalize_weights
# =============================================================================
class TestNormalizeWeights:
    """Tests for normalize_weights()."""

    def test_normalize_weights_zero_total(self):
        """A zero-sum weight map is returned as-is (no division by zero)."""
        zero_weights = {"exact": 0.0, "fuzzy": 0.0, "vector": 0.0}
        normalized = normalize_weights(zero_weights)
        # Compare against a fresh literal, not the input object, so an
        # accidental in-place mutation could not hide a wrong result.
        assert normalized == {"exact": 0.0, "fuzzy": 0.0, "vector": 0.0}