Add comprehensive tests for vector/semantic search functionality

- Implement full coverage tests for Embedder model loading and embedding generation
- Add CRUD operations and caching tests for VectorStore
- Include cosine similarity computation tests
- Validate semantic search accuracy and relevance through various queries
- Establish performance benchmarks for embedding and search operations
- Ensure edge cases and error handling are covered
- Test thread safety and concurrent access scenarios
- Verify availability of semantic search dependencies
This commit is contained in:
catlog22
2025-12-14 17:17:09 +08:00
parent 8d542b8e45
commit 79a2953862
47 changed files with 11208 additions and 4336 deletions

View File

@@ -0,0 +1,831 @@
"""Tests for LLM-based semantic enhancement functionality.
Tests cover:
- LLMConfig and data classes
- LLMEnhancer initialization and configuration
- Prompt building and JSON parsing
- Batch processing logic
- CCW CLI invocation (mocked)
- EnhancedSemanticIndexer integration
- Error handling and fallback behavior
"""
import json
import tempfile
from pathlib import Path
from typing import Dict, Any
from unittest.mock import MagicMock, patch, PropertyMock
import pytest
from codexlens.entities import SemanticChunk, Symbol
from codexlens.semantic.llm_enhancer import (
SemanticMetadata,
FileData,
LLMConfig,
LLMEnhancer,
EnhancedSemanticIndexer,
create_enhancer,
create_enhanced_indexer,
)
# === Data Class Tests ===
class TestSemanticMetadata:
    """Unit tests covering the SemanticMetadata dataclass."""

    def test_basic_creation(self):
        """Required fields populate; optional provenance fields default to None."""
        meta = SemanticMetadata(
            summary="Authentication handler",
            keywords=["auth", "login", "jwt"],
            purpose="auth",
        )
        assert meta.summary == "Authentication handler"
        assert meta.keywords == ["auth", "login", "jwt"]
        assert meta.purpose == "auth"
        # None of the optional fields should be set when omitted.
        assert meta.file_path is None
        assert meta.symbol_name is None
        assert meta.llm_tool is None

    def test_full_creation(self):
        """All fields, including optional provenance, round-trip unchanged."""
        fields = dict(
            summary="User login function",
            keywords=["login", "user"],
            purpose="auth",
            file_path="/test/auth.py",
            symbol_name="login",
            llm_tool="gemini",
        )
        meta = SemanticMetadata(**fields)
        assert meta.file_path == "/test/auth.py"
        assert meta.symbol_name == "login"
        assert meta.llm_tool == "gemini"

    def test_empty_keywords(self):
        """An empty keyword list is stored as-is, not normalised away."""
        meta = SemanticMetadata(summary="Empty", keywords=[], purpose="")
        assert meta.keywords == []
class TestFileData:
    """Unit tests covering the FileData dataclass."""

    def test_basic_creation(self):
        """Required fields populate; symbols defaults to an empty list."""
        fd = FileData(
            path="/test/file.py",
            content="def hello(): pass",
            language="python",
        )
        assert fd.path == "/test/file.py"
        assert fd.content == "def hello(): pass"
        assert fd.language == "python"
        assert fd.symbols == []

    def test_with_symbols(self):
        """An explicit symbol list is stored untouched and in order."""
        provided = [
            Symbol(name="hello", kind="function", range=(1, 1)),
            Symbol(name="MyClass", kind="class", range=(3, 10)),
        ]
        fd = FileData(
            path="/test/file.py",
            content="code",
            language="python",
            symbols=provided,
        )
        assert len(fd.symbols) == 2
        assert fd.symbols[0].name == "hello"
class TestLLMConfig:
    """Unit tests covering the LLMConfig dataclass."""

    def test_default_values(self):
        """Defaults match the documented configuration."""
        cfg = LLMConfig()
        expected = {
            "tool": "gemini",
            "fallback_tool": "qwen",
            "timeout_ms": 300000,
            "batch_size": 5,
            "max_content_chars": 8000,
            "enabled": True,
        }
        for name, value in expected.items():
            assert getattr(cfg, name) == value

    def test_custom_values(self):
        """Every field can be overridden at construction time."""
        cfg = LLMConfig(
            tool="qwen",
            fallback_tool="gemini",
            timeout_ms=600000,
            batch_size=10,
            max_content_chars=4000,
            enabled=False,
        )
        assert cfg.tool == "qwen"
        assert cfg.fallback_tool == "gemini"
        assert cfg.timeout_ms == 600000
        assert cfg.batch_size == 10
        assert cfg.max_content_chars == 4000
        assert cfg.enabled is False

    @patch.dict("os.environ", {"CCW_CLI_SECONDARY_TOOL": "codex", "CCW_CLI_FALLBACK_TOOL": "gemini"})
    def test_env_override(self):
        """CCW_CLI_* environment variables override the tool defaults."""
        cfg = LLMConfig()
        assert cfg.tool == "codex"
        assert cfg.fallback_tool == "gemini"
# === LLMEnhancer Tests ===
class TestLLMEnhancerInit:
    """Initialization behaviour of LLMEnhancer."""

    def test_default_init(self):
        """A bare enhancer gets a default config and no cached availability."""
        enh = LLMEnhancer()
        assert enh.config is not None
        assert enh.config.tool == "gemini"
        # Availability probe has not run yet.
        assert enh._ccw_available is None

    def test_custom_config(self):
        """A supplied config object is adopted verbatim."""
        enh = LLMEnhancer(LLMConfig(tool="qwen", batch_size=3))
        assert enh.config.tool == "qwen"
        assert enh.config.batch_size == 3
class TestLLMEnhancerAvailability:
    """Availability probing for the ccw CLI binary."""

    @patch("shutil.which")
    def test_ccw_available(self, mock_which):
        """A resolvable binary reports True and caches the result."""
        mock_which.return_value = "/usr/bin/ccw"
        enh = LLMEnhancer()
        assert enh.check_available() is True
        assert enh._ccw_available is True
        mock_which.assert_called_with("ccw")

    @patch("shutil.which")
    def test_ccw_not_available(self, mock_which):
        """A missing binary reports False and caches the result."""
        mock_which.return_value = None
        enh = LLMEnhancer()
        assert enh.check_available() is False
        assert enh._ccw_available is False

    @patch("shutil.which")
    def test_ccw_availability_cached(self, mock_which):
        """Repeated probes reuse the first answer instead of re-resolving."""
        mock_which.return_value = "/usr/bin/ccw"
        enh = LLMEnhancer()
        enh.check_available()
        enh.check_available()
        # shutil.which must have run exactly once despite two probes.
        mock_which.assert_called_once()
class TestPromptBuilding:
    """Construction of the batch prompt sent to the LLM."""

    def test_build_single_file_prompt(self):
        """One file yields a prompt with header, code fence, and schema cues."""
        enh = LLMEnhancer()
        prompt = enh._build_batch_prompt([
            FileData(
                path="/test/auth.py",
                content="def login(): pass",
                language="python",
            )
        ])
        for fragment in (
            "[FILE: /test/auth.py]",
            "```python",
            "def login(): pass",
            "PURPOSE:",
            "JSON format output",
        ):
            assert fragment in prompt

    def test_build_multiple_files_prompt(self):
        """Each file contributes its own header and language-tagged fence."""
        enh = LLMEnhancer()
        prompt = enh._build_batch_prompt([
            FileData(path="/test/a.py", content="def a(): pass", language="python"),
            FileData(path="/test/b.js", content="function b() {}", language="javascript"),
        ])
        for fragment in (
            "[FILE: /test/a.py]",
            "[FILE: /test/b.js]",
            "```python",
            "```javascript",
        ):
            assert fragment in prompt

    def test_build_prompt_truncates_long_content(self):
        """Content beyond max_content_chars is cut and explicitly marked."""
        enh = LLMEnhancer(LLMConfig(max_content_chars=100))
        oversized = "x" * 200
        prompt = enh._build_batch_prompt(
            [FileData(path="/test/long.py", content=oversized, language="python")]
        )
        assert "... [truncated]" in prompt
        assert "x" * 200 not in prompt
class TestJSONParsing:
    """Parsing of LLM JSON responses into SemanticMetadata records."""

    def test_parse_valid_response(self):
        """A well-formed payload maps file paths to populated metadata."""
        enh = LLMEnhancer()
        payload = {
            "files": {
                "/test/auth.py": {
                    "summary": "Authentication handler",
                    "keywords": ["auth", "login"],
                    "purpose": "auth",
                }
            }
        }
        parsed = enh._parse_response(json.dumps(payload), "gemini")
        assert "/test/auth.py" in parsed
        entry = parsed["/test/auth.py"]
        assert entry.summary == "Authentication handler"
        assert entry.keywords == ["auth", "login"]
        assert entry.purpose == "auth"
        # The tool that produced the response is recorded on each entry.
        assert entry.llm_tool == "gemini"

    def test_parse_response_with_markdown(self):
        """A fenced-markdown wrapper is stripped before JSON decoding."""
        enh = LLMEnhancer()
        wrapped = '''```json
{
"files": {
"/test/file.py": {
"summary": "Test file",
"keywords": ["test"],
"purpose": "test"
}
}
}
```'''
        parsed = enh._parse_response(wrapped, "qwen")
        assert "/test/file.py" in parsed
        assert parsed["/test/file.py"].summary == "Test file"

    def test_parse_response_multiple_files(self):
        """Every file entry in the payload becomes a result entry."""
        enh = LLMEnhancer()
        payload = {
            "files": {
                "/test/a.py": {"summary": "File A", "keywords": ["a"], "purpose": "util"},
                "/test/b.py": {"summary": "File B", "keywords": ["b"], "purpose": "api"},
            }
        }
        parsed = enh._parse_response(json.dumps(payload), "gemini")
        assert len(parsed) == 2
        assert parsed["/test/a.py"].summary == "File A"
        assert parsed["/test/b.py"].summary == "File B"

    def test_parse_invalid_json(self):
        """Undecodable text degrades to an empty mapping, not an exception."""
        enh = LLMEnhancer()
        assert enh._parse_response("not valid json at all", "gemini") == {}

    def test_parse_empty_response(self):
        """An empty response likewise yields an empty mapping."""
        enh = LLMEnhancer()
        assert enh._parse_response("", "gemini") == {}
class TestJSONExtraction:
    """Extraction of a JSON object embedded in arbitrary response text."""

    def test_extract_json_from_plain(self):
        """Bare JSON is returned unchanged."""
        enh = LLMEnhancer()
        assert enh._extract_json('{"key": "value"}') == '{"key": "value"}'

    def test_extract_json_from_markdown(self):
        """JSON inside a fenced code block is unwrapped."""
        enh = LLMEnhancer()
        fenced = '''```json
{"key": "value"}
```'''
        assert enh._extract_json(fenced) == '{"key": "value"}'

    def test_extract_json_with_surrounding_text(self):
        """Leading and trailing prose around the object is discarded."""
        enh = LLMEnhancer()
        noisy = 'Here is the result: {"key": "value"} That is all.'
        assert enh._extract_json(noisy) == '{"key": "value"}'

    def test_extract_nested_json(self):
        """Nested braces do not confuse the extractor."""
        enh = LLMEnhancer()
        extracted = enh._extract_json('{"outer": {"inner": "value"}}')
        assert '"outer"' in extracted
        assert '"inner"' in extracted

    def test_extract_no_json(self):
        """Text containing no object at all yields None."""
        enh = LLMEnhancer()
        assert enh._extract_json("No JSON here at all") is None

    def test_extract_malformed_json(self):
        """A truncated object is rejected rather than 'repaired'."""
        enh = LLMEnhancer()
        # Missing closing brace
        assert enh._extract_json('{"key": "value"') is None
class TestEnhanceFiles:
    """Tests for enhance_files method.

    NOTE: stacked ``patch.object`` decorators apply bottom-up, so the
    bottom-most patch is always the first mock argument after ``self``.
    """

    @patch.object(LLMEnhancer, "check_available", return_value=False)
    def test_enhance_files_ccw_not_available(self, mock_check):
        """Test enhance_files returns empty when CCW not available."""
        enhancer = LLMEnhancer()
        files = [FileData(path="/test/a.py", content="code", language="python")]
        result = enhancer.enhance_files(files)
        # Without the CLI, no enhancement is attempted at all.
        assert result == {}

    def test_enhance_files_disabled(self):
        """Test enhance_files returns empty when disabled."""
        config = LLMConfig(enabled=False)
        enhancer = LLMEnhancer(config)
        files = [FileData(path="/test/a.py", content="code", language="python")]
        result = enhancer.enhance_files(files)
        # The enabled flag short-circuits before any availability check.
        assert result == {}

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    def test_enhance_files_empty_list(self, mock_check):
        """Test enhance_files with empty list returns empty dict."""
        enhancer = LLMEnhancer()
        result = enhancer.enhance_files([])
        assert result == {}

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_invoke_ccw_cli")
    def test_enhance_files_success(self, mock_invoke, mock_check):
        """Test enhance_files successful processing."""
        # Simulate one successful CLI round-trip returning metadata JSON.
        mock_invoke.return_value = {
            "success": True,
            "stdout": json.dumps({
                "files": {
                    "/test/auth.py": {
                        "summary": "Auth module",
                        "keywords": ["auth"],
                        "purpose": "auth",
                    }
                }
            }),
            "stderr": "",
            "exit_code": 0,
        }
        enhancer = LLMEnhancer()
        files = [FileData(path="/test/auth.py", content="def login(): pass", language="python")]
        result = enhancer.enhance_files(files)
        assert "/test/auth.py" in result
        assert result["/test/auth.py"].summary == "Auth module"

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_invoke_ccw_cli")
    def test_enhance_files_fallback(self, mock_invoke, mock_check):
        """Test enhance_files falls back to secondary tool."""
        # First call fails, second succeeds
        mock_invoke.side_effect = [
            {"success": False, "stdout": "", "stderr": "error", "exit_code": 1},
            {
                "success": True,
                "stdout": json.dumps({
                    "files": {
                        "/test/file.py": {
                            "summary": "Fallback result",
                            "keywords": ["fallback"],
                            "purpose": "util",
                        }
                    }
                }),
                "stderr": "",
                "exit_code": 0,
            },
        ]
        enhancer = LLMEnhancer()
        files = [FileData(path="/test/file.py", content="code", language="python")]
        result = enhancer.enhance_files(files)
        assert "/test/file.py" in result
        assert result["/test/file.py"].summary == "Fallback result"
        # Two invocations prove the fallback tool was actually tried.
        assert mock_invoke.call_count == 2
class TestEnhanceFile:
    """Single-file convenience wrapper around enhance_files."""

    @patch.object(LLMEnhancer, "enhance_files")
    def test_enhance_file_success(self, mock_enhance_files):
        """Metadata produced by the batch API is returned for the one file."""
        mock_enhance_files.return_value = {
            "/test/auth.py": SemanticMetadata(
                summary="Auth module",
                keywords=["auth", "login"],
                purpose="auth",
                file_path="/test/auth.py",
                llm_tool="gemini",
            )
        }
        meta = LLMEnhancer().enhance_file("/test/auth.py", "def login(): pass", "python")
        assert meta.summary == "Auth module"
        assert meta.keywords == ["auth", "login"]

    @patch.object(LLMEnhancer, "enhance_files")
    def test_enhance_file_fallback_on_failure(self, mock_enhance_files):
        """When the batch API yields nothing, a language-based default is built."""
        mock_enhance_files.return_value = {}  # Enhancement failed
        meta = LLMEnhancer().enhance_file("/test/file.py", "code", "python")
        assert "python" in meta.summary.lower()
        assert "python" in meta.keywords
        assert meta.purpose == "unknown"
class TestBatchProcessing:
    """Tests for batch processing.

    NOTE: stacked ``patch.object`` decorators apply bottom-up, so
    ``mock_process`` (the bottom patch) is the first mock argument.
    """

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_process_batch")
    def test_batch_processing(self, mock_process, mock_check):
        """Test files are processed in batches."""
        # _process_batch is stubbed out; we only count how often it runs.
        mock_process.return_value = {}
        config = LLMConfig(batch_size=2)
        enhancer = LLMEnhancer(config)
        files = [
            FileData(path=f"/test/file{i}.py", content="code", language="python")
            for i in range(5)
        ]
        enhancer.enhance_files(files)
        # 5 files with batch_size=2 should result in 3 batches
        assert mock_process.call_count == 3

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_process_batch")
    def test_batch_continues_on_error(self, mock_process, mock_check):
        """Test batch processing continues on error."""
        # First batch fails, second succeeds
        mock_process.side_effect = [
            Exception("Batch 1 failed"),
            {"/test/file2.py": SemanticMetadata(summary="OK", keywords=[], purpose="")},
        ]
        # batch_size=1 guarantees each file lands in its own batch.
        config = LLMConfig(batch_size=1)
        enhancer = LLMEnhancer(config)
        files = [
            FileData(path="/test/file1.py", content="code", language="python"),
            FileData(path="/test/file2.py", content="code", language="python"),
        ]
        result = enhancer.enhance_files(files)
        # Should still get results from second batch
        assert "/test/file2.py" in result
# === CCW CLI Invocation Tests ===
class TestCCWInvocation:
    """Tests for CCW CLI invocation.

    NOTE: decorators apply bottom-up — ``mock_which`` (bottom) is the first
    mock argument, ``mock_run`` the second.
    """

    @patch("subprocess.run")
    @patch("shutil.which", return_value="/usr/bin/ccw")
    def test_invoke_success(self, mock_which, mock_run):
        """Test successful CCW CLI invocation."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout='{"files": {}}',
            stderr="",
        )
        enhancer = LLMEnhancer()
        result = enhancer._invoke_ccw_cli("test prompt", tool="gemini")
        assert result["success"] is True
        assert result["exit_code"] == 0

    @patch("subprocess.run")
    @patch("shutil.which", return_value="/usr/bin/ccw")
    def test_invoke_failure(self, mock_which, mock_run):
        """Test failed CCW CLI invocation."""
        # Non-zero exit code should be surfaced as success=False.
        mock_run.return_value = MagicMock(
            returncode=1,
            stdout="",
            stderr="Error occurred",
        )
        enhancer = LLMEnhancer()
        result = enhancer._invoke_ccw_cli("test prompt", tool="gemini")
        assert result["success"] is False
        assert result["exit_code"] == 1

    @patch("subprocess.run")
    @patch("shutil.which", return_value="/usr/bin/ccw")
    def test_invoke_timeout(self, mock_which, mock_run):
        """Test CCW CLI timeout handling."""
        import subprocess
        # TimeoutExpired must be caught and reported, not propagated.
        mock_run.side_effect = subprocess.TimeoutExpired(cmd="ccw", timeout=300)
        enhancer = LLMEnhancer()
        result = enhancer._invoke_ccw_cli("test prompt", tool="gemini")
        assert result["success"] is False
        assert "timeout" in result["stderr"]

    @patch("subprocess.run")
    @patch("shutil.which", return_value=None)
    def test_invoke_ccw_not_found(self, mock_which, mock_run):
        """Test CCW CLI not found handling."""
        mock_run.side_effect = FileNotFoundError()
        enhancer = LLMEnhancer()
        result = enhancer._invoke_ccw_cli("test prompt", tool="gemini")
        assert result["success"] is False
        assert "not found" in result["stderr"]
# === EnhancedSemanticIndexer Tests ===
class TestEnhancedSemanticIndexer:
    """Tests for EnhancedSemanticIndexer integration.

    All three collaborators (enhancer, embedder, vector store) are mocked
    so these tests exercise only the indexer's orchestration logic.
    """

    @pytest.fixture
    def mock_enhancer(self):
        """Create mock LLM enhancer that returns one enhanced file."""
        enhancer = MagicMock(spec=LLMEnhancer)
        enhancer.enhance_files.return_value = {
            "/test/auth.py": SemanticMetadata(
                summary="Authentication handler",
                keywords=["auth", "login", "jwt"],
                purpose="auth",
                file_path="/test/auth.py",
                llm_tool="gemini",
            )
        }
        return enhancer

    @pytest.fixture
    def mock_embedder(self):
        """Create mock embedder producing fixed 384-dim vectors."""
        embedder = MagicMock()
        embedder.embed.return_value = [[0.1] * 384]
        embedder.embed_single.return_value = [0.1] * 384
        return embedder

    @pytest.fixture
    def mock_vector_store(self):
        """Create mock vector store whose add_chunk reports id 1."""
        store = MagicMock()
        store.add_chunk.return_value = 1
        return store

    def test_index_files_empty_list(self, mock_enhancer, mock_embedder, mock_vector_store):
        """Test indexing empty file list."""
        indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store)
        result = indexer.index_files([])
        assert result == 0
        # No files means the LLM must never be invoked.
        mock_enhancer.enhance_files.assert_not_called()

    def test_index_files_with_llm_enhancement(self, mock_enhancer, mock_embedder, mock_vector_store):
        """Test indexing with LLM enhancement."""
        indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store)
        files = [FileData(path="/test/auth.py", content="def login(): pass", language="python")]
        result = indexer.index_files(files)
        assert result == 1
        # Full pipeline: enhance -> embed -> persist, one call each.
        mock_enhancer.enhance_files.assert_called_once()
        mock_embedder.embed.assert_called_once()
        mock_vector_store.add_chunk.assert_called_once()

    def test_index_files_fallback_to_raw_code(self, mock_embedder, mock_vector_store):
        """Test indexing falls back to raw code when LLM fails."""
        mock_enhancer = MagicMock(spec=LLMEnhancer)
        mock_enhancer.enhance_files.return_value = {}  # No enhancement
        indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store)
        files = [FileData(path="/test/file.py", content="code", language="python")]
        result = indexer.index_files(files)
        assert result == 1
        # Fallback path embeds the raw source via embed_single.
        mock_embedder.embed_single.assert_called()

    def test_create_embeddable_text(self, mock_enhancer, mock_embedder, mock_vector_store):
        """Test embeddable text creation."""
        indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store)
        metadata = SemanticMetadata(
            summary="Handles user authentication",
            keywords=["auth", "login", "user"],
            purpose="auth",
        )
        file_data = FileData(path="/test/auth.py", content="code", language="python")
        text = indexer._create_embeddable_text(metadata, file_data)
        # The embeddable text combines summary, keywords, and file name.
        assert "Handles user authentication" in text
        assert "auth" in text.lower()
        assert "Keywords:" in text
        assert "auth.py" in text
# === Factory Function Tests ===
class TestFactoryFunctions:
    """Tests for factory functions."""

    def test_create_enhancer_default(self):
        """create_enhancer with no arguments yields the default config."""
        enhancer = create_enhancer()
        assert enhancer.config.tool == "gemini"
        assert enhancer.config.enabled is True

    def test_create_enhancer_custom(self):
        """create_enhancer forwards custom parameters into the config."""
        enhancer = create_enhancer(
            tool="qwen",
            timeout_ms=600000,
            batch_size=10,
            enabled=False,
        )
        assert enhancer.config.tool == "qwen"
        assert enhancer.config.timeout_ms == 600000
        assert enhancer.config.batch_size == 10
        assert enhancer.config.enabled is False

    # FIX: the previous @pytest.mark.skipif decorator wrapped
    # pytest.importorskip(...), which runs at class-definition time: on
    # import failure it raises Skipped during module import (skipping the
    # ENTIRE module, not just this test), and on success it returns the
    # module object, so `not module` is always False and the skipif never
    # fires. The in-body guard below is the correct, test-scoped skip, so
    # the decorator is simply removed.
    def test_create_enhanced_indexer(self, tmp_path):
        """create_enhanced_indexer wires enhancer, embedder, and store."""
        try:
            from codexlens.semantic import SEMANTIC_AVAILABLE
            if not SEMANTIC_AVAILABLE:
                pytest.skip("Semantic dependencies not installed")
            db_path = tmp_path / "semantic.db"
            indexer = create_enhanced_indexer(db_path, llm_tool="gemini", llm_enabled=False)
            assert indexer.enhancer is not None
            assert indexer.embedder is not None
            assert indexer.vector_store is not None
        except ImportError:
            pytest.skip("Semantic dependencies not installed")
# === Edge Cases ===
class TestEdgeCases:
    """Edge cases around unusual content and partial responses."""

    def test_semantic_metadata_with_special_chars(self):
        """Quote characters inside summary and keywords survive unchanged."""
        meta = SemanticMetadata(
            summary='Test "quoted" and \'single\' quotes',
            keywords=["special", "chars", "test's"],
            purpose="test",
        )
        assert '"quoted"' in meta.summary
        assert "test's" in meta.keywords

    def test_file_data_with_unicode(self):
        """Non-ASCII paths and content are stored verbatim."""
        fd = FileData(
            path="/test/中文.py",
            content="def 你好(): return '世界'",
            language="python",
        )
        assert "中文" in fd.path
        assert "你好" in fd.content

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_invoke_ccw_cli")
    def test_enhance_with_very_long_content(self, mock_invoke, mock_check):
        """Oversized content is truncated into the prompt rather than crashing."""
        mock_invoke.return_value = {
            "success": True,
            "stdout": json.dumps({"files": {}}),
            "stderr": "",
            "exit_code": 0,
        }
        enh = LLMEnhancer(LLMConfig(max_content_chars=100))
        enh.enhance_files(
            [FileData(path="/test/long.py", content="x" * 10000, language="python")]
        )
        # A single clean invocation confirms the oversized input was handled.
        mock_invoke.assert_called_once()

    def test_parse_response_with_missing_fields(self):
        """Absent keywords/purpose fields fall back to empty values."""
        enh = LLMEnhancer()
        raw = json.dumps(
            {"files": {"/test/file.py": {"summary": "Only summary provided"}}}
        )
        parsed = enh._parse_response(raw, "gemini")
        assert "/test/file.py" in parsed
        entry = parsed["/test/file.py"]
        assert entry.summary == "Only summary provided"
        assert entry.keywords == []
        assert entry.purpose == ""

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,747 @@
"""Full coverage tests for vector/semantic search functionality.
Tests cover:
- Embedder model loading and embedding generation
- VectorStore CRUD operations and caching
- Cosine similarity computation
- Semantic search accuracy and relevance
- Performance benchmarks
- Edge cases and error handling
- Thread safety and concurrent access
"""
import json
import tempfile
import threading
import time
from pathlib import Path
from typing import List
import pytest
from codexlens.entities import SemanticChunk, Symbol, SearchResult
from codexlens.semantic import SEMANTIC_AVAILABLE, check_semantic_available
# Skip all tests if semantic dependencies not available
pytestmark = pytest.mark.skipif(
not SEMANTIC_AVAILABLE,
reason="Semantic search dependencies not installed (pip install codexlens[semantic])"
)
# === Fixtures ===
@pytest.fixture
def temp_db(tmp_path):
    """Provide the path for a throwaway per-test semantic database."""
    db_file = tmp_path / "test_semantic.db"
    return db_file
@pytest.fixture
def embedder():
    """Provide a fresh Embedder (its model loads lazily on first use)."""
    from codexlens.semantic.embedder import Embedder
    instance = Embedder()
    return instance
@pytest.fixture
def vector_store(temp_db):
    """Provide a VectorStore backed by the per-test database file."""
    from codexlens.semantic.vector_store import VectorStore
    store = VectorStore(temp_db)
    return store
@pytest.fixture
def sample_code_chunks():
    """Sample code chunks for testing.

    Each entry mimics an indexed chunk: raw source text plus the
    symbol/location metadata stored alongside it. The mix of Python and
    JavaScript, auth-, DB-, and math-related snippets lets relevance
    tests distinguish topically different content.
    """
    return [
        {
            "content": "def authenticate(username, password): return check_credentials(username, password)",
            "metadata": {"symbol_name": "authenticate", "symbol_kind": "function", "start_line": 1, "end_line": 1, "language": "python"},
        },
        {
            "content": "class DatabaseConnection:\n    def connect(self, host, port): pass\n    def execute(self, query): pass",
            "metadata": {"symbol_name": "DatabaseConnection", "symbol_kind": "class", "start_line": 1, "end_line": 3, "language": "python"},
        },
        {
            "content": "async function fetchUserData(userId) { return await api.get('/users/' + userId); }",
            "metadata": {"symbol_name": "fetchUserData", "symbol_kind": "function", "start_line": 1, "end_line": 1, "language": "javascript"},
        },
        {
            "content": "def calculate_sum(numbers): return sum(numbers)",
            "metadata": {"symbol_name": "calculate_sum", "symbol_kind": "function", "start_line": 1, "end_line": 1, "language": "python"},
        },
        {
            "content": "class UserProfile:\n    def __init__(self, name, email):\n        self.name = name\n        self.email = email",
            "metadata": {"symbol_name": "UserProfile", "symbol_kind": "class", "start_line": 1, "end_line": 4, "language": "python"},
        },
    ]
# === Embedder Tests ===
class TestEmbedder:
    """Embedding generation and model-loading behaviour of Embedder."""

    def test_embedder_initialization(self, embedder):
        """Model name, dimension constant, and lazy-load state are as expected."""
        assert embedder.model_name == "BAAI/bge-small-en-v1.5"
        assert embedder.EMBEDDING_DIM == 384
        assert embedder._model is None  # model not loaded until first embed

    def test_embed_single_returns_correct_dimension(self, embedder):
        """A single embedding is a 384-element list of floats."""
        vec = embedder.embed_single("def hello(): print('world')")
        assert isinstance(vec, list)
        assert len(vec) == 384
        assert all(isinstance(component, float) for component in vec)

    def test_embed_batch_returns_correct_count(self, embedder):
        """Batch embedding yields one 384-dim vector per input text."""
        snippets = [
            "def foo(): pass",
            "def bar(): pass",
            "def baz(): pass",
        ]
        vectors = embedder.embed(snippets)
        assert len(vectors) == len(snippets)
        assert all(len(vec) == 384 for vec in vectors)

    def test_embed_empty_string(self, embedder):
        """The empty string still produces a full-width vector."""
        assert len(embedder.embed_single("")) == 384

    def test_embed_unicode_text(self, embedder):
        """Non-ASCII source text embeds without error."""
        assert len(embedder.embed_single("def 你好(): return '世界'")) == 384

    def test_embed_long_text(self, embedder):
        """Long repeated input still yields a fixed-width vector."""
        repeated = "def process(): pass\n" * 100
        assert len(embedder.embed_single(repeated)) == 384

    def test_embed_special_characters(self, embedder):
        """Punctuation-heavy code embeds without error."""
        snippet = "def test(): return {'key': 'value', '@decorator': True}"
        assert len(embedder.embed_single(snippet)) == 384

    def test_lazy_model_loading(self, embedder):
        """The model attribute is populated only after the first embed call."""
        assert embedder._model is None
        embedder.embed_single("test")
        assert embedder._model is not None

    def test_model_reuse(self, embedder):
        """Subsequent calls reuse the already-loaded model object."""
        embedder.embed_single("test1")
        loaded = embedder._model
        embedder.embed_single("test2")
        assert embedder._model is loaded  # Same instance
class TestEmbeddingSimilarity:
    """Cosine-similarity properties of generated embeddings."""

    def test_identical_text_similarity(self, embedder):
        """Embedding the same text twice yields near-perfect similarity."""
        from codexlens.semantic.vector_store import _cosine_similarity
        snippet = "def calculate_sum(a, b): return a + b"
        first = embedder.embed_single(snippet)
        second = embedder.embed_single(snippet)
        score = _cosine_similarity(first, second)
        assert score > 0.99, "Identical text should have ~1.0 similarity"

    def test_similar_code_high_similarity(self, embedder):
        """Semantically equivalent functions score well above chance."""
        from codexlens.semantic.vector_store import _cosine_similarity
        adder = embedder.embed_single("def add(a, b): return a + b")
        summer = embedder.embed_single("def sum_numbers(x, y): return x + y")
        score = _cosine_similarity(adder, summer)
        assert score > 0.6, "Similar functions should have high similarity"

    def test_different_code_lower_similarity(self, embedder):
        """Unrelated code scores below near-duplicate code."""
        from codexlens.semantic.vector_store import _cosine_similarity
        adder = embedder.embed_single("def add(a, b): return a + b")
        summer = embedder.embed_single("def sum_numbers(x, y): return x + y")
        auth = embedder.embed_single("class UserAuth: def login(self, user, pwd): pass")
        near = _cosine_similarity(adder, summer)
        far = _cosine_similarity(adder, auth)
        assert near > far, "Similar code should have higher similarity"

    def test_zero_vector_similarity(self):
        """A zero vector is defined to have zero similarity to anything."""
        from codexlens.semantic.vector_store import _cosine_similarity
        zeros = [0.0] * 384
        ones = [1.0] * 384
        assert _cosine_similarity(zeros, ones) == 0.0, "Zero vector should have 0 similarity"
# === VectorStore Tests ===
class TestVectorStoreCRUD:
    """Create/read/delete behaviour of the VectorStore."""

    def test_add_chunk(self, vector_store, embedder):
        """A single embedded chunk is persisted and counted."""
        chunk = SemanticChunk(
            content="def test(): pass",
            metadata={"language": "python"},
        )
        chunk.embedding = embedder.embed_single(chunk.content)
        new_id = vector_store.add_chunk(chunk, "/test/file.py")
        assert new_id > 0
        assert vector_store.count_chunks() == 1

    def test_add_chunk_without_embedding_raises(self, vector_store):
        """Persisting a chunk that has no embedding is rejected."""
        bare = SemanticChunk(content="def test(): pass", metadata={})
        with pytest.raises(ValueError, match="must have embedding"):
            vector_store.add_chunk(bare, "/test/file.py")

    def test_add_chunks_batch(self, vector_store, embedder, sample_code_chunks):
        """Batch insert returns one id per chunk and stores them all."""
        prepared = []
        for item in sample_code_chunks:
            chunk = SemanticChunk(content=item["content"], metadata=item["metadata"])
            chunk.embedding = embedder.embed_single(chunk.content)
            prepared.append(chunk)
        new_ids = vector_store.add_chunks(prepared, "/test/multi.py")
        assert len(new_ids) == len(prepared)
        assert vector_store.count_chunks() == len(prepared)

    def test_add_empty_batch(self, vector_store):
        """Inserting an empty batch yields no ids."""
        assert vector_store.add_chunks([], "/test/empty.py") == []

    def test_delete_file_chunks(self, vector_store, embedder):
        """Deletion removes only the chunks belonging to the given file."""
        # Seed one chunk in each of two distinct files.
        for path, code in (
            ("/test/file1.py", "def a(): pass"),
            ("/test/file2.py", "def b(): pass"),
        ):
            chunk = SemanticChunk(content=code, metadata={})
            chunk.embedding = embedder.embed_single(chunk.content)
            vector_store.add_chunk(chunk, path)
        assert vector_store.count_chunks() == 2
        # Removing file1's chunks must leave file2's intact.
        assert vector_store.delete_file_chunks("/test/file1.py") == 1
        assert vector_store.count_chunks() == 1

    def test_delete_nonexistent_file(self, vector_store):
        """Deleting an unknown file is a no-op reporting zero rows."""
        assert vector_store.delete_file_chunks("/nonexistent/file.py") == 0

    def test_count_chunks_empty(self, vector_store):
        """A freshly created store holds no chunks."""
        assert vector_store.count_chunks() == 0
class TestVectorStoreSearch:
    """Tests for VectorStore search functionality."""

    @staticmethod
    def _index_chunks(store, embedder, chunk_specs, path="/test/file.py"):
        """Embed each (content, metadata) spec and persist it under *path*."""
        for spec in chunk_specs:
            chunk = SemanticChunk(content=spec["content"], metadata=spec["metadata"])
            chunk.embedding = embedder.embed_single(chunk.content)
            store.add_chunk(chunk, path)

    def test_search_similar_basic(self, vector_store, embedder, sample_code_chunks):
        """A natural-language query should surface a relevant chunk first."""
        self._index_chunks(vector_store, embedder, sample_code_chunks)
        query_vec = embedder.embed_single("function to authenticate user login")
        hits = vector_store.search_similar(query_vec, top_k=3)
        assert hits
        assert all(isinstance(hit, SearchResult) for hit in hits)
        # The best match should be authentication-related.
        top = hits[0]
        assert "authenticate" in top.excerpt.lower() or "auth" in top.path.lower()

    def test_search_respects_top_k(self, vector_store, embedder, sample_code_chunks):
        """Result counts must never exceed the requested top_k."""
        self._index_chunks(vector_store, embedder, sample_code_chunks)
        query_vec = embedder.embed_single("code")
        assert len(vector_store.search_similar(query_vec, top_k=2)) <= 2
        assert len(vector_store.search_similar(query_vec, top_k=5)) <= 5

    def test_search_min_score_filtering(self, vector_store, embedder):
        """Raising min_score should never increase the number of hits."""
        chunk = SemanticChunk(
            content="def hello(): print('hello world')",
            metadata={},
        )
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, "/test/hello.py")
        query_vec = embedder.embed_single("database connection pool")
        unfiltered = vector_store.search_similar(query_vec, min_score=0.0)
        filtered = vector_store.search_similar(query_vec, min_score=0.9)
        assert len(unfiltered) >= len(filtered)

    def test_search_returns_sorted_by_score(self, vector_store, embedder, sample_code_chunks):
        """Scores must be non-increasing down the result list."""
        self._index_chunks(vector_store, embedder, sample_code_chunks)
        query_vec = embedder.embed_single("function")
        hits = vector_store.search_similar(query_vec, top_k=5)
        scores = [hit.score for hit in hits]
        assert scores == sorted(scores, reverse=True)

    def test_search_includes_metadata(self, vector_store, embedder):
        """Stored symbol metadata should round-trip into search results."""
        symbol_meta = {
            "symbol_name": "test_function",
            "symbol_kind": "function",
            "start_line": 10,
            "end_line": 15,
        }
        chunk = SemanticChunk(content="def test_function(): pass", metadata=symbol_meta)
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, "/test/func.py")
        hits = vector_store.search_similar(
            embedder.embed_single("test function"), top_k=1
        )
        assert len(hits) == 1
        hit = hits[0]
        assert (hit.symbol_name, hit.symbol_kind) == ("test_function", "function")
        assert (hit.start_line, hit.end_line) == (10, 15)

    def test_search_empty_store_returns_empty(self, vector_store, embedder):
        """Searching an empty store should produce no results."""
        query_vec = embedder.embed_single("anything")
        assert vector_store.search_similar(query_vec) == []

    def test_search_with_return_full_content_false(self, vector_store, embedder):
        """With return_full_content=False only the excerpt is populated."""
        chunk = SemanticChunk(
            content="def long_function(): " + "pass\n" * 100,
            metadata={},
        )
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, "/test/long.py")
        hits = vector_store.search_similar(
            embedder.embed_single("function"), top_k=1, return_full_content=False
        )
        assert len(hits) == 1
        assert hits[0].content is None
        assert hits[0].excerpt is not None
class TestVectorStoreCache:
    """Tests for VectorStore caching behavior."""

    @staticmethod
    def _add_embedded_chunk(store, embedder, source, path):
        """Embed *source* and persist it under *path*."""
        chunk = SemanticChunk(content=source, metadata={})
        chunk.embedding = embedder.embed_single(source)
        store.add_chunk(chunk, path)

    def test_cache_invalidation_on_add(self, vector_store, embedder):
        """Adding a chunk must bump the cache version and drop the matrix."""
        self._add_embedded_chunk(vector_store, embedder, "def a(): pass", "/test/a.py")
        # Run a search so the similarity cache gets built.
        vector_store.search_similar(embedder.embed_single("function"))
        version_before = vector_store._cache_version
        # A second add should invalidate the cached state.
        self._add_embedded_chunk(vector_store, embedder, "def b(): pass", "/test/b.py")
        assert vector_store._cache_version > version_before
        assert vector_store._embedding_matrix is None

    def test_cache_invalidation_on_delete(self, vector_store, embedder):
        """Deleting a file's chunks must bump the cache version."""
        self._add_embedded_chunk(vector_store, embedder, "def a(): pass", "/test/a.py")
        # Run a search so the similarity cache gets built.
        vector_store.search_similar(embedder.embed_single("function"))
        version_before = vector_store._cache_version
        vector_store.delete_file_chunks("/test/a.py")
        assert vector_store._cache_version > version_before

    def test_manual_cache_clear(self, vector_store, embedder):
        """clear_cache() should discard the cached embedding matrix."""
        self._add_embedded_chunk(vector_store, embedder, "def a(): pass", "/test/a.py")
        # Run a search so the similarity cache gets built.
        vector_store.search_similar(embedder.embed_single("function"))
        assert vector_store._embedding_matrix is not None
        vector_store.clear_cache()
        assert vector_store._embedding_matrix is None
# === Semantic Search Accuracy Tests ===
class TestSemanticSearchAccuracy:
    """Tests for semantic search accuracy and relevance."""

    @staticmethod
    def _top_excerpt(store, embedder, chunk_specs, query):
        """Index *chunk_specs*, run *query*, and return the best excerpt lowercased."""
        for spec in chunk_specs:
            chunk = SemanticChunk(content=spec["content"], metadata=spec["metadata"])
            chunk.embedding = embedder.embed_single(chunk.content)
            store.add_chunk(chunk, "/test/file.py")
        hits = store.search_similar(embedder.embed_single(query), top_k=1)
        assert hits
        return hits[0].excerpt.lower()

    def test_auth_query_finds_auth_code(self, vector_store, embedder, sample_code_chunks):
        """An authentication query should rank auth code first."""
        excerpt = self._top_excerpt(
            vector_store, embedder, sample_code_chunks, "user authentication login"
        )
        assert "authenticate" in excerpt

    def test_database_query_finds_db_code(self, vector_store, embedder, sample_code_chunks):
        """A database query should rank database code first."""
        excerpt = self._top_excerpt(
            vector_store, embedder, sample_code_chunks, "database connection execute query"
        )
        assert "database" in excerpt or "connect" in excerpt

    def test_math_query_finds_calculation_code(self, vector_store, embedder, sample_code_chunks):
        """A math query should rank calculation code first."""
        excerpt = self._top_excerpt(
            vector_store, embedder, sample_code_chunks, "sum numbers add calculation"
        )
        assert "sum" in excerpt or "calculate" in excerpt
# === Performance Tests ===
class TestVectorSearchPerformance:
    """Performance tests for vector search."""

    @staticmethod
    def _mean_ms(operation, iterations):
        """Run *operation* *iterations* times and return mean wall time in ms."""
        started = time.perf_counter()
        for _ in range(iterations):
            operation()
        return (time.perf_counter() - started) / iterations * 1000

    def test_embedding_performance(self, embedder):
        """A single embedding should average well under 100ms."""
        text = "def calculate_sum(a, b): return a + b"
        embedder.embed_single(text)  # warm up model/caches first
        avg_ms = self._mean_ms(lambda: embedder.embed_single(text), 10)
        assert avg_ms < 100, f"Single embedding should be <100ms, got {avg_ms:.2f}ms"

    def test_batch_embedding_performance(self, embedder):
        """Batch embedding should amortize to under 20ms per text."""
        texts = [f"def function_{i}(): pass" for i in range(50)]
        embedder.embed(texts[:5])  # warm up
        started = time.perf_counter()
        embedder.embed(texts)
        per_text_ms = (time.perf_counter() - started) * 1000 / len(texts)
        assert per_text_ms < 20, f"Per-text embedding should be <20ms, got {per_text_ms:.2f}ms"

    def test_search_performance_small(self, vector_store, embedder):
        """Search over 100 chunks should average under 50ms."""
        for i in range(100):
            chunk = SemanticChunk(
                content=f"def function_{i}(): return {i}",
                metadata={"index": i},
            )
            chunk.embedding = embedder.embed_single(chunk.content)
            vector_store.add_chunk(chunk, f"/test/file_{i}.py")
        query_vec = embedder.embed_single("function return value")
        vector_store.search_similar(query_vec)  # warm up / build cache
        avg_ms = self._mean_ms(lambda: vector_store.search_similar(query_vec), 10)
        assert avg_ms < 50, f"Search with 100 chunks should be <50ms, got {avg_ms:.2f}ms"

    def test_search_performance_medium(self, vector_store, embedder):
        """Search over 500 chunks should average under 100ms."""
        bulk = []
        for i in range(500):
            chunk = SemanticChunk(
                content=f"def function_{i}(x): return x * {i}",
                metadata={"index": i},
            )
            chunk.embedding = embedder.embed_single(chunk.content)
            bulk.append(chunk)
        vector_store.add_chunks(bulk, "/test/bulk.py")
        query_vec = embedder.embed_single("multiply value")
        vector_store.search_similar(query_vec)  # warm up / build cache
        avg_ms = self._mean_ms(lambda: vector_store.search_similar(query_vec), 5)
        assert avg_ms < 100, f"Search with 500 chunks should be <100ms, got {avg_ms:.2f}ms"
# === Thread Safety Tests ===
class TestThreadSafety:
    """Tests for thread safety."""

    def test_concurrent_searches(self, vector_store, embedder, sample_code_chunks):
        """Parallel searches should all complete without raising."""
        for spec in sample_code_chunks:
            chunk = SemanticChunk(content=spec["content"], metadata=spec["metadata"])
            chunk.embedding = embedder.embed_single(chunk.content)
            vector_store.add_chunk(chunk, "/test/file.py")

        hit_counts = []
        errors = []

        def run_search(query):
            # Collect results/errors instead of raising inside the thread.
            try:
                vec = embedder.embed_single(query)
                hit_counts.append(len(vector_store.search_similar(vec, top_k=3)))
            except Exception as exc:
                errors.append(str(exc))

        queries = ["authentication", "database", "function", "class", "async"]
        workers = [threading.Thread(target=run_search, args=(q,)) for q in queries]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        assert not errors, f"Errors during concurrent search: {errors}"
        assert len(hit_counts) == len(queries)

    def test_concurrent_add_and_search(self, vector_store, embedder):
        """Interleaved adds and searches should not raise."""
        errors = []

        def run_add(idx):
            try:
                chunk = SemanticChunk(
                    content=f"def task_{idx}(): pass",
                    metadata={"idx": idx},
                )
                chunk.embedding = embedder.embed_single(chunk.content)
                vector_store.add_chunk(chunk, f"/test/task_{idx}.py")
            except Exception as exc:
                errors.append(f"Add error: {exc}")

        def run_search():
            try:
                vector_store.search_similar(embedder.embed_single("function task"))
            except Exception as exc:
                errors.append(f"Search error: {exc}")

        workers = []
        for i in range(10):
            workers.append(threading.Thread(target=run_add, args=(i,)))
            workers.append(threading.Thread(target=run_search))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        assert not errors, f"Errors during concurrent ops: {errors}"
# === Edge Cases ===
class TestEdgeCases:
    """Tests for edge cases."""

    def test_very_short_content(self, vector_store, embedder):
        """A one-character chunk should still be indexed and found."""
        chunk = SemanticChunk(content="x", metadata={})
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, "/test/short.py")
        hits = vector_store.search_similar(embedder.embed_single("x"))
        assert len(hits) == 1

    def test_special_characters_in_path(self, vector_store, embedder):
        """Paths with spaces and punctuation should round-trip intact."""
        unusual_path = "/test/path with spaces/file-name_v2.py"
        chunk = SemanticChunk(content="def test(): pass", metadata={})
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, unusual_path)
        hits = vector_store.search_similar(embedder.embed_single("test function"))
        assert len(hits) == 1
        assert hits[0].path == unusual_path

    def test_json_metadata_special_chars(self, vector_store, embedder):
        """Metadata containing quotes and backslashes should survive storage."""
        tricky_meta = {
            "description": 'Test "quoted" text with \'single\' quotes',
            "path": "C:\\Users\\test\\file.py",
            "tags": ["tag1", "tag2"],
        }
        chunk = SemanticChunk(content="def test(): pass", metadata=tricky_meta)
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, "/test/special.py")
        hits = vector_store.search_similar(embedder.embed_single("test"))
        assert len(hits) == 1
        assert hits[0].metadata["description"] == tricky_meta["description"]

    def test_search_zero_top_k(self, vector_store, embedder):
        """Requesting zero results should return an empty list."""
        chunk = SemanticChunk(content="def test(): pass", metadata={})
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, "/test/file.py")
        assert vector_store.search_similar(embedder.embed_single("test"), top_k=0) == []

    def test_search_very_high_min_score(self, vector_store, embedder):
        """An extreme min_score should filter out unrelated matches entirely."""
        chunk = SemanticChunk(content="def hello(): print('world')", metadata={})
        chunk.embedding = embedder.embed_single(chunk.content)
        vector_store.add_chunk(chunk, "/test/hello.py")
        # The query is unrelated to the stored chunk, so a 0.99 threshold
        # should exclude everything.
        hits = vector_store.search_similar(
            embedder.embed_single("database connection"), min_score=0.99
        )
        assert len(hits) == 0
# === Availability Check Tests ===
class TestAvailabilityCheck:
    """Tests for semantic availability checking."""

    def test_check_semantic_available(self):
        """check_semantic_available() should report availability with no error."""
        is_available, err = check_semantic_available()
        assert is_available is True
        assert err is None

    def test_semantic_available_flag(self):
        """SEMANTIC_AVAILABLE flag is True when dependencies are installed."""
        assert SEMANTIC_AVAILABLE is True