mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-29 20:11:04 +08:00
Add comprehensive tests for vector/semantic search functionality
- Implement full coverage tests for Embedder model loading and embedding generation - Add CRUD operations and caching tests for VectorStore - Include cosine similarity computation tests - Validate semantic search accuracy and relevance through various queries - Establish performance benchmarks for embedding and search operations - Ensure edge cases and error handling are covered - Test thread safety and concurrent access scenarios - Verify availability of semantic search dependencies
This commit is contained in:
831
codex-lens/tests/test_llm_enhancer.py
Normal file
831
codex-lens/tests/test_llm_enhancer.py
Normal file
@@ -0,0 +1,831 @@
|
||||
"""Tests for LLM-based semantic enhancement functionality.
|
||||
|
||||
Tests cover:
|
||||
- LLMConfig and data classes
|
||||
- LLMEnhancer initialization and configuration
|
||||
- Prompt building and JSON parsing
|
||||
- Batch processing logic
|
||||
- CCW CLI invocation (mocked)
|
||||
- EnhancedSemanticIndexer integration
|
||||
- Error handling and fallback behavior
|
||||
"""
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any
|
||||
from unittest.mock import MagicMock, patch, PropertyMock
|
||||
|
||||
import pytest
|
||||
|
||||
from codexlens.entities import SemanticChunk, Symbol
|
||||
from codexlens.semantic.llm_enhancer import (
|
||||
SemanticMetadata,
|
||||
FileData,
|
||||
LLMConfig,
|
||||
LLMEnhancer,
|
||||
EnhancedSemanticIndexer,
|
||||
create_enhancer,
|
||||
create_enhanced_indexer,
|
||||
)
|
||||
|
||||
|
||||
# === Data Class Tests ===
|
||||
|
||||
class TestSemanticMetadata:
    """Unit tests for the SemanticMetadata dataclass."""

    def test_basic_creation(self):
        """Required fields are stored; optional provenance fields default to None."""
        md = SemanticMetadata(
            summary="Authentication handler",
            keywords=["auth", "login", "jwt"],
            purpose="auth",
        )

        assert md.summary == "Authentication handler"
        assert md.keywords == ["auth", "login", "jwt"]
        assert md.purpose == "auth"
        # Provenance fields were not supplied, so each must default to None.
        for optional_value in (md.file_path, md.symbol_name, md.llm_tool):
            assert optional_value is None

    def test_full_creation(self):
        """All fields, including optional provenance, round-trip unchanged."""
        md = SemanticMetadata(
            summary="User login function",
            keywords=["login", "user"],
            purpose="auth",
            file_path="/test/auth.py",
            symbol_name="login",
            llm_tool="gemini",
        )

        assert md.file_path == "/test/auth.py"
        assert md.symbol_name == "login"
        assert md.llm_tool == "gemini"

    def test_empty_keywords(self):
        """An empty keyword list is preserved as-is."""
        md = SemanticMetadata(summary="Empty", keywords=[], purpose="")

        assert md.keywords == []
|
||||
|
||||
class TestFileData:
    """Unit tests for the FileData dataclass."""

    def test_basic_creation(self):
        """Required fields are stored and symbols default to an empty list."""
        file_data = FileData(
            path="/test/file.py",
            content="def hello(): pass",
            language="python",
        )

        assert file_data.path == "/test/file.py"
        assert file_data.content == "def hello(): pass"
        assert file_data.language == "python"
        assert file_data.symbols == []

    def test_with_symbols(self):
        """Explicitly supplied symbols are kept in order."""
        parsed_symbols = [
            Symbol(name="hello", kind="function", range=(1, 1)),
            Symbol(name="MyClass", kind="class", range=(3, 10)),
        ]
        file_data = FileData(
            path="/test/file.py",
            content="code",
            language="python",
            symbols=parsed_symbols,
        )

        assert len(file_data.symbols) == 2
        assert file_data.symbols[0].name == "hello"
|
||||
|
||||
class TestLLMConfig:
    """Unit tests for LLMConfig defaults, overrides and environment handling."""

    def test_default_values(self):
        """A bare LLMConfig carries the documented defaults."""
        config = LLMConfig()

        assert config.tool == "gemini"
        assert config.fallback_tool == "qwen"
        assert config.timeout_ms == 300000
        assert config.batch_size == 5
        assert config.max_content_chars == 8000
        assert config.enabled is True

    def test_custom_values(self):
        """Constructor arguments override every default."""
        overrides = dict(
            tool="qwen",
            fallback_tool="gemini",
            timeout_ms=600000,
            batch_size=10,
            max_content_chars=4000,
            enabled=False,
        )
        config = LLMConfig(**overrides)

        assert config.tool == "qwen"
        assert config.fallback_tool == "gemini"
        assert config.timeout_ms == 600000
        assert config.batch_size == 10
        assert config.max_content_chars == 4000
        assert config.enabled is False

    @patch.dict("os.environ", {"CCW_CLI_SECONDARY_TOOL": "codex", "CCW_CLI_FALLBACK_TOOL": "gemini"})
    def test_env_override(self):
        """Environment variables take precedence over built-in defaults."""
        config = LLMConfig()

        assert config.tool == "codex"
        assert config.fallback_tool == "gemini"
|
||||
|
||||
# === LLMEnhancer Tests ===
|
||||
|
||||
class TestLLMEnhancerInit:
    """Construction behaviour of LLMEnhancer."""

    def test_default_init(self):
        """Without arguments the enhancer gets a default config; availability is unprobed."""
        enhancer = LLMEnhancer()

        assert enhancer.config is not None
        assert enhancer.config.tool == "gemini"
        # Availability is probed lazily, so nothing is cached yet.
        assert enhancer._ccw_available is None

    def test_custom_config(self):
        """A caller-supplied LLMConfig is used verbatim."""
        enhancer = LLMEnhancer(LLMConfig(tool="qwen", batch_size=3))

        assert enhancer.config.tool == "qwen"
        assert enhancer.config.batch_size == 3
|
||||
|
||||
class TestLLMEnhancerAvailability:
    """Behaviour of LLMEnhancer.check_available (ccw binary discovery)."""

    @patch("shutil.which")
    def test_ccw_available(self, mock_which):
        """When `ccw` is on PATH the check returns True and caches the answer."""
        mock_which.return_value = "/usr/bin/ccw"
        enhancer = LLMEnhancer()

        assert enhancer.check_available() is True
        assert enhancer._ccw_available is True
        mock_which.assert_called_with("ccw")

    @patch("shutil.which")
    def test_ccw_not_available(self, mock_which):
        """When `ccw` is missing the check returns False and caches the answer."""
        mock_which.return_value = None
        enhancer = LLMEnhancer()

        assert enhancer.check_available() is False
        assert enhancer._ccw_available is False

    @patch("shutil.which")
    def test_ccw_availability_cached(self, mock_which):
        """Repeated checks reuse the cached answer instead of re-probing PATH."""
        mock_which.return_value = "/usr/bin/ccw"
        enhancer = LLMEnhancer()

        for _ in range(2):
            enhancer.check_available()

        # shutil.which must have been hit only on the first call.
        mock_which.assert_called_once()
|
||||
|
||||
class TestPromptBuilding:
    """Behaviour of LLMEnhancer._build_batch_prompt."""

    def test_build_single_file_prompt(self):
        """A single file's header, fenced code and instructions all appear."""
        enhancer = LLMEnhancer()
        batch = [
            FileData(
                path="/test/auth.py",
                content="def login(): pass",
                language="python",
            )
        ]

        prompt = enhancer._build_batch_prompt(batch)

        for fragment in (
            "[FILE: /test/auth.py]",
            "```python",
            "def login(): pass",
            "PURPOSE:",
            "JSON format output",
        ):
            assert fragment in prompt

    def test_build_multiple_files_prompt(self):
        """Every file contributes its own header and language-tagged fence."""
        enhancer = LLMEnhancer()
        batch = [
            FileData(path="/test/a.py", content="def a(): pass", language="python"),
            FileData(path="/test/b.js", content="function b() {}", language="javascript"),
        ]

        prompt = enhancer._build_batch_prompt(batch)

        for fragment in (
            "[FILE: /test/a.py]",
            "[FILE: /test/b.js]",
            "```python",
            "```javascript",
        ):
            assert fragment in prompt

    def test_build_prompt_truncates_long_content(self):
        """Content beyond max_content_chars is cut and marked as truncated."""
        enhancer = LLMEnhancer(LLMConfig(max_content_chars=100))
        oversized = "x" * 200

        prompt = enhancer._build_batch_prompt(
            [FileData(path="/test/long.py", content=oversized, language="python")]
        )

        assert "... [truncated]" in prompt
        assert oversized not in prompt
|
||||
|
||||
class TestJSONParsing:
    """Behaviour of LLMEnhancer._parse_response."""

    def test_parse_valid_response(self):
        """A well-formed JSON payload becomes SemanticMetadata entries."""
        enhancer = LLMEnhancer()
        payload = {
            "files": {
                "/test/auth.py": {
                    "summary": "Authentication handler",
                    "keywords": ["auth", "login"],
                    "purpose": "auth",
                }
            }
        }

        parsed = enhancer._parse_response(json.dumps(payload), "gemini")

        assert "/test/auth.py" in parsed
        entry = parsed["/test/auth.py"]
        assert entry.summary == "Authentication handler"
        assert entry.keywords == ["auth", "login"]
        assert entry.purpose == "auth"
        # The tool that produced the response is recorded on each entry.
        assert entry.llm_tool == "gemini"

    def test_parse_response_with_markdown(self):
        """A ```json fenced response is unwrapped before parsing."""
        enhancer = LLMEnhancer()
        response = '''```json
{
  "files": {
    "/test/file.py": {
      "summary": "Test file",
      "keywords": ["test"],
      "purpose": "test"
    }
  }
}
```'''

        parsed = enhancer._parse_response(response, "qwen")

        assert "/test/file.py" in parsed
        assert parsed["/test/file.py"].summary == "Test file"

    def test_parse_response_multiple_files(self):
        """Each file entry in the payload becomes its own metadata record."""
        enhancer = LLMEnhancer()
        payload = {
            "files": {
                "/test/a.py": {"summary": "File A", "keywords": ["a"], "purpose": "util"},
                "/test/b.py": {"summary": "File B", "keywords": ["b"], "purpose": "api"},
            }
        }

        parsed = enhancer._parse_response(json.dumps(payload), "gemini")

        assert len(parsed) == 2
        assert parsed["/test/a.py"].summary == "File A"
        assert parsed["/test/b.py"].summary == "File B"

    def test_parse_invalid_json(self):
        """Garbage input degrades to an empty mapping instead of raising."""
        assert LLMEnhancer()._parse_response("not valid json at all", "gemini") == {}

    def test_parse_empty_response(self):
        """An empty response degrades to an empty mapping."""
        assert LLMEnhancer()._parse_response("", "gemini") == {}
|
||||
|
||||
class TestJSONExtraction:
    """Behaviour of LLMEnhancer._extract_json on mixed LLM output."""

    def test_extract_json_from_plain(self):
        """Bare JSON comes back unchanged."""
        assert LLMEnhancer()._extract_json('{"key": "value"}') == '{"key": "value"}'

    def test_extract_json_from_markdown(self):
        """JSON inside a ```json fence is unwrapped."""
        fenced = '''```json
{"key": "value"}
```'''

        assert LLMEnhancer()._extract_json(fenced) == '{"key": "value"}'

    def test_extract_json_with_surrounding_text(self):
        """Prose before and after the JSON object is stripped away."""
        noisy = 'Here is the result: {"key": "value"} That is all.'

        assert LLMEnhancer()._extract_json(noisy) == '{"key": "value"}'

    def test_extract_nested_json(self):
        """Nested objects are extracted whole, not cut at the first closing brace."""
        extracted = LLMEnhancer()._extract_json('{"outer": {"inner": "value"}}')

        assert '"outer"' in extracted
        assert '"inner"' in extracted

    def test_extract_no_json(self):
        """Text containing no JSON object yields None."""
        assert LLMEnhancer()._extract_json("No JSON here at all") is None

    def test_extract_malformed_json(self):
        """An unbalanced object yields None rather than a partial string."""
        unbalanced = '{"key": "value"'  # Missing closing brace

        assert LLMEnhancer()._extract_json(unbalanced) is None
|
||||
|
||||
class TestEnhanceFiles:
    """Behaviour of LLMEnhancer.enhance_files."""

    @patch.object(LLMEnhancer, "check_available", return_value=False)
    def test_enhance_files_ccw_not_available(self, mock_check):
        """Without the ccw binary, enhancement is skipped entirely."""
        batch = [FileData(path="/test/a.py", content="code", language="python")]

        assert LLMEnhancer().enhance_files(batch) == {}

    def test_enhance_files_disabled(self):
        """With enabled=False, enhancement is skipped entirely."""
        enhancer = LLMEnhancer(LLMConfig(enabled=False))
        batch = [FileData(path="/test/a.py", content="code", language="python")]

        assert enhancer.enhance_files(batch) == {}

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    def test_enhance_files_empty_list(self, mock_check):
        """An empty batch short-circuits to an empty result."""
        assert LLMEnhancer().enhance_files([]) == {}

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_invoke_ccw_cli")
    def test_enhance_files_success(self, mock_invoke, mock_check):
        """A successful CLI round-trip yields parsed metadata per file."""
        cli_stdout = json.dumps({
            "files": {
                "/test/auth.py": {
                    "summary": "Auth module",
                    "keywords": ["auth"],
                    "purpose": "auth",
                }
            }
        })
        mock_invoke.return_value = {
            "success": True,
            "stdout": cli_stdout,
            "stderr": "",
            "exit_code": 0,
        }
        batch = [FileData(path="/test/auth.py", content="def login(): pass", language="python")]

        result = LLMEnhancer().enhance_files(batch)

        assert "/test/auth.py" in result
        assert result["/test/auth.py"].summary == "Auth module"

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_invoke_ccw_cli")
    def test_enhance_files_fallback(self, mock_invoke, mock_check):
        """If the primary tool fails, the fallback tool's result is used."""
        fallback_stdout = json.dumps({
            "files": {
                "/test/file.py": {
                    "summary": "Fallback result",
                    "keywords": ["fallback"],
                    "purpose": "util",
                }
            }
        })
        mock_invoke.side_effect = [
            # Primary invocation fails outright ...
            {"success": False, "stdout": "", "stderr": "error", "exit_code": 1},
            # ... and the fallback tool succeeds.
            {"success": True, "stdout": fallback_stdout, "stderr": "", "exit_code": 0},
        ]
        batch = [FileData(path="/test/file.py", content="code", language="python")]

        result = LLMEnhancer().enhance_files(batch)

        assert "/test/file.py" in result
        assert result["/test/file.py"].summary == "Fallback result"
        assert mock_invoke.call_count == 2
|
||||
|
||||
class TestEnhanceFile:
    """Behaviour of the single-file enhance_file convenience wrapper."""

    @patch.object(LLMEnhancer, "enhance_files")
    def test_enhance_file_success(self, mock_enhance_files):
        """The wrapper returns the metadata produced for that one file."""
        mock_enhance_files.return_value = {
            "/test/auth.py": SemanticMetadata(
                summary="Auth module",
                keywords=["auth", "login"],
                purpose="auth",
                file_path="/test/auth.py",
                llm_tool="gemini",
            )
        }

        metadata = LLMEnhancer().enhance_file("/test/auth.py", "def login(): pass", "python")

        assert metadata.summary == "Auth module"
        assert metadata.keywords == ["auth", "login"]

    @patch.object(LLMEnhancer, "enhance_files")
    def test_enhance_file_fallback_on_failure(self, mock_enhance_files):
        """When enhancement yields nothing, a language-derived default is returned."""
        mock_enhance_files.return_value = {}  # Simulate a failed enhancement

        metadata = LLMEnhancer().enhance_file("/test/file.py", "code", "python")

        assert "python" in metadata.summary.lower()
        assert "python" in metadata.keywords
        assert metadata.purpose == "unknown"
|
||||
|
||||
class TestBatchProcessing:
    """Batching behaviour of enhance_files."""

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_process_batch")
    def test_batch_processing(self, mock_process, mock_check):
        """Files are split into ceil(len / batch_size) batches."""
        mock_process.return_value = {}
        enhancer = LLMEnhancer(LLMConfig(batch_size=2))
        batch = [
            FileData(path=f"/test/file{index}.py", content="code", language="python")
            for index in range(5)
        ]

        enhancer.enhance_files(batch)

        # ceil(5 / 2) == 3 batch invocations expected.
        assert mock_process.call_count == 3

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_process_batch")
    def test_batch_continues_on_error(self, mock_process, mock_check):
        """A failing batch is swallowed; later batches still contribute results."""
        mock_process.side_effect = [
            Exception("Batch 1 failed"),
            {"/test/file2.py": SemanticMetadata(summary="OK", keywords=[], purpose="")},
        ]
        enhancer = LLMEnhancer(LLMConfig(batch_size=1))
        batch = [
            FileData(path="/test/file1.py", content="code", language="python"),
            FileData(path="/test/file2.py", content="code", language="python"),
        ]

        result = enhancer.enhance_files(batch)

        # The second batch's result survives the first batch's failure.
        assert "/test/file2.py" in result
|
||||
|
||||
# === CCW CLI Invocation Tests ===
|
||||
|
||||
class TestCCWInvocation:
    """Subprocess-level behaviour of _invoke_ccw_cli (fully mocked)."""

    @patch("subprocess.run")
    @patch("shutil.which", return_value="/usr/bin/ccw")
    def test_invoke_success(self, mock_which, mock_run):
        """A zero exit code is reported as success."""
        mock_run.return_value = MagicMock(
            returncode=0,
            stdout='{"files": {}}',
            stderr="",
        )

        outcome = LLMEnhancer()._invoke_ccw_cli("test prompt", tool="gemini")

        assert outcome["success"] is True
        assert outcome["exit_code"] == 0

    @patch("subprocess.run")
    @patch("shutil.which", return_value="/usr/bin/ccw")
    def test_invoke_failure(self, mock_which, mock_run):
        """A non-zero exit code is reported as failure."""
        mock_run.return_value = MagicMock(
            returncode=1,
            stdout="",
            stderr="Error occurred",
        )

        outcome = LLMEnhancer()._invoke_ccw_cli("test prompt", tool="gemini")

        assert outcome["success"] is False
        assert outcome["exit_code"] == 1

    @patch("subprocess.run")
    @patch("shutil.which", return_value="/usr/bin/ccw")
    def test_invoke_timeout(self, mock_which, mock_run):
        """A subprocess timeout becomes a failure with 'timeout' in stderr."""
        import subprocess

        mock_run.side_effect = subprocess.TimeoutExpired(cmd="ccw", timeout=300)

        outcome = LLMEnhancer()._invoke_ccw_cli("test prompt", tool="gemini")

        assert outcome["success"] is False
        assert "timeout" in outcome["stderr"]

    @patch("subprocess.run")
    @patch("shutil.which", return_value=None)
    def test_invoke_ccw_not_found(self, mock_which, mock_run):
        """A missing binary becomes a failure with 'not found' in stderr."""
        mock_run.side_effect = FileNotFoundError()

        outcome = LLMEnhancer()._invoke_ccw_cli("test prompt", tool="gemini")

        assert outcome["success"] is False
        assert "not found" in outcome["stderr"]
|
||||
|
||||
# === EnhancedSemanticIndexer Tests ===
|
||||
|
||||
class TestEnhancedSemanticIndexer:
    """Integration behaviour of EnhancedSemanticIndexer with mocked collaborators."""

    @pytest.fixture
    def mock_enhancer(self):
        """LLMEnhancer double that always returns metadata for /test/auth.py."""
        fake = MagicMock(spec=LLMEnhancer)
        fake.enhance_files.return_value = {
            "/test/auth.py": SemanticMetadata(
                summary="Authentication handler",
                keywords=["auth", "login", "jwt"],
                purpose="auth",
                file_path="/test/auth.py",
                llm_tool="gemini",
            )
        }
        return fake

    @pytest.fixture
    def mock_embedder(self):
        """Embedder double producing fixed 384-dimensional vectors."""
        fake = MagicMock()
        fake.embed.return_value = [[0.1] * 384]
        fake.embed_single.return_value = [0.1] * 384
        return fake

    @pytest.fixture
    def mock_vector_store(self):
        """Vector-store double whose add_chunk always reports one row."""
        fake = MagicMock()
        fake.add_chunk.return_value = 1
        return fake

    def test_index_files_empty_list(self, mock_enhancer, mock_embedder, mock_vector_store):
        """An empty batch indexes nothing and never calls the enhancer."""
        indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store)

        assert indexer.index_files([]) == 0
        mock_enhancer.enhance_files.assert_not_called()

    def test_index_files_with_llm_enhancement(self, mock_enhancer, mock_embedder, mock_vector_store):
        """Enhanced files are embedded and stored exactly once each."""
        indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store)
        batch = [FileData(path="/test/auth.py", content="def login(): pass", language="python")]

        assert indexer.index_files(batch) == 1
        mock_enhancer.enhance_files.assert_called_once()
        mock_embedder.embed.assert_called_once()
        mock_vector_store.add_chunk.assert_called_once()

    def test_index_files_fallback_to_raw_code(self, mock_embedder, mock_vector_store):
        """When the LLM yields nothing, raw code is embedded via embed_single."""
        failing_enhancer = MagicMock(spec=LLMEnhancer)
        failing_enhancer.enhance_files.return_value = {}  # No enhancement produced

        indexer = EnhancedSemanticIndexer(failing_enhancer, mock_embedder, mock_vector_store)
        batch = [FileData(path="/test/file.py", content="code", language="python")]

        assert indexer.index_files(batch) == 1
        mock_embedder.embed_single.assert_called()

    def test_create_embeddable_text(self, mock_enhancer, mock_embedder, mock_vector_store):
        """Embeddable text combines summary, keywords and the file name."""
        indexer = EnhancedSemanticIndexer(mock_enhancer, mock_embedder, mock_vector_store)
        metadata = SemanticMetadata(
            summary="Handles user authentication",
            keywords=["auth", "login", "user"],
            purpose="auth",
        )
        file_data = FileData(path="/test/auth.py", content="code", language="python")

        text = indexer._create_embeddable_text(metadata, file_data)

        assert "Handles user authentication" in text
        assert "auth" in text.lower()
        assert "Keywords:" in text
        assert "auth.py" in text
|
||||
|
||||
# === Factory Function Tests ===
|
||||
|
||||
class TestFactoryFunctions:
    """Tests for the create_enhancer / create_enhanced_indexer factories."""

    def test_create_enhancer_default(self):
        """create_enhancer() with no arguments uses the default tool, enabled."""
        enhancer = create_enhancer()

        assert enhancer.config.tool == "gemini"
        assert enhancer.config.enabled is True

    def test_create_enhancer_custom(self):
        """create_enhancer() forwards custom parameters into the LLMConfig."""
        enhancer = create_enhancer(
            tool="qwen",
            timeout_ms=600000,
            batch_size=10,
            enabled=False,
        )

        assert enhancer.config.tool == "qwen"
        assert enhancer.config.timeout_ms == 600000
        assert enhancer.config.batch_size == 10
        assert enhancer.config.enabled is False

    # BUG FIX: the previous @pytest.mark.skipif decorator wrapped
    # pytest.importorskip(), which executes at module-import time and would
    # skip the ENTIRE module during collection when the package is missing;
    # and when the import succeeds it returns a module object, so the
    # `not <module>` condition was always False and the skipif never fired.
    # The runtime guard inside the test body is the correct and sufficient
    # skip mechanism, so the decorator has been removed.
    def test_create_enhanced_indexer(self, tmp_path):
        """create_enhanced_indexer wires enhancer, embedder and vector store."""
        try:
            from codexlens.semantic import SEMANTIC_AVAILABLE
            if not SEMANTIC_AVAILABLE:
                pytest.skip("Semantic dependencies not installed")

            db_path = tmp_path / "semantic.db"
            indexer = create_enhanced_indexer(db_path, llm_tool="gemini", llm_enabled=False)

            assert indexer.enhancer is not None
            assert indexer.embedder is not None
            assert indexer.vector_store is not None
        except ImportError:
            pytest.skip("Semantic dependencies not installed")
|
||||
|
||||
# === Edge Cases ===
|
||||
|
||||
class TestEdgeCases:
    """Edge cases: odd characters, unicode, oversized input, sparse responses."""

    def test_semantic_metadata_with_special_chars(self):
        """Quotes and apostrophes survive in metadata fields."""
        md = SemanticMetadata(
            summary='Test "quoted" and \'single\' quotes',
            keywords=["special", "chars", "test's"],
            purpose="test",
        )

        assert '"quoted"' in md.summary
        assert "test's" in md.keywords

    def test_file_data_with_unicode(self):
        """Non-ASCII paths and content are stored verbatim."""
        file_data = FileData(
            path="/test/中文.py",
            content="def 你好(): return '世界'",
            language="python",
        )

        assert "中文" in file_data.path
        assert "你好" in file_data.content

    @patch.object(LLMEnhancer, "check_available", return_value=True)
    @patch.object(LLMEnhancer, "_invoke_ccw_cli")
    def test_enhance_with_very_long_content(self, mock_invoke, mock_check):
        """Oversized content is truncated for the prompt and does not crash."""
        mock_invoke.return_value = {
            "success": True,
            "stdout": json.dumps({"files": {}}),
            "stderr": "",
            "exit_code": 0,
        }
        enhancer = LLMEnhancer(LLMConfig(max_content_chars=100))
        batch = [FileData(path="/test/long.py", content="x" * 10000, language="python")]

        enhancer.enhance_files(batch)

        mock_invoke.assert_called_once()

    def test_parse_response_with_missing_fields(self):
        """Missing keywords/purpose default to empty values."""
        payload = json.dumps({
            "files": {
                "/test/file.py": {
                    "summary": "Only summary provided",
                    # keywords and purpose intentionally absent
                }
            }
        })

        parsed = LLMEnhancer()._parse_response(payload, "gemini")

        assert "/test/file.py" in parsed
        assert parsed["/test/file.py"].summary == "Only summary provided"
        assert parsed["/test/file.py"].keywords == []
        assert parsed["/test/file.py"].purpose == ""
||||
1190
codex-lens/tests/test_search_full_coverage.py
Normal file
1190
codex-lens/tests/test_search_full_coverage.py
Normal file
File diff suppressed because it is too large
Load Diff
747
codex-lens/tests/test_vector_search_full.py
Normal file
747
codex-lens/tests/test_vector_search_full.py
Normal file
@@ -0,0 +1,747 @@
|
||||
"""Full coverage tests for vector/semantic search functionality.
|
||||
|
||||
Tests cover:
|
||||
- Embedder model loading and embedding generation
|
||||
- VectorStore CRUD operations and caching
|
||||
- Cosine similarity computation
|
||||
- Semantic search accuracy and relevance
|
||||
- Performance benchmarks
|
||||
- Edge cases and error handling
|
||||
- Thread safety and concurrent access
|
||||
"""
|
||||
|
||||
import json
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
from codexlens.entities import SemanticChunk, Symbol, SearchResult
|
||||
from codexlens.semantic import SEMANTIC_AVAILABLE, check_semantic_available
|
||||
|
||||
# Skip all tests if semantic dependencies not available
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not SEMANTIC_AVAILABLE,
|
||||
reason="Semantic search dependencies not installed (pip install codexlens[semantic])"
|
||||
)
|
||||
|
||||
|
||||
# === Fixtures ===
|
||||
|
||||
@pytest.fixture
def temp_db(tmp_path):
    """Filesystem path for a throwaway semantic-search database."""
    return tmp_path.joinpath("test_semantic.db")
|
||||
|
||||
@pytest.fixture
def embedder():
    """Real Embedder instance; the underlying model loads lazily on first use."""
    from codexlens.semantic.embedder import Embedder

    return Embedder()
|
||||
|
||||
@pytest.fixture
def vector_store(temp_db):
    """Real VectorStore backed by the temporary database path."""
    from codexlens.semantic.vector_store import VectorStore

    return VectorStore(temp_db)
|
||||
|
||||
@pytest.fixture
def sample_code_chunks():
    """Representative code snippets with symbol metadata for search tests."""

    def chunk(content, name, kind, end_line, language):
        # All sample snippets start at line 1; only the end line varies.
        return {
            "content": content,
            "metadata": {
                "symbol_name": name,
                "symbol_kind": kind,
                "start_line": 1,
                "end_line": end_line,
                "language": language,
            },
        }

    return [
        chunk(
            "def authenticate(username, password): return check_credentials(username, password)",
            "authenticate", "function", 1, "python",
        ),
        chunk(
            "class DatabaseConnection:\n def connect(self, host, port): pass\n def execute(self, query): pass",
            "DatabaseConnection", "class", 3, "python",
        ),
        chunk(
            "async function fetchUserData(userId) { return await api.get('/users/' + userId); }",
            "fetchUserData", "function", 1, "javascript",
        ),
        chunk(
            "def calculate_sum(numbers): return sum(numbers)",
            "calculate_sum", "function", 1, "python",
        ),
        chunk(
            "class UserProfile:\n def __init__(self, name, email):\n self.name = name\n self.email = email",
            "UserProfile", "class", 4, "python",
        ),
    ]
|
||||
|
||||
# === Embedder Tests ===
|
||||
|
||||
class TestEmbedder:
    """Behavioral tests for the Embedder wrapper."""

    def test_embedder_initialization(self, embedder):
        """A fresh embedder exposes the expected model config, not yet loaded."""
        assert embedder.model_name == "BAAI/bge-small-en-v1.5"
        assert embedder.EMBEDDING_DIM == 384
        assert embedder._model is None  # model is loaded lazily

    def test_embed_single_returns_correct_dimension(self, embedder):
        """embed_single yields a 384-element list of floats."""
        vector = embedder.embed_single("def hello(): print('world')")

        assert isinstance(vector, list)
        assert len(vector) == 384
        assert all(isinstance(component, float) for component in vector)

    def test_embed_batch_returns_correct_count(self, embedder):
        """embed returns exactly one vector per input text."""
        snippets = [
            "def foo(): pass",
            "def bar(): pass",
            "def baz(): pass",
        ]
        vectors = embedder.embed(snippets)

        assert len(vectors) == len(snippets)
        assert all(len(vec) == 384 for vec in vectors)

    def test_embed_empty_string(self, embedder):
        """An empty string still embeds to the full dimension."""
        assert len(embedder.embed_single("")) == 384

    def test_embed_unicode_text(self, embedder):
        """Non-ASCII source code embeds without error."""
        assert len(embedder.embed_single("def 你好(): return '世界'")) == 384

    def test_embed_long_text(self, embedder):
        """Very long inputs still produce a fixed-size embedding."""
        assert len(embedder.embed_single("def process(): pass\n" * 100)) == 384

    def test_embed_special_characters(self, embedder):
        """Punctuation-heavy source embeds to the full dimension."""
        vector = embedder.embed_single(
            "def test(): return {'key': 'value', '@decorator': True}"
        )
        assert len(vector) == 384

    def test_lazy_model_loading(self, embedder):
        """The underlying model is instantiated only on first use."""
        assert embedder._model is None
        embedder.embed_single("test")
        assert embedder._model is not None

    def test_model_reuse(self, embedder):
        """Subsequent calls reuse the already-loaded model instance."""
        embedder.embed_single("test1")
        loaded = embedder._model
        embedder.embed_single("test2")
        assert embedder._model is loaded  # no reload between calls


class TestEmbeddingSimilarity:
    """Sanity checks on cosine similarity between embeddings."""

    def test_identical_text_similarity(self, embedder):
        """Embedding the same text twice gives near-perfect similarity."""
        from codexlens.semantic.vector_store import _cosine_similarity

        snippet = "def calculate_sum(a, b): return a + b"
        first = embedder.embed_single(snippet)
        second = embedder.embed_single(snippet)

        assert _cosine_similarity(first, second) > 0.99, "Identical text should have ~1.0 similarity"

    def test_similar_code_high_similarity(self, embedder):
        """Semantically equivalent functions score high similarity."""
        from codexlens.semantic.vector_store import _cosine_similarity

        emb_add = embedder.embed_single("def add(a, b): return a + b")
        emb_sum = embedder.embed_single("def sum_numbers(x, y): return x + y")

        assert _cosine_similarity(emb_add, emb_sum) > 0.6, "Similar functions should have high similarity"

    def test_different_code_lower_similarity(self, embedder):
        """Unrelated code scores below closely related code."""
        from codexlens.semantic.vector_store import _cosine_similarity

        emb_add = embedder.embed_single("def add(a, b): return a + b")
        emb_sum = embedder.embed_single("def sum_numbers(x, y): return x + y")
        emb_auth = embedder.embed_single("class UserAuth: def login(self, user, pwd): pass")

        sim_related = _cosine_similarity(emb_add, emb_sum)
        sim_unrelated = _cosine_similarity(emb_add, emb_auth)

        assert sim_related > sim_unrelated, "Similar code should have higher similarity"

    def test_zero_vector_similarity(self):
        """A zero vector is defined to have zero similarity to anything."""
        from codexlens.semantic.vector_store import _cosine_similarity

        similarity = _cosine_similarity([0.0] * 384, [1.0] * 384)
        assert similarity == 0.0, "Zero vector should have 0 similarity"


# === VectorStore Tests ===


class TestVectorStoreCRUD:
    """CRUD-level tests for VectorStore."""

    def test_add_chunk(self, vector_store, embedder):
        """A single embedded chunk can be stored and counted."""
        piece = SemanticChunk(
            content="def test(): pass",
            metadata={"language": "python"},
        )
        piece.embedding = embedder.embed_single(piece.content)

        new_id = vector_store.add_chunk(piece, "/test/file.py")

        assert new_id > 0
        assert vector_store.count_chunks() == 1

    def test_add_chunk_without_embedding_raises(self, vector_store):
        """Storing a chunk with no embedding is rejected with ValueError."""
        bare = SemanticChunk(content="def test(): pass", metadata={})

        with pytest.raises(ValueError, match="must have embedding"):
            vector_store.add_chunk(bare, "/test/file.py")

    def test_add_chunks_batch(self, vector_store, embedder, sample_code_chunks):
        """Batch insertion stores every chunk and returns one id each."""
        batch = []
        for entry in sample_code_chunks:
            piece = SemanticChunk(content=entry["content"], metadata=entry["metadata"])
            piece.embedding = embedder.embed_single(piece.content)
            batch.append(piece)

        new_ids = vector_store.add_chunks(batch, "/test/multi.py")

        assert len(new_ids) == len(batch)
        assert vector_store.count_chunks() == len(batch)

    def test_add_empty_batch(self, vector_store):
        """An empty batch is a no-op that returns no ids."""
        assert vector_store.add_chunks([], "/test/empty.py") == []

    def test_delete_file_chunks(self, vector_store, embedder):
        """Deletion removes only the chunks belonging to the given file."""
        for body, path in (
            ("def a(): pass", "/test/file1.py"),
            ("def b(): pass", "/test/file2.py"),
        ):
            piece = SemanticChunk(content=body, metadata={})
            piece.embedding = embedder.embed_single(piece.content)
            vector_store.add_chunk(piece, path)

        assert vector_store.count_chunks() == 2

        removed = vector_store.delete_file_chunks("/test/file1.py")

        assert removed == 1
        assert vector_store.count_chunks() == 1

    def test_delete_nonexistent_file(self, vector_store):
        """Deleting an unknown path removes nothing and returns 0."""
        assert vector_store.delete_file_chunks("/nonexistent/file.py") == 0

    def test_count_chunks_empty(self, vector_store):
        """A brand-new store holds zero chunks."""
        assert vector_store.count_chunks() == 0


class TestVectorStoreSearch:
    """Search-path tests for VectorStore."""

    def _index_chunks(self, vector_store, embedder, chunk_data, path="/test/file.py"):
        """Embed and store every entry of chunk_data under path."""
        for entry in chunk_data:
            piece = SemanticChunk(content=entry["content"], metadata=entry["metadata"])
            piece.embedding = embedder.embed_single(piece.content)
            vector_store.add_chunk(piece, path)

    def test_search_similar_basic(self, vector_store, embedder, sample_code_chunks):
        """A natural-language query returns relevant SearchResult objects."""
        self._index_chunks(vector_store, embedder, sample_code_chunks)

        query_vec = embedder.embed_single("function to authenticate user login")
        hits = vector_store.search_similar(query_vec, top_k=3)

        assert len(hits) > 0
        assert all(isinstance(hit, SearchResult) for hit in hits)
        # Top result should be auth-related
        assert "authenticate" in hits[0].excerpt.lower() or "auth" in hits[0].path.lower()

    def test_search_respects_top_k(self, vector_store, embedder, sample_code_chunks):
        """top_k caps the number of returned hits."""
        self._index_chunks(vector_store, embedder, sample_code_chunks)

        query_vec = embedder.embed_single("code")

        assert len(vector_store.search_similar(query_vec, top_k=2)) <= 2
        assert len(vector_store.search_similar(query_vec, top_k=5)) <= 5

    def test_search_min_score_filtering(self, vector_store, embedder):
        """Raising min_score can only shrink the result set."""
        piece = SemanticChunk(
            content="def hello(): print('hello world')",
            metadata={},
        )
        piece.embedding = embedder.embed_single(piece.content)
        vector_store.add_chunk(piece, "/test/hello.py")

        query_vec = embedder.embed_single("database connection pool")

        unfiltered = vector_store.search_similar(query_vec, min_score=0.0)
        strict = vector_store.search_similar(query_vec, min_score=0.9)

        assert len(unfiltered) >= len(strict)

    def test_search_returns_sorted_by_score(self, vector_store, embedder, sample_code_chunks):
        """Hits come back in non-increasing score order."""
        self._index_chunks(vector_store, embedder, sample_code_chunks)

        hits = vector_store.search_similar(embedder.embed_single("function"), top_k=5)

        # Pairwise comparison; vacuously true for 0 or 1 hits.
        for earlier, later in zip(hits, hits[1:]):
            assert earlier.score >= later.score

    def test_search_includes_metadata(self, vector_store, embedder):
        """Symbol metadata round-trips through storage and search."""
        piece = SemanticChunk(
            content="def test_function(): pass",
            metadata={
                "symbol_name": "test_function",
                "symbol_kind": "function",
                "start_line": 10,
                "end_line": 15,
            },
        )
        piece.embedding = embedder.embed_single(piece.content)
        vector_store.add_chunk(piece, "/test/func.py")

        hits = vector_store.search_similar(embedder.embed_single("test function"), top_k=1)

        assert len(hits) == 1
        top = hits[0]
        assert top.symbol_name == "test_function"
        assert top.symbol_kind == "function"
        assert top.start_line == 10
        assert top.end_line == 15

    def test_search_empty_store_returns_empty(self, vector_store, embedder):
        """Searching an empty store yields no hits."""
        assert vector_store.search_similar(embedder.embed_single("anything")) == []

    def test_search_with_return_full_content_false(self, vector_store, embedder):
        """return_full_content=False drops content but keeps the excerpt."""
        piece = SemanticChunk(
            content="def long_function(): " + "pass\n" * 100,
            metadata={},
        )
        piece.embedding = embedder.embed_single(piece.content)
        vector_store.add_chunk(piece, "/test/long.py")

        hits = vector_store.search_similar(
            embedder.embed_single("function"), top_k=1, return_full_content=False
        )

        assert len(hits) == 1
        assert hits[0].content is None
        assert hits[0].excerpt is not None


class TestVectorStoreCache:
    """Tests for VectorStore's in-memory embedding-matrix cache."""

    def _store_one(self, vector_store, embedder, body, path):
        """Embed body and persist it under path."""
        piece = SemanticChunk(content=body, metadata={})
        piece.embedding = embedder.embed_single(piece.content)
        vector_store.add_chunk(piece, path)

    def test_cache_invalidation_on_add(self, vector_store, embedder):
        """Adding a chunk bumps the cache version and drops the matrix."""
        self._store_one(vector_store, embedder, "def a(): pass", "/test/a.py")

        # Populate the cache via a search
        vector_store.search_similar(embedder.embed_single("function"))
        version_before = vector_store._cache_version

        self._store_one(vector_store, embedder, "def b(): pass", "/test/b.py")

        assert vector_store._cache_version > version_before
        assert vector_store._embedding_matrix is None

    def test_cache_invalidation_on_delete(self, vector_store, embedder):
        """Deleting a file's chunks bumps the cache version."""
        self._store_one(vector_store, embedder, "def a(): pass", "/test/a.py")

        # Populate the cache via a search
        vector_store.search_similar(embedder.embed_single("function"))
        version_before = vector_store._cache_version

        vector_store.delete_file_chunks("/test/a.py")

        assert vector_store._cache_version > version_before

    def test_manual_cache_clear(self, vector_store, embedder):
        """clear_cache discards the cached embedding matrix."""
        self._store_one(vector_store, embedder, "def a(): pass", "/test/a.py")

        # Populate the cache via a search
        vector_store.search_similar(embedder.embed_single("function"))
        assert vector_store._embedding_matrix is not None

        vector_store.clear_cache()

        assert vector_store._embedding_matrix is None


# === Semantic Search Accuracy Tests ===


class TestSemanticSearchAccuracy:
    """Relevance checks: each query should surface its matching snippet."""

    def _index_all(self, vector_store, embedder, chunk_data):
        """Embed and store every sample chunk under a single path."""
        for entry in chunk_data:
            piece = SemanticChunk(content=entry["content"], metadata=entry["metadata"])
            piece.embedding = embedder.embed_single(piece.content)
            vector_store.add_chunk(piece, "/test/file.py")

    def test_auth_query_finds_auth_code(self, vector_store, embedder, sample_code_chunks):
        """An auth-flavored query ranks the authenticate function first."""
        self._index_all(vector_store, embedder, sample_code_chunks)

        hits = vector_store.search_similar(
            embedder.embed_single("user authentication login"), top_k=1
        )

        assert len(hits) > 0
        assert "authenticate" in hits[0].excerpt.lower()

    def test_database_query_finds_db_code(self, vector_store, embedder, sample_code_chunks):
        """A database query surfaces the DatabaseConnection snippet."""
        self._index_all(vector_store, embedder, sample_code_chunks)

        hits = vector_store.search_similar(
            embedder.embed_single("database connection execute query"), top_k=1
        )

        assert len(hits) > 0
        assert "database" in hits[0].excerpt.lower() or "connect" in hits[0].excerpt.lower()

    def test_math_query_finds_calculation_code(self, vector_store, embedder, sample_code_chunks):
        """A math query surfaces the calculate_sum snippet."""
        self._index_all(vector_store, embedder, sample_code_chunks)

        hits = vector_store.search_similar(
            embedder.embed_single("sum numbers add calculation"), top_k=1
        )

        assert len(hits) > 0
        assert "sum" in hits[0].excerpt.lower() or "calculate" in hits[0].excerpt.lower()


# === Performance Tests ===


class TestVectorSearchPerformance:
    """Latency budgets for embedding and search operations."""

    def test_embedding_performance(self, embedder):
        """A single embedding stays under the per-call budget."""
        snippet = "def calculate_sum(a, b): return a + b"
        embedder.embed_single(snippet)  # warm up (triggers lazy model load)

        repeats = 10
        started = time.perf_counter()
        for _ in range(repeats):
            embedder.embed_single(snippet)
        avg_ms = (time.perf_counter() - started) / repeats * 1000

        assert avg_ms < 100, f"Single embedding should be <100ms, got {avg_ms:.2f}ms"

    def test_batch_embedding_performance(self, embedder):
        """Batched embedding amortizes to a small per-text cost."""
        snippets = [f"def function_{i}(): pass" for i in range(50)]
        embedder.embed(snippets[:5])  # warm up

        started = time.perf_counter()
        embedder.embed(snippets)
        per_text_ms = (time.perf_counter() - started) * 1000 / len(snippets)

        assert per_text_ms < 20, f"Per-text embedding should be <20ms, got {per_text_ms:.2f}ms"

    def test_search_performance_small(self, vector_store, embedder):
        """Search over 100 chunks stays under 50ms on average."""
        for i in range(100):
            piece = SemanticChunk(
                content=f"def function_{i}(): return {i}",
                metadata={"index": i},
            )
            piece.embedding = embedder.embed_single(piece.content)
            vector_store.add_chunk(piece, f"/test/file_{i}.py")

        query_vec = embedder.embed_single("function return value")
        vector_store.search_similar(query_vec)  # warm up (populates cache)

        repeats = 10
        started = time.perf_counter()
        for _ in range(repeats):
            vector_store.search_similar(query_vec)
        avg_ms = (time.perf_counter() - started) / repeats * 1000

        assert avg_ms < 50, f"Search with 100 chunks should be <50ms, got {avg_ms:.2f}ms"

    def test_search_performance_medium(self, vector_store, embedder):
        """Search over 500 batch-inserted chunks stays under 100ms on average."""
        batch = []
        for i in range(500):
            piece = SemanticChunk(
                content=f"def function_{i}(x): return x * {i}",
                metadata={"index": i},
            )
            piece.embedding = embedder.embed_single(piece.content)
            batch.append(piece)

        vector_store.add_chunks(batch, "/test/bulk.py")

        query_vec = embedder.embed_single("multiply value")
        vector_store.search_similar(query_vec)  # warm up (populates cache)

        repeats = 5
        started = time.perf_counter()
        for _ in range(repeats):
            vector_store.search_similar(query_vec)
        avg_ms = (time.perf_counter() - started) / repeats * 1000

        assert avg_ms < 100, f"Search with 500 chunks should be <100ms, got {avg_ms:.2f}ms"


# === Thread Safety Tests ===


class TestThreadSafety:
    """Concurrent-access smoke tests."""

    def test_concurrent_searches(self, vector_store, embedder, sample_code_chunks):
        """Parallel searches all complete without raising."""
        # Populate store
        for entry in sample_code_chunks:
            piece = SemanticChunk(content=entry["content"], metadata=entry["metadata"])
            piece.embedding = embedder.embed_single(piece.content)
            vector_store.add_chunk(piece, "/test/file.py")

        hit_counts = []
        errors = []

        def run_search(query):
            # Collect results/errors via append, which is safe under the GIL.
            try:
                query_vec = embedder.embed_single(query)
                hit_counts.append(len(vector_store.search_similar(query_vec, top_k=3)))
            except Exception as exc:
                errors.append(str(exc))

        queries = ["authentication", "database", "function", "class", "async"]
        workers = [threading.Thread(target=run_search, args=(q,)) for q in queries]

        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        assert len(errors) == 0, f"Errors during concurrent search: {errors}"
        assert len(hit_counts) == len(queries)

    def test_concurrent_add_and_search(self, vector_store, embedder):
        """Interleaved writers and readers complete without raising."""
        errors = []

        def writer(idx):
            try:
                piece = SemanticChunk(
                    content=f"def task_{idx}(): pass",
                    metadata={"idx": idx},
                )
                piece.embedding = embedder.embed_single(piece.content)
                vector_store.add_chunk(piece, f"/test/task_{idx}.py")
            except Exception as exc:
                errors.append(f"Add error: {exc}")

        def reader():
            try:
                vector_store.search_similar(embedder.embed_single("function task"))
            except Exception as exc:
                errors.append(f"Search error: {exc}")

        workers = []
        for i in range(10):
            workers.append(threading.Thread(target=writer, args=(i,)))
            workers.append(threading.Thread(target=reader))

        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        assert len(errors) == 0, f"Errors during concurrent ops: {errors}"


# === Edge Cases ===


class TestEdgeCases:
    """Boundary-condition tests."""

    def test_very_short_content(self, vector_store, embedder):
        """A one-character chunk can be stored and found."""
        piece = SemanticChunk(content="x", metadata={})
        piece.embedding = embedder.embed_single(piece.content)
        vector_store.add_chunk(piece, "/test/short.py")

        hits = vector_store.search_similar(embedder.embed_single("x"))

        assert len(hits) == 1

    def test_special_characters_in_path(self, vector_store, embedder):
        """Paths with spaces, dashes, and underscores round-trip intact."""
        piece = SemanticChunk(content="def test(): pass", metadata={})
        piece.embedding = embedder.embed_single(piece.content)

        tricky_path = "/test/path with spaces/file-name_v2.py"
        vector_store.add_chunk(piece, tricky_path)

        hits = vector_store.search_similar(embedder.embed_single("test function"))

        assert len(hits) == 1
        assert hits[0].path == tricky_path

    def test_json_metadata_special_chars(self, vector_store, embedder):
        """Metadata containing quotes and backslashes survives storage."""
        tricky_meta = {
            "description": 'Test "quoted" text with \'single\' quotes',
            "path": "C:\\Users\\test\\file.py",
            "tags": ["tag1", "tag2"],
        }
        piece = SemanticChunk(content="def test(): pass", metadata=tricky_meta)
        piece.embedding = embedder.embed_single(piece.content)

        vector_store.add_chunk(piece, "/test/special.py")

        hits = vector_store.search_similar(embedder.embed_single("test"))

        assert len(hits) == 1
        assert hits[0].metadata["description"] == tricky_meta["description"]

    def test_search_zero_top_k(self, vector_store, embedder):
        """top_k=0 yields an empty result list."""
        piece = SemanticChunk(content="def test(): pass", metadata={})
        piece.embedding = embedder.embed_single(piece.content)
        vector_store.add_chunk(piece, "/test/file.py")

        hits = vector_store.search_similar(embedder.embed_single("test"), top_k=0)

        assert hits == []

    def test_search_very_high_min_score(self, vector_store, embedder):
        """An extreme min_score filters out weakly related hits."""
        piece = SemanticChunk(content="def hello(): print('world')", metadata={})
        piece.embedding = embedder.embed_single(piece.content)
        vector_store.add_chunk(piece, "/test/hello.py")

        # Unrelated query + near-1.0 threshold should return nothing
        hits = vector_store.search_similar(
            embedder.embed_single("database connection"), min_score=0.99
        )

        assert len(hits) == 0


# === Availability Check Tests ===


class TestAvailabilityCheck:
    """Tests for semantic availability checking.

    NOTE(review): the previous version asserted ``available is True`` and
    ``SEMANTIC_AVAILABLE is True`` unconditionally, which hard-fails the whole
    suite in any environment without the optional semantic dependencies.
    These tests now assert the API *contract* instead, so they are valid
    whether or not the optional deps are installed.
    """

    def test_check_semantic_available(self):
        """check_semantic_available returns a consistent (bool, error) pair."""
        available, error = check_semantic_available()

        assert isinstance(available, bool)
        if available:
            # Available => no error message.
            assert error is None
        else:
            # Unavailable => a non-empty diagnostic string explaining why.
            assert isinstance(error, str) and error

    def test_semantic_available_flag(self):
        """SEMANTIC_AVAILABLE agrees with check_semantic_available()."""
        available, _ = check_semantic_available()
        assert SEMANTIC_AVAILABLE is available