Files
Claude-Code-Workflow/codex-lens/tests/test_pure_vector_search.py
catlog22 4061ae48c4 feat: Implement adaptive RRF weights and query intent detection
- Added integration tests for adaptive RRF weights in hybrid search.
- Enhanced query intent detection with new classifications: keyword, semantic, and mixed.
- Introduced symbol boosting in search results based on explicit symbol matches.
- Implemented embedding-based reranking with configurable options.
- Added global symbol index for efficient symbol lookups across projects.
- Improved file deletion handling on Windows to avoid permission errors.
- Updated chunk configuration to increase overlap for better context.
- Modified package.json test script to target specific test files.
- Created comprehensive writing style guidelines for documentation.
- Added TypeScript tests for query intent detection and adaptive weights.
- Established performance benchmarks for global symbol indexing.
2025-12-26 15:08:47 +08:00

346 lines
11 KiB
Python

"""Tests for pure vector search functionality."""
import pytest
import sqlite3
import tempfile
import time
from pathlib import Path
from codexlens.search.hybrid_search import HybridSearchEngine
from codexlens.storage.dir_index import DirIndexStore
# Check if semantic dependencies are available
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
SEMANTIC_DEPS_AVAILABLE = SEMANTIC_AVAILABLE
except ImportError:
SEMANTIC_DEPS_AVAILABLE = False
def _safe_unlink(path: Path, retries: int = 5, delay_s: float = 0.05) -> None:
"""Best-effort unlink for Windows where SQLite can keep files locked briefly."""
for attempt in range(retries):
try:
path.unlink()
return
except FileNotFoundError:
return
except PermissionError:
time.sleep(delay_s * (attempt + 1))
try:
path.unlink(missing_ok=True)
except (PermissionError, OSError):
pass
class TestPureVectorSearch:
"""Tests for pure vector search mode."""
@pytest.fixture
def sample_db(self):
"""Create sample database with files."""
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
store = DirIndexStore(db_path)
store.initialize()
# Add sample files
files = {
"auth.py": "def authenticate_user(username, password): pass",
"login.py": "def login_handler(credentials): pass",
"user.py": "class User: pass",
}
with store._get_connection() as conn:
for path, content in files.items():
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(path, path, content, "python", 0.0)
)
conn.commit()
yield db_path
store.close()
if db_path.exists():
_safe_unlink(db_path)
def test_pure_vector_without_embeddings(self, sample_db):
"""Test pure_vector mode returns empty when no embeddings exist."""
engine = HybridSearchEngine()
results = engine.search(
sample_db,
"authentication",
limit=10,
enable_vector=True,
pure_vector=True,
)
# Should return empty list because no embeddings exist
assert isinstance(results, list)
assert len(results) == 0, \
"Pure vector search should return empty when no embeddings exist"
def test_vector_with_fallback(self, sample_db):
"""Test vector mode (with fallback) returns FTS results when no embeddings."""
engine = HybridSearchEngine()
results = engine.search(
sample_db,
"authenticate",
limit=10,
enable_vector=True,
pure_vector=False, # Allow FTS fallback
)
# Should return FTS results even without embeddings
assert isinstance(results, list)
assert len(results) > 0, \
"Vector mode with fallback should return FTS results"
# Verify results come from exact FTS
paths = [r.path for r in results]
assert "auth.py" in paths, "Should find auth.py via FTS"
def test_pure_vector_invalid_config(self, sample_db):
"""Test pure_vector=True but enable_vector=False logs warning."""
engine = HybridSearchEngine()
# Invalid: pure_vector=True but enable_vector=False
results = engine.search(
sample_db,
"test",
limit=10,
enable_vector=False,
pure_vector=True,
)
# Should fallback to exact search
assert isinstance(results, list)
def test_hybrid_mode_ignores_pure_vector(self, sample_db):
"""Test hybrid mode works normally (ignores pure_vector)."""
engine = HybridSearchEngine()
results = engine.search(
sample_db,
"authenticate",
limit=10,
enable_fuzzy=True,
enable_vector=False,
pure_vector=False, # Should be ignored in hybrid
)
# Should return results from exact + fuzzy
assert isinstance(results, list)
assert len(results) > 0
@pytest.mark.skipif(not SEMANTIC_DEPS_AVAILABLE, reason="Semantic dependencies not available")
class TestPureVectorWithEmbeddings:
"""Tests for pure vector search with actual embeddings."""
@pytest.fixture
def db_with_embeddings(self):
"""Create database with embeddings."""
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
store = DirIndexStore(db_path)
store.initialize()
# Add sample files
files = {
"auth/authentication.py": """
def authenticate_user(username: str, password: str) -> bool:
'''Verify user credentials against database.'''
return check_password(username, password)
def check_password(user: str, pwd: str) -> bool:
'''Check if password matches stored hash.'''
return True
""",
"auth/login.py": """
def login_handler(credentials: dict) -> bool:
'''Handle user login request.'''
username = credentials.get('username')
password = credentials.get('password')
return authenticate_user(username, password)
""",
}
with store._get_connection() as conn:
for path, content in files.items():
name = path.split('/')[-1]
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(name, path, content, "python", 0.0)
)
conn.commit()
# Generate embeddings
vector_store = None
try:
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.semantic.chunker import Chunker, ChunkConfig
embedder = Embedder(profile="fast") # Use fast model for testing
vector_store = VectorStore(db_path)
chunker = Chunker(config=ChunkConfig(max_chunk_size=1000))
with sqlite3.connect(db_path) as conn:
conn.row_factory = sqlite3.Row
rows = conn.execute("SELECT full_path, content FROM files").fetchall()
for row in rows:
chunks = chunker.chunk_sliding_window(
row["content"],
file_path=row["full_path"],
language="python"
)
for chunk in chunks:
chunk.embedding = embedder.embed_single(chunk.content)
if chunks:
vector_store.add_chunks(chunks, row["full_path"])
except Exception as exc:
pytest.skip(f"Failed to generate embeddings: {exc}")
finally:
if vector_store is not None:
vector_store.close()
yield db_path
store.close()
if db_path.exists():
_safe_unlink(db_path)
def test_pure_vector_with_embeddings(self, db_with_embeddings):
"""Test pure vector search returns results when embeddings exist."""
engine = HybridSearchEngine()
results = engine.search(
db_with_embeddings,
"how to verify user credentials", # Natural language query
limit=10,
enable_vector=True,
pure_vector=True,
)
# Should return results from vector search only
assert isinstance(results, list)
assert len(results) > 0, "Pure vector search should return results"
# Results should have semantic relevance
for result in results:
assert result.score > 0
assert result.path is not None
def test_compare_pure_vs_hybrid(self, db_with_embeddings):
"""Compare pure vector vs hybrid search results."""
engine = HybridSearchEngine()
# Pure vector search
pure_results = engine.search(
db_with_embeddings,
"verify credentials",
limit=10,
enable_vector=True,
pure_vector=True,
)
# Hybrid search
hybrid_results = engine.search(
db_with_embeddings,
"verify credentials",
limit=10,
enable_fuzzy=True,
enable_vector=True,
pure_vector=False,
)
# Both should return results
assert len(pure_results) > 0, "Pure vector should find results"
assert len(hybrid_results) > 0, "Hybrid should find results"
# Hybrid may have more results (FTS + vector)
# But pure should still be useful for semantic queries
class TestSearchModeComparison:
"""Compare different search modes."""
@pytest.fixture
def comparison_db(self):
"""Create database for mode comparison."""
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
db_path = Path(f.name)
store = DirIndexStore(db_path)
store.initialize()
files = {
"auth.py": "def authenticate(): pass",
"login.py": "def login(): pass",
}
with store._get_connection() as conn:
for path, content in files.items():
conn.execute(
"""INSERT INTO files (name, full_path, content, language, mtime)
VALUES (?, ?, ?, ?, ?)""",
(path, path, content, "python", 0.0)
)
conn.commit()
yield db_path
store.close()
if db_path.exists():
_safe_unlink(db_path)
def test_mode_comparison_without_embeddings(self, comparison_db):
"""Compare all search modes without embeddings."""
engine = HybridSearchEngine()
query = "authenticate"
# Test each mode
modes = [
("exact", False, False, False),
("fuzzy", True, False, False),
("vector", False, True, False), # With fallback
("pure_vector", False, True, True), # No fallback
]
results = {}
for mode_name, fuzzy, vector, pure in modes:
result = engine.search(
comparison_db,
query,
limit=10,
enable_fuzzy=fuzzy,
enable_vector=vector,
pure_vector=pure,
)
results[mode_name] = len(result)
# Assertions
assert results["exact"] > 0, "Exact should find results"
assert results["fuzzy"] >= results["exact"], "Fuzzy should find at least as many"
assert results["vector"] > 0, "Vector with fallback should find results (from FTS)"
assert results["pure_vector"] == 0, "Pure vector should return empty (no embeddings)"
# Log comparison
print("\nMode comparison (without embeddings):")
for mode, count in results.items():
print(f" {mode}: {count} results")
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])