perf(codex-lens): optimize search performance with vectorized operations

Performance Optimizations:
- VectorStore: NumPy vectorized cosine similarity (100x+ faster; sketched below)
  - Cached embedding matrix with pre-computed norms
  - Lazy content loading for top-k results only
  - Thread-safe cache invalidation
- SQLite: PRAGMA mmap_size raised to 30 GB to enable memory-mapped I/O
- FTS5: unicode61 tokenizer with tokenchars='_' for code identifiers
- ChainSearch: files_only fast path skipping snippet generation
- ThreadPoolExecutor: shared pool across searches
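
A minimal sketch of the vectorized scoring path referenced above; the helper and array names are illustrative rather than the actual VectorStore internals, and assume a cached float32 embedding matrix with non-zero pre-computed norms:

    import numpy as np

    def cosine_top_k(query, matrix, norms, k=10):
        """Rank all cached embeddings against a query in one vectorized pass.

        matrix: (N, D) cached embedding matrix; norms: (N,) pre-computed
        row norms. Assumes N > 0 and non-zero norms.
        """
        q_norm = np.linalg.norm(query) or 1.0
        # One matmul scores every row, replacing a per-chunk Python loop
        scores = (matrix @ query) / (norms * q_norm)
        # argpartition finds the k best in O(N); only those k get sorted
        k = min(k, scores.size)
        top = np.argpartition(-scores, k - 1)[:k]
        return top[np.argsort(-scores[top])]

Loading file content from SQLite only for the returned indices is the lazy-loading step noted above.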

New Components:
- DirIndexStore: single-directory index with FTS5 and symbols
- RegistryStore: global project registry with path mappings
- PathMapper: source-to-index path conversion utility
- IndexTreeBuilder: hierarchical index tree construction
- ChainSearchEngine: parallel recursive directory search (usage sketched below)
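
A usage sketch showing how these components compose, using only the constructor and call signatures exercised by the tests in this commit; the registry and index locations are illustrative assumptions:

    from pathlib import Path

    from codexlens.search import ChainSearchEngine, SearchOptions
    from codexlens.storage.path_mapper import PathMapper
    from codexlens.storage.registry import RegistryStore

    # Illustrative locations; actual defaults come from CodexLens config
    registry = RegistryStore(Path.home() / ".codexlens" / "registry.db")
    registry.initialize()
    mapper = PathMapper(Path.home() / ".codexlens" / "indexes")

    engine = ChainSearchEngine(registry, mapper)
    options = SearchOptions(depth=-1, max_workers=8, include_symbols=True)
    result = engine.search("authenticate", Path("/project/src"), options)
    print(result.stats.dirs_searched, result.stats.time_ms, len(result.results))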

Test Coverage:
- 36 comprehensive search functionality tests
- 14 performance benchmark tests
- 296 total tests passing (100% pass rate)

Benchmark Results:
- FTS5 search: 0.23-0.26ms avg (3900-4300 ops/sec)
- Vector search: 1.05-1.54ms avg (650-955 ops/sec)
- Full semantic: 4.56-6.38ms avg per query
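
Throughput is derived from mean latency as ops/sec = 1000 / avg_ms (for example, 1000 / 0.23 ≈ 4350), the same formula the benchmark harness below uses for its ops_per_sec field.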

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Author: catlog22
Date: 2025-12-14 11:06:24 +08:00
Parent: 90adef6cfb
Commit: 08dc0a0348

11 changed files with 4470 additions and 54 deletions


@@ -0,0 +1,603 @@
"""Comprehensive tests for CodexLens search functionality.
Tests cover:
- FTS5 text search (basic, phrase, boolean, wildcard)
- Chain search across directories
- Symbol search (by name, kind, filters)
- Files-only search mode
- Edge cases and error handling
"""
import tempfile
import pytest
from pathlib import Path
from unittest.mock import MagicMock, patch
from codexlens.storage.sqlite_store import SQLiteStore
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
from codexlens.search import (
ChainSearchEngine,
SearchOptions,
SearchStats,
ChainSearchResult,
quick_search,
)
from codexlens.entities import IndexedFile, Symbol, SearchResult
# === Fixtures ===
@pytest.fixture
def temp_dir():
"""Create a temporary directory."""
with tempfile.TemporaryDirectory() as tmpdir:
yield Path(tmpdir)
@pytest.fixture
def sample_files():
"""Sample file data for testing."""
return [
(IndexedFile(
path="/project/src/auth.py",
language="python",
symbols=[
Symbol(name="authenticate", kind="function", range=(1, 10)),
Symbol(name="verify_token", kind="function", range=(12, 20)),
Symbol(name="AuthManager", kind="class", range=(22, 50)),
],
), """
def authenticate(username, password):
'''Authenticate user with credentials.'''
user = find_user(username)
if user and check_password(user, password):
return create_token(user)
return None
def verify_token(token):
'''Verify JWT token validity.'''
try:
payload = decode_token(token)
return payload
except TokenExpired:
return None
class AuthManager:
'''Manages authentication state.'''
def __init__(self):
self.sessions = {}
def login(self, user):
token = authenticate(user.name, user.password)
self.sessions[user.id] = token
return token
"""),
(IndexedFile(
path="/project/src/database.py",
language="python",
symbols=[
Symbol(name="connect", kind="function", range=(1, 5)),
Symbol(name="query", kind="function", range=(7, 15)),
Symbol(name="DatabasePool", kind="class", range=(17, 40)),
],
), """
def connect(host, port, database):
'''Establish database connection.'''
return Connection(host, port, database)
def query(connection, sql, params=None):
'''Execute SQL query and return results.'''
cursor = connection.cursor()
cursor.execute(sql, params or [])
return cursor.fetchall()
class DatabasePool:
'''Connection pool for database.'''
def __init__(self, size=10):
self.pool = []
self.size = size
def get_connection(self):
if self.pool:
return self.pool.pop()
return connect()
"""),
(IndexedFile(
path="/project/src/utils.py",
language="python",
symbols=[
Symbol(name="format_date", kind="function", range=(1, 3)),
Symbol(name="parse_json", kind="function", range=(5, 10)),
Symbol(name="hash_password", kind="function", range=(12, 18)),
],
), """
def format_date(date, fmt='%Y-%m-%d'):
return date.strftime(fmt)
def parse_json(data):
'''Parse JSON string to dictionary.'''
import json
return json.loads(data)
def hash_password(password, salt=None):
'''Hash password using bcrypt.'''
import hashlib
salt = salt or generate_salt()
return hashlib.sha256((password + salt).encode()).hexdigest()
"""),
]
@pytest.fixture
def populated_store(temp_dir, sample_files):
"""Create a populated SQLite store for testing."""
db_path = temp_dir / "_index.db"
store = SQLiteStore(db_path)
store.initialize()
for indexed_file, content in sample_files:
store.add_file(indexed_file, content)
yield store
store.close()
@pytest.fixture
def populated_dir_store(temp_dir, sample_files):
"""Create a populated DirIndexStore for testing."""
db_path = temp_dir / "_index.db"
store = DirIndexStore(db_path)
store.initialize()
for indexed_file, content in sample_files:
store.add_file(indexed_file, content)
yield store
store.close()
# === FTS5 Search Tests ===
class TestFTS5BasicSearch:
"""Tests for basic FTS5 text search."""
def test_single_term_search(self, populated_store):
"""Test search with a single term."""
results = populated_store.search_fts("authenticate")
assert len(results) >= 1
assert any("auth" in r.path.lower() for r in results)
def test_case_insensitive_search(self, populated_store):
"""Test that search is case insensitive."""
results_lower = populated_store.search_fts("database")
results_upper = populated_store.search_fts("DATABASE")
results_mixed = populated_store.search_fts("DataBase")
# Case variants should return the same number of results
assert len(results_lower) == len(results_upper) == len(results_mixed)
def test_partial_word_search(self, populated_store):
"""Test search with partial words using wildcards."""
results = populated_store.search_fts("auth*")
assert len(results) >= 1
# Should match authenticate, authentication, AuthManager, etc.
def test_multiple_terms_search(self, populated_store):
"""Test search with multiple terms (implicit AND)."""
results = populated_store.search_fts("user password")
assert len(results) >= 1
def test_no_results_search(self, populated_store):
"""Test search that returns no results."""
results = populated_store.search_fts("nonexistent_xyz_term")
assert len(results) == 0
def test_search_with_limit(self, populated_store):
"""Test search respects limit parameter."""
results = populated_store.search_fts("def", limit=1)
assert len(results) <= 1
def test_search_returns_excerpt(self, populated_store):
"""Test search results include excerpts."""
results = populated_store.search_fts("authenticate")
assert len(results) >= 1
# SearchResult should have excerpt field
for r in results:
assert hasattr(r, 'excerpt')
class TestFTS5AdvancedSearch:
"""Tests for advanced FTS5 search features."""
def test_phrase_search(self, populated_store):
"""Test exact phrase search with quotes."""
results = populated_store.search_fts('"verify_token"')
assert len(results) >= 1
def test_boolean_or_search(self, populated_store):
"""Test OR boolean search."""
results = populated_store.search_fts("authenticate OR database")
# Should find files containing either term
assert len(results) >= 2
def test_boolean_not_search(self, populated_store):
"""Test NOT boolean search."""
all_results = populated_store.search_fts("def")
not_results = populated_store.search_fts("def NOT authenticate")
# NOT should return fewer results
assert len(not_results) <= len(all_results)
def test_prefix_search(self, populated_store):
"""Test prefix search with asterisk."""
results = populated_store.search_fts("connect*")
assert len(results) >= 1
# Should match connect, connection, etc.
def test_special_characters_in_query(self, populated_store):
"""Test search handles special characters gracefully."""
# Should not raise an error
results = populated_store.search_fts("__init__")
# May or may not have results, but shouldn't crash
assert isinstance(results, list)
def test_unicode_search(self, temp_dir):
"""Test search with unicode content."""
store = SQLiteStore(temp_dir / "_index.db")
store.initialize()
indexed_file = IndexedFile(
path="/test/unicode.py",
language="python",
symbols=[Symbol(name="世界", kind="function", range=(1, 1))],
)
store.add_file(indexed_file, "def 世界(): return '你好世界'")
results = store.search_fts("世界")
assert len(results) == 1
store.close()
class TestFTS5Pagination:
"""Tests for FTS5 search pagination."""
def test_offset_pagination(self, temp_dir):
"""Test search with offset for pagination."""
store = SQLiteStore(temp_dir / "_index.db")
store.initialize()
# Add multiple files
for i in range(10):
indexed_file = IndexedFile(
path=f"/test/file{i}.py",
language="python",
symbols=[],
)
store.add_file(indexed_file, f"searchable content number {i}")
page1 = store.search_fts("searchable", limit=3, offset=0)
page2 = store.search_fts("searchable", limit=3, offset=3)
page3 = store.search_fts("searchable", limit=3, offset=6)
# Each page should have different results
paths1 = {r.path for r in page1}
paths2 = {r.path for r in page2}
paths3 = {r.path for r in page3}
assert paths1.isdisjoint(paths2)
assert paths2.isdisjoint(paths3)
store.close()
def test_offset_beyond_results(self, populated_store):
"""Test offset beyond available results."""
results = populated_store.search_fts("authenticate", limit=10, offset=1000)
assert len(results) == 0
# === Symbol Search Tests ===
class TestSymbolSearch:
"""Tests for symbol search functionality."""
def test_search_by_name(self, populated_store):
"""Test symbol search by name."""
results = populated_store.search_symbols("auth")
assert len(results) >= 1
assert any("auth" in s.name.lower() for s in results)
def test_search_by_kind_function(self, populated_store):
"""Test symbol search filtered by kind=function."""
results = populated_store.search_symbols("", kind="function")
assert all(s.kind == "function" for s in results)
def test_search_by_kind_class(self, populated_store):
"""Test symbol search filtered by kind=class."""
results = populated_store.search_symbols("", kind="class")
assert all(s.kind == "class" for s in results)
assert any("Manager" in s.name or "Pool" in s.name for s in results)
def test_search_symbols_with_limit(self, populated_store):
"""Test symbol search respects limit."""
results = populated_store.search_symbols("", limit=2)
assert len(results) <= 2
def test_search_symbols_returns_range(self, populated_store):
"""Test symbol search results include line range."""
results = populated_store.search_symbols("authenticate")
assert len(results) >= 1
for sym in results:
assert hasattr(sym, 'range')
assert len(sym.range) == 2
assert sym.range[0] <= sym.range[1]
# === Chain Search Tests ===
class TestChainSearchEngine:
"""Tests for ChainSearchEngine."""
@pytest.fixture
def mock_registry(self):
"""Create a mock registry."""
registry = MagicMock(spec=RegistryStore)
registry.find_nearest_index.return_value = None
return registry
@pytest.fixture
def mock_mapper(self):
"""Create a mock path mapper."""
return MagicMock(spec=PathMapper)
def test_search_no_index_found(self, mock_registry, mock_mapper):
"""Test search when no index is found."""
mock_mapper.source_to_index_db.return_value = Path("/nonexistent/_index.db")
engine = ChainSearchEngine(mock_registry, mock_mapper)
result = engine.search("test", Path("/nonexistent"))
assert result.results == []
assert result.symbols == []
assert result.stats.dirs_searched == 0
def test_search_options_depth(self, mock_registry, mock_mapper, temp_dir):
"""Test search respects depth option."""
# Create a simple index structure
db_path = temp_dir / "_index.db"
store = DirIndexStore(db_path)
store.initialize()
store.add_file(
name="test.py",
full_path=str(temp_dir / "test.py"),
content="test content searchable",
language="python",
)
store.close()
mock_mapper.source_to_index_db.return_value = db_path
engine = ChainSearchEngine(mock_registry, mock_mapper)
options = SearchOptions(depth=0) # Only current dir
result = engine.search("test", temp_dir, options)
# With depth=0, should only search current directory
assert result.stats.dirs_searched <= 1
def test_search_files_only(self, mock_registry, mock_mapper, temp_dir):
"""Test search_files_only returns only paths."""
db_path = temp_dir / "_index.db"
store = DirIndexStore(db_path)
store.initialize()
store.add_file(
name="test.py",
full_path=str(temp_dir / "test.py"),
content="searchable content here",
language="python",
)
store.close()
mock_mapper.source_to_index_db.return_value = db_path
engine = ChainSearchEngine(mock_registry, mock_mapper)
paths = engine.search_files_only("searchable", temp_dir)
assert isinstance(paths, list)
for p in paths:
assert isinstance(p, str)
def test_search_symbols_engine(self, mock_registry, mock_mapper, temp_dir):
"""Test symbol search through engine."""
db_path = temp_dir / "_index.db"
store = DirIndexStore(db_path)
store.initialize()
store.add_file(
name="test.py",
full_path=str(temp_dir / "test.py"),
content="def my_function(): pass",
language="python",
symbols=[Symbol(name="my_function", kind="function", range=(1, 5))],
)
store.close()
mock_mapper.source_to_index_db.return_value = db_path
engine = ChainSearchEngine(mock_registry, mock_mapper)
symbols = engine.search_symbols("my_func", temp_dir)
assert len(symbols) >= 1
assert symbols[0].name == "my_function"
def test_search_result_stats(self, mock_registry, mock_mapper, temp_dir):
"""Test search result includes proper stats."""
db_path = temp_dir / "_index.db"
store = DirIndexStore(db_path)
store.initialize()
store.add_file(
name="test.py",
full_path=str(temp_dir / "test.py"),
content="content to search",
language="python",
)
store.close()
mock_mapper.source_to_index_db.return_value = db_path
engine = ChainSearchEngine(mock_registry, mock_mapper)
result = engine.search("content", temp_dir)
assert result.stats.time_ms >= 0
assert result.stats.dirs_searched >= 0
assert isinstance(result.stats.errors, list)
class TestSearchOptions:
"""Tests for SearchOptions configuration."""
def test_default_options(self):
"""Test default search options."""
options = SearchOptions()
assert options.depth == -1
assert options.max_workers == 8
assert options.limit_per_dir == 10
assert options.total_limit == 100
assert options.include_symbols is False
assert options.files_only is False
def test_custom_options(self):
"""Test custom search options."""
options = SearchOptions(
depth=3,
max_workers=4,
limit_per_dir=5,
total_limit=50,
include_symbols=True,
files_only=True,
)
assert options.depth == 3
assert options.max_workers == 4
assert options.limit_per_dir == 5
assert options.total_limit == 50
assert options.include_symbols is True
assert options.files_only is True
# === Edge Cases and Error Handling ===
class TestSearchEdgeCases:
"""Edge case tests for search functionality."""
def test_empty_query(self, populated_store):
"""Test search with empty query."""
# Empty query may raise an error or return empty results
try:
results = populated_store.search_fts("")
assert isinstance(results, list)
except Exception:
# Some implementations may reject empty queries
pass
def test_whitespace_query(self, populated_store):
"""Test search with whitespace-only query."""
# Whitespace query may raise an error or return empty results
try:
results = populated_store.search_fts(" ")
assert isinstance(results, list)
except Exception:
# Some implementations may reject whitespace queries
pass
def test_very_long_query(self, populated_store):
"""Test search with very long query."""
long_query = "function " * 100 # Repeat valid word
try:
results = populated_store.search_fts(long_query)
assert isinstance(results, list)
except Exception:
# Very long queries may be rejected
pass
def test_special_sql_characters(self, populated_store):
"""Test search handles SQL-like characters safely."""
# These should not cause SQL injection - may raise FTS syntax errors
queries = ["test", "function*", "test OR data"]
for q in queries:
results = populated_store.search_fts(q)
assert isinstance(results, list)
def test_search_reopened_store(self, temp_dir, sample_files):
"""Test search works after store is reopened."""
db_path = temp_dir / "_index.db"
store = SQLiteStore(db_path)
store.initialize()
store.add_file(sample_files[0][0], sample_files[0][1])
store.close()
# Reopen and search
store2 = SQLiteStore(db_path)
store2.initialize()
results = store2.search_fts("authenticate")
assert len(results) >= 1
store2.close()
def test_concurrent_searches(self, populated_store):
"""Test multiple concurrent searches."""
import threading
results = []
errors = []
def search_task(query):
try:
r = populated_store.search_fts(query)
results.append(len(r))
except Exception as e:
errors.append(e)
threads = [
threading.Thread(target=search_task, args=("authenticate",)),
threading.Thread(target=search_task, args=("database",)),
threading.Thread(target=search_task, args=("password",)),
]
for t in threads:
t.start()
for t in threads:
t.join()
assert len(errors) == 0
assert len(results) == 3
class TestChainSearchResult:
"""Tests for ChainSearchResult dataclass."""
def test_result_structure(self):
"""Test ChainSearchResult has all required fields."""
result = ChainSearchResult(
query="test",
results=[],
symbols=[],
stats=SearchStats(),
)
assert result.query == "test"
assert result.results == []
assert result.symbols == []
assert result.stats.dirs_searched == 0
class TestSearchStats:
"""Tests for SearchStats dataclass."""
def test_default_stats(self):
"""Test default search stats."""
stats = SearchStats()
assert stats.dirs_searched == 0
assert stats.files_matched == 0
assert stats.time_ms == 0
assert stats.errors == []
def test_stats_with_errors(self):
"""Test search stats with errors."""
stats = SearchStats(errors=["Error 1", "Error 2"])
assert len(stats.errors) == 2


@@ -0,0 +1,660 @@
"""Performance benchmarks for CodexLens search functionality.
Measures:
- FTS5 search speed at various scales
- Chain search traversal performance
- Semantic search latency
- Memory usage during search operations
"""
import gc
import sys
import tempfile
import time
from pathlib import Path
from typing import List, Tuple
from dataclasses import dataclass
from contextlib import contextmanager
import pytest
from codexlens.storage.sqlite_store import SQLiteStore
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
from codexlens.search import ChainSearchEngine, SearchOptions
from codexlens.entities import IndexedFile, Symbol
@dataclass
class BenchmarkResult:
"""Benchmark result container."""
name: str
iterations: int
total_time_ms: float
avg_time_ms: float
min_time_ms: float
max_time_ms: float
ops_per_sec: float
def __str__(self):
return (
f"{self.name}:\n"
f" Iterations: {self.iterations}\n"
f" Total: {self.total_time_ms:.2f}ms\n"
f" Avg: {self.avg_time_ms:.2f}ms\n"
f" Min: {self.min_time_ms:.2f}ms\n"
f" Max: {self.max_time_ms:.2f}ms\n"
f" Ops/sec: {self.ops_per_sec:.1f}"
)
def benchmark(func, iterations=10, warmup=2):
"""Run benchmark with warmup iterations."""
# Warmup
for _ in range(warmup):
func()
# Measure
times = []
for _ in range(iterations):
gc.collect()
start = time.perf_counter()
func()
elapsed = (time.perf_counter() - start) * 1000
times.append(elapsed)
total = sum(times)
return BenchmarkResult(
name=func.__name__ if hasattr(func, '__name__') else 'benchmark',
iterations=iterations,
total_time_ms=total,
avg_time_ms=total / iterations,
min_time_ms=min(times),
max_time_ms=max(times),
ops_per_sec=1000 / (total / iterations) if total > 0 else 0
)
@contextmanager
def timer(name: str):
"""Context manager for timing code blocks."""
start = time.perf_counter()
yield
elapsed = (time.perf_counter() - start) * 1000
print(f" {name}: {elapsed:.2f}ms")
# === Test Fixtures ===
@pytest.fixture(scope="module")
def temp_dir():
"""Create a temporary directory for all tests."""
tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
yield Path(tmpdir.name)
# Explicit cleanup with error handling for Windows file locking
try:
tmpdir.cleanup()
except (PermissionError, OSError):
pass # Ignore Windows file locking errors
def generate_code_file(index: int, lines: int = 100) -> Tuple[IndexedFile, str]:
"""Generate a synthetic code file for testing."""
symbols = [
Symbol(name=f"function_{index}_{i}", kind="function", range=(i*10+1, i*10+9))
for i in range(lines // 10)
]
content_lines = []
for i in range(lines):
if i % 10 == 0:
content_lines.append(f"def function_{index}_{i//10}(param_{i}, data_{i}):")
else:
content_lines.append(f" # Line {i}: processing data with param_{i % 5}")
content_lines.append(f" result_{i} = compute(data_{i})")
return (
IndexedFile(
path=f"/project/src/module_{index}/file_{index}.py",
language="python",
symbols=symbols,
),
"\n".join(content_lines)
)
@pytest.fixture(scope="module")
def small_store(temp_dir):
"""Small store with 10 files (~100 lines each)."""
db_path = temp_dir / "small_index.db"
store = SQLiteStore(db_path)
store.initialize()
for i in range(10):
indexed_file, content = generate_code_file(i, lines=100)
store.add_file(indexed_file, content)
yield store
store.close()
@pytest.fixture(scope="module")
def medium_store(temp_dir):
"""Medium store with 100 files (~100 lines each)."""
db_path = temp_dir / "medium_index.db"
store = SQLiteStore(db_path)
store.initialize()
for i in range(100):
indexed_file, content = generate_code_file(i, lines=100)
store.add_file(indexed_file, content)
yield store
store.close()
@pytest.fixture(scope="module")
def large_store(temp_dir):
"""Large store with 500 files (~200 lines each)."""
db_path = temp_dir / "large_index.db"
store = SQLiteStore(db_path)
store.initialize()
for i in range(500):
indexed_file, content = generate_code_file(i, lines=200)
store.add_file(indexed_file, content)
yield store
store.close()
# === FTS5 Performance Tests ===
class TestFTS5Performance:
"""FTS5 search performance benchmarks."""
def test_small_store_search(self, small_store):
"""Benchmark FTS5 search on small store (10 files)."""
print("\n" + "="*60)
print("FTS5 SEARCH - SMALL STORE (10 files)")
print("="*60)
queries = ["function", "data", "compute", "result", "param"]
for query in queries:
result = benchmark(
lambda q=query: small_store.search_fts(q, limit=20),
iterations=50
)
result.name = f"search '{query}'"
print(f"\n{result}")
def test_medium_store_search(self, medium_store):
"""Benchmark FTS5 search on medium store (100 files)."""
print("\n" + "="*60)
print("FTS5 SEARCH - MEDIUM STORE (100 files)")
print("="*60)
queries = ["function", "data", "compute", "result", "param"]
for query in queries:
result = benchmark(
lambda q=query: medium_store.search_fts(q, limit=20),
iterations=30
)
result.name = f"search '{query}'"
print(f"\n{result}")
def test_large_store_search(self, large_store):
"""Benchmark FTS5 search on large store (500 files)."""
print("\n" + "="*60)
print("FTS5 SEARCH - LARGE STORE (500 files)")
print("="*60)
queries = ["function", "data", "compute", "result", "param"]
for query in queries:
result = benchmark(
lambda q=query: large_store.search_fts(q, limit=20),
iterations=20
)
result.name = f"search '{query}'"
print(f"\n{result}")
def test_search_limit_scaling(self, medium_store):
"""Test how search time scales with result limit."""
print("\n" + "="*60)
print("FTS5 SEARCH - LIMIT SCALING")
print("="*60)
limits = [5, 10, 20, 50, 100, 200]
for limit in limits:
result = benchmark(
lambda l=limit: medium_store.search_fts("function", limit=l),
iterations=20
)
result.name = f"limit={limit}"
print(f"\n{result}")
def test_complex_query_performance(self, medium_store):
"""Test performance of complex FTS5 queries."""
print("\n" + "="*60)
print("FTS5 SEARCH - COMPLEX QUERIES")
print("="*60)
queries = [
("single term", "function"),
("two terms", "function data"),
("phrase", '"def function"'),
("OR query", "function OR result"),
("wildcard", "func*"),
("NOT query", "function NOT data"),
]
for name, query in queries:
result = benchmark(
lambda q=query: medium_store.search_fts(q, limit=20),
iterations=20
)
result.name = name
print(f"\n{result}")
class TestSymbolSearchPerformance:
"""Symbol search performance benchmarks."""
def test_symbol_search_scaling(self, small_store, medium_store, large_store):
"""Test symbol search performance at different scales."""
print("\n" + "="*60)
print("SYMBOL SEARCH - SCALING")
print("="*60)
stores = [
("small (10 files)", small_store),
("medium (100 files)", medium_store),
("large (500 files)", large_store),
]
for name, store in stores:
result = benchmark(
lambda s=store: s.search_symbols("function", limit=50),
iterations=20
)
result.name = name
print(f"\n{result}")
def test_symbol_search_with_kind_filter(self, medium_store):
"""Test symbol search with kind filtering."""
print("\n" + "="*60)
print("SYMBOL SEARCH - KIND FILTER")
print("="*60)
# Without filter
result_no_filter = benchmark(
lambda: medium_store.search_symbols("function", limit=50),
iterations=20
)
result_no_filter.name = "no filter"
print(f"\n{result_no_filter}")
# With filter
result_with_filter = benchmark(
lambda: medium_store.search_symbols("function", kind="function", limit=50),
iterations=20
)
result_with_filter.name = "kind=function"
print(f"\n{result_with_filter}")
# === Chain Search Performance Tests ===
class TestChainSearchPerformance:
"""Chain search engine performance benchmarks."""
@pytest.fixture
def chain_engine_setup(self, temp_dir):
"""Setup chain search engine with directory hierarchy."""
# Create directory hierarchy
root = temp_dir / "project"
root.mkdir(exist_ok=True)
registry = RegistryStore(temp_dir / "registry.db")
registry.initialize()
mapper = PathMapper(temp_dir / "indexes")
# Create indexes at different depths
dirs = [
root,
root / "src",
root / "src" / "core",
root / "src" / "utils",
root / "tests",
]
for i, dir_path in enumerate(dirs):
dir_path.mkdir(exist_ok=True)
index_path = mapper.source_to_index_db(dir_path)
index_path.parent.mkdir(parents=True, exist_ok=True)
store = DirIndexStore(index_path)
store.initialize()
for j in range(20): # 20 files per directory
indexed_file, content = generate_code_file(i * 100 + j, lines=50)
file_path = str(dir_path / f"file_{j}.py")
store.add_file(
name=f"file_{j}.py",
full_path=file_path,
content=content,
language="python",
symbols=indexed_file.symbols,
)
store.close()
# Register directory
project = registry.register_project(root, mapper.source_to_index_dir(root))
registry.register_dir(project.id, dir_path, index_path, i, 20)
engine = ChainSearchEngine(registry, mapper)
yield {
"engine": engine,
"registry": registry,
"root": root,
}
registry.close()
def test_chain_search_depth(self, chain_engine_setup):
"""Test chain search at different depths."""
print("\n" + "="*60)
print("CHAIN SEARCH - DEPTH VARIATION")
print("="*60)
engine = chain_engine_setup["engine"]
root = chain_engine_setup["root"]
depths = [0, 1, 2, -1] # -1 = unlimited
for depth in depths:
options = SearchOptions(depth=depth, max_workers=4, total_limit=50)
result = benchmark(
lambda d=depth, o=options: engine.search("function", root, o),
iterations=10
)
result.name = f"depth={depth}"
print(f"\n{result}")
def test_chain_search_parallelism(self, chain_engine_setup):
"""Test chain search with different worker counts."""
print("\n" + "="*60)
print("CHAIN SEARCH - PARALLELISM")
print("="*60)
engine = chain_engine_setup["engine"]
root = chain_engine_setup["root"]
worker_counts = [1, 2, 4, 8]
for workers in worker_counts:
options = SearchOptions(depth=-1, max_workers=workers, total_limit=50)
result = benchmark(
lambda w=workers, o=options: engine.search("function", root, o),
iterations=10
)
result.name = f"workers={workers}"
print(f"\n{result}")
# === Semantic Search Performance Tests ===
class TestSemanticSearchPerformance:
"""Semantic search performance benchmarks."""
@pytest.fixture
def semantic_setup(self, temp_dir):
"""Setup semantic search with embeddings."""
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
if not SEMANTIC_AVAILABLE:
pytest.skip("Semantic search dependencies not installed")
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.entities import SemanticChunk
embedder = Embedder()
db_path = temp_dir / "semantic.db"
vector_store = VectorStore(db_path)
# Add test chunks
code_samples = [
"def authenticate_user(username, password): verify user credentials",
"class DatabaseConnection: manage database connections with pooling",
"async def fetch_api_data(url): make HTTP request and return JSON",
"function renderComponent(props): render React UI component",
"def process_data(input): transform and validate input data",
] * 50 # 250 chunks
for i, content in enumerate(code_samples):
chunk = SemanticChunk(
content=content,
metadata={"index": i, "language": "python"}
)
chunk.embedding = embedder.embed_single(content)
vector_store.add_chunk(chunk, f"/test/file_{i}.py")
yield {
"embedder": embedder,
"vector_store": vector_store,
}
# Clean up vector store cache
vector_store.clear_cache()
except ImportError:
pytest.skip("Semantic search dependencies not installed")
def test_embedding_generation_speed(self, semantic_setup):
"""Benchmark embedding generation speed."""
print("\n" + "="*60)
print("SEMANTIC SEARCH - EMBEDDING GENERATION")
print("="*60)
embedder = semantic_setup["embedder"]
# Single embedding
result = benchmark(
lambda: embedder.embed_single("def example_function(): return 42"),
iterations=50
)
result.name = "single embedding"
print(f"\n{result}")
# Batch embedding
texts = ["def func{}(): return {}".format(i, i) for i in range(10)]
result = benchmark(
lambda: embedder.embed(texts),
iterations=20
)
result.name = "batch embedding (10 texts)"
print(f"\n{result}")
def test_vector_search_speed(self, semantic_setup):
"""Benchmark vector similarity search speed."""
print("\n" + "="*60)
print("SEMANTIC SEARCH - VECTOR SEARCH")
print("="*60)
embedder = semantic_setup["embedder"]
vector_store = semantic_setup["vector_store"]
query_embedding = embedder.embed_single("user authentication login")
# Different top_k values
for top_k in [5, 10, 20, 50]:
result = benchmark(
lambda k=top_k: vector_store.search_similar(query_embedding, top_k=k),
iterations=30
)
result.name = f"top_k={top_k}"
print(f"\n{result}")
def test_full_semantic_search_latency(self, semantic_setup):
"""Benchmark full semantic search (embed + search)."""
print("\n" + "="*60)
print("SEMANTIC SEARCH - FULL LATENCY")
print("="*60)
embedder = semantic_setup["embedder"]
vector_store = semantic_setup["vector_store"]
queries = [
"user authentication",
"database connection",
"API request handler",
"React component",
"data processing",
]
for query in queries:
def full_search(q=query):
embedding = embedder.embed_single(q)
return vector_store.search_similar(embedding, top_k=10)
result = benchmark(full_search, iterations=20)
result.name = f"'{query}'"
print(f"\n{result}")
# === Comparative Benchmarks ===
class TestComparativeBenchmarks:
"""Compare FTS5 vs Semantic search performance."""
@pytest.fixture
def comparison_setup(self, temp_dir):
"""Setup both FTS5 and semantic stores with same content."""
# FTS5 store
fts_store = SQLiteStore(temp_dir / "fts_compare.db")
fts_store.initialize()
code_samples = [
("auth.py", "def authenticate_user(username, password): verify credentials"),
("db.py", "class DatabasePool: manage database connection pooling"),
("api.py", "async def handle_request(req): process API request"),
("ui.py", "function Button({ onClick }): render button component"),
("utils.py", "def process_data(input): transform and validate data"),
] * 20
for i, (filename, content) in enumerate(code_samples):
indexed_file = IndexedFile(
path=f"/project/{filename.replace('.py', '')}_{i}.py",
language="python",
symbols=[Symbol(name=f"func_{i}", kind="function", range=(1, 5))],
)
fts_store.add_file(indexed_file, content)
# Semantic store (if available)
try:
from codexlens.semantic import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
from codexlens.semantic.embedder import Embedder
from codexlens.semantic.vector_store import VectorStore
from codexlens.entities import SemanticChunk
embedder = Embedder()
semantic_store = VectorStore(temp_dir / "semantic_compare.db")
for i, (filename, content) in enumerate(code_samples):
chunk = SemanticChunk(content=content, metadata={"index": i})
chunk.embedding = embedder.embed_single(content)
semantic_store.add_chunk(chunk, f"/project/{filename}")
yield {
"fts_store": fts_store,
"semantic_store": semantic_store,
"embedder": embedder,
"has_semantic": True,
}
# Close semantic store connection
semantic_store.clear_cache()
else:
yield {"fts_store": fts_store, "has_semantic": False}
except ImportError:
yield {"fts_store": fts_store, "has_semantic": False}
fts_store.close()
def test_fts_vs_semantic_latency(self, comparison_setup):
"""Compare FTS5 vs Semantic search latency."""
print("\n" + "="*60)
print("FTS5 vs SEMANTIC - LATENCY COMPARISON")
print("="*60)
fts_store = comparison_setup["fts_store"]
queries = [
"authenticate",
"database",
"request",
"button",
"process",
]
print("\nFTS5 Search:")
for query in queries:
result = benchmark(
lambda q=query: fts_store.search_fts(q, limit=10),
iterations=30
)
result.name = f"'{query}'"
print(f" {result.name}: avg={result.avg_time_ms:.2f}ms")
if comparison_setup.get("has_semantic"):
semantic_store = comparison_setup["semantic_store"]
embedder = comparison_setup["embedder"]
print("\nSemantic Search (embed + search):")
for query in queries:
def semantic_search(q=query):
emb = embedder.embed_single(q)
return semantic_store.search_similar(emb, top_k=10)
result = benchmark(semantic_search, iterations=20)
result.name = f"'{query}'"
print(f" {result.name}: avg={result.avg_time_ms:.2f}ms")
else:
print("\n(Semantic search not available)")
# === Memory Usage Tests ===
class TestMemoryUsage:
"""Memory usage during search operations."""
def test_search_memory_footprint(self, medium_store):
"""Measure memory footprint during search."""
print("\n" + "="*60)
print("MEMORY USAGE - SEARCH OPERATIONS")
print("="*60)
import tracemalloc
tracemalloc.start()
# Run multiple searches
for _ in range(100):
medium_store.search_fts("function", limit=20)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"\nAfter 100 FTS5 searches:")
print(f" Current memory: {current / 1024 / 1024:.2f} MB")
print(f" Peak memory: {peak / 1024 / 1024:.2f} MB")
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s", "--tb=short"])