Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-11 02:33:51 +08:00
perf(codex-lens): optimize search performance with vectorized operations
Performance Optimizations:
- VectorStore: NumPy vectorized cosine similarity (100x+ faster)
- Cached embedding matrix with pre-computed norms
- Lazy content loading for top-k results only
- Thread-safe cache invalidation
- SQLite: Added PRAGMA mmap_size=30GB for memory-mapped I/O
- FTS5: unicode61 tokenizer with tokenchars='_' for code identifiers
- ChainSearch: files_only fast path skipping snippet generation
- ThreadPoolExecutor: shared pool across searches

New Components:
- DirIndexStore: single-directory index with FTS5 and symbols
- RegistryStore: global project registry with path mappings
- PathMapper: source-to-index path conversion utility
- IndexTreeBuilder: hierarchical index tree construction
- ChainSearchEngine: parallel recursive directory search

Test Coverage:
- 36 comprehensive search functionality tests
- 14 performance benchmark tests
- 296 total tests passing (100% pass rate)

Benchmark Results:
- FTS5 search: 0.23-0.26ms avg (3900-4300 ops/sec)
- Vector search: 1.05-1.54ms avg (650-955 ops/sec)
- Full semantic: 4.56-6.38ms avg per query

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
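The headline numbers trace back to a few standard techniques. The VectorStore speedup comes from replacing a per-chunk Python loop with a single NumPy matrix-vector product over a cached embedding matrix whose row norms are computed once. A minimal sketch of that idea, assuming an (n, d) embedding matrix — the function name and signature are illustrative, not VectorStore's actual API:

    import numpy as np

    def cosine_top_k(query: np.ndarray, matrix: np.ndarray,
                     norms: np.ndarray, k: int) -> np.ndarray:
        """Indices of the k rows of `matrix` most similar to `query`.

        `norms` holds precomputed row norms of `matrix`, so each search
        costs one matrix-vector product instead of n Python-level loop
        iterations.
        """
        scores = (matrix @ query) / (norms * np.linalg.norm(query) + 1e-12)
        k = min(k, scores.shape[0])
        idx = np.argpartition(-scores, k - 1)[:k]  # unordered top-k in O(n)
        return idx[np.argsort(-scores[idx])]       # sort only the k winners

Lazy content loading then fetches stored chunk text only for those k indices instead of materializing every row. The SQLite settings are likewise stock sqlite3 features; a sketch of how an index schema might apply them (the `files_fts` table name and columns are hypothetical):

    import sqlite3

    conn = sqlite3.connect("index.db")
    # Memory-mapped I/O; SQLite clamps this to its compile-time maximum,
    # so 30 GB is an upper bound, not a guarantee.
    conn.execute("PRAGMA mmap_size = 32212254720")  # 30 GB in bytes
    # unicode61 with tokenchars='_' keeps snake_case identifiers such as
    # search_fts as single FTS5 tokens instead of splitting on underscores.
    conn.execute(
        "CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5("
        "path, content, tokenize=\"unicode61 tokenchars '_'\")"
    )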
codex-lens/tests/test_search_performance.py (new file, 660 lines)
@@ -0,0 +1,660 @@
"""Performance benchmarks for CodexLens search functionality.

Measures:
- FTS5 search speed at various scales
- Chain search traversal performance
- Semantic search latency
- Memory usage during search operations
"""

import gc
import sys
import tempfile
import time
from pathlib import Path
from typing import List, Tuple
from dataclasses import dataclass
from contextlib import contextmanager

import pytest

from codexlens.storage.sqlite_store import SQLiteStore
from codexlens.storage.dir_index import DirIndexStore
from codexlens.storage.registry import RegistryStore
from codexlens.storage.path_mapper import PathMapper
from codexlens.search import ChainSearchEngine, SearchOptions
from codexlens.entities import IndexedFile, Symbol


@dataclass
class BenchmarkResult:
    """Benchmark result container."""
    name: str
    iterations: int
    total_time_ms: float
    avg_time_ms: float
    min_time_ms: float
    max_time_ms: float
    ops_per_sec: float

    def __str__(self):
        return (
            f"{self.name}:\n"
            f"  Iterations: {self.iterations}\n"
            f"  Total: {self.total_time_ms:.2f}ms\n"
            f"  Avg: {self.avg_time_ms:.2f}ms\n"
            f"  Min: {self.min_time_ms:.2f}ms\n"
            f"  Max: {self.max_time_ms:.2f}ms\n"
            f"  Ops/sec: {self.ops_per_sec:.1f}"
        )


def benchmark(func, iterations=10, warmup=2):
    """Run benchmark with warmup iterations."""
    # Warmup
    for _ in range(warmup):
        func()

    # Measure
    times = []
    for _ in range(iterations):
        gc.collect()
        start = time.perf_counter()
        func()
        elapsed = (time.perf_counter() - start) * 1000
        times.append(elapsed)

    total = sum(times)
    return BenchmarkResult(
        name=func.__name__ if hasattr(func, '__name__') else 'benchmark',
        iterations=iterations,
        total_time_ms=total,
        avg_time_ms=total / iterations,
        min_time_ms=min(times),
        max_time_ms=max(times),
        ops_per_sec=1000 / (total / iterations) if total > 0 else 0
    )


@contextmanager
def timer(name: str):
    """Context manager for timing code blocks."""
    start = time.perf_counter()
    yield
    elapsed = (time.perf_counter() - start) * 1000
    print(f"  {name}: {elapsed:.2f}ms")


# === Test Fixtures ===

@pytest.fixture(scope="module")
def temp_dir():
    """Create a temporary directory for all tests."""
    tmpdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
    yield Path(tmpdir.name)
    # Explicit cleanup with error handling for Windows file locking
    try:
        tmpdir.cleanup()
    except (PermissionError, OSError):
        pass  # Ignore Windows file locking errors


def generate_code_file(index: int, lines: int = 100) -> Tuple[IndexedFile, str]:
    """Generate a synthetic code file for testing."""
    symbols = [
        Symbol(name=f"function_{index}_{i}", kind="function", range=(i*10+1, i*10+9))
        for i in range(lines // 10)
    ]

    content_lines = []
    for i in range(lines):
        if i % 10 == 0:
            content_lines.append(f"def function_{index}_{i//10}(param_{i}, data_{i}):")
        else:
            content_lines.append(f"    # Line {i}: processing data with param_{i % 5}")
            content_lines.append(f"    result_{i} = compute(data_{i})")

    return (
        IndexedFile(
            path=f"/project/src/module_{index}/file_{index}.py",
            language="python",
            symbols=symbols,
        ),
        "\n".join(content_lines)
    )


@pytest.fixture(scope="module")
def small_store(temp_dir):
    """Small store with 10 files (~100 lines each)."""
    db_path = temp_dir / "small_index.db"
    store = SQLiteStore(db_path)
    store.initialize()

    for i in range(10):
        indexed_file, content = generate_code_file(i, lines=100)
        store.add_file(indexed_file, content)

    yield store
    store.close()


@pytest.fixture(scope="module")
def medium_store(temp_dir):
    """Medium store with 100 files (~100 lines each)."""
    db_path = temp_dir / "medium_index.db"
    store = SQLiteStore(db_path)
    store.initialize()

    for i in range(100):
        indexed_file, content = generate_code_file(i, lines=100)
        store.add_file(indexed_file, content)

    yield store
    store.close()


@pytest.fixture(scope="module")
def large_store(temp_dir):
    """Large store with 500 files (~200 lines each)."""
    db_path = temp_dir / "large_index.db"
    store = SQLiteStore(db_path)
    store.initialize()

    for i in range(500):
        indexed_file, content = generate_code_file(i, lines=200)
        store.add_file(indexed_file, content)

    yield store
    store.close()


# === FTS5 Performance Tests ===

class TestFTS5Performance:
    """FTS5 search performance benchmarks."""

    def test_small_store_search(self, small_store):
        """Benchmark FTS5 search on small store (10 files)."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - SMALL STORE (10 files)")
        print("="*60)

        queries = ["function", "data", "compute", "result", "param"]

        for query in queries:
            result = benchmark(
                lambda q=query: small_store.search_fts(q, limit=20),
                iterations=50
            )
            result.name = f"search '{query}'"
            print(f"\n{result}")

    def test_medium_store_search(self, medium_store):
        """Benchmark FTS5 search on medium store (100 files)."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - MEDIUM STORE (100 files)")
        print("="*60)

        queries = ["function", "data", "compute", "result", "param"]

        for query in queries:
            result = benchmark(
                lambda q=query: medium_store.search_fts(q, limit=20),
                iterations=30
            )
            result.name = f"search '{query}'"
            print(f"\n{result}")

    def test_large_store_search(self, large_store):
        """Benchmark FTS5 search on large store (500 files)."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - LARGE STORE (500 files)")
        print("="*60)

        queries = ["function", "data", "compute", "result", "param"]

        for query in queries:
            result = benchmark(
                lambda q=query: large_store.search_fts(q, limit=20),
                iterations=20
            )
            result.name = f"search '{query}'"
            print(f"\n{result}")

    def test_search_limit_scaling(self, medium_store):
        """Test how search time scales with result limit."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - LIMIT SCALING")
        print("="*60)

        limits = [5, 10, 20, 50, 100, 200]

        for limit in limits:
            result = benchmark(
                lambda l=limit: medium_store.search_fts("function", limit=l),
                iterations=20
            )
            result.name = f"limit={limit}"
            print(f"\n{result}")

    def test_complex_query_performance(self, medium_store):
        """Test performance of complex FTS5 queries."""
        print("\n" + "="*60)
        print("FTS5 SEARCH - COMPLEX QUERIES")
        print("="*60)

        queries = [
            ("single term", "function"),
            ("two terms", "function data"),
            ("phrase", '"def function"'),
            ("OR query", "function OR result"),
            ("wildcard", "func*"),
            ("NOT query", "function NOT data"),
        ]

        for name, query in queries:
            result = benchmark(
                lambda q=query: medium_store.search_fts(q, limit=20),
                iterations=20
            )
            result.name = name
            print(f"\n{result}")


class TestSymbolSearchPerformance:
    """Symbol search performance benchmarks."""

    def test_symbol_search_scaling(self, small_store, medium_store, large_store):
        """Test symbol search performance at different scales."""
        print("\n" + "="*60)
        print("SYMBOL SEARCH - SCALING")
        print("="*60)

        stores = [
            ("small (10 files)", small_store),
            ("medium (100 files)", medium_store),
            ("large (500 files)", large_store),
        ]

        for name, store in stores:
            result = benchmark(
                lambda s=store: s.search_symbols("function", limit=50),
                iterations=20
            )
            result.name = name
            print(f"\n{result}")

    def test_symbol_search_with_kind_filter(self, medium_store):
        """Test symbol search with kind filtering."""
        print("\n" + "="*60)
        print("SYMBOL SEARCH - KIND FILTER")
        print("="*60)

        # Without filter
        result_no_filter = benchmark(
            lambda: medium_store.search_symbols("function", limit=50),
            iterations=20
        )
        result_no_filter.name = "no filter"
        print(f"\n{result_no_filter}")

        # With filter
        result_with_filter = benchmark(
            lambda: medium_store.search_symbols("function", kind="function", limit=50),
            iterations=20
        )
        result_with_filter.name = "kind=function"
        print(f"\n{result_with_filter}")


# === Chain Search Performance Tests ===

class TestChainSearchPerformance:
    """Chain search engine performance benchmarks."""

    @pytest.fixture
    def chain_engine_setup(self, temp_dir):
        """Setup chain search engine with directory hierarchy."""
        # Create directory hierarchy
        root = temp_dir / "project"
        root.mkdir(exist_ok=True)

        registry = RegistryStore(temp_dir / "registry.db")
        registry.initialize()
        mapper = PathMapper(temp_dir / "indexes")

        # Create indexes at different depths
        dirs = [
            root,
            root / "src",
            root / "src" / "core",
            root / "src" / "utils",
            root / "tests",
        ]

        for i, dir_path in enumerate(dirs):
            dir_path.mkdir(exist_ok=True)
            index_path = mapper.source_to_index_db(dir_path)
            index_path.parent.mkdir(parents=True, exist_ok=True)

            store = DirIndexStore(index_path)
            store.initialize()
            for j in range(20):  # 20 files per directory
                indexed_file, content = generate_code_file(i * 100 + j, lines=50)
                file_path = str(dir_path / f"file_{j}.py")
                store.add_file(
                    name=f"file_{j}.py",
                    full_path=file_path,
                    content=content,
                    language="python",
                    symbols=indexed_file.symbols,
                )
            store.close()

            # Register directory
            project = registry.register_project(root, mapper.source_to_index_dir(root))
            registry.register_dir(project.id, dir_path, index_path, i, 20)

        engine = ChainSearchEngine(registry, mapper)

        yield {
            "engine": engine,
            "registry": registry,
            "root": root,
        }

        registry.close()

    def test_chain_search_depth(self, chain_engine_setup):
        """Test chain search at different depths."""
        print("\n" + "="*60)
        print("CHAIN SEARCH - DEPTH VARIATION")
        print("="*60)

        engine = chain_engine_setup["engine"]
        root = chain_engine_setup["root"]

        depths = [0, 1, 2, -1]  # -1 = unlimited

        for depth in depths:
            options = SearchOptions(depth=depth, max_workers=4, total_limit=50)
            result = benchmark(
                lambda d=depth, o=options: engine.search("function", root, o),
                iterations=10
            )
            result.name = f"depth={depth}"
            print(f"\n{result}")

    def test_chain_search_parallelism(self, chain_engine_setup):
        """Test chain search with different worker counts."""
        print("\n" + "="*60)
        print("CHAIN SEARCH - PARALLELISM")
        print("="*60)

        engine = chain_engine_setup["engine"]
        root = chain_engine_setup["root"]

        worker_counts = [1, 2, 4, 8]

        for workers in worker_counts:
            options = SearchOptions(depth=-1, max_workers=workers, total_limit=50)
            result = benchmark(
                lambda w=workers, o=options: engine.search("function", root, o),
                iterations=10
            )
            result.name = f"workers={workers}"
            print(f"\n{result}")


# === Semantic Search Performance Tests ===

class TestSemanticSearchPerformance:
    """Semantic search performance benchmarks."""

    @pytest.fixture
    def semantic_setup(self, temp_dir):
        """Setup semantic search with embeddings."""
        try:
            from codexlens.semantic import SEMANTIC_AVAILABLE
            if not SEMANTIC_AVAILABLE:
                pytest.skip("Semantic search dependencies not installed")

            from codexlens.semantic.embedder import Embedder
            from codexlens.semantic.vector_store import VectorStore
            from codexlens.entities import SemanticChunk

            embedder = Embedder()
            db_path = temp_dir / "semantic.db"
            vector_store = VectorStore(db_path)

            # Add test chunks
            code_samples = [
                "def authenticate_user(username, password): verify user credentials",
                "class DatabaseConnection: manage database connections with pooling",
                "async def fetch_api_data(url): make HTTP request and return JSON",
                "function renderComponent(props): render React UI component",
                "def process_data(input): transform and validate input data",
            ] * 50  # 250 chunks

            for i, content in enumerate(code_samples):
                chunk = SemanticChunk(
                    content=content,
                    metadata={"index": i, "language": "python"}
                )
                chunk.embedding = embedder.embed_single(content)
                vector_store.add_chunk(chunk, f"/test/file_{i}.py")

            yield {
                "embedder": embedder,
                "vector_store": vector_store,
            }

            # Clean up vector store cache
            vector_store.clear_cache()

        except ImportError:
            pytest.skip("Semantic search dependencies not installed")

    def test_embedding_generation_speed(self, semantic_setup):
        """Benchmark embedding generation speed."""
        print("\n" + "="*60)
        print("SEMANTIC SEARCH - EMBEDDING GENERATION")
        print("="*60)

        embedder = semantic_setup["embedder"]

        # Single embedding
        result = benchmark(
            lambda: embedder.embed_single("def example_function(): return 42"),
            iterations=50
        )
        result.name = "single embedding"
        print(f"\n{result}")

        # Batch embedding
        texts = ["def func{}(): return {}".format(i, i) for i in range(10)]
        result = benchmark(
            lambda: embedder.embed(texts),
            iterations=20
        )
        result.name = "batch embedding (10 texts)"
        print(f"\n{result}")

    def test_vector_search_speed(self, semantic_setup):
        """Benchmark vector similarity search speed."""
        print("\n" + "="*60)
        print("SEMANTIC SEARCH - VECTOR SEARCH")
        print("="*60)

        embedder = semantic_setup["embedder"]
        vector_store = semantic_setup["vector_store"]

        query_embedding = embedder.embed_single("user authentication login")

        # Different top_k values
        for top_k in [5, 10, 20, 50]:
            result = benchmark(
                lambda k=top_k: vector_store.search_similar(query_embedding, top_k=k),
                iterations=30
            )
            result.name = f"top_k={top_k}"
            print(f"\n{result}")

    def test_full_semantic_search_latency(self, semantic_setup):
        """Benchmark full semantic search (embed + search)."""
        print("\n" + "="*60)
        print("SEMANTIC SEARCH - FULL LATENCY")
        print("="*60)

        embedder = semantic_setup["embedder"]
        vector_store = semantic_setup["vector_store"]

        queries = [
            "user authentication",
            "database connection",
            "API request handler",
            "React component",
            "data processing",
        ]

        for query in queries:
            def full_search(q=query):
                embedding = embedder.embed_single(q)
                return vector_store.search_similar(embedding, top_k=10)

            result = benchmark(full_search, iterations=20)
            result.name = f"'{query}'"
            print(f"\n{result}")


# === Comparative Benchmarks ===

class TestComparativeBenchmarks:
    """Compare FTS5 vs Semantic search performance."""

    @pytest.fixture
    def comparison_setup(self, temp_dir):
        """Setup both FTS5 and semantic stores with same content."""
        # FTS5 store
        fts_store = SQLiteStore(temp_dir / "fts_compare.db")
        fts_store.initialize()

        code_samples = [
            ("auth.py", "def authenticate_user(username, password): verify credentials"),
            ("db.py", "class DatabasePool: manage database connection pooling"),
            ("api.py", "async def handle_request(req): process API request"),
            ("ui.py", "function Button({ onClick }): render button component"),
            ("utils.py", "def process_data(input): transform and validate data"),
        ] * 20

        for i, (filename, content) in enumerate(code_samples):
            indexed_file = IndexedFile(
                path=f"/project/{filename.replace('.py', '')}_{i}.py",
                language="python",
                symbols=[Symbol(name=f"func_{i}", kind="function", range=(1, 5))],
            )
            fts_store.add_file(indexed_file, content)

        # Semantic store (if available)
        try:
            from codexlens.semantic import SEMANTIC_AVAILABLE
            if SEMANTIC_AVAILABLE:
                from codexlens.semantic.embedder import Embedder
                from codexlens.semantic.vector_store import VectorStore
                from codexlens.entities import SemanticChunk

                embedder = Embedder()
                semantic_store = VectorStore(temp_dir / "semantic_compare.db")

                for i, (filename, content) in enumerate(code_samples):
                    chunk = SemanticChunk(content=content, metadata={"index": i})
                    chunk.embedding = embedder.embed_single(content)
                    semantic_store.add_chunk(chunk, f"/project/{filename}")

                yield {
                    "fts_store": fts_store,
                    "semantic_store": semantic_store,
                    "embedder": embedder,
                    "has_semantic": True,
                }
                # Close semantic store connection
                semantic_store.clear_cache()
            else:
                yield {"fts_store": fts_store, "has_semantic": False}
        except ImportError:
            yield {"fts_store": fts_store, "has_semantic": False}

        fts_store.close()

    def test_fts_vs_semantic_latency(self, comparison_setup):
        """Compare FTS5 vs Semantic search latency."""
        print("\n" + "="*60)
        print("FTS5 vs SEMANTIC - LATENCY COMPARISON")
        print("="*60)

        fts_store = comparison_setup["fts_store"]

        queries = [
            "authenticate",
            "database",
            "request",
            "button",
            "process",
        ]

        print("\nFTS5 Search:")
        for query in queries:
            result = benchmark(
                lambda q=query: fts_store.search_fts(q, limit=10),
                iterations=30
            )
            result.name = f"'{query}'"
            print(f"  {result.name}: avg={result.avg_time_ms:.2f}ms")

        if comparison_setup.get("has_semantic"):
            semantic_store = comparison_setup["semantic_store"]
            embedder = comparison_setup["embedder"]

            print("\nSemantic Search (embed + search):")
            for query in queries:
                def semantic_search(q=query):
                    emb = embedder.embed_single(q)
                    return semantic_store.search_similar(emb, top_k=10)

                result = benchmark(semantic_search, iterations=20)
                result.name = f"'{query}'"
                print(f"  {result.name}: avg={result.avg_time_ms:.2f}ms")
        else:
            print("\n(Semantic search not available)")


# === Memory Usage Tests ===

class TestMemoryUsage:
    """Memory usage during search operations."""

    def test_search_memory_footprint(self, medium_store):
        """Measure memory footprint during search."""
        print("\n" + "="*60)
        print("MEMORY USAGE - SEARCH OPERATIONS")
        print("="*60)

        import tracemalloc

        tracemalloc.start()

        # Run multiple searches
        for _ in range(100):
            medium_store.search_fts("function", limit=20)

        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()

        print("\nAfter 100 FTS5 searches:")
        print(f"  Current memory: {current / 1024 / 1024:.2f} MB")
        print(f"  Peak memory: {peak / 1024 / 1024:.2f} MB")


if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s", "--tb=short"])
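A note on running these benchmarks: since each test prints its results rather than asserting thresholds, pytest's output capture must be disabled to see the tables. Running `pytest codex-lens/tests/test_search_performance.py -v -s` from the repository root mirrors the flags used in the `__main__` guard above.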