Add unit tests for LspGraphBuilder class

- Implement comprehensive unit tests for the LspGraphBuilder class to validate that it builds code association graphs correctly.
- Cover single-level graph expansion, max-node and max-depth boundaries, concurrent expansion limits, document symbol caching, error handling during node expansion, and edge cases such as empty seed lists and self-referencing nodes.
- Use pytest and asyncio for asynchronous testing, and mock the LspBridge methods.
2026-03-21 19:08:17 +08:00 · 2026-01-20 12:49:31 +08:00
parent 1376dc71d9
commit 2f3a14e946
33 changed files with 8303 additions and 716 deletions
--- a/codex-lens/src/codexlens/search/hybrid_search.py
+++ b/codex-lens/src/codexlens/search/hybrid_search.py
@@ -49,6 +49,13 @@ from codexlens.search.ranking import (
 )
 from codexlens.storage.dir_index import DirIndexStore

+# Optional LSP imports (for real-time graph expansion)
+try:
+    from codexlens.lsp import LspBridge, LspGraphBuilder
+    HAS_LSP = True
+except ImportError:
+    HAS_LSP = False
+

 # Three-way fusion weights (FTS + Vector + SPLADE)
 THREE_WAY_WEIGHTS = {
@@ -113,6 +120,9 @@ class HybridSearchEngine:
        enable_vector: bool = False,
        pure_vector: bool = False,
        enable_splade: bool = False,
+        enable_lsp_graph: bool = False,
+        lsp_max_depth: int = 1,
+        lsp_max_nodes: int = 20,
    ) -> List[SearchResult]:
        """Execute hybrid search with parallel retrieval and RRF fusion.

@@ -124,6 +134,9 @@ class HybridSearchEngine:
            enable_vector: Enable vector search (default False)
            pure_vector: If True, only use vector search without FTS fallback (default False)
            enable_splade: If True, force SPLADE sparse neural search (default False)
+            enable_lsp_graph: If True, enable real-time LSP graph expansion (default False)
+            lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1)
+            lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20)

        Returns:
            List of SearchResult objects sorted by fusion score
@@ -140,6 +153,9 @@ class HybridSearchEngine:
            >>> # SPLADE sparse neural search
            >>> results = engine.search(Path("project/_index.db"), "auth flow",
            ...                         enable_splade=True, enable_vector=True)
+            >>> # With LSP graph expansion (real-time)
+            >>> results = engine.search(Path("project/_index.db"), "auth flow",
+            ...                         enable_vector=True, enable_lsp_graph=True)
            >>> for r in results[:5]:
            ...     print(f"{r.path}: {r.score:.3f}")
        """
@@ -228,9 +244,21 @@ class HybridSearchEngine:
                if enable_vector:
                    backends["vector"] = True

+        # Add LSP graph expansion if requested and available
+        if enable_lsp_graph and HAS_LSP:
+            backends["lsp_graph"] = True
+        elif enable_lsp_graph and not HAS_LSP:
+            self.logger.warning(
+                "LSP graph search requested but dependencies not available. "
+                "Install: pip install aiohttp"
+            )
+
        # Execute parallel searches
        with timer("parallel_search_total", self.logger):
-            results_map = self._search_parallel(index_path, query, backends, limit, vector_category)
+            results_map = self._search_parallel(
+                index_path, query, backends, limit, vector_category,
+                lsp_max_depth, lsp_max_nodes
+            )

        # Provide helpful message if pure-vector mode returns no results
        if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
@@ -427,6 +455,8 @@ class HybridSearchEngine:
        backends: Dict[str, bool],
        limit: int,
        category: Optional[str] = None,
+        lsp_max_depth: int = 1,
+        lsp_max_nodes: int = 20,
    ) -> Dict[str, List[SearchResult]]:
        """Execute parallel searches across enabled backends.

@@ -436,6 +466,8 @@ class HybridSearchEngine:
            backends: Dictionary of backend name to enabled flag
            limit: Results limit per backend
            category: Optional category filter for vector search ('code' or 'doc')
+            lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1)
+            lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20)

        Returns:
            Dictionary mapping source name to results list
@@ -477,6 +509,14 @@ class HybridSearchEngine:
                )
                future_to_source[future] = "splade"

+            if backends.get("lsp_graph"):
+                submit_times["lsp_graph"] = time.perf_counter()
+                future = executor.submit(
+                    self._search_lsp_graph, index_path, query, limit,
+                    lsp_max_depth, lsp_max_nodes
+                )
+                future_to_source[future] = "lsp_graph"
+
            # Collect results as they complete with timeout protection
            try:
                for future in as_completed(future_to_source, timeout=30.0):
@@ -1211,7 +1251,159 @@ class HybridSearchEngine:
                ))
            
            return results
-            
+
        except Exception as exc:
            self.logger.debug("SPLADE search error: %s", exc)
            return []
+
+    def _search_lsp_graph(
+        self,
+        index_path: Path,
+        query: str,
+        limit: int,
+        max_depth: int = 1,
+        max_nodes: int = 20,
+    ) -> List[SearchResult]:
+        """Execute LSP-based graph expansion search.
+
+        Picks up to three seed results (vector search first, then SPLADE,
+        then exact FTS), converts them to ``CodeSymbolNode`` objects and
+        lets ``LspGraphBuilder`` expand a graph from them through an
+        ``LspBridge``. Every non-seed node of the resulting graph is
+        returned as a ``SearchResult``.
+
+        The method is best-effort: any exception is logged at debug level
+        and turned into an empty result list.
+
+        Args:
+            index_path: Path to _index.db file
+            query: Natural language query string
+            limit: Maximum results
+            max_depth: Maximum depth for LSP graph BFS expansion (default 1)
+            max_nodes: Maximum nodes to collect in LSP graph (default 20)
+
+        Returns:
+            List of SearchResult from graph expansion (seed nodes excluded),
+            at most ``limit`` entries. Empty when LSP support is missing,
+            no seeds are found, or the expansion fails.
+        """
+        import asyncio
+        import concurrent.futures
+
+        if not HAS_LSP:
+            self.logger.debug("LSP dependencies not available")
+            return []
+
+        def seed_key(seed) -> str:
+            # Single source of truth for the identifier used both as the
+            # seed node id and as the filter key when dropping seeds below.
+            return f"{seed.path}:{seed.symbol_name or 'unknown'}:{seed.start_line or 0}"
+
+        try:
+            # Try multiple seed sources in priority order
+            seed_source = "none"
+
+            # 1. Try vector search first (best semantic match)
+            seeds = self._search_vector(index_path, query, limit=3, category="code")
+            if seeds:
+                seed_source = "vector"
+
+            # 2. Fallback to SPLADE if vector returns nothing
+            if not seeds:
+                self.logger.debug("Vector search returned no seeds, trying SPLADE")
+                seeds = self._search_splade(index_path, query, limit=3)
+                if seeds:
+                    seed_source = "splade"
+
+            # 3. Fallback to exact FTS if SPLADE also fails
+            if not seeds:
+                self.logger.debug("SPLADE returned no seeds, trying exact FTS")
+                seeds = self._search_exact(index_path, query, limit=3)
+                if seeds:
+                    seed_source = "exact_fts"
+
+            # 4. No seeds available from any source
+            if not seeds:
+                self.logger.debug("No seed results available for LSP graph expansion")
+                return []
+
+            self.logger.debug(
+                "LSP graph expansion using %d seeds from %s",
+                len(seeds),
+                seed_source,
+            )
+
+            # Convert SearchResult to CodeSymbolNode for LSP processing
+            from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range
+
+            seed_nodes = []
+            for seed in seeds:
+                try:
+                    node = CodeSymbolNode(
+                        id=seed_key(seed),
+                        name=seed.symbol_name or "unknown",
+                        kind=seed.symbol_kind or "unknown",
+                        file_path=seed.path,
+                        range=Range(
+                            start_line=seed.start_line or 1,
+                            start_character=0,
+                            end_line=seed.end_line or seed.start_line or 1,
+                            end_character=0,
+                        ),
+                        raw_code=seed.content or "",
+                        docstring=seed.excerpt or "",
+                    )
+                    seed_nodes.append(node)
+                except Exception as e:
+                    # A malformed seed must not abort the whole expansion.
+                    self.logger.debug("Failed to create seed node: %s", e)
+                    continue
+
+            if not seed_nodes:
+                return []
+
+            # Run async LSP expansion in sync context
+            async def expand_graph():
+                async with LspBridge() as bridge:
+                    builder = LspGraphBuilder(max_depth=max_depth, max_nodes=max_nodes)
+                    return await builder.build_from_seeds(seed_nodes, bridge)
+
+            try:
+                asyncio.get_running_loop()
+            except RuntimeError:
+                # No loop is running in this thread, so it is safe to drive
+                # the coroutine to completion on a fresh loop right here.
+                graph = asyncio.run(expand_graph())
+            else:
+                # A loop is already running in this thread. Scheduling the
+                # coroutine on that loop and then blocking on its result
+                # would deadlock (the loop cannot advance while we wait),
+                # so run it on a private loop in a helper thread instead.
+                executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+                try:
+                    graph = executor.submit(
+                        lambda: asyncio.run(expand_graph())
+                    ).result(timeout=5.0)
+                finally:
+                    # Do not wait for a timed-out expansion to finish.
+                    executor.shutdown(wait=False)
+
+            # Convert graph nodes to SearchResult
+            # Create set of seed identifiers for fast lookup
+            seed_ids = {seed_key(seed) for seed in seeds}
+
+            # Calculate score based on graph position.
+            # Every expanded node is currently treated as one hop away from
+            # a seed, so all results share the same score.
+            depth = 1  # Simple heuristic, could be improved
+            score = 0.8 / (1 + depth)  # Score decreases with depth
+
+            results = []
+            for node_id, node in graph.nodes.items():
+                # Skip seed nodes using ID comparison (already in other results)
+                if node_id in seed_ids or node.id in seed_ids:
+                    continue
+
+                # Prefer the docstring as excerpt, fall back to the code.
+                if node.docstring:
+                    excerpt = node.docstring[:200]
+                elif node.raw_code:
+                    excerpt = node.raw_code[:200]
+                else:
+                    excerpt = ""
+
+                results.append(SearchResult(
+                    path=node.file_path,
+                    score=score,
+                    excerpt=excerpt,
+                    content=node.raw_code,
+                    symbol=None,
+                    metadata={"lsp_node_id": node_id, "lsp_kind": node.kind},
+                    start_line=node.range.start_line,
+                    end_line=node.range.end_line,
+                    symbol_name=node.name,
+                    symbol_kind=node.kind,
+                ))
+
+            # Sort by score
+            results.sort(key=lambda r: r.score, reverse=True)
+            return results[:limit]
+
+        except Exception as exc:
+            self.logger.debug("LSP graph search error: %s", exc)
+            return []