mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
- Add `association_tree` module with components for building and processing call association trees using LSP call hierarchy capabilities. - Introduce `AssociationTreeBuilder` for constructing call trees from seed locations with depth-first expansion. - Create data structures: `TreeNode`, `CallTree`, and `UniqueNode` for representing nodes and relationships in the call tree. - Implement `ResultDeduplicator` to extract unique nodes from call trees and assign relevance scores based on depth, frequency, and kind. - Add unit tests for `AssociationTreeBuilder` and `ResultDeduplicator` to ensure functionality and correctness.
327 lines
11 KiB
Python
327 lines
11 KiB
Python
"""Search method comparison benchmark.
|
|
|
|
Compares different search strategies:
|
|
1. Pure FTS (exact + fuzzy matching)
|
|
2. Pure Vector (semantic search only)
|
|
3. Hybrid Fusion (FTS + Vector with RRF)
|
|
4. Vector + LSP Association Tree (new strategy)
|
|
|
|
Usage:
|
|
python examples/search_comparison_benchmark.py
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import time
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any
|
|
|
|
from codexlens.config import Config
|
|
from codexlens.entities import SearchResult
|
|
from codexlens.search.hybrid_search import HybridSearchEngine
|
|
from codexlens.lsp.standalone_manager import StandaloneLspManager
|
|
from codexlens.search.association_tree import AssociationTreeBuilder, ResultDeduplicator
|
|
|
|
|
|
class SearchBenchmark:
|
|
"""Benchmark different search strategies."""
|
|
|
|
def __init__(self, index_path: Path, config: Config):
|
|
"""Initialize benchmark.
|
|
|
|
Args:
|
|
index_path: Path to _index.db file
|
|
config: CodexLens config
|
|
"""
|
|
self.index_path = index_path
|
|
self.config = config
|
|
self.engine = HybridSearchEngine(config=config)
|
|
self.lsp_manager: StandaloneLspManager | None = None
|
|
self.tree_builder: AssociationTreeBuilder | None = None
|
|
self.deduplicator = ResultDeduplicator(
|
|
depth_weight=0.4,
|
|
frequency_weight=0.3,
|
|
kind_weight=0.3,
|
|
max_depth_penalty=10,
|
|
)
|
|
|
|
async def setup_lsp(self):
|
|
"""Setup LSP manager for association tree search."""
|
|
self.lsp_manager = StandaloneLspManager(
|
|
workspace_root=str(self.index_path.parent),
|
|
timeout=5.0,
|
|
)
|
|
await self.lsp_manager.start()
|
|
self.tree_builder = AssociationTreeBuilder(
|
|
lsp_manager=self.lsp_manager,
|
|
timeout=5.0,
|
|
)
|
|
|
|
async def cleanup_lsp(self):
|
|
"""Cleanup LSP manager."""
|
|
if self.lsp_manager:
|
|
await self.lsp_manager.stop()
|
|
|
|
def method1_pure_fts(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
|
|
"""Method 1: Pure FTS (exact + fuzzy)."""
|
|
start = time.perf_counter()
|
|
results = self.engine.search(
|
|
index_path=self.index_path,
|
|
query=query,
|
|
limit=limit,
|
|
enable_fuzzy=True,
|
|
enable_vector=False,
|
|
pure_vector=False,
|
|
)
|
|
elapsed = time.perf_counter() - start
|
|
return results, elapsed
|
|
|
|
def method2_pure_vector(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
|
|
"""Method 2: Pure Vector (semantic search only)."""
|
|
start = time.perf_counter()
|
|
results = self.engine.search(
|
|
index_path=self.index_path,
|
|
query=query,
|
|
limit=limit,
|
|
enable_fuzzy=False,
|
|
enable_vector=True,
|
|
pure_vector=True,
|
|
)
|
|
elapsed = time.perf_counter() - start
|
|
return results, elapsed
|
|
|
|
def method3_hybrid_fusion(self, query: str, limit: int = 20) -> tuple[List[SearchResult], float]:
|
|
"""Method 3: Hybrid Fusion (FTS + Vector with RRF)."""
|
|
start = time.perf_counter()
|
|
results = self.engine.search(
|
|
index_path=self.index_path,
|
|
query=query,
|
|
limit=limit,
|
|
enable_fuzzy=True,
|
|
enable_vector=True,
|
|
pure_vector=False,
|
|
)
|
|
elapsed = time.perf_counter() - start
|
|
return results, elapsed
|
|
|
|
async def method4_vector_lsp_tree(
|
|
self,
|
|
query: str,
|
|
limit: int = 20,
|
|
max_depth: int = 3,
|
|
expand_callers: bool = True,
|
|
expand_callees: bool = True,
|
|
) -> tuple[List[SearchResult], float, Dict[str, Any]]:
|
|
"""Method 4: Vector + LSP Association Tree (new strategy).
|
|
|
|
Steps:
|
|
1. Vector search to find seed results (top 5-10)
|
|
2. For each seed, build LSP association tree
|
|
3. Deduplicate and score all discovered nodes
|
|
4. Return top N results
|
|
|
|
Args:
|
|
query: Search query
|
|
limit: Final result limit
|
|
max_depth: Maximum depth for LSP tree expansion
|
|
expand_callers: Whether to expand incoming calls
|
|
expand_callees: Whether to expand outgoing calls
|
|
|
|
Returns:
|
|
Tuple of (results, elapsed_time, stats)
|
|
"""
|
|
if not self.tree_builder:
|
|
raise RuntimeError("LSP not initialized. Call setup_lsp() first.")
|
|
|
|
start = time.perf_counter()
|
|
stats = {
|
|
"seed_count": 0,
|
|
"trees_built": 0,
|
|
"total_tree_nodes": 0,
|
|
"unique_nodes": 0,
|
|
"dedup_time_ms": 0,
|
|
}
|
|
|
|
# Step 1: Get seed results from vector search (top 10)
|
|
seed_results = self.engine.search(
|
|
index_path=self.index_path,
|
|
query=query,
|
|
limit=10,
|
|
enable_fuzzy=False,
|
|
enable_vector=True,
|
|
pure_vector=True,
|
|
)
|
|
stats["seed_count"] = len(seed_results)
|
|
|
|
if not seed_results:
|
|
return [], time.perf_counter() - start, stats
|
|
|
|
# Step 2: Build association trees for each seed
|
|
all_trees = []
|
|
for seed in seed_results:
|
|
try:
|
|
tree = await self.tree_builder.build_tree(
|
|
seed_file_path=seed.path,
|
|
seed_line=seed.start_line or 1,
|
|
seed_character=1,
|
|
max_depth=max_depth,
|
|
expand_callers=expand_callers,
|
|
expand_callees=expand_callees,
|
|
)
|
|
if tree.node_list:
|
|
all_trees.append(tree)
|
|
stats["trees_built"] += 1
|
|
stats["total_tree_nodes"] += len(tree.node_list)
|
|
except Exception as e:
|
|
print(f"Error building tree for {seed.path}:{seed.start_line}: {e}")
|
|
continue
|
|
|
|
if not all_trees:
|
|
# Fallback to seed results if no trees built
|
|
return seed_results[:limit], time.perf_counter() - start, stats
|
|
|
|
# Step 3: Merge and deduplicate all trees
|
|
dedup_start = time.perf_counter()
|
|
|
|
# Merge all node_lists into a single CallTree
|
|
from codexlens.search.association_tree.data_structures import CallTree
|
|
merged_tree = CallTree()
|
|
for tree in all_trees:
|
|
merged_tree.node_list.extend(tree.node_list)
|
|
|
|
# Deduplicate
|
|
unique_nodes = self.deduplicator.deduplicate(
|
|
tree=merged_tree,
|
|
max_results=limit,
|
|
)
|
|
stats["unique_nodes"] = len(unique_nodes)
|
|
stats["dedup_time_ms"] = (time.perf_counter() - dedup_start) * 1000
|
|
|
|
# Step 4: Convert UniqueNode to SearchResult
|
|
results = []
|
|
for node in unique_nodes:
|
|
# Use node.score as the search score
|
|
result = SearchResult(
|
|
path=node.file_path,
|
|
score=node.score,
|
|
start_line=node.range.start_line,
|
|
end_line=node.range.end_line,
|
|
symbol_name=node.name,
|
|
symbol_kind=node.kind,
|
|
content="", # LSP doesn't provide content
|
|
metadata={"search_source": "lsp_tree"},
|
|
)
|
|
results.append(result)
|
|
|
|
elapsed = time.perf_counter() - start
|
|
return results, elapsed, stats
|
|
|
|
def print_results(self, method_name: str, results: List[SearchResult], elapsed: float, stats: Dict[str, Any] | None = None):
|
|
"""Print benchmark results."""
|
|
print(f"\n{'='*80}")
|
|
print(f"Method: {method_name}")
|
|
print(f"{'='*80}")
|
|
print(f"Time: {elapsed*1000:.2f}ms")
|
|
print(f"Results: {len(results)}")
|
|
|
|
if stats:
|
|
print(f"\nStats:")
|
|
for key, value in stats.items():
|
|
print(f" {key}: {value}")
|
|
|
|
print(f"\nTop 5 Results:")
|
|
for i, result in enumerate(results[:5], 1):
|
|
print(f"{i}. [{result.score:.4f}] {result.path}:{result.start_line}")
|
|
if result.symbol_name:
|
|
print(f" Name: {result.symbol_name}")
|
|
if result.metadata.get("search_source"):
|
|
print(f" Source: {result.metadata.get('search_source')}")
|
|
|
|
async def run_comparison(self, query: str, limit: int = 20):
|
|
"""Run comparison for a single query."""
|
|
print(f"\n{'#'*80}")
|
|
print(f"Query: {query}")
|
|
print(f"{'#'*80}")
|
|
|
|
# Method 1: Pure FTS
|
|
results1, time1 = self.method1_pure_fts(query, limit)
|
|
self.print_results("Method 1: Pure FTS", results1, time1)
|
|
|
|
# Method 2: Pure Vector
|
|
results2, time2 = self.method2_pure_vector(query, limit)
|
|
self.print_results("Method 2: Pure Vector", results2, time2)
|
|
|
|
# Method 3: Hybrid Fusion
|
|
results3, time3 = self.method3_hybrid_fusion(query, limit)
|
|
self.print_results("Method 3: Hybrid Fusion (FTS+Vector)", results3, time3)
|
|
|
|
# Method 4: Vector + LSP Tree (requires LSP setup)
|
|
results4 = None
|
|
time4 = 0.0
|
|
try:
|
|
results4, time4, stats4 = await self.method4_vector_lsp_tree(query, limit, max_depth=3)
|
|
self.print_results("Method 4: Vector + LSP Association Tree", results4, time4, stats4)
|
|
except Exception as e:
|
|
print(f"\nMethod 4: Vector + LSP Association Tree")
|
|
print(f"Error: {e}")
|
|
|
|
# Comparison summary
|
|
print(f"\n{'='*80}")
|
|
print(f"Summary")
|
|
print(f"{'='*80}")
|
|
print(f"Method 1 (FTS): {time1*1000:8.2f}ms {len(results1):3d} results")
|
|
print(f"Method 2 (Vector): {time2*1000:8.2f}ms {len(results2):3d} results")
|
|
print(f"Method 3 (Hybrid): {time3*1000:8.2f}ms {len(results3):3d} results")
|
|
if results4 is not None:
|
|
print(f"Method 4 (Vector+LSP): {time4*1000:8.2f}ms {len(results4):3d} results")
|
|
|
|
|
|
async def main():
|
|
"""Main benchmark entry point."""
|
|
# Setup - use the actual index path from ~/.codexlens/indexes/
|
|
import os
|
|
codexlens_home = Path(os.path.expanduser("~/.codexlens"))
|
|
index_path = codexlens_home / "indexes/D/Claude_dms3/codex-lens/src/codexlens/_index.db"
|
|
|
|
if not index_path.exists():
|
|
print(f"Error: Index not found at {index_path}")
|
|
print("Please run: python -m codexlens index init src")
|
|
return
|
|
|
|
project_root = Path("D:/Claude_dms3/codex-lens/src")
|
|
|
|
config = Config()
|
|
benchmark = SearchBenchmark(index_path, config)
|
|
|
|
# Test queries
|
|
queries = [
|
|
"vector search implementation",
|
|
"LSP call hierarchy",
|
|
"search result ranking",
|
|
"index building",
|
|
]
|
|
|
|
# Setup LSP for Method 4
|
|
print("Setting up LSP manager...")
|
|
try:
|
|
await benchmark.setup_lsp()
|
|
print("LSP manager ready")
|
|
except Exception as e:
|
|
print(f"Warning: Could not setup LSP: {e}")
|
|
print("Method 4 will be skipped")
|
|
|
|
try:
|
|
# Run benchmarks
|
|
for query in queries:
|
|
await benchmark.run_comparison(query, limit=20)
|
|
|
|
finally:
|
|
# Cleanup
|
|
await benchmark.cleanup_lsp()
|
|
print("\nBenchmark complete")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|