"""Hybrid search engine orchestrating parallel exact/fuzzy/vector searches with RRF fusion.
|
|
|
|
Coordinates multiple search backends in parallel using ThreadPoolExecutor and combines
|
|
results via Reciprocal Rank Fusion (RRF) algorithm.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError, as_completed
|
|
from contextlib import contextmanager
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
|
|
@contextmanager
|
|
def timer(name: str, logger: logging.Logger, level: int = logging.DEBUG):
|
|
"""Context manager for timing code blocks.
|
|
|
|
Args:
|
|
name: Name of the operation being timed
|
|
logger: Logger instance to use
|
|
level: Logging level (default DEBUG)
|
|
"""
|
|
start = time.perf_counter()
|
|
try:
|
|
yield
|
|
finally:
|
|
elapsed_ms = (time.perf_counter() - start) * 1000
|
|
logger.log(level, "[TIMING] %s: %.2fms", name, elapsed_ms)
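
# Illustrative only: a minimal sketch of how the `timer` context manager above
# is meant to be used. The logger name and the sleep stand-in are hypothetical
# and are not used anywhere by the engine itself.
def _timer_usage_sketch() -> None:
    example_logger = logging.getLogger("codexlens.example")
    with timer("demo_operation", example_logger, level=logging.INFO):
        time.sleep(0.01)  # stand-in for the work being timed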

from codexlens.config import Config
from codexlens.config import VECTORS_HNSW_NAME
from codexlens.entities import SearchResult
from codexlens.search.ranking import (
    DEFAULT_WEIGHTS,
    FTS_FALLBACK_WEIGHTS,
    QueryIntent,
    apply_symbol_boost,
    cross_encoder_rerank,
    detect_query_intent,
    filter_results_by_category,
    get_rrf_weights,
    reciprocal_rank_fusion,
    rerank_results,
    simple_weighted_fusion,
    tag_search_source,
)
from codexlens.storage.dir_index import DirIndexStore

# Optional LSP imports (for real-time graph expansion)
try:
    from codexlens.lsp import LspBridge, LspGraphBuilder
    HAS_LSP = True
except ImportError:
    HAS_LSP = False


# Three-way fusion weights (FTS + Vector + SPLADE)
THREE_WAY_WEIGHTS = {
    "exact": 0.2,
    "splade": 0.3,
    "vector": 0.5,
}
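
# Illustrative only: a self-contained sketch of weighted Reciprocal Rank Fusion,
# included to make the fusion step concrete. The real implementation used by the
# engine is codexlens.search.ranking.reciprocal_rank_fusion; the ranked lists,
# weights, and k below are hypothetical.
def _weighted_rrf_sketch() -> Dict[str, float]:
    ranked_lists = {
        "exact": ["a.py", "b.py", "c.py"],
        "vector": ["b.py", "d.py", "a.py"],
    }
    weights = {"exact": 0.4, "vector": 0.6}
    k = 60  # damping constant: larger k flattens the rank contribution
    scores: Dict[str, float] = {}
    for source, ranked in ranked_lists.items():
        for rank, doc in enumerate(ranked, start=1):
            scores[doc] = scores.get(doc, 0.0) + weights[source] / (k + rank)
    # b.py scores highest here: it is ranked well by both sources.
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))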


class HybridSearchEngine:
    """Hybrid search engine with parallel execution and RRF fusion.

    Orchestrates searches across exact FTS, fuzzy FTS, and optional vector backends,
    executing them in parallel and fusing results via Reciprocal Rank Fusion.

    Attributes:
        logger: Python logger instance
        weights: RRF weights applied to each source during fusion
    """

    # NOTE: DEFAULT_WEIGHTS imported from ranking.py - single source of truth
    # Default RRF weights: SPLADE-based hybrid (splade: 0.4, vector: 0.6)
    # FTS fallback mode uses FTS_FALLBACK_WEIGHTS (exact: 0.3, fuzzy: 0.1, vector: 0.6)

    def __init__(
        self,
        weights: Optional[Dict[str, float]] = None,
        config: Optional[Config] = None,
        embedder: Any = None,
    ):
        """Initialize hybrid search engine.

        Args:
            weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS)
            config: Optional runtime config (enables optional reranking features)
            embedder: Optional embedder instance for embedding-based reranking

        Raises:
            TypeError: If weights is not a dict (e.g., if a Path is passed)
        """
        self.logger = logging.getLogger(__name__)

        # Validate weights type to catch common usage errors
        if weights is not None and not isinstance(weights, dict):
            raise TypeError(
                f"weights must be a dict, got {type(weights).__name__}. "
                f"Did you mean to pass index_path to search() instead of __init__()?"
            )

        self.weights = weights or DEFAULT_WEIGHTS.copy()
        self._config = config
        self.embedder = embedder
        self.reranker: Any = None
        self._use_gpu = config.embedding_use_gpu if config else True

    def search(
        self,
        index_path: Path,
        query: str,
        limit: int = 20,
        enable_fuzzy: bool = True,
        enable_vector: bool = False,
        pure_vector: bool = False,
        enable_splade: bool = False,
        enable_lsp_graph: bool = False,
        lsp_max_depth: int = 1,
        lsp_max_nodes: int = 20,
    ) -> List[SearchResult]:
        """Execute hybrid search with parallel retrieval and RRF fusion.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string (for FTS) or natural language query (for vector)
            limit: Maximum results to return after fusion
            enable_fuzzy: Enable fuzzy FTS search (default True)
            enable_vector: Enable vector search (default False)
            pure_vector: If True, only use vector search without FTS fallback (default False)
            enable_splade: If True, force SPLADE sparse neural search (default False)
            enable_lsp_graph: If True, enable real-time LSP graph expansion (default False)
            lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1)
            lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20)

        Returns:
            List of SearchResult objects sorted by fusion score

        Examples:
            >>> engine = HybridSearchEngine()
            >>> # Hybrid search (exact + fuzzy + vector)
            >>> results = engine.search(Path("project/_index.db"), "authentication",
            ...                         enable_vector=True)
            >>> # Pure vector search (semantic only)
            >>> results = engine.search(Path("project/_index.db"),
            ...                         "how to authenticate users",
            ...                         enable_vector=True, pure_vector=True)
            >>> # SPLADE sparse neural search
            >>> results = engine.search(Path("project/_index.db"), "auth flow",
            ...                         enable_splade=True, enable_vector=True)
            >>> # With LSP graph expansion (real-time)
            >>> results = engine.search(Path("project/_index.db"), "auth flow",
            ...                         enable_vector=True, enable_lsp_graph=True)
            >>> for r in results[:5]:
            ...     print(f"{r.path}: {r.score:.3f}")
        """
        # Defensive: avoid creating/locking an index database when callers pass
        # an empty placeholder file (common in tests and misconfigured callers).
        try:
            if index_path.exists() and index_path.stat().st_size == 0:
                return []
        except OSError:
            return []

        # Detect query intent early for category filtering at index level
        query_intent = detect_query_intent(query)
        # Map intent to category for vector search:
        # - KEYWORD (code intent) -> filter to 'code' only
        # - SEMANTIC (doc intent) -> no filter (allow docs to surface)
        # - MIXED -> no filter (allow all)
        vector_category: Optional[str] = None
        if query_intent == QueryIntent.KEYWORD:
            vector_category = "code"

        # Determine which backends to use
        backends = {}

        # Check if SPLADE is available
        splade_available = False
        # Respect config.enable_splade flag and use_fts_fallback flag
        if self._config and getattr(self._config, 'use_fts_fallback', False):
            # Config explicitly requests FTS fallback - disable SPLADE
            splade_available = False
        elif self._config and not getattr(self._config, 'enable_splade', True):
            # Config explicitly disabled SPLADE
            splade_available = False
        else:
            # Check if SPLADE dependencies are available
            try:
                from codexlens.semantic.splade_encoder import check_splade_available
                ok, _ = check_splade_available()
                if ok:
                    # SPLADE tables are in main index database, will check table existence in _search_splade
                    splade_available = True
            except Exception:
                pass

        if pure_vector:
            # Pure vector mode: only use vector search, no FTS fallback
            if enable_vector:
                backends["vector"] = True
            else:
                # Invalid configuration: pure_vector=True but enable_vector=False
                self.logger.warning(
                    "pure_vector=True requires enable_vector=True. "
                    "Falling back to exact search. "
                    "To use pure vector search, enable vector search mode."
                )
                backends["exact"] = True
        elif enable_splade:
            # Explicit SPLADE mode requested via CLI --method splade
            if splade_available:
                backends["splade"] = True
                if enable_vector:
                    backends["vector"] = True
            else:
                # SPLADE requested but not available - warn and fall back
                self.logger.warning(
                    "SPLADE search requested but not available. "
                    "Falling back to FTS. Run 'codexlens index splade' to enable."
                )
                backends["exact"] = True
                if enable_fuzzy:
                    backends["fuzzy"] = True
                if enable_vector:
                    backends["vector"] = True
        else:
            # Hybrid mode: default to SPLADE if available, otherwise use FTS
            if splade_available:
                # Default: enable SPLADE, disable exact and fuzzy
                backends["splade"] = True
                if enable_vector:
                    backends["vector"] = True
            else:
                # Fallback mode: enable exact+fuzzy when SPLADE unavailable
                backends["exact"] = True
                if enable_fuzzy:
                    backends["fuzzy"] = True
                if enable_vector:
                    backends["vector"] = True

        # Add LSP graph expansion if requested and available
        if enable_lsp_graph and HAS_LSP:
            backends["lsp_graph"] = True
        elif enable_lsp_graph and not HAS_LSP:
            self.logger.warning(
                "LSP graph search requested but dependencies not available. "
                "Install: pip install aiohttp"
            )

        # Execute parallel searches
        with timer("parallel_search_total", self.logger):
            results_map = self._search_parallel(
                index_path, query, backends, limit, vector_category,
                lsp_max_depth, lsp_max_nodes
            )

        # Provide helpful message if pure-vector mode returns no results
        if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
            self.logger.warning(
                "Pure vector search returned no results. "
                "This usually means embeddings haven't been generated. "
                "Run: codexlens embeddings-generate %s",
                index_path.parent if index_path.name == "_index.db" else index_path
            )

        # Apply RRF fusion
        # Filter weights to only active backends
        active_weights = {
            source: weight
            for source, weight in self.weights.items()
            if source in results_map
        }

        # Determine fusion method from config (default: rrf)
        fusion_method = "rrf"
        rrf_k = 60
        if self._config is not None:
            fusion_method = getattr(self._config, "fusion_method", "rrf") or "rrf"
            rrf_k = getattr(self._config, "rrf_k", 60) or 60

        with timer("fusion", self.logger):
            adaptive_weights = get_rrf_weights(query, active_weights)
            if fusion_method == "simple":
                fused_results = simple_weighted_fusion(results_map, adaptive_weights)
            else:
                # Default to RRF
                fused_results = reciprocal_rank_fusion(
                    results_map, adaptive_weights, k=rrf_k
                )

        # Optional: boost results that include explicit symbol matches
        boost_factor = (
            self._config.symbol_boost_factor
            if self._config is not None
            else 1.5
        )
        with timer("symbol_boost", self.logger):
            fused_results = apply_symbol_boost(
                fused_results, boost_factor=boost_factor
            )

        # Optional: embedding-based reranking on top results
        if self._config is not None and self._config.enable_reranking:
            with timer("reranking", self.logger):
                if self.embedder is None:
                    self.embedder = self._get_reranking_embedder()
                fused_results = rerank_results(
                    query,
                    fused_results[:100],
                    self.embedder,
                    top_k=(
                        100
                        if self._config.enable_cross_encoder_rerank
                        else self._config.reranking_top_k
                    ),
                )

        # Optional: cross-encoder reranking as a second stage
        if (
            self._config is not None
            and self._config.enable_reranking
            and self._config.enable_cross_encoder_rerank
        ):
            with timer("cross_encoder_rerank", self.logger):
                if self.reranker is None:
                    self.reranker = self._get_cross_encoder_reranker()
                if self.reranker is not None:
                    fused_results = cross_encoder_rerank(
                        query,
                        fused_results,
                        self.reranker,
                        top_k=self._config.reranker_top_k,
                    )

        # Apply category filtering to avoid code/doc pollution
        # This ensures KEYWORD queries return code files, SEMANTIC queries prefer docs
        enable_category_filter = (
            self._config is None
            or getattr(self._config, 'enable_category_filter', True)
        )
        if enable_category_filter and not pure_vector:
            with timer("category_filter", self.logger):
                query_intent = detect_query_intent(query)
                fused_results = filter_results_by_category(
                    fused_results, query_intent, allow_mixed=True
                )

        # Apply final limit
        return fused_results[:limit]
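        # Usage sketch (illustrative; attribute names follow the getattr calls
        # above, concrete values are hypothetical): fusion behaviour is driven
        # by the optional Config passed to __init__, e.g.
        #
        #   >>> cfg = Config()                    # assumes Config() defaults
        #   >>> cfg.fusion_method = "simple"      # or "rrf" (the default)
        #   >>> cfg.rrf_k = 30                    # smaller k favours top-ranked hits
        #   >>> engine = HybridSearchEngine(config=cfg)
        #   >>> hits = engine.search(Path("project/_index.db"), "token refresh")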

    def _get_reranking_embedder(self) -> Any:
        """Create an embedder for reranking based on Config embedding settings."""
        if self._config is None:
            return None

        try:
            from codexlens.semantic.factory import get_embedder
        except Exception as exc:
            self.logger.debug("Reranking embedder unavailable: %s", exc)
            return None

        try:
            if self._config.embedding_backend == "fastembed":
                return get_embedder(
                    backend="fastembed",
                    profile=self._config.embedding_model,
                    use_gpu=self._config.embedding_use_gpu,
                )
            if self._config.embedding_backend == "litellm":
                return get_embedder(
                    backend="litellm",
                    model=self._config.embedding_model,
                    endpoints=self._config.embedding_endpoints,
                    strategy=self._config.embedding_strategy,
                    cooldown=self._config.embedding_cooldown,
                )
        except Exception as exc:
            self.logger.debug("Failed to initialize reranking embedder: %s", exc)
            return None

        self.logger.debug(
            "Unknown embedding backend for reranking: %s",
            self._config.embedding_backend,
        )
        return None

    def _get_cross_encoder_reranker(self) -> Any:
        """Create a cross-encoder reranker based on Config reranker settings."""
        if self._config is None:
            return None

        try:
            from codexlens.semantic.reranker import (
                check_reranker_available,
                get_reranker,
            )
        except Exception as exc:
            self.logger.debug("Reranker factory unavailable: %s", exc)
            return None

        backend = (getattr(self._config, "reranker_backend", "") or "").strip().lower() or "onnx"

        ok, err = check_reranker_available(backend)
        if not ok:
            self.logger.debug(
                "Reranker backend unavailable (backend=%s): %s",
                backend,
                err,
            )
            return None

        try:
            model_name = (getattr(self._config, "reranker_model", "") or "").strip() or None

            if backend != "legacy" and model_name == "cross-encoder/ms-marco-MiniLM-L-6-v2":
                model_name = None

            device: str | None = None
            kwargs: dict[str, Any] = {}

            if backend == "onnx":
                kwargs["use_gpu"] = bool(getattr(self._config, "embedding_use_gpu", True))
            elif backend == "legacy":
                if not bool(getattr(self._config, "embedding_use_gpu", True)):
                    device = "cpu"
            elif backend == "api":
                # Pass max_input_tokens for adaptive batching
                max_tokens = getattr(self._config, "reranker_max_input_tokens", None)
                if max_tokens:
                    kwargs["max_input_tokens"] = max_tokens

            return get_reranker(
                backend=backend,
                model_name=model_name,
                device=device,
                **kwargs,
            )
        except Exception as exc:
            self.logger.debug(
                "Failed to initialize reranker (backend=%s): %s",
                backend,
                exc,
            )
            return None

    def _search_parallel(
        self,
        index_path: Path,
        query: str,
        backends: Dict[str, bool],
        limit: int,
        category: Optional[str] = None,
        lsp_max_depth: int = 1,
        lsp_max_nodes: int = 20,
    ) -> Dict[str, List[SearchResult]]:
        """Execute parallel searches across enabled backends.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            backends: Dictionary of backend name to enabled flag
            limit: Results limit per backend
            category: Optional category filter for vector search ('code' or 'doc')
            lsp_max_depth: Maximum depth for LSP graph BFS expansion (default 1)
            lsp_max_nodes: Maximum nodes to collect in LSP graph (default 20)

        Returns:
            Dictionary mapping source name to results list
        """
        results_map: Dict[str, List[SearchResult]] = {}
        timing_data: Dict[str, float] = {}

        # Use ThreadPoolExecutor for parallel I/O-bound searches
        with ThreadPoolExecutor(max_workers=len(backends)) as executor:
            # Submit search tasks with timing
            future_to_source = {}
            submit_times = {}

            if backends.get("exact"):
                submit_times["exact"] = time.perf_counter()
                future = executor.submit(
                    self._search_exact, index_path, query, limit
                )
                future_to_source[future] = "exact"

            if backends.get("fuzzy"):
                submit_times["fuzzy"] = time.perf_counter()
                future = executor.submit(
                    self._search_fuzzy, index_path, query, limit
                )
                future_to_source[future] = "fuzzy"

            if backends.get("vector"):
                submit_times["vector"] = time.perf_counter()
                future = executor.submit(
                    self._search_vector, index_path, query, limit, category
                )
                future_to_source[future] = "vector"

            if backends.get("splade"):
                submit_times["splade"] = time.perf_counter()
                future = executor.submit(
                    self._search_splade, index_path, query, limit
                )
                future_to_source[future] = "splade"

            if backends.get("lsp_graph"):
                submit_times["lsp_graph"] = time.perf_counter()
                future = executor.submit(
                    self._search_lsp_graph, index_path, query, limit,
                    lsp_max_depth, lsp_max_nodes
                )
                future_to_source[future] = "lsp_graph"

            # Collect results as they complete with timeout protection
            try:
                for future in as_completed(future_to_source, timeout=30.0):
                    source = future_to_source[future]
                    elapsed_ms = (time.perf_counter() - submit_times[source]) * 1000
                    timing_data[source] = elapsed_ms
                    try:
                        results = future.result(timeout=10.0)
                        # Tag results with source for debugging
                        tagged_results = tag_search_source(results, source)
                        results_map[source] = tagged_results
                        self.logger.debug(
                            "[TIMING] %s_search: %.2fms (%d results)",
                            source, elapsed_ms, len(results)
                        )
                    except (Exception, FuturesTimeoutError) as exc:
                        self.logger.error("Search failed for %s: %s", source, exc)
                        results_map[source] = []
            except FuturesTimeoutError:
                self.logger.warning("Search timeout: some backends did not respond in time")
                # Cancel remaining futures
                for future in future_to_source:
                    future.cancel()
                # Set empty results for sources that didn't complete
                for source in backends:
                    if source not in results_map:
                        results_map[source] = []

        # Log timing summary
        if timing_data:
            timing_str = ", ".join(f"{k}={v:.1f}ms" for k, v in timing_data.items())
            self.logger.debug("[TIMING] search_backends: {%s}", timing_str)

        return results_map
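        # Minimal sketch (illustrative, not used by the engine) of the fan-out /
        # collect pattern implemented above: one future per backend, drained via
        # as_completed() under an overall deadline plus a per-future timeout.
        #
        #   >>> from concurrent.futures import ThreadPoolExecutor, as_completed
        #   >>> def fan_out(tasks):  # tasks: dict of name -> zero-arg callable
        #   ...     out = {}
        #   ...     with ThreadPoolExecutor(max_workers=len(tasks)) as pool:
        #   ...         futures = {pool.submit(fn): name for name, fn in tasks.items()}
        #   ...         for fut in as_completed(futures, timeout=30.0):
        #   ...             out[futures[fut]] = fut.result(timeout=10.0)
        #   ...     return out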

    def _search_exact(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
        """Execute exact FTS search using unicode61 tokenizer.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            limit: Maximum results

        Returns:
            List of SearchResult objects
        """
        try:
            with DirIndexStore(index_path) as store:
                return store.search_fts_exact(
                    query, limit=limit, return_full_content=True
                )
        except Exception as exc:
            self.logger.debug("Exact search error: %s", exc)
            return []

    def _search_fuzzy(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
        """Execute fuzzy FTS search using trigram/extended unicode61 tokenizer.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            limit: Maximum results

        Returns:
            List of SearchResult objects
        """
        try:
            with DirIndexStore(index_path) as store:
                return store.search_fts_fuzzy(
                    query, limit=limit, return_full_content=True
                )
        except Exception as exc:
            self.logger.debug("Fuzzy search error: %s", exc)
            return []

    def _find_vectors_hnsw(self, index_path: Path) -> Optional[Path]:
        """Find the centralized _vectors.hnsw file by traversing up from index_path.

        Similar to _search_splade's approach, this method searches for the
        centralized dense vector index file in parent directories.

        Args:
            index_path: Path to the current _index.db file

        Returns:
            Path to _vectors.hnsw if found, None otherwise
        """
        current_dir = index_path.parent
        for _ in range(10):  # Limit search depth
            candidate = current_dir / VECTORS_HNSW_NAME
            if candidate.exists():
                return candidate
            parent = current_dir.parent
            if parent == current_dir:  # Reached root
                break
            current_dir = parent
        return None

    def _search_vector_centralized(
        self,
        index_path: Path,
        hnsw_path: Path,
        query: str,
        limit: int,
        category: Optional[str] = None,
    ) -> List[SearchResult]:
        """Search using centralized vector index.

        Args:
            index_path: Path to _index.db file (for metadata lookup)
            hnsw_path: Path to centralized _vectors.hnsw file
            query: Natural language query string
            limit: Maximum results
            category: Optional category filter ('code' or 'doc')

        Returns:
            List of SearchResult objects ordered by semantic similarity
        """
        try:
            import sqlite3
            import json
            from codexlens.semantic.factory import get_embedder
            from codexlens.semantic.ann_index import ANNIndex

            # Get model config from the first index database we can find
            # (all indexes should use the same embedding model)
            index_root = hnsw_path.parent
            model_config = None

            # Try to get model config from the centralized index root first
            # (not the sub-directory index_path, which may have outdated config)
            try:
                from codexlens.semantic.vector_store import VectorStore
                central_index_path = index_root / "_index.db"
                if central_index_path.exists():
                    with VectorStore(central_index_path) as vs:
                        model_config = vs.get_model_config()
                        self.logger.debug(
                            "Loaded model config from centralized index: %s",
                            model_config
                        )
            except Exception as e:
                self.logger.debug("Failed to load model config from centralized index: %s", e)

            # Detect dimension from HNSW file if model config not found
            if model_config is None:
                self.logger.debug("Model config not found, will detect from HNSW index")
                # Create a temporary ANNIndex to load and detect dimension
                # We need to know the dimension to properly load the index

            # Get embedder based on model config or default
            if model_config:
                backend = model_config.get("backend", "fastembed")
                model_name = model_config["model_name"]
                model_profile = model_config["model_profile"]
                embedding_dim = model_config["embedding_dim"]

                if backend == "litellm":
                    embedder = get_embedder(backend="litellm", model=model_name)
                else:
                    embedder = get_embedder(backend="fastembed", profile=model_profile)
            else:
                # Default to code profile
                embedder = get_embedder(backend="fastembed", profile="code")
                embedding_dim = embedder.embedding_dim

            # Load centralized ANN index
            start_load = time.perf_counter()
            ann_index = ANNIndex.create_central(
                index_root=index_root,
                dim=embedding_dim,
            )
            if not ann_index.load():
                self.logger.warning("Failed to load centralized vector index from %s", hnsw_path)
                return []
            self.logger.debug(
                "[TIMING] central_ann_load: %.2fms (%d vectors)",
                (time.perf_counter() - start_load) * 1000,
                ann_index.count()
            )

            # Generate query embedding
            start_embed = time.perf_counter()
            query_embedding = embedder.embed_single(query)
            self.logger.debug(
                "[TIMING] query_embedding: %.2fms",
                (time.perf_counter() - start_embed) * 1000
            )

            # Search ANN index
            start_search = time.perf_counter()
            import numpy as np
            query_vec = np.array(query_embedding, dtype=np.float32)
            ids, distances = ann_index.search(query_vec, top_k=limit * 2)  # Fetch extra for filtering
            self.logger.debug(
                "[TIMING] central_ann_search: %.2fms (%d results)",
                (time.perf_counter() - start_search) * 1000,
                len(ids) if ids else 0
            )

            if not ids:
                return []

            # Convert distances to similarity scores (for cosine: score = 1 - distance)
            scores = [1.0 - d for d in distances]

            # Fetch chunk metadata from semantic_chunks tables
            # We need to search across all _index.db files in the project
            results = self._fetch_chunks_by_ids_centralized(
                index_root, ids, scores, category
            )

            return results[:limit]

        except ImportError as exc:
            self.logger.debug("Semantic dependencies not available: %s", exc)
            return []
        except Exception as exc:
            self.logger.error("Centralized vector search error: %s", exc)
            return []
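        # Worked note (illustrative): the ANN index is assumed to return cosine
        # distances, so similarity is recovered above as score = 1 - distance.
        # E.g. a distance of 0.25 maps to a similarity of 0.75, and an exact
        # match (distance 0.0) maps to 1.0.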

    def _fetch_chunks_by_ids_centralized(
        self,
        index_root: Path,
        chunk_ids: List[int],
        scores: List[float],
        category: Optional[str] = None,
    ) -> List[SearchResult]:
        """Fetch chunk metadata from centralized _vectors_meta.db for fast lookup.

        This method uses the centralized VectorMetadataStore for O(1) lookup
        instead of traversing all _index.db files (O(n) where n = number of indexes).

        Falls back to the legacy per-index lookup if centralized metadata is unavailable.

        Args:
            index_root: Root directory containing _vectors_meta.db
            chunk_ids: List of chunk IDs from ANN search
            scores: Corresponding similarity scores
            category: Optional category filter

        Returns:
            List of SearchResult objects
        """
        from codexlens.config import VECTORS_META_DB_NAME

        # Build score map
        score_map = {cid: score for cid, score in zip(chunk_ids, scores)}

        # Try centralized metadata store first (fast path)
        vectors_meta_path = index_root / VECTORS_META_DB_NAME
        if vectors_meta_path.exists():
            try:
                return self._fetch_from_vector_meta_store(
                    vectors_meta_path, chunk_ids, score_map, category
                )
            except Exception as e:
                self.logger.warning(
                    "Centralized metadata lookup failed, falling back to legacy traversal: %s. "
                    "Consider regenerating embeddings with: codexlens embeddings-generate --centralized",
                    e
                )

        # Fallback: traverse _index.db files (legacy path)
        return self._fetch_chunks_by_ids_legacy(
            index_root, chunk_ids, score_map, category
        )

    def _fetch_from_vector_meta_store(
        self,
        meta_db_path: Path,
        chunk_ids: List[int],
        score_map: Dict[int, float],
        category: Optional[str] = None,
    ) -> List[SearchResult]:
        """Fetch chunks from centralized VectorMetadataStore.

        Args:
            meta_db_path: Path to _vectors_meta.db
            chunk_ids: List of chunk IDs to fetch
            score_map: Mapping of chunk_id to score
            category: Optional category filter

        Returns:
            List of SearchResult objects
        """
        from codexlens.storage.vector_meta_store import VectorMetadataStore

        results = []

        with VectorMetadataStore(meta_db_path) as meta_store:
            rows = meta_store.get_chunks_by_ids(chunk_ids, category=category)

            for row in rows:
                chunk_id = row["chunk_id"]
                file_path = row["file_path"]
                content = row["content"] or ""
                metadata = row.get("metadata") or {}
                start_line = row.get("start_line")
                end_line = row.get("end_line")

                score = score_map.get(chunk_id, 0.0)

                # Build excerpt
                excerpt = content[:200] + "..." if len(content) > 200 else content

                # Extract symbol information
                symbol_name = metadata.get("symbol_name")
                symbol_kind = metadata.get("symbol_kind")

                # Build Symbol object if available
                symbol = None
                if symbol_name and symbol_kind and start_line and end_line:
                    try:
                        from codexlens.entities import Symbol
                        symbol = Symbol(
                            name=symbol_name,
                            kind=symbol_kind,
                            range=(start_line, end_line)
                        )
                    except Exception:
                        pass

                results.append(SearchResult(
                    path=file_path,
                    score=score,
                    excerpt=excerpt,
                    content=content,
                    symbol=symbol,
                    metadata=metadata,
                    start_line=start_line,
                    end_line=end_line,
                    symbol_name=symbol_name,
                    symbol_kind=symbol_kind,
                ))

        # Sort by score descending
        results.sort(key=lambda r: r.score, reverse=True)
        return results

    def _fetch_chunks_by_ids_legacy(
        self,
        index_root: Path,
        chunk_ids: List[int],
        score_map: Dict[int, float],
        category: Optional[str] = None,
    ) -> List[SearchResult]:
        """Legacy fallback: fetch chunk metadata by traversing all _index.db files.

        This is the O(n) fallback path used when centralized metadata is unavailable.

        Args:
            index_root: Root directory containing _index.db files
            chunk_ids: List of chunk IDs from ANN search
            score_map: Mapping of chunk_id to score
            category: Optional category filter

        Returns:
            List of SearchResult objects
        """
        import sqlite3
        import json

        # Find all _index.db files
        index_files = list(index_root.rglob("_index.db"))

        results = []
        found_ids = set()

        for index_path in index_files:
            try:
                with sqlite3.connect(index_path) as conn:
                    conn.row_factory = sqlite3.Row

                    # Check if semantic_chunks table exists
                    cursor = conn.execute(
                        "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
                    )
                    if cursor.fetchone() is None:
                        continue

                    # Build query for chunk IDs we haven't found yet
                    remaining_ids = [cid for cid in chunk_ids if cid not in found_ids]
                    if not remaining_ids:
                        break

                    placeholders = ",".join("?" * len(remaining_ids))

                    if category:
                        query = f"""
                            SELECT id, file_path, content, metadata
                            FROM semantic_chunks
                            WHERE id IN ({placeholders}) AND category = ?
                        """
                        params = remaining_ids + [category]
                    else:
                        query = f"""
                            SELECT id, file_path, content, metadata
                            FROM semantic_chunks
                            WHERE id IN ({placeholders})
                        """
                        params = remaining_ids

                    rows = conn.execute(query, params).fetchall()

                    for row in rows:
                        chunk_id = row["id"]
                        if chunk_id in found_ids:
                            continue
                        found_ids.add(chunk_id)

                        file_path = row["file_path"]
                        content = row["content"]
                        metadata_json = row["metadata"]
                        metadata = json.loads(metadata_json) if metadata_json else {}

                        score = score_map.get(chunk_id, 0.0)

                        # Build excerpt
                        excerpt = content[:200] + "..." if len(content) > 200 else content

                        # Extract symbol information
                        symbol_name = metadata.get("symbol_name")
                        symbol_kind = metadata.get("symbol_kind")
                        start_line = metadata.get("start_line")
                        end_line = metadata.get("end_line")

                        # Build Symbol object if available
                        symbol = None
                        if symbol_name and symbol_kind and start_line and end_line:
                            try:
                                from codexlens.entities import Symbol
                                symbol = Symbol(
                                    name=symbol_name,
                                    kind=symbol_kind,
                                    range=(start_line, end_line)
                                )
                            except Exception:
                                pass

                        results.append(SearchResult(
                            path=file_path,
                            score=score,
                            excerpt=excerpt,
                            content=content,
                            symbol=symbol,
                            metadata=metadata,
                            start_line=start_line,
                            end_line=end_line,
                            symbol_name=symbol_name,
                            symbol_kind=symbol_kind,
                        ))

            except Exception as e:
                self.logger.debug("Failed to fetch chunks from %s: %s", index_path, e)
                continue

        # Sort by score descending
        results.sort(key=lambda r: r.score, reverse=True)
        return results

    def _search_vector(
        self, index_path: Path, query: str, limit: int, category: Optional[str] = None
    ) -> List[SearchResult]:
        """Execute vector similarity search using semantic embeddings.

        Supports both centralized vector storage (single _vectors.hnsw at project root)
        and distributed storage (per-directory .hnsw files).

        Args:
            index_path: Path to _index.db file
            query: Natural language query string
            limit: Maximum results
            category: Optional category filter ('code' or 'doc')

        Returns:
            List of SearchResult objects ordered by semantic similarity
        """
        try:
            # First, check for centralized vector index
            central_hnsw_path = self._find_vectors_hnsw(index_path)
            if central_hnsw_path is not None:
                self.logger.debug("Found centralized vector index at %s", central_hnsw_path)
                return self._search_vector_centralized(
                    index_path, central_hnsw_path, query, limit, category
                )

            # Fallback to distributed (per-index) vector storage
            # Check if semantic chunks table exists
            import sqlite3

            start_check = time.perf_counter()
            try:
                with sqlite3.connect(index_path) as conn:
                    cursor = conn.execute(
                        "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
                    )
                    has_semantic_table = cursor.fetchone() is not None
            except sqlite3.Error as e:
                self.logger.error("Database check failed in vector search: %s", e)
                return []
            self.logger.debug(
                "[TIMING] vector_table_check: %.2fms",
                (time.perf_counter() - start_check) * 1000
            )

            if not has_semantic_table:
                self.logger.info(
                    "No embeddings found in index. "
                    "Generate embeddings with: codexlens embeddings-generate %s",
                    index_path.parent if index_path.name == "_index.db" else index_path
                )
                return []

            # Initialize embedder and vector store
            from codexlens.semantic.factory import get_embedder
            from codexlens.semantic.vector_store import VectorStore

            start_init = time.perf_counter()
            vector_store = VectorStore(index_path)
            self.logger.debug(
                "[TIMING] vector_store_init: %.2fms",
                (time.perf_counter() - start_init) * 1000
            )

            # Check if vector store has data
            if vector_store.count_chunks() == 0:
                self.logger.info(
                    "Vector store is empty (0 chunks). "
                    "Generate embeddings with: codexlens embeddings-generate %s",
                    index_path.parent if index_path.name == "_index.db" else index_path
                )
                return []

            # Get stored model configuration (preferred) or auto-detect from dimension
            start_embedder = time.perf_counter()
            model_config = vector_store.get_model_config()
            if model_config:
                backend = model_config.get("backend", "fastembed")
                model_name = model_config["model_name"]
                model_profile = model_config["model_profile"]
                self.logger.debug(
                    "Using stored model config: %s backend, %s (%s, %dd)",
                    backend, model_profile, model_name, model_config["embedding_dim"]
                )

                # Get embedder based on backend
                if backend == "litellm":
                    embedder = get_embedder(backend="litellm", model=model_name)
                else:
                    embedder = get_embedder(backend="fastembed", profile=model_profile)
            else:
                # Fallback: auto-detect from embedding dimension
                detected_dim = vector_store.dimension
                if detected_dim is None:
                    self.logger.info("Vector store dimension unknown, using default profile")
                    embedder = get_embedder(backend="fastembed", profile="code")
                elif detected_dim == 384:
                    embedder = get_embedder(backend="fastembed", profile="fast")
                elif detected_dim == 768:
                    embedder = get_embedder(backend="fastembed", profile="code")
                elif detected_dim == 1024:
                    embedder = get_embedder(backend="fastembed", profile="multilingual")
                elif detected_dim == 1536:
                    # Likely OpenAI text-embedding-3-small or ada-002
                    self.logger.info(
                        "Detected 1536-dim embeddings (likely OpenAI), using litellm backend with text-embedding-3-small"
                    )
                    embedder = get_embedder(backend="litellm", model="text-embedding-3-small")
                elif detected_dim == 3072:
                    # Likely OpenAI text-embedding-3-large
                    self.logger.info(
                        "Detected 3072-dim embeddings (likely OpenAI), using litellm backend with text-embedding-3-large"
                    )
                    embedder = get_embedder(backend="litellm", model="text-embedding-3-large")
                else:
                    self.logger.debug(
                        "Unknown dimension %s, using default fastembed profile 'code'",
                        detected_dim
                    )
                    embedder = get_embedder(backend="fastembed", profile="code")
            self.logger.debug(
                "[TIMING] embedder_init: %.2fms",
                (time.perf_counter() - start_embedder) * 1000
            )

            # Generate query embedding
            start_embed = time.perf_counter()
            query_embedding = embedder.embed_single(query)
            self.logger.debug(
                "[TIMING] query_embedding: %.2fms",
                (time.perf_counter() - start_embed) * 1000
            )

            # Search for similar chunks
            start_search = time.perf_counter()
            results = vector_store.search_similar(
                query_embedding=query_embedding,
                top_k=limit,
                min_score=0.0,  # Return all results, let RRF handle filtering
                return_full_content=True,
                category=category,
            )
            self.logger.debug(
                "[TIMING] vector_similarity_search: %.2fms (%d results)",
                (time.perf_counter() - start_search) * 1000, len(results)
            )

            return results

        except ImportError as exc:
            self.logger.debug("Semantic dependencies not available: %s", exc)
            return []
        except Exception as exc:
            self.logger.error("Vector search error: %s", exc)
            return []

    def _search_splade(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
        """SPLADE sparse retrieval via inverted index.

        Args:
            index_path: Path to _index.db file
            query: Natural language query string
            limit: Maximum results

        Returns:
            List of SearchResult ordered by SPLADE score
        """
        try:
            from codexlens.semantic.splade_encoder import get_splade_encoder, check_splade_available
            from codexlens.storage.splade_index import SpladeIndex
            from codexlens.config import SPLADE_DB_NAME
            import sqlite3
            import json

            # Check dependencies
            ok, err = check_splade_available()
            if not ok:
                self.logger.debug("SPLADE not available: %s", err)
                return []

            # SPLADE index is stored in _splade.db at the project index root
            # Traverse up from the current index to find the root _splade.db
            current_dir = index_path.parent
            splade_db_path = None
            for _ in range(10):  # Limit search depth
                candidate = current_dir / SPLADE_DB_NAME
                if candidate.exists():
                    splade_db_path = candidate
                    break
                parent = current_dir.parent
                if parent == current_dir:  # Reached root
                    break
                current_dir = parent

            if not splade_db_path:
                self.logger.debug("SPLADE index not found in ancestor directories of %s", index_path)
                return []

            splade_index = SpladeIndex(splade_db_path)
            if not splade_index.has_index():
                self.logger.debug("SPLADE index not initialized")
                return []

            # Encode query to sparse vector
            encoder = get_splade_encoder(use_gpu=self._use_gpu)
            query_sparse = encoder.encode_text(query)

            # Search inverted index for top matches
            raw_results = splade_index.search(query_sparse, limit=limit, min_score=0.0)

            if not raw_results:
                return []

            # Fetch chunk details from splade_chunks table (self-contained)
            chunk_ids = [chunk_id for chunk_id, _ in raw_results]
            score_map = {chunk_id: score for chunk_id, score in raw_results}

            # Get chunk metadata from SPLADE database
            rows = splade_index.get_chunks_by_ids(chunk_ids)

            # Build SearchResult objects
            results = []
            for row in rows:
                chunk_id = row["id"]
                file_path = row["file_path"]
                content = row["content"]
                metadata_json = row["metadata"]
                metadata = json.loads(metadata_json) if metadata_json else {}

                score = score_map.get(chunk_id, 0.0)

                # Build excerpt (short preview)
                excerpt = content[:200] + "..." if len(content) > 200 else content

                # Extract symbol information from metadata
                symbol_name = metadata.get("symbol_name")
                symbol_kind = metadata.get("symbol_kind")
                start_line = metadata.get("start_line")
                end_line = metadata.get("end_line")

                # Build Symbol object if we have symbol info
                symbol = None
                if symbol_name and symbol_kind and start_line and end_line:
                    try:
                        from codexlens.entities import Symbol
                        symbol = Symbol(
                            name=symbol_name,
                            kind=symbol_kind,
                            range=(start_line, end_line)
                        )
                    except Exception:
                        pass

                results.append(SearchResult(
                    path=file_path,
                    score=score,
                    excerpt=excerpt,
                    content=content,
                    symbol=symbol,
                    metadata=metadata,
                    start_line=start_line,
                    end_line=end_line,
                    symbol_name=symbol_name,
                    symbol_kind=symbol_kind,
                ))

            return results

        except Exception as exc:
            self.logger.debug("SPLADE search error: %s", exc)
            return []
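        # Minimal sketch (illustrative, hypothetical term weights) of the sparse
        # scoring an inverted SPLADE index performs: query and chunk are sparse
        # term->weight maps and the score is their dot product over shared terms.
        #
        #   >>> query_sparse = {"auth": 1.5, "token": 0.75}
        #   >>> chunk_sparse = {"auth": 0.5, "flow": 0.25}
        #   >>> sum(w * chunk_sparse.get(t, 0.0) for t, w in query_sparse.items())
        #   0.75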

    def _search_lsp_graph(
        self,
        index_path: Path,
        query: str,
        limit: int,
        max_depth: int = 1,
        max_nodes: int = 20,
    ) -> List[SearchResult]:
        """Execute LSP-based graph expansion search.

        Uses real-time LSP to expand from seed results and find related code.
        This provides accurate, up-to-date code relationships.

        Args:
            index_path: Path to _index.db file
            query: Natural language query string
            limit: Maximum results
            max_depth: Maximum depth for LSP graph BFS expansion (default 1)
            max_nodes: Maximum nodes to collect in LSP graph (default 20)

        Returns:
            List of SearchResult from graph expansion
        """
        import asyncio

        if not HAS_LSP:
            self.logger.debug("LSP dependencies not available")
            return []

        try:
            # Try multiple seed sources in priority order
            seeds = []
            seed_source = "none"

            # 1. Try vector search first (best semantic match)
            seeds = self._search_vector(index_path, query, limit=3, category="code")
            if seeds:
                seed_source = "vector"

            # 2. Fallback to SPLADE if vector returns nothing
            if not seeds:
                self.logger.debug("Vector search returned no seeds, trying SPLADE")
                seeds = self._search_splade(index_path, query, limit=3)
                if seeds:
                    seed_source = "splade"

            # 3. Fallback to exact FTS if SPLADE also fails
            if not seeds:
                self.logger.debug("SPLADE returned no seeds, trying exact FTS")
                seeds = self._search_exact(index_path, query, limit=3)
                if seeds:
                    seed_source = "exact_fts"

            # 4. No seeds available from any source
            if not seeds:
                self.logger.debug("No seed results available for LSP graph expansion")
                return []

            self.logger.debug(
                "LSP graph expansion using %d seeds from %s",
                len(seeds),
                seed_source,
            )

            # Convert SearchResult to CodeSymbolNode for LSP processing
            from codexlens.hybrid_search.data_structures import CodeSymbolNode, Range

            seed_nodes = []
            for seed in seeds:
                try:
                    node = CodeSymbolNode(
                        id=f"{seed.path}:{seed.symbol_name or 'unknown'}:{seed.start_line or 0}",
                        name=seed.symbol_name or "unknown",
                        kind=seed.symbol_kind or "unknown",
                        file_path=seed.path,
                        range=Range(
                            start_line=seed.start_line or 1,
                            start_character=0,
                            end_line=seed.end_line or seed.start_line or 1,
                            end_character=0,
                        ),
                        raw_code=seed.content or "",
                        docstring=seed.excerpt or "",
                    )
                    seed_nodes.append(node)
                except Exception as e:
                    self.logger.debug("Failed to create seed node: %s", e)
                    continue

            if not seed_nodes:
                return []

            # Run async LSP expansion in sync context
            async def expand_graph():
                async with LspBridge() as bridge:
                    builder = LspGraphBuilder(max_depth=max_depth, max_nodes=max_nodes)
                    graph = await builder.build_from_seeds(seed_nodes, bridge)
                    return graph

            # Run the async code
            try:
                loop = asyncio.get_event_loop()
                if loop.is_running():
                    # Already in async context - use run_coroutine_threadsafe
                    import concurrent.futures
                    future = asyncio.run_coroutine_threadsafe(expand_graph(), loop)
                    graph = future.result(timeout=5.0)
                else:
                    graph = loop.run_until_complete(expand_graph())
            except RuntimeError:
                # No event loop - create new one
                graph = asyncio.run(expand_graph())

            # Convert graph nodes to SearchResult
            # Create set of seed identifiers for fast lookup
            seed_ids = set()
            for seed in seeds:
                seed_id = f"{seed.path}:{seed.symbol_name or 'unknown'}:{seed.start_line or 0}"
                seed_ids.add(seed_id)

            results = []
            for node_id, node in graph.nodes.items():
                # Skip seed nodes using ID comparison (already in other results)
                if node_id in seed_ids or node.id in seed_ids:
                    continue

                # Calculate score based on graph position
                # Nodes closer to seeds get higher scores
                depth = 1  # Simple heuristic, could be improved
                score = 0.8 / (1 + depth)  # Score decreases with depth

                results.append(SearchResult(
                    path=node.file_path,
                    score=score,
                    excerpt=node.docstring[:200] if node.docstring else node.raw_code[:200] if node.raw_code else "",
                    content=node.raw_code,
                    symbol=None,
                    metadata={"lsp_node_id": node_id, "lsp_kind": node.kind},
                    start_line=node.range.start_line,
                    end_line=node.range.end_line,
                    symbol_name=node.name,
                    symbol_kind=node.kind,
                ))

            # Sort by score
            results.sort(key=lambda r: r.score, reverse=True)
            return results[:limit]

        except Exception as exc:
            self.logger.debug("LSP graph search error: %s", exc)
            return []
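

# Illustrative only: the sync/async bridge used by _search_lsp_graph, reduced to
# a standalone sketch. The coroutine and its name are hypothetical and are not
# part of codexlens.
def _run_coroutine_from_sync_sketch() -> str:
    import asyncio

    async def probe() -> str:
        await asyncio.sleep(0)  # stand-in for awaiting an LSP request
        return "done"

    try:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            # Already inside a running loop: hand the coroutine to that loop.
            return asyncio.run_coroutine_threadsafe(probe(), loop).result(timeout=5.0)
        return loop.run_until_complete(probe())
    except RuntimeError:
        # No usable event loop in this thread: create a fresh one for this call.
        return asyncio.run(probe())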