"""Chain search engine for recursive multi-directory searching.
|
|
|
|
Provides parallel search across directory hierarchies using indexed _index.db files.
|
|
Supports depth-limited traversal, result aggregation, and symbol search.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING
|
|
import logging
|
|
import os
|
|
import time
|
|
|
|
from codexlens.entities import SearchResult, Symbol
|
|
|
|
if TYPE_CHECKING:
|
|
import numpy as np
|
|
|
|
try:
|
|
import numpy as np
|
|
NUMPY_AVAILABLE = True
|
|
except ImportError:
|
|
NUMPY_AVAILABLE = False
|
|
from codexlens.config import Config
|
|
from codexlens.storage.registry import RegistryStore, DirMapping
|
|
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
|
|
from codexlens.storage.global_index import GlobalSymbolIndex
|
|
from codexlens.storage.path_mapper import PathMapper
|
|
from codexlens.storage.sqlite_store import SQLiteStore
|
|
from codexlens.search.hybrid_search import HybridSearchEngine
|
|
|
|
|
|
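
# Illustrative sketch (added comment, not part of the original module): how these
# pieces are typically wired together, mirroring quick_search() at the bottom of
# this file. Construction arguments may differ in a real deployment.
#
#     registry = RegistryStore()
#     registry.initialize()
#     mapper = PathMapper()
#     with ChainSearchEngine(registry, mapper) as engine:
#         result = engine.search("authentication", Path("D:/project/src"))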


@dataclass
class SearchOptions:
    """Configuration options for chain search.

    Attributes:
        depth: Maximum search depth (-1 = unlimited, 0 = current dir only)
        max_workers: Number of parallel worker threads
        limit_per_dir: Maximum results per directory
        total_limit: Total result limit across all directories
        include_symbols: Whether to include symbol search results
        files_only: Return only file paths without excerpts
        include_semantic: Whether to include semantic keyword search results
        hybrid_mode: Enable hybrid search with RRF fusion (default False)
        enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
        enable_vector: Enable vector semantic search (default False)
        pure_vector: If True, only use vector search without FTS fallback (default False)
        hybrid_weights: Custom RRF weights for hybrid search (optional)
        group_results: Enable grouping of similar results (default False)
        grouping_threshold: Score threshold for grouping similar results (default 0.01)
    """
    depth: int = -1
    max_workers: int = 8
    limit_per_dir: int = 10
    total_limit: int = 100
    include_symbols: bool = False
    files_only: bool = False
    include_semantic: bool = False
    hybrid_mode: bool = False
    enable_fuzzy: bool = True
    enable_vector: bool = False
    pure_vector: bool = False
    hybrid_weights: Optional[Dict[str, float]] = None
    group_results: bool = False
    grouping_threshold: float = 0.01
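
# Illustrative sketch (added comment, not in the original source): a typical
# hybrid-search configuration of SearchOptions. Field names match the dataclass
# above; the chosen values are only an example.
#
#     opts = SearchOptions(depth=2, hybrid_mode=True, enable_vector=True,
#                          total_limit=50, group_results=True)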


@dataclass
class SearchStats:
    """Statistics collected during search execution.

    Attributes:
        dirs_searched: Number of directories searched
        files_matched: Number of files with matches
        time_ms: Total search time in milliseconds
        errors: List of error messages encountered
    """
    dirs_searched: int = 0
    files_matched: int = 0
    time_ms: float = 0
    errors: List[str] = field(default_factory=list)


@dataclass
class ChainSearchResult:
    """Comprehensive search result with metadata.

    Attributes:
        query: Original search query
        results: List of SearchResult objects
        related_results: Expanded results from graph neighbors (optional)
        symbols: List of Symbol objects (if include_symbols=True)
        stats: SearchStats with execution metrics
    """
    query: str
    results: List[SearchResult]
    symbols: List[Symbol]
    stats: SearchStats
    related_results: List[SearchResult] = field(default_factory=list)


class ChainSearchEngine:
    """Parallel chain search engine for hierarchical directory indexes.

    Searches across multiple directory indexes in parallel, following subdirectory
    links to recursively traverse the file tree. Supports depth limits, result
    aggregation, and both content and symbol searches.

    Thread-safe with configurable parallelism.

    Attributes:
        registry: Global project registry
        mapper: Path mapping utility
        logger: Python logger instance
    """

    def __init__(self,
                 registry: RegistryStore,
                 mapper: PathMapper,
                 max_workers: int = 8,
                 config: Config | None = None):
        """Initialize chain search engine.

        Args:
            registry: Global project registry for path lookups
            mapper: Path mapper for source/index conversions
            max_workers: Maximum parallel workers (default 8)
            config: Optional Config controlling cascade, reranker, and GPU settings
        """
        self.registry = registry
        self.mapper = mapper
        self.logger = logging.getLogger(__name__)
        self._max_workers = max_workers
        self._executor: Optional[ThreadPoolExecutor] = None
        self._config = config

    def _get_executor(self, max_workers: Optional[int] = None) -> ThreadPoolExecutor:
        """Get or create the shared thread pool executor.

        Lazy initialization to avoid creating executor if never used.

        Args:
            max_workers: Override default max_workers if specified

        Returns:
            ThreadPoolExecutor instance
        """
        workers = max_workers or self._max_workers
        if self._executor is None:
            self._executor = ThreadPoolExecutor(max_workers=workers)
        return self._executor

    def close(self) -> None:
        """Shutdown the thread pool executor."""
        if self._executor is not None:
            self._executor.shutdown(wait=True)
            self._executor = None

    def __enter__(self) -> "ChainSearchEngine":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        """Context manager exit."""
        self.close()

    def search(self, query: str,
               source_path: Path,
               options: Optional[SearchOptions] = None) -> ChainSearchResult:
        """Execute chain search from source_path with recursive traversal.

        Process:
        1. Locate starting index for source_path
        2. Collect all child indexes based on depth limit
        3. Search indexes in parallel using ThreadPoolExecutor
        4. Aggregate, deduplicate, and rank results

        Args:
            query: FTS5 search query string
            source_path: Starting directory path
            options: Search configuration (uses defaults if None)

        Returns:
            ChainSearchResult with results, symbols, and statistics

        Examples:
            >>> engine = ChainSearchEngine(registry, mapper)
            >>> result = engine.search("authentication", Path("D:/project/src"))
            >>> for r in result.results[:5]:
            ...     print(f"{r.path}: {r.score:.2f}")
        """
        options = options or SearchOptions()
        start_time = time.time()
        stats = SearchStats()

        # Step 1: Find starting index
        start_index = self._find_start_index(source_path)
        if not start_index:
            self.logger.warning(f"No index found for {source_path}")
            stats.time_ms = (time.time() - start_time) * 1000
            return ChainSearchResult(
                query=query,
                results=[],
                symbols=[],
                stats=stats
            )

        # Step 2: Collect all index paths to search
        index_paths = self._collect_index_paths(start_index, options.depth)
        stats.dirs_searched = len(index_paths)

        if not index_paths:
            self.logger.warning(f"No indexes collected from {start_index}")
            stats.time_ms = (time.time() - start_time) * 1000
            return ChainSearchResult(
                query=query,
                results=[],
                symbols=[],
                stats=stats
            )

        # Step 3: Parallel search
        results, search_stats = self._search_parallel(
            index_paths, query, options
        )
        stats.errors = search_stats.errors

        # Step 4: Merge and rank
        final_results = self._merge_and_rank(results, options.total_limit)

        # Step 5: Optional grouping of similar results
        if options.group_results:
            from codexlens.search.ranking import group_similar_results
            final_results = group_similar_results(
                final_results, score_threshold_abs=options.grouping_threshold
            )

        stats.files_matched = len(final_results)

        # Optional: Symbol search
        symbols = []
        if options.include_symbols:
            symbols = self._search_symbols_parallel(
                index_paths, query, None, options.total_limit
            )

        # Optional: graph expansion using precomputed neighbors
        related_results: List[SearchResult] = []
        if self._config is not None and getattr(self._config, "enable_graph_expansion", False):
            try:
                from codexlens.search.enrichment import SearchEnrichmentPipeline

                pipeline = SearchEnrichmentPipeline(self.mapper, config=self._config)
                related_results = pipeline.expand_related_results(final_results)
            except Exception as exc:
                self.logger.debug("Graph expansion failed: %s", exc)
                related_results = []

        stats.time_ms = (time.time() - start_time) * 1000

        return ChainSearchResult(
            query=query,
            results=final_results,
            symbols=symbols,
            stats=stats,
            related_results=related_results,
        )
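
    # Illustrative usage (added comment): requesting symbols alongside content
    # results, using only names defined in this module.
    #
    #     opts = SearchOptions(include_symbols=True, total_limit=20)
    #     res = engine.search("parse config", Path("D:/project"), opts)
    #     # res.results -> List[SearchResult], res.symbols -> List[Symbol]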

    def hybrid_cascade_search(
        self,
        query: str,
        source_path: Path,
        k: int = 10,
        coarse_k: int = 100,
        options: Optional[SearchOptions] = None,
    ) -> ChainSearchResult:
        """Execute two-stage cascade search with hybrid coarse retrieval and cross-encoder reranking.

        Hybrid cascade search process:
        1. Stage 1 (Coarse): Fast retrieval using RRF fusion of FTS + SPLADE + Vector
           to get coarse_k candidates
        2. Stage 2 (Fine): CrossEncoder reranking of candidates to get final k results

        This approach balances recall (from broad coarse search) with precision
        (from expensive but accurate cross-encoder scoring).

        Note: This method is the original hybrid approach. For binary vector cascade,
        use binary_cascade_search() instead.

        Args:
            query: Natural language or keyword query string
            source_path: Starting directory path
            k: Number of final results to return (default 10)
            coarse_k: Number of coarse candidates from first stage (default 100)
            options: Search configuration (uses defaults if None)

        Returns:
            ChainSearchResult with reranked results and statistics

        Examples:
            >>> engine = ChainSearchEngine(registry, mapper, config=config)
            >>> result = engine.hybrid_cascade_search(
            ...     "how to authenticate users",
            ...     Path("D:/project/src"),
            ...     k=10,
            ...     coarse_k=100
            ... )
            >>> for r in result.results:
            ...     print(f"{r.path}: {r.score:.3f}")
        """
        options = options or SearchOptions()
        start_time = time.time()
        stats = SearchStats()

        # Use config defaults if available
        if self._config is not None:
            if hasattr(self._config, "cascade_coarse_k"):
                coarse_k = coarse_k or self._config.cascade_coarse_k
            if hasattr(self._config, "cascade_fine_k"):
                k = k or self._config.cascade_fine_k

        # Step 1: Find starting index
        start_index = self._find_start_index(source_path)
        if not start_index:
            self.logger.warning(f"No index found for {source_path}")
            stats.time_ms = (time.time() - start_time) * 1000
            return ChainSearchResult(
                query=query,
                results=[],
                symbols=[],
                stats=stats
            )

        # Step 2: Collect all index paths
        index_paths = self._collect_index_paths(start_index, options.depth)
        stats.dirs_searched = len(index_paths)

        if not index_paths:
            self.logger.warning(f"No indexes collected from {start_index}")
            stats.time_ms = (time.time() - start_time) * 1000
            return ChainSearchResult(
                query=query,
                results=[],
                symbols=[],
                stats=stats
            )

        # Stage 1: Coarse retrieval with hybrid search (FTS + SPLADE + Vector)
        # Use hybrid mode for multi-signal retrieval
        coarse_options = SearchOptions(
            depth=options.depth,
            max_workers=1,  # Single thread for GPU safety
            limit_per_dir=max(coarse_k // len(index_paths), 20),
            total_limit=coarse_k,
            hybrid_mode=True,
            enable_fuzzy=options.enable_fuzzy,
            enable_vector=True,  # Enable vector for semantic matching
            pure_vector=False,
            hybrid_weights=options.hybrid_weights,
        )

        self.logger.debug(
            "Cascade Stage 1: Coarse retrieval for %d candidates", coarse_k
        )
        coarse_results, search_stats = self._search_parallel(
            index_paths, query, coarse_options
        )
        stats.errors = search_stats.errors

        # Merge and deduplicate coarse results
        coarse_merged = self._merge_and_rank(coarse_results, coarse_k)
        self.logger.debug(
            "Cascade Stage 1 complete: %d candidates retrieved", len(coarse_merged)
        )

        if not coarse_merged:
            stats.time_ms = (time.time() - start_time) * 1000
            return ChainSearchResult(
                query=query,
                results=[],
                symbols=[],
                stats=stats
            )

        # Stage 2: Cross-encoder reranking
        self.logger.debug(
            "Cascade Stage 2: Cross-encoder reranking %d candidates to top-%d",
            len(coarse_merged),
            k,
        )

        final_results = self._cross_encoder_rerank(query, coarse_merged, k)

        # Optional: grouping of similar results
        if options.group_results:
            from codexlens.search.ranking import group_similar_results
            final_results = group_similar_results(
                final_results, score_threshold_abs=options.grouping_threshold
            )

        stats.files_matched = len(final_results)
        stats.time_ms = (time.time() - start_time) * 1000

        self.logger.debug(
            "Cascade search complete: %d results in %.2fms",
            len(final_results),
            stats.time_ms,
        )

        return ChainSearchResult(
            query=query,
            results=final_results,
            symbols=[],
            stats=stats,
        )
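
    # Worked example (added comment) of the coarse-stage budget used above:
    # with coarse_k=100 spread over 8 directory indexes,
    # limit_per_dir = max(100 // 8, 20) = max(12, 20) = 20, and the merged
    # candidate list is then cut back to coarse_k=100 before the cross-encoder
    # selects the final k results.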

    def binary_cascade_search(
        self,
        query: str,
        source_path: Path,
        k: int = 10,
        coarse_k: int = 100,
        options: Optional[SearchOptions] = None,
    ) -> ChainSearchResult:
        """Execute binary cascade search with binary coarse ranking and dense fine ranking.

        Binary cascade search process:
        1. Stage 1 (Coarse): Fast binary vector search using Hamming distance
           to quickly filter to coarse_k candidates (256-dim binary, 32 bytes/vector)
        2. Stage 2 (Fine): Dense vector cosine similarity for precise reranking
           of candidates (2048-dim float32)

        This approach leverages the speed of binary search (~100x faster) while
        maintaining precision through dense vector reranking.

        Performance characteristics:
        - Binary search: O(N) with SIMD-accelerated XOR + popcount
        - Dense rerank: Only applied to top coarse_k candidates
        - Memory: 32 bytes (binary) + 8KB (dense) per chunk

        Args:
            query: Natural language or keyword query string
            source_path: Starting directory path
            k: Number of final results to return (default 10)
            coarse_k: Number of coarse candidates from first stage (default 100)
            options: Search configuration (uses defaults if None)

        Returns:
            ChainSearchResult with reranked results and statistics

        Examples:
            >>> engine = ChainSearchEngine(registry, mapper, config=config)
            >>> result = engine.binary_cascade_search(
            ...     "how to authenticate users",
            ...     Path("D:/project/src"),
            ...     k=10,
            ...     coarse_k=100
            ... )
            >>> for r in result.results:
            ...     print(f"{r.path}: {r.score:.3f}")
        """
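        # Sizing note (added comment): the figures quoted in the docstring follow
        # from the vector shapes used below - a 256-dim binary vector packs to
        # 256 / 8 = 32 bytes, and a 2048-dim float32 vector occupies
        # 2048 * 4 = 8192 bytes (~8 KB) per chunk.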
        if not NUMPY_AVAILABLE:
            self.logger.warning(
                "NumPy not available, falling back to hybrid cascade search"
            )
            return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)

        options = options or SearchOptions()
        start_time = time.time()
        stats = SearchStats()

        # Use config defaults if available
        if self._config is not None:
            if hasattr(self._config, "cascade_coarse_k"):
                coarse_k = coarse_k or self._config.cascade_coarse_k
            if hasattr(self._config, "cascade_fine_k"):
                k = k or self._config.cascade_fine_k

        # Step 1: Find starting index
        start_index = self._find_start_index(source_path)
        if not start_index:
            self.logger.warning(f"No index found for {source_path}")
            stats.time_ms = (time.time() - start_time) * 1000
            return ChainSearchResult(
                query=query,
                results=[],
                symbols=[],
                stats=stats
            )

        # Step 2: Collect all index paths
        index_paths = self._collect_index_paths(start_index, options.depth)
        stats.dirs_searched = len(index_paths)

        if not index_paths:
            self.logger.warning(f"No indexes collected from {start_index}")
            stats.time_ms = (time.time() - start_time) * 1000
            return ChainSearchResult(
                query=query,
                results=[],
                symbols=[],
                stats=stats
            )

        # Initialize embedding backends
        try:
            from codexlens.indexing.embedding import (
                BinaryEmbeddingBackend,
                DenseEmbeddingBackend,
            )
            from codexlens.semantic.ann_index import BinaryANNIndex
        except ImportError as exc:
            self.logger.warning(
                "Binary cascade dependencies not available: %s. "
                "Falling back to hybrid cascade search.",
                exc
            )
            return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)

        # Stage 1: Binary vector coarse retrieval
        self.logger.debug(
            "Binary Cascade Stage 1: Binary coarse retrieval for %d candidates",
            coarse_k,
        )

        use_gpu = True
        if self._config is not None:
            use_gpu = getattr(self._config, "embedding_use_gpu", True)

        try:
            binary_backend = BinaryEmbeddingBackend(use_gpu=use_gpu)
            query_binary_packed = binary_backend.embed_packed([query])[0]
        except Exception as exc:
            self.logger.warning(
                "Failed to generate binary query embedding: %s. "
                "Falling back to hybrid cascade search.",
                exc
            )
            return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)

        # Try centralized BinarySearcher first (preferred for mmap indexes)
        # The index root is the parent of the first index path
        index_root = index_paths[0].parent if index_paths else None
        all_candidates: List[Tuple[int, int, Path]] = []  # (chunk_id, distance, index_path)
        used_centralized = False

        if index_root:
            centralized_searcher = self._get_centralized_binary_searcher(index_root)
            if centralized_searcher is not None:
                try:
                    # BinarySearcher expects dense vector, not packed binary
                    from codexlens.semantic.embedder import Embedder
                    embedder = Embedder()
                    query_dense = embedder.embed_to_numpy([query])[0]

                    # Centralized search - returns (chunk_id, hamming_distance) tuples
                    results = centralized_searcher.search(query_dense, top_k=coarse_k)
                    for chunk_id, dist in results:
                        all_candidates.append((chunk_id, dist, index_root))
                    used_centralized = True
                    self.logger.debug(
                        "Centralized binary search found %d candidates", len(results)
                    )
                except Exception as exc:
                    self.logger.debug(
                        "Centralized binary search failed: %s, falling back to per-directory",
                        exc
                    )
                    centralized_searcher.clear()

        # Fallback: Search per-directory indexes with legacy BinaryANNIndex
        if not used_centralized:
            for index_path in index_paths:
                try:
                    # Get or create binary index for this path (uses deprecated BinaryANNIndex)
                    binary_index = self._get_or_create_binary_index(index_path)
                    if binary_index is None or binary_index.count() == 0:
                        continue

                    # Search binary index
                    ids, distances = binary_index.search(query_binary_packed, coarse_k)
                    for chunk_id, dist in zip(ids, distances):
                        all_candidates.append((chunk_id, dist, index_path))

                except Exception as exc:
                    self.logger.debug(
                        "Binary search failed for %s: %s", index_path, exc
                    )
                    stats.errors.append(f"Binary search failed for {index_path}: {exc}")

        if not all_candidates:
            self.logger.debug("No binary candidates found, falling back to hybrid")
            return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)

        # Sort by Hamming distance and take top coarse_k
        all_candidates.sort(key=lambda x: x[1])
        coarse_candidates = all_candidates[:coarse_k]

        self.logger.debug(
            "Binary Cascade Stage 1 complete: %d candidates retrieved",
            len(coarse_candidates),
        )

        # Stage 2: Dense vector fine ranking
        self.logger.debug(
            "Binary Cascade Stage 2: Dense reranking %d candidates to top-%d",
            len(coarse_candidates),
            k,
        )

        try:
            dense_backend = DenseEmbeddingBackend(use_gpu=use_gpu)
            query_dense = dense_backend.embed_to_numpy([query])[0]
        except Exception as exc:
            self.logger.warning(
                "Failed to generate dense query embedding: %s. "
                "Using Hamming distance scores only.",
                exc
            )
            # Fall back to using Hamming distance as score
            return self._build_results_from_candidates(
                coarse_candidates[:k], index_paths, stats, query, start_time
            )

        # Group candidates by index path for batch retrieval
        candidates_by_index: Dict[Path, List[int]] = {}
        for chunk_id, _, index_path in coarse_candidates:
            if index_path not in candidates_by_index:
                candidates_by_index[index_path] = []
            candidates_by_index[index_path].append(chunk_id)

        # Retrieve dense embeddings and compute cosine similarity
        scored_results: List[Tuple[float, SearchResult]] = []

        for index_path, chunk_ids in candidates_by_index.items():
            try:
                # Read directly from semantic_chunks table (where cascade-index stores data)
                import sqlite3
                conn = sqlite3.connect(str(index_path))
                conn.row_factory = sqlite3.Row

                placeholders = ",".join("?" * len(chunk_ids))
                rows = conn.execute(
                    f"SELECT id, file_path, content, embedding_dense FROM semantic_chunks WHERE id IN ({placeholders})",
                    chunk_ids
                ).fetchall()
                conn.close()

                # Batch processing: collect all valid embeddings first
                valid_rows = []
                dense_vectors = []
                for row in rows:
                    dense_bytes = row["embedding_dense"]
                    if dense_bytes is not None:
                        valid_rows.append(row)
                        dense_vectors.append(np.frombuffer(dense_bytes, dtype=np.float32))

                if not dense_vectors:
                    continue

                # Stack into matrix for batch computation
                doc_matrix = np.vstack(dense_vectors)

                # Batch compute cosine similarities
                scores = self._compute_cosine_similarity_batch(query_dense, doc_matrix)

                # Create search results
                for i, row in enumerate(valid_rows):
                    score = float(scores[i])
                    excerpt = (row["content"] or "")[:500]
                    result = SearchResult(
                        path=row["file_path"] or "",
                        score=score,
                        excerpt=excerpt,
                    )
                    scored_results.append((score, result))

            except Exception as exc:
                self.logger.debug(
                    "Dense reranking failed for %s: %s", index_path, exc
                )
                stats.errors.append(f"Dense reranking failed for {index_path}: {exc}")

        # Sort by score descending and deduplicate by path
        scored_results.sort(key=lambda x: x[0], reverse=True)

        path_to_result: Dict[str, SearchResult] = {}
        for score, result in scored_results:
            if result.path not in path_to_result:
                path_to_result[result.path] = result

        final_results = list(path_to_result.values())[:k]

        # Optional: grouping of similar results
        if options.group_results:
            from codexlens.search.ranking import group_similar_results
            final_results = group_similar_results(
                final_results, score_threshold_abs=options.grouping_threshold
            )

        stats.files_matched = len(final_results)
        stats.time_ms = (time.time() - start_time) * 1000

        self.logger.debug(
            "Binary cascade search complete: %d results in %.2fms",
            len(final_results),
            stats.time_ms,
        )

        return ChainSearchResult(
            query=query,
            results=final_results,
            symbols=[],
            stats=stats,
        )
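
    # Added summary comment: binary_cascade_search() degrades gracefully -
    # centralized BinarySearcher (mmap) -> per-directory BinaryANNIndex (legacy)
    # -> hybrid_cascade_search() when NumPy, the embedding backends, or binary
    # candidates are unavailable; Hamming-only scoring via
    # _build_results_from_candidates() is the last resort when the dense query
    # embedding fails.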

    def cascade_search(
        self,
        query: str,
        source_path: Path,
        k: int = 10,
        coarse_k: int = 100,
        options: Optional[SearchOptions] = None,
        strategy: Literal["binary", "hybrid"] = "binary",
    ) -> ChainSearchResult:
        """Unified cascade search entry point with strategy selection.

        Provides a single interface for cascade search with configurable strategy:
        - "binary": Uses binary vector coarse ranking + dense fine ranking (faster)
        - "hybrid": Uses FTS+SPLADE+Vector coarse ranking + cross-encoder reranking (original)

        The strategy can be configured via:
        1. The `strategy` parameter (highest priority)
        2. Config `cascade_strategy` setting
        3. Default: "binary"

        Args:
            query: Natural language or keyword query string
            source_path: Starting directory path
            k: Number of final results to return (default 10)
            coarse_k: Number of coarse candidates from first stage (default 100)
            options: Search configuration (uses defaults if None)
            strategy: Cascade strategy - "binary" or "hybrid" (default "binary")

        Returns:
            ChainSearchResult with reranked results and statistics

        Examples:
            >>> engine = ChainSearchEngine(registry, mapper, config=config)
            >>> # Use binary cascade (default, faster)
            >>> result = engine.cascade_search("auth", Path("D:/project"))
            >>> # Use hybrid cascade (original behavior)
            >>> result = engine.cascade_search("auth", Path("D:/project"), strategy="hybrid")
        """
        # Check config for strategy override
        effective_strategy = strategy
        if self._config is not None:
            config_strategy = getattr(self._config, "cascade_strategy", None)
            if config_strategy in ("binary", "hybrid"):
                # Only use config if no explicit strategy was passed
                # (we can't detect if strategy was explicitly passed vs default)
                effective_strategy = config_strategy

        if effective_strategy == "binary":
            return self.binary_cascade_search(query, source_path, k, coarse_k, options)
        else:
            return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
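
    # Illustrative note (added comment): with Config.cascade_strategy = "hybrid",
    # cascade_search("auth", root) dispatches to hybrid_cascade_search() even
    # though the parameter default is "binary", because the config value
    # currently overrides the strategy argument (see the caveat above).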

    def _get_or_create_binary_index(self, index_path: Path) -> Optional[Any]:
        """Get or create a BinaryANNIndex for the given index path.

        .. deprecated::
            This method uses the deprecated BinaryANNIndex. For centralized indexes,
            use _get_centralized_binary_searcher() instead.

        Attempts to load an existing binary index from disk. If not found,
        returns None (binary index should be built during indexing).

        Args:
            index_path: Path to the _index.db file

        Returns:
            BinaryANNIndex instance or None if not available
        """
        try:
            import warnings
            # Suppress deprecation warning since we're using it intentionally for legacy support
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                from codexlens.semantic.ann_index import BinaryANNIndex

            binary_index = BinaryANNIndex(index_path, dim=256)
            if binary_index.load():
                return binary_index
            return None
        except Exception as exc:
            self.logger.debug("Failed to load binary index for %s: %s", index_path, exc)
            return None

    def _get_centralized_binary_searcher(self, index_root: Path) -> Optional[Any]:
        """Get centralized BinarySearcher for memory-mapped binary vectors.

        This is the preferred method for centralized indexes, providing faster
        search via memory-mapped files.

        Args:
            index_root: Root directory containing centralized index files

        Returns:
            BinarySearcher instance or None if not available
        """
        try:
            from codexlens.search.binary_searcher import BinarySearcher

            binary_searcher = BinarySearcher(index_root)
            if binary_searcher.load():
                self.logger.debug(
                    "Using centralized BinarySearcher with %d vectors (mmap=%s)",
                    binary_searcher.vector_count,
                    binary_searcher.is_memmap
                )
                return binary_searcher
            return None
        except Exception as exc:
            self.logger.debug("Failed to load centralized binary searcher: %s", exc)
            return None

    def _compute_cosine_similarity(
        self,
        query_vec: "np.ndarray",
        doc_vec: "np.ndarray",
    ) -> float:
        """Compute cosine similarity between query and document vectors.

        Args:
            query_vec: Query embedding vector
            doc_vec: Document embedding vector

        Returns:
            Cosine similarity score in range [-1, 1]
        """
        if not NUMPY_AVAILABLE:
            return 0.0

        # Ensure same shape
        min_len = min(len(query_vec), len(doc_vec))
        q = query_vec[:min_len]
        d = doc_vec[:min_len]

        # Compute cosine similarity
        dot_product = np.dot(q, d)
        norm_q = np.linalg.norm(q)
        norm_d = np.linalg.norm(d)

        if norm_q == 0 or norm_d == 0:
            return 0.0

        return float(dot_product / (norm_q * norm_d))

    def _compute_cosine_similarity_batch(
        self,
        query_vec: "np.ndarray",
        doc_matrix: "np.ndarray",
    ) -> "np.ndarray":
        """Compute cosine similarity between query and multiple document vectors.

        Uses vectorized matrix operations for efficient batch computation.

        Args:
            query_vec: Query embedding vector of shape (dim,)
            doc_matrix: Document embeddings matrix of shape (n_docs, dim)

        Returns:
            Array of cosine similarity scores of shape (n_docs,)
        """
        if not NUMPY_AVAILABLE:
            return np.zeros(doc_matrix.shape[0])

        # Ensure query is 1D
        if query_vec.ndim > 1:
            query_vec = query_vec.flatten()

        # Handle dimension mismatch by truncating to smaller dimension
        min_dim = min(len(query_vec), doc_matrix.shape[1])
        q = query_vec[:min_dim]
        docs = doc_matrix[:, :min_dim]

        # Compute query norm once
        norm_q = np.linalg.norm(q)
        if norm_q == 0:
            return np.zeros(docs.shape[0])

        # Normalize query
        q_normalized = q / norm_q

        # Compute document norms (vectorized)
        doc_norms = np.linalg.norm(docs, axis=1)

        # Avoid division by zero
        nonzero_mask = doc_norms > 0
        scores = np.zeros(docs.shape[0], dtype=np.float32)

        if np.any(nonzero_mask):
            # Normalize documents with non-zero norms
            docs_normalized = docs[nonzero_mask] / doc_norms[nonzero_mask, np.newaxis]

            # Batch dot product: (n_docs, dim) @ (dim,) = (n_docs,)
            scores[nonzero_mask] = docs_normalized @ q_normalized

        return scores
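
    # Worked example (added comment) for the batch cosine similarity above:
    # with query q = [1, 0] and documents [[1, 0], [0, 1], [2, 0]], the returned
    # scores are [1.0, 0.0, 1.0] - magnitude is normalized away, and an all-zero
    # document row would score 0.0 via the nonzero_mask guard.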

    def _build_results_from_candidates(
        self,
        candidates: List[Tuple[int, int, Path]],
        index_paths: List[Path],
        stats: SearchStats,
        query: str,
        start_time: float,
    ) -> ChainSearchResult:
        """Build ChainSearchResult from binary candidates using Hamming distance scores.

        Used as fallback when dense embeddings are not available.

        Args:
            candidates: List of (chunk_id, hamming_distance, index_path) tuples
            index_paths: List of all searched index paths
            stats: SearchStats to update
            query: Original query string
            start_time: Search start time for timing

        Returns:
            ChainSearchResult with results scored by Hamming distance
        """
        results: List[SearchResult] = []

        # Group by index path
        candidates_by_index: Dict[Path, List[Tuple[int, int]]] = {}
        for chunk_id, distance, index_path in candidates:
            if index_path not in candidates_by_index:
                candidates_by_index[index_path] = []
            candidates_by_index[index_path].append((chunk_id, distance))

        for index_path, chunk_tuples in candidates_by_index.items():
            try:
                store = SQLiteStore(index_path)
                chunk_ids = [c[0] for c in chunk_tuples]
                chunks_data = store.get_chunks_by_ids(chunk_ids)

                chunk_content: Dict[int, Dict[str, Any]] = {
                    c["id"]: c for c in chunks_data
                }

                for chunk_id, distance in chunk_tuples:
                    chunk_info = chunk_content.get(chunk_id)
                    if chunk_info is None:
                        continue

                    # Convert Hamming distance to score (lower distance = higher score)
                    # Max Hamming distance for 256-bit is 256
                    score = 1.0 - (distance / 256.0)

                    excerpt = chunk_info.get("content", "")[:500]
                    result = SearchResult(
                        path=chunk_info.get("file_path", ""),
                        score=float(score),
                        excerpt=excerpt,
                    )
                    results.append(result)

            except Exception as exc:
                self.logger.debug(
                    "Failed to build results from %s: %s", index_path, exc
                )

        # Deduplicate by path
        path_to_result: Dict[str, SearchResult] = {}
        for result in results:
            if result.path not in path_to_result or result.score > path_to_result[result.path].score:
                path_to_result[result.path] = result

        final_results = sorted(
            path_to_result.values(),
            key=lambda r: r.score,
            reverse=True,
        )

        stats.files_matched = len(final_results)
        stats.time_ms = (time.time() - start_time) * 1000

        return ChainSearchResult(
            query=query,
            results=final_results,
            symbols=[],
            stats=stats,
        )
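
    # Added comment: the score mapping above is linear in Hamming distance for
    # 256-bit vectors - distance 0 -> 1.0, 64 -> 0.75, 128 -> 0.5, 256 -> 0.0 -
    # so all fallback scores land in the [0, 1] range.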

    def _cross_encoder_rerank(
        self,
        query: str,
        results: List[SearchResult],
        top_k: int,
    ) -> List[SearchResult]:
        """Rerank results using cross-encoder model.

        Args:
            query: Search query string
            results: Candidate results to rerank
            top_k: Number of top results to return

        Returns:
            Reranked results sorted by cross-encoder score
        """
        if not results:
            return []

        # Try to get reranker from config or create new one
        reranker = None
        try:
            from codexlens.semantic.reranker import (
                check_reranker_available,
                get_reranker,
            )

            # Determine backend and model from config
            backend = "onnx"
            model_name = None
            use_gpu = True

            if self._config is not None:
                backend = getattr(self._config, "reranker_backend", "onnx") or "onnx"
                model_name = getattr(self._config, "reranker_model", None)
                use_gpu = getattr(self._config, "embedding_use_gpu", True)

            ok, err = check_reranker_available(backend)
            if not ok:
                self.logger.debug("Reranker backend unavailable (%s): %s", backend, err)
                return results[:top_k]

            # Create reranker
            kwargs = {}
            if backend == "onnx":
                kwargs["use_gpu"] = use_gpu

            reranker = get_reranker(backend=backend, model_name=model_name, **kwargs)

        except ImportError as exc:
            self.logger.debug("Reranker not available: %s", exc)
            return results[:top_k]
        except Exception as exc:
            self.logger.debug("Failed to initialize reranker: %s", exc)
            return results[:top_k]

        # Use cross_encoder_rerank from ranking module
        from codexlens.search.ranking import cross_encoder_rerank

        return cross_encoder_rerank(
            query=query,
            results=results,
            reranker=reranker,
            top_k=top_k,
            batch_size=32,
        )

    def search_files_only(self, query: str,
                          source_path: Path,
                          options: Optional[SearchOptions] = None) -> List[str]:
        """Search and return only matching file paths.

        Faster than full search when excerpts are not needed.

        Args:
            query: FTS5 search query string
            source_path: Starting directory path
            options: Search configuration (uses defaults if None)

        Returns:
            List of file paths as strings

        Examples:
            >>> engine = ChainSearchEngine(registry, mapper)
            >>> paths = engine.search_files_only("TODO", Path("D:/project"))
            >>> print(f"Found {len(paths)} files with TODOs")
        """
        options = options or SearchOptions()
        options.files_only = True

        result = self.search(query, source_path, options)
        return [r.path for r in result.results]

    def search_symbols(self, name: str,
                       source_path: Path,
                       kind: Optional[str] = None,
                       options: Optional[SearchOptions] = None) -> List[Symbol]:
        """Chain symbol search across directory hierarchy.

        Args:
            name: Symbol name pattern (partial match supported)
            source_path: Starting directory path
            kind: Optional symbol kind filter (e.g., 'function', 'class')
            options: Search configuration (uses defaults if None)

        Returns:
            List of Symbol objects sorted by name

        Examples:
            >>> engine = ChainSearchEngine(registry, mapper)
            >>> funcs = engine.search_symbols("init", Path("D:/project"), kind="function")
            >>> for sym in funcs[:10]:
            ...     print(f"{sym.name} ({sym.kind}): lines {sym.range}")
        """
        options = options or SearchOptions()

        start_index = self._find_start_index(source_path)
        if not start_index:
            self.logger.warning(f"No index found for {source_path}")
            return []

        # Fast path: project-wide global symbol index (avoids chain traversal).
        if self._config is None or getattr(self._config, "global_symbol_index_enabled", True):
            try:
                # Avoid relying on index_to_source() here; use the same logic as _find_start_index
                # to determine the effective search root directory.
                search_root = source_path.resolve()
                exact_index = self.mapper.source_to_index_db(search_root)
                if not exact_index.exists():
                    nearest = self.registry.find_nearest_index(search_root)
                    if nearest:
                        search_root = nearest.source_path

                project = self.registry.find_by_source_path(str(search_root))
                if project:
                    global_db_path = Path(project["index_root"]) / GlobalSymbolIndex.DEFAULT_DB_NAME
                    if global_db_path.exists():
                        query_limit = max(int(options.total_limit) * 10, int(options.total_limit))
                        with GlobalSymbolIndex(global_db_path, project_id=int(project["id"])) as global_index:
                            candidates = global_index.search(name=name, kind=kind, limit=query_limit)

                        # Apply depth constraint relative to the start index directory.
                        filtered: List[Symbol] = []
                        for sym in candidates:
                            if not sym.file:
                                continue
                            try:
                                root_str = str(search_root)
                                file_dir_str = str(Path(sym.file).parent)

                                # Normalize Windows long-path prefix (\\?\) if present.
                                if root_str.startswith("\\\\?\\"):
                                    root_str = root_str[4:]
                                if file_dir_str.startswith("\\\\?\\"):
                                    file_dir_str = file_dir_str[4:]

                                root_cmp = root_str.lower().rstrip("\\/")
                                dir_cmp = file_dir_str.lower().rstrip("\\/")

                                # Guard against Windows cross-drive comparisons (ValueError).
                                if os.name == "nt":
                                    root_drive, _ = os.path.splitdrive(root_cmp)
                                    dir_drive, _ = os.path.splitdrive(dir_cmp)
                                    if root_drive and dir_drive and root_drive != dir_drive:
                                        self.logger.debug(
                                            "Skipping symbol due to cross-drive path (root=%s file=%s name=%s)",
                                            root_cmp,
                                            sym.file,
                                            sym.name,
                                        )
                                        continue

                                if os.path.commonpath([root_cmp, dir_cmp]) != root_cmp:
                                    continue

                                rel = os.path.relpath(dir_cmp, root_cmp)
                                rel_depth = 0 if rel == "." else len(rel.split(os.sep))
                            except ValueError as exc:
                                self.logger.debug(
                                    "Skipping symbol due to path operation failure (root=%s file=%s name=%s): %s",
                                    str(search_root),
                                    sym.file,
                                    sym.name,
                                    exc,
                                )
                                continue
                            except Exception as exc:
                                self.logger.debug(
                                    "Skipping symbol due to unexpected path error (root=%s file=%s name=%s): %s",
                                    str(search_root),
                                    sym.file,
                                    sym.name,
                                    exc,
                                )
                                continue

                            if options.depth >= 0 and rel_depth > options.depth:
                                continue
                            filtered.append(sym)

                        if filtered:
                            # Match existing semantics: dedupe by (name, kind, range), sort by name.
                            seen = set()
                            unique_symbols: List[Symbol] = []
                            for sym in filtered:
                                key = (sym.name, sym.kind, sym.range)
                                if key in seen:
                                    continue
                                seen.add(key)
                                unique_symbols.append(sym)
                            unique_symbols.sort(key=lambda s: s.name)
                            return unique_symbols[: options.total_limit]
            except Exception as exc:
                self.logger.debug("Global symbol index fast path failed: %s", exc)

        index_paths = self._collect_index_paths(start_index, options.depth)
        if not index_paths:
            return []

        return self._search_symbols_parallel(
            index_paths, name, kind, options.total_limit
        )
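
    # Added comment: the fast-path depth filter above treats rel_depth as the
    # number of path components below the search root - e.g. with
    # search_root = D:/project and a symbol file in D:/project/a/b/mod.py,
    # rel_depth = 2, so the symbol is kept only when options.depth < 0 or
    # options.depth >= 2.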

    # === Internal Methods ===

    def _find_start_index(self, source_path: Path) -> Optional[Path]:
        """Find index database path for source directory.

        Attempts exact match first, then searches for nearest ancestor index.

        Args:
            source_path: Source directory path

        Returns:
            Path to _index.db file, or None if not found
        """
        source_path = source_path.resolve()

        # Try exact match first
        exact_index = self.mapper.source_to_index_db(source_path)
        if exact_index.exists():
            self.logger.debug(f"Found exact index: {exact_index}")
            return exact_index

        # Try nearest ancestor via registry
        nearest = self.registry.find_nearest_index(source_path)
        if nearest:
            self.logger.debug(f"Found nearest index: {nearest.index_path}")
            return nearest.index_path

        self.logger.warning(f"No index found for {source_path}")
        return None

    def _collect_index_paths(self, start_index: Path,
                             depth: int) -> List[Path]:
        """Recursively collect all subdirectory index paths.

        Traverses directory tree via subdirs table in each _index.db,
        respecting depth limit.

        Args:
            start_index: Starting _index.db path
            depth: Maximum depth (-1 = unlimited, 0 = current only)

        Returns:
            List of _index.db paths to search
        """
        collected = []
        visited = set()

        def _collect_recursive(index_path: Path, current_depth: int):
            # Normalize path to avoid duplicates
            normalized = index_path.resolve()
            if normalized in visited:
                return
            visited.add(normalized)

            # Add current index
            if normalized.exists():
                collected.append(normalized)
            else:
                self.logger.debug(f"Index does not exist: {normalized}")
                return

            # Check depth limit
            if depth >= 0 and current_depth >= depth:
                return

            # Read subdirs and recurse
            try:
                with DirIndexStore(normalized) as store:
                    subdirs = store.get_subdirs()
                    for subdir in subdirs:
                        _collect_recursive(subdir.index_path, current_depth + 1)
            except Exception as exc:
                self.logger.warning(f"Failed to read subdirs from {normalized}: {exc}")

        _collect_recursive(start_index, 0)
        self.logger.info(f"Collected {len(collected)} indexes (depth={depth})")
        return collected
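
    # Added comment: depth semantics used by _collect_index_paths() -
    # depth=0 collects only the starting _index.db, depth=1 adds its direct
    # subdirectories, and depth=-1 (or any negative value) recurses without limit.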

    def _search_parallel(self, index_paths: List[Path],
                         query: str,
                         options: SearchOptions) -> tuple[List[SearchResult], SearchStats]:
        """Search multiple indexes in parallel using shared ThreadPoolExecutor.

        Args:
            index_paths: List of _index.db paths to search
            query: FTS5 query string
            options: Search configuration

        Returns:
            Tuple of (all results, search statistics)
        """
        all_results = []
        stats = SearchStats()

        # Force single-threaded execution for vector/hybrid search to avoid GPU crashes
        # DirectML/ONNX have threading issues when multiple threads access GPU resources
        effective_workers = options.max_workers
        if options.enable_vector or options.hybrid_mode:
            effective_workers = 1
            self.logger.debug("Using single-threaded mode for vector search (GPU safety)")
            # Pre-load embedder to avoid initialization overhead per-search
            try:
                from codexlens.semantic.embedder import get_embedder
                get_embedder(profile="code", use_gpu=True)
            except Exception:
                pass  # Ignore pre-load failures

        executor = self._get_executor(effective_workers)
        # Submit all search tasks
        future_to_path = {
            executor.submit(
                self._search_single_index,
                idx_path,
                query,
                options.limit_per_dir,
                options.files_only,
                options.include_semantic,
                options.hybrid_mode,
                options.enable_fuzzy,
                options.enable_vector,
                options.pure_vector,
                options.hybrid_weights
            ): idx_path
            for idx_path in index_paths
        }

        # Collect results as they complete
        for future in as_completed(future_to_path):
            idx_path = future_to_path[future]
            try:
                results = future.result()
                all_results.extend(results)
                self.logger.debug(f"Got {len(results)} results from {idx_path.parent.name}")
            except Exception as exc:
                error_msg = f"Search failed for {idx_path}: {exc}"
                self.logger.error(error_msg)
                stats.errors.append(error_msg)

        return all_results, stats

    def _search_single_index(self, index_path: Path,
                             query: str,
                             limit: int,
                             files_only: bool = False,
                             include_semantic: bool = False,
                             hybrid_mode: bool = False,
                             enable_fuzzy: bool = True,
                             enable_vector: bool = False,
                             pure_vector: bool = False,
                             hybrid_weights: Optional[Dict[str, float]] = None) -> List[SearchResult]:
        """Search a single index database.

        Handles exceptions gracefully, returning empty list on failure.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string (for FTS) or natural language query (for vector)
            limit: Maximum results from this index
            files_only: If True, skip snippet generation for faster search
            include_semantic: If True, also search semantic keywords and merge results
            hybrid_mode: If True, use hybrid search with RRF fusion
            enable_fuzzy: Enable fuzzy FTS in hybrid mode
            enable_vector: Enable vector semantic search
            pure_vector: If True, only use vector search without FTS fallback
            hybrid_weights: Custom RRF weights for hybrid search

        Returns:
            List of SearchResult objects (empty on error)
        """
        try:
            # Use hybrid search if enabled
            if hybrid_mode:
                hybrid_engine = HybridSearchEngine(weights=hybrid_weights)
                fts_results = hybrid_engine.search(
                    index_path,
                    query,
                    limit=limit,
                    enable_fuzzy=enable_fuzzy,
                    enable_vector=enable_vector,
                    pure_vector=pure_vector,
                )
            else:
                # Single-FTS search (exact or fuzzy mode)
                with DirIndexStore(index_path) as store:
                    # Get FTS results
                    if files_only:
                        # Fast path: return paths only without snippets
                        paths = store.search_files_only(query, limit=limit)
                        fts_results = [SearchResult(path=p, score=0.0, excerpt="") for p in paths]
                    else:
                        # Use fuzzy FTS if enable_fuzzy=True (mode="fuzzy"), otherwise exact FTS
                        if enable_fuzzy:
                            fts_results = store.search_fts_fuzzy(
                                query, limit=limit, return_full_content=True
                            )
                        else:
                            fts_results = store.search_fts_exact(
                                query, limit=limit, return_full_content=True
                            )

                    # Optionally add semantic keyword results
                    if include_semantic:
                        try:
                            semantic_matches = store.search_semantic_keywords(query)
                            # Convert semantic matches to SearchResult with 0.8x weight
                            for file_entry, keywords in semantic_matches:
                                # Create excerpt from keywords
                                excerpt = f"Keywords: {', '.join(keywords[:5])}"
                                # Use a base score of 10.0 for semantic matches, weighted by 0.8
                                semantic_result = SearchResult(
                                    path=str(file_entry.full_path),
                                    score=10.0 * 0.8,
                                    excerpt=excerpt
                                )
                                fts_results.append(semantic_result)
                        except Exception as sem_exc:
                            self.logger.debug(f"Semantic search error in {index_path}: {sem_exc}")

            return fts_results
        except Exception as exc:
            self.logger.debug(f"Search error in {index_path}: {exc}")
            return []

    def _merge_and_rank(self, results: List[SearchResult],
                        limit: int) -> List[SearchResult]:
        """Aggregate, deduplicate, and rank results.

        Process:
        1. Deduplicate by path (keep highest score)
        2. Sort by score descending
        3. Limit to requested count

        Args:
            results: Raw results from all indexes
            limit: Maximum results to return

        Returns:
            Deduplicated and ranked results
        """
        # Deduplicate by path, keeping best score
        path_to_result: Dict[str, SearchResult] = {}
        for result in results:
            path = result.path
            if path not in path_to_result or result.score > path_to_result[path].score:
                path_to_result[path] = result

        # Sort by score descending
        unique_results = list(path_to_result.values())
        unique_results.sort(key=lambda r: r.score, reverse=True)

        # Apply limit
        return unique_results[:limit]
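
    # Worked example (added comment) for _merge_and_rank(): given results
    # [("a.py", 3.0), ("b.py", 1.0), ("a.py", 5.0)] as (path, score) pairs,
    # deduplication keeps a.py at 5.0, and the ranked output order is a.py, b.py.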

    def _search_symbols_parallel(self, index_paths: List[Path],
                                 name: str,
                                 kind: Optional[str],
                                 limit: int) -> List[Symbol]:
        """Search symbols across multiple indexes in parallel.

        Args:
            index_paths: List of _index.db paths to search
            name: Symbol name pattern
            kind: Optional symbol kind filter
            limit: Total symbol limit

        Returns:
            Deduplicated and sorted symbols
        """
        all_symbols = []

        executor = self._get_executor()
        # Submit all symbol search tasks
        future_to_path = {
            executor.submit(
                self._search_symbols_single,
                idx_path,
                name,
                kind
            ): idx_path
            for idx_path in index_paths
        }

        # Collect results
        for future in as_completed(future_to_path):
            try:
                symbols = future.result()
                all_symbols.extend(symbols)
            except Exception as exc:
                self.logger.error(f"Symbol search failed: {exc}")

        # Deduplicate by (name, kind, range)
        seen = set()
        unique_symbols = []
        for sym in all_symbols:
            key = (sym.name, sym.kind, sym.range)
            if key not in seen:
                seen.add(key)
                unique_symbols.append(sym)

        # Sort by name
        unique_symbols.sort(key=lambda s: s.name)

        return unique_symbols[:limit]

    def _search_symbols_single(self, index_path: Path,
                               name: str,
                               kind: Optional[str]) -> List[Symbol]:
        """Search symbols in a single index.

        Args:
            index_path: Path to _index.db file
            name: Symbol name pattern
            kind: Optional symbol kind filter

        Returns:
            List of Symbol objects (empty on error)
        """
        try:
            with DirIndexStore(index_path) as store:
                return store.search_symbols(name, kind=kind)
        except Exception as exc:
            self.logger.debug(f"Symbol search error in {index_path}: {exc}")
            return []


# === Convenience Functions ===

def quick_search(query: str,
                 source_path: Path,
                 depth: int = -1) -> List[SearchResult]:
    """Quick search convenience function with automatic initialization.

    Creates temporary registry and mapper instances for one-off searches.
    For repeated searches, create a ChainSearchEngine instance directly.

    Args:
        query: FTS5 search query string
        source_path: Starting directory path
        depth: Maximum search depth (-1 = unlimited)

    Returns:
        List of SearchResult objects sorted by relevance

    Examples:
        >>> from pathlib import Path
        >>> results = quick_search("authentication", Path("D:/project/src"))
        >>> print(f"Found {len(results)} matches")
    """
    registry = RegistryStore()
    registry.initialize()

    mapper = PathMapper()

    with ChainSearchEngine(registry, mapper) as engine:
        options = SearchOptions(depth=depth)
        result = engine.search(query, source_path, options)

    registry.close()

    return result.results