feat: Implement adaptive RRF weights and query intent detection

- Added integration tests for adaptive RRF weights in hybrid search.
- Enhanced query intent detection with new classifications: keyword, semantic, and mixed.
- Introduced symbol boosting in search results based on explicit symbol matches.
- Implemented embedding-based reranking with configurable options (see the usage sketch below).
- Added global symbol index for efficient symbol lookups across projects.
- Improved file deletion handling on Windows to avoid permission errors.
- Updated chunk configuration to increase overlap for better context.
- Modified package.json test script to target specific test files.
- Created comprehensive writing style guidelines for documentation.
- Added TypeScript tests for query intent detection and adaptive weights.
- Established performance benchmarks for global symbol indexing.
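
A minimal usage sketch of the new options (illustrative only; the module path
and the exact search() parameters are assumptions based on the hunks below):

from pathlib import Path
from codexlens.config import Config
from codexlens.search.hybrid import HybridSearchEngine  # module path assumed

config = Config()
config.enable_reranking = True       # opt in to embedding-based reranking
config.reranking_top_k = 50          # rerank at most the top 50 fused results
config.symbol_boost_factor = 1.5     # multiplicative boost for explicit symbol matches

engine = HybridSearchEngine(config=config)
results = engine.search(
    Path(".codexlens/index.db"),     # index_path, as passed to _search_parallel
    "how does RRF fusion handle ties?",
    limit=20,
)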
Author: catlog22
Date: 2025-12-26 15:08:47 +08:00
Parent: ecd5085e51
Commit: 4061ae48c4
29 changed files with 2685 additions and 828 deletions

View File

@@ -103,6 +103,11 @@ class Config:
# Indexing/search optimizations
global_symbol_index_enabled: bool = True # Enable project-wide symbol index fast path
# Optional search reranking (disabled by default)
enable_reranking: bool = False
reranking_top_k: int = 50
symbol_boost_factor: float = 1.5
# Multi-endpoint configuration for litellm backend
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
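
A hedged example of the endpoint list shape documented in the comment above
(model names, keys, and URLs are placeholders):

config.embedding_endpoints = [
    {"model": "text-embedding-3-small", "api_key": "sk-primary", "api_base": "https://api.example.com/v1", "weight": 1.0},
    {"model": "text-embedding-3-small", "api_key": "sk-backup", "api_base": "https://backup.example.com/v1", "weight": 0.5},
]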

View File

@@ -7,12 +7,38 @@ results via Reciprocal Rank Fusion (RRF) algorithm.
from __future__ import annotations
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import contextmanager
from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
@contextmanager
def timer(name: str, logger: logging.Logger, level: int = logging.DEBUG):
"""Context manager for timing code blocks.
Args:
name: Name of the operation being timed
logger: Logger instance to use
level: Logging level (default DEBUG)
"""
start = time.perf_counter()
try:
yield
finally:
elapsed_ms = (time.perf_counter() - start) * 1000
logger.log(level, "[TIMING] %s: %.2fms", name, elapsed_ms)
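
The helper wraps the call sites added further down; a sketch of a timed block
and the DEBUG line it emits (elapsed value illustrative):

with timer("query_embedding", logging.getLogger(__name__)):
    query_embedding = embedder.embed_single(query)
# emits: [TIMING] query_embedding: 12.34ms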
from codexlens.config import Config
from codexlens.entities import SearchResult
-from codexlens.search.ranking import reciprocal_rank_fusion, tag_search_source
+from codexlens.search.ranking import (
+    apply_symbol_boost,
+    get_rrf_weights,
+    reciprocal_rank_fusion,
+    rerank_results,
+    tag_search_source,
+)
from codexlens.storage.dir_index import DirIndexStore
@@ -34,14 +60,23 @@ class HybridSearchEngine:
"vector": 0.6,
}
-def __init__(self, weights: Optional[Dict[str, float]] = None):
+def __init__(
+    self,
+    weights: Optional[Dict[str, float]] = None,
+    config: Optional[Config] = None,
+    embedder: Any = None,
+):
"""Initialize hybrid search engine.
Args:
weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS)
config: Optional runtime config (enables optional reranking features)
embedder: Optional embedder instance for embedding-based reranking
"""
self.logger = logging.getLogger(__name__)
self.weights = weights or self.DEFAULT_WEIGHTS.copy()
self._config = config
self.embedder = embedder
def search(
self,
@@ -101,7 +136,8 @@ class HybridSearchEngine:
backends["vector"] = True
# Execute parallel searches
-results_map = self._search_parallel(index_path, query, backends, limit)
+with timer("parallel_search_total", self.logger):
+    results_map = self._search_parallel(index_path, query, backends, limit)
# Provide helpful message if pure-vector mode returns no results
if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
@@ -120,11 +156,72 @@ class HybridSearchEngine:
if source in results_map
}
-fused_results = reciprocal_rank_fusion(results_map, active_weights)
+with timer("rrf_fusion", self.logger):
+    adaptive_weights = get_rrf_weights(query, active_weights)
+    fused_results = reciprocal_rank_fusion(results_map, adaptive_weights)
# Optional: boost results that include explicit symbol matches
boost_factor = (
self._config.symbol_boost_factor
if self._config is not None
else 1.5
)
with timer("symbol_boost", self.logger):
fused_results = apply_symbol_boost(
fused_results, boost_factor=boost_factor
)
# Optional: embedding-based reranking on top results
if self._config is not None and self._config.enable_reranking:
with timer("reranking", self.logger):
if self.embedder is None:
self.embedder = self._get_reranking_embedder()
fused_results = rerank_results(
query,
fused_results[:100],
self.embedder,
top_k=self._config.reranking_top_k,
)
# Apply final limit
return fused_results[:limit]
def _get_reranking_embedder(self) -> Any:
"""Create an embedder for reranking based on Config embedding settings."""
if self._config is None:
return None
try:
from codexlens.semantic.factory import get_embedder
except Exception as exc:
self.logger.debug("Reranking embedder unavailable: %s", exc)
return None
try:
if self._config.embedding_backend == "fastembed":
return get_embedder(
backend="fastembed",
profile=self._config.embedding_model,
use_gpu=self._config.embedding_use_gpu,
)
if self._config.embedding_backend == "litellm":
return get_embedder(
backend="litellm",
model=self._config.embedding_model,
endpoints=self._config.embedding_endpoints,
strategy=self._config.embedding_strategy,
cooldown=self._config.embedding_cooldown,
)
except Exception as exc:
self.logger.debug("Failed to initialize reranking embedder: %s", exc)
return None
self.logger.debug(
"Unknown embedding backend for reranking: %s",
self._config.embedding_backend,
)
return None
def _search_parallel(
self,
index_path: Path,
@@ -144,25 +241,30 @@ class HybridSearchEngine:
Dictionary mapping source name to results list
"""
results_map: Dict[str, List[SearchResult]] = {}
timing_data: Dict[str, float] = {}
# Use ThreadPoolExecutor for parallel I/O-bound searches
with ThreadPoolExecutor(max_workers=len(backends)) as executor:
# Submit search tasks
# Submit search tasks with timing
future_to_source = {}
submit_times = {}
if backends.get("exact"):
submit_times["exact"] = time.perf_counter()
future = executor.submit(
self._search_exact, index_path, query, limit
)
future_to_source[future] = "exact"
if backends.get("fuzzy"):
submit_times["fuzzy"] = time.perf_counter()
future = executor.submit(
self._search_fuzzy, index_path, query, limit
)
future_to_source[future] = "fuzzy"
if backends.get("vector"):
submit_times["vector"] = time.perf_counter()
future = executor.submit(
self._search_vector, index_path, query, limit
)
@@ -171,18 +273,26 @@ class HybridSearchEngine:
# Collect results as they complete
for future in as_completed(future_to_source):
source = future_to_source[future]
elapsed_ms = (time.perf_counter() - submit_times[source]) * 1000
timing_data[source] = elapsed_ms
try:
results = future.result()
# Tag results with source for debugging
tagged_results = tag_search_source(results, source)
results_map[source] = tagged_results
self.logger.debug(
"Got %d results from %s search", len(results), source
"[TIMING] %s_search: %.2fms (%d results)",
source, elapsed_ms, len(results)
)
except Exception as exc:
self.logger.error("Search failed for %s: %s", source, exc)
results_map[source] = []
# Log timing summary
if timing_data:
timing_str = ", ".join(f"{k}={v:.1f}ms" for k, v in timing_data.items())
self.logger.debug("[TIMING] search_backends: {%s}", timing_str)
return results_map
def _search_exact(
@@ -245,6 +355,8 @@ class HybridSearchEngine:
try:
# Check if semantic chunks table exists
import sqlite3
start_check = time.perf_counter()
try:
with sqlite3.connect(index_path) as conn:
cursor = conn.execute(
@@ -254,6 +366,10 @@ class HybridSearchEngine:
except sqlite3.Error as e:
self.logger.error("Database check failed in vector search: %s", e)
return []
self.logger.debug(
"[TIMING] vector_table_check: %.2fms",
(time.perf_counter() - start_check) * 1000
)
if not has_semantic_table:
self.logger.info(
@@ -267,7 +383,12 @@ class HybridSearchEngine:
from codexlens.semantic.factory import get_embedder
from codexlens.semantic.vector_store import VectorStore
start_init = time.perf_counter()
vector_store = VectorStore(index_path)
self.logger.debug(
"[TIMING] vector_store_init: %.2fms",
(time.perf_counter() - start_init) * 1000
)
# Check if vector store has data
if vector_store.count_chunks() == 0:
@@ -279,6 +400,7 @@ class HybridSearchEngine:
return []
# Get stored model configuration (preferred) or auto-detect from dimension
start_embedder = time.perf_counter()
model_config = vector_store.get_model_config()
if model_config:
backend = model_config.get("backend", "fastembed")
@@ -288,7 +410,7 @@ class HybridSearchEngine:
"Using stored model config: %s backend, %s (%s, %dd)",
backend, model_profile, model_name, model_config["embedding_dim"]
)
# Get embedder based on backend
if backend == "litellm":
embedder = get_embedder(backend="litellm", model=model_name)
@@ -324,21 +446,32 @@ class HybridSearchEngine:
detected_dim
)
embedder = get_embedder(backend="fastembed", profile="code")
self.logger.debug(
"[TIMING] embedder_init: %.2fms",
(time.perf_counter() - start_embedder) * 1000
)
# Generate query embedding
start_embed = time.perf_counter()
query_embedding = embedder.embed_single(query)
self.logger.debug(
"[TIMING] query_embedding: %.2fms",
(time.perf_counter() - start_embed) * 1000
)
# Search for similar chunks
start_search = time.perf_counter()
results = vector_store.search_similar(
query_embedding=query_embedding,
top_k=limit,
min_score=0.0, # Return all results, let RRF handle filtering
return_full_content=True,
)
self.logger.debug(
"[TIMING] vector_similarity_search: %.2fms (%d results)",
(time.perf_counter() - start_search) * 1000, len(results)
)
self.logger.debug("Vector search found %d results", len(results))
return results
except ImportError as exc:

View File

@@ -6,12 +6,98 @@ for combining results from heterogeneous search backends (exact FTS, fuzzy FTS,
from __future__ import annotations
import re
import math
from typing import Dict, List
from enum import Enum
from typing import Any, Dict, List
from codexlens.entities import SearchResult, AdditionalLocation
class QueryIntent(str, Enum):
"""Query intent for adaptive RRF weights (Python/TypeScript parity)."""
KEYWORD = "keyword"
SEMANTIC = "semantic"
MIXED = "mixed"
def normalize_weights(weights: Dict[str, float]) -> Dict[str, float]:
"""Normalize weights to sum to 1.0 (best-effort)."""
total = sum(float(v) for v in weights.values() if v is not None)
if not math.isfinite(total) or total <= 0:
return {k: float(v) for k, v in weights.items()}
return {k: float(v) / total for k, v in weights.items()}
def detect_query_intent(query: str) -> QueryIntent:
"""Detect whether a query is code-like, natural-language, or mixed.
Heuristic signals kept aligned with `ccw/src/tools/smart-search.ts`.
"""
trimmed = (query or "").strip()
if not trimmed:
return QueryIntent.MIXED
lower = trimmed.lower()
word_count = len([w for w in re.split(r"\s+", trimmed) if w])
has_code_signals = bool(
re.search(r"(::|->|\.)", trimmed)
or re.search(r"[A-Z][a-z]+[A-Z]", trimmed)
or re.search(r"\b\w+_\w+\b", trimmed)
or re.search(
r"\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b",
lower,
flags=re.IGNORECASE,
)
)
has_natural_signals = bool(
word_count > 5
or "?" in trimmed
or re.search(r"\b(how|what|why|when|where)\b", trimmed, flags=re.IGNORECASE)
or re.search(
r"\b(handle|explain|fix|implement|create|build|use|find|search|convert|parse|generate|support)\b",
trimmed,
flags=re.IGNORECASE,
)
)
if has_code_signals and has_natural_signals:
return QueryIntent.MIXED
if has_code_signals:
return QueryIntent.KEYWORD
if has_natural_signals:
return QueryIntent.SEMANTIC
return QueryIntent.MIXED
def adjust_weights_by_intent(
intent: QueryIntent,
base_weights: Dict[str, float],
) -> Dict[str, float]:
"""Map intent → weights (kept aligned with TypeScript mapping)."""
if intent == QueryIntent.KEYWORD:
target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}
elif intent == QueryIntent.SEMANTIC:
target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}
else:
target = dict(base_weights)
# Preserve only keys that are present in base_weights (active backends).
keys = list(base_weights.keys())
filtered = {k: float(target.get(k, 0.0)) for k in keys}
return normalize_weights(filtered)
def get_rrf_weights(
query: str,
base_weights: Dict[str, float],
) -> Dict[str, float]:
"""Compute adaptive RRF weights from query intent."""
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
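
A worked example of the adaptive weighting (illustrative, not part of the patch;
assumes all three backends are active):

base = {"exact": 0.4, "fuzzy": 0.2, "vector": 0.6}

# "::" and CamelCase are code signals, so intent is KEYWORD.
get_rrf_weights("DirIndexStore::open_index", base)
# -> {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}

# Eight words plus a leading "how" are natural-language signals, so intent is SEMANTIC.
get_rrf_weights("how does the fusion of search results work?", base)
# -> {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}

Both target mappings already sum to 1.0, so normalize_weights() returns them unchanged.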
def reciprocal_rank_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Dict[str, float] = None,
@@ -102,6 +188,186 @@ def reciprocal_rank_fusion(
return fused_results
def apply_symbol_boost(
results: List[SearchResult],
boost_factor: float = 1.5,
) -> List[SearchResult]:
"""Boost fused scores for results that include an explicit symbol match.
The boost is multiplicative on the current result.score (typically the RRF fusion score).
When boosted, the original score is preserved in metadata["original_fusion_score"] and
metadata["boosted"] is set to True.
"""
if not results:
return []
if boost_factor <= 1.0:
# Still return new objects to follow immutable transformation pattern.
return [
SearchResult(
path=r.path,
score=r.score,
excerpt=r.excerpt,
content=r.content,
symbol=r.symbol,
chunk=r.chunk,
metadata={**r.metadata},
start_line=r.start_line,
end_line=r.end_line,
symbol_name=r.symbol_name,
symbol_kind=r.symbol_kind,
additional_locations=list(r.additional_locations),
)
for r in results
]
boosted_results: List[SearchResult] = []
for result in results:
has_symbol = bool(result.symbol_name)
original_score = float(result.score)
boosted_score = original_score * boost_factor if has_symbol else original_score
metadata = {**result.metadata}
if has_symbol:
metadata.setdefault("original_fusion_score", metadata.get("fusion_score", original_score))
metadata["boosted"] = True
metadata["symbol_boost_factor"] = boost_factor
boosted_results.append(
SearchResult(
path=result.path,
score=boosted_score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata=metadata,
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
additional_locations=list(result.additional_locations),
)
)
boosted_results.sort(key=lambda r: r.score, reverse=True)
return boosted_results
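
Worked numbers for the boost (illustrative): a fused result with symbol_name set
and a fusion score of 0.032, boosted with the default factor:

# before: score = 0.032, symbol_name = "reciprocal_rank_fusion"
# after apply_symbol_boost(results, boost_factor=1.5):
#   score = 0.032 * 1.5 = 0.048
#   metadata["original_fusion_score"] = 0.032
#   metadata["boosted"] = True
#   metadata["symbol_boost_factor"] = 1.5
# results without a symbol_name keep their score and metadata unchanged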
def rerank_results(
query: str,
results: List[SearchResult],
embedder: Any,
top_k: int = 50,
) -> List[SearchResult]:
"""Re-rank results with embedding cosine similarity, combined with current score.
Combined score formula:
0.5 * rrf_score + 0.5 * cosine_similarity
If embedder is None or embedding fails, returns results as-is.
"""
if not results:
return []
if embedder is None or top_k <= 0:
return results
rerank_count = min(int(top_k), len(results))
def cosine_similarity(vec_a: List[float], vec_b: List[float]) -> float:
# Defensive: handle mismatched lengths and zero vectors.
n = min(len(vec_a), len(vec_b))
if n == 0:
return 0.0
dot = 0.0
norm_a = 0.0
norm_b = 0.0
for i in range(n):
a = float(vec_a[i])
b = float(vec_b[i])
dot += a * b
norm_a += a * a
norm_b += b * b
if norm_a <= 0.0 or norm_b <= 0.0:
return 0.0
sim = dot / (math.sqrt(norm_a) * math.sqrt(norm_b))
# SearchResult.score requires non-negative scores; clamp cosine similarity to [0, 1].
return max(0.0, min(1.0, sim))
def text_for_embedding(r: SearchResult) -> str:
if r.excerpt and r.excerpt.strip():
return r.excerpt
if r.content and r.content.strip():
return r.content
if r.chunk and r.chunk.content and r.chunk.content.strip():
return r.chunk.content
# Fallback: stable, non-empty text.
return r.symbol_name or r.path
try:
if hasattr(embedder, "embed_single"):
query_vec = embedder.embed_single(query)
else:
query_vec = embedder.embed(query)[0]
doc_texts = [text_for_embedding(r) for r in results[:rerank_count]]
doc_vecs = embedder.embed(doc_texts)
except Exception:
return results
reranked_results: List[SearchResult] = []
for idx, result in enumerate(results):
if idx < rerank_count:
rrf_score = float(result.score)
sim = cosine_similarity(query_vec, doc_vecs[idx])
combined_score = 0.5 * rrf_score + 0.5 * sim
reranked_results.append(
SearchResult(
path=result.path,
score=combined_score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata={
**result.metadata,
"rrf_score": rrf_score,
"cosine_similarity": sim,
"reranked": True,
},
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
additional_locations=list(result.additional_locations),
)
)
else:
# Preserve remaining results without re-ranking, but keep immutability.
reranked_results.append(
SearchResult(
path=result.path,
score=result.score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata={**result.metadata},
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
additional_locations=list(result.additional_locations),
)
)
reranked_results.sort(key=lambda r: r.score, reverse=True)
return reranked_results
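
Worked numbers for the combined score (illustrative): a result with
rrf_score = 0.04 whose text embeds to cosine_similarity = 0.82 against the query:

combined_score = 0.5 * 0.04 + 0.5 * 0.82  # = 0.02 + 0.41 = 0.43

Since RRF fusion scores are typically far below 1.0 while the clamped cosine term
spans [0, 1], strong semantic matches can leapfrog higher-ranked fused results.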
def normalize_bm25_score(score: float) -> float:
"""Normalize BM25 scores from SQLite FTS5 to 0-1 range.

View File

@@ -392,6 +392,22 @@ class HybridChunker:
filtered.append(symbol)
return filtered
def _find_parent_symbol(
self,
start_line: int,
end_line: int,
symbols: List[Symbol],
) -> Optional[Symbol]:
"""Find the smallest symbol range that fully contains a docstring span."""
candidates: List[Symbol] = []
for symbol in symbols:
sym_start, sym_end = symbol.range
if sym_start <= start_line and end_line <= sym_end:
candidates.append(symbol)
if not candidates:
return None
return min(candidates, key=lambda s: (s.range[1] - s.range[0], s.range[0]))
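
An illustrative selection with hypothetical symbol ranges:

# docstring span: lines 21-23
# containing candidates: module (1, 200), class Foo (10, 80), method bar (20, 30)
# range spans: 199, 70, 10 -> bar is returned (smallest span; start line breaks ties)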
def chunk_file(
self,
content: str,
@@ -414,24 +430,53 @@ class HybridChunker:
chunks: List[SemanticChunk] = []
# Step 1: Extract docstrings as dedicated chunks
-docstrings = self.docstring_extractor.extract_docstrings(content, language)
+docstrings: List[Tuple[str, int, int]] = []
if language == "python":
# Fast path: avoid expensive docstring extraction if delimiters are absent.
if '"""' in content or "'''" in content:
docstrings = self.docstring_extractor.extract_docstrings(content, language)
elif language in {"javascript", "typescript"}:
if "/**" in content:
docstrings = self.docstring_extractor.extract_docstrings(content, language)
else:
docstrings = self.docstring_extractor.extract_docstrings(content, language)
# Fast path: no docstrings -> delegate to base chunker directly.
if not docstrings:
if symbols:
base_chunks = self.base_chunker.chunk_by_symbol(
content, symbols, file_path, language, symbol_token_counts
)
else:
base_chunks = self.base_chunker.chunk_sliding_window(content, file_path, language)
for chunk in base_chunks:
chunk.metadata["strategy"] = "hybrid"
chunk.metadata["chunk_type"] = "code"
return base_chunks
for docstring_content, start_line, end_line in docstrings:
if len(docstring_content.strip()) >= self.config.min_chunk_size:
parent_symbol = self._find_parent_symbol(start_line, end_line, symbols)
# Use base chunker's token estimation method
token_count = self.base_chunker._estimate_token_count(docstring_content)
metadata = {
"file": str(file_path),
"language": language,
"chunk_type": "docstring",
"start_line": start_line,
"end_line": end_line,
"strategy": "hybrid",
"token_count": token_count,
}
if parent_symbol is not None:
metadata["parent_symbol"] = parent_symbol.name
metadata["parent_symbol_kind"] = parent_symbol.kind
metadata["parent_symbol_range"] = parent_symbol.range
chunks.append(SemanticChunk(
content=docstring_content,
embedding=None,
-    metadata={
-        "file": str(file_path),
-        "language": language,
-        "chunk_type": "docstring",
-        "start_line": start_line,
-        "end_line": end_line,
-        "strategy": "hybrid",
-        "token_count": token_count,
-    }
+    metadata=metadata
))
# Step 2: Get line ranges occupied by docstrings