feat: Implement adaptive RRF weights and query intent detection

- Added integration tests for adaptive RRF weights in hybrid search.
- Enhanced query intent detection with new classifications: keyword, semantic, and mixed.
- Introduced symbol boosting in search results based on explicit symbol matches.
- Implemented embedding-based reranking with configurable options (see the usage sketch below).
- Added global symbol index for efficient symbol lookups across projects.
- Improved file deletion handling on Windows to avoid permission errors.
- Updated chunk configuration to increase overlap for better context.
- Modified package.json test script to target specific test files.
- Created comprehensive writing style guidelines for documentation.
- Added TypeScript tests for query intent detection and adaptive weights.
- Established performance benchmarks for global symbol indexing.
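
A minimal usage sketch of the new options (illustrative only; the module path
and the exact search() parameters are assumptions based on the hunks below):

from pathlib import Path
from codexlens.config import Config
from codexlens.search.hybrid import HybridSearchEngine  # module path assumed

config = Config()
config.enable_reranking = True       # opt in to embedding-based reranking
config.reranking_top_k = 50          # rerank at most the top 50 fused results
config.symbol_boost_factor = 1.5     # multiplicative boost for explicit symbol matches

engine = HybridSearchEngine(config=config)
results = engine.search(
    Path(".codexlens/index.db"),     # index_path, as passed to _search_parallel
    "how does RRF fusion handle ties?",
    limit=20,
)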
Author: catlog22
Date: 2025-12-26 15:08:47 +08:00
Parent: ecd5085e51
Commit: 4061ae48c4
29 changed files with 2685 additions and 828 deletions

View File

@@ -103,6 +103,11 @@ class Config:
# Indexing/search optimizations
global_symbol_index_enabled: bool = True # Enable project-wide symbol index fast path
# Optional search reranking (disabled by default)
enable_reranking: bool = False
reranking_top_k: int = 50
symbol_boost_factor: float = 1.5
# Multi-endpoint configuration for litellm backend
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
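
A hedged example of the endpoint list shape documented in the comment above
(model names, keys, and URLs are placeholders):

config.embedding_endpoints = [
    {"model": "text-embedding-3-small", "api_key": "sk-primary", "api_base": "https://api.example.com/v1", "weight": 1.0},
    {"model": "text-embedding-3-small", "api_key": "sk-backup", "api_base": "https://backup.example.com/v1", "weight": 0.5},
]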

View File

@@ -7,12 +7,38 @@ results via Reciprocal Rank Fusion (RRF) algorithm.
from __future__ import annotations
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import contextmanager
from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
@contextmanager
def timer(name: str, logger: logging.Logger, level: int = logging.DEBUG):
"""Context manager for timing code blocks.
Args:
name: Name of the operation being timed
logger: Logger instance to use
level: Logging level (default DEBUG)
"""
start = time.perf_counter()
try:
yield
finally:
elapsed_ms = (time.perf_counter() - start) * 1000
logger.log(level, "[TIMING] %s: %.2fms", name, elapsed_ms)
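
The helper wraps the call sites added further down; a sketch of a timed block
and the DEBUG line it emits (elapsed value illustrative):

with timer("query_embedding", logging.getLogger(__name__)):
    query_embedding = embedder.embed_single(query)
# emits: [TIMING] query_embedding: 12.34ms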
from codexlens.config import Config
from codexlens.entities import SearchResult
-from codexlens.search.ranking import reciprocal_rank_fusion, tag_search_source
+from codexlens.search.ranking import (
+    apply_symbol_boost,
+    get_rrf_weights,
+    reciprocal_rank_fusion,
+    rerank_results,
+    tag_search_source,
+)
from codexlens.storage.dir_index import DirIndexStore
@@ -34,14 +60,23 @@ class HybridSearchEngine:
"vector": 0.6,
}
-def __init__(self, weights: Optional[Dict[str, float]] = None):
+def __init__(
+    self,
+    weights: Optional[Dict[str, float]] = None,
+    config: Optional[Config] = None,
+    embedder: Any = None,
+):
"""Initialize hybrid search engine.
Args:
weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS)
config: Optional runtime config (enables optional reranking features)
embedder: Optional embedder instance for embedding-based reranking
"""
self.logger = logging.getLogger(__name__)
self.weights = weights or self.DEFAULT_WEIGHTS.copy()
self._config = config
self.embedder = embedder
def search(
self,
@@ -101,7 +136,8 @@ class HybridSearchEngine:
backends["vector"] = True
# Execute parallel searches
-results_map = self._search_parallel(index_path, query, backends, limit)
+with timer("parallel_search_total", self.logger):
+    results_map = self._search_parallel(index_path, query, backends, limit)
# Provide helpful message if pure-vector mode returns no results
if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
@@ -120,11 +156,72 @@ class HybridSearchEngine:
if source in results_map
}
-fused_results = reciprocal_rank_fusion(results_map, active_weights)
+with timer("rrf_fusion", self.logger):
+    adaptive_weights = get_rrf_weights(query, active_weights)
+    fused_results = reciprocal_rank_fusion(results_map, adaptive_weights)
# Optional: boost results that include explicit symbol matches
boost_factor = (
self._config.symbol_boost_factor
if self._config is not None
else 1.5
)
with timer("symbol_boost", self.logger):
fused_results = apply_symbol_boost(
fused_results, boost_factor=boost_factor
)
# Optional: embedding-based reranking on top results
if self._config is not None and self._config.enable_reranking:
with timer("reranking", self.logger):
if self.embedder is None:
self.embedder = self._get_reranking_embedder()
fused_results = rerank_results(
query,
fused_results[:100],
self.embedder,
top_k=self._config.reranking_top_k,
)
# Apply final limit
return fused_results[:limit]
def _get_reranking_embedder(self) -> Any:
"""Create an embedder for reranking based on Config embedding settings."""
if self._config is None:
return None
try:
from codexlens.semantic.factory import get_embedder
except Exception as exc:
self.logger.debug("Reranking embedder unavailable: %s", exc)
return None
try:
if self._config.embedding_backend == "fastembed":
return get_embedder(
backend="fastembed",
profile=self._config.embedding_model,
use_gpu=self._config.embedding_use_gpu,
)
if self._config.embedding_backend == "litellm":
return get_embedder(
backend="litellm",
model=self._config.embedding_model,
endpoints=self._config.embedding_endpoints,
strategy=self._config.embedding_strategy,
cooldown=self._config.embedding_cooldown,
)
except Exception as exc:
self.logger.debug("Failed to initialize reranking embedder: %s", exc)
return None
self.logger.debug(
"Unknown embedding backend for reranking: %s",
self._config.embedding_backend,
)
return None
def _search_parallel(
self,
index_path: Path,
@@ -144,25 +241,30 @@ class HybridSearchEngine:
Dictionary mapping source name to results list
"""
results_map: Dict[str, List[SearchResult]] = {}
timing_data: Dict[str, float] = {}
# Use ThreadPoolExecutor for parallel I/O-bound searches
with ThreadPoolExecutor(max_workers=len(backends)) as executor:
# Submit search tasks
# Submit search tasks with timing
future_to_source = {}
submit_times = {}
if backends.get("exact"):
submit_times["exact"] = time.perf_counter()
future = executor.submit(
self._search_exact, index_path, query, limit
)
future_to_source[future] = "exact"
if backends.get("fuzzy"):
submit_times["fuzzy"] = time.perf_counter()
future = executor.submit(
self._search_fuzzy, index_path, query, limit
)
future_to_source[future] = "fuzzy"
if backends.get("vector"):
submit_times["vector"] = time.perf_counter()
future = executor.submit(
self._search_vector, index_path, query, limit
)
@@ -171,18 +273,26 @@ class HybridSearchEngine:
# Collect results as they complete
for future in as_completed(future_to_source):
source = future_to_source[future]
elapsed_ms = (time.perf_counter() - submit_times[source]) * 1000
timing_data[source] = elapsed_ms
try:
results = future.result()
# Tag results with source for debugging
tagged_results = tag_search_source(results, source)
results_map[source] = tagged_results
self.logger.debug(
"Got %d results from %s search", len(results), source
"[TIMING] %s_search: %.2fms (%d results)",
source, elapsed_ms, len(results)
)
except Exception as exc:
self.logger.error("Search failed for %s: %s", source, exc)
results_map[source] = []
# Log timing summary
if timing_data:
timing_str = ", ".join(f"{k}={v:.1f}ms" for k, v in timing_data.items())
self.logger.debug("[TIMING] search_backends: {%s}", timing_str)
return results_map
def _search_exact(
@@ -245,6 +355,8 @@ class HybridSearchEngine:
try:
# Check if semantic chunks table exists
import sqlite3
start_check = time.perf_counter()
try:
with sqlite3.connect(index_path) as conn:
cursor = conn.execute(
@@ -254,6 +366,10 @@ class HybridSearchEngine:
except sqlite3.Error as e:
self.logger.error("Database check failed in vector search: %s", e)
return []
self.logger.debug(
"[TIMING] vector_table_check: %.2fms",
(time.perf_counter() - start_check) * 1000
)
if not has_semantic_table:
self.logger.info(
@@ -267,7 +383,12 @@ class HybridSearchEngine:
from codexlens.semantic.factory import get_embedder
from codexlens.semantic.vector_store import VectorStore
start_init = time.perf_counter()
vector_store = VectorStore(index_path)
self.logger.debug(
"[TIMING] vector_store_init: %.2fms",
(time.perf_counter() - start_init) * 1000
)
# Check if vector store has data
if vector_store.count_chunks() == 0:
@@ -279,6 +400,7 @@ class HybridSearchEngine:
return []
# Get stored model configuration (preferred) or auto-detect from dimension
start_embedder = time.perf_counter()
model_config = vector_store.get_model_config()
if model_config:
backend = model_config.get("backend", "fastembed")
@@ -288,7 +410,7 @@ class HybridSearchEngine:
"Using stored model config: %s backend, %s (%s, %dd)",
backend, model_profile, model_name, model_config["embedding_dim"]
)
# Get embedder based on backend
if backend == "litellm":
embedder = get_embedder(backend="litellm", model=model_name)
@@ -324,21 +446,32 @@ class HybridSearchEngine:
detected_dim
)
embedder = get_embedder(backend="fastembed", profile="code")
self.logger.debug(
"[TIMING] embedder_init: %.2fms",
(time.perf_counter() - start_embedder) * 1000
)
# Generate query embedding
start_embed = time.perf_counter()
query_embedding = embedder.embed_single(query)
self.logger.debug(
"[TIMING] query_embedding: %.2fms",
(time.perf_counter() - start_embed) * 1000
)
# Search for similar chunks
start_search = time.perf_counter()
results = vector_store.search_similar(
query_embedding=query_embedding,
top_k=limit,
min_score=0.0, # Return all results, let RRF handle filtering
return_full_content=True,
)
self.logger.debug(
"[TIMING] vector_similarity_search: %.2fms (%d results)",
(time.perf_counter() - start_search) * 1000, len(results)
)
self.logger.debug("Vector search found %d results", len(results))
return results
except ImportError as exc:

View File

@@ -6,12 +6,98 @@ for combining results from heterogeneous search backends (exact FTS, fuzzy FTS,
from __future__ import annotations
import re
import math
from typing import Dict, List
from enum import Enum
from typing import Any, Dict, List
from codexlens.entities import SearchResult, AdditionalLocation
class QueryIntent(str, Enum):
"""Query intent for adaptive RRF weights (Python/TypeScript parity)."""
KEYWORD = "keyword"
SEMANTIC = "semantic"
MIXED = "mixed"
def normalize_weights(weights: Dict[str, float]) -> Dict[str, float]:
"""Normalize weights to sum to 1.0 (best-effort)."""
total = sum(float(v) for v in weights.values() if v is not None)
if not math.isfinite(total) or total <= 0:
return {k: float(v) for k, v in weights.items()}
return {k: float(v) / total for k, v in weights.items()}
def detect_query_intent(query: str) -> QueryIntent:
"""Detect whether a query is code-like, natural-language, or mixed.
Heuristic signals kept aligned with `ccw/src/tools/smart-search.ts`.
"""
trimmed = (query or "").strip()
if not trimmed:
return QueryIntent.MIXED
lower = trimmed.lower()
word_count = len([w for w in re.split(r"\s+", trimmed) if w])
has_code_signals = bool(
re.search(r"(::|->|\.)", trimmed)
or re.search(r"[A-Z][a-z]+[A-Z]", trimmed)
or re.search(r"\b\w+_\w+\b", trimmed)
or re.search(
r"\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b",
lower,
flags=re.IGNORECASE,
)
)
has_natural_signals = bool(
word_count > 5
or "?" in trimmed
or re.search(r"\b(how|what|why|when|where)\b", trimmed, flags=re.IGNORECASE)
or re.search(
r"\b(handle|explain|fix|implement|create|build|use|find|search|convert|parse|generate|support)\b",
trimmed,
flags=re.IGNORECASE,
)
)
if has_code_signals and has_natural_signals:
return QueryIntent.MIXED
if has_code_signals:
return QueryIntent.KEYWORD
if has_natural_signals:
return QueryIntent.SEMANTIC
return QueryIntent.MIXED
def adjust_weights_by_intent(
intent: QueryIntent,
base_weights: Dict[str, float],
) -> Dict[str, float]:
"""Map intent → weights (kept aligned with TypeScript mapping)."""
if intent == QueryIntent.KEYWORD:
target = {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}
elif intent == QueryIntent.SEMANTIC:
target = {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}
else:
target = dict(base_weights)
# Preserve only keys that are present in base_weights (active backends).
keys = list(base_weights.keys())
filtered = {k: float(target.get(k, 0.0)) for k in keys}
return normalize_weights(filtered)
def get_rrf_weights(
query: str,
base_weights: Dict[str, float],
) -> Dict[str, float]:
"""Compute adaptive RRF weights from query intent."""
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
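
A worked example of the adaptive weighting (illustrative, not part of the patch;
assumes all three backends are active):

base = {"exact": 0.4, "fuzzy": 0.2, "vector": 0.6}

# "::" and CamelCase are code signals, so intent is KEYWORD.
get_rrf_weights("DirIndexStore::open_index", base)
# -> {"exact": 0.5, "fuzzy": 0.1, "vector": 0.4}

# Eight words plus a leading "how" are natural-language signals, so intent is SEMANTIC.
get_rrf_weights("how does the fusion of search results work?", base)
# -> {"exact": 0.2, "fuzzy": 0.1, "vector": 0.7}

Both target mappings already sum to 1.0, so normalize_weights() returns them unchanged.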
def reciprocal_rank_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Dict[str, float] = None,
@@ -102,6 +188,186 @@ def reciprocal_rank_fusion(
return fused_results
def apply_symbol_boost(
results: List[SearchResult],
boost_factor: float = 1.5,
) -> List[SearchResult]:
"""Boost fused scores for results that include an explicit symbol match.
The boost is multiplicative on the current result.score (typically the RRF fusion score).
When boosted, the original score is preserved in metadata["original_fusion_score"] and
metadata["boosted"] is set to True.
"""
if not results:
return []
if boost_factor <= 1.0:
# Still return new objects to follow immutable transformation pattern.
return [
SearchResult(
path=r.path,
score=r.score,
excerpt=r.excerpt,
content=r.content,
symbol=r.symbol,
chunk=r.chunk,
metadata={**r.metadata},
start_line=r.start_line,
end_line=r.end_line,
symbol_name=r.symbol_name,
symbol_kind=r.symbol_kind,
additional_locations=list(r.additional_locations),
)
for r in results
]
boosted_results: List[SearchResult] = []
for result in results:
has_symbol = bool(result.symbol_name)
original_score = float(result.score)
boosted_score = original_score * boost_factor if has_symbol else original_score
metadata = {**result.metadata}
if has_symbol:
metadata.setdefault("original_fusion_score", metadata.get("fusion_score", original_score))
metadata["boosted"] = True
metadata["symbol_boost_factor"] = boost_factor
boosted_results.append(
SearchResult(
path=result.path,
score=boosted_score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata=metadata,
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
additional_locations=list(result.additional_locations),
)
)
boosted_results.sort(key=lambda r: r.score, reverse=True)
return boosted_results
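
Worked numbers for the boost (illustrative): a fused result with symbol_name set
and a fusion score of 0.032, boosted with the default factor:

# before: score = 0.032, symbol_name = "reciprocal_rank_fusion"
# after apply_symbol_boost(results, boost_factor=1.5):
#   score = 0.032 * 1.5 = 0.048
#   metadata["original_fusion_score"] = 0.032
#   metadata["boosted"] = True
#   metadata["symbol_boost_factor"] = 1.5
# results without a symbol_name keep their score and metadata unchanged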
def rerank_results(
query: str,
results: List[SearchResult],
embedder: Any,
top_k: int = 50,
) -> List[SearchResult]:
"""Re-rank results with embedding cosine similarity, combined with current score.
Combined score formula:
0.5 * rrf_score + 0.5 * cosine_similarity
If embedder is None or embedding fails, returns results as-is.
"""
if not results:
return []
if embedder is None or top_k <= 0:
return results
rerank_count = min(int(top_k), len(results))
def cosine_similarity(vec_a: List[float], vec_b: List[float]) -> float:
# Defensive: handle mismatched lengths and zero vectors.
n = min(len(vec_a), len(vec_b))
if n == 0:
return 0.0
dot = 0.0
norm_a = 0.0
norm_b = 0.0
for i in range(n):
a = float(vec_a[i])
b = float(vec_b[i])
dot += a * b
norm_a += a * a
norm_b += b * b
if norm_a <= 0.0 or norm_b <= 0.0:
return 0.0
sim = dot / (math.sqrt(norm_a) * math.sqrt(norm_b))
# SearchResult.score requires non-negative scores; clamp cosine similarity to [0, 1].
return max(0.0, min(1.0, sim))
def text_for_embedding(r: SearchResult) -> str:
if r.excerpt and r.excerpt.strip():
return r.excerpt
if r.content and r.content.strip():
return r.content
if r.chunk and r.chunk.content and r.chunk.content.strip():
return r.chunk.content
# Fallback: stable, non-empty text.
return r.symbol_name or r.path
try:
if hasattr(embedder, "embed_single"):
query_vec = embedder.embed_single(query)
else:
query_vec = embedder.embed(query)[0]
doc_texts = [text_for_embedding(r) for r in results[:rerank_count]]
doc_vecs = embedder.embed(doc_texts)
except Exception:
return results
reranked_results: List[SearchResult] = []
for idx, result in enumerate(results):
if idx < rerank_count:
rrf_score = float(result.score)
sim = cosine_similarity(query_vec, doc_vecs[idx])
combined_score = 0.5 * rrf_score + 0.5 * sim
reranked_results.append(
SearchResult(
path=result.path,
score=combined_score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata={
**result.metadata,
"rrf_score": rrf_score,
"cosine_similarity": sim,
"reranked": True,
},
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
additional_locations=list(result.additional_locations),
)
)
else:
# Preserve remaining results without re-ranking, but keep immutability.
reranked_results.append(
SearchResult(
path=result.path,
score=result.score,
excerpt=result.excerpt,
content=result.content,
symbol=result.symbol,
chunk=result.chunk,
metadata={**result.metadata},
start_line=result.start_line,
end_line=result.end_line,
symbol_name=result.symbol_name,
symbol_kind=result.symbol_kind,
additional_locations=list(result.additional_locations),
)
)
reranked_results.sort(key=lambda r: r.score, reverse=True)
return reranked_results
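
Worked numbers for the combined score (illustrative): a result with
rrf_score = 0.04 whose text embeds to cosine_similarity = 0.82 against the query:

combined_score = 0.5 * 0.04 + 0.5 * 0.82  # = 0.02 + 0.41 = 0.43

Since RRF fusion scores are typically far below 1.0 while the clamped cosine term
spans [0, 1], strong semantic matches can leapfrog higher-ranked fused results.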
def normalize_bm25_score(score: float) -> float:
"""Normalize BM25 scores from SQLite FTS5 to 0-1 range.

View File

@@ -392,6 +392,22 @@ class HybridChunker:
filtered.append(symbol)
return filtered
def _find_parent_symbol(
self,
start_line: int,
end_line: int,
symbols: List[Symbol],
) -> Optional[Symbol]:
"""Find the smallest symbol range that fully contains a docstring span."""
candidates: List[Symbol] = []
for symbol in symbols:
sym_start, sym_end = symbol.range
if sym_start <= start_line and end_line <= sym_end:
candidates.append(symbol)
if not candidates:
return None
return min(candidates, key=lambda s: (s.range[1] - s.range[0], s.range[0]))
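
An illustrative selection with hypothetical symbol ranges:

# docstring span: lines 21-23
# containing candidates: module (1, 200), class Foo (10, 80), method bar (20, 30)
# range spans: 199, 70, 10 -> bar is returned (smallest span; start line breaks ties)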
def chunk_file(
self,
content: str,
@@ -414,24 +430,53 @@ class HybridChunker:
chunks: List[SemanticChunk] = []
# Step 1: Extract docstrings as dedicated chunks
-docstrings = self.docstring_extractor.extract_docstrings(content, language)
+docstrings: List[Tuple[str, int, int]] = []
if language == "python":
# Fast path: avoid expensive docstring extraction if delimiters are absent.
if '"""' in content or "'''" in content:
docstrings = self.docstring_extractor.extract_docstrings(content, language)
elif language in {"javascript", "typescript"}:
if "/**" in content:
docstrings = self.docstring_extractor.extract_docstrings(content, language)
else:
docstrings = self.docstring_extractor.extract_docstrings(content, language)
# Fast path: no docstrings -> delegate to base chunker directly.
if not docstrings:
if symbols:
base_chunks = self.base_chunker.chunk_by_symbol(
content, symbols, file_path, language, symbol_token_counts
)
else:
base_chunks = self.base_chunker.chunk_sliding_window(content, file_path, language)
for chunk in base_chunks:
chunk.metadata["strategy"] = "hybrid"
chunk.metadata["chunk_type"] = "code"
return base_chunks
for docstring_content, start_line, end_line in docstrings:
if len(docstring_content.strip()) >= self.config.min_chunk_size:
parent_symbol = self._find_parent_symbol(start_line, end_line, symbols)
# Use base chunker's token estimation method
token_count = self.base_chunker._estimate_token_count(docstring_content)
metadata = {
"file": str(file_path),
"language": language,
"chunk_type": "docstring",
"start_line": start_line,
"end_line": end_line,
"strategy": "hybrid",
"token_count": token_count,
}
if parent_symbol is not None:
metadata["parent_symbol"] = parent_symbol.name
metadata["parent_symbol_kind"] = parent_symbol.kind
metadata["parent_symbol_range"] = parent_symbol.range
chunks.append(SemanticChunk(
content=docstring_content,
embedding=None,
-    metadata={
-        "file": str(file_path),
-        "language": language,
-        "chunk_type": "docstring",
-        "start_line": start_line,
-        "end_line": end_line,
-        "strategy": "hybrid",
-        "token_count": token_count,
-    }
+    metadata=metadata
))
# Step 2: Get line ranges occupied by docstrings