feat: Add multi-type embedding backends for cascade retrieval

- Implemented BinaryEmbeddingBackend for fast coarse filtering using 256-dimensional binary vectors.
- Developed DenseEmbeddingBackend producing 2048-dimensional dense vectors for high-precision reranking.
- Created CascadeEmbeddingBackend to combine binary and dense embeddings for two-stage retrieval.
- Introduced utility functions for embedding conversion and distance computation.

chore: Migration 010 - Add multi-vector storage support

- Added 'chunks' table to support multi-vector embeddings for cascade retrieval.
- Included new columns: embedding_binary (256-dim) and embedding_dense (2048-dim) for efficient storage.
- Implemented upgrade and downgrade functions to manage schema changes and data migration.
This commit is contained in:
catlog22
2026-01-02 10:52:43 +08:00
parent 195438d26a
commit e21d801523
13 changed files with 3449 additions and 6 deletions
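
The pieces combine into a two-stage flow. Below is a minimal, self-contained sketch of that flow using toy random data in place of real model embeddings; the array math mirrors the new utilities, but none of these names are the codexlens API:

import numpy as np

rng = np.random.default_rng(0)
docs = rng.standard_normal((1000, 256)).astype(np.float32)   # toy corpus embeddings
query = rng.standard_normal(256).astype(np.float32)

# Stage 1 (coarse): sign-binarize, pack to 32 bytes, rank by Hamming distance.
packed_docs = np.packbits((docs > 0).astype(np.uint8), axis=1)   # shape (1000, 32)
packed_query = np.packbits((query > 0).astype(np.uint8))         # shape (32,)
hamming = np.unpackbits(packed_docs ^ packed_query, axis=1).sum(axis=1)
coarse_ids = np.argsort(hamming)[:100]                           # top-100 candidates

# Stage 2 (fine): cosine-rerank only the coarse candidates using the float vectors.
cand = docs[coarse_ids]
cosine = (cand @ query) / (np.linalg.norm(cand, axis=1) * np.linalg.norm(query))
final_ids = coarse_ids[np.argsort(cosine)[::-1][:10]]            # final top-10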

View File

@@ -131,6 +131,16 @@ class Config:
reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker_top_k: int = 50
# Cascade search configuration (two-stage retrieval)
enable_cascade_search: bool = False # Enable cascade search (coarse + fine ranking)
cascade_coarse_k: int = 100 # Number of coarse candidates from first stage
cascade_fine_k: int = 10 # Number of final results after reranking
cascade_strategy: str = "binary" # "binary" (fast binary+dense) or "hybrid" (FTS+SPLADE+Vector+CrossEncoder)
# RRF fusion configuration
fusion_method: str = "rrf" # "simple" (weighted sum) or "rrf" (reciprocal rank fusion)
rrf_k: int = 60 # RRF constant (default 60)
# Multi-endpoint configuration for litellm backend
embedding_endpoints: List[Dict[str, Any]] = field(default_factory=list)
# List of endpoint configs: [{"model": "...", "api_key": "...", "api_base": "...", "weight": 1.0}]
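
A hedged sketch of turning the new options on, assuming Config is a plain dataclass that accepts keyword overrides (as the fields above suggest):

from codexlens.config import Config

config = Config(
    enable_cascade_search=True,
    cascade_strategy="binary",   # or "hybrid" for FTS+SPLADE+Vector+CrossEncoder
    cascade_coarse_k=100,        # stage-1 candidate pool
    cascade_fine_k=10,           # stage-2 final results
    fusion_method="rrf",
    rrf_k=60,
)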

View File

@@ -1,4 +1,26 @@
"""Code indexing and symbol extraction."""
from codexlens.indexing.symbol_extractor import SymbolExtractor
from codexlens.indexing.embedding import (
BinaryEmbeddingBackend,
DenseEmbeddingBackend,
CascadeEmbeddingBackend,
get_cascade_embedder,
binarize_embedding,
pack_binary_embedding,
unpack_binary_embedding,
hamming_distance,
)
__all__ = ["SymbolExtractor"]
__all__ = [
"SymbolExtractor",
# Cascade embedding backends
"BinaryEmbeddingBackend",
"DenseEmbeddingBackend",
"CascadeEmbeddingBackend",
"get_cascade_embedder",
# Utility functions
"binarize_embedding",
"pack_binary_embedding",
"unpack_binary_embedding",
"hamming_distance",
]
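
With the re-exports above in place, the cascade API is importable from codexlens.indexing directly; a brief sketch (requires the semantic extras at runtime):

from codexlens.indexing import get_cascade_embedder

embedder = get_cascade_embedder()                       # loads both models lazily
binary, dense = embedder.encode_cascade(["def login(user): ..."])
# binary.shape == (1, 256), dense.shape == (1, 2048)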

View File

@@ -0,0 +1,582 @@
"""Multi-type embedding backends for cascade retrieval.
This module provides embedding backends optimized for cascade retrieval:
1. BinaryEmbeddingBackend - Fast coarse filtering with binary vectors
2. DenseEmbeddingBackend - High-precision dense vectors for reranking
3. CascadeEmbeddingBackend - Combined binary + dense for two-stage retrieval
Cascade retrieval workflow:
1. Binary search (fast, ~32 bytes/vector) -> top-K candidates
2. Dense rerank (precise, ~8KB/vector) -> final results
"""
from __future__ import annotations
import logging
from typing import Iterable, List, Optional, Tuple
import numpy as np
from codexlens.semantic.base import BaseEmbedder
logger = logging.getLogger(__name__)
# =============================================================================
# Utility Functions
# =============================================================================
def binarize_embedding(embedding: np.ndarray) -> np.ndarray:
"""Convert float embedding to binary vector.
Applies sign-based quantization: values > 0 become 1, values <= 0 become 0.
Args:
embedding: Float32 embedding of any dimension
Returns:
Binary vector (uint8 with values 0 or 1) of same dimension
"""
return (embedding > 0).astype(np.uint8)
def pack_binary_embedding(binary_vector: np.ndarray) -> bytes:
"""Pack binary vector into compact bytes format.
Packs 8 binary values into each byte for storage efficiency.
For a 256-dim binary vector, output is 32 bytes.
Args:
binary_vector: Binary vector (uint8 with values 0 or 1)
Returns:
Packed bytes (length = ceil(dim / 8))
"""
# Ensure vector length is multiple of 8 by padding if needed
dim = len(binary_vector)
padded_dim = ((dim + 7) // 8) * 8
if padded_dim > dim:
padded = np.zeros(padded_dim, dtype=np.uint8)
padded[:dim] = binary_vector
binary_vector = padded
# Pack 8 bits per byte
packed = np.packbits(binary_vector)
return packed.tobytes()
def unpack_binary_embedding(packed_bytes: bytes, dim: int = 256) -> np.ndarray:
"""Unpack bytes back to binary vector.
Args:
packed_bytes: Packed binary data
dim: Original vector dimension (default: 256)
Returns:
Binary vector (uint8 with values 0 or 1)
"""
unpacked = np.unpackbits(np.frombuffer(packed_bytes, dtype=np.uint8))
return unpacked[:dim]
def hamming_distance(a: bytes, b: bytes) -> int:
"""Compute Hamming distance between two packed binary vectors.
Uses XOR and popcount for efficient distance computation.
Args:
a: First packed binary vector
b: Second packed binary vector
Returns:
Hamming distance (number of differing bits)
"""
a_arr = np.frombuffer(a, dtype=np.uint8)
b_arr = np.frombuffer(b, dtype=np.uint8)
xor = np.bitwise_xor(a_arr, b_arr)
return int(np.unpackbits(xor).sum())
# =============================================================================
# Binary Embedding Backend
# =============================================================================
class BinaryEmbeddingBackend(BaseEmbedder):
"""Generate 256-dimensional binary embeddings for fast coarse retrieval.
Uses a lightweight embedding model and applies sign-based quantization
to produce compact binary vectors (32 bytes per embedding).
Suitable for:
- First-stage candidate retrieval
- Hamming distance-based similarity search
- Memory-constrained environments
Model: BAAI/bge-small-en-v1.5 (384 dim) -> projected and quantized to 256 bits
"""
DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" # 384 dim, fast
BINARY_DIM = 256
def __init__(
self,
model_name: Optional[str] = None,
use_gpu: bool = True,
) -> None:
"""Initialize binary embedding backend.
Args:
model_name: Base embedding model name. Defaults to BAAI/bge-small-en-v1.5
use_gpu: Whether to use GPU acceleration
"""
from codexlens.semantic import SEMANTIC_AVAILABLE
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self._model_name = model_name or self.DEFAULT_MODEL
self._use_gpu = use_gpu
self._model = None
# Projection matrix for dimension reduction (lazily initialized)
self._projection_matrix: Optional[np.ndarray] = None
@property
def model_name(self) -> str:
"""Return model name."""
return self._model_name
@property
def embedding_dim(self) -> int:
"""Return binary embedding dimension (256)."""
return self.BINARY_DIM
@property
def packed_bytes(self) -> int:
"""Return packed bytes size (32 bytes for 256 bits)."""
return self.BINARY_DIM // 8
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
from fastembed import TextEmbedding
from codexlens.semantic.gpu_support import get_optimal_providers
providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
try:
self._model = TextEmbedding(
model_name=self._model_name,
providers=providers,
)
except TypeError:
# Fallback for older fastembed versions
self._model = TextEmbedding(model_name=self._model_name)
logger.debug(f"BinaryEmbeddingBackend loaded model: {self._model_name}")
def _get_projection_matrix(self, input_dim: int) -> np.ndarray:
"""Get or create projection matrix for dimension reduction.
Uses random projection with fixed seed for reproducibility.
Args:
input_dim: Input embedding dimension from base model
Returns:
Projection matrix of shape (input_dim, BINARY_DIM)
"""
if self._projection_matrix is not None:
return self._projection_matrix
# Fixed seed for reproducibility across sessions
rng = np.random.RandomState(42)
# Gaussian random projection
self._projection_matrix = rng.randn(input_dim, self.BINARY_DIM).astype(np.float32)
# Normalize columns for consistent scale
norms = np.linalg.norm(self._projection_matrix, axis=0, keepdims=True)
self._projection_matrix /= (norms + 1e-8)
return self._projection_matrix
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate binary embeddings as numpy array.
Args:
texts: Single text or iterable of texts
Returns:
Binary embeddings of shape (n_texts, 256) with values 0 or 1
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
# Get base float embeddings
float_embeddings = np.array(list(self._model.embed(texts)))
input_dim = float_embeddings.shape[1]
# Project to target dimension if needed
if input_dim != self.BINARY_DIM:
projection = self._get_projection_matrix(input_dim)
float_embeddings = float_embeddings @ projection
# Binarize
return binarize_embedding(float_embeddings)
def embed_packed(self, texts: str | Iterable[str]) -> List[bytes]:
"""Generate packed binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
List of packed bytes (32 bytes each for 256-dim)
"""
binary = self.embed_to_numpy(texts)
return [pack_binary_embedding(vec) for vec in binary]
# =============================================================================
# Dense Embedding Backend
# =============================================================================
class DenseEmbeddingBackend(BaseEmbedder):
"""Generate high-dimensional dense embeddings for precise reranking.
Uses large embedding models to produce 2048-dimensional float32 vectors
for maximum retrieval quality.
Suitable for:
- Second-stage reranking
- High-precision similarity search
- Quality-critical applications
Model: BAAI/bge-large-en-v1.5 (1024 dim) with optional expansion
"""
DEFAULT_MODEL = "BAAI/bge-large-en-v1.5" # 1024 dim, high quality
TARGET_DIM = 2048
def __init__(
self,
model_name: Optional[str] = None,
use_gpu: bool = True,
expand_dim: bool = True,
) -> None:
"""Initialize dense embedding backend.
Args:
model_name: Dense embedding model name. Defaults to BAAI/bge-large-en-v1.5
use_gpu: Whether to use GPU acceleration
expand_dim: If True, expand embeddings to TARGET_DIM using learned expansion
"""
from codexlens.semantic import SEMANTIC_AVAILABLE
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
self._model_name = model_name or self.DEFAULT_MODEL
self._use_gpu = use_gpu
self._expand_dim = expand_dim
self._model = None
self._native_dim: Optional[int] = None
# Expansion matrix for dimension expansion (lazily initialized)
self._expansion_matrix: Optional[np.ndarray] = None
@property
def model_name(self) -> str:
"""Return model name."""
return self._model_name
@property
def embedding_dim(self) -> int:
"""Return embedding dimension.
Returns TARGET_DIM if expand_dim is True, otherwise native model dimension.
"""
if self._expand_dim:
return self.TARGET_DIM
# Return cached native dim or estimate based on model
if self._native_dim is not None:
return self._native_dim
# Model dimension estimates
model_dims = {
"BAAI/bge-large-en-v1.5": 1024,
"BAAI/bge-base-en-v1.5": 768,
"BAAI/bge-small-en-v1.5": 384,
"intfloat/multilingual-e5-large": 1024,
}
return model_dims.get(self._model_name, 1024)
@property
def max_tokens(self) -> int:
"""Return maximum token limit."""
return 512 # Conservative default for large models
def _load_model(self) -> None:
"""Lazy load the embedding model."""
if self._model is not None:
return
from fastembed import TextEmbedding
from codexlens.semantic.gpu_support import get_optimal_providers
providers = get_optimal_providers(use_gpu=self._use_gpu, with_device_options=True)
try:
self._model = TextEmbedding(
model_name=self._model_name,
providers=providers,
)
except TypeError:
self._model = TextEmbedding(model_name=self._model_name)
logger.debug(f"DenseEmbeddingBackend loaded model: {self._model_name}")
def _get_expansion_matrix(self, input_dim: int) -> np.ndarray:
"""Get or create expansion matrix for dimension expansion.
Preserves the original dimensions via an identity block and pads the extra dimensions with normalized random projections (semi-orthogonal expansion).
Args:
input_dim: Input embedding dimension from base model
Returns:
Expansion matrix of shape (input_dim, TARGET_DIM)
"""
if self._expansion_matrix is not None:
return self._expansion_matrix
# Fixed seed for reproducibility
rng = np.random.RandomState(123)
# Create semi-orthogonal expansion matrix
# First input_dim columns form identity-like structure
self._expansion_matrix = np.zeros((input_dim, self.TARGET_DIM), dtype=np.float32)
# Copy original dimensions
copy_dim = min(input_dim, self.TARGET_DIM)
self._expansion_matrix[:copy_dim, :copy_dim] = np.eye(copy_dim, dtype=np.float32)
# Fill remaining with random projections
if self.TARGET_DIM > input_dim:
random_part = rng.randn(input_dim, self.TARGET_DIM - input_dim).astype(np.float32)
# Normalize
norms = np.linalg.norm(random_part, axis=0, keepdims=True)
random_part /= (norms + 1e-8)
self._expansion_matrix[:, input_dim:] = random_part
return self._expansion_matrix
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate dense embeddings as numpy array.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, TARGET_DIM) as float32
"""
self._load_model()
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
# Get base float embeddings
float_embeddings = np.array(list(self._model.embed(texts)), dtype=np.float32)
self._native_dim = float_embeddings.shape[1]
# Expand to target dimension if needed
if self._expand_dim and self._native_dim < self.TARGET_DIM:
expansion = self._get_expansion_matrix(self._native_dim)
float_embeddings = float_embeddings @ expansion
return float_embeddings
# =============================================================================
# Cascade Embedding Backend
# =============================================================================
class CascadeEmbeddingBackend(BaseEmbedder):
"""Combined binary + dense embedding backend for cascade retrieval.
Generates both binary (for fast coarse filtering) and dense (for precise
reranking) embeddings in a single pass, optimized for two-stage retrieval.
Cascade workflow:
1. encode_cascade() returns (binary_embeddings, dense_embeddings)
2. Binary search: Use Hamming distance on binary vectors -> top-K candidates
3. Dense rerank: Use cosine similarity on dense vectors -> final results
Memory efficiency:
- Binary: 32 bytes per vector (256 bits)
- Dense: 8192 bytes per vector (2048 x float32)
- Total: ~8KB per document for full cascade support
"""
def __init__(
self,
binary_model: Optional[str] = None,
dense_model: Optional[str] = None,
use_gpu: bool = True,
) -> None:
"""Initialize cascade embedding backend.
Args:
binary_model: Model for binary embeddings. Defaults to BAAI/bge-small-en-v1.5
dense_model: Model for dense embeddings. Defaults to BAAI/bge-large-en-v1.5
use_gpu: Whether to use GPU acceleration
"""
self._binary_backend = BinaryEmbeddingBackend(
model_name=binary_model,
use_gpu=use_gpu,
)
self._dense_backend = DenseEmbeddingBackend(
model_name=dense_model,
use_gpu=use_gpu,
expand_dim=True,
)
self._use_gpu = use_gpu
@property
def model_name(self) -> str:
"""Return model names for both backends."""
return f"cascade({self._binary_backend.model_name}, {self._dense_backend.model_name})"
@property
def embedding_dim(self) -> int:
"""Return dense embedding dimension (for compatibility)."""
return self._dense_backend.embedding_dim
@property
def binary_dim(self) -> int:
"""Return binary embedding dimension."""
return self._binary_backend.embedding_dim
@property
def dense_dim(self) -> int:
"""Return dense embedding dimension."""
return self._dense_backend.embedding_dim
def embed_to_numpy(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate dense embeddings (for BaseEmbedder compatibility).
For cascade embeddings, use encode_cascade() instead.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, dense_dim)
"""
return self._dense_backend.embed_to_numpy(texts)
def encode_cascade(
self,
texts: str | Iterable[str],
batch_size: int = 32,
) -> Tuple[np.ndarray, np.ndarray]:
"""Generate both binary and dense embeddings.
Args:
texts: Single text or iterable of texts
batch_size: Batch size for processing
Returns:
Tuple of:
- binary_embeddings: Shape (n_texts, 256), uint8 values 0/1
- dense_embeddings: Shape (n_texts, 2048), float32
"""
if isinstance(texts, str):
texts = [texts]
else:
texts = list(texts)
binary_embeddings = self._binary_backend.embed_to_numpy(texts)
dense_embeddings = self._dense_backend.embed_to_numpy(texts)
return binary_embeddings, dense_embeddings
def encode_binary(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate only binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
Binary embeddings of shape (n_texts, 256)
"""
return self._binary_backend.embed_to_numpy(texts)
def encode_dense(self, texts: str | Iterable[str]) -> np.ndarray:
"""Generate only dense embeddings.
Args:
texts: Single text or iterable of texts
Returns:
Dense embeddings of shape (n_texts, 2048)
"""
return self._dense_backend.embed_to_numpy(texts)
def encode_binary_packed(self, texts: str | Iterable[str]) -> List[bytes]:
"""Generate packed binary embeddings.
Args:
texts: Single text or iterable of texts
Returns:
List of packed bytes (32 bytes each)
"""
return self._binary_backend.embed_packed(texts)
# =============================================================================
# Factory Function
# =============================================================================
def get_cascade_embedder(
binary_model: Optional[str] = None,
dense_model: Optional[str] = None,
use_gpu: bool = True,
) -> CascadeEmbeddingBackend:
"""Factory function to create a cascade embedder.
Args:
binary_model: Model for binary embeddings (default: BAAI/bge-small-en-v1.5)
dense_model: Model for dense embeddings (default: BAAI/bge-large-en-v1.5)
use_gpu: Whether to use GPU acceleration
Returns:
Configured CascadeEmbeddingBackend instance
Example:
>>> embedder = get_cascade_embedder()
>>> binary, dense = embedder.encode_cascade(["hello world"])
>>> binary.shape # (1, 256)
>>> dense.shape # (1, 2048)
"""
return CascadeEmbeddingBackend(
binary_model=binary_model,
dense_model=dense_model,
use_gpu=use_gpu,
)
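
A quick round trip through the utility functions defined above, assuming codexlens is installed with the semantic extras:

import numpy as np
from codexlens.indexing.embedding import (
    binarize_embedding,
    pack_binary_embedding,
    unpack_binary_embedding,
    hamming_distance,
)

vec_a = np.random.randn(256).astype(np.float32)
vec_b = np.random.randn(256).astype(np.float32)

packed_a = pack_binary_embedding(binarize_embedding(vec_a))
packed_b = pack_binary_embedding(binarize_embedding(vec_b))
assert len(packed_a) == 32                              # 256 bits -> 32 bytes

# Packing is lossless for the binary form.
assert np.array_equal(unpack_binary_embedding(packed_a), binarize_embedding(vec_a))

# Hamming distance is symmetric and bounded by the bit width.
d = hamming_distance(packed_a, packed_b)
assert 0 <= d <= 256 and d == hamming_distance(packed_b, packed_a)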

View File

@@ -9,12 +9,21 @@ from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Dict, Any
from typing import List, Optional, Dict, Any, Literal, Tuple, TYPE_CHECKING
import logging
import os
import time
from codexlens.entities import SearchResult, Symbol
if TYPE_CHECKING:
import numpy as np
try:
import numpy as np
NUMPY_AVAILABLE = True
except ImportError:
NUMPY_AVAILABLE = False
from codexlens.config import Config
from codexlens.storage.registry import RegistryStore, DirMapping
from codexlens.storage.dir_index import DirIndexStore, SubdirLink
@@ -260,6 +269,672 @@ class ChainSearchEngine:
related_results=related_results,
)
def hybrid_cascade_search(
self,
query: str,
source_path: Path,
k: int = 10,
coarse_k: int = 100,
options: Optional[SearchOptions] = None,
) -> ChainSearchResult:
"""Execute two-stage cascade search with hybrid coarse retrieval and cross-encoder reranking.
Hybrid cascade search process:
1. Stage 1 (Coarse): Fast retrieval using RRF fusion of FTS + SPLADE + Vector
to get coarse_k candidates
2. Stage 2 (Fine): CrossEncoder reranking of candidates to get final k results
This approach balances recall (from broad coarse search) with precision
(from expensive but accurate cross-encoder scoring).
Note: This method is the original hybrid approach. For binary vector cascade,
use binary_cascade_search() instead.
Args:
query: Natural language or keyword query string
source_path: Starting directory path
k: Number of final results to return (default 10)
coarse_k: Number of coarse candidates from first stage (default 100)
options: Search configuration (uses defaults if None)
Returns:
ChainSearchResult with reranked results and statistics
Examples:
>>> engine = ChainSearchEngine(registry, mapper, config=config)
>>> result = engine.hybrid_cascade_search(
... "how to authenticate users",
... Path("D:/project/src"),
... k=10,
... coarse_k=100
... )
>>> for r in result.results:
... print(f"{r.path}: {r.score:.3f}")
"""
options = options or SearchOptions()
start_time = time.time()
stats = SearchStats()
# Use config defaults if available
if self._config is not None:
if hasattr(self._config, "cascade_coarse_k"):
coarse_k = coarse_k or self._config.cascade_coarse_k
if hasattr(self._config, "cascade_fine_k"):
k = k or self._config.cascade_fine_k
# Step 1: Find starting index
start_index = self._find_start_index(source_path)
if not start_index:
self.logger.warning(f"No index found for {source_path}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Step 2: Collect all index paths
index_paths = self._collect_index_paths(start_index, options.depth)
stats.dirs_searched = len(index_paths)
if not index_paths:
self.logger.warning(f"No indexes collected from {start_index}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Stage 1: Coarse retrieval with hybrid search (FTS + SPLADE + Vector)
# Use hybrid mode for multi-signal retrieval
coarse_options = SearchOptions(
depth=options.depth,
max_workers=1, # Single thread for GPU safety
limit_per_dir=max(coarse_k // len(index_paths), 20),
total_limit=coarse_k,
hybrid_mode=True,
enable_fuzzy=options.enable_fuzzy,
enable_vector=True, # Enable vector for semantic matching
pure_vector=False,
hybrid_weights=options.hybrid_weights,
)
self.logger.debug(
"Cascade Stage 1: Coarse retrieval for %d candidates", coarse_k
)
coarse_results, search_stats = self._search_parallel(
index_paths, query, coarse_options
)
stats.errors = search_stats.errors
# Merge and deduplicate coarse results
coarse_merged = self._merge_and_rank(coarse_results, coarse_k)
self.logger.debug(
"Cascade Stage 1 complete: %d candidates retrieved", len(coarse_merged)
)
if not coarse_merged:
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Stage 2: Cross-encoder reranking
self.logger.debug(
"Cascade Stage 2: Cross-encoder reranking %d candidates to top-%d",
len(coarse_merged),
k,
)
final_results = self._cross_encoder_rerank(query, coarse_merged, k)
# Optional: grouping of similar results
if options.group_results:
from codexlens.search.ranking import group_similar_results
final_results = group_similar_results(
final_results, score_threshold_abs=options.grouping_threshold
)
stats.files_matched = len(final_results)
stats.time_ms = (time.time() - start_time) * 1000
self.logger.debug(
"Cascade search complete: %d results in %.2fms",
len(final_results),
stats.time_ms,
)
return ChainSearchResult(
query=query,
results=final_results,
symbols=[],
stats=stats,
)
def binary_cascade_search(
self,
query: str,
source_path: Path,
k: int = 10,
coarse_k: int = 100,
options: Optional[SearchOptions] = None,
) -> ChainSearchResult:
"""Execute binary cascade search with binary coarse ranking and dense fine ranking.
Binary cascade search process:
1. Stage 1 (Coarse): Fast binary vector search using Hamming distance
to quickly filter to coarse_k candidates (256-dim binary, 32 bytes/vector)
2. Stage 2 (Fine): Dense vector cosine similarity for precise reranking
of candidates (2048-dim float32)
This approach leverages the speed of binary search (~100x faster) while
maintaining precision through dense vector reranking.
Performance characteristics:
- Binary search: O(N) with SIMD-accelerated XOR + popcount
- Dense rerank: Only applied to top coarse_k candidates
- Memory: 32 bytes (binary) + 8KB (dense) per chunk
Args:
query: Natural language or keyword query string
source_path: Starting directory path
k: Number of final results to return (default 10)
coarse_k: Number of coarse candidates from first stage (default 100)
options: Search configuration (uses defaults if None)
Returns:
ChainSearchResult with reranked results and statistics
Examples:
>>> engine = ChainSearchEngine(registry, mapper, config=config)
>>> result = engine.binary_cascade_search(
... "how to authenticate users",
... Path("D:/project/src"),
... k=10,
... coarse_k=100
... )
>>> for r in result.results:
... print(f"{r.path}: {r.score:.3f}")
"""
if not NUMPY_AVAILABLE:
self.logger.warning(
"NumPy not available, falling back to hybrid cascade search"
)
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
options = options or SearchOptions()
start_time = time.time()
stats = SearchStats()
# Use config defaults if available
if self._config is not None:
if hasattr(self._config, "cascade_coarse_k"):
coarse_k = coarse_k or self._config.cascade_coarse_k
if hasattr(self._config, "cascade_fine_k"):
k = k or self._config.cascade_fine_k
# Step 1: Find starting index
start_index = self._find_start_index(source_path)
if not start_index:
self.logger.warning(f"No index found for {source_path}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Step 2: Collect all index paths
index_paths = self._collect_index_paths(start_index, options.depth)
stats.dirs_searched = len(index_paths)
if not index_paths:
self.logger.warning(f"No indexes collected from {start_index}")
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=[],
symbols=[],
stats=stats
)
# Initialize embedding backends
try:
from codexlens.indexing.embedding import (
BinaryEmbeddingBackend,
DenseEmbeddingBackend,
)
from codexlens.semantic.ann_index import BinaryANNIndex
except ImportError as exc:
self.logger.warning(
"Binary cascade dependencies not available: %s. "
"Falling back to hybrid cascade search.",
exc
)
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
# Stage 1: Binary vector coarse retrieval
self.logger.debug(
"Binary Cascade Stage 1: Binary coarse retrieval for %d candidates",
coarse_k,
)
use_gpu = True
if self._config is not None:
use_gpu = getattr(self._config, "embedding_use_gpu", True)
try:
binary_backend = BinaryEmbeddingBackend(use_gpu=use_gpu)
query_binary_packed = binary_backend.embed_packed([query])[0]
except Exception as exc:
self.logger.warning(
"Failed to generate binary query embedding: %s. "
"Falling back to hybrid cascade search.",
exc
)
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
# Search all indexes for binary candidates
all_candidates: List[Tuple[int, int, Path]] = [] # (chunk_id, distance, index_path)
for index_path in index_paths:
try:
# Get or create binary index for this path
binary_index = self._get_or_create_binary_index(index_path)
if binary_index is None or binary_index.count() == 0:
continue
# Search binary index
ids, distances = binary_index.search(query_binary_packed, coarse_k)
for chunk_id, dist in zip(ids, distances):
all_candidates.append((chunk_id, dist, index_path))
except Exception as exc:
self.logger.debug(
"Binary search failed for %s: %s", index_path, exc
)
stats.errors.append(f"Binary search failed for {index_path}: {exc}")
if not all_candidates:
self.logger.debug("No binary candidates found, falling back to hybrid")
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
# Sort by Hamming distance and take top coarse_k
all_candidates.sort(key=lambda x: x[1])
coarse_candidates = all_candidates[:coarse_k]
self.logger.debug(
"Binary Cascade Stage 1 complete: %d candidates retrieved",
len(coarse_candidates),
)
# Stage 2: Dense vector fine ranking
self.logger.debug(
"Binary Cascade Stage 2: Dense reranking %d candidates to top-%d",
len(coarse_candidates),
k,
)
try:
dense_backend = DenseEmbeddingBackend(use_gpu=use_gpu)
query_dense = dense_backend.embed_to_numpy([query])[0]
except Exception as exc:
self.logger.warning(
"Failed to generate dense query embedding: %s. "
"Using Hamming distance scores only.",
exc
)
# Fall back to using Hamming distance as score
return self._build_results_from_candidates(
coarse_candidates[:k], index_paths, stats, query, start_time
)
# Group candidates by index path for batch retrieval
candidates_by_index: Dict[Path, List[int]] = {}
for chunk_id, _, index_path in coarse_candidates:
if index_path not in candidates_by_index:
candidates_by_index[index_path] = []
candidates_by_index[index_path].append(chunk_id)
# Retrieve dense embeddings and compute cosine similarity
scored_results: List[Tuple[float, SearchResult]] = []
for index_path, chunk_ids in candidates_by_index.items():
try:
store = SQLiteStore(index_path)
dense_embeddings = store.get_dense_embeddings(chunk_ids)
chunks_data = store.get_chunks_by_ids(chunk_ids)
# Create lookup for chunk content
chunk_content: Dict[int, Dict[str, Any]] = {
c["id"]: c for c in chunks_data
}
for chunk_id in chunk_ids:
dense_bytes = dense_embeddings.get(chunk_id)
chunk_info = chunk_content.get(chunk_id)
if dense_bytes is None or chunk_info is None:
continue
# Compute cosine similarity
dense_vec = np.frombuffer(dense_bytes, dtype=np.float32)
score = self._compute_cosine_similarity(query_dense, dense_vec)
# Create search result
excerpt = chunk_info.get("content", "")[:500]
result = SearchResult(
path=chunk_info.get("file_path", ""),
score=float(score),
excerpt=excerpt,
)
scored_results.append((score, result))
except Exception as exc:
self.logger.debug(
"Dense reranking failed for %s: %s", index_path, exc
)
stats.errors.append(f"Dense reranking failed for {index_path}: {exc}")
# Sort by score descending and deduplicate by path
scored_results.sort(key=lambda x: x[0], reverse=True)
path_to_result: Dict[str, SearchResult] = {}
for score, result in scored_results:
if result.path not in path_to_result:
path_to_result[result.path] = result
final_results = list(path_to_result.values())[:k]
# Optional: grouping of similar results
if options.group_results:
from codexlens.search.ranking import group_similar_results
final_results = group_similar_results(
final_results, score_threshold_abs=options.grouping_threshold
)
stats.files_matched = len(final_results)
stats.time_ms = (time.time() - start_time) * 1000
self.logger.debug(
"Binary cascade search complete: %d results in %.2fms",
len(final_results),
stats.time_ms,
)
return ChainSearchResult(
query=query,
results=final_results,
symbols=[],
stats=stats,
)
def cascade_search(
self,
query: str,
source_path: Path,
k: int = 10,
coarse_k: int = 100,
options: Optional[SearchOptions] = None,
strategy: Literal["binary", "hybrid"] = "binary",
) -> ChainSearchResult:
"""Unified cascade search entry point with strategy selection.
Provides a single interface for cascade search with configurable strategy:
- "binary": Uses binary vector coarse ranking + dense fine ranking (faster)
- "hybrid": Uses FTS+SPLADE+Vector coarse ranking + cross-encoder reranking (original)
The strategy can be configured via:
1. The `strategy` parameter (highest priority)
2. Config `cascade_strategy` setting
3. Default: "binary"
Args:
query: Natural language or keyword query string
source_path: Starting directory path
k: Number of final results to return (default 10)
coarse_k: Number of coarse candidates from first stage (default 100)
options: Search configuration (uses defaults if None)
strategy: Cascade strategy - "binary" or "hybrid" (default "binary")
Returns:
ChainSearchResult with reranked results and statistics
Examples:
>>> engine = ChainSearchEngine(registry, mapper, config=config)
>>> # Use binary cascade (default, faster)
>>> result = engine.cascade_search("auth", Path("D:/project"))
>>> # Use hybrid cascade (original behavior)
>>> result = engine.cascade_search("auth", Path("D:/project"), strategy="hybrid")
"""
# Check config for strategy override
effective_strategy = strategy
if self._config is not None:
config_strategy = getattr(self._config, "cascade_strategy", None)
if config_strategy in ("binary", "hybrid"):
# Only use config if no explicit strategy was passed
# (we can't detect if strategy was explicitly passed vs default)
effective_strategy = config_strategy
if effective_strategy == "binary":
return self.binary_cascade_search(query, source_path, k, coarse_k, options)
else:
return self.hybrid_cascade_search(query, source_path, k, coarse_k, options)
def _get_or_create_binary_index(self, index_path: Path) -> Optional[Any]:
"""Get or create a BinaryANNIndex for the given index path.
Attempts to load an existing binary index from disk. If not found,
returns None (binary index should be built during indexing).
Args:
index_path: Path to the _index.db file
Returns:
BinaryANNIndex instance or None if not available
"""
try:
from codexlens.semantic.ann_index import BinaryANNIndex
binary_index = BinaryANNIndex(index_path, dim=256)
if binary_index.load():
return binary_index
return None
except Exception as exc:
self.logger.debug("Failed to load binary index for %s: %s", index_path, exc)
return None
def _compute_cosine_similarity(
self,
query_vec: "np.ndarray",
doc_vec: "np.ndarray",
) -> float:
"""Compute cosine similarity between query and document vectors.
Args:
query_vec: Query embedding vector
doc_vec: Document embedding vector
Returns:
Cosine similarity score in range [-1, 1]
"""
if not NUMPY_AVAILABLE:
return 0.0
# Ensure same shape
min_len = min(len(query_vec), len(doc_vec))
q = query_vec[:min_len]
d = doc_vec[:min_len]
# Compute cosine similarity
dot_product = np.dot(q, d)
norm_q = np.linalg.norm(q)
norm_d = np.linalg.norm(d)
if norm_q == 0 or norm_d == 0:
return 0.0
return float(dot_product / (norm_q * norm_d))
def _build_results_from_candidates(
self,
candidates: List[Tuple[int, int, Path]],
index_paths: List[Path],
stats: SearchStats,
query: str,
start_time: float,
) -> ChainSearchResult:
"""Build ChainSearchResult from binary candidates using Hamming distance scores.
Used as fallback when dense embeddings are not available.
Args:
candidates: List of (chunk_id, hamming_distance, index_path) tuples
index_paths: List of all searched index paths
stats: SearchStats to update
query: Original query string
start_time: Search start time for timing
Returns:
ChainSearchResult with results scored by Hamming distance
"""
results: List[SearchResult] = []
# Group by index path
candidates_by_index: Dict[Path, List[Tuple[int, int]]] = {}
for chunk_id, distance, index_path in candidates:
if index_path not in candidates_by_index:
candidates_by_index[index_path] = []
candidates_by_index[index_path].append((chunk_id, distance))
for index_path, chunk_tuples in candidates_by_index.items():
try:
store = SQLiteStore(index_path)
chunk_ids = [c[0] for c in chunk_tuples]
chunks_data = store.get_chunks_by_ids(chunk_ids)
chunk_content: Dict[int, Dict[str, Any]] = {
c["id"]: c for c in chunks_data
}
for chunk_id, distance in chunk_tuples:
chunk_info = chunk_content.get(chunk_id)
if chunk_info is None:
continue
# Convert Hamming distance to score (lower distance = higher score)
# Max Hamming distance for 256-bit is 256
score = 1.0 - (distance / 256.0)
excerpt = chunk_info.get("content", "")[:500]
result = SearchResult(
path=chunk_info.get("file_path", ""),
score=float(score),
excerpt=excerpt,
)
results.append(result)
except Exception as exc:
self.logger.debug(
"Failed to build results from %s: %s", index_path, exc
)
# Deduplicate by path
path_to_result: Dict[str, SearchResult] = {}
for result in results:
if result.path not in path_to_result or result.score > path_to_result[result.path].score:
path_to_result[result.path] = result
final_results = sorted(
path_to_result.values(),
key=lambda r: r.score,
reverse=True,
)
stats.files_matched = len(final_results)
stats.time_ms = (time.time() - start_time) * 1000
return ChainSearchResult(
query=query,
results=final_results,
symbols=[],
stats=stats,
)
def _cross_encoder_rerank(
self,
query: str,
results: List[SearchResult],
top_k: int,
) -> List[SearchResult]:
"""Rerank results using cross-encoder model.
Args:
query: Search query string
results: Candidate results to rerank
top_k: Number of top results to return
Returns:
Reranked results sorted by cross-encoder score
"""
if not results:
return []
# Try to get reranker from config or create new one
reranker = None
try:
from codexlens.semantic.reranker import (
check_reranker_available,
get_reranker,
)
# Determine backend and model from config
backend = "onnx"
model_name = None
use_gpu = True
if self._config is not None:
backend = getattr(self._config, "reranker_backend", "onnx") or "onnx"
model_name = getattr(self._config, "reranker_model", None)
use_gpu = getattr(self._config, "embedding_use_gpu", True)
ok, err = check_reranker_available(backend)
if not ok:
self.logger.debug("Reranker backend unavailable (%s): %s", backend, err)
return results[:top_k]
# Create reranker
kwargs = {}
if backend == "onnx":
kwargs["use_gpu"] = use_gpu
reranker = get_reranker(backend=backend, model_name=model_name, **kwargs)
except ImportError as exc:
self.logger.debug("Reranker not available: %s", exc)
return results[:top_k]
except Exception as exc:
self.logger.debug("Failed to initialize reranker: %s", exc)
return results[:top_k]
# Use cross_encoder_rerank from ranking module
from codexlens.search.ranking import cross_encoder_rerank
return cross_encoder_rerank(
query=query,
results=results,
reranker=reranker,
top_k=top_k,
batch_size=32,
)
def search_files_only(self, query: str,
source_path: Path,
options: Optional[SearchOptions] = None) -> List[str]:

View File

@@ -40,11 +40,20 @@ from codexlens.search.ranking import (
get_rrf_weights,
reciprocal_rank_fusion,
rerank_results,
simple_weighted_fusion,
tag_search_source,
)
from codexlens.storage.dir_index import DirIndexStore
# Three-way fusion weights (FTS + Vector + SPLADE)
THREE_WAY_WEIGHTS = {
"exact": 0.2,
"splade": 0.3,
"vector": 0.5,
}
class HybridSearchEngine:
"""Hybrid search engine with parallel execution and RRF fusion.
@@ -193,9 +202,22 @@ class HybridSearchEngine:
if source in results_map
}
with timer("rrf_fusion", self.logger):
# Determine fusion method from config (default: rrf)
fusion_method = "rrf"
rrf_k = 60
if self._config is not None:
fusion_method = getattr(self._config, "fusion_method", "rrf") or "rrf"
rrf_k = getattr(self._config, "rrf_k", 60) or 60
with timer("fusion", self.logger):
adaptive_weights = get_rrf_weights(query, active_weights)
fused_results = reciprocal_rank_fusion(results_map, adaptive_weights)
if fusion_method == "simple":
fused_results = simple_weighted_fusion(results_map, adaptive_weights)
else:
# Default to RRF
fused_results = reciprocal_rank_fusion(
results_map, adaptive_weights, k=rrf_k
)
# Optional: boost results that include explicit symbol matches
boost_factor = (

View File

@@ -132,6 +132,116 @@ def get_rrf_weights(
return adjust_weights_by_intent(detect_query_intent(query), base_weights)
def simple_weighted_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Dict[str, float] = None,
) -> List[SearchResult]:
"""Combine search results using simple weighted sum of normalized scores.
This is an alternative to RRF that preserves score magnitude information.
Scores are min-max normalized per source before weighted combination.
Formula: score(d) = Σ weight_source * normalized_score_source(d)
Args:
results_map: Dictionary mapping source name to list of SearchResult objects
Sources: 'exact', 'fuzzy', 'vector', 'splade'
weights: Dictionary mapping source name to weight (default: equal weights)
Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6}
Returns:
List of SearchResult objects sorted by fused score (descending)
Examples:
>>> fts_results = [SearchResult(path="a.py", score=10.0, excerpt="...")]
>>> vector_results = [SearchResult(path="b.py", score=0.85, excerpt="...")]
>>> results_map = {'exact': fts_results, 'vector': vector_results}
>>> fused = simple_weighted_fusion(results_map)
"""
if not results_map:
return []
# Default equal weights if not provided
if weights is None:
num_sources = len(results_map)
weights = {source: 1.0 / num_sources for source in results_map}
# Normalize weights to sum to 1.0
weight_sum = sum(weights.values())
if not math.isclose(weight_sum, 1.0, abs_tol=0.01) and weight_sum > 0:
weights = {source: w / weight_sum for source, w in weights.items()}
# Compute min-max normalization parameters per source
source_stats: Dict[str, tuple] = {}
for source_name, results in results_map.items():
if not results:
continue
scores = [r.score for r in results]
min_s, max_s = min(scores), max(scores)
source_stats[source_name] = (min_s, max_s)
def normalize_score(score: float, source: str) -> float:
"""Normalize score to [0, 1] range using min-max scaling."""
if source not in source_stats:
return 0.0
min_s, max_s = source_stats[source]
if max_s == min_s:
return 1.0 if score >= min_s else 0.0
return (score - min_s) / (max_s - min_s)
# Build unified result set with weighted scores
path_to_result: Dict[str, SearchResult] = {}
path_to_fusion_score: Dict[str, float] = {}
path_to_source_scores: Dict[str, Dict[str, float]] = {}
for source_name, results in results_map.items():
weight = weights.get(source_name, 0.0)
if weight == 0:
continue
for result in results:
path = result.path
normalized = normalize_score(result.score, source_name)
contribution = weight * normalized
if path not in path_to_fusion_score:
path_to_fusion_score[path] = 0.0
path_to_result[path] = result
path_to_source_scores[path] = {}
path_to_fusion_score[path] += contribution
path_to_source_scores[path][source_name] = normalized
# Create final results with fusion scores
fused_results = []
for path, base_result in path_to_result.items():
fusion_score = path_to_fusion_score[path]
fused_result = SearchResult(
path=base_result.path,
score=fusion_score,
excerpt=base_result.excerpt,
content=base_result.content,
symbol=base_result.symbol,
chunk=base_result.chunk,
metadata={
**base_result.metadata,
"fusion_method": "simple_weighted",
"fusion_score": fusion_score,
"original_score": base_result.score,
"source_scores": path_to_source_scores[path],
},
start_line=base_result.start_line,
end_line=base_result.end_line,
symbol_name=base_result.symbol_name,
symbol_kind=base_result.symbol_kind,
)
fused_results.append(fused_result)
fused_results.sort(key=lambda r: r.score, reverse=True)
return fused_results
def reciprocal_rank_fusion(
results_map: Dict[str, List[SearchResult]],
weights: Dict[str, float] = None,
@@ -141,11 +251,14 @@ def reciprocal_rank_fusion(
RRF formula: score(d) = Σ weight_source / (k + rank_source(d))
Supports three-way fusion with FTS, Vector, and SPLADE sources.
Args:
results_map: Dictionary mapping source name to list of SearchResult objects
Sources: 'exact', 'fuzzy', 'vector'
Sources: 'exact', 'fuzzy', 'vector', 'splade'
weights: Dictionary mapping source name to weight (default: equal weights)
Example: {'exact': 0.3, 'fuzzy': 0.1, 'vector': 0.6}
Or: {'splade': 0.4, 'vector': 0.6}
k: Constant to avoid division by zero and control rank influence (default 60)
Returns:
@@ -156,6 +269,14 @@ def reciprocal_rank_fusion(
>>> fuzzy_results = [SearchResult(path="b.py", score=8.0, excerpt="...")]
>>> results_map = {'exact': exact_results, 'fuzzy': fuzzy_results}
>>> fused = reciprocal_rank_fusion(results_map)
# Three-way fusion with SPLADE
>>> results_map = {
... 'exact': exact_results,
... 'vector': vector_results,
... 'splade': splade_results
... }
>>> fused = reciprocal_rank_fusion(results_map, k=60)
"""
if not results_map:
return []
@@ -174,6 +295,7 @@ def reciprocal_rank_fusion(
# Build unified result set with RRF scores
path_to_result: Dict[str, SearchResult] = {}
path_to_fusion_score: Dict[str, float] = {}
path_to_source_ranks: Dict[str, Dict[str, int]] = {}
for source_name, results in results_map.items():
weight = weights.get(source_name, 0.0)
@@ -188,8 +310,10 @@ def reciprocal_rank_fusion(
if path not in path_to_fusion_score:
path_to_fusion_score[path] = 0.0
path_to_result[path] = result
path_to_source_ranks[path] = {}
path_to_fusion_score[path] += rrf_contribution
path_to_source_ranks[path][source_name] = rank
# Create final results with fusion scores
fused_results = []
@@ -206,8 +330,11 @@ def reciprocal_rank_fusion(
chunk=base_result.chunk,
metadata={
**base_result.metadata,
"fusion_method": "rrf",
"fusion_score": fusion_score,
"original_score": base_result.score,
"rrf_k": k,
"source_ranks": path_to_source_ranks[path],
},
start_line=base_result.start_line,
end_line=base_result.end_line,

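To make the two fusion formulas concrete, a tiny worked comparison on a two-source ranking, independent of the SearchResult machinery (all numbers are toy):

k = 60
weights = {"exact": 0.4, "vector": 0.6}
ranks = {                                  # 1-based rank of each path per source
    "a.py": {"exact": 1, "vector": 3},
    "b.py": {"exact": 2, "vector": 1},
}

rrf = {
    path: sum(w / (k + r[src]) for src, w in weights.items())
    for path, r in ranks.items()
}
# a.py: 0.4/61 + 0.6/63 ≈ 0.01608;  b.py: 0.4/62 + 0.6/61 ≈ 0.01629
best = max(rrf, key=rrf.get)               # "b.py": top rank in the heavier source wins

simple_weighted_fusion would instead min-max normalize the raw scores per source before the weighted sum, preserving the score magnitude information that RRF deliberately discards.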
View File

@@ -412,3 +412,489 @@ class ANNIndex:
"""
with self._lock:
return self._index is not None and self._current_count > 0
class BinaryANNIndex:
"""Binary vector ANN index using Hamming distance for fast coarse retrieval.
Optimized for binary vectors (256-bit / 32 bytes per vector).
Uses packed binary representation for memory efficiency.
Performance characteristics:
- Storage: 32 bytes per vector (vs ~8KB for dense vectors)
- Distance: Hamming distance via XOR + popcount (CPU-efficient)
- Search: O(N) brute-force with SIMD-accelerated distance computation
Index parameters:
- dim: Binary vector dimension (default: 256)
- packed_dim: Packed bytes size (dim / 8 = 32 for 256-bit)
Usage:
index = BinaryANNIndex(index_path, dim=256)
index.add_vectors([1, 2, 3], packed_vectors) # List of 32-byte packed vectors
ids, distances = index.search(query_packed, top_k=10)
"""
DEFAULT_DIM = 256 # Default binary vector dimension
def __init__(
self,
index_path: Path,
dim: int = 256,
initial_capacity: int = 100000,
auto_save: bool = False,
) -> None:
"""Initialize Binary ANN index.
Args:
index_path: Path to database (index will be saved as _binary_vectors.bin)
dim: Dimension of binary vectors (default: 256)
initial_capacity: Initial capacity hint (default: 100000)
auto_save: Whether to automatically save index after operations
Raises:
ImportError: If required dependencies are not available
ValueError: If dimension is invalid
"""
if not SEMANTIC_AVAILABLE:
raise ImportError(
"Semantic search dependencies not available. "
"Install with: pip install codexlens[semantic]"
)
if dim <= 0 or dim % 8 != 0:
raise ValueError(
f"Invalid dimension: {dim}. Must be positive and divisible by 8."
)
self.index_path = Path(index_path)
self.dim = dim
self.packed_dim = dim // 8 # 32 bytes for 256-bit vectors
# Derive binary index path from database path
db_stem = self.index_path.stem
self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin"
# Memory management
self._auto_save = auto_save
self._initial_capacity = initial_capacity
# Thread safety
self._lock = threading.RLock()
# In-memory storage: id -> packed binary vector
self._vectors: dict[int, bytes] = {}
self._id_list: list[int] = [] # Ordered list for efficient iteration
logger.info(
f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}"
)
def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None:
"""Add packed binary vectors to the index.
Args:
ids: List of vector IDs (must be unique)
vectors: List of packed binary vectors (each of size packed_dim bytes)
Raises:
ValueError: If shapes don't match or vectors are invalid
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if len(vectors) != len(ids):
raise ValueError(
f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})"
)
# Validate vector sizes
for i, vec in enumerate(vectors):
if len(vec) != self.packed_dim:
raise ValueError(
f"Vector {i} has size {len(vec)}, expected {self.packed_dim}"
)
with self._lock:
try:
for vec_id, vec in zip(ids, vectors):
if vec_id not in self._vectors:
self._id_list.append(vec_id)
self._vectors[vec_id] = vec
logger.debug(
f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})"
)
if self._auto_save:
self.save()
except Exception as e:
raise StorageError(f"Failed to add vectors to Binary ANN index: {e}")
def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None:
"""Add unpacked binary vectors (0/1 values) to the index.
Convenience method that packs the vectors before adding.
Args:
ids: List of vector IDs (must be unique)
vectors: Numpy array of shape (N, dim) with binary values (0 or 1)
Raises:
ValueError: If shapes don't match
StorageError: If index operation fails
"""
if len(ids) == 0:
return
if vectors.shape[0] != len(ids):
raise ValueError(
f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})"
)
if vectors.shape[1] != self.dim:
raise ValueError(
f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})"
)
# Pack vectors
packed_vectors = []
for i in range(vectors.shape[0]):
packed = np.packbits(vectors[i].astype(np.uint8)).tobytes()
packed_vectors.append(packed)
self.add_vectors(ids, packed_vectors)
def remove_vectors(self, ids: List[int]) -> None:
"""Remove vectors from the index.
Args:
ids: List of vector IDs to remove
Raises:
StorageError: If index operation fails
Note:
Optimized for batch deletion using set operations instead of
O(N) list.remove() calls for each ID.
"""
if len(ids) == 0:
return
with self._lock:
try:
# Use set for O(1) lookup during filtering
ids_to_remove = set(ids)
removed_count = 0
# Remove from dictionary - O(1) per deletion
for vec_id in ids_to_remove:
if vec_id in self._vectors:
del self._vectors[vec_id]
removed_count += 1
# Rebuild ID list efficiently - O(N) once instead of O(N) per removal
if removed_count > 0:
self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove]
logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index")
if self._auto_save and removed_count > 0:
self.save()
except Exception as e:
raise StorageError(
f"Failed to remove vectors from Binary ANN index: {e}"
)
def search(
self, query: bytes, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search for nearest neighbors using Hamming distance.
Args:
query: Packed binary query vector (size: packed_dim bytes)
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances) where:
- ids: List of vector IDs ordered by Hamming distance (ascending)
- distances: List of Hamming distances (lower = more similar)
Raises:
ValueError: If query size is invalid
StorageError: If search operation fails
"""
if len(query) != self.packed_dim:
raise ValueError(
f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})"
)
with self._lock:
try:
if len(self._vectors) == 0:
return [], []
# Compute Hamming distances to all vectors
query_arr = np.frombuffer(query, dtype=np.uint8)
distances = []
for vec_id in self._id_list:
vec = self._vectors[vec_id]
vec_arr = np.frombuffer(vec, dtype=np.uint8)
# XOR and popcount for Hamming distance
xor = np.bitwise_xor(query_arr, vec_arr)
dist = int(np.unpackbits(xor).sum())
distances.append((vec_id, dist))
# Sort by distance (ascending)
distances.sort(key=lambda x: x[1])
# Return top-k
top_results = distances[:top_k]
ids = [r[0] for r in top_results]
dists = [r[1] for r in top_results]
return ids, dists
except Exception as e:
raise StorageError(f"Failed to search Binary ANN index: {e}")
def search_numpy(
self, query: np.ndarray, top_k: int = 10
) -> Tuple[List[int], List[int]]:
"""Search with unpacked binary query vector.
Convenience method that packs the query before searching.
Args:
query: Binary query vector of shape (dim,) with values 0 or 1
top_k: Number of nearest neighbors to return
Returns:
Tuple of (ids, distances)
"""
if query.ndim == 2:
query = query.flatten()
if len(query) != self.dim:
raise ValueError(
f"Query dimension ({len(query)}) must match index dimension ({self.dim})"
)
packed_query = np.packbits(query.astype(np.uint8)).tobytes()
return self.search(packed_query, top_k)
def search_batch(
self, queries: List[bytes], top_k: int = 10
) -> List[Tuple[List[int], List[int]]]:
"""Batch search for multiple queries.
Args:
queries: List of packed binary query vectors
top_k: Number of nearest neighbors to return per query
Returns:
List of (ids, distances) tuples, one per query
"""
results = []
for query in queries:
ids, dists = self.search(query, top_k)
results.append((ids, dists))
return results
def save(self) -> None:
"""Save index to disk.
Binary format:
- 4 bytes: magic number (0x42494E56 = "BINV")
- 4 bytes: version (1)
- 4 bytes: dim
- 4 bytes: packed_dim
- 4 bytes: num_vectors
- For each vector:
- 4 bytes: id
- packed_dim bytes: vector data
Raises:
StorageError: If save operation fails
"""
with self._lock:
try:
if len(self._vectors) == 0:
logger.debug("Skipping save: index is empty")
return
# Ensure parent directory exists
self.binary_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.binary_path, "wb") as f:
# Header
f.write(b"BINV") # Magic number
f.write(np.array([1], dtype=np.uint32).tobytes()) # Version
f.write(np.array([self.dim], dtype=np.uint32).tobytes())
f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes())
f.write(
np.array([len(self._vectors)], dtype=np.uint32).tobytes()
)
# Vectors
for vec_id in self._id_list:
f.write(np.array([vec_id], dtype=np.uint32).tobytes())
f.write(self._vectors[vec_id])
logger.debug(
f"Saved binary index to {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
except Exception as e:
raise StorageError(f"Failed to save Binary ANN index: {e}")
def load(self) -> bool:
"""Load index from disk.
Returns:
True if index was loaded successfully, False if index file doesn't exist
Raises:
StorageError: If load operation fails
"""
with self._lock:
try:
if not self.binary_path.exists():
logger.debug(f"Binary index file not found: {self.binary_path}")
return False
with open(self.binary_path, "rb") as f:
# Read header
magic = f.read(4)
if magic != b"BINV":
raise StorageError(
f"Invalid binary index file: bad magic number"
)
version = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if version != 1:
raise StorageError(
f"Unsupported binary index version: {version}"
)
file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0]
if file_dim != self.dim or file_packed_dim != self.packed_dim:
raise StorageError(
f"Dimension mismatch: file has dim={file_dim}, "
f"packed_dim={file_packed_dim}, "
f"expected dim={self.dim}, packed_dim={self.packed_dim}"
)
# Clear existing data
self._vectors.clear()
self._id_list.clear()
# Read vectors
for _ in range(num_vectors):
vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0]
vec_data = f.read(self.packed_dim)
self._vectors[int(vec_id)] = vec_data
self._id_list.append(int(vec_id))
logger.info(
f"Loaded binary index from {self.binary_path} "
f"({len(self._vectors)} vectors)"
)
return True
except StorageError:
raise
except Exception as e:
raise StorageError(f"Failed to load Binary ANN index: {e}")
def count(self) -> int:
"""Get number of vectors in the index.
Returns:
Number of vectors currently in the index
"""
with self._lock:
return len(self._vectors)
@property
def is_loaded(self) -> bool:
"""Check if index has vectors.
Returns:
True if index has vectors, False otherwise
"""
with self._lock:
return len(self._vectors) > 0
def get_vector(self, vec_id: int) -> Optional[bytes]:
"""Get a specific vector by ID.
Args:
vec_id: Vector ID to retrieve
Returns:
Packed binary vector or None if not found
"""
with self._lock:
return self._vectors.get(vec_id)
def clear(self) -> None:
"""Clear all vectors from the index."""
with self._lock:
self._vectors.clear()
self._id_list.clear()
logger.debug("Cleared binary index")
def create_ann_index(
index_path: Path,
index_type: str = "hnsw",
dim: int = 2048,
**kwargs,
) -> ANNIndex | BinaryANNIndex:
"""Factory function to create an ANN index.
Args:
index_path: Path to database file
index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors
dim: Vector dimension (default: 2048 for dense, 256 for binary)
**kwargs: Additional arguments passed to the index constructor
Returns:
ANNIndex for dense vectors or BinaryANNIndex for binary vectors
Raises:
ValueError: If index_type is invalid
Example:
>>> # Dense vector index (HNSW)
>>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048)
>>> dense_index.add_vectors(ids, dense_vectors)
>>>
>>> # Binary vector index (Hamming distance)
>>> binary_index = create_ann_index(path, index_type="binary", dim=256)
>>> binary_index.add_vectors(ids, packed_vectors)
"""
index_type = index_type.lower()
if index_type == "hnsw":
return ANNIndex(index_path=index_path, dim=dim, **kwargs)
elif index_type == "binary":
# Default to 256 for binary if not specified
if dim == 2048: # Default dense dim was used
dim = 256
return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs)
else:
raise ValueError(
f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'."
)
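
# Cascade usage sketch (illustrative only): this module defines construction,
# persistence, and per-ID accessors, so the candidate enumeration below is the
# caller's job, and hamming_distance's exact signature is an assumption.
#
#     from pathlib import Path
#     from codexlens.indexing import hamming_distance
#
#     coarse = create_ann_index(Path("vectors.bin"), index_type="binary", dim=256)
#     coarse.load()
#     scores = {
#         cid: hamming_distance(query_bits, coarse.get_vector(cid))
#         for cid in candidate_ids  # candidate IDs supplied by the caller
#     }
#     top_candidates = sorted(scores, key=scores.get)[:100]
#     # Stage 2 then reranks top_candidates with the 2048-dim dense index.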

View File

@@ -29,10 +29,17 @@ except ImportError:
# Try to import ANN index (optional hnswlib dependency)
try:
from codexlens.semantic.ann_index import ANNIndex, HNSWLIB_AVAILABLE
from codexlens.semantic.ann_index import (
ANNIndex,
BinaryANNIndex,
create_ann_index,
HNSWLIB_AVAILABLE,
)
except ImportError:
HNSWLIB_AVAILABLE = False
ANNIndex = None
BinaryANNIndex = None
create_ann_index = None
logger = logging.getLogger(__name__)
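
# Downstream code can feature-gate on the optional dependency, e.g. (sketch;
# whether BinaryANNIndex itself needs hnswlib is not shown in this diff):
#
#     if HNSWLIB_AVAILABLE and create_ann_index is not None:
#         index = create_ann_index(index_path, index_type="binary", dim=256)
#     else:
#         index = None  # degrade to exact, non-ANN search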

View File

@@ -0,0 +1,162 @@
"""
Migration 010: Add multi-vector storage support for cascade retrieval.
This migration introduces the chunks table with multi-vector support:
- chunks: Stores code chunks with multiple embedding types
- embedding: Original embedding for backward compatibility
- embedding_binary: 256-dim binary vector for coarse ranking (fast)
- embedding_dense: 2048-dim dense vector for fine ranking (precise)
The multi-vector architecture enables cascade retrieval:
1. First stage: Fast binary vector search for candidate retrieval
2. Second stage: Dense vector reranking for precision
"""
import logging
from sqlite3 import Connection
log = logging.getLogger(__name__)
def upgrade(db_conn: Connection) -> None:
"""
Adds chunks table with multi-vector embedding columns.
Creates:
- chunks: Table for storing code chunks with multiple embedding types
- idx_chunks_file_path: Index for efficient file-based lookups
Also migrates existing chunks tables by adding new columns if needed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
# Check if chunks table already exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if table_exists:
# Migrate existing table - add new columns if missing
log.info("chunks table exists, checking for missing columns...")
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
if "embedding_binary" not in existing_columns:
log.info("Adding embedding_binary column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
)
if "embedding_dense" not in existing_columns:
log.info("Adding embedding_dense column to chunks table...")
cursor.execute(
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
)
else:
# Create new table with all columns
log.info("Creating chunks table with multi-vector support...")
cursor.execute(
"""
CREATE TABLE chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
embedding_binary BLOB,
embedding_dense BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
# Create index for file-based lookups
log.info("Creating index for chunks table...")
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 completed successfully")
def downgrade(db_conn: Connection) -> None:
"""
Removes multi-vector columns from chunks table.
Note: This does not drop the chunks table entirely to preserve data.
Only the new columns added by this migration are removed.
Args:
db_conn: The SQLite database connection.
"""
cursor = db_conn.cursor()
log.info("Removing multi-vector columns from chunks table...")
    # SQLite supports ALTER TABLE ... DROP COLUMN only from 3.35 onward,
    # so for portability we recreate the table without the new columns
# Check if chunks table exists
table_exists = cursor.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if not table_exists:
log.info("chunks table does not exist, nothing to downgrade")
return
# Check if the columns exist before trying to remove them
col_info = cursor.execute("PRAGMA table_info(chunks)").fetchall()
existing_columns = {row[1] for row in col_info}
needs_migration = (
"embedding_binary" in existing_columns or
"embedding_dense" in existing_columns
)
if not needs_migration:
log.info("Multi-vector columns not present, nothing to remove")
return
# Recreate table without the new columns
log.info("Recreating chunks table without multi-vector columns...")
cursor.execute(
"""
CREATE TABLE chunks_backup (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
cursor.execute(
"""
INSERT INTO chunks_backup (id, file_path, content, embedding, metadata, created_at)
SELECT id, file_path, content, embedding, metadata, created_at FROM chunks
"""
)
cursor.execute("DROP TABLE chunks")
cursor.execute("ALTER TABLE chunks_backup RENAME TO chunks")
# Recreate index
cursor.execute(
"""
CREATE INDEX IF NOT EXISTS idx_chunks_file_path
ON chunks(file_path)
"""
)
log.info("Migration 010 downgrade completed successfully")

View File

@@ -539,6 +539,27 @@ class SQLiteStore:
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
# Chunks table for multi-vector storage (cascade retrieval architecture)
# - embedding: Original embedding for backward compatibility
# - embedding_binary: 256-dim binary vector for coarse ranking
# - embedding_dense: 2048-dim dense vector for fine ranking
conn.execute(
"""
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT NOT NULL,
content TEXT NOT NULL,
embedding BLOB,
embedding_binary BLOB,
embedding_dense BLOB,
metadata TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)")
# Run migration for existing databases
self._migrate_chunks_table(conn)
conn.commit()
except sqlite3.DatabaseError as exc:
raise StorageError(f"Failed to initialize database schema: {exc}") from exc
@@ -650,3 +671,306 @@ class SQLiteStore:
conn.execute("VACUUM")
except sqlite3.DatabaseError:
pass
def _migrate_chunks_table(self, conn: sqlite3.Connection) -> None:
"""Migrate existing chunks table to add multi-vector columns if needed.
This handles upgrading existing databases that may have the chunks table
without the embedding_binary and embedding_dense columns.
"""
# Check if chunks table exists
table_exists = conn.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'"
).fetchone()
if not table_exists:
# Table doesn't exist yet, nothing to migrate
return
# Check existing columns
cursor = conn.execute("PRAGMA table_info(chunks)")
columns = {row[1] for row in cursor.fetchall()}
# Add embedding_binary column if missing
if "embedding_binary" not in columns:
logger.info("Migrating chunks table: adding embedding_binary column")
conn.execute(
"ALTER TABLE chunks ADD COLUMN embedding_binary BLOB"
)
# Add embedding_dense column if missing
if "embedding_dense" not in columns:
logger.info("Migrating chunks table: adding embedding_dense column")
conn.execute(
"ALTER TABLE chunks ADD COLUMN embedding_dense BLOB"
)
def add_chunks(
self,
file_path: str,
chunks_data: List[Dict[str, Any]],
*,
embedding: Optional[List[List[float]]] = None,
embedding_binary: Optional[List[bytes]] = None,
embedding_dense: Optional[List[bytes]] = None,
) -> List[int]:
"""Add multiple chunks with multi-vector embeddings support.
This method supports the cascade retrieval architecture with three embedding types:
- embedding: Original dense embedding for backward compatibility
- embedding_binary: 256-dim binary vector for fast coarse ranking
- embedding_dense: 2048-dim dense vector for precise fine ranking
Args:
file_path: Path to the source file for all chunks.
chunks_data: List of dicts with 'content' and optional 'metadata' keys.
embedding: Optional list of dense embeddings (one per chunk).
embedding_binary: Optional list of binary embeddings as bytes (one per chunk).
embedding_dense: Optional list of dense embeddings as bytes (one per chunk).
Returns:
List of inserted chunk IDs.
Raises:
ValueError: If embedding list lengths don't match chunks_data length.
StorageError: If database operation fails.
"""
if not chunks_data:
return []
n_chunks = len(chunks_data)
# Validate embedding lengths
if embedding is not None and len(embedding) != n_chunks:
raise ValueError(
f"embedding length ({len(embedding)}) != chunks_data length ({n_chunks})"
)
if embedding_binary is not None and len(embedding_binary) != n_chunks:
raise ValueError(
f"embedding_binary length ({len(embedding_binary)}) != chunks_data length ({n_chunks})"
)
if embedding_dense is not None and len(embedding_dense) != n_chunks:
raise ValueError(
f"embedding_dense length ({len(embedding_dense)}) != chunks_data length ({n_chunks})"
)
        # Prepare batch data (struct packs the optional float embeddings below)
        import struct

        batch_data = []
for i, chunk in enumerate(chunks_data):
content = chunk.get("content", "")
metadata = chunk.get("metadata")
metadata_json = json.dumps(metadata) if metadata else None
# Convert embeddings to bytes if needed
emb_blob = None
if embedding is not None:
                emb_blob = struct.pack(f"{len(embedding[i])}f", *embedding[i])
emb_binary_blob = embedding_binary[i] if embedding_binary is not None else None
emb_dense_blob = embedding_dense[i] if embedding_dense is not None else None
batch_data.append((
file_path, content, emb_blob, emb_binary_blob, emb_dense_blob, metadata_json
))
with self._lock:
conn = self._get_connection()
try:
                # Compute the starting ID before insert; AUTOINCREMENT plus the
                # held store lock makes the inserted IDs consecutive from here
row = conn.execute("SELECT MAX(id) FROM chunks").fetchone()
start_id = (row[0] or 0) + 1
conn.executemany(
"""
INSERT INTO chunks (
file_path, content, embedding, embedding_binary,
embedding_dense, metadata
)
VALUES (?, ?, ?, ?, ?, ?)
""",
batch_data
)
conn.commit()
# Calculate inserted IDs
return list(range(start_id, start_id + n_chunks))
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to add chunks: {exc}",
db_path=str(self.db_path),
operation="add_chunks",
) from exc
def get_binary_embeddings(
self, chunk_ids: List[int]
) -> Dict[int, Optional[bytes]]:
"""Get binary embeddings for specified chunk IDs.
Used for coarse ranking in cascade retrieval architecture.
Binary embeddings (256-dim) enable fast approximate similarity search.
Args:
chunk_ids: List of chunk IDs to retrieve embeddings for.
Returns:
            Dictionary mapping chunk_id to embedding_binary bytes (None when
            the column is unset); IDs not found in the table are omitted.
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return {}
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"SELECT id, embedding_binary FROM chunks WHERE id IN ({placeholders})",
chunk_ids
).fetchall()
return {row["id"]: row["embedding_binary"] for row in rows}
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get binary embeddings: {exc}",
db_path=str(self.db_path),
operation="get_binary_embeddings",
) from exc
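    # Coarse-ranking sketch over the blobs returned above; hamming_distance
    # comes from codexlens.indexing and its exact signature is assumed here:
    #
    #     blobs = store.get_binary_embeddings(candidate_ids)
    #     coarse = sorted(
    #         (cid for cid, b in blobs.items() if b is not None),
    #         key=lambda cid: hamming_distance(query_bits, blobs[cid]),
    #     )[:coarse_k]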
def get_dense_embeddings(
self, chunk_ids: List[int]
) -> Dict[int, Optional[bytes]]:
"""Get dense embeddings for specified chunk IDs.
Used for fine ranking in cascade retrieval architecture.
Dense embeddings (2048-dim) provide high-precision similarity scoring.
Args:
chunk_ids: List of chunk IDs to retrieve embeddings for.
Returns:
            Dictionary mapping chunk_id to embedding_dense bytes (None when
            the column is unset); IDs not found in the table are omitted.
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return {}
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"SELECT id, embedding_dense FROM chunks WHERE id IN ({placeholders})",
chunk_ids
).fetchall()
return {row["id"]: row["embedding_dense"] for row in rows}
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get dense embeddings: {exc}",
db_path=str(self.db_path),
operation="get_dense_embeddings",
) from exc
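    # Fine-ranking sketch: dense blobs round-trip through numpy as float32 in
    # native byte order, matching the struct.pack call in add_chunks above:
    #
    #     import numpy as np
    #
    #     dense_blobs = store.get_dense_embeddings(coarse)
    #     def cosine(blob):
    #         v = np.frombuffer(blob, dtype=np.float32)
    #         return float(v @ query_vec) / (
    #             float(np.linalg.norm(v)) * float(np.linalg.norm(query_vec))
    #         )
    #     fine = sorted(
    #         (cid for cid, b in dense_blobs.items() if b is not None),
    #         key=lambda cid: -cosine(dense_blobs[cid]),
    #     )[:fine_k]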
def get_chunks_by_ids(
self, chunk_ids: List[int]
) -> List[Dict[str, Any]]:
"""Get chunk data for specified IDs.
Args:
chunk_ids: List of chunk IDs to retrieve.
Returns:
List of chunk dictionaries with id, file_path, content, metadata.
Raises:
StorageError: If database query fails.
"""
if not chunk_ids:
return []
with self._lock:
conn = self._get_connection()
try:
placeholders = ",".join("?" * len(chunk_ids))
rows = conn.execute(
f"""
SELECT id, file_path, content, metadata, created_at
FROM chunks
WHERE id IN ({placeholders})
""",
chunk_ids
).fetchall()
results = []
for row in rows:
metadata = None
if row["metadata"]:
try:
metadata = json.loads(row["metadata"])
except json.JSONDecodeError:
pass
results.append({
"id": row["id"],
"file_path": row["file_path"],
"content": row["content"],
"metadata": metadata,
"created_at": row["created_at"],
})
return results
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to get chunks: {exc}",
db_path=str(self.db_path),
operation="get_chunks_by_ids",
) from exc
def delete_chunks_by_file(self, file_path: str) -> int:
"""Delete all chunks for a given file path.
Args:
file_path: Path to the source file.
Returns:
Number of deleted chunks.
Raises:
StorageError: If database operation fails.
"""
with self._lock:
conn = self._get_connection()
try:
cursor = conn.execute(
"DELETE FROM chunks WHERE file_path = ?",
(file_path,)
)
conn.commit()
return cursor.rowcount
except sqlite3.DatabaseError as exc:
raise StorageError(
f"Failed to delete chunks: {exc}",
db_path=str(self.db_path),
operation="delete_chunks_by_file",
) from exc
def count_chunks(self) -> int:
"""Count total chunks in store.
Returns:
Total number of chunks.
"""
with self._lock:
conn = self._get_connection()
row = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()
return int(row["c"]) if row else 0