Refactor code structure and remove redundant changes

2026-03-30 20:21:09 +08:00 · 2026-01-24 14:47:47 +08:00
parent cf5fecd66d
commit f2b0a5bbc9
113 changed files with 43217 additions and 235 deletions
--- a/codex-lens/build/lib/codexlens/search/binary_searcher.py
+++ b/codex-lens/build/lib/codexlens/search/binary_searcher.py
@@ -0,0 +1,277 @@
+"""Binary vector searcher for cascade search.
+
+This module provides fast binary vector search using Hamming distance
+for the first stage of cascade search (coarse filtering).
+
+Supports two loading modes:
+1. Memory-mapped file (preferred): Low memory footprint, OS-managed paging
+2. Database loading (fallback): Loads all vectors into RAM
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# Pre-computed popcount lookup table for vectorized Hamming distance
+# Each byte value (0-255) maps to its bit count
+_POPCOUNT_TABLE = np.array([bin(i).count('1') for i in range(256)], dtype=np.uint8)
+
+
+class BinarySearcher:
+    """Fast binary vector search using Hamming distance.
+
+    This class implements the first stage of cascade search:
+    fast, approximate retrieval using binary vectors and Hamming distance.
+
+    The binary vectors are derived from dense embeddings by thresholding:
+    binary[i] = 1 if dense[i] > 0 else 0
+
+    Hamming distance between two binary vectors counts the number of
+    differing bits, which can be computed very efficiently using XOR
+    and population count.
+
+    Supports two loading modes:
+    - Memory-mapped file (preferred): Uses np.memmap for minimal RAM usage
+    - Database (fallback): Loads all vectors into memory from SQLite
+    """
+
+    def __init__(self, index_root_or_meta_path: Path) -> None:
+        """Initialize BinarySearcher.
+
+        Args:
+            index_root_or_meta_path: Either:
+                - Path to index root directory (containing _binary_vectors.mmap)
+                - Path to _vectors_meta.db (legacy mode, loads from DB)
+        """
+        path = Path(index_root_or_meta_path)
+
+        # Determine if this is an index root or a specific DB path
+        if path.suffix == '.db':
+            # Legacy mode: specific DB path
+            self.index_root = path.parent
+            self.meta_store_path = path
+        else:
+            # New mode: index root directory
+            self.index_root = path
+            self.meta_store_path = path / "_vectors_meta.db"
+
+        self._chunk_ids: Optional[np.ndarray] = None
+        self._binary_matrix: Optional[np.ndarray] = None
+        self._is_memmap = False
+        self._loaded = False
+
+    def load(self) -> bool:
+        """Load binary vectors using memory-mapped file or database fallback.
+
+        Tries to load from memory-mapped file first (preferred for large indexes),
+        falls back to database loading if mmap file doesn't exist.
+
+        Returns:
+            True if vectors were loaded successfully.
+        """
+        if self._loaded:
+            return True
+
+        # Try memory-mapped file first (preferred)
+        mmap_path = self.index_root / "_binary_vectors.mmap"
+        meta_path = mmap_path.with_suffix('.meta.json')
+
+        if mmap_path.exists() and meta_path.exists():
+            try:
+                with open(meta_path, 'r') as f:
+                    meta = json.load(f)
+
+                shape = tuple(meta['shape'])
+                self._chunk_ids = np.array(meta['chunk_ids'], dtype=np.int64)
+
+                # Memory-map the binary matrix (read-only)
+                self._binary_matrix = np.memmap(
+                    str(mmap_path),
+                    dtype=np.uint8,
+                    mode='r',
+                    shape=shape
+                )
+                self._is_memmap = True
+                self._loaded = True
+
+                logger.info(
+                    "Memory-mapped %d binary vectors (%d bytes each)",
+                    len(self._chunk_ids), shape[1]
+                )
+                return True
+
+            except Exception as e:
+                logger.warning("Failed to load mmap binary vectors, falling back to DB: %s", e)
+
+        # Fallback: load from database
+        return self._load_from_db()
+
+    def _load_from_db(self) -> bool:
+        """Load binary vectors from database (legacy/fallback mode).
+
+        Returns:
+            True if vectors were loaded successfully.
+        """
+        try:
+            from codexlens.storage.vector_meta_store import VectorMetadataStore
+
+            with VectorMetadataStore(self.meta_store_path) as store:
+                rows = store.get_all_binary_vectors()
+
+            if not rows:
+                logger.warning("No binary vectors found in %s", self.meta_store_path)
+                return False
+
+            # Convert to numpy arrays for fast computation
+            self._chunk_ids = np.array([r[0] for r in rows], dtype=np.int64)
+
+            # Unpack bytes to numpy array
+            binary_arrays = []
+            for _, vec_bytes in rows:
+                arr = np.frombuffer(vec_bytes, dtype=np.uint8)
+                binary_arrays.append(arr)
+
+            self._binary_matrix = np.vstack(binary_arrays)
+            self._is_memmap = False
+            self._loaded = True
+
+            logger.info(
+                "Loaded %d binary vectors from DB (%d bytes each)",
+                len(self._chunk_ids), self._binary_matrix.shape[1]
+            )
+            return True
+
+        except Exception as e:
+            logger.error("Failed to load binary vectors: %s", e)
+            return False
+
+    def search(
+        self,
+        query_vector: np.ndarray,
+        top_k: int = 100
+    ) -> List[Tuple[int, int]]:
+        """Search for similar vectors using Hamming distance.
+
+        Args:
+            query_vector: Dense query vector (will be binarized).
+            top_k: Number of top results to return.
+
+        Returns:
+            List of (chunk_id, hamming_distance) tuples sorted by distance.
+        """
+        if not self._loaded and not self.load():
+            return []
+
+        # Binarize query vector
+        query_binary = (query_vector > 0).astype(np.uint8)
+        query_packed = np.packbits(query_binary)
+
+        # Compute Hamming distances using XOR and popcount
+        # XOR gives 1 for differing bits
+        xor_result = np.bitwise_xor(self._binary_matrix, query_packed)
+
+        # Vectorized popcount using lookup table (orders of magnitude faster)
+        # Sum the bit counts for each byte across all columns
+        distances = np.sum(_POPCOUNT_TABLE[xor_result], axis=1, dtype=np.int32)
+
+        # Get top-k with smallest distances
+        if top_k >= len(distances):
+            top_indices = np.argsort(distances)
+        else:
+            # Partial sort for efficiency
+            top_indices = np.argpartition(distances, top_k)[:top_k]
+            top_indices = top_indices[np.argsort(distances[top_indices])]
+
+        results = [
+            (int(self._chunk_ids[i]), int(distances[i]))
+            for i in top_indices
+        ]
+
+        return results
+
+    def search_with_rerank(
+        self,
+        query_dense: np.ndarray,
+        dense_vectors: np.ndarray,
+        dense_chunk_ids: np.ndarray,
+        top_k: int = 10,
+        candidates: int = 100
+    ) -> List[Tuple[int, float]]:
+        """Two-stage cascade search: binary filter + dense rerank.
+
+        Args:
+            query_dense: Dense query vector.
+            dense_vectors: Dense vectors for reranking (from HNSW or stored).
+            dense_chunk_ids: Chunk IDs corresponding to dense_vectors.
+            top_k: Final number of results.
+            candidates: Number of candidates from binary search.
+
+        Returns:
+            List of (chunk_id, cosine_similarity) tuples.
+        """
+        # Stage 1: Binary filtering
+        binary_results = self.search(query_dense, top_k=candidates)
+        if not binary_results:
+            return []
+
+        candidate_ids = {r[0] for r in binary_results}
+
+        # Stage 2: Dense reranking
+        # Find indices of candidates in dense_vectors
+        candidate_mask = np.isin(dense_chunk_ids, list(candidate_ids))
+        candidate_indices = np.where(candidate_mask)[0]
+
+        if len(candidate_indices) == 0:
+            # Fallback: return binary results with normalized distance
+            max_dist = max(r[1] for r in binary_results) if binary_results else 1
+            return [(r[0], 1.0 - r[1] / max_dist) for r in binary_results[:top_k]]
+
+        # Compute cosine similarities for candidates
+        candidate_vectors = dense_vectors[candidate_indices]
+        candidate_ids_array = dense_chunk_ids[candidate_indices]
+
+        # Normalize vectors
+        query_norm = query_dense / (np.linalg.norm(query_dense) + 1e-8)
+        cand_norms = candidate_vectors / (
+            np.linalg.norm(candidate_vectors, axis=1, keepdims=True) + 1e-8
+        )
+
+        # Cosine similarities
+        similarities = np.dot(cand_norms, query_norm)
+
+        # Sort by similarity (descending)
+        sorted_indices = np.argsort(-similarities)[:top_k]
+
+        results = [
+            (int(candidate_ids_array[i]), float(similarities[i]))
+            for i in sorted_indices
+        ]
+
+        return results
+
+    @property
+    def vector_count(self) -> int:
+        """Get number of loaded binary vectors."""
+        return len(self._chunk_ids) if self._chunk_ids is not None else 0
+
+    @property
+    def is_memmap(self) -> bool:
+        """Check if using memory-mapped file (vs in-memory array)."""
+        return self._is_memmap
+
+    def clear(self) -> None:
+        """Clear loaded vectors from memory."""
+        # For memmap, just delete the reference (OS will handle cleanup)
+        if self._is_memmap and self._binary_matrix is not None:
+            del self._binary_matrix
+        self._chunk_ids = None
+        self._binary_matrix = None
+        self._is_memmap = False
+        self._loaded = False