"""Approximate Nearest Neighbor (ANN) index using HNSW algorithm.
|
|
|
|
Provides O(log N) similarity search using hnswlib's Hierarchical Navigable Small World graphs.
|
|
Falls back to brute-force search when hnswlib is not available.
|
|
|
|
Key features:
|
|
- HNSW index for fast approximate nearest neighbor search
|
|
- Persistent index storage (saved alongside SQLite database)
|
|
- Incremental vector addition and deletion
|
|
- Thread-safe operations
|
|
- Cosine similarity metric
|
|
- Support for centralized storage mode (single index at project root)
|
|
"""

from __future__ import annotations

import logging
import threading
from pathlib import Path
from typing import List, Optional, Tuple

from codexlens.errors import StorageError
from codexlens.config import VECTORS_HNSW_NAME

from . import SEMANTIC_AVAILABLE

if SEMANTIC_AVAILABLE:
    import numpy as np

logger = logging.getLogger(__name__)

# Try to import hnswlib (optional dependency)
try:
    import hnswlib

    HNSWLIB_AVAILABLE = True
except ImportError:
    HNSWLIB_AVAILABLE = False

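# Quick-reference sketch of the dense index class below (comments only, so
# nothing runs at import time; the path, IDs, and embedding array are made-up
# illustrative values):
#
#     index = ANNIndex(Path("project/_index.db"), dim=768)
#     if not index.load():                       # no .hnsw file yet
#         index.add_vectors([1, 2], embeddings)  # embeddings: (2, 768) float32
#         index.save()
#     ids, dists = index.search(query_vec, top_k=5)
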
class ANNIndex:
    """HNSW-based approximate nearest neighbor index for vector similarity search.

    Performance characteristics:
    - Build time: O(N log N), where N is the number of vectors
    - Search time: O(log N), approximate
    - Memory: ~(d * 4 + M * 2 * 4) bytes per vector (M=16, d=dimension)

    Index parameters:
    - space: cosine (cosine similarity metric)
    - M: 16 (max connections per node - balance between speed and recall)
    - ef_construction: 200 (search width during build - higher = better quality)
    - ef: 50 (search width during query - higher = better recall)
    """

    def __init__(
        self,
        index_path: Path,
        dim: int,
        initial_capacity: int = 50000,
        auto_save: bool = False,
        expansion_threshold: float = 0.8,
    ) -> None:
        """Initialize the ANN index.

        Args:
            index_path: Path to the SQLite database (the HNSW file is saved
                alongside it as <db_stem>_vectors.hnsw)
            dim: Dimension of embedding vectors
            initial_capacity: Initial maximum element capacity (default: 50000)
            auto_save: Whether to automatically save the index after mutating
                operations (default: False)
            expansion_threshold: Capacity usage ratio that triggers
                auto-expansion (default: 0.8)

        Raises:
            ImportError: If required dependencies are not available
            ValueError: If the dimension, capacity, or threshold is invalid
        """
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        if not HNSWLIB_AVAILABLE:
            raise ImportError(
                "hnswlib is required for ANN index. "
                "Install with: pip install hnswlib"
            )

        if dim <= 0:
            raise ValueError(f"Invalid dimension: {dim}")

        if initial_capacity <= 0:
            raise ValueError(f"Invalid initial capacity: {initial_capacity}")

        if not 0.0 < expansion_threshold < 1.0:
            raise ValueError(
                f"Invalid expansion threshold: {expansion_threshold}. "
                "Must be between 0 and 1."
            )

        self.index_path = Path(index_path)
        self.dim = dim

        # Derive the HNSW index path from the database path, e.g.
        # /path/to/_index.db -> /path/to/_index_vectors.hnsw.
        # This ensures a unique HNSW file for each database.
        db_stem = self.index_path.stem  # e.g., "_index" or "tmp123"
        self.hnsw_path = self.index_path.parent / f"{db_stem}_vectors.hnsw"

        # HNSW parameters
        self.space = "cosine"  # Cosine similarity metric
        self.M = 16  # Max connections per node (16 is a good balance)
        self.ef_construction = 200  # Build-time search width (higher = better quality)
        self.ef = 50  # Query-time search width (higher = better recall)

        # Memory management parameters
        self._auto_save = auto_save
        self._expansion_threshold = expansion_threshold

        # Thread safety
        self._lock = threading.RLock()

        # HNSW index instance (created lazily)
        self._index: Optional[hnswlib.Index] = None
        self._max_elements = initial_capacity  # Initial capacity
        self._current_count = 0  # Track number of vectors

        logger.info(
            f"Initialized ANNIndex with capacity={initial_capacity}, "
            f"auto_save={auto_save}, expansion_threshold={expansion_threshold}"
        )

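    # Construction sketch for the constructor above (comments only, nothing
    # runs at import time; "index.db" is a hypothetical path and the sketch
    # assumes the optional hnswlib/numpy dependencies are installed):
    #
    #     index = ANNIndex(Path("index.db"), dim=768, initial_capacity=10_000)
    #     index.load()  # False on first run; the index starts empty
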
    @classmethod
    def create_central(
        cls,
        index_root: Path,
        dim: int,
        initial_capacity: int = 50000,
        auto_save: bool = False,
        expansion_threshold: float = 0.8,
    ) -> "ANNIndex":
        """Create a centralized ANN index at the project index root.

        Creates a single shared HNSW index file at the project root rather
        than per-directory indexes. Use this for projects that want all dense
        vectors stored in one central location.

        Args:
            index_root: Root directory for the index (e.g., .codexlens/<project_hash>/)
            dim: Dimension of embedding vectors
            initial_capacity: Initial maximum element capacity (default: 50000)
            auto_save: Whether to automatically save the index after mutating
                operations (default: False)
            expansion_threshold: Capacity usage ratio that triggers
                auto-expansion (default: 0.8)

        Returns:
            ANNIndex instance configured for centralized storage

        Example:
            >>> index = ANNIndex.create_central(Path(".codexlens/abc123"), dim=768)
            >>> index.hnsw_path  # Returns: .codexlens/abc123/_vectors.hnsw
        """
        # Bypass __init__ so hnsw_path can be set directly to
        # index_root / VECTORS_HNSW_NAME instead of being derived from a
        # database path.
        instance = cls.__new__(cls)

        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        if not HNSWLIB_AVAILABLE:
            raise ImportError(
                "hnswlib is required for ANN index. "
                "Install with: pip install hnswlib"
            )

        if dim <= 0:
            raise ValueError(f"Invalid dimension: {dim}")

        if initial_capacity <= 0:
            raise ValueError(f"Invalid initial capacity: {initial_capacity}")

        if not 0.0 < expansion_threshold < 1.0:
            raise ValueError(
                f"Invalid expansion threshold: {expansion_threshold}. "
                "Must be between 0 and 1."
            )

        instance.index_path = index_root
        instance.dim = dim

        # Centralized mode: use VECTORS_HNSW_NAME directly at index_root
        instance.hnsw_path = index_root / VECTORS_HNSW_NAME

        # HNSW parameters
        instance.space = "cosine"
        instance.M = 16
        instance.ef_construction = 200
        instance.ef = 50

        # Memory management parameters
        instance._auto_save = auto_save
        instance._expansion_threshold = expansion_threshold

        # Thread safety
        instance._lock = threading.RLock()

        # HNSW index instance (created lazily)
        instance._index = None
        instance._max_elements = initial_capacity
        instance._current_count = 0

        logger.info(
            f"Initialized centralized ANNIndex at {instance.hnsw_path} with "
            f"capacity={initial_capacity}, auto_save={auto_save}"
        )

        return instance

    def _ensure_index(self) -> None:
        """Ensure the HNSW index is initialized (lazy initialization)."""
        if self._index is None:
            self._index = hnswlib.Index(space=self.space, dim=self.dim)
            self._index.init_index(
                max_elements=self._max_elements,
                ef_construction=self.ef_construction,
                M=self.M,
            )
            self._index.set_ef(self.ef)
            self._current_count = 0
            logger.debug(f"Created new HNSW index with capacity {self._max_elements}")

    def _auto_expand_if_needed(self, additional_count: int) -> None:
        """Auto-expand index capacity if the usage threshold is reached.

        Args:
            additional_count: Number of vectors about to be added

        Note:
            Called internally by add_vectors; the caller must hold self._lock.
        """
        usage_ratio = (self._current_count + additional_count) / self._max_elements

        if usage_ratio >= self._expansion_threshold:
            # New capacity: double the current capacity, or enough to fit the
            # incoming vectors, whichever is larger
            new_capacity = max(
                self._max_elements * 2,
                self._current_count + additional_count,
            )

            logger.info(
                f"Expanding index capacity: {self._max_elements} -> {new_capacity} "
                f"(usage: {usage_ratio:.1%}, threshold: {self._expansion_threshold:.1%})"
            )

            self._index.resize_index(new_capacity)
            self._max_elements = new_capacity

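    # Worked numbers for the expansion rule above: with initial_capacity=50000
    # and expansion_threshold=0.8, the first add_vectors call that pushes
    # (count + additional) / 50000 to 0.8 or more resizes the index to
    # max(50000 * 2, count + additional) = 100000; the next expansion then
    # triggers around 80000 vectors, doubling to 200000, and so on.
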
    def add_vectors(self, ids: List[int], vectors: np.ndarray) -> None:
        """Add vectors to the index.

        Args:
            ids: List of vector IDs (must be unique)
            vectors: Numpy array of shape (N, dim) where N = len(ids)

        Raises:
            ValueError: If shapes don't match or vectors are invalid
            StorageError: If the index operation fails
        """
        if len(ids) == 0:
            return

        if vectors.shape[0] != len(ids):
            raise ValueError(
                f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})"
            )

        if vectors.shape[1] != self.dim:
            raise ValueError(
                f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})"
            )

        with self._lock:
            try:
                self._ensure_index()

                # Auto-expand if the usage threshold is reached
                self._auto_expand_if_needed(len(ids))

                # Ensure vectors are C-contiguous float32 (hnswlib requirement)
                if not vectors.flags["C_CONTIGUOUS"] or vectors.dtype != np.float32:
                    vectors = np.ascontiguousarray(vectors, dtype=np.float32)

                # Add vectors to the index
                self._index.add_items(vectors, ids)
                self._current_count += len(ids)

                logger.debug(
                    f"Added {len(ids)} vectors to index "
                    f"(total: {self._current_count}/{self._max_elements})"
                )

                # Auto-save if enabled
                if self._auto_save:
                    self.save()

            except Exception as e:
                raise StorageError(f"Failed to add vectors to ANN index: {e}") from e

    def remove_vectors(self, ids: List[int]) -> None:
        """Remove vectors from the index by marking them as deleted.

        Note: hnswlib uses soft deletion (mark_deleted). Vectors are not
        physically removed but are excluded from search results.

        Args:
            ids: List of vector IDs to remove

        Raises:
            StorageError: If the index operation fails
        """
        if len(ids) == 0:
            return

        with self._lock:
            try:
                if self._index is None or self._current_count == 0:
                    return  # Nothing to remove

                # Mark vectors as deleted
                deleted_count = 0
                for vec_id in ids:
                    try:
                        self._index.mark_deleted(vec_id)
                        deleted_count += 1
                    except RuntimeError:
                        # ID not found - ignore (idempotent deletion)
                        pass

                logger.debug(f"Marked {deleted_count}/{len(ids)} vectors as deleted")

                # Auto-save if enabled
                if self._auto_save and deleted_count > 0:
                    self.save()

            except Exception as e:
                raise StorageError(f"Failed to remove vectors from ANN index: {e}") from e

    def search(
        self, query: np.ndarray, top_k: int = 10
    ) -> Tuple[List[int], List[float]]:
        """Search for nearest neighbors.

        Args:
            query: Query vector of shape (dim,) or (1, dim)
            top_k: Number of nearest neighbors to return

        Returns:
            Tuple of (ids, distances) where:
            - ids: List of vector IDs ordered by similarity
            - distances: List of cosine distances (lower = more similar)

        Raises:
            ValueError: If the query shape is invalid
            StorageError: If the search operation fails
        """
        # Validate query shape
        if query.ndim == 1:
            query = query.reshape(1, -1)

        if query.shape[0] != 1:
            raise ValueError(
                f"Query must be a single vector, got shape {query.shape}"
            )

        if query.shape[1] != self.dim:
            raise ValueError(
                f"Query dimension ({query.shape[1]}) must match index dimension ({self.dim})"
            )

        with self._lock:
            try:
                if self._index is None or self._current_count == 0:
                    return [], []  # Empty index

                # Clamp k: hnswlib raises if k exceeds the element count
                k = min(top_k, self._current_count)

                # Perform kNN search
                labels, distances = self._index.knn_query(query, k=k)

                # Flatten and convert (knn_query returns 2D arrays)
                ids = labels[0].tolist()
                dists = distances[0].tolist()

                return ids, dists

            except Exception as e:
                raise StorageError(f"Failed to search ANN index: {e}") from e

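    # Round-trip sketch for add_vectors/search above (illustrative values,
    # dim=4; "demo.db" is a hypothetical path):
    #
    #     idx = ANNIndex(Path("demo.db"), dim=4)
    #     vecs = np.eye(4, dtype=np.float32)         # four orthogonal unit vectors
    #     idx.add_vectors([10, 11, 12, 13], vecs)
    #     ids, dists = idx.search(vecs[2], top_k=2)  # ids[0] == 12, dists[0] ~= 0.0
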
    def save(self) -> None:
        """Save the index to disk at self.hnsw_path.

        Raises:
            StorageError: If the save operation fails
        """
        with self._lock:
            try:
                if self._index is None or self._current_count == 0:
                    logger.debug("Skipping save: index is empty")
                    return  # Nothing to save

                # Ensure parent directory exists
                self.hnsw_path.parent.mkdir(parents=True, exist_ok=True)

                # Save index
                self._index.save_index(str(self.hnsw_path))

                logger.debug(
                    f"Saved index to {self.hnsw_path} "
                    f"({self._current_count} vectors, capacity: {self._max_elements})"
                )

            except Exception as e:
                raise StorageError(f"Failed to save ANN index: {e}") from e

    def load(self) -> bool:
        """Load the index from disk.

        Returns:
            True if the index was loaded successfully, False if the index
            file doesn't exist

        Raises:
            StorageError: If the load operation fails
        """
        with self._lock:
            try:
                if not self.hnsw_path.exists():
                    logger.debug(f"Index file not found: {self.hnsw_path}")
                    return False  # Missing file is not an error

                # Create a fresh index object for loading (don't call init_index first)
                self._index = hnswlib.Index(space=self.space, dim=self.dim)

                # Load the index from disk. max_elements here is only the
                # initial allocation; the index can expand later.
                self._index.load_index(str(self.hnsw_path), max_elements=self._max_elements)

                # Update count and capacity from the loaded index
                self._current_count = self._index.get_current_count()
                self._max_elements = self._index.get_max_elements()

                # Set query-time ef parameter
                self._index.set_ef(self.ef)

                logger.info(
                    f"Loaded index from {self.hnsw_path} "
                    f"({self._current_count} vectors, capacity: {self._max_elements})"
                )

                return True

            except Exception as e:
                raise StorageError(f"Failed to load ANN index: {e}") from e

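    # Persistence sketch for save()/load() above (hypothetical paths):
    #
    #     idx = ANNIndex(Path("/tmp/demo.db"), dim=768)
    #     idx.add_vectors(ids, vecs)
    #     idx.save()                                   # writes /tmp/demo_vectors.hnsw
    #
    #     idx2 = ANNIndex(Path("/tmp/demo.db"), dim=768)
    #     assert idx2.load() and idx2.count() == idx.count()
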
    def count(self) -> int:
        """Get the number of vectors in the index.

        Returns:
            Number of vectors currently in the index
        """
        with self._lock:
            return self._current_count

    @property
    def capacity(self) -> int:
        """Get the current maximum capacity of the index.

        Returns:
            Maximum number of vectors the index can hold before expansion
        """
        with self._lock:
            return self._max_elements

    @property
    def usage_ratio(self) -> float:
        """Get the current usage ratio (count / capacity).

        Returns:
            Usage ratio between 0.0 and 1.0
        """
        with self._lock:
            if self._max_elements == 0:
                return 0.0
            return self._current_count / self._max_elements

    @property
    def is_loaded(self) -> bool:
        """Check whether the index is loaded and ready for use.

        Returns:
            True if the index is loaded, False otherwise
        """
        with self._lock:
            return self._index is not None and self._current_count > 0


class BinaryANNIndex:
    """Binary vector ANN index using Hamming distance for fast coarse retrieval.

    .. deprecated::
        This class is deprecated. Use :class:`codexlens.search.binary_searcher.BinarySearcher`
        instead, which provides faster memory-mapped search with centralized storage.

    Optimized for binary vectors (256-bit / 32 bytes per vector).
    Uses a packed binary representation for memory efficiency.

    Performance characteristics:
    - Storage: 32 bytes per vector (vs ~8KB for 2048-dim float32 vectors)
    - Distance: Hamming distance via XOR + popcount (CPU-efficient)
    - Search: O(N) brute-force scan with vectorized distance computation

    Index parameters:
    - dim: Binary vector dimension (default: 256)
    - packed_dim: Packed size in bytes (dim / 8 = 32 for 256-bit)

    Usage:
        index = BinaryANNIndex(index_path, dim=256)
        index.add_vectors([1, 2, 3], packed_vectors)  # List of 32-byte packed vectors
        ids, distances = index.search(query_packed, top_k=10)
    """

    DEFAULT_DIM = 256  # Default binary vector dimension

    def __init__(
        self,
        index_path: Path,
        dim: int = 256,
        initial_capacity: int = 100000,
        auto_save: bool = False,
    ) -> None:
        """Initialize the binary ANN index.

        Args:
            index_path: Path to the database (the index is saved alongside it
                as <db_stem>_binary_vectors.bin)
            dim: Dimension of binary vectors (default: 256)
            initial_capacity: Initial capacity hint (default: 100000)
            auto_save: Whether to automatically save the index after mutating
                operations

        Raises:
            ImportError: If required dependencies are not available
            ValueError: If the dimension is invalid
        """
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )

        import warnings

        warnings.warn(
            "BinaryANNIndex is deprecated. Use codexlens.search.binary_searcher.BinarySearcher "
            "instead for faster memory-mapped search with centralized storage.",
            DeprecationWarning,
            stacklevel=2,
        )

        if dim <= 0 or dim % 8 != 0:
            raise ValueError(
                f"Invalid dimension: {dim}. Must be positive and divisible by 8."
            )

        self.index_path = Path(index_path)
        self.dim = dim
        self.packed_dim = dim // 8  # 32 bytes for 256-bit vectors

        # Derive the binary index path from the database path
        db_stem = self.index_path.stem
        self.binary_path = self.index_path.parent / f"{db_stem}_binary_vectors.bin"

        # Memory management
        self._auto_save = auto_save
        self._initial_capacity = initial_capacity

        # Thread safety
        self._lock = threading.RLock()

        # In-memory storage: id -> packed binary vector
        self._vectors: dict[int, bytes] = {}
        self._id_list: list[int] = []  # Ordered list for efficient iteration

        # Cached numpy arrays for vectorized search (invalidated on add/remove)
        self._vectors_matrix: Optional[np.ndarray] = None
        self._ids_array: Optional[np.ndarray] = None
        self._cache_valid: bool = False

        logger.info(
            f"Initialized BinaryANNIndex with dim={dim}, packed_dim={self.packed_dim}"
        )

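    # Packing sketch: one 256-bit binary vector becomes 32 packed bytes
    # (illustrative comment; the bit values are made up):
    #
    #     bits = np.random.randint(0, 2, size=256).astype(np.uint8)  # 0/1 values
    #     packed = np.packbits(bits).tobytes()                       # len(packed) == 32
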
    def add_vectors(self, ids: List[int], vectors: List[bytes]) -> None:
        """Add packed binary vectors to the index.

        Args:
            ids: List of vector IDs (must be unique)
            vectors: List of packed binary vectors (each packed_dim bytes)

        Raises:
            ValueError: If shapes don't match or vectors are invalid
            StorageError: If the index operation fails
        """
        if len(ids) == 0:
            return

        if len(vectors) != len(ids):
            raise ValueError(
                f"Number of vectors ({len(vectors)}) must match number of IDs ({len(ids)})"
            )

        # Validate vector sizes
        for i, vec in enumerate(vectors):
            if len(vec) != self.packed_dim:
                raise ValueError(
                    f"Vector {i} has size {len(vec)}, expected {self.packed_dim}"
                )

        with self._lock:
            try:
                for vec_id, vec in zip(ids, vectors):
                    if vec_id not in self._vectors:
                        self._id_list.append(vec_id)
                    self._vectors[vec_id] = vec

                # Invalidate cache on modification
                self._cache_valid = False

                logger.debug(
                    f"Added {len(ids)} binary vectors to index (total: {len(self._vectors)})"
                )

                if self._auto_save:
                    self.save()

            except Exception as e:
                raise StorageError(f"Failed to add vectors to Binary ANN index: {e}") from e

    def add_vectors_numpy(self, ids: List[int], vectors: np.ndarray) -> None:
        """Add unpacked binary vectors (0/1 values) to the index.

        Convenience method that packs the vectors before adding.

        Args:
            ids: List of vector IDs (must be unique)
            vectors: Numpy array of shape (N, dim) with binary values (0 or 1)

        Raises:
            ValueError: If shapes don't match
            StorageError: If the index operation fails
        """
        if len(ids) == 0:
            return

        if vectors.shape[0] != len(ids):
            raise ValueError(
                f"Number of vectors ({vectors.shape[0]}) must match number of IDs ({len(ids)})"
            )

        if vectors.shape[1] != self.dim:
            raise ValueError(
                f"Vector dimension ({vectors.shape[1]}) must match index dimension ({self.dim})"
            )

        # Pack each row of 0/1 values into packed_dim bytes
        packed_vectors = []
        for i in range(vectors.shape[0]):
            packed = np.packbits(vectors[i].astype(np.uint8)).tobytes()
            packed_vectors.append(packed)

        self.add_vectors(ids, packed_vectors)

    def remove_vectors(self, ids: List[int]) -> None:
        """Remove vectors from the index.

        Args:
            ids: List of vector IDs to remove

        Raises:
            StorageError: If the index operation fails

        Note:
            Optimized for batch deletion: the ID list is rebuilt once with a
            set-based filter instead of an O(N) list.remove() call per ID.
        """
        if len(ids) == 0:
            return

        with self._lock:
            try:
                # Use a set for O(1) membership checks during filtering
                ids_to_remove = set(ids)
                removed_count = 0

                # Remove from the dictionary - O(1) per deletion
                for vec_id in ids_to_remove:
                    if vec_id in self._vectors:
                        del self._vectors[vec_id]
                        removed_count += 1

                # Rebuild the ID list once - O(N) total instead of O(N) per removal
                if removed_count > 0:
                    self._id_list = [id_ for id_ in self._id_list if id_ not in ids_to_remove]
                    # Invalidate cache on modification
                    self._cache_valid = False

                logger.debug(f"Removed {removed_count}/{len(ids)} vectors from index")

                if self._auto_save and removed_count > 0:
                    self.save()

            except Exception as e:
                raise StorageError(
                    f"Failed to remove vectors from Binary ANN index: {e}"
                ) from e

    def _build_cache(self) -> None:
        """Build the numpy cache used for vectorized search.

        Pre-computes a contiguous numpy array of all packed vectors for
        efficient batch distance computation. Called lazily on the first
        search after a modification.
        """
        if self._cache_valid:
            return

        n_vectors = len(self._id_list)
        if n_vectors == 0:
            self._vectors_matrix = None
            self._ids_array = None
            self._cache_valid = True
            return

        # Build a contiguous numpy array of all packed vectors,
        # shape (n_vectors, packed_dim), dtype uint8
        self._vectors_matrix = np.empty((n_vectors, self.packed_dim), dtype=np.uint8)
        self._ids_array = np.array(self._id_list, dtype=np.int64)

        for i, vec_id in enumerate(self._id_list):
            vec_bytes = self._vectors[vec_id]
            self._vectors_matrix[i] = np.frombuffer(vec_bytes, dtype=np.uint8)

        self._cache_valid = True
        logger.debug(f"Built vectorized cache for {n_vectors} binary vectors")

    def search(
        self, query: bytes, top_k: int = 10
    ) -> Tuple[List[int], List[int]]:
        """Search for nearest neighbors using Hamming distance.

        Uses vectorized batch computation for the O(N) scan, with numpy
        arrays pre-computed and cached for efficient repeated queries.

        Args:
            query: Packed binary query vector (packed_dim bytes)
            top_k: Number of nearest neighbors to return

        Returns:
            Tuple of (ids, distances) where:
            - ids: List of vector IDs ordered by Hamming distance (ascending)
            - distances: List of Hamming distances (lower = more similar)

        Raises:
            ValueError: If the query size is invalid
            StorageError: If the search operation fails
        """
        if len(query) != self.packed_dim:
            raise ValueError(
                f"Query size ({len(query)}) must match packed_dim ({self.packed_dim})"
            )

        with self._lock:
            try:
                if len(self._vectors) == 0:
                    return [], []

                # Build the cache if needed (lazy initialization)
                self._build_cache()

                if self._vectors_matrix is None or self._ids_array is None:
                    return [], []

                # Vectorized Hamming distance computation:
                # 1. Convert the query to a numpy array
                query_arr = np.frombuffer(query, dtype=np.uint8)

                # 2. Broadcast XOR: (packed_dim,) against (n_vectors, packed_dim)
                #    -> differing bits, shape (n_vectors, packed_dim)
                xor_result = np.bitwise_xor(query_arr, self._vectors_matrix)

                # 3. Vectorized popcount via a 256-entry lookup table
                #    (np.unpackbits is slow for large arrays)
                popcount_lut = np.array(
                    [bin(i).count("1") for i in range(256)], dtype=np.uint8
                )
                bit_counts = popcount_lut[xor_result]

                # 4. Sum across packed bytes -> Hamming distance per vector
                distances = bit_counts.sum(axis=1)

                # 5. Top-k via argpartition (O(N) instead of O(N log N) full sort)
                n_vectors = len(distances)
                k = min(top_k, n_vectors)

                if k == n_vectors:
                    # No partitioning needed; sort everything
                    sorted_indices = np.argsort(distances)
                else:
                    # Partial sort: partition out the k smallest, then sort only those
                    partition_indices = np.argpartition(distances, k)[:k]
                    top_k_distances = distances[partition_indices]
                    sorted_order = np.argsort(top_k_distances)
                    sorted_indices = partition_indices[sorted_order]

                # 6. Return results
                result_ids = self._ids_array[sorted_indices].tolist()
                result_dists = distances[sorted_indices].tolist()

                return result_ids, result_dists

            except Exception as e:
                raise StorageError(f"Failed to search Binary ANN index: {e}") from e

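    # Worked example of the XOR + popcount step above (one byte shown):
    #
    #     query byte   0b1100_1010
    #     stored byte  0b1000_1110
    #     XOR          0b0100_0100  -> popcount_lut[0b0100_0100] == 2
    #
    # The two vectors differ in exactly 2 of these 8 bit positions; summing
    # the per-byte popcounts over all 32 packed bytes yields the full 256-bit
    # Hamming distance.
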
    def search_numpy(
        self, query: np.ndarray, top_k: int = 10
    ) -> Tuple[List[int], List[int]]:
        """Search with an unpacked binary query vector.

        Convenience method that packs the query before searching.

        Args:
            query: Binary query vector of shape (dim,) with values 0 or 1
            top_k: Number of nearest neighbors to return

        Returns:
            Tuple of (ids, distances)

        Raises:
            ValueError: If the query dimension is invalid
        """
        if query.ndim == 2:
            query = query.flatten()

        if len(query) != self.dim:
            raise ValueError(
                f"Query dimension ({len(query)}) must match index dimension ({self.dim})"
            )

        packed_query = np.packbits(query.astype(np.uint8)).tobytes()
        return self.search(packed_query, top_k)

    def search_batch(
        self, queries: List[bytes], top_k: int = 10
    ) -> List[Tuple[List[int], List[int]]]:
        """Batch search for multiple queries.

        Args:
            queries: List of packed binary query vectors
            top_k: Number of nearest neighbors to return per query

        Returns:
            List of (ids, distances) tuples, one per query
        """
        results = []
        for query in queries:
            ids, dists = self.search(query, top_k)
            results.append((ids, dists))
        return results

    def save(self) -> None:
        """Save the index to disk.

        Binary format:
        - 4 bytes: magic number (0x42494E56 = "BINV")
        - 4 bytes: version (1)
        - 4 bytes: dim
        - 4 bytes: packed_dim
        - 4 bytes: num_vectors
        - For each vector:
          - 4 bytes: id
          - packed_dim bytes: vector data

        Raises:
            StorageError: If the save operation fails
        """
        with self._lock:
            try:
                if len(self._vectors) == 0:
                    logger.debug("Skipping save: index is empty")
                    return

                # Ensure parent directory exists
                self.binary_path.parent.mkdir(parents=True, exist_ok=True)

                with open(self.binary_path, "wb") as f:
                    # Header
                    f.write(b"BINV")  # Magic number
                    f.write(np.array([1], dtype=np.uint32).tobytes())  # Version
                    f.write(np.array([self.dim], dtype=np.uint32).tobytes())
                    f.write(np.array([self.packed_dim], dtype=np.uint32).tobytes())
                    f.write(np.array([len(self._vectors)], dtype=np.uint32).tobytes())

                    # Vectors
                    for vec_id in self._id_list:
                        f.write(np.array([vec_id], dtype=np.uint32).tobytes())
                        f.write(self._vectors[vec_id])

                logger.debug(
                    f"Saved binary index to {self.binary_path} "
                    f"({len(self._vectors)} vectors)"
                )

            except Exception as e:
                raise StorageError(f"Failed to save Binary ANN index: {e}") from e

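    # Header-parsing sketch for the on-disk format documented above, using
    # only the stdlib struct module. This is a hedged illustration: the
    # "<4sIIII" layout assumes the little-endian byte order that
    # np.uint32.tobytes() produces on typical x86/ARM platforms.
    #
    #     import struct
    #     with open(binary_path, "rb") as f:
    #         magic, version, dim, packed_dim, n = struct.unpack("<4sIIII", f.read(20))
    #         assert magic == b"BINV" and version == 1
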
    def load(self) -> bool:
        """Load the index from disk.

        Returns:
            True if the index was loaded successfully, False if the index
            file doesn't exist

        Raises:
            StorageError: If the load operation fails
        """
        with self._lock:
            try:
                if not self.binary_path.exists():
                    logger.debug(f"Binary index file not found: {self.binary_path}")
                    return False

                with open(self.binary_path, "rb") as f:
                    # Read header
                    magic = f.read(4)
                    if magic != b"BINV":
                        raise StorageError(
                            "Invalid binary index file: bad magic number"
                        )

                    version = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                    if version != 1:
                        raise StorageError(
                            f"Unsupported binary index version: {version}"
                        )

                    file_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                    file_packed_dim = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                    num_vectors = np.frombuffer(f.read(4), dtype=np.uint32)[0]

                    if file_dim != self.dim or file_packed_dim != self.packed_dim:
                        raise StorageError(
                            f"Dimension mismatch: file has dim={file_dim}, "
                            f"packed_dim={file_packed_dim}, "
                            f"expected dim={self.dim}, packed_dim={self.packed_dim}"
                        )

                    # Clear existing data
                    self._vectors.clear()
                    self._id_list.clear()
                    self._cache_valid = False

                    # Read vectors
                    for _ in range(num_vectors):
                        vec_id = np.frombuffer(f.read(4), dtype=np.uint32)[0]
                        vec_data = f.read(self.packed_dim)
                        self._vectors[int(vec_id)] = vec_data
                        self._id_list.append(int(vec_id))

                logger.info(
                    f"Loaded binary index from {self.binary_path} "
                    f"({len(self._vectors)} vectors)"
                )

                return True

            except StorageError:
                raise
            except Exception as e:
                raise StorageError(f"Failed to load Binary ANN index: {e}") from e

    def count(self) -> int:
        """Get the number of vectors in the index.

        Returns:
            Number of vectors currently in the index
        """
        with self._lock:
            return len(self._vectors)

    @property
    def is_loaded(self) -> bool:
        """Check whether the index has vectors.

        Returns:
            True if the index has vectors, False otherwise
        """
        with self._lock:
            return len(self._vectors) > 0

    def get_vector(self, vec_id: int) -> Optional[bytes]:
        """Get a specific vector by ID.

        Args:
            vec_id: Vector ID to retrieve

        Returns:
            Packed binary vector, or None if not found
        """
        with self._lock:
            return self._vectors.get(vec_id)

    def clear(self) -> None:
        """Clear all vectors from the index."""
        with self._lock:
            self._vectors.clear()
            self._id_list.clear()
            self._vectors_matrix = None
            self._ids_array = None
            self._cache_valid = False
            logger.debug("Cleared binary index")


def create_ann_index(
    index_path: Path,
    index_type: str = "hnsw",
    dim: int = 2048,
    **kwargs,
) -> ANNIndex | BinaryANNIndex:
    """Factory function to create an ANN index.

    Args:
        index_path: Path to the database file
        index_type: Type of index - "hnsw" for dense vectors, "binary" for binary vectors
        dim: Vector dimension (default: 2048 for dense, 256 for binary)
        **kwargs: Additional arguments passed to the index constructor

    Returns:
        ANNIndex for dense vectors or BinaryANNIndex for binary vectors

    Raises:
        ValueError: If index_type is invalid

    Example:
        >>> # Dense vector index (HNSW)
        >>> dense_index = create_ann_index(path, index_type="hnsw", dim=2048)
        >>> dense_index.add_vectors(ids, dense_vectors)
        >>>
        >>> # Binary vector index (Hamming distance)
        >>> binary_index = create_ann_index(path, index_type="binary", dim=256)
        >>> binary_index.add_vectors(ids, packed_vectors)
    """
    index_type = index_type.lower()

    if index_type == "hnsw":
        return ANNIndex(index_path=index_path, dim=dim, **kwargs)
    elif index_type == "binary":
        # Default to 256 for binary if the dense default was left unchanged
        if dim == 2048:
            dim = 256
        return BinaryANNIndex(index_path=index_path, dim=dim, **kwargs)
    else:
        raise ValueError(
            f"Invalid index_type: {index_type}. Must be 'hnsw' or 'binary'."
        )
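

# Minimal end-to-end sketch of the factory above. This block is an
# illustration, not part of the library API: it assumes the optional hnswlib
# and numpy dependencies are installed, and it writes a throwaway index under
# a temporary directory.
if __name__ == "__main__":
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        db_path = Path(tmp) / "demo.db"
        index = create_ann_index(db_path, index_type="hnsw", dim=64)

        # 100 random float32 vectors with IDs 0..99
        rng = np.random.default_rng(0)
        vectors = rng.random((100, 64), dtype=np.float32)
        index.add_vectors(list(range(100)), vectors)
        index.save()  # writes {tmp}/demo_vectors.hnsw

        # Querying with a stored vector should return its own ID first,
        # at cosine distance ~0
        ids, dists = index.search(vectors[0], top_k=3)
        print(f"nearest to vector 0: ids={ids}, distances={dists}")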