Claude-Code-Workflow/codex-lens/src/codexlens/semantic/vector_store.py
catlog22 3e9a309079 refactor: remove graph indexing, fix memory leaks, optimize embedding generation
Main changes:

1. Remove graph indexing
   - Delete graph_analyzer.py and its related migration files
   - Remove the CLI graph command and the --enrich flag
   - Remove the graph query methods from chain_search.py (370 lines)
   - Delete the related test files

2. Fix memory usage in embedding generation
   - Refactor generate_embeddings.py to use streaming batch processing
   - Switch to embedding_manager's memory-safe implementation
   - File trimmed from 548 to 259 lines (52.7% reduction)

3. Fix memory leaks (see the sketch below this commit message)
   - chain_search.py: quick_search manages ChainSearchEngine with a `with` statement
   - embedding_manager.py: manage VectorStore with a `with` statement
   - vector_store.py: add a memory warning for brute-force search

4. Code cleanup
   - Remove the token_count and symbol_type fields from the Symbol model
   - Clean up the related test cases

Tests: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 16:22:03 +08:00
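
The memory fixes in items 2 and 3 come down to two patterns that the vector_store.py source below supports directly: write embeddings in bounded batches rather than accumulating them all, and scope each VectorStore to a `with` block so close() releases the cached embedding matrix and the ANN index. A minimal sketch of that usage, assuming a hypothetical batch source (the store API itself is taken from the file below):

    from codexlens.semantic.vector_store import VectorStore

    def index_batches(db_path, chunk_batches):
        """chunk_batches yields small lists of (SemanticChunk, file_path) tuples."""
        total = 0
        # Only one batch of embeddings is held in memory at a time.
        with VectorStore(db_path) as store:  # close() runs on exit
            for batch in chunk_batches:
                total += len(store.add_chunks_batch(batch))
        return total

    # Queries follow the same pattern:
    #     with VectorStore(db_path) as store:
    #         results = store.search_similar(query_embedding, top_k=10, min_score=0.3)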

778 lines
26 KiB
Python

"""Vector storage and similarity search for semantic chunks.
Optimized for high-performance similarity search using:
- HNSW index for O(log N) approximate nearest neighbor search (primary)
- Cached embedding matrix for batch operations (fallback)
- NumPy vectorized cosine similarity (fallback, 100x+ faster than loops)
- Lazy content loading (only fetch for top-k results)
"""
from __future__ import annotations
import json
import logging
import sqlite3
import threading
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from codexlens.entities import SearchResult, SemanticChunk
from codexlens.errors import StorageError
from . import SEMANTIC_AVAILABLE
if SEMANTIC_AVAILABLE:
    import numpy as np

# Try to import ANN index (optional hnswlib dependency)
try:
    from codexlens.semantic.ann_index import ANNIndex, HNSWLIB_AVAILABLE
except ImportError:
    HNSWLIB_AVAILABLE = False
    ANNIndex = None

logger = logging.getLogger(__name__)

def _cosine_similarity(a: List[float], b: List[float]) -> float:
    """Compute cosine similarity between two vectors."""
    if not SEMANTIC_AVAILABLE:
        raise ImportError("numpy required for vector operations")
    a_arr = np.array(a)
    b_arr = np.array(b)
    norm_a = np.linalg.norm(a_arr)
    norm_b = np.linalg.norm(b_arr)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a_arr, b_arr) / (norm_a * norm_b))

class VectorStore:
    """SQLite-based vector storage with HNSW-accelerated similarity search.

    Performance optimizations:
    - HNSW index for O(log N) approximate nearest neighbor search
    - Embedding matrix cached in memory for batch similarity computation (fallback)
    - NumPy vectorized operations instead of Python loops (fallback)
    - Lazy content loading - only fetch full content for top-k results
    - Thread-safe cache invalidation
    """

    # Default embedding dimension (used when creating new index)
    DEFAULT_DIM = 768

    def __init__(self, db_path: str | Path) -> None:
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
                "Semantic search dependencies not available. "
                "Install with: pip install codexlens[semantic]"
            )
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        # Embedding cache for fast similarity search (fallback)
        self._cache_lock = threading.RLock()
        self._embedding_matrix: Optional[np.ndarray] = None
        self._embedding_norms: Optional[np.ndarray] = None
        self._chunk_ids: Optional[List[int]] = None
        self._cache_version: int = 0

        # ANN index for O(log N) search
        self._ann_index: Optional[ANNIndex] = None
        self._ann_dim: Optional[int] = None
        self._ann_write_lock = threading.Lock()  # Protects ANN index modifications

        self._init_schema()
        self._init_ann_index()

    def _init_schema(self) -> None:
        """Initialize vector storage schema."""
        with sqlite3.connect(self.db_path) as conn:
            # Enable memory mapping for faster reads
            conn.execute("PRAGMA mmap_size = 30000000000")  # 30GB limit
            conn.execute("""
                CREATE TABLE IF NOT EXISTS semantic_chunks (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file_path TEXT NOT NULL,
                    content TEXT NOT NULL,
                    embedding BLOB NOT NULL,
                    metadata TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_chunks_file
                ON semantic_chunks(file_path)
            """)
            conn.commit()

    def _init_ann_index(self) -> None:
        """Initialize ANN index (lazy loading from existing data)."""
        if not HNSWLIB_AVAILABLE:
            logger.debug("hnswlib not available, using brute-force search")
            return

        # Try to detect embedding dimension from existing data
        dim = self._detect_embedding_dim()
        if dim is None:
            # No data yet, will initialize on first add
            logger.debug("No embeddings found, ANN index will be created on first add")
            return

        self._ann_dim = dim
        try:
            self._ann_index = ANNIndex(self.db_path, dim)
            if self._ann_index.load():
                logger.debug(
                    "Loaded ANN index with %d vectors", self._ann_index.count()
                )
            else:
                # Index file doesn't exist, try to build from SQLite data
                logger.debug("ANN index file not found, rebuilding from SQLite")
                self._rebuild_ann_index_internal()
        except Exception as e:
            logger.warning("Failed to initialize ANN index: %s", e)
            self._ann_index = None

    def _detect_embedding_dim(self) -> Optional[int]:
        """Detect embedding dimension from existing data."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute(
                "SELECT embedding FROM semantic_chunks LIMIT 1"
            ).fetchone()
            if row and row[0]:
                # Embeddings are stored as float32 blobs, so the dimension is the
                # byte length divided by 4 (a 768-dim vector occupies 3072 bytes).
                blob = row[0]
                return len(blob) // np.dtype(np.float32).itemsize
            return None

    @property
    def dimension(self) -> Optional[int]:
        """Return the dimension of embeddings in the store.

        Returns:
            Embedding dimension if available, None if store is empty.
        """
        if self._ann_dim is not None:
            return self._ann_dim
        self._ann_dim = self._detect_embedding_dim()
        return self._ann_dim

    def _rebuild_ann_index_internal(self) -> int:
        """Internal method to rebuild ANN index from SQLite data."""
        if self._ann_index is None:
            return 0

        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA mmap_size = 30000000000")
            rows = conn.execute(
                "SELECT id, embedding FROM semantic_chunks"
            ).fetchall()
        if not rows:
            return 0

        # Extract IDs and embeddings
        ids = [r[0] for r in rows]
        embeddings = np.vstack([
            np.frombuffer(r[1], dtype=np.float32) for r in rows
        ])

        # Add to ANN index
        self._ann_index.add_vectors(ids, embeddings)
        self._ann_index.save()
        logger.info("Rebuilt ANN index with %d vectors", len(ids))
        return len(ids)

    def rebuild_ann_index(self) -> int:
        """Rebuild HNSW index from all chunks in SQLite.

        Use this method to:
        - Migrate existing data to use ANN search
        - Repair corrupted index
        - Reclaim space after many deletions

        Returns:
            Number of vectors indexed.
        """
        if not HNSWLIB_AVAILABLE:
            logger.warning("hnswlib not available, cannot rebuild ANN index")
            return 0

        # Detect dimension
        dim = self._detect_embedding_dim()
        if dim is None:
            logger.warning("No embeddings found, cannot rebuild ANN index")
            return 0
        self._ann_dim = dim

        # Create new index
        try:
            self._ann_index = ANNIndex(self.db_path, dim)
            return self._rebuild_ann_index_internal()
        except Exception as e:
            logger.error("Failed to rebuild ANN index: %s", e)
            self._ann_index = None
            return 0

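
    # Illustrative usage sketch (the path is hypothetical, not from the repository):
    # a one-off rebuild migrates a store that was populated before hnswlib was
    # installed, or repairs a corrupted index file.
    #
    #     with VectorStore("index/semantic_vectors.db") as store:
    #         indexed = store.rebuild_ann_index()
    #         logger.info("ANN index rebuilt with %d vectors", indexed)
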
    def _invalidate_cache(self) -> None:
        """Invalidate the embedding cache (thread-safe)."""
        with self._cache_lock:
            self._embedding_matrix = None
            self._embedding_norms = None
            self._chunk_ids = None
            self._cache_version += 1

    def _refresh_cache(self) -> bool:
        """Load embeddings into numpy matrix for fast similarity search.

        Returns:
            True if cache was refreshed successfully, False if no data.
        """
        with self._cache_lock:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("PRAGMA mmap_size = 30000000000")
                rows = conn.execute(
                    "SELECT id, embedding FROM semantic_chunks"
                ).fetchall()

            if not rows:
                self._embedding_matrix = None
                self._embedding_norms = None
                self._chunk_ids = None
                return False

            # Extract IDs and embeddings
            self._chunk_ids = [r[0] for r in rows]

            # Bulk convert binary blobs to numpy matrix
            embeddings = [
                np.frombuffer(r[1], dtype=np.float32) for r in rows
            ]
            self._embedding_matrix = np.vstack(embeddings)

            # Pre-compute norms for faster similarity calculation
            self._embedding_norms = np.linalg.norm(
                self._embedding_matrix, axis=1, keepdims=True
            )
            # Avoid division by zero
            self._embedding_norms = np.where(
                self._embedding_norms == 0, 1e-10, self._embedding_norms
            )
            return True

    def _ensure_ann_index(self, dim: int) -> bool:
        """Ensure ANN index is initialized with correct dimension.

        This method is thread-safe and uses double-checked locking.

        Args:
            dim: Embedding dimension

        Returns:
            True if ANN index is ready, False otherwise
        """
        if not HNSWLIB_AVAILABLE:
            return False

        # Fast path: index already initialized (no lock needed)
        if self._ann_index is not None:
            return True

        # Slow path: acquire lock for initialization
        with self._ann_write_lock:
            # Double-check after acquiring lock
            if self._ann_index is not None:
                return True
            try:
                self._ann_dim = dim
                self._ann_index = ANNIndex(self.db_path, dim)
                self._ann_index.load()  # Try to load existing
                return True
            except Exception as e:
                logger.warning("Failed to initialize ANN index: %s", e)
                self._ann_index = None
                return False

    def add_chunk(self, chunk: SemanticChunk, file_path: str) -> int:
        """Add a single chunk with its embedding.

        Returns:
            The inserted chunk ID.
        """
        if chunk.embedding is None:
            raise ValueError("Chunk must have embedding before adding to store")

        embedding_arr = np.array(chunk.embedding, dtype=np.float32)
        embedding_blob = embedding_arr.tobytes()
        metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                (file_path, chunk.content, embedding_blob, metadata_json)
            )
            conn.commit()
            chunk_id = cursor.lastrowid or 0

        # Add to ANN index
        if self._ensure_ann_index(len(chunk.embedding)):
            with self._ann_write_lock:
                try:
                    self._ann_index.add_vectors([chunk_id], embedding_arr.reshape(1, -1))
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return chunk_id

    def add_chunks(self, chunks: List[SemanticChunk], file_path: str) -> List[int]:
        """Add multiple chunks with embeddings (batch insert).

        Returns:
            List of inserted chunk IDs.
        """
        if not chunks:
            return []

        # Prepare batch data
        batch_data = []
        embeddings_list = []
        for chunk in chunks:
            if chunk.embedding is None:
                raise ValueError("All chunks must have embeddings")
            embedding_arr = np.array(chunk.embedding, dtype=np.float32)
            embedding_blob = embedding_arr.tobytes()
            metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
            batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
            embeddings_list.append(embedding_arr)

        # Batch insert to SQLite
        with sqlite3.connect(self.db_path) as conn:
            # Get starting ID before insert
            row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
            start_id = (row[0] or 0) + 1
            conn.executemany(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                batch_data
            )
            conn.commit()

        # Calculate inserted IDs based on starting ID. Note: AUTOINCREMENT never
        # reuses ids, so after deletions the actual ids can start above
        # MAX(id) + 1; rebuild_ann_index() restores a consistent ANN mapping.
        ids = list(range(start_id, start_id + len(chunks)))

        # Add to ANN index
        if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):
            with self._ann_write_lock:
                try:
                    embeddings_matrix = np.vstack(embeddings_list)
                    self._ann_index.add_vectors(ids, embeddings_matrix)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add batch to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return ids

    def add_chunks_batch(
        self, chunks_with_paths: List[Tuple[SemanticChunk, str]]
    ) -> List[int]:
        """Batch insert chunks from multiple files in a single transaction.

        This method is optimized for bulk operations during index generation.

        Args:
            chunks_with_paths: List of (chunk, file_path) tuples

        Returns:
            List of inserted chunk IDs
        """
        if not chunks_with_paths:
            return []

        # Prepare batch data
        batch_data = []
        embeddings_list = []
        for chunk, file_path in chunks_with_paths:
            if chunk.embedding is None:
                raise ValueError("All chunks must have embeddings")
            embedding_arr = np.array(chunk.embedding, dtype=np.float32)
            embedding_blob = embedding_arr.tobytes()
            metadata_json = json.dumps(chunk.metadata) if chunk.metadata else None
            batch_data.append((file_path, chunk.content, embedding_blob, metadata_json))
            embeddings_list.append(embedding_arr)

        # Batch insert to SQLite in single transaction
        with sqlite3.connect(self.db_path) as conn:
            # Get starting ID before insert
            row = conn.execute("SELECT MAX(id) FROM semantic_chunks").fetchone()
            start_id = (row[0] or 0) + 1
            conn.executemany(
                """
                INSERT INTO semantic_chunks (file_path, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
                """,
                batch_data
            )
            conn.commit()

        # Calculate inserted IDs based on starting ID
        ids = list(range(start_id, start_id + len(chunks_with_paths)))

        # Add to ANN index
        if embeddings_list and self._ensure_ann_index(len(embeddings_list[0])):
            with self._ann_write_lock:
                try:
                    embeddings_matrix = np.vstack(embeddings_list)
                    self._ann_index.add_vectors(ids, embeddings_matrix)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to add batch to ANN index: %s", e)

        # Invalidate cache after modification
        self._invalidate_cache()
        return ids

    def delete_file_chunks(self, file_path: str) -> int:
        """Delete all chunks for a file.

        Returns:
            Number of deleted chunks.
        """
        # Get chunk IDs before deletion (for ANN index)
        chunk_ids_to_delete = []
        if self._ann_index is not None:
            with sqlite3.connect(self.db_path) as conn:
                rows = conn.execute(
                    "SELECT id FROM semantic_chunks WHERE file_path = ?",
                    (file_path,)
                ).fetchall()
                chunk_ids_to_delete = [r[0] for r in rows]

        # Delete from SQLite
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "DELETE FROM semantic_chunks WHERE file_path = ?",
                (file_path,)
            )
            conn.commit()
            deleted = cursor.rowcount

        # Remove from ANN index
        if deleted > 0 and self._ann_index is not None and chunk_ids_to_delete:
            with self._ann_write_lock:
                try:
                    self._ann_index.remove_vectors(chunk_ids_to_delete)
                    self._ann_index.save()
                except Exception as e:
                    logger.warning("Failed to remove from ANN index: %s", e)

        if deleted > 0:
            self._invalidate_cache()
        return deleted

    def search_similar(
        self,
        query_embedding: List[float],
        top_k: int = 10,
        min_score: float = 0.0,
        return_full_content: bool = True,
    ) -> List[SearchResult]:
        """Find chunks most similar to query embedding.

        Uses HNSW index for O(log N) search when available, falls back to
        brute-force NumPy search otherwise.

        Args:
            query_embedding: Query vector.
            top_k: Maximum results to return.
            min_score: Minimum similarity score (0-1).
            return_full_content: If True, return full code block content.

        Returns:
            List of SearchResult ordered by similarity (highest first).
        """
        query_vec = np.array(query_embedding, dtype=np.float32)

        # Try HNSW search first (O(log N))
        if (
            HNSWLIB_AVAILABLE
            and self._ann_index is not None
            and self._ann_index.is_loaded
            and self._ann_index.count() > 0
        ):
            try:
                return self._search_with_ann(
                    query_vec, top_k, min_score, return_full_content
                )
            except Exception as e:
                logger.warning("ANN search failed, falling back to brute-force: %s", e)

        # Fallback to brute-force search (O(N))
        return self._search_brute_force(
            query_vec, top_k, min_score, return_full_content
        )

    def _search_with_ann(
        self,
        query_vec: np.ndarray,
        top_k: int,
        min_score: float,
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Search using HNSW index (O(log N)).

        Args:
            query_vec: Query vector as numpy array
            top_k: Maximum results to return
            min_score: Minimum similarity score (0-1)
            return_full_content: If True, return full code block content

        Returns:
            List of SearchResult ordered by similarity (highest first)
        """
        # Limit top_k to available vectors to prevent hnswlib error
        ann_count = self._ann_index.count()
        effective_top_k = min(top_k, ann_count) if ann_count > 0 else 0
        if effective_top_k == 0:
            return []

        # HNSW search returns (ids, distances)
        # For cosine space: distance = 1 - similarity
        ids, distances = self._ann_index.search(query_vec, effective_top_k)
        if not ids:
            return []

        # Convert distances to similarity scores
        scores = [1.0 - d for d in distances]

        # Filter by min_score
        filtered = [
            (chunk_id, score)
            for chunk_id, score in zip(ids, scores)
            if score >= min_score
        ]
        if not filtered:
            return []

        top_ids = [f[0] for f in filtered]
        top_scores = [f[1] for f in filtered]

        # Fetch content from SQLite
        return self._fetch_results_by_ids(top_ids, top_scores, return_full_content)

    def _search_brute_force(
        self,
        query_vec: np.ndarray,
        top_k: int,
        min_score: float,
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Brute-force search using NumPy (O(N) fallback).

        Args:
            query_vec: Query vector as numpy array
            top_k: Maximum results to return
            min_score: Minimum similarity score (0-1)
            return_full_content: If True, return full code block content

        Returns:
            List of SearchResult ordered by similarity (highest first)
        """
        logger.warning(
            "Using brute-force vector search (hnswlib not available or ANN search failed). "
            "This may cause high memory usage for large indexes. "
            "Install hnswlib for better performance: pip install hnswlib"
        )
        with self._cache_lock:
            # Refresh cache if needed
            if self._embedding_matrix is None:
                if not self._refresh_cache():
                    return []  # No data

            # Vectorized cosine similarity
            query_vec = query_vec.reshape(1, -1)
            query_norm = np.linalg.norm(query_vec)
            if query_norm == 0:
                return []

            # Compute all similarities at once: (N,) scores
            # similarity = (A @ B.T) / (||A|| * ||B||)
            dot_products = np.dot(self._embedding_matrix, query_vec.T).flatten()
            scores = dot_products / (self._embedding_norms.flatten() * query_norm)

            # Filter by min_score and get top-k indices
            valid_mask = scores >= min_score
            valid_indices = np.where(valid_mask)[0]
            if len(valid_indices) == 0:
                return []

            # Sort by score descending and take top_k
            valid_scores = scores[valid_indices]
            sorted_order = np.argsort(valid_scores)[::-1][:top_k]
            top_indices = valid_indices[sorted_order]
            top_scores = valid_scores[sorted_order]

            # Get chunk IDs for top results
            top_ids = [self._chunk_ids[i] for i in top_indices]

        # Fetch content only for top-k results (lazy loading)
        results = self._fetch_results_by_ids(
            top_ids, top_scores.tolist(), return_full_content
        )
        return results

    def _fetch_results_by_ids(
        self,
        chunk_ids: List[int],
        scores: List[float],
        return_full_content: bool,
    ) -> List[SearchResult]:
        """Fetch full result data for specific chunk IDs.

        Args:
            chunk_ids: List of chunk IDs to fetch.
            scores: Corresponding similarity scores.
            return_full_content: Whether to include full content.

        Returns:
            List of SearchResult objects.
        """
        if not chunk_ids:
            return []

        # Build parameterized query for IN clause
        placeholders = ",".join("?" * len(chunk_ids))
        query = f"""
            SELECT id, file_path, content, metadata
            FROM semantic_chunks
            WHERE id IN ({placeholders})
        """
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("PRAGMA mmap_size = 30000000000")
            rows = conn.execute(query, chunk_ids).fetchall()

        # Build ID -> row mapping
        id_to_row = {r[0]: r for r in rows}

        results = []
        for chunk_id, score in zip(chunk_ids, scores):
            row = id_to_row.get(chunk_id)
            if not row:
                continue
            _, file_path, content, metadata_json = row
            metadata = json.loads(metadata_json) if metadata_json else {}

            # Build excerpt (short preview)
            excerpt = content[:200] + "..." if len(content) > 200 else content

            # Extract symbol information from metadata
            symbol_name = metadata.get("symbol_name")
            symbol_kind = metadata.get("symbol_kind")
            start_line = metadata.get("start_line")
            end_line = metadata.get("end_line")

            # Build Symbol object if we have symbol info
            symbol = None
            if symbol_name and symbol_kind and start_line and end_line:
                try:
                    from codexlens.entities import Symbol
                    symbol = Symbol(
                        name=symbol_name,
                        kind=symbol_kind,
                        range=(start_line, end_line)
                    )
                except Exception:
                    pass

            results.append(SearchResult(
                path=file_path,
                score=score,
                excerpt=excerpt,
                content=content if return_full_content else None,
                symbol=symbol,
                metadata=metadata,
                start_line=start_line,
                end_line=end_line,
                symbol_name=symbol_name,
                symbol_kind=symbol_kind,
            ))
        return results

    def count_chunks(self) -> int:
        """Count total chunks in store."""
        with sqlite3.connect(self.db_path) as conn:
            row = conn.execute("SELECT COUNT(*) FROM semantic_chunks").fetchone()
            return row[0] if row else 0

    def clear_cache(self) -> None:
        """Manually clear the embedding cache."""
        self._invalidate_cache()

    @property
    def ann_available(self) -> bool:
        """Check if ANN index is available and ready."""
        return (
            HNSWLIB_AVAILABLE
            and self._ann_index is not None
            and self._ann_index.is_loaded
        )

    @property
    def ann_count(self) -> int:
        """Get number of vectors in ANN index."""
        if self._ann_index is not None:
            return self._ann_index.count()
        return 0

    def close(self) -> None:
        """Close the vector store and release resources.

        This ensures SQLite connections are closed and ANN index is cleared,
        allowing temporary files to be deleted on Windows.
        """
        with self._cache_lock:
            self._embedding_matrix = None
            self._embedding_norms = None
            self._chunk_ids = None
        with self._ann_write_lock:
            self._ann_index = None

    def __enter__(self) -> "VectorStore":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit - close resources."""
        self.close()