fix: 修复嵌入生成内存泄漏，优化性能

- HNSW 索引：预分配从 100 万降至 5 万，添加动态扩容和可控保存
- Embedder：添加 embed_to_numpy() 避免 .tolist() 转换，增强缓存清理
- embedding_manager：每 10 批次重建 embedder 实例，显式调用 gc.collect()
- VectorStore：添加 bulk_insert() 上下文管理器，支持 numpy 批量写入
- Chunker：添加 skip_token_count 轻量模式，使用 char/4 估算（~9x 加速）

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-21 19:08:17 +08:00 · 2025-12-21 19:15:47 +08:00
parent 45f92fe066
commit 5849f751bc
5 changed files with 420 additions and 34 deletions
--- a/codex-lens/src/codexlens/semantic/ann_index.py
+++ b/codex-lens/src/codexlens/semantic/ann_index.py
@@ -13,6 +13,7 @@ Key features:

 from __future__ import annotations

+import logging
 import threading
 from pathlib import Path
 from typing import List, Optional, Tuple
@@ -24,6 +25,8 @@ from . import SEMANTIC_AVAILABLE
 if SEMANTIC_AVAILABLE:
    import numpy as np

+logger = logging.getLogger(__name__)
+
 # Try to import hnswlib (optional dependency)
 try:
    import hnswlib
@@ -48,16 +51,26 @@ class ANNIndex:
    - ef: 50 (search width during query - higher = better recall)
    """

-    def __init__(self, index_path: Path, dim: int) -> None:
+    def __init__(
+        self,
+        index_path: Path,
+        dim: int,
+        initial_capacity: int = 50000,
+        auto_save: bool = False,
+        expansion_threshold: float = 0.8,
+    ) -> None:
        """Initialize ANN index.

        Args:
            index_path: Path to SQLite database (index will be saved as _vectors.hnsw)
            dim: Dimension of embedding vectors
+            initial_capacity: Initial maximum elements capacity (default: 50000)
+            auto_save: Whether to automatically save index after operations (default: False)
+            expansion_threshold: Capacity threshold to trigger auto-expansion (default: 0.8)

        Raises:
            ImportError: If required dependencies are not available
-            ValueError: If dimension is invalid
+            ValueError: If dimension or capacity is invalid
        """
        if not SEMANTIC_AVAILABLE:
            raise ImportError(
@@ -74,6 +87,14 @@ class ANNIndex:
        if dim <= 0:
            raise ValueError(f"Invalid dimension: {dim}")

+        if initial_capacity <= 0:
+            raise ValueError(f"Invalid initial capacity: {initial_capacity}")
+
+        if not 0.0 < expansion_threshold < 1.0:
+            raise ValueError(
+                f"Invalid expansion threshold: {expansion_threshold}. Must be between 0 and 1."
+            )
+
        self.index_path = Path(index_path)
        self.dim = dim

@@ -89,14 +110,23 @@ class ANNIndex:
        self.ef_construction = 200  # Build-time search width (higher = better quality)
        self.ef = 50  # Query-time search width (higher = better recall)

+        # Memory management parameters
+        self._auto_save = auto_save
+        self._expansion_threshold = expansion_threshold
+
        # Thread safety
        self._lock = threading.RLock()

        # HNSW index instance
        self._index: Optional[hnswlib.Index] = None
-        self._max_elements = 1000000  # Initial capacity (auto-resizes)
+        self._max_elements = initial_capacity  # Initial capacity (reduced from 1M to 50K)
        self._current_count = 0  # Track number of vectors

+        logger.info(
+            f"Initialized ANNIndex with capacity={initial_capacity}, "
+            f"auto_save={auto_save}, expansion_threshold={expansion_threshold}"
+        )
+
    def _ensure_index(self) -> None:
        """Ensure HNSW index is initialized (lazy initialization)."""
        if self._index is None:
@@ -108,6 +138,33 @@ class ANNIndex:
            )
            self._index.set_ef(self.ef)
            self._current_count = 0
+            logger.debug(f"Created new HNSW index with capacity {self._max_elements}")
+
+    def _auto_expand_if_needed(self, additional_count: int) -> None:
+        """Auto-expand index capacity if threshold is reached.
+
+        Args:
+            additional_count: Number of vectors to be added
+
+        Note:
+            This is called internally by add_vectors and is thread-safe.
+        """
+        usage_ratio = (self._current_count + additional_count) / self._max_elements
+
+        if usage_ratio >= self._expansion_threshold:
+            # Calculate new capacity (2x current or enough to fit new vectors)
+            new_capacity = max(
+                self._max_elements * 2,
+                self._current_count + additional_count,
+            )
+
+            logger.info(
+                f"Expanding index capacity: {self._max_elements} -> {new_capacity} "
+                f"(usage: {usage_ratio:.1%}, threshold: {self._expansion_threshold:.1%})"
+            )
+
+            self._index.resize_index(new_capacity)
+            self._max_elements = new_capacity

    def add_vectors(self, ids: List[int], vectors: np.ndarray) -> None:
        """Add vectors to the index.
@@ -137,14 +194,8 @@ class ANNIndex:
            try:
                self._ensure_index()

-                # Resize index if needed
-                if self._current_count + len(ids) > self._max_elements:
-                    new_max = max(
-                        self._max_elements * 2,
-                        self._current_count + len(ids)
-                    )
-                    self._index.resize_index(new_max)
-                    self._max_elements = new_max
+                # Auto-expand if threshold reached
+                self._auto_expand_if_needed(len(ids))

                # Ensure vectors are C-contiguous float32 (hnswlib requirement)
                if not vectors.flags['C_CONTIGUOUS'] or vectors.dtype != np.float32:
@@ -154,6 +205,15 @@ class ANNIndex:
                self._index.add_items(vectors, ids)
                self._current_count += len(ids)

+                logger.debug(
+                    f"Added {len(ids)} vectors to index "
+                    f"(total: {self._current_count}/{self._max_elements})"
+                )
+
+                # Auto-save if enabled
+                if self._auto_save:
+                    self.save()
+
            except Exception as e:
                raise StorageError(f"Failed to add vectors to ANN index: {e}")

@@ -178,13 +238,21 @@ class ANNIndex:
                    return  # Nothing to remove

                # Mark vectors as deleted
+                deleted_count = 0
                for vec_id in ids:
                    try:
                        self._index.mark_deleted(vec_id)
+                        deleted_count += 1
                    except RuntimeError:
                        # ID not found - ignore (idempotent deletion)
                        pass

+                logger.debug(f"Marked {deleted_count}/{len(ids)} vectors as deleted")
+
+                # Auto-save if enabled
+                if self._auto_save and deleted_count > 0:
+                    self.save()
+
            except Exception as e:
                raise StorageError(f"Failed to remove vectors from ANN index: {e}")

@@ -248,6 +316,7 @@ class ANNIndex:
        with self._lock:
            try:
                if self._index is None or self._current_count == 0:
+                    logger.debug("Skipping save: index is empty")
                    return  # Nothing to save

                # Ensure parent directory exists
@@ -256,6 +325,11 @@ class ANNIndex:
                # Save index
                self._index.save_index(str(self.hnsw_path))

+                logger.debug(
+                    f"Saved index to {self.hnsw_path} "
+                    f"({self._current_count} vectors, capacity: {self._max_elements})"
+                )
+
            except Exception as e:
                raise StorageError(f"Failed to save ANN index: {e}")

@@ -271,20 +345,28 @@ class ANNIndex:
        with self._lock:
            try:
                if not self.hnsw_path.exists():
+                    logger.debug(f"Index file not found: {self.hnsw_path}")
                    return False  # Index file doesn't exist (not an error)

                # Create fresh index object for loading (don't call init_index first)
                self._index = hnswlib.Index(space=self.space, dim=self.dim)

                # Load index from disk
+                # Note: max_elements here is just for initial allocation, can expand later
                self._index.load_index(str(self.hnsw_path), max_elements=self._max_elements)

-                # Update count from loaded index
+                # Update count and capacity from loaded index
                self._current_count = self._index.get_current_count()
+                self._max_elements = self._index.get_max_elements()

                # Set query-time ef parameter
                self._index.set_ef(self.ef)

+                logger.info(
+                    f"Loaded index from {self.hnsw_path} "
+                    f"({self._current_count} vectors, capacity: {self._max_elements})"
+                )
+
                return True

            except Exception as e:
@@ -299,6 +381,28 @@ class ANNIndex:
        with self._lock:
            return self._current_count

+    @property
+    def capacity(self) -> int:
+        """Get current maximum capacity of the index.
+
+        Returns:
+            Maximum number of vectors the index can hold before expansion
+        """
+        with self._lock:
+            return self._max_elements
+
+    @property
+    def usage_ratio(self) -> float:
+        """Get current usage ratio (count / capacity).
+
+        Returns:
+            Usage ratio between 0.0 and 1.0
+        """
+        with self._lock:
+            if self._max_elements == 0:
+                return 0.0
+            return self._current_count / self._max_elements
+
    @property
    def is_loaded(self) -> bool:
        """Check if index is loaded and ready for use.