mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-03-21 19:08:17 +08:00
fix: 修复嵌入生成内存泄漏,优化性能
- HNSW 索引:预分配从 100 万降至 5 万,添加动态扩容和可控保存 - Embedder:添加 embed_to_numpy() 避免 .tolist() 转换,增强缓存清理 - embedding_manager:每 10 批次重建 embedder 实例,显式 gc.collect() - VectorStore:添加 bulk_insert() 上下文管理器,支持 numpy 批量写入 - Chunker:添加 skip_token_count 轻量模式,使用 char/4 估算(~9x 加速) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -13,6 +13,7 @@ Key features:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
@@ -24,6 +25,8 @@ from . import SEMANTIC_AVAILABLE
|
||||
if SEMANTIC_AVAILABLE:
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import hnswlib (optional dependency)
|
||||
try:
|
||||
import hnswlib
|
||||
@@ -48,16 +51,26 @@ class ANNIndex:
|
||||
- ef: 50 (search width during query - higher = better recall)
|
||||
"""
|
||||
|
||||
def __init__(self, index_path: Path, dim: int) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
index_path: Path,
|
||||
dim: int,
|
||||
initial_capacity: int = 50000,
|
||||
auto_save: bool = False,
|
||||
expansion_threshold: float = 0.8,
|
||||
) -> None:
|
||||
"""Initialize ANN index.
|
||||
|
||||
Args:
|
||||
index_path: Path to SQLite database (index will be saved as _vectors.hnsw)
|
||||
dim: Dimension of embedding vectors
|
||||
initial_capacity: Initial maximum elements capacity (default: 50000)
|
||||
auto_save: Whether to automatically save index after operations (default: False)
|
||||
expansion_threshold: Capacity threshold to trigger auto-expansion (default: 0.8)
|
||||
|
||||
Raises:
|
||||
ImportError: If required dependencies are not available
|
||||
ValueError: If dimension is invalid
|
||||
ValueError: If dimension or capacity is invalid
|
||||
"""
|
||||
if not SEMANTIC_AVAILABLE:
|
||||
raise ImportError(
|
||||
@@ -74,6 +87,14 @@ class ANNIndex:
|
||||
if dim <= 0:
|
||||
raise ValueError(f"Invalid dimension: {dim}")
|
||||
|
||||
if initial_capacity <= 0:
|
||||
raise ValueError(f"Invalid initial capacity: {initial_capacity}")
|
||||
|
||||
if not 0.0 < expansion_threshold < 1.0:
|
||||
raise ValueError(
|
||||
f"Invalid expansion threshold: {expansion_threshold}. Must be between 0 and 1."
|
||||
)
|
||||
|
||||
self.index_path = Path(index_path)
|
||||
self.dim = dim
|
||||
|
||||
@@ -89,14 +110,23 @@ class ANNIndex:
|
||||
self.ef_construction = 200 # Build-time search width (higher = better quality)
|
||||
self.ef = 50 # Query-time search width (higher = better recall)
|
||||
|
||||
# Memory management parameters
|
||||
self._auto_save = auto_save
|
||||
self._expansion_threshold = expansion_threshold
|
||||
|
||||
# Thread safety
|
||||
self._lock = threading.RLock()
|
||||
|
||||
# HNSW index instance
|
||||
self._index: Optional[hnswlib.Index] = None
|
||||
self._max_elements = 1000000 # Initial capacity (auto-resizes)
|
||||
self._max_elements = initial_capacity # Initial capacity (reduced from 1M to 50K)
|
||||
self._current_count = 0 # Track number of vectors
|
||||
|
||||
logger.info(
|
||||
f"Initialized ANNIndex with capacity={initial_capacity}, "
|
||||
f"auto_save={auto_save}, expansion_threshold={expansion_threshold}"
|
||||
)
|
||||
|
||||
def _ensure_index(self) -> None:
|
||||
"""Ensure HNSW index is initialized (lazy initialization)."""
|
||||
if self._index is None:
|
||||
@@ -108,6 +138,33 @@ class ANNIndex:
|
||||
)
|
||||
self._index.set_ef(self.ef)
|
||||
self._current_count = 0
|
||||
logger.debug(f"Created new HNSW index with capacity {self._max_elements}")
|
||||
|
||||
def _auto_expand_if_needed(self, additional_count: int) -> None:
|
||||
"""Auto-expand index capacity if threshold is reached.
|
||||
|
||||
Args:
|
||||
additional_count: Number of vectors to be added
|
||||
|
||||
Note:
|
||||
This is called internally by add_vectors and is thread-safe.
|
||||
"""
|
||||
usage_ratio = (self._current_count + additional_count) / self._max_elements
|
||||
|
||||
if usage_ratio >= self._expansion_threshold:
|
||||
# Calculate new capacity (2x current or enough to fit new vectors)
|
||||
new_capacity = max(
|
||||
self._max_elements * 2,
|
||||
self._current_count + additional_count,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Expanding index capacity: {self._max_elements} -> {new_capacity} "
|
||||
f"(usage: {usage_ratio:.1%}, threshold: {self._expansion_threshold:.1%})"
|
||||
)
|
||||
|
||||
self._index.resize_index(new_capacity)
|
||||
self._max_elements = new_capacity
|
||||
|
||||
def add_vectors(self, ids: List[int], vectors: np.ndarray) -> None:
|
||||
"""Add vectors to the index.
|
||||
@@ -137,14 +194,8 @@ class ANNIndex:
|
||||
try:
|
||||
self._ensure_index()
|
||||
|
||||
# Resize index if needed
|
||||
if self._current_count + len(ids) > self._max_elements:
|
||||
new_max = max(
|
||||
self._max_elements * 2,
|
||||
self._current_count + len(ids)
|
||||
)
|
||||
self._index.resize_index(new_max)
|
||||
self._max_elements = new_max
|
||||
# Auto-expand if threshold reached
|
||||
self._auto_expand_if_needed(len(ids))
|
||||
|
||||
# Ensure vectors are C-contiguous float32 (hnswlib requirement)
|
||||
if not vectors.flags['C_CONTIGUOUS'] or vectors.dtype != np.float32:
|
||||
@@ -154,6 +205,15 @@ class ANNIndex:
|
||||
self._index.add_items(vectors, ids)
|
||||
self._current_count += len(ids)
|
||||
|
||||
logger.debug(
|
||||
f"Added {len(ids)} vectors to index "
|
||||
f"(total: {self._current_count}/{self._max_elements})"
|
||||
)
|
||||
|
||||
# Auto-save if enabled
|
||||
if self._auto_save:
|
||||
self.save()
|
||||
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to add vectors to ANN index: {e}")
|
||||
|
||||
@@ -178,13 +238,21 @@ class ANNIndex:
|
||||
return # Nothing to remove
|
||||
|
||||
# Mark vectors as deleted
|
||||
deleted_count = 0
|
||||
for vec_id in ids:
|
||||
try:
|
||||
self._index.mark_deleted(vec_id)
|
||||
deleted_count += 1
|
||||
except RuntimeError:
|
||||
# ID not found - ignore (idempotent deletion)
|
||||
pass
|
||||
|
||||
logger.debug(f"Marked {deleted_count}/{len(ids)} vectors as deleted")
|
||||
|
||||
# Auto-save if enabled
|
||||
if self._auto_save and deleted_count > 0:
|
||||
self.save()
|
||||
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to remove vectors from ANN index: {e}")
|
||||
|
||||
@@ -248,6 +316,7 @@ class ANNIndex:
|
||||
with self._lock:
|
||||
try:
|
||||
if self._index is None or self._current_count == 0:
|
||||
logger.debug("Skipping save: index is empty")
|
||||
return # Nothing to save
|
||||
|
||||
# Ensure parent directory exists
|
||||
@@ -256,6 +325,11 @@ class ANNIndex:
|
||||
# Save index
|
||||
self._index.save_index(str(self.hnsw_path))
|
||||
|
||||
logger.debug(
|
||||
f"Saved index to {self.hnsw_path} "
|
||||
f"({self._current_count} vectors, capacity: {self._max_elements})"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise StorageError(f"Failed to save ANN index: {e}")
|
||||
|
||||
@@ -271,20 +345,28 @@ class ANNIndex:
|
||||
with self._lock:
|
||||
try:
|
||||
if not self.hnsw_path.exists():
|
||||
logger.debug(f"Index file not found: {self.hnsw_path}")
|
||||
return False # Index file doesn't exist (not an error)
|
||||
|
||||
# Create fresh index object for loading (don't call init_index first)
|
||||
self._index = hnswlib.Index(space=self.space, dim=self.dim)
|
||||
|
||||
# Load index from disk
|
||||
# Note: max_elements here is just for initial allocation, can expand later
|
||||
self._index.load_index(str(self.hnsw_path), max_elements=self._max_elements)
|
||||
|
||||
# Update count from loaded index
|
||||
# Update count and capacity from loaded index
|
||||
self._current_count = self._index.get_current_count()
|
||||
self._max_elements = self._index.get_max_elements()
|
||||
|
||||
# Set query-time ef parameter
|
||||
self._index.set_ef(self.ef)
|
||||
|
||||
logger.info(
|
||||
f"Loaded index from {self.hnsw_path} "
|
||||
f"({self._current_count} vectors, capacity: {self._max_elements})"
|
||||
)
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
@@ -299,6 +381,28 @@ class ANNIndex:
|
||||
with self._lock:
|
||||
return self._current_count
|
||||
|
||||
@property
|
||||
def capacity(self) -> int:
|
||||
"""Get current maximum capacity of the index.
|
||||
|
||||
Returns:
|
||||
Maximum number of vectors the index can hold before expansion
|
||||
"""
|
||||
with self._lock:
|
||||
return self._max_elements
|
||||
|
||||
@property
|
||||
def usage_ratio(self) -> float:
|
||||
"""Get current usage ratio (count / capacity).
|
||||
|
||||
Returns:
|
||||
Usage ratio between 0.0 and 1.0
|
||||
"""
|
||||
with self._lock:
|
||||
if self._max_elements == 0:
|
||||
return 0.0
|
||||
return self._current_count / self._max_elements
|
||||
|
||||
@property
|
||||
def is_loaded(self) -> bool:
|
||||
"""Check if index is loaded and ready for use.
|
||||
|
||||
Reference in New Issue
Block a user