mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-09 02:24:11 +08:00
fix: 修复嵌入生成内存泄漏,优化性能
- HNSW 索引:预分配从 100 万降至 5 万,添加动态扩容和可控保存
- Embedder:添加 embed_to_numpy() 避免 .tolist() 转换,增强缓存清理
- embedding_manager:每 10 批次重建 embedder 实例,显式 gc.collect()
- VectorStore:添加 bulk_insert() 上下文管理器,支持 numpy 批量写入
- Chunker:添加 skip_token_count 轻量模式,使用 char/4 估算(~9x 加速)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
"""Embedding Manager - Manage semantic embeddings for code indexes."""
|
||||
|
||||
import gc
|
||||
import logging
|
||||
import sqlite3
|
||||
import time
|
||||
@@ -9,7 +10,7 @@ from typing import Dict, List, Optional
|
||||
try:
|
||||
from codexlens.semantic import SEMANTIC_AVAILABLE
|
||||
if SEMANTIC_AVAILABLE:
|
||||
from codexlens.semantic.embedder import Embedder, get_embedder
|
||||
from codexlens.semantic.embedder import Embedder, get_embedder, clear_embedder_cache
|
||||
from codexlens.semantic.vector_store import VectorStore
|
||||
from codexlens.semantic.chunker import Chunker, ChunkConfig
|
||||
except ImportError:
|
||||
@@ -17,6 +18,9 @@ except ImportError:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Periodic embedder recreation interval to prevent memory accumulation
|
||||
EMBEDDER_RECREATION_INTERVAL = 10 # Recreate embedder every N batches
|
||||
|
||||
|
||||
def _get_path_column(conn: sqlite3.Connection) -> str:
|
||||
"""Detect whether files table uses 'path' or 'full_path' column.
|
||||
@@ -192,12 +196,13 @@ def generate_embeddings(
|
||||
|
||||
# Initialize components
|
||||
try:
|
||||
# Use cached embedder (singleton) for performance
|
||||
# Initialize embedder (will be periodically recreated to prevent memory leaks)
|
||||
embedder = get_embedder(profile=model_profile)
|
||||
chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
|
||||
progress_callback(f"Memory optimization: Embedder will be recreated every {EMBEDDER_RECREATION_INTERVAL} batches")
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
@@ -242,6 +247,14 @@ def generate_embeddings(
|
||||
batch_chunks_with_paths = []
|
||||
files_in_batch_with_chunks = set()
|
||||
|
||||
# Periodic embedder recreation to prevent memory accumulation
|
||||
if batch_number % EMBEDDER_RECREATION_INTERVAL == 0:
|
||||
if progress_callback:
|
||||
progress_callback(f" [Memory optimization] Recreating embedder at batch {batch_number}")
|
||||
clear_embedder_cache()
|
||||
embedder = get_embedder(profile=model_profile)
|
||||
gc.collect()
|
||||
|
||||
# Step 1: Chunking for the current file batch
|
||||
for file_row in file_batch:
|
||||
file_path = file_row[path_column]
|
||||
@@ -269,14 +282,19 @@ def generate_embeddings(
|
||||
if progress_callback:
|
||||
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
|
||||
|
||||
# Step 2: Generate embeddings for this batch
|
||||
# Step 2: Generate embeddings for this batch (use memory-efficient numpy method)
|
||||
batch_embeddings = []
|
||||
try:
|
||||
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
|
||||
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
|
||||
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
|
||||
embeddings = embedder.embed(batch_contents)
|
||||
# Use embed_to_numpy() to avoid unnecessary list conversion
|
||||
embeddings_numpy = embedder.embed_to_numpy(batch_contents)
|
||||
# Convert to list only for storage (VectorStore expects list format)
|
||||
embeddings = [emb.tolist() for emb in embeddings_numpy]
|
||||
batch_embeddings.extend(embeddings)
|
||||
# Explicit cleanup of intermediate data
|
||||
del batch_contents, embeddings_numpy
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
|
||||
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
|
||||
@@ -295,7 +313,9 @@ def generate_embeddings(
|
||||
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
|
||||
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
|
||||
|
||||
# Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
|
||||
# Explicit memory cleanup after each batch
|
||||
del batch_chunks_with_paths, batch_embeddings
|
||||
gc.collect()
|
||||
|
||||
except Exception as e:
|
||||
return {"success": False, "error": f"Failed to read or process files: {str(e)}"}
|
||||
|
||||
Reference in New Issue
Block a user