mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-10 02:24:35 +08:00
refactor: 移除图索引功能,修复内存泄露,优化嵌入生成
主要更改:

1. 移除图索引功能 (graph indexing)
   - 删除 graph_analyzer.py 及相关迁移文件
   - 移除 CLI 的 graph 命令和 --enrich 标志
   - 清理 chain_search.py 中的图查询方法 (370行)
   - 删除相关测试文件

2. 修复嵌入生成内存问题
   - 重构 generate_embeddings.py 使用流式批处理
   - 改用 embedding_manager 的内存安全实现
   - 文件从 548 行精简到 259 行 (减少 52.7%)

3. 修复内存泄露
   - chain_search.py: quick_search 使用 with 语句管理 ChainSearchEngine
   - embedding_manager.py: 使用 with 语句管理 VectorStore
   - vector_store.py: 添加暴力搜索内存警告

4. 代码清理
   - 移除 Symbol 模型的 token_count 和 symbol_type 字段
   - 清理相关测试用例

测试: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -194,7 +194,6 @@ def generate_embeddings(
|
||||
try:
|
||||
# Use cached embedder (singleton) for performance
|
||||
embedder = get_embedder(profile=model_profile)
|
||||
vector_store = VectorStore(index_path)
|
||||
chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
|
||||
|
||||
if progress_callback:
|
||||
@@ -217,85 +216,86 @@ def generate_embeddings(
|
||||
EMBEDDING_BATCH_SIZE = 8 # jina-embeddings-v2-base-code needs small batches
|
||||
|
||||
try:
|
||||
with sqlite3.connect(index_path) as conn:
|
||||
conn.row_factory = sqlite3.Row
|
||||
path_column = _get_path_column(conn)
|
||||
with VectorStore(index_path) as vector_store:
|
||||
with sqlite3.connect(index_path) as conn:
|
||||
conn.row_factory = sqlite3.Row
|
||||
path_column = _get_path_column(conn)
|
||||
|
||||
# Get total file count for progress reporting
|
||||
total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
|
||||
if total_files == 0:
|
||||
return {"success": False, "error": "No files found in index"}
|
||||
# Get total file count for progress reporting
|
||||
total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
|
||||
if total_files == 0:
|
||||
return {"success": False, "error": "No files found in index"}
|
||||
|
||||
if progress_callback:
|
||||
progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
|
||||
|
||||
cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
|
||||
batch_number = 0
|
||||
|
||||
while True:
|
||||
# Fetch a batch of files (streaming, not fetchall)
|
||||
file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
|
||||
if not file_batch:
|
||||
break
|
||||
|
||||
batch_number += 1
|
||||
batch_chunks_with_paths = []
|
||||
files_in_batch_with_chunks = set()
|
||||
|
||||
# Step 1: Chunking for the current file batch
|
||||
for file_row in file_batch:
|
||||
file_path = file_row[path_column]
|
||||
content = file_row["content"]
|
||||
language = file_row["language"] or "python"
|
||||
|
||||
try:
|
||||
chunks = chunker.chunk_sliding_window(
|
||||
content,
|
||||
file_path=file_path,
|
||||
language=language
|
||||
)
|
||||
if chunks:
|
||||
for chunk in chunks:
|
||||
batch_chunks_with_paths.append((chunk, file_path))
|
||||
files_in_batch_with_chunks.add(file_path)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to chunk {file_path}: {e}")
|
||||
failed_files.append((file_path, str(e)))
|
||||
|
||||
if not batch_chunks_with_paths:
|
||||
continue
|
||||
|
||||
batch_chunk_count = len(batch_chunks_with_paths)
|
||||
if progress_callback:
|
||||
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
|
||||
progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
|
||||
|
||||
# Step 2: Generate embeddings for this batch
|
||||
batch_embeddings = []
|
||||
try:
|
||||
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
|
||||
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
|
||||
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
|
||||
embeddings = embedder.embed(batch_contents)
|
||||
batch_embeddings.extend(embeddings)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
|
||||
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
|
||||
continue
|
||||
cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
|
||||
batch_number = 0
|
||||
|
||||
# Step 3: Assign embeddings to chunks
|
||||
for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
|
||||
chunk.embedding = embedding
|
||||
while True:
|
||||
# Fetch a batch of files (streaming, not fetchall)
|
||||
file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
|
||||
if not file_batch:
|
||||
break
|
||||
|
||||
# Step 4: Store this batch to database immediately (releases memory)
|
||||
try:
|
||||
vector_store.add_chunks_batch(batch_chunks_with_paths)
|
||||
total_chunks_created += batch_chunk_count
|
||||
total_files_processed += len(files_in_batch_with_chunks)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
|
||||
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
|
||||
batch_number += 1
|
||||
batch_chunks_with_paths = []
|
||||
files_in_batch_with_chunks = set()
|
||||
|
||||
# Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
|
||||
# Step 1: Chunking for the current file batch
|
||||
for file_row in file_batch:
|
||||
file_path = file_row[path_column]
|
||||
content = file_row["content"]
|
||||
language = file_row["language"] or "python"
|
||||
|
||||
try:
|
||||
chunks = chunker.chunk_sliding_window(
|
||||
content,
|
||||
file_path=file_path,
|
||||
language=language
|
||||
)
|
||||
if chunks:
|
||||
for chunk in chunks:
|
||||
batch_chunks_with_paths.append((chunk, file_path))
|
||||
files_in_batch_with_chunks.add(file_path)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to chunk {file_path}: {e}")
|
||||
failed_files.append((file_path, str(e)))
|
||||
|
||||
if not batch_chunks_with_paths:
|
||||
continue
|
||||
|
||||
batch_chunk_count = len(batch_chunks_with_paths)
|
||||
if progress_callback:
|
||||
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
|
||||
|
||||
# Step 2: Generate embeddings for this batch
|
||||
batch_embeddings = []
|
||||
try:
|
||||
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
|
||||
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
|
||||
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
|
||||
embeddings = embedder.embed(batch_contents)
|
||||
batch_embeddings.extend(embeddings)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
|
||||
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
|
||||
continue
|
||||
|
||||
# Step 3: Assign embeddings to chunks
|
||||
for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
|
||||
chunk.embedding = embedding
|
||||
|
||||
# Step 4: Store this batch to database immediately (releases memory)
|
||||
try:
|
||||
vector_store.add_chunks_batch(batch_chunks_with_paths)
|
||||
total_chunks_created += batch_chunk_count
|
||||
total_files_processed += len(files_in_batch_with_chunks)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
|
||||
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
|
||||
|
||||
# Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
|
||||
|
||||
except Exception as e:
|
||||
return {"success": False, "error": f"Failed to read or process files: {str(e)}"}
|
||||
|
||||
Reference in New Issue
Block a user