mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-05 01:50:27 +08:00)
feat: introduce a streaming generator to optimize memory usage and improve the embedding generation process
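Note: the core idea of the change is that chunks are produced lazily by a generator and consumed in fixed-size batches via itertools.islice, so peak memory is bounded by the embedding batch size rather than by the total number of chunks. Below is a minimal, standalone sketch of that pattern; the helper names (chunk_stream, consume) and the fake rows are illustrative only and not taken from the repository.

from itertools import islice
from typing import Generator, List, Tuple

EMBEDDING_BATCH_SIZE = 64  # matches the constant used in the diff

def chunk_stream(rows: List[Tuple[str, str]]) -> Generator[Tuple[str, str], None, None]:
    """Yield (chunk, file_path) pairs one at a time instead of building one big list."""
    for path, content in rows:
        for i in range(0, len(content), 10):  # stand-in for real sliding-window chunking
            yield (content[i:i + 10], path)

def consume(rows: List[Tuple[str, str]]) -> int:
    gen = chunk_stream(rows)
    total = 0
    while True:
        # At most EMBEDDING_BATCH_SIZE chunks are ever held in memory at once.
        batch = list(islice(gen, EMBEDDING_BATCH_SIZE))
        if not batch:
            break
        total += len(batch)  # embed and store the batch here
    return total

if __name__ == "__main__":
    rows = [("a.py", "x" * 1000), ("b.py", "y" * 500)]
    print(consume(rows))  # 150 chunks, processed 64 at a time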
@@ -4,8 +4,9 @@ import gc
 import logging
 import sqlite3
 import time
+from itertools import islice
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, Generator, List, Optional, Tuple

 try:
     from codexlens.semantic import SEMANTIC_AVAILABLE
@@ -23,6 +24,61 @@ logger = logging.getLogger(__name__)
 EMBEDDING_BATCH_SIZE = 64 # Increased from 8 for better performance


+def _generate_chunks_from_cursor(
+    cursor,
+    chunker,
+    path_column: str,
+    file_batch_size: int,
+    failed_files: List[Tuple[str, str]],
+) -> Generator[Tuple, None, Tuple[int, int]]:
+    """Generator that yields chunks from database cursor in a streaming fashion.
+
+    This avoids loading all chunks into memory at once, significantly reducing
+    peak memory usage for large codebases.
+
+    Args:
+        cursor: SQLite cursor with file data
+        chunker: Chunker instance for splitting files
+        path_column: Column name for file path
+        file_batch_size: Number of files to fetch at a time
+        failed_files: List to append failed files to
+
+    Yields:
+        (chunk, file_path) tuples
+
+    Returns:
+        (total_files_processed, batch_count) after iteration completes
+    """
+    total_files = 0
+    batch_count = 0
+
+    while True:
+        file_batch = cursor.fetchmany(file_batch_size)
+        if not file_batch:
+            break
+
+        batch_count += 1
+
+        for file_row in file_batch:
+            file_path = file_row[path_column]
+            content = file_row["content"]
+            language = file_row["language"] or "python"
+
+            try:
+                chunks = chunker.chunk_sliding_window(
+                    content,
+                    file_path=file_path,
+                    language=language
+                )
+                if chunks:
+                    total_files += 1
+                    for chunk in chunks:
+                        yield (chunk, file_path)
+            except Exception as e:
+                logger.error(f"Failed to chunk {file_path}: {e}")
+                failed_files.append((file_path, str(e)))
+
+
 def _get_path_column(conn: sqlite3.Connection) -> str:
     """Detect whether files table uses 'path' or 'full_path' column.

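Note on the Generator[Tuple, None, Tuple[int, int]] annotation above: a generator's declared return value is not produced by next(); it is delivered through StopIteration.value once iteration finishes (or captured with yield from). A small standalone sketch of that mechanism, unrelated to the repository code:

from typing import Generator, Tuple

def demo() -> Generator[int, None, Tuple[int, int]]:
    for i in range(3):
        yield i
    return (1, 3)  # e.g. (total_files_processed, batch_count)

gen = demo()
items = []
while True:
    try:
        items.append(next(gen))
    except StopIteration as stop:
        summary = stop.value  # the generator's return value
        break

print(items)    # [0, 1, 2]
print(summary)  # (1, 3)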
@@ -199,7 +255,9 @@ def generate_embeddings(
     try:
         # Initialize embedder (singleton, reused throughout the function)
         embedder = get_embedder(profile=model_profile)
-        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
+        # skip_token_count=True: Use fast estimation (len/4) instead of expensive tiktoken
+        # This significantly reduces CPU usage with minimal impact on metadata accuracy
+        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size, skip_token_count=True))

         if progress_callback:
             progress_callback(f"Using model: {embedder.model_name} ({embedder.embedding_dim} dimensions)")
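The skip_token_count=True comment above refers to estimating tokens as roughly len(text) / 4 instead of running a tokenizer on every chunk. The sketch below only illustrates that trade-off; it is not the Chunker implementation, and the tiktoken call is shown purely as the exact-count alternative.

def estimate_tokens(text: str) -> int:
    """Fast heuristic used when token counting is skipped: ~4 characters per token."""
    return max(1, len(text) // 4)

def exact_tokens(text: str) -> int:
    """Exact count via a tokenizer; noticeably more CPU per chunk."""
    import tiktoken  # optional dependency, only needed for the exact path
    return len(tiktoken.get_encoding("cl100k_base").encode(text))

sample = "def add(a, b):\n    return a + b\n"
print(estimate_tokens(sample))  # cheap estimate, good enough for chunk metadata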
@@ -238,79 +296,51 @@ def generate_embeddings(
             progress_callback(f"Processing {total_files} files for embeddings in batches of {FILE_BATCH_SIZE}...")

         cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
-
+        # --- STREAMING GENERATOR APPROACH ---
+        # Instead of accumulating all chunks from 100 files, we use a generator
+        # that yields chunks on-demand, keeping memory usage low and constant.
+        chunk_generator = _generate_chunks_from_cursor(
+            cursor, chunker, path_column, FILE_BATCH_SIZE, failed_files
+        )
+
         batch_number = 0
+        files_seen = set()

         while True:
-            # Fetch a batch of files (streaming, not fetchall)
-            file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
-            if not file_batch:
+            # Get a small batch of chunks from the generator (EMBEDDING_BATCH_SIZE at a time)
+            chunk_batch = list(islice(chunk_generator, EMBEDDING_BATCH_SIZE))
+            if not chunk_batch:
                 break

             batch_number += 1
-            batch_chunks_with_paths = []
-            files_in_batch_with_chunks = set()

-            # Step 1: Chunking for the current file batch
-            for file_row in file_batch:
-                file_path = file_row[path_column]
-                content = file_row["content"]
-                language = file_row["language"] or "python"
+            # Track unique files for progress
+            for _, file_path in chunk_batch:
+                files_seen.add(file_path)

-                try:
-                    chunks = chunker.chunk_sliding_window(
-                        content,
-                        file_path=file_path,
-                        language=language
-                    )
-                    if chunks:
-                        for chunk in chunks:
-                            batch_chunks_with_paths.append((chunk, file_path))
-                        files_in_batch_with_chunks.add(file_path)
-                except Exception as e:
-                    logger.error(f"Failed to chunk {file_path}: {e}")
-                    failed_files.append((file_path, str(e)))
-
-            if not batch_chunks_with_paths:
-                continue
-
-            batch_chunk_count = len(batch_chunks_with_paths)
-            if progress_callback:
-                progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
-
-            # Step 2: Generate embeddings for this batch (use memory-efficient numpy method)
-            batch_embeddings = []
+            # Generate embeddings directly to numpy (no tolist() conversion)
             try:
-                for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
-                    batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
-                    batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
-                    # Use embed_to_numpy() to avoid unnecessary list conversion
-                    embeddings_numpy = embedder.embed_to_numpy(batch_contents)
-                    # Convert to list only for storage (VectorStore expects list format)
-                    embeddings = [emb.tolist() for emb in embeddings_numpy]
-                    batch_embeddings.extend(embeddings)
-                    # Explicit cleanup of intermediate data
-                    del batch_contents, embeddings_numpy
+                batch_contents = [chunk.content for chunk, _ in chunk_batch]
+                embeddings_numpy = embedder.embed_to_numpy(batch_contents)
+
+                # Use add_chunks_batch_numpy to avoid numpy->list->numpy roundtrip
+                vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
+
+                total_chunks_created += len(chunk_batch)
+                total_files_processed = len(files_seen)
+
+                if progress_callback and batch_number % 10 == 0:
+                    progress_callback(f" Batch {batch_number}: {total_chunks_created} chunks, {total_files_processed} files")
+
+                # Cleanup intermediate data
+                del batch_contents, embeddings_numpy, chunk_batch
+
             except Exception as e:
-                logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
-                failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+                logger.error(f"Failed to process embedding batch {batch_number}: {str(e)}")
                 # Continue to next batch instead of failing entirely
                 continue

-            # Step 3: Assign embeddings to chunks
-            for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
-                chunk.embedding = embedding
-
-            # Step 4: Store this batch to database (ANN update deferred in bulk_insert mode)
-            try:
-                vector_store.add_chunks_batch(batch_chunks_with_paths)
-                total_chunks_created += batch_chunk_count
-                total_files_processed += len(files_in_batch_with_chunks)
-            except Exception as e:
-                logger.error(f"Failed to store batch {batch_number}: {str(e)}")
-                failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
-
-            # Release batch references (let Python GC handle cleanup naturally)
-            del batch_chunks_with_paths, batch_embeddings
-
+        # Notify before ANN index finalization (happens when bulk_insert context exits)
+        if progress_callback:
+            progress_callback(f"Finalizing index... Building ANN index for {total_chunks_created} chunks")

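The new loop hands the float32 numpy batch straight to vector_store.add_chunks_batch_numpy instead of converting each embedding with tolist() and converting back later. The sketch below only illustrates why skipping that roundtrip helps (no per-dimension Python float objects are materialized); the store_* helpers are hypothetical and not the VectorStore API.

import numpy as np

def store_via_lists(embeddings: np.ndarray) -> list:
    """Old-style roundtrip: numpy -> Python lists -> back to float32 bytes."""
    rows = [row.tolist() for row in embeddings]
    return [np.asarray(r, dtype=np.float32).tobytes() for r in rows]

def store_numpy(embeddings: np.ndarray) -> list:
    """Keep the float32 array and serialize rows directly."""
    return [row.tobytes() for row in embeddings.astype(np.float32, copy=False)]

embeddings = np.random.rand(64, 384).astype(np.float32)  # one embedding batch
assert store_via_lists(embeddings) == store_numpy(embeddings)  # same bytes, far less churn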