Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-05 01:50:27 +08:00)
refactor: remove graph indexing, fix memory leaks, optimize embedding generation
Main changes:

1. Remove graph indexing
   - Delete graph_analyzer.py and its related migration files
   - Remove the CLI `graph` command and the `--enrich` flag
   - Clean up the graph-query methods in chain_search.py (370 lines)
   - Delete the related test files

2. Fix memory problems in embedding generation (see the streaming sketch below)
   - Refactor generate_embeddings.py to use streaming batch processing
   - Switch to the memory-safe implementation in embedding_manager
   - Shrink the file from 548 to 259 lines (a 52.7% reduction)

3. Fix memory leaks (see the context-manager sketch below; the brute-force warning is sketched after the diff)
   - chain_search.py: quick_search now manages ChainSearchEngine with a `with` statement
   - embedding_manager.py: manage VectorStore with a `with` statement
   - vector_store.py: add a memory warning for brute-force search

4. Code cleanup
   - Remove the token_count and symbol_type fields from the Symbol model
   - Clean up the related test cases

Tests: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
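A minimal sketch of the `with`-statement fix described in item 3. The names `ChainSearchEngine` and `quick_search` come from the commit message; the class body here is an illustrative stand-in (a SQLite handle), not the project's actual implementation:

```python
import sqlite3


class ChainSearchEngine:
    """Hypothetical stand-in: owns a handle that must be released after use."""

    def __init__(self, index_path: str = ":memory:"):
        self._index_path = index_path
        self._conn = None

    def __enter__(self):
        self._conn = sqlite3.connect(self._index_path)
        return self

    def __exit__(self, exc_type, exc, tb):
        # Close deterministically, even if search() raised.
        if self._conn is not None:
            self._conn.close()
            self._conn = None
        return False

    def search(self, query: str):
        return self._conn.execute("SELECT ?", (query,)).fetchall()


def quick_search(query: str):
    # The fix: scope the engine to a `with` block so its resources are
    # released on every call instead of leaking across repeated searches.
    with ChainSearchEngine() as engine:
        return engine.search(query)


print(quick_search("hello"))  # [('hello',)]
```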
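And item 2's streaming idea in miniature: embed and persist one bounded batch at a time, so peak memory tracks the batch size rather than the corpus. `embed_batch` and `store_batch` are placeholders, not the embedding_manager API:

```python
from typing import Iterable, Iterator, List


def batched(items: Iterable[str], size: int) -> Iterator[List[str]]:
    """Yield fixed-size batches without materializing the whole input."""
    batch: List[str] = []
    for item in items:
        batch.append(item)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch


def embed_batch(texts: List[str]) -> List[List[float]]:
    # Placeholder: a real implementation calls the embedding model here.
    return [[float(len(t))] for t in texts]


def store_batch(vectors: List[List[float]]) -> None:
    # Placeholder: a real implementation writes to SQLite/HNSW here.
    del vectors


def generate_embeddings_streaming(chunks: Iterable[str], batch_size: int = 256) -> int:
    """Embed and store batch by batch; nothing accumulates between batches."""
    total = 0
    for batch in batched(chunks, batch_size):
        store_batch(embed_batch(batch))  # each batch becomes garbage after storing
        total += len(batch)
    return total


print(generate_embeddings_streaming(["def f(): pass"] * 1000))  # 1000
```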
@@ -1,15 +1,9 @@
 #!/usr/bin/env python3
 """Generate vector embeddings for existing CodexLens indexes.
 
 This script processes all files in a CodexLens index database and generates
 semantic vector embeddings for code chunks. The embeddings are stored in the
 same SQLite database in the 'semantic_chunks' table.
 
-Performance optimizations:
-- Parallel file processing using ProcessPoolExecutor
-- Batch embedding generation for efficient GPU/CPU utilization
-- Batch database writes to minimize I/O overhead
-- HNSW index auto-generation for fast similarity search
+This script is a CLI wrapper around the memory-efficient streaming implementation
+in codexlens.cli.embedding_manager. It uses batch processing to keep memory usage
+under 2GB regardless of project size.
 
 Requirements:
     pip install codexlens[semantic]
@@ -20,27 +14,21 @@ Usage:
     # Generate embeddings for a single index
     python generate_embeddings.py /path/to/_index.db
 
-    # Generate embeddings with parallel processing
-    python generate_embeddings.py /path/to/_index.db --workers 4
-
-    # Use specific embedding model and batch size
-    python generate_embeddings.py /path/to/_index.db --model code --batch-size 256
+    # Use specific embedding model
+    python generate_embeddings.py /path/to/_index.db --model code
 
     # Generate embeddings for all indexes in a directory
    python generate_embeddings.py --scan ~/.codexlens/indexes
 
     # Force regeneration
     python generate_embeddings.py /path/to/_index.db --force
 """
 
 import argparse
 import logging
-import multiprocessing
 import os
 import sqlite3
 import sys
 import time
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import List
 
 # Configure logging
 logging.basicConfig(
@@ -50,100 +38,32 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
 
-@dataclass
-class FileData:
-    """Data for a single file to process."""
-    full_path: str
-    content: str
-    language: str
-
-
-@dataclass
-class ChunkData:
-    """Processed chunk data ready for embedding."""
-    file_path: str
-    content: str
-    metadata: dict
+# Import the memory-efficient implementation
+try:
+    from codexlens.cli.embedding_manager import (
+        generate_embeddings,
+        generate_embeddings_recursive,
+    )
+    from codexlens.semantic import SEMANTIC_AVAILABLE
+except ImportError as exc:
+    logger.error(f"Failed to import codexlens: {exc}")
+    logger.error("Make sure codexlens is installed: pip install codexlens")
+    SEMANTIC_AVAILABLE = False
 
 
 def check_dependencies():
     """Check if semantic search dependencies are available."""
-    try:
-        from codexlens.semantic import SEMANTIC_AVAILABLE
-        if not SEMANTIC_AVAILABLE:
-            logger.error("Semantic search dependencies not available")
-            logger.error("Install with: pip install codexlens[semantic]")
-            logger.error("Or: pip install fastembed numpy hnswlib")
-            return False
-        return True
-    except ImportError as exc:
-        logger.error(f"Failed to import codexlens: {exc}")
-        logger.error("Make sure codexlens is installed: pip install codexlens")
+    if not SEMANTIC_AVAILABLE:
+        logger.error("Semantic search dependencies not available")
+        logger.error("Install with: pip install codexlens[semantic]")
+        logger.error("Or: pip install fastembed numpy hnswlib")
         return False
     return True
 
 
-def count_files(index_db_path: Path) -> int:
-    """Count total files in index."""
-    try:
-        with sqlite3.connect(index_db_path) as conn:
-            cursor = conn.execute("SELECT COUNT(*) FROM files")
-            return cursor.fetchone()[0]
-    except Exception as exc:
-        logger.error(f"Failed to count files: {exc}")
-        return 0
-
-
 def check_existing_chunks(index_db_path: Path) -> int:
     """Check if semantic chunks already exist."""
     try:
         with sqlite3.connect(index_db_path) as conn:
             # Check if table exists
             cursor = conn.execute(
                 "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
             )
             if not cursor.fetchone():
                 return 0
 
             # Count existing chunks
             cursor = conn.execute("SELECT COUNT(*) FROM semantic_chunks")
             return cursor.fetchone()[0]
     except Exception:
         return 0
 
 
-def process_file_worker(args: Tuple[str, str, str, int]) -> List[ChunkData]:
-    """Worker function to process a single file (runs in separate process).
-
-    Args:
-        args: Tuple of (file_path, content, language, chunk_size)
-
-    Returns:
-        List of ChunkData objects
-    """
-    file_path, content, language, chunk_size = args
-
-    try:
-        from codexlens.semantic.chunker import Chunker, ChunkConfig
-
-        chunker = Chunker(config=ChunkConfig(max_chunk_size=chunk_size))
-        chunks = chunker.chunk_sliding_window(
-            content,
-            file_path=file_path,
-            language=language
-        )
-
-        return [
-            ChunkData(
-                file_path=file_path,
-                content=chunk.content,
-                metadata=chunk.metadata or {}
-            )
-            for chunk in chunks
-        ]
-    except Exception as exc:
-        logger.debug(f"Error processing {file_path}: {exc}")
-        return []
+def progress_callback(message: str):
+    """Callback function for progress updates."""
+    logger.info(message)
 
 
 def generate_embeddings_for_index(
@@ -151,259 +71,63 @@ def generate_embeddings_for_index(
     model_profile: str = "code",
     force: bool = False,
     chunk_size: int = 2000,
-    workers: int = 0,
-    batch_size: int = 256,
+    **kwargs  # Ignore unused parameters (workers, batch_size) for backward compatibility
 ) -> dict:
-    """Generate embeddings for all files in an index.
+    """Generate embeddings for an index using memory-efficient streaming.
 
-    Performance optimizations:
-    - Parallel file processing (chunking)
-    - Batch embedding generation
-    - Batch database writes
-    - HNSW index auto-generation
+    This function wraps the streaming implementation from embedding_manager
+    to maintain CLI compatibility while using the memory-optimized approach.
 
     Args:
         index_db_path: Path to _index.db file
         model_profile: Model profile to use (fast, code, multilingual, balanced)
         force: If True, regenerate even if embeddings exist
         chunk_size: Maximum chunk size in characters
-        workers: Number of parallel workers (0 = auto-detect CPU count)
-        batch_size: Batch size for embedding generation
+        **kwargs: Additional parameters (ignored for compatibility)
 
     Returns:
         Dictionary with generation statistics
     """
     logger.info(f"Processing index: {index_db_path}")
 
     # Check existing chunks
     existing_chunks = check_existing_chunks(index_db_path)
     if existing_chunks > 0 and not force:
         logger.warning(f"Index already has {existing_chunks} chunks")
         logger.warning("Use --force to regenerate")
         return {
             "success": False,
             "error": "Embeddings already exist",
             "existing_chunks": existing_chunks,
         }
 
+    # Call the memory-efficient streaming implementation
+    result = generate_embeddings(
+        index_path=index_db_path,
+        model_profile=model_profile,
+        force=force,
+        chunk_size=chunk_size,
+        progress_callback=progress_callback,
+    )
 
-    if force and existing_chunks > 0:
-        logger.info(f"Force mode: clearing {existing_chunks} existing chunks")
-        try:
-            with sqlite3.connect(index_db_path) as conn:
-                conn.execute("DELETE FROM semantic_chunks")
-                conn.commit()
-            # Also remove HNSW index file
-            hnsw_path = index_db_path.parent / "_vectors.hnsw"
-            if hnsw_path.exists():
-                hnsw_path.unlink()
-                logger.info("Removed existing HNSW index")
-        except Exception as exc:
-            logger.error(f"Failed to clear existing data: {exc}")
+    if not result["success"]:
+        if "error" in result:
+            logger.error(result["error"])
+        return result
 
-    # Import dependencies
-    try:
-        from codexlens.semantic.embedder import Embedder
-        from codexlens.semantic.vector_store import VectorStore
-        from codexlens.entities import SemanticChunk
-    except ImportError as exc:
-        return {
-            "success": False,
-            "error": f"Import failed: {exc}",
-        }
-
-    # Initialize components
-    try:
-        embedder = Embedder(profile=model_profile)
-        vector_store = VectorStore(index_db_path)
-
-        logger.info(f"Using model: {embedder.model_name}")
-        logger.info(f"Embedding dimension: {embedder.embedding_dim}")
-    except Exception as exc:
-        return {
-            "success": False,
-            "error": f"Failed to initialize components: {exc}",
-        }
-
-    # Read files from index
-    try:
-        with sqlite3.connect(index_db_path) as conn:
-            conn.row_factory = sqlite3.Row
-            cursor = conn.execute("SELECT full_path, content, language FROM files")
-            files = [
-                FileData(
-                    full_path=row["full_path"],
-                    content=row["content"],
-                    language=row["language"] or "python"
-                )
-                for row in cursor.fetchall()
-            ]
-    except Exception as exc:
-        return {
-            "success": False,
-            "error": f"Failed to read files: {exc}",
-        }
-
-    logger.info(f"Found {len(files)} files to process")
-    if len(files) == 0:
-        return {
-            "success": False,
-            "error": "No files found in index",
-        }
-
-    # Determine worker count
-    if workers <= 0:
-        workers = min(multiprocessing.cpu_count(), len(files), 8)
-    logger.info(f"Using {workers} worker(s) for parallel processing")
-    logger.info(f"Batch size for embeddings: {batch_size}")
-
-    start_time = time.time()
-
-    # Phase 1: Parallel chunking
-    logger.info("Phase 1: Chunking files...")
-    chunk_start = time.time()
-
-    all_chunks: List[ChunkData] = []
-    failed_files = []
-
-    # Prepare work items
-    work_items = [
-        (f.full_path, f.content, f.language, chunk_size)
-        for f in files
-    ]
-
-    if workers == 1:
-        # Single-threaded for debugging
-        for i, item in enumerate(work_items, 1):
-            try:
-                chunks = process_file_worker(item)
-                all_chunks.extend(chunks)
-                if i % 100 == 0:
-                    logger.info(f"Chunked {i}/{len(files)} files ({len(all_chunks)} chunks)")
-            except Exception as exc:
-                failed_files.append((item[0], str(exc)))
-    else:
-        # Parallel processing
-        with ProcessPoolExecutor(max_workers=workers) as executor:
-            futures = {
-                executor.submit(process_file_worker, item): item[0]
-                for item in work_items
-            }
-
-            completed = 0
-            for future in as_completed(futures):
-                file_path = futures[future]
-                completed += 1
-                try:
-                    chunks = future.result()
-                    all_chunks.extend(chunks)
-                    if completed % 100 == 0:
-                        logger.info(
-                            f"Chunked {completed}/{len(files)} files "
-                            f"({len(all_chunks)} chunks)"
-                        )
-                except Exception as exc:
-                    failed_files.append((file_path, str(exc)))
-
-    chunk_time = time.time() - chunk_start
-    logger.info(f"Chunking completed in {chunk_time:.1f}s: {len(all_chunks)} chunks")
-
-    if not all_chunks:
-        return {
-            "success": False,
-            "error": "No chunks created from files",
-            "files_processed": len(files) - len(failed_files),
-            "files_failed": len(failed_files),
-        }
-
-    # Phase 2: Batch embedding generation
-    logger.info("Phase 2: Generating embeddings...")
-    embed_start = time.time()
-
-    # Extract all content for batch embedding
-    all_contents = [c.content for c in all_chunks]
-
-    # Generate embeddings in batches
-    all_embeddings = []
-    for i in range(0, len(all_contents), batch_size):
-        batch_contents = all_contents[i:i + batch_size]
-        batch_embeddings = embedder.embed(batch_contents)
-        all_embeddings.extend(batch_embeddings)
-
-        progress = min(i + batch_size, len(all_contents))
-        if progress % (batch_size * 4) == 0 or progress == len(all_contents):
-            logger.info(f"Generated embeddings: {progress}/{len(all_contents)}")
-
-    embed_time = time.time() - embed_start
-    logger.info(f"Embedding completed in {embed_time:.1f}s")
-
-    # Phase 3: Batch database write
-    logger.info("Phase 3: Storing chunks...")
-    store_start = time.time()
-
-    # Create SemanticChunk objects with embeddings
-    semantic_chunks_with_paths = []
-    for chunk_data, embedding in zip(all_chunks, all_embeddings):
-        semantic_chunk = SemanticChunk(
-            content=chunk_data.content,
-            metadata=chunk_data.metadata,
-        )
-        semantic_chunk.embedding = embedding
-        semantic_chunks_with_paths.append((semantic_chunk, chunk_data.file_path))
-
-    # Batch write (handles both SQLite and HNSW)
-    write_batch_size = 1000
-    total_stored = 0
-    for i in range(0, len(semantic_chunks_with_paths), write_batch_size):
-        batch = semantic_chunks_with_paths[i:i + write_batch_size]
-        vector_store.add_chunks_batch(batch)
-        total_stored += len(batch)
-        if total_stored % 5000 == 0 or total_stored == len(semantic_chunks_with_paths):
-            logger.info(f"Stored: {total_stored}/{len(semantic_chunks_with_paths)} chunks")
-
-    store_time = time.time() - store_start
-    logger.info(f"Storage completed in {store_time:.1f}s")
-
-    elapsed_time = time.time() - start_time
-
-    # Generate summary
+    # Extract result data and log summary
+    data = result["result"]
     logger.info("=" * 60)
-    logger.info(f"Completed in {elapsed_time:.1f}s")
-    logger.info(f"  Chunking: {chunk_time:.1f}s")
-    logger.info(f"  Embedding: {embed_time:.1f}s")
-    logger.info(f"  Storage: {store_time:.1f}s")
-    logger.info(f"Total chunks created: {len(all_chunks)}")
-    logger.info(f"Files processed: {len(files) - len(failed_files)}/{len(files)}")
-    if vector_store.ann_available:
-        logger.info(f"HNSW index vectors: {vector_store.ann_count}")
-    if failed_files:
-        logger.warning(f"Failed files: {len(failed_files)}")
-        for file_path, error in failed_files[:5]:  # Show first 5 failures
-            logger.warning(f"  {file_path}: {error}")
+    logger.info(f"Completed in {data['elapsed_time']:.1f}s")
+    logger.info(f"Total chunks created: {data['chunks_created']}")
+    logger.info(f"Files processed: {data['files_processed']}")
+    if data['files_failed'] > 0:
+        logger.warning(f"Failed files: {data['files_failed']}")
+        if data.get('failed_files'):
+            for file_path, error in data['failed_files']:
+                logger.warning(f"  {file_path}: {error}")
 
     return {
         "success": True,
-        "chunks_created": len(all_chunks),
-        "files_processed": len(files) - len(failed_files),
-        "files_failed": len(failed_files),
-        "elapsed_time": elapsed_time,
-        "chunk_time": chunk_time,
-        "embed_time": embed_time,
-        "store_time": store_time,
-        "ann_vectors": vector_store.ann_count if vector_store.ann_available else 0,
+        "chunks_created": data["chunks_created"],
+        "files_processed": data["files_processed"],
+        "files_failed": data["files_failed"],
+        "elapsed_time": data["elapsed_time"],
     }
 
 
-def find_index_databases(scan_dir: Path) -> List[Path]:
-    """Find all _index.db files in directory tree."""
-    logger.info(f"Scanning for indexes in: {scan_dir}")
-    index_files = list(scan_dir.rglob("_index.db"))
-    logger.info(f"Found {len(index_files)} index databases")
-    return index_files
-
-
 def main():
     parser = argparse.ArgumentParser(
-        description="Generate vector embeddings for CodexLens indexes",
+        description="Generate vector embeddings for CodexLens indexes (memory-efficient streaming)",
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog=__doc__
     )
@@ -439,14 +163,14 @@ def main():
         "--workers",
         type=int,
         default=0,
-        help="Number of parallel workers for chunking (default: auto-detect CPU count)"
+        help="(Deprecated) Kept for backward compatibility, ignored"
     )
 
     parser.add_argument(
         "--batch-size",
         type=int,
         default=256,
-        help="Batch size for embedding generation (default: 256)"
+        help="(Deprecated) Kept for backward compatibility, ignored"
     )
 
     parser.add_argument(
@@ -481,43 +205,33 @@ def main():
 
     # Determine if scanning or single file
     if args.scan or index_path.is_dir():
-        # Scan mode
+        # Scan mode - use recursive implementation
        if index_path.is_file():
             logger.error("--scan requires a directory path")
             sys.exit(1)
 
-        index_files = find_index_databases(index_path)
-        if not index_files:
-            logger.error(f"No index databases found in: {index_path}")
+        result = generate_embeddings_recursive(
+            index_root=index_path,
+            model_profile=args.model,
+            force=args.force,
+            chunk_size=args.chunk_size,
+            progress_callback=progress_callback,
+        )
+
+        if not result["success"]:
+            logger.error(f"Failed: {result.get('error', 'Unknown error')}")
             sys.exit(1)
 
-        # Process each index
-        total_chunks = 0
-        successful = 0
-        for idx, index_file in enumerate(index_files, 1):
-            logger.info(f"\n{'='*60}")
-            logger.info(f"Processing index {idx}/{len(index_files)}")
-            logger.info(f"{'='*60}")
-
-            result = generate_embeddings_for_index(
-                index_file,
-                model_profile=args.model,
-                force=args.force,
-                chunk_size=args.chunk_size,
-                workers=args.workers,
-                batch_size=args.batch_size,
-            )
-
-            if result["success"]:
-                total_chunks += result["chunks_created"]
-                successful += 1
-
-        # Final summary
+        # Log summary
+        data = result["result"]
         logger.info(f"\n{'='*60}")
         logger.info("BATCH PROCESSING COMPLETE")
         logger.info(f"{'='*60}")
-        logger.info(f"Indexes processed: {successful}/{len(index_files)}")
-        logger.info(f"Total chunks created: {total_chunks}")
+        logger.info(f"Indexes processed: {data['indexes_successful']}/{data['indexes_processed']}")
+        logger.info(f"Total chunks created: {data['total_chunks_created']}")
+        logger.info(f"Total files processed: {data['total_files_processed']}")
+        if data['total_files_failed'] > 0:
+            logger.warning(f"Total files failed: {data['total_files_failed']}")
 
     else:
         # Single index mode
@@ -530,8 +244,6 @@ def main():
             model_profile=args.model,
             force=args.force,
             chunk_size=args.chunk_size,
-            workers=args.workers,
-            batch_size=args.batch_size,
         )
 
         if not result["success"]:
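The vector_store.py change from item 3 of the commit message (a memory warning on brute-force search) is not shown in the diff above. A plausible shape, with the threshold, function name, and scoring all assumed for illustration:

```python
import logging

logger = logging.getLogger(__name__)

# Assumed cutoff for illustration; the real threshold is not shown in this diff.
BRUTE_FORCE_WARN_THRESHOLD = 50_000


def search_brute_force(query_vec, vectors, top_k=10):
    """Linear scan with a warning once the scan is large enough that holding
    every vector in memory becomes the dominant cost."""
    if len(vectors) > BRUTE_FORCE_WARN_THRESHOLD:
        logger.warning(
            "Brute-force search over %d vectors loads them all into memory; "
            "consider using the HNSW index instead.",
            len(vectors),
        )

    def dot(a, b):
        return sum(x * y for x, y in zip(a, b))

    def norm(a):
        return sum(x * x for x in a) ** 0.5

    # Cosine similarity, guarded against zero-length vectors.
    scored = [
        (dot(query_vec, v) / ((norm(query_vec) * norm(v)) or 1.0), i)
        for i, v in enumerate(vectors)
    ]
    return sorted(scored, reverse=True)[:top_k]


print(search_brute_force([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]]))
# [(1.0, 0), (0.0, 1)]
```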