mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-11 02:33:51 +08:00
fix: 修复嵌入生成内存泄漏,优化性能
- HNSW 索引:预分配从 100 万降至 5 万,添加动态扩容和可控保存 - Embedder:添加 embed_to_numpy() 避免 .tolist() 转换,增强缓存清理 - embedding_manager:每 10 批次重建 embedder 实例,显式 gc.collect() - VectorStore:添加 bulk_insert() 上下文管理器,支持 numpy 批量写入 - Chunker:添加 skip_token_count 轻量模式,使用 char/4 估算(~9x 加速) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,29 @@
|
||||
"""Code chunking strategies for semantic search."""
|
||||
"""Code chunking strategies for semantic search.
|
||||
|
||||
This module provides various chunking strategies for breaking down source code
|
||||
into semantic chunks suitable for embedding and search.
|
||||
|
||||
Lightweight Mode:
|
||||
The ChunkConfig supports a `skip_token_count` option for performance optimization.
|
||||
When enabled, token counting uses a fast character-based estimation (char/4)
|
||||
instead of expensive tiktoken encoding.
|
||||
|
||||
Use cases for lightweight mode:
|
||||
- Large-scale indexing where speed is critical
|
||||
- Scenarios where approximate token counts are acceptable
|
||||
- Memory-constrained environments
|
||||
- Initial prototyping and development
|
||||
|
||||
Example:
|
||||
# Default mode (accurate tiktoken encoding)
|
||||
config = ChunkConfig()
|
||||
chunker = Chunker(config)
|
||||
|
||||
# Lightweight mode (fast char/4 estimation)
|
||||
config = ChunkConfig(skip_token_count=True)
|
||||
chunker = Chunker(config)
|
||||
chunks = chunker.chunk_file(content, symbols, path, language)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -17,6 +42,7 @@ class ChunkConfig:
|
||||
overlap: int = 100 # Overlap for sliding window
|
||||
strategy: str = "auto" # Chunking strategy: auto, symbol, sliding_window, hybrid
|
||||
min_chunk_size: int = 50 # Minimum chunk size
|
||||
skip_token_count: bool = False # Skip expensive token counting (use char/4 estimate)
|
||||
|
||||
|
||||
class Chunker:
|
||||
@@ -26,6 +52,23 @@ class Chunker:
|
||||
self.config = config or ChunkConfig()
|
||||
self._tokenizer = get_default_tokenizer()
|
||||
|
||||
def _estimate_token_count(self, text: str) -> int:
|
||||
"""Estimate token count based on config.
|
||||
|
||||
If skip_token_count is True, uses character-based estimation (char/4).
|
||||
Otherwise, uses accurate tiktoken encoding.
|
||||
|
||||
Args:
|
||||
text: Text to count tokens for
|
||||
|
||||
Returns:
|
||||
Estimated token count
|
||||
"""
|
||||
if self.config.skip_token_count:
|
||||
# Fast character-based estimation: ~4 chars per token
|
||||
return max(1, len(text) // 4)
|
||||
return self._tokenizer.count_tokens(text)
|
||||
|
||||
def chunk_by_symbol(
|
||||
self,
|
||||
content: str,
|
||||
@@ -63,7 +106,7 @@ class Chunker:
|
||||
if symbol_token_counts and symbol.name in symbol_token_counts:
|
||||
token_count = symbol_token_counts[symbol.name]
|
||||
else:
|
||||
token_count = self._tokenizer.count_tokens(chunk_content)
|
||||
token_count = self._estimate_token_count(chunk_content)
|
||||
|
||||
chunks.append(SemanticChunk(
|
||||
content=chunk_content,
|
||||
@@ -122,7 +165,7 @@ class Chunker:
|
||||
chunk_content = "".join(lines[start:end])
|
||||
|
||||
if len(chunk_content.strip()) >= self.config.min_chunk_size:
|
||||
token_count = self._tokenizer.count_tokens(chunk_content)
|
||||
token_count = self._estimate_token_count(chunk_content)
|
||||
|
||||
# Calculate correct line numbers
|
||||
if line_mapping:
|
||||
@@ -346,14 +389,14 @@ class HybridChunker:
|
||||
symbol_token_counts: Optional dict mapping symbol names to token counts
|
||||
"""
|
||||
chunks: List[SemanticChunk] = []
|
||||
tokenizer = get_default_tokenizer()
|
||||
|
||||
# Step 1: Extract docstrings as dedicated chunks
|
||||
docstrings = self.docstring_extractor.extract_docstrings(content, language)
|
||||
|
||||
for docstring_content, start_line, end_line in docstrings:
|
||||
if len(docstring_content.strip()) >= self.config.min_chunk_size:
|
||||
token_count = tokenizer.count_tokens(docstring_content)
|
||||
# Use base chunker's token estimation method
|
||||
token_count = self.base_chunker._estimate_token_count(docstring_content)
|
||||
chunks.append(SemanticChunk(
|
||||
content=docstring_content,
|
||||
embedding=None,
|
||||
|
||||
Reference in New Issue
Block a user