fix: 修复嵌入生成内存泄漏，优化性能

- HNSW 索引：预分配从 100 万降至 5 万，添加动态扩容和可控保存 - Embedder：添加 embed_to_numpy() 避免 .tolist() 转换，增强缓存清理 - embedding_manager：每 10 批次重建 embedder 实例，显式 gc.collect() - VectorStore：添加 bulk_insert() 上下文管理器，支持 numpy 批量写入 - Chunker：添加 skip_token_count 轻量模式，使用 char/4 估算（~9x 加速） 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-11 02:33:51 +08:00 · 2025-12-21 19:15:47 +08:00
parent 45f92fe066
commit 5849f751bc
5 changed files with 420 additions and 34 deletions
--- a/codex-lens/src/codexlens/semantic/chunker.py
+++ b/codex-lens/src/codexlens/semantic/chunker.py
@@ -1,4 +1,29 @@
-"""Code chunking strategies for semantic search."""
+"""Code chunking strategies for semantic search.
+
+This module provides various chunking strategies for breaking down source code
+into semantic chunks suitable for embedding and search.
+
+Lightweight Mode:
+    The ChunkConfig supports a `skip_token_count` option for performance optimization.
+    When enabled, token counting uses a fast character-based estimation (char/4)
+    instead of expensive tiktoken encoding.
+
+    Use cases for lightweight mode:
+    - Large-scale indexing where speed is critical
+    - Scenarios where approximate token counts are acceptable
+    - Memory-constrained environments
+    - Initial prototyping and development
+
+    Example:
+        # Default mode (accurate tiktoken encoding)
+        config = ChunkConfig()
+        chunker = Chunker(config)
+
+        # Lightweight mode (fast char/4 estimation)
+        config = ChunkConfig(skip_token_count=True)
+        chunker = Chunker(config)
+        chunks = chunker.chunk_file(content, symbols, path, language)
+"""

 from __future__ import annotations

@@ -17,6 +42,7 @@ class ChunkConfig:
    overlap: int = 100  # Overlap for sliding window
    strategy: str = "auto"  # Chunking strategy: auto, symbol, sliding_window, hybrid
    min_chunk_size: int = 50  # Minimum chunk size
+    skip_token_count: bool = False  # Skip expensive token counting (use char/4 estimate)


 class Chunker:
@@ -26,6 +52,23 @@ class Chunker:
        self.config = config or ChunkConfig()
        self._tokenizer = get_default_tokenizer()

+    def _estimate_token_count(self, text: str) -> int:
+        """Estimate token count based on config.
+
+        If skip_token_count is True, uses character-based estimation (char/4).
+        Otherwise, uses accurate tiktoken encoding.
+
+        Args:
+            text: Text to count tokens for
+
+        Returns:
+            Estimated token count
+        """
+        if self.config.skip_token_count:
+            # Fast character-based estimation: ~4 chars per token
+            return max(1, len(text) // 4)
+        return self._tokenizer.count_tokens(text)
+
    def chunk_by_symbol(
        self,
        content: str,
@@ -63,7 +106,7 @@ class Chunker:
            if symbol_token_counts and symbol.name in symbol_token_counts:
                token_count = symbol_token_counts[symbol.name]
            else:
-                token_count = self._tokenizer.count_tokens(chunk_content)
+                token_count = self._estimate_token_count(chunk_content)

            chunks.append(SemanticChunk(
                content=chunk_content,
@@ -122,7 +165,7 @@ class Chunker:
            chunk_content = "".join(lines[start:end])

            if len(chunk_content.strip()) >= self.config.min_chunk_size:
-                token_count = self._tokenizer.count_tokens(chunk_content)
+                token_count = self._estimate_token_count(chunk_content)

                # Calculate correct line numbers
                if line_mapping:
@@ -346,14 +389,14 @@ class HybridChunker:
            symbol_token_counts: Optional dict mapping symbol names to token counts
        """
        chunks: List[SemanticChunk] = []
-        tokenizer = get_default_tokenizer()

        # Step 1: Extract docstrings as dedicated chunks
        docstrings = self.docstring_extractor.extract_docstrings(content, language)

        for docstring_content, start_line, end_line in docstrings:
            if len(docstring_content.strip()) >= self.config.min_chunk_size:
-                token_count = tokenizer.count_tokens(docstring_content)
+                # Use base chunker's token estimation method
+                token_count = self.base_chunker._estimate_token_count(docstring_content)
                chunks.append(SemanticChunk(
                    content=docstring_content,
                    embedding=None,