Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-05 01:50:27 +08:00
Main changes:
1. Remove graph indexing
   - Delete graph_analyzer.py and the related migration files
   - Remove the CLI graph command and the --enrich flag
   - Remove the graph query methods from chain_search.py (370 lines)
   - Delete the related test files
2. Fix memory issues in embedding generation
   - Refactor generate_embeddings.py to use streaming batch processing
   - Switch to embedding_manager's memory-safe implementation
   - File trimmed from 548 lines to 259 lines (52.7% reduction)
3. Fix memory leaks
   - chain_search.py: quick_search manages ChainSearchEngine with a with statement
   - embedding_manager.py: manage VectorStore with a with statement
   - vector_store.py: add a memory warning for brute-force search
4. Code cleanup
   - Remove the token_count and symbol_type fields from the Symbol model
   - Clean up the related test cases

Tests: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
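The memory-leak fixes in item 3 all follow the same pattern: hold the heavyweight object inside a with block so it is released deterministically when the work finishes, rather than waiting for garbage collection. Below is a minimal sketch of that pattern; ChainSearchEngine and quick_search are names taken from the commit message, but their signatures and internals here are assumptions for illustration, not the project's actual API.

from contextlib import AbstractContextManager


class ChainSearchEngine(AbstractContextManager):
    """Illustrative stand-in: an engine that holds a large in-memory index."""

    def __init__(self, index_path: str) -> None:
        self.index_path = index_path
        self._index = object()  # placeholder for a heavyweight resource

    def search(self, query: str) -> list[str]:
        return [f"result for {query!r} from {self.index_path}"]

    def close(self) -> None:
        # Drop the reference so the memory can actually be reclaimed.
        self._index = None

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()


def quick_search(query: str, index_path: str = "index.db") -> list[str]:
    # The with block guarantees close() runs even if search() raises,
    # which is the shape of the leak fix described in the commit message.
    with ChainSearchEngine(index_path) as engine:
        return engine.search(query)

The same pattern would apply to the VectorStore usage in embedding_manager.py: acquire the store, use it inside the with block, and let __exit__ release it.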
191 lines
6.8 KiB
Python
"""Tests for token-aware chunking functionality."""

import pytest

from codexlens.entities import SemanticChunk, Symbol
from codexlens.semantic.chunker import ChunkConfig, Chunker, HybridChunker
from codexlens.parsers.tokenizer import get_default_tokenizer


class TestTokenAwareChunking:
    """Tests for token counting integration in chunking."""

    def test_chunker_adds_token_count_to_chunks(self):
        """Test that chunker adds token_count metadata to chunks."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)

        content = '''def hello():
    return "world"

def goodbye():
    return "farewell"
'''
        symbols = [
            Symbol(name="hello", kind="function", range=(1, 2)),
            Symbol(name="goodbye", kind="function", range=(4, 5)),
        ]

        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        # All chunks should have token_count metadata
        assert all("token_count" in c.metadata for c in chunks)

        # Token counts should be positive integers
        for chunk in chunks:
            token_count = chunk.metadata["token_count"]
            assert isinstance(token_count, int)
            assert token_count > 0

    def test_chunker_accepts_precomputed_token_counts(self):
        """Test that chunker can accept precomputed token counts."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)

        content = '''def hello():
    return "world"
'''
        symbols = [Symbol(name="hello", kind="function", range=(1, 2))]

        # Provide precomputed token count
        symbol_token_counts = {"hello": 42}

        chunks = chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts)

        assert len(chunks) == 1
        assert chunks[0].metadata["token_count"] == 42

    def test_sliding_window_includes_token_count(self):
        """Test that sliding window chunking includes token counts."""
        config = ChunkConfig(min_chunk_size=5, max_chunk_size=100)
        chunker = Chunker(config=config)

        # Create content without symbols to trigger sliding window
        content = "x = 1\ny = 2\nz = 3\n" * 20

        chunks = chunker.chunk_sliding_window(content, "test.py", "python")

        assert len(chunks) > 0
        for chunk in chunks:
            assert "token_count" in chunk.metadata
            assert chunk.metadata["token_count"] > 0

    def test_hybrid_chunker_adds_token_count(self):
        """Test that hybrid chunker adds token counts to all chunk types."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = HybridChunker(config=config)

        content = '''"""Module docstring."""

def hello():
    """Function docstring."""
    return "world"
'''
        symbols = [Symbol(name="hello", kind="function", range=(3, 5))]

        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        # All chunks (docstrings and code) should have token_count
        assert all("token_count" in c.metadata for c in chunks)

        docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
        code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"]

        assert len(docstring_chunks) > 0
        assert len(code_chunks) > 0

        # Verify all have valid token counts
        for chunk in chunks:
            assert chunk.metadata["token_count"] > 0

    def test_token_count_matches_tiktoken(self):
        """Test that token counts match tiktoken output."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)
        tokenizer = get_default_tokenizer()

        content = '''def calculate(x, y):
    """Calculate sum of x and y."""
    return x + y
'''
        symbols = [Symbol(name="calculate", kind="function", range=(1, 3))]

        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        assert len(chunks) == 1
        chunk = chunks[0]

        # Manually count tokens for verification
        expected_count = tokenizer.count_tokens(chunk.content)
        assert chunk.metadata["token_count"] == expected_count

    def test_token_count_fallback_to_calculation(self):
        """Test that token count is calculated when not precomputed."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)

        content = '''def test():
    pass
'''
        symbols = [Symbol(name="test", kind="function", range=(1, 2))]

        # Don't provide symbol_token_counts - should calculate automatically
        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        assert len(chunks) == 1
        assert "token_count" in chunks[0].metadata
        assert chunks[0].metadata["token_count"] > 0


class TestTokenCountPerformance:
    """Tests for token counting performance optimization."""

    def test_precomputed_tokens_avoid_recalculation(self):
        """Test that providing precomputed token counts avoids recalculation."""
        import time

        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)
        tokenizer = get_default_tokenizer()

        # Create larger content
        lines = []
        for i in range(100):
            lines.append(f'def func{i}(x):\n')
            lines.append(f'    return x * {i}\n')
            lines.append('\n')
        content = "".join(lines)

        symbols = [
            Symbol(name=f"func{i}", kind="function", range=(1 + i*3, 2 + i*3))
            for i in range(100)
        ]

        # Precompute token counts
        symbol_token_counts = {}
        for symbol in symbols:
            start_idx = symbol.range[0] - 1
            end_idx = symbol.range[1]
            chunk_content = "".join(content.splitlines(keepends=True)[start_idx:end_idx])
            symbol_token_counts[symbol.name] = tokenizer.count_tokens(chunk_content)

        # Time with precomputed counts (3 runs)
        precomputed_times = []
        for _ in range(3):
            start = time.perf_counter()
            chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts)
            precomputed_times.append(time.perf_counter() - start)
        precomputed_time = sum(precomputed_times) / len(precomputed_times)

        # Time without precomputed counts (3 runs)
        computed_times = []
        for _ in range(3):
            start = time.perf_counter()
            chunker.chunk_file(content, symbols, "test.py", "python")
            computed_times.append(time.perf_counter() - start)
        computed_time = sum(computed_times) / len(computed_times)

        # Precomputed should be at least 10% faster
        speedup = ((computed_time - precomputed_time) / computed_time) * 100
        assert speedup >= 10.0, f"Speedup {speedup:.2f}% < 10% (computed={computed_time:.4f}s, precomputed={precomputed_time:.4f}s)"