Claude-Code-Workflow/codex-lens/tests/test_token_chunking.py
catlog22 3e9a309079 refactor: remove graph indexing, fix memory leaks, optimize embedding generation
Main changes:

1. Remove graph indexing
   - Delete graph_analyzer.py and its related migration files
   - Remove the CLI graph command and the --enrich flag
   - Clean up the graph-query methods in chain_search.py (370 lines)
   - Delete the related test files

2. Fix embedding-generation memory usage
   - Refactor generate_embeddings.py to use streaming batch processing (see the first sketch after this list)
   - Switch to embedding_manager's memory-safe implementation
   - Trim the file from 548 lines to 259 lines (a 52.7% reduction)

3. Fix memory leaks
   - chain_search.py: quick_search now manages ChainSearchEngine with a with statement (see the second sketch after this list)
   - embedding_manager.py: manage VectorStore with a with statement
   - vector_store.py: add a memory warning for brute-force search

4. Code cleanup
   - Remove the token_count and symbol_type fields from the Symbol model
   - Clean up the related test cases
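
A minimal sketch of the streaming-batch idea behind the generate_embeddings.py refactor. This is an assumed shape, not the real API: batched, embed_fn, and the default batch size are illustrative. The point is that chunks are consumed lazily in fixed-size batches, so peak memory stays proportional to one batch rather than the whole corpus:

from itertools import islice
from typing import Callable, Iterable, Iterator

def batched(items: Iterable[str], size: int) -> Iterator[list[str]]:
    # Pull at most `size` items at a time from the underlying iterator.
    it = iter(items)
    while batch := list(islice(it, size)):
        yield batch

def generate_embeddings_streaming(
    chunks: Iterable[str],
    embed_fn: Callable[[list[str]], list[list[float]]],
    batch_size: int = 64,
) -> Iterator[tuple[str, list[float]]]:
    # Each batch is embedded and yielded immediately; nothing accumulates,
    # so peak memory is O(batch_size) instead of O(total chunks).
    for batch in batched(chunks, batch_size):
        yield from zip(batch, embed_fn(batch))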
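
And a sketch of the with-statement fix for quick_search. The class body here is a stand-in that only illustrates the context-manager protocol; ChainSearchEngine's real interface differs:

class ChainSearchEngine:
    # Stand-in: the real engine holds DB handles and a vector store.
    def __init__(self, index_path: str) -> None:
        self.index_path = index_path
        self._open = True

    def __enter__(self) -> "ChainSearchEngine":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self._open = False  # release resources even if search() raised

    def search(self, query: str) -> list:
        return []

def quick_search(query: str, index_path: str) -> list:
    # Binding the engine to a with block guarantees __exit__ runs on
    # every code path, which is what plugs the leak.
    with ChainSearchEngine(index_path) as engine:
        return engine.search(query)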

Tests: 760 passed, 7 skipped

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 16:22:03 +08:00

191 lines · 6.8 KiB · Python

"""Tests for token-aware chunking functionality."""
import pytest
from codexlens.entities import SemanticChunk, Symbol
from codexlens.semantic.chunker import ChunkConfig, Chunker, HybridChunker
from codexlens.parsers.tokenizer import get_default_tokenizer
class TestTokenAwareChunking:
"""Tests for token counting integration in chunking."""
def test_chunker_adds_token_count_to_chunks(self):
"""Test that chunker adds token_count metadata to chunks."""
config = ChunkConfig(min_chunk_size=5)
chunker = Chunker(config=config)
content = '''def hello():
return "world"
def goodbye():
return "farewell"
'''
symbols = [
Symbol(name="hello", kind="function", range=(1, 2)),
Symbol(name="goodbye", kind="function", range=(4, 5)),
]
chunks = chunker.chunk_file(content, symbols, "test.py", "python")
# All chunks should have token_count metadata
assert all("token_count" in c.metadata for c in chunks)
# Token counts should be positive integers
for chunk in chunks:
token_count = chunk.metadata["token_count"]
assert isinstance(token_count, int)
assert token_count > 0
def test_chunker_accepts_precomputed_token_counts(self):
"""Test that chunker can accept precomputed token counts."""
config = ChunkConfig(min_chunk_size=5)
chunker = Chunker(config=config)
content = '''def hello():
return "world"
'''
symbols = [Symbol(name="hello", kind="function", range=(1, 2))]
# Provide precomputed token count
symbol_token_counts = {"hello": 42}
chunks = chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts)
assert len(chunks) == 1
assert chunks[0].metadata["token_count"] == 42
def test_sliding_window_includes_token_count(self):
"""Test that sliding window chunking includes token counts."""
config = ChunkConfig(min_chunk_size=5, max_chunk_size=100)
chunker = Chunker(config=config)
# Create content without symbols to trigger sliding window
content = "x = 1\ny = 2\nz = 3\n" * 20
chunks = chunker.chunk_sliding_window(content, "test.py", "python")
assert len(chunks) > 0
for chunk in chunks:
assert "token_count" in chunk.metadata
assert chunk.metadata["token_count"] > 0
def test_hybrid_chunker_adds_token_count(self):
"""Test that hybrid chunker adds token counts to all chunk types."""
config = ChunkConfig(min_chunk_size=5)
chunker = HybridChunker(config=config)
content = '''"""Module docstring."""
def hello():
"""Function docstring."""
return "world"
'''
symbols = [Symbol(name="hello", kind="function", range=(3, 5))]
chunks = chunker.chunk_file(content, symbols, "test.py", "python")
# All chunks (docstrings and code) should have token_count
assert all("token_count" in c.metadata for c in chunks)
docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"]
assert len(docstring_chunks) > 0
assert len(code_chunks) > 0
# Verify all have valid token counts
for chunk in chunks:
assert chunk.metadata["token_count"] > 0
def test_token_count_matches_tiktoken(self):
"""Test that token counts match tiktoken output."""
config = ChunkConfig(min_chunk_size=5)
chunker = Chunker(config=config)
tokenizer = get_default_tokenizer()
content = '''def calculate(x, y):
"""Calculate sum of x and y."""
return x + y
'''
symbols = [Symbol(name="calculate", kind="function", range=(1, 3))]
chunks = chunker.chunk_file(content, symbols, "test.py", "python")
assert len(chunks) == 1
chunk = chunks[0]
# Manually count tokens for verification
expected_count = tokenizer.count_tokens(chunk.content)
assert chunk.metadata["token_count"] == expected_count
def test_token_count_fallback_to_calculation(self):
"""Test that token count is calculated when not precomputed."""
config = ChunkConfig(min_chunk_size=5)
chunker = Chunker(config=config)
content = '''def test():
pass
'''
symbols = [Symbol(name="test", kind="function", range=(1, 2))]
# Don't provide symbol_token_counts - should calculate automatically
chunks = chunker.chunk_file(content, symbols, "test.py", "python")
assert len(chunks) == 1
assert "token_count" in chunks[0].metadata
assert chunks[0].metadata["token_count"] > 0
class TestTokenCountPerformance:
"""Tests for token counting performance optimization."""
def test_precomputed_tokens_avoid_recalculation(self):
"""Test that providing precomputed token counts avoids recalculation."""
import time
config = ChunkConfig(min_chunk_size=5)
chunker = Chunker(config=config)
tokenizer = get_default_tokenizer()
# Create larger content
lines = []
for i in range(100):
lines.append(f'def func{i}(x):\n')
lines.append(f' return x * {i}\n')
lines.append('\n')
content = "".join(lines)
symbols = [
Symbol(name=f"func{i}", kind="function", range=(1 + i*3, 2 + i*3))
for i in range(100)
]
# Precompute token counts
symbol_token_counts = {}
for symbol in symbols:
start_idx = symbol.range[0] - 1
end_idx = symbol.range[1]
chunk_content = "".join(content.splitlines(keepends=True)[start_idx:end_idx])
symbol_token_counts[symbol.name] = tokenizer.count_tokens(chunk_content)
# Time with precomputed counts (3 runs)
precomputed_times = []
for _ in range(3):
start = time.perf_counter()
chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts)
precomputed_times.append(time.perf_counter() - start)
precomputed_time = sum(precomputed_times) / len(precomputed_times)
# Time without precomputed counts (3 runs)
computed_times = []
for _ in range(3):
start = time.perf_counter()
chunker.chunk_file(content, symbols, "test.py", "python")
computed_times.append(time.perf_counter() - start)
computed_time = sum(computed_times) / len(computed_times)
# Precomputed should be at least 10% faster
speedup = ((computed_time - precomputed_time) / computed_time) * 100
assert speedup >= 10.0, f"Speedup {speedup:.2f}% < 10% (computed={computed_time:.4f}s, precomputed={precomputed_time:.4f}s)"