Claude-Code-Workflow/codex-lens/tests/test_token_chunking.py
catlog22 (0fe16963cd): Add comprehensive tests for tokenizer, performance benchmarks, and TreeSitter parser functionality
- Implemented unit tests for the Tokenizer class, covering various text inputs, edge cases, and fallback mechanisms.
- Created performance benchmarks comparing tiktoken and pure Python implementations for token counting.
- Developed extensive tests for TreeSitterSymbolParser across Python, JavaScript, and TypeScript, ensuring accurate symbol extraction and parsing.
- Added configuration documentation for MCP integration and custom prompts, enhancing usability and flexibility.
- Introduced a refactor script for GraphAnalyzer to streamline future improvements.
2025-12-15 14:36:09 +08:00


"""Tests for token-aware chunking functionality."""
import pytest
from codexlens.entities import SemanticChunk, Symbol
from codexlens.semantic.chunker import ChunkConfig, Chunker, HybridChunker
from codexlens.parsers.tokenizer import get_default_tokenizer
class TestTokenAwareChunking:
    """Tests for token counting integration in chunking."""

    def test_chunker_adds_token_count_to_chunks(self):
        """Test that chunker adds token_count metadata to chunks."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)

        content = '''def hello():
    return "world"

def goodbye():
    return "farewell"
'''
        symbols = [
            Symbol(name="hello", kind="function", range=(1, 2)),
            Symbol(name="goodbye", kind="function", range=(4, 5)),
        ]

        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        # All chunks should have token_count metadata
        assert all("token_count" in c.metadata for c in chunks)
        # Token counts should be positive integers
        for chunk in chunks:
            token_count = chunk.metadata["token_count"]
            assert isinstance(token_count, int)
            assert token_count > 0

    def test_chunker_accepts_precomputed_token_counts(self):
        """Test that chunker can accept precomputed token counts."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)

        content = '''def hello():
    return "world"
'''
        symbols = [Symbol(name="hello", kind="function", range=(1, 2))]

        # Provide precomputed token count
        symbol_token_counts = {"hello": 42}
        chunks = chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts)

        assert len(chunks) == 1
        assert chunks[0].metadata["token_count"] == 42

    def test_sliding_window_includes_token_count(self):
        """Test that sliding window chunking includes token counts."""
        config = ChunkConfig(min_chunk_size=5, max_chunk_size=100)
        chunker = Chunker(config=config)

        # Create content without symbols to trigger sliding window
        content = "x = 1\ny = 2\nz = 3\n" * 20
        chunks = chunker.chunk_sliding_window(content, "test.py", "python")

        assert len(chunks) > 0
        for chunk in chunks:
            assert "token_count" in chunk.metadata
            assert chunk.metadata["token_count"] > 0

    def test_hybrid_chunker_adds_token_count(self):
        """Test that hybrid chunker adds token counts to all chunk types."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = HybridChunker(config=config)

        content = '''"""Module docstring."""

def hello():
    """Function docstring."""
    return "world"
'''
        symbols = [Symbol(name="hello", kind="function", range=(3, 5))]

        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        # All chunks (docstrings and code) should have token_count
        assert all("token_count" in c.metadata for c in chunks)

        docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"]
        code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"]
        assert len(docstring_chunks) > 0
        assert len(code_chunks) > 0

        # Verify all have valid token counts
        for chunk in chunks:
            assert chunk.metadata["token_count"] > 0

    def test_token_count_matches_tiktoken(self):
        """Test that token counts match tiktoken output."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)
        tokenizer = get_default_tokenizer()

        content = '''def calculate(x, y):
    """Calculate sum of x and y."""
    return x + y
'''
        symbols = [Symbol(name="calculate", kind="function", range=(1, 3))]

        chunks = chunker.chunk_file(content, symbols, "test.py", "python")
        assert len(chunks) == 1

        chunk = chunks[0]
        # Manually count tokens for verification
        expected_count = tokenizer.count_tokens(chunk.content)
        assert chunk.metadata["token_count"] == expected_count

    def test_token_count_fallback_to_calculation(self):
        """Test that token count is calculated when not precomputed."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)

        content = '''def test():
    pass
'''
        symbols = [Symbol(name="test", kind="function", range=(1, 2))]

        # Don't provide symbol_token_counts - should calculate automatically
        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        assert len(chunks) == 1
        assert "token_count" in chunks[0].metadata
        assert chunks[0].metadata["token_count"] > 0


class TestTokenCountPerformance:
    """Tests for token counting performance optimization."""

    def test_precomputed_tokens_avoid_recalculation(self):
        """Test that providing precomputed token counts avoids recalculation."""
        import time

        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)
        tokenizer = get_default_tokenizer()

        # Create larger content
        lines = []
        for i in range(100):
            lines.append(f'def func{i}(x):\n')
            lines.append(f' return x * {i}\n')
            lines.append('\n')
        content = "".join(lines)
        symbols = [
            Symbol(name=f"func{i}", kind="function", range=(1 + i * 3, 2 + i * 3))
            for i in range(100)
        ]

        # Precompute token counts
        symbol_token_counts = {}
        for symbol in symbols:
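            # Symbol ranges are 1-based and inclusive; convert to 0-based slice bounds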
            start_idx = symbol.range[0] - 1
            end_idx = symbol.range[1]
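            # keepends=True preserves newline characters, so the text being
            # counted matches the original content byte-for-byte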
chunk_content = "".join(content.splitlines(keepends=True)[start_idx:end_idx])
symbol_token_counts[symbol.name] = tokenizer.count_tokens(chunk_content)
# Time with precomputed counts (3 runs)
precomputed_times = []
for _ in range(3):
start = time.perf_counter()
chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts)
precomputed_times.append(time.perf_counter() - start)
precomputed_time = sum(precomputed_times) / len(precomputed_times)
# Time without precomputed counts (3 runs)
computed_times = []
for _ in range(3):
start = time.perf_counter()
chunker.chunk_file(content, symbols, "test.py", "python")
computed_times.append(time.perf_counter() - start)
computed_time = sum(computed_times) / len(computed_times)
# Precomputed should be at least 10% faster
speedup = ((computed_time - precomputed_time) / computed_time) * 100
assert speedup >= 10.0, f"Speedup {speedup:.2f}% < 10% (computed={computed_time:.4f}s, precomputed={precomputed_time:.4f}s)"


class TestSymbolEntityTokenCount:
    """Tests for Symbol entity token_count field."""

    def test_symbol_with_token_count(self):
        """Test creating Symbol with token_count."""
        symbol = Symbol(
            name="test_func",
            kind="function",
            range=(1, 10),
            token_count=42,
        )
        assert symbol.token_count == 42

    def test_symbol_without_token_count(self):
        """Test creating Symbol without token_count (defaults to None)."""
        symbol = Symbol(
            name="test_func",
            kind="function",
            range=(1, 10),
        )
        assert symbol.token_count is None

    def test_symbol_with_symbol_type(self):
        """Test creating Symbol with symbol_type."""
        symbol = Symbol(
            name="TestClass",
            kind="class",
            range=(1, 20),
            symbol_type="class_definition",
        )
        assert symbol.symbol_type == "class_definition"

    def test_symbol_token_count_validation(self):
        """Test that negative token counts are rejected."""
        with pytest.raises(ValueError, match="token_count must be >= 0"):
            Symbol(
                name="test",
                kind="function",
                range=(1, 2),
                token_count=-1,
            )

    def test_symbol_zero_token_count(self):
        """Test that zero token count is allowed."""
        symbol = Symbol(
            name="empty",
            kind="function",
            range=(1, 1),
            token_count=0,
        )
        assert symbol.token_count == 0