"""Tests for token-aware chunking functionality.""" import pytest from codexlens.entities import SemanticChunk, Symbol from codexlens.semantic.chunker import ChunkConfig, Chunker, HybridChunker from codexlens.parsers.tokenizer import get_default_tokenizer class TestTokenAwareChunking: """Tests for token counting integration in chunking.""" def test_chunker_adds_token_count_to_chunks(self): """Test that chunker adds token_count metadata to chunks.""" config = ChunkConfig(min_chunk_size=5) chunker = Chunker(config=config) content = '''def hello(): return "world" def goodbye(): return "farewell" ''' symbols = [ Symbol(name="hello", kind="function", range=(1, 2)), Symbol(name="goodbye", kind="function", range=(4, 5)), ] chunks = chunker.chunk_file(content, symbols, "test.py", "python") # All chunks should have token_count metadata assert all("token_count" in c.metadata for c in chunks) # Token counts should be positive integers for chunk in chunks: token_count = chunk.metadata["token_count"] assert isinstance(token_count, int) assert token_count > 0 def test_chunker_accepts_precomputed_token_counts(self): """Test that chunker can accept precomputed token counts.""" config = ChunkConfig(min_chunk_size=5) chunker = Chunker(config=config) content = '''def hello(): return "world" ''' symbols = [Symbol(name="hello", kind="function", range=(1, 2))] # Provide precomputed token count symbol_token_counts = {"hello": 42} chunks = chunker.chunk_file(content, symbols, "test.py", "python", symbol_token_counts) assert len(chunks) == 1 assert chunks[0].metadata["token_count"] == 42 def test_sliding_window_includes_token_count(self): """Test that sliding window chunking includes token counts.""" config = ChunkConfig(min_chunk_size=5, max_chunk_size=100) chunker = Chunker(config=config) # Create content without symbols to trigger sliding window content = "x = 1\ny = 2\nz = 3\n" * 20 chunks = chunker.chunk_sliding_window(content, "test.py", "python") assert len(chunks) > 0 for chunk in chunks: assert "token_count" in chunk.metadata assert chunk.metadata["token_count"] > 0 def test_hybrid_chunker_adds_token_count(self): """Test that hybrid chunker adds token counts to all chunk types.""" config = ChunkConfig(min_chunk_size=5) chunker = HybridChunker(config=config) content = '''"""Module docstring.""" def hello(): """Function docstring.""" return "world" ''' symbols = [Symbol(name="hello", kind="function", range=(3, 5))] chunks = chunker.chunk_file(content, symbols, "test.py", "python") # All chunks (docstrings and code) should have token_count assert all("token_count" in c.metadata for c in chunks) docstring_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "docstring"] code_chunks = [c for c in chunks if c.metadata.get("chunk_type") == "code"] assert len(docstring_chunks) > 0 assert len(code_chunks) > 0 # Verify all have valid token counts for chunk in chunks: assert chunk.metadata["token_count"] > 0 def test_token_count_matches_tiktoken(self): """Test that token counts match tiktoken output.""" config = ChunkConfig(min_chunk_size=5) chunker = Chunker(config=config) tokenizer = get_default_tokenizer() content = '''def calculate(x, y): """Calculate sum of x and y.""" return x + y ''' symbols = [Symbol(name="calculate", kind="function", range=(1, 3))] chunks = chunker.chunk_file(content, symbols, "test.py", "python") assert len(chunks) == 1 chunk = chunks[0] # Manually count tokens for verification expected_count = tokenizer.count_tokens(chunk.content) assert chunk.metadata["token_count"] == 

    def test_token_count_fallback_to_calculation(self):
        """Test that token count is calculated when not precomputed."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)

        content = '''def test():
    pass
'''
        symbols = [Symbol(name="test", kind="function", range=(1, 2))]

        # Don't provide symbol_token_counts - should calculate automatically
        chunks = chunker.chunk_file(content, symbols, "test.py", "python")

        assert len(chunks) == 1
        assert "token_count" in chunks[0].metadata
        assert chunks[0].metadata["token_count"] > 0


class TestTokenCountPerformance:
    """Tests for token counting performance optimization."""

    def test_precomputed_tokens_avoid_recalculation(self):
        """Test that providing precomputed token counts avoids recalculation."""
        config = ChunkConfig(min_chunk_size=5)
        chunker = Chunker(config=config)
        tokenizer = get_default_tokenizer()

        # Create larger content: 100 two-line functions separated by blank lines
        lines = []
        for i in range(100):
            lines.append(f"def func{i}(x):\n")
            lines.append(f"    return x * {i}\n")
            lines.append("\n")
        content = "".join(lines)

        symbols = [
            Symbol(name=f"func{i}", kind="function", range=(1 + i * 3, 2 + i * 3))
            for i in range(100)
        ]

        # Precompute token counts from each symbol's source lines
        content_lines = content.splitlines(keepends=True)
        symbol_token_counts = {}
        for symbol in symbols:
            start_idx = symbol.range[0] - 1
            end_idx = symbol.range[1]
            chunk_content = "".join(content_lines[start_idx:end_idx])
            symbol_token_counts[symbol.name] = tokenizer.count_tokens(chunk_content)

        # Time with precomputed counts (average of 3 runs)
        precomputed_times = []
        for _ in range(3):
            start = time.perf_counter()
            chunker.chunk_file(
                content, symbols, "test.py", "python", symbol_token_counts
            )
            precomputed_times.append(time.perf_counter() - start)
        precomputed_time = sum(precomputed_times) / len(precomputed_times)

        # Time without precomputed counts (average of 3 runs)
        computed_times = []
        for _ in range(3):
            start = time.perf_counter()
            chunker.chunk_file(content, symbols, "test.py", "python")
            computed_times.append(time.perf_counter() - start)
        computed_time = sum(computed_times) / len(computed_times)

        # Precomputed should be at least 10% faster
        speedup = ((computed_time - precomputed_time) / computed_time) * 100
        assert speedup >= 10.0, (
            f"Speedup {speedup:.2f}% < 10% "
            f"(computed={computed_time:.4f}s, precomputed={precomputed_time:.4f}s)"
        )


class TestSymbolEntityTokenCount:
    """Tests for Symbol entity token_count field."""

    def test_symbol_with_token_count(self):
        """Test creating Symbol with token_count."""
        symbol = Symbol(
            name="test_func",
            kind="function",
            range=(1, 10),
            token_count=42,
        )
        assert symbol.token_count == 42

    def test_symbol_without_token_count(self):
        """Test creating Symbol without token_count (defaults to None)."""
        symbol = Symbol(
            name="test_func",
            kind="function",
            range=(1, 10),
        )
        assert symbol.token_count is None

    def test_symbol_with_symbol_type(self):
        """Test creating Symbol with symbol_type."""
        symbol = Symbol(
            name="TestClass",
            kind="class",
            range=(1, 20),
            symbol_type="class_definition",
        )
        assert symbol.symbol_type == "class_definition"

    def test_symbol_token_count_validation(self):
        """Test that negative token counts are rejected."""
        with pytest.raises(ValueError, match="token_count must be >= 0"):
            Symbol(
                name="test",
                kind="function",
                range=(1, 2),
                token_count=-1,
            )

    def test_symbol_zero_token_count(self):
        """Test that zero token count is allowed."""
        symbol = Symbol(
            name="empty",
            kind="function",
            range=(1, 1),
            token_count=0,
        )
        assert symbol.token_count == 0