Claude-Code-Workflow/codex-lens/tests/test_tokenizer.py
catlog22 97640a517a feat(storage): implement storage manager for centralized management and cleanup
- Added a new Storage Manager component to handle storage statistics, project cleanup, and configuration for CCW centralized storage.
- Introduced functions to calculate directory sizes, get project storage stats, and clean specific or all storage.
- Enhanced SQLiteStore with a public API for executing queries securely (see the sketch after this summary).
- Updated tests to utilize the new execute_query method and validate storage management functionalities.
- Improved performance by implementing connection pooling with idle timeout management in SQLiteStore.
- Added new fields (token_count, symbol_type) to the symbols table and adjusted related insertions.
- Enhanced error handling and logging for storage operations.
2025-12-15 17:39:38 +08:00
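
The summary above names a public execute_query API and connection pooling with idle-timeout management but shows neither. Below is a minimal sketch of how the two pieces could fit together; only the names SQLiteStore and execute_query come from the commit message, while the signature, the pool structure, and the timeout default are assumptions for illustration.

import sqlite3
import threading
import time
from typing import Any, Iterable, List, Tuple


class SQLiteStore:
    """Sketch only: pooling details, signature, and defaults are assumptions."""

    def __init__(self, db_path: str, idle_timeout: float = 60.0) -> None:
        self._db_path = db_path
        self._idle_timeout = idle_timeout
        self._pool: List[Tuple[sqlite3.Connection, float]] = []
        self._lock = threading.Lock()

    def _acquire(self) -> sqlite3.Connection:
        # Reuse a pooled connection unless it has sat idle past the timeout;
        # expired connections are closed and discarded.
        with self._lock:
            while self._pool:
                conn, last_used = self._pool.pop()
                if time.monotonic() - last_used < self._idle_timeout:
                    return conn
                conn.close()
        # check_same_thread=False allows pooled connections to be reused
        # across threads.
        return sqlite3.connect(self._db_path, check_same_thread=False)

    def _release(self, conn: sqlite3.Connection) -> None:
        with self._lock:
            self._pool.append((conn, time.monotonic()))

    def execute_query(self, sql: str, params: Iterable[Any] = ()) -> List[tuple]:
        """Run a parameterized query; placeholders keep values out of the SQL text."""
        conn = self._acquire()
        try:
            rows = conn.execute(sql, tuple(params)).fetchall()
            conn.commit()
            return rows
        finally:
            self._release(conn)

With such an API, a caller would write store.execute_query("SELECT name FROM symbols WHERE symbol_type = ?", ("function",)) instead of handling connections directly, which is what the "tests utilize the new execute_query method" bullet implies.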

163 lines
4.6 KiB
Python

"""Tests for tokenizer module."""
import pytest
from codexlens.parsers.tokenizer import (
Tokenizer,
count_tokens,
get_default_tokenizer,
)


class TestTokenizer:
    """Tests for Tokenizer class."""

    def test_empty_text(self):
        tokenizer = Tokenizer()
        assert tokenizer.count_tokens("") == 0

    def test_simple_text(self):
        tokenizer = Tokenizer()
        text = "Hello world"
        count = tokenizer.count_tokens(text)
        assert count > 0
        # The fallback estimates ~1 token per 4 characters, so any backend
        # should report at least len(text) // 5 tokens.
        assert count >= len(text) // 5

    def test_long_text(self):
        tokenizer = Tokenizer()
        text = "def hello():\n    pass\n" * 100
        count = tokenizer.count_tokens(text)
        assert count > 0
        # The count should grow roughly in proportion to text length.
        assert count >= len(text) // 5

    def test_code_text(self):
        tokenizer = Tokenizer()
        code = """
def calculate_fibonacci(n):
    if n <= 1:
        return n
    return calculate_fibonacci(n - 1) + calculate_fibonacci(n - 2)


class MathHelper:
    def factorial(self, n):
        if n <= 1:
            return 1
        return n * self.factorial(n - 1)
"""
        count = tokenizer.count_tokens(code)
        assert count > 0

    def test_unicode_text(self):
        tokenizer = Tokenizer()
        text = "你好世界 Hello World"
        count = tokenizer.count_tokens(text)
        assert count > 0

    def test_special_characters(self):
        tokenizer = Tokenizer()
        text = "!@#$%^&*()_+-=[]{}|;':\",./<>?"
        count = tokenizer.count_tokens(text)
        assert count > 0

    def test_is_using_tiktoken_check(self):
        tokenizer = Tokenizer()
        # Should return bool indicating if tiktoken is available
        result = tokenizer.is_using_tiktoken()
        assert isinstance(result, bool)


class TestTokenizerFallback:
    """Tests for the character-count fallback."""

    def test_character_count_fallback(self):
        # An unknown encoding name forces the character-count fallback.
        tokenizer = Tokenizer(encoding_name="nonexistent_encoding")
        text = "Hello world"
        count = tokenizer.count_tokens(text)
        # The fallback estimates ~4 characters per token, floored at 1.
        assert count == max(1, len(text) // 4)

    def test_fallback_minimum_count(self):
        tokenizer = Tokenizer(encoding_name="nonexistent_encoding")
        # Very short text should still count as at least one token.
        assert tokenizer.count_tokens("hi") >= 1


class TestGlobalTokenizer:
    """Tests for global tokenizer functions."""

    def test_get_default_tokenizer(self):
        tokenizer1 = get_default_tokenizer()
        tokenizer2 = get_default_tokenizer()
        # Should return the same instance
        assert tokenizer1 is tokenizer2

    def test_count_tokens_default(self):
        text = "Hello world"
        count = count_tokens(text)
        assert count > 0

    def test_count_tokens_custom_tokenizer(self):
        custom_tokenizer = Tokenizer()
        text = "Hello world"
        count = count_tokens(text, tokenizer=custom_tokenizer)
        assert count > 0
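

# test_get_default_tokenizer pins down a module-level singleton. A typical
# implementation would cache one instance (an assumption; the actual module
# is not shown here):
#
#     _default_tokenizer = None
#
#     def get_default_tokenizer():
#         global _default_tokenizer
#         if _default_tokenizer is None:
#             _default_tokenizer = Tokenizer()
#         return _default_tokenizer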


class TestTokenizerPerformance:
    """Performance-related tests."""

    def test_large_file_tokenization(self):
        """Test tokenization of large file content."""
        tokenizer = Tokenizer()
        # Simulate a ~1 MB file: each repeated snippet is ~126 chars,
        # so 8000 repetitions exceed 1,000,000 characters.
        large_text = "def function_{}():\n    pass\n".format("x" * 100) * 8000
        assert len(large_text) > 1_000_000
        count = tokenizer.count_tokens(large_text)
        assert count > 0
        # Expect at least 10k tokens for ~1 MB of text; the bound stays loose
        # because modern tokenizers compress repetitive content efficiently.
        assert count >= 10000

    def test_multiple_tokenizations(self):
        """Test multiple tokenization calls."""
        tokenizer = Tokenizer()
        text = "def hello(): pass"
        # Repeated calls on the same input must return the same count.
        count1 = tokenizer.count_tokens(text)
        count2 = tokenizer.count_tokens(text)
        assert count1 == count2


class TestTokenizerEdgeCases:
    """Edge case tests."""

    def test_only_whitespace(self):
        tokenizer = Tokenizer()
        count = tokenizer.count_tokens(" \n\t ")
        assert count >= 0

    def test_very_long_line(self):
        tokenizer = Tokenizer()
        long_line = "a" * 10000
        count = tokenizer.count_tokens(long_line)
        assert count > 0

    def test_mixed_content(self):
        tokenizer = Tokenizer()
        mixed = """
# Comment
def func():
    '''Docstring'''
    pass

123.456
"string"
"""
        count = tokenizer.count_tokens(mixed)
        assert count > 0