Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-04 01:40:45 +08:00)
- Added a new Storage Manager component to handle storage statistics, project cleanup, and configuration for CCW centralized storage.
- Introduced functions to calculate directory sizes, get project storage stats, and clean specific or all storage.
- Enhanced SQLiteStore with a public API for executing queries securely.
- Updated tests to utilize the new execute_query method and validate storage management functionalities.
- Improved performance by implementing connection pooling with idle timeout management in SQLiteStore.
- Added new fields (token_count, symbol_type) to the symbols table and adjusted related insertions.
- Enhanced error handling and logging for storage operations.
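A minimal sketch of how the new `execute_query` surface might be called. Only the `execute_query` name and the `token_count`/`symbol_type` columns of the `symbols` table come from the commit message; the import path, constructor argument, and query text are assumptions for illustration, not confirmed CCW APIs:

```python
# Hypothetical usage sketch -- import path, constructor argument, and the
# exact query are assumed; only execute_query, token_count, and symbol_type
# are named in the commit message.
from codexlens.storage.sqlite_store import SQLiteStore  # assumed module path

store = SQLiteStore("~/.ccw/storage/index.db")  # assumed storage location
rows = store.execute_query(
    "SELECT symbol_type, token_count FROM symbols WHERE token_count > ?",
    (500,),  # parameterised query keeps execution injection-safe
)
for symbol_type, token_count in rows:
    print(symbol_type, token_count)
```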
163 lines · 4.6 KiB · Python
"""Tests for tokenizer module."""

import pytest

from codexlens.parsers.tokenizer import (
    Tokenizer,
    count_tokens,
    get_default_tokenizer,
)


class TestTokenizer:
    """Tests for Tokenizer class."""

    def test_empty_text(self):
        tokenizer = Tokenizer()
        assert tokenizer.count_tokens("") == 0

    def test_simple_text(self):
        tokenizer = Tokenizer()
        text = "Hello world"
        count = tokenizer.count_tokens(text)
        assert count > 0
        # Should be roughly text length / 4 for fallback
        assert count >= len(text) // 5

    def test_long_text(self):
        tokenizer = Tokenizer()
        text = "def hello():\n    pass\n" * 100
        count = tokenizer.count_tokens(text)
        assert count > 0
        # Verify it's proportional to length
        assert count >= len(text) // 5

    def test_code_text(self):
        tokenizer = Tokenizer()
        code = """
def calculate_fibonacci(n):
    if n <= 1:
        return n
    return calculate_fibonacci(n-1) + calculate_fibonacci(n-2)

class MathHelper:
    def factorial(self, n):
        if n <= 1:
            return 1
        return n * self.factorial(n - 1)
"""
        count = tokenizer.count_tokens(code)
        assert count > 0

    def test_unicode_text(self):
        tokenizer = Tokenizer()
        text = "你好世界 Hello World"
        count = tokenizer.count_tokens(text)
        assert count > 0

    def test_special_characters(self):
        tokenizer = Tokenizer()
        text = "!@#$%^&*()_+-=[]{}|;':\",./<>?"
        count = tokenizer.count_tokens(text)
        assert count > 0

    def test_is_using_tiktoken_check(self):
        tokenizer = Tokenizer()
        # Should return bool indicating if tiktoken is available
        result = tokenizer.is_using_tiktoken()
        assert isinstance(result, bool)


class TestTokenizerFallback:
    """Tests for character count fallback."""

    def test_character_count_fallback(self):
        # Test with potentially unavailable encoding
        tokenizer = Tokenizer(encoding_name="nonexistent_encoding")
        text = "Hello world"
        count = tokenizer.count_tokens(text)
        # Should fall back to character counting
        assert count == max(1, len(text) // 4)

    def test_fallback_minimum_count(self):
        tokenizer = Tokenizer(encoding_name="nonexistent_encoding")
        # Very short text should still return at least 1
        assert tokenizer.count_tokens("hi") >= 1


class TestGlobalTokenizer:
    """Tests for global tokenizer functions."""

    def test_get_default_tokenizer(self):
        tokenizer1 = get_default_tokenizer()
        tokenizer2 = get_default_tokenizer()
        # Should return the same instance
        assert tokenizer1 is tokenizer2

    def test_count_tokens_default(self):
        text = "Hello world"
        count = count_tokens(text)
        assert count > 0

    def test_count_tokens_custom_tokenizer(self):
        custom_tokenizer = Tokenizer()
        text = "Hello world"
        count = count_tokens(text, tokenizer=custom_tokenizer)
        assert count > 0


class TestTokenizerPerformance:
    """Performance-related tests."""

    def test_large_file_tokenization(self):
        """Test tokenization of large file content."""
        tokenizer = Tokenizer()
        # Simulate a ~1MB file: each repetition is ~126 chars (two lines),
        # so 8000 repetitions exceed 1 MB
        large_text = "def function_{}():\n    pass\n".format("x" * 100) * 8000
        assert len(large_text) > 1_000_000

        count = tokenizer.count_tokens(large_text)
        assert count > 0
        # Verify reasonable token count (at least 10k tokens for 1MB)
        # Note: Modern tokenizers compress repetitive content efficiently
        assert count >= 10000

    def test_multiple_tokenizations(self):
        """Test multiple tokenization calls."""
        tokenizer = Tokenizer()
        text = "def hello(): pass"

        # Multiple calls should return same result
        count1 = tokenizer.count_tokens(text)
        count2 = tokenizer.count_tokens(text)
        assert count1 == count2


class TestTokenizerEdgeCases:
    """Edge case tests."""

    def test_only_whitespace(self):
        tokenizer = Tokenizer()
        count = tokenizer.count_tokens(" \n\t ")
        assert count >= 0

    def test_very_long_line(self):
        tokenizer = Tokenizer()
        long_line = "a" * 10000
        count = tokenizer.count_tokens(long_line)
        assert count > 0

    def test_mixed_content(self):
        tokenizer = Tokenizer()
        mixed = """
# Comment
def func():
    '''Docstring'''
    pass

123.456
"string"
"""
        count = tokenizer.count_tokens(mixed)
        assert count > 0