"""Token counting utilities for CodexLens.
|
|
|
|
Provides accurate token counting using tiktoken with character count fallback.
|
|
"""
|
|
|
|
from __future__ import annotations

from typing import Optional

try:
    import tiktoken

    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False
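
# TIKTOKEN_AVAILABLE gates the optional dependency: when tiktoken is missing,
# Tokenizer silently degrades to the ~4-characters-per-token heuristic below.

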
class Tokenizer:
    """Token counter with tiktoken primary and character count fallback."""

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Initialize tokenizer.

        Args:
            encoding_name: Tiktoken encoding name (default: cl100k_base for GPT-4)
        """
        self._encoding: Optional[object] = None
        self._encoding_name = encoding_name

        if TIKTOKEN_AVAILABLE:
            try:
                self._encoding = tiktoken.get_encoding(encoding_name)
            except Exception:
                # Fall back to character counting if the encoding fails to load
                self._encoding = None

    def count_tokens(self, text: str) -> int:
        """Count tokens in text.

        Uses tiktoken if available, otherwise falls back to character count / 4.

        Args:
            text: Text to count tokens for

        Returns:
            Estimated token count
        """
        if not text:
            return 0

        if self._encoding is not None:
            try:
                return len(self._encoding.encode(text))  # type: ignore[attr-defined]
            except Exception:
                # Fall through to character count fallback
                pass

        # Fallback: rough estimate using character count
        # Average of ~4 characters per token for English text
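        # e.g. a 100-character sentence is estimated at 25 tokens; max(1, ...)
        # keeps non-empty text from rounding down to zero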
        return max(1, len(text) // 4)

    def is_using_tiktoken(self) -> bool:
        """Check if tiktoken is being used.

        Returns:
            True if tiktoken is available and initialized
        """
        return self._encoding is not None


# Global default tokenizer instance
_default_tokenizer: Optional[Tokenizer] = None


def get_default_tokenizer() -> Tokenizer:
    """Get the global default tokenizer instance.

    Returns:
        Shared Tokenizer instance
    """
    global _default_tokenizer
    if _default_tokenizer is None:
        _default_tokenizer = Tokenizer()
    return _default_tokenizer
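
# Note: the lazy initialization above is unsynchronized; threads racing on the
# first call could each construct a Tokenizer, but the duplicates are equivalent.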


def count_tokens(text: str, tokenizer: Optional[Tokenizer] = None) -> int:
    """Count tokens in text using default or provided tokenizer.

    Args:
        text: Text to count tokens for
        tokenizer: Optional tokenizer instance (uses default if None)

    Returns:
        Estimated token count
    """
    if tokenizer is None:
        tokenizer = get_default_tokenizer()
    return tokenizer.count_tokens(text)
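

# Illustrative usage sketch (an addition, not part of the original module):
# exercises both the tiktoken path and the character-count fallback; printed
# values depend on whether tiktoken is installed in this environment.
if __name__ == "__main__":
    tok = Tokenizer()
    sample = "CodexLens counts tokens with tiktoken when available."
    print(f"using tiktoken: {tok.is_using_tiktoken()}")
    print(f"tokens: {tok.count_tokens(sample)}")
    # The module-level helper reuses the shared default Tokenizer.
    print(f"helper tokens: {count_tokens(sample)}")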