mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-13 02:41:50 +08:00
Refactor code structure and remove redundant changes
This commit is contained in:
98
codex-lens/build/lib/codexlens/parsers/tokenizer.py
Normal file
98
codex-lens/build/lib/codexlens/parsers/tokenizer.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""Token counting utilities for CodexLens.
|
||||
|
||||
Provides accurate token counting using tiktoken with character count fallback.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
# Optional dependency: prefer tiktoken for exact token counts; when it is
# not installed, Tokenizer falls back to a character-based estimate.
try:
    import tiktoken

    TIKTOKEN_AVAILABLE = True
except ImportError:
    TIKTOKEN_AVAILABLE = False
|
||||
|
||||
|
||||
class Tokenizer:
    """Token counter backed by tiktoken, with a character-based fallback."""

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Set up the tokenizer.

        Args:
            encoding_name: Tiktoken encoding name (default: cl100k_base for GPT-4)
        """
        self._encoding_name = encoding_name
        self._encoding: Optional[object] = None

        if TIKTOKEN_AVAILABLE:
            try:
                self._encoding = tiktoken.get_encoding(encoding_name)
            except Exception:
                # Best effort: if the requested encoding cannot be loaded,
                # silently degrade to the character-count estimate.
                self._encoding = None

    def count_tokens(self, text: str) -> int:
        """Count tokens in text.

        Uses tiktoken when available; otherwise estimates one token per
        four characters.

        Args:
            text: Text to count tokens for

        Returns:
            Estimated token count
        """
        if not text:
            return 0

        encoder = self._encoding
        if encoder is not None:
            try:
                return len(encoder.encode(text))  # type: ignore[attr-defined]
            except Exception:
                # Encoding failed at runtime; use the estimate below instead.
                pass

        # Rough heuristic: English text averages ~4 characters per token.
        return max(1, len(text) // 4)

    def is_using_tiktoken(self) -> bool:
        """Report whether tiktoken backs this tokenizer.

        Returns:
            True if tiktoken is available and initialized
        """
        return self._encoding is not None
|
||||
|
||||
|
||||
# Process-wide shared Tokenizer; created lazily by get_default_tokenizer().
_default_tokenizer: Optional[Tokenizer] = None
|
||||
|
||||
|
||||
def get_default_tokenizer() -> Tokenizer:
    """Get the global default tokenizer instance.

    Creates the shared instance on first use and reuses it afterwards.

    Returns:
        Shared Tokenizer instance
    """
    global _default_tokenizer
    tok = _default_tokenizer
    if tok is None:
        tok = _default_tokenizer = Tokenizer()
    return tok
|
||||
|
||||
|
||||
def count_tokens(text: str, tokenizer: Optional[Tokenizer] = None) -> int:
    """Count tokens in text using default or provided tokenizer.

    Args:
        text: Text to count tokens for
        tokenizer: Optional tokenizer instance (uses default if None)

    Returns:
        Estimated token count
    """
    active = tokenizer if tokenizer is not None else get_default_tokenizer()
    return active.count_tokens(text)
|
||||
Reference in New Issue
Block a user