mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
203 lines
6.5 KiB
Python
203 lines
6.5 KiB
Python
"""Optional encoding detection module for CodexLens.
|
||
|
||
Provides automatic encoding detection with graceful fallback to UTF-8.
|
||
Install with: pip install codexlens[encoding]
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from pathlib import Path
|
||
from typing import Tuple, Optional
|
||
|
||
# Module-level logger used by every helper in this file.
log = logging.getLogger(__name__)

# Backend availability state. These are placeholder defaults only: both names
# are rebound at import time from the result of _detect_chardet_backend().
_import_error: Optional[str] = None
ENCODING_DETECTION_AVAILABLE: bool = False
|
||
|
||
|
||
def _detect_chardet_backend() -> Tuple[bool, Optional[str]]:
    """Probe for an importable detection backend.

    ``chardet`` is tried first; ``charset_normalizer`` is only tried when
    ``chardet`` cannot be imported.

    Returns:
        ``(True, None)`` when either backend imports successfully, otherwise
        ``(False, message)`` where ``message`` is an installation hint.
    """
    backend_found = True
    try:
        import chardet  # noqa: F401 - imported only to test availability
    except ImportError:
        try:
            from charset_normalizer import from_bytes  # noqa: F401
        except ImportError:
            backend_found = False

    if backend_found:
        return True, None
    return False, "chardet not available. Install with: pip install codexlens[encoding]"
|
||
|
||
|
||
# Initialize on module load
|
||
# Runs once at import: both module-level names are rebound to the probe
# result, overwriting the placeholder values assigned earlier in this module.
ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend()
|
||
|
||
|
||
def check_encoding_available() -> Tuple[bool, Optional[str]]:
    """Report the module-level encoding-detection availability state.

    Returns:
        ``(available, error_message)`` — the current values of
        ``ENCODING_DETECTION_AVAILABLE`` and ``_import_error``.
    """
    status = (ENCODING_DETECTION_AVAILABLE, _import_error)
    return status
|
||
|
||
|
||
def _normalize_encoding_name(name: str) -> str:
    """Return *name* lower-cased with underscores replaced by hyphens."""
    return name.lower().replace("_", "-")


def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str:
    """Detect encoding from file content bytes.

    Uses chardet when it can be imported, otherwise charset-normalizer. The
    confidence threshold is applied to whichever backend produced the guess.
    Falls back to UTF-8 if confidence is too low, detection is unavailable,
    the input is empty, or the backend raises.

    Args:
        content_bytes: Raw file content as bytes.
        confidence_threshold: Minimum confidence (0.0-1.0) to accept detection.

    Returns:
        Detected encoding name, lower-cased with underscores replaced by
        hyphens (e.g. 'utf-8', 'iso-8859-1', 'gbk'), or 'utf-8' as fallback.
    """
    if not ENCODING_DETECTION_AVAILABLE:
        log.debug("Encoding detection not available, using UTF-8 fallback")
        return "utf-8"

    if not content_bytes:
        return "utf-8"

    try:
        # Try chardet first
        try:
            import chardet
        except ImportError:
            chardet = None

        if chardet is not None:
            result = chardet.detect(content_bytes)
            encoding = result.get("encoding")
            # A missing or None confidence is treated as zero so that the
            # comparison and the log formatting below cannot raise TypeError.
            confidence = result.get("confidence") or 0.0

            if encoding and confidence >= confidence_threshold:
                log.debug(
                    "Detected encoding: %s (confidence: %.2f)", encoding, confidence
                )
                return _normalize_encoding_name(encoding)

            # chardet answered but was not confident enough; the other backend
            # is deliberately not consulted in this case.
            log.debug(
                "Low confidence encoding detection: %s "
                "(confidence: %.2f), using UTF-8 fallback",
                encoding,
                confidence,
            )
            return "utf-8"

        # Fallback to charset-normalizer
        try:
            from charset_normalizer import from_bytes
        except ImportError:
            return "utf-8"

        results = from_bytes(content_bytes)
        if results:
            best = results.best()
            if best and best.encoding:
                # charset-normalizer reports a "chaos" (mess) ratio instead of
                # a confidence; 1 - chaos is the value its own chardet-style
                # detect() shim reports as confidence. If the attribute is
                # absent the match is accepted, as before.
                confidence = 1.0 - getattr(best, "chaos", 0.0)
                if confidence >= confidence_threshold:
                    log.debug(
                        "Detected encoding via charset-normalizer: %s", best.encoding
                    )
                    return _normalize_encoding_name(best.encoding)

                log.debug(
                    "Low confidence encoding detection: %s "
                    "(confidence: %.2f), using UTF-8 fallback",
                    best.encoding,
                    confidence,
                )

    except Exception as e:
        # Detection is best-effort: any backend failure degrades to UTF-8.
        log.warning("Encoding detection failed: %s, using UTF-8 fallback", e)

    return "utf-8"
|
||
|
||
|
||
def read_file_safe(
    path: Path | str,
    confidence_threshold: float = 0.7,
    max_detection_bytes: int = 100_000
) -> Tuple[str, str]:
    """Read file with automatic encoding detection and safe decoding.

    Reads the file bytes, detects the encoding from a leading sample, and
    decodes the whole file with error replacement so the file structure is
    preserved even when some bytes cannot be decoded.

    Args:
        path: Path to file to read.
        confidence_threshold: Minimum confidence for encoding detection.
        max_detection_bytes: Maximum bytes to use for encoding detection
            (default 100KB).

    Returns:
        Tuple of (content, detected_encoding)
        - content: Decoded file content (undecodable bytes become U+FFFD)
        - detected_encoding: Name of the encoding actually used to decode.
          When the detector reports 'ascii' but the file contains non-ASCII
          bytes, this is 'utf-8'.

    Raises:
        OSError: If file cannot be read
        IsADirectoryError: If path is a directory
    """
    file_path = Path(path) if isinstance(path, str) else path

    # Read file bytes
    try:
        content_bytes = file_path.read_bytes()
    except Exception as e:
        log.error(f"Failed to read file {file_path}: {e}")
        raise

    # Detect encoding from first N bytes for performance. Slicing past the end
    # simply yields the whole content, so no length check is needed.
    detection_sample = content_bytes[:max_detection_bytes]
    encoding = detect_encoding(detection_sample, confidence_threshold)

    if encoding == "ascii":
        # The detector only saw the sample, so "ascii" says nothing about the
        # rest of the file. Decoding everything as ASCII with replacement
        # would destroy valid multi-byte text further in; UTF-8 is a strict
        # superset of ASCII, so promote to it when strict ASCII fails.
        try:
            content = content_bytes.decode("ascii")
        except UnicodeDecodeError:
            encoding = "utf-8"
        else:
            log.debug(f"Successfully decoded {file_path} using {encoding}")
            return content, encoding

    # Decode with error replacement to preserve structure
    try:
        content = content_bytes.decode(encoding, errors='replace')
    except Exception as e:
        # Final fallback to UTF-8 with replacement (e.g. the detector returned
        # a codec name this Python does not know). Kept broad on purpose:
        # this function is best-effort once the bytes have been read.
        log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}")
        content = content_bytes.decode('utf-8', errors='replace')
        return content, 'utf-8'

    log.debug(f"Successfully decoded {file_path} using {encoding}")
    return content, encoding
|
||
|
||
|
||
def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool:
    """Guess whether a file is binary from a sample of its leading bytes.

    The file is considered binary when more than 30% of the sampled bytes are
    NUL, or when more than 50% are control bytes below 0x20 other than tab,
    line feed and carriage return (NUL bytes count toward this ratio too).

    Args:
        path: Path to file to check.
        sample_size: Number of bytes to sample (default 8KB).

    Returns:
        True if the sample looks binary. False for text-looking samples, for
        empty files, and whenever the file cannot be opened or read.
    """
    file_path = path if not isinstance(path, str) else Path(path)

    try:
        with file_path.open('rb') as handle:
            head = handle.read(sample_size)

        total = len(head)
        if total == 0:
            return False

        # Control bytes that do not normally occur in text: everything below
        # 0x20 except tab, LF and CR.
        control_bytes = bytes(
            value for value in range(0x20) if value not in (0x09, 0x0a, 0x0d)
        )
        # Deleting those bytes and comparing lengths counts them in one pass.
        non_text_total = total - len(head.translate(None, control_bytes))
        null_total = head.count(b'\x00')

        if null_total / total > 0.3:
            return True
        return non_text_total / total > 0.5

    except Exception as e:
        log.debug(f"Binary check failed for {file_path}: {e}, assuming text")
        return False
|
||
|
||
|
||
# Names exported by a star-import of this module.
__all__ = [
    "ENCODING_DETECTION_AVAILABLE",
    "check_encoding_available",
    "detect_encoding",
    "read_file_safe",
    "is_binary_file",
]
|