Files

203 lines
6.5 KiB
Python
Raw Permalink Blame History

"""Optional encoding detection module for CodexLens.
Provides automatic encoding detection with graceful fallback to UTF-8.
Install with: pip install codexlens[encoding]
"""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Tuple, Optional
log = logging.getLogger(__name__)
# Feature flag for encoding detection availability
ENCODING_DETECTION_AVAILABLE = False
_import_error: Optional[str] = None
def _detect_chardet_backend() -> Tuple[bool, Optional[str]]:
"""Detect if chardet or charset-normalizer is available."""
try:
import chardet
return True, None
except ImportError:
pass
try:
from charset_normalizer import from_bytes
return True, None
except ImportError:
pass
return False, "chardet not available. Install with: pip install codexlens[encoding]"
# Initialize on module load
ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend()
def check_encoding_available() -> Tuple[bool, Optional[str]]:
"""Check if encoding detection dependencies are available.
Returns:
Tuple of (available, error_message)
"""
return ENCODING_DETECTION_AVAILABLE, _import_error
def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str:
"""Detect encoding from file content bytes.
Uses chardet or charset-normalizer with configurable confidence threshold.
Falls back to UTF-8 if confidence is too low or detection unavailable.
Args:
content_bytes: Raw file content as bytes
confidence_threshold: Minimum confidence (0.0-1.0) to accept detection
Returns:
Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk')
Returns 'utf-8' as fallback if detection fails or confidence too low
"""
if not ENCODING_DETECTION_AVAILABLE:
log.debug("Encoding detection not available, using UTF-8 fallback")
return "utf-8"
if not content_bytes:
return "utf-8"
try:
# Try chardet first
try:
import chardet
result = chardet.detect(content_bytes)
encoding = result.get("encoding")
confidence = result.get("confidence", 0.0)
if encoding and confidence >= confidence_threshold:
log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
# Normalize encoding name: replace underscores with hyphens
return encoding.lower().replace('_', '-')
else:
log.debug(
f"Low confidence encoding detection: {encoding} "
f"(confidence: {confidence:.2f}), using UTF-8 fallback"
)
return "utf-8"
except ImportError:
pass
# Fallback to charset-normalizer
try:
from charset_normalizer import from_bytes
results = from_bytes(content_bytes)
if results:
best = results.best()
if best and best.encoding:
log.debug(f"Detected encoding via charset-normalizer: {best.encoding}")
# Normalize encoding name: replace underscores with hyphens
return best.encoding.lower().replace('_', '-')
except ImportError:
pass
except Exception as e:
log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback")
return "utf-8"
def read_file_safe(
path: Path | str,
confidence_threshold: float = 0.7,
max_detection_bytes: int = 100_000
) -> Tuple[str, str]:
"""Read file with automatic encoding detection and safe decoding.
Reads file bytes, detects encoding, and decodes with error replacement
to preserve file structure even with encoding issues.
Args:
path: Path to file to read
confidence_threshold: Minimum confidence for encoding detection
max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB)
Returns:
Tuple of (content, detected_encoding)
- content: Decoded file content (with <20> for unmappable bytes)
- detected_encoding: Detected encoding name
Raises:
OSError: If file cannot be read
IsADirectoryError: If path is a directory
"""
file_path = Path(path) if isinstance(path, str) else path
# Read file bytes
try:
content_bytes = file_path.read_bytes()
except Exception as e:
log.error(f"Failed to read file {file_path}: {e}")
raise
# Detect encoding from first N bytes for performance
detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes
encoding = detect_encoding(detection_sample, confidence_threshold)
# Decode with error replacement to preserve structure
try:
content = content_bytes.decode(encoding, errors='replace')
log.debug(f"Successfully decoded {file_path} using {encoding}")
return content, encoding
except Exception as e:
# Final fallback to UTF-8 with replacement
log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}")
content = content_bytes.decode('utf-8', errors='replace')
return content, 'utf-8'
def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool:
"""Check if file is likely binary by sampling first bytes.
Uses heuristic: if >30% of sample bytes are null or non-text, consider binary.
Args:
path: Path to file to check
sample_size: Number of bytes to sample (default 8KB)
Returns:
True if file appears to be binary, False otherwise
"""
file_path = Path(path) if isinstance(path, str) else path
try:
with file_path.open('rb') as f:
sample = f.read(sample_size)
if not sample:
return False
# Count null bytes and non-printable characters
null_count = sample.count(b'\x00')
non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d))
# If >30% null bytes or >50% non-text, consider binary
null_ratio = null_count / len(sample)
non_text_ratio = non_text_count / len(sample)
return null_ratio > 0.3 or non_text_ratio > 0.5
except Exception as e:
log.debug(f"Binary check failed for {file_path}: {e}, assuming text")
return False
__all__ = [
"ENCODING_DETECTION_AVAILABLE",
"check_encoding_available",
"detect_encoding",
"read_file_safe",
"is_binary_file",
]