mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
- Introduced styles for the help view including tab transitions, accordion animations, search highlighting, and responsive design. - Implemented core memory styles with modal base styles, memory card designs, and knowledge graph visualization. - Enhanced dark mode support across various components. - Added loading states and empty state designs for better user experience.
373 lines
15 KiB
Python
373 lines
15 KiB
Python
"""Tests for encoding detection module (P1).

Tests chardet integration, UTF-8 fallback behavior, confidence thresholds,
and safe file reading with error replacement.
"""
|
||
|
||
import tempfile
|
||
from pathlib import Path
|
||
from unittest.mock import Mock, patch
|
||
|
||
import pytest
|
||
|
||
from codexlens.parsers.encoding import (
|
||
ENCODING_DETECTION_AVAILABLE,
|
||
check_encoding_available,
|
||
detect_encoding,
|
||
is_binary_file,
|
||
read_file_safe,
|
||
)
|
||
|
||
|
||
class TestEncodingDetectionAvailability:
    """Checks how the encoding module reports whether detection is available."""

    def test_encoding_available_flag(self):
        """ENCODING_DETECTION_AVAILABLE must be a genuine bool."""
        flag = ENCODING_DETECTION_AVAILABLE
        assert isinstance(flag, bool)

    def test_check_encoding_available_returns_tuple(self):
        """check_encoding_available yields an (available, error_message) pair."""
        available, error_msg = check_encoding_available()
        assert isinstance(available, bool)

        if available:
            # No error message is expected when detection is usable.
            assert error_msg is None
            return

        # When unavailable, the message must be text mentioning either
        # "chardet" or "install" (compared case-insensitively).
        assert isinstance(error_msg, str)
        lowered = error_msg.lower()
        assert "chardet" in lowered or "install" in lowered
|
||
|
||
|
||
class TestDetectEncoding:
    """Exercises detect_encoding on raw byte strings."""

    # Accepted spellings of the UTF-8 codec name (compared lower-cased).
    _UTF8_NAMES = ("utf-8", "utf8")

    @staticmethod
    def _loaded_chardet():
        """Return the chardet module, skipping the test if it is not already imported."""
        import sys

        if "chardet" not in sys.modules:
            pytest.skip("chardet not available")

        import chardet

        return chardet

    def test_detect_utf8_content(self):
        """UTF-8 bytes are reported as UTF-8, whether detected or via fallback."""
        raw = "Hello, World! 你好世界".encode("utf-8")
        detected = detect_encoding(raw)
        assert detected.lower() in self._UTF8_NAMES

    def test_detect_latin1_content(self):
        """ISO-8859-1 bytes yield some non-empty encoding name."""
        raw = "Héllo, Wörld! Ñoño".encode("iso-8859-1")
        detected = detect_encoding(raw)
        # Either ISO-8859-1 or a fallback is acceptable; only the shape of
        # the result is checked.
        assert isinstance(detected, str)
        assert len(detected) > 0

    def test_detect_gbk_content(self):
        """GBK bytes yield a plausible CJK/related name, or UTF-8 without chardet."""
        encoding = detect_encoding("你好世界 测试文本".encode("gbk"))
        assert isinstance(encoding, str)

        if not ENCODING_DETECTION_AVAILABLE:
            # Without chardet only the UTF-8 fallback is expected.
            assert encoding.lower() in self._UTF8_NAMES
            return

        # With chardet, several similar encodings are tolerated because the
        # detector may confuse them on short samples.
        valid_encodings = [
            "gbk",
            "gb2312",
            "gb18030",
            "big5",
            "utf-8",
            "utf8",
            "cp949",
            "euc-kr",
            "iso-8859-1",
        ]
        assert encoding.lower() in valid_encodings, f"Got unexpected encoding: {encoding}"

    def test_empty_content_returns_utf8(self):
        """Empty input is reported as UTF-8."""
        assert detect_encoding(b"").lower() in self._UTF8_NAMES

    @pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="chardet not installed")
    def test_confidence_threshold_filtering(self):
        """A mocked detection below the confidence threshold falls back to UTF-8."""
        chardet = self._loaded_chardet()
        # Confidence 0.3 is below the 0.7 threshold passed to detect_encoding.
        weak_guess = {"encoding": "windows-1252", "confidence": 0.3}

        with patch.object(chardet, "detect", return_value=weak_guess):
            detected = detect_encoding(b"some text", confidence_threshold=0.7)

        assert detected.lower() in self._UTF8_NAMES

    @pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="chardet not installed")
    def test_high_confidence_accepted(self):
        """A mocked detection above the confidence threshold is returned."""
        chardet = self._loaded_chardet()
        # Confidence 0.95 is above the 0.7 threshold passed to detect_encoding.
        # NOTE(review): the mocked encoding is UTF-8, which is also the
        # fallback value, so this cannot tell "accepted" from "fell back".
        strong_guess = {"encoding": "utf-8", "confidence": 0.95}

        with patch.object(chardet, "detect", return_value=strong_guess):
            detected = detect_encoding(b"some text", confidence_threshold=0.7)

        assert detected.lower() in self._UTF8_NAMES

    @pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="chardet not installed")
    def test_chardet_exception_fallback(self):
        """An exception raised by chardet.detect results in the UTF-8 fallback."""
        chardet = self._loaded_chardet()
        failure = Exception("Mock error")

        with patch.object(chardet, "detect", side_effect=failure):
            detected = detect_encoding(b"some text")

        assert detected.lower() in self._UTF8_NAMES

    def test_fallback_without_chardet(self):
        """With the availability flag patched to False, UTF-8 is returned."""
        raw = "测试内容".encode("utf-8")

        # Simulate a missing chardet by overriding the module-level flag.
        with patch("codexlens.parsers.encoding.ENCODING_DETECTION_AVAILABLE", False):
            detected = detect_encoding(raw)

        assert detected.lower() in self._UTF8_NAMES
|
||
|
||
|
||
class TestReadFileSafe:
    """Exercises read_file_safe against files written in various encodings."""

    @pytest.fixture
    def temp_file(self):
        """Yield the path of a closed, empty ``.txt`` temp file; delete it afterwards."""
        handle = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".txt")
        with handle:
            file_path = Path(handle.name)

        yield file_path

        # The test may already have removed the file.
        if file_path.exists():
            file_path.unlink()

    def test_read_utf8_file(self, temp_file):
        """A UTF-8 file round-trips exactly and is reported as UTF-8."""
        content_text = "Hello, World! 你好世界"
        temp_file.write_bytes(content_text.encode("utf-8"))

        content, encoding = read_file_safe(temp_file)

        assert content == content_text
        assert encoding.lower() in ("utf-8", "utf8")

    def test_read_gbk_file(self, temp_file):
        """A GBK file is returned as text regardless of the encoding chosen."""
        temp_file.write_bytes("你好世界 测试文本".encode("gbk"))

        content, _ = read_file_safe(temp_file)

        assert isinstance(content, str)
        if ENCODING_DETECTION_AVAILABLE:
            # chardet may pick a neighbouring CJK codec, so finding the
            # original words is not required; any non-empty text passes.
            recognizable = "你好" in content or "世界" in content
            assert recognizable or len(content) > 0
        # Without chardet only the str check above applies.

    def test_read_latin1_file(self, temp_file):
        """An ISO-8859-1 file is returned as non-empty text."""
        temp_file.write_bytes("Héllo Wörld".encode("iso-8859-1"))

        content, _ = read_file_safe(temp_file)

        assert isinstance(content, str)
        assert len(content) > 0

    def test_error_replacement_preserves_structure(self, temp_file):
        """Bytes that are invalid UTF-8 do not destroy the surrounding text."""
        # 0xFF 0xFE is not a valid UTF-8 sequence; a NUL byte is embedded too.
        invalid_utf8 = b"Valid text\xFF\xFEInvalid bytes\x00More text"
        temp_file.write_bytes(invalid_utf8)

        content, _ = read_file_safe(temp_file)

        # The readable text on both sides of the bad bytes must survive.
        assert "Valid text" in content
        assert "More text" in content
        # The presence of U+FFFD replacement characters is not asserted;
        # only the result type is checked.
        assert isinstance(content, str)

    def test_max_detection_bytes_parameter(self, temp_file):
        """read_file_safe accepts max_detection_bytes and still returns text."""
        # Each repeat is 13 bytes in UTF-8, so the file is 130,000 bytes.
        large_content = ("测试内容 " * 10000).encode("utf-8")
        temp_file.write_bytes(large_content)

        # Request a detection sample far smaller than the file.
        content, _ = read_file_safe(temp_file, max_detection_bytes=1000)

        assert isinstance(content, str)
        assert len(content) > 0

    def test_confidence_threshold_parameter(self, temp_file):
        """read_file_safe accepts both high and low confidence_threshold values."""
        temp_file.write_bytes("Sample text for encoding detection".encode("utf-8"))

        # Strict threshold first, lenient threshold second.
        content_high, _ = read_file_safe(temp_file, confidence_threshold=0.9)
        assert isinstance(content_high, str)

        content_low, _ = read_file_safe(temp_file, confidence_threshold=0.5)
        assert isinstance(content_low, str)

    def test_read_nonexistent_file_raises(self):
        """A path that does not exist raises OSError."""
        missing = Path("/nonexistent/path/file.txt")
        with pytest.raises(OSError):
            read_file_safe(missing)

    def test_read_directory_raises(self, tmp_path):
        """Passing a directory raises IsADirectoryError or another OSError."""
        with pytest.raises((IsADirectoryError, OSError)):
            read_file_safe(tmp_path)

    def test_read_empty_file(self, temp_file):
        """An empty file yields an empty string reported as UTF-8."""
        temp_file.write_bytes(b"")

        content, encoding = read_file_safe(temp_file)

        assert content == ""
        assert encoding.lower() in ("utf-8", "utf8")
|
||
|
||
|
||
class TestIsBinaryFile:
    """Exercises is_binary_file on text-like and binary-like files."""

    @pytest.fixture
    def temp_file(self):
        """Yield the path of a closed, empty temp file; delete it afterwards."""
        handle = tempfile.NamedTemporaryFile(mode="wb", delete=False)
        with handle:
            file_path = Path(handle.name)

        yield file_path

        # The test may already have removed the file.
        if file_path.exists():
            file_path.unlink()

    def test_text_file_not_binary(self, temp_file):
        """Plain multi-line ASCII text is not binary."""
        temp_file.write_bytes(b"This is a text file\nWith multiple lines\n")
        is_binary = is_binary_file(temp_file)
        assert not is_binary

    def test_binary_file_with_null_bytes(self, temp_file):
        """A file dominated by null bytes is binary."""
        # 5000 NUL bytes followed by 400 bytes of text.
        binary_content = b"\x00" * 5000 + b"text" * 100
        temp_file.write_bytes(binary_content)
        is_binary = is_binary_file(temp_file)
        assert is_binary

    def test_binary_file_with_non_text_chars(self, temp_file):
        """A file cycling through every byte value yields a bool verdict."""
        # All 256 byte values, repeated 50 times.
        binary_content = bytes(range(256)) * 50
        temp_file.write_bytes(binary_content)

        # The verdict depends on the exact ratio the implementation uses,
        # so only the result type is checked.
        verdict = is_binary_file(temp_file)
        assert isinstance(verdict, bool)

    def test_empty_file_not_binary(self, temp_file):
        """An empty file is not binary."""
        temp_file.write_bytes(b"")
        is_binary = is_binary_file(temp_file)
        assert not is_binary

    def test_utf8_text_not_binary(self, temp_file):
        """UTF-8 text containing multi-byte characters is not binary."""
        payload = "你好世界 Hello World".encode("utf-8")
        temp_file.write_bytes(payload)
        is_binary = is_binary_file(temp_file)
        assert not is_binary

    def test_sample_size_parameter(self, temp_file):
        """sample_size bounds how much of the file influences the verdict."""
        # 12,000 bytes of text up front, then 10,000 NUL bytes.
        text_part = b"Text content" * 1000
        null_part = b"\x00" * 10000
        temp_file.write_bytes(text_part + null_part)

        # A 100-byte sample lies entirely inside the text prefix.
        assert not is_binary_file(temp_file, sample_size=100)

        # A 20,000-byte sample reaches into the NUL region; only the result
        # type is checked for that case.
        verdict = is_binary_file(temp_file, sample_size=20000)
        assert isinstance(verdict, bool)

    def test_tabs_newlines_not_counted_as_non_text(self, temp_file):
        """Text containing tab, newline and carriage return is not binary."""
        payload = b"Line 1\nLine 2\tTabbed\rCarriage return\n"
        temp_file.write_bytes(payload)
        is_binary = is_binary_file(temp_file)
        assert not is_binary
|
||
|
||
|
||
@pytest.mark.parametrize("encoding,test_content", [
    ("utf-8", "Hello 世界 🌍"),
    ("gbk", "你好世界"),
    ("iso-8859-1", "Héllo Wörld"),
    # Curly quotes encode to 0x93/0x94 in windows-1252, so this case actually
    # exercises bytes specific to that code page instead of plain ASCII.
    ("windows-1252", "“Smart quotes” test"),
])
class TestEncodingParameterized:
    """Parameterized detect-then-decode checks across several encodings."""

    def test_detect_and_decode(self, encoding, test_content):
        """Encode the sample, detect its encoding, and decode it leniently.

        Args:
            encoding: Codec name used to produce the input bytes.
            test_content: Text to encode with that codec.

        The test is skipped when the codec is unknown to this Python build or
        cannot represent the sample text.
        """
        try:
            encoded = test_content.encode(encoding)
        except (UnicodeEncodeError, LookupError):
            pytest.skip(f"Encoding {encoding} not supported")

        detected = detect_encoding(encoded)
        assert isinstance(detected, str)

        # Decode with the detected codec; if Python does not know that codec
        # name (or decoding still fails), fall back to UTF-8. The type
        # assertion sits outside the try so it is never swallowed.
        try:
            decoded = encoded.decode(detected, errors="replace")
        except (UnicodeDecodeError, LookupError):
            decoded = encoded.decode("utf-8", errors="replace")

        assert isinstance(decoded, str)
|
||
|
||
|
||
@pytest.mark.skipif(ENCODING_DETECTION_AVAILABLE, reason="Test fallback behavior when chardet unavailable")
class TestWithoutChardet:
    """Runs only when ENCODING_DETECTION_AVAILABLE is falsy."""

    def test_all_functions_work_without_chardet(self):
        """detect_encoding falls back to UTF-8 and unavailability is reported."""
        # Detection must still return the UTF-8 fallback.
        detected = detect_encoding(b"Test content")
        assert detected.lower() in ("utf-8", "utf8")

        # The availability check must report failure along with a message.
        available, error = check_encoding_available()
        assert not available
        assert error is not None
|
||
|
||
|
||
@pytest.mark.skipif(not ENCODING_DETECTION_AVAILABLE, reason="Requires chardet")
class TestWithChardet:
    """Runs only when ENCODING_DETECTION_AVAILABLE is truthy."""

    def test_chardet_available_flag(self):
        """The availability flag is exactly True."""
        flag = ENCODING_DETECTION_AVAILABLE
        assert flag is True

    def test_check_encoding_available(self):
        """check_encoding_available reports (True, None)."""
        status = check_encoding_available()
        available, error = status
        assert available is True
        assert error is None

    def test_detect_encoding_uses_chardet(self):
        """GBK-encoded bytes yield a non-empty encoding name."""
        gbk_bytes = "你好世界".encode("gbk")
        detected = detect_encoding(gbk_bytes)
        # The exact codec is not pinned (GBK or a related one may be
        # reported); only the shape of the result is checked.
        assert isinstance(detected, str)
        assert len(detected) > 0
|