Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-12 02:37:45 +08:00)
Add comprehensive tests for query parsing and Reciprocal Rank Fusion
- Implemented tests for the QueryParser class, covering identifier splitting (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge-case tests to ensure robustness against unusual inputs.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
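For context on what these tests exercise: Reciprocal Rank Fusion combines several ranked result lists by summing weighted reciprocal ranks, score(d) = sum over sources i of w_i / (k + rank_i(d)). The sketch below is a minimal illustration of that standard formula and of regex-based identifier splitting, not the project's actual QueryParser or RRF implementation; the function names, the weight mapping, and k = 60 are assumptions for the example.

    import re
    from collections import defaultdict
    from typing import Dict, List, Optional, Tuple

    def split_identifier(token: str) -> List[str]:
        """Split CamelCase, snake_case, and kebab-case identifiers into lowercase words."""
        words: List[str] = []
        for part in re.split(r"[_\-]", token):  # snake_case / kebab-case separators
            # CamelCase boundaries, including acronym runs like "HTTPResponse"
            words.extend(re.findall(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+", part))
        return [w.lower() for w in words if w]

    def rrf_fuse(
        rankings: Dict[str, List[str]],          # source name -> doc ids, best first
        weights: Optional[Dict[str, float]] = None,
        k: int = 60,                             # conventional RRF damping constant
    ) -> List[Tuple[str, float]]:
        """Fuse ranked lists: score(d) = sum over sources of w / (k + rank)."""
        weights = weights or {}
        scores: Dict[str, float] = defaultdict(float)
        for source, docs in rankings.items():
            w = weights.get(source, 1.0)
            for rank, doc_id in enumerate(docs, start=1):
                scores[doc_id] += w / (k + rank)
        return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

For example, split_identifier("parseHTTPResponse") yields ["parse", "http", "response"], and rrf_fuse({"fts": ["a", "b"], "vector": ["b", "c"]}) ranks "b" first because it appears in both sources.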
202  codex-lens/src/codexlens/parsers/encoding.py  Normal file
@@ -0,0 +1,202 @@
"""Optional encoding detection module for CodexLens.

Provides automatic encoding detection with graceful fallback to UTF-8.
Install with: pip install codexlens[encoding]
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Tuple, Optional

log = logging.getLogger(__name__)

# Feature flag for encoding detection availability
ENCODING_DETECTION_AVAILABLE = False
_import_error: Optional[str] = None


def _detect_chardet_backend() -> Tuple[bool, Optional[str]]:
    """Detect if chardet or charset-normalizer is available."""
    try:
        import chardet
        return True, None
    except ImportError:
        pass

    try:
        from charset_normalizer import from_bytes
        return True, None
    except ImportError:
        pass

    return False, "chardet not available. Install with: pip install codexlens[encoding]"


# Initialize on module load
ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend()


def check_encoding_available() -> Tuple[bool, Optional[str]]:
    """Check if encoding detection dependencies are available.

    Returns:
        Tuple of (available, error_message)
    """
    return ENCODING_DETECTION_AVAILABLE, _import_error


def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str:
    """Detect encoding from file content bytes.

    Uses chardet or charset-normalizer with a configurable confidence threshold.
    Falls back to UTF-8 if confidence is too low or detection is unavailable.

    Args:
        content_bytes: Raw file content as bytes
        confidence_threshold: Minimum confidence (0.0-1.0) to accept detection

    Returns:
        Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk').
        Returns 'utf-8' as fallback if detection fails or confidence is too low.
    """
    if not ENCODING_DETECTION_AVAILABLE:
        log.debug("Encoding detection not available, using UTF-8 fallback")
        return "utf-8"

    if not content_bytes:
        return "utf-8"

    try:
        # Try chardet first
        try:
            import chardet
            result = chardet.detect(content_bytes)
            encoding = result.get("encoding")
            confidence = result.get("confidence", 0.0)

            if encoding and confidence >= confidence_threshold:
                log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
                # Normalize encoding name: replace underscores with hyphens
                return encoding.lower().replace('_', '-')
            else:
                log.debug(
                    f"Low confidence encoding detection: {encoding} "
                    f"(confidence: {confidence:.2f}), using UTF-8 fallback"
                )
                return "utf-8"
        except ImportError:
            pass

        # Fallback to charset-normalizer
        try:
            from charset_normalizer import from_bytes
            results = from_bytes(content_bytes)
            if results:
                best = results.best()
                if best and best.encoding:
                    log.debug(f"Detected encoding via charset-normalizer: {best.encoding}")
                    # Normalize encoding name: replace underscores with hyphens
                    return best.encoding.lower().replace('_', '-')
        except ImportError:
            pass

    except Exception as e:
        log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback")

    return "utf-8"


def read_file_safe(
    path: Path | str,
    confidence_threshold: float = 0.7,
    max_detection_bytes: int = 100_000
) -> Tuple[str, str]:
    """Read file with automatic encoding detection and safe decoding.

    Reads file bytes, detects encoding, and decodes with error replacement
    to preserve file structure even with encoding issues.

    Args:
        path: Path to file to read
        confidence_threshold: Minimum confidence for encoding detection
        max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB)

    Returns:
        Tuple of (content, detected_encoding)
        - content: Decoded file content (with U+FFFD replacement characters for unmappable bytes)
        - detected_encoding: Detected encoding name

    Raises:
        OSError: If file cannot be read
        IsADirectoryError: If path is a directory
    """
    file_path = Path(path) if isinstance(path, str) else path

    # Read file bytes
    try:
        content_bytes = file_path.read_bytes()
    except Exception as e:
        log.error(f"Failed to read file {file_path}: {e}")
        raise

    # Detect encoding from first N bytes for performance
    detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes
    encoding = detect_encoding(detection_sample, confidence_threshold)

    # Decode with error replacement to preserve structure
    try:
        content = content_bytes.decode(encoding, errors='replace')
        log.debug(f"Successfully decoded {file_path} using {encoding}")
        return content, encoding
    except Exception as e:
        # Final fallback to UTF-8 with replacement
        log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}")
        content = content_bytes.decode('utf-8', errors='replace')
        return content, 'utf-8'


def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool:
    """Check if file is likely binary by sampling first bytes.

    Uses a heuristic: if >30% of sample bytes are null or >50% are
    non-printable, consider the file binary.

    Args:
        path: Path to file to check
        sample_size: Number of bytes to sample (default 8KB)

    Returns:
        True if file appears to be binary, False otherwise
    """
    file_path = Path(path) if isinstance(path, str) else path

    try:
        with file_path.open('rb') as f:
            sample = f.read(sample_size)

        if not sample:
            return False

        # Count null bytes and non-printable characters
        null_count = sample.count(b'\x00')
        non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d))

        # If >30% null bytes or >50% non-text, consider binary
        null_ratio = null_count / len(sample)
        non_text_ratio = non_text_count / len(sample)

        return null_ratio > 0.3 or non_text_ratio > 0.5

    except Exception as e:
        log.debug(f"Binary check failed for {file_path}: {e}, assuming text")
        return False


__all__ = [
    "ENCODING_DETECTION_AVAILABLE",
    "check_encoding_available",
    "detect_encoding",
    "read_file_safe",
    "is_binary_file",
]
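For readers wiring this module into an indexing pipeline, a minimal usage sketch follows; the input file path is hypothetical, and the import path is inferred from this file's location under src/.

    from pathlib import Path
    from codexlens.parsers.encoding import (
        check_encoding_available,
        is_binary_file,
        read_file_safe,
    )

    # Report whether an optional detection backend is installed
    available, err = check_encoding_available()
    if not available:
        print(f"Encoding detection disabled, falling back to UTF-8: {err}")

    path = Path("example.py")  # hypothetical input file
    if not is_binary_file(path):
        content, encoding = read_file_safe(path, confidence_threshold=0.7)
        print(f"Read {len(content)} chars as {encoding}")

Because read_file_safe decodes with errors='replace', decoding itself never raises on bad bytes; only I/O failures (missing file, directory path) propagate to the caller.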