Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-12 02:37:45 +08:00)
Add comprehensive tests for query parsing and Reciprocal Rank Fusion
- Implemented tests for the QueryParser class, covering identifier splitting (CamelCase, snake_case, kebab-case), OR expansion, and FTS5 operator preservation.
- Added parameterized tests to validate expected token outputs for different query formats.
- Created edge-case tests to ensure robustness against unusual inputs.
- Developed tests for the Reciprocal Rank Fusion (RRF) algorithm, including score computation, weight handling, and result ranking across multiple sources.
- Included tests for normalization of BM25 scores and tagging search results with source metadata.
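For context on what these tests exercise: Reciprocal Rank Fusion combines several ranked result lists by summing weighted reciprocal ranks, score(d) = sum over sources i of w_i / (k + rank_i(d)). The sketch below is a minimal illustration of that standard formula and of regex-based identifier splitting, not the project's actual QueryParser or RRF implementation; the function names, the weight mapping, and k = 60 are assumptions for the example.

    import re
    from collections import defaultdict
    from typing import Dict, List, Optional, Tuple

    def split_identifier(token: str) -> List[str]:
        """Split CamelCase, snake_case, and kebab-case identifiers into lowercase words."""
        words: List[str] = []
        for part in re.split(r"[_\-]", token):  # snake_case / kebab-case separators
            # CamelCase boundaries, including acronym runs like "HTTPResponse"
            words.extend(re.findall(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+", part))
        return [w.lower() for w in words if w]

    def rrf_fuse(
        rankings: Dict[str, List[str]],          # source name -> doc ids, best first
        weights: Optional[Dict[str, float]] = None,
        k: int = 60,                             # conventional RRF damping constant
    ) -> List[Tuple[str, float]]:
        """Fuse ranked lists: score(d) = sum over sources of w / (k + rank)."""
        weights = weights or {}
        scores: Dict[str, float] = defaultdict(float)
        for source, docs in rankings.items():
            w = weights.get(source, 1.0)
            for rank, doc_id in enumerate(docs, start=1):
                scores[doc_id] += w / (k + rank)
        return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

For example, split_identifier("parseHTTPResponse") yields ["parse", "http", "response"], and rrf_fuse({"fts": ["a", "b"], "vector": ["b", "c"]}) ranks "b" first because it appears in both sources.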
202  codex-lens/src/codexlens/parsers/encoding.py  Normal file
@@ -0,0 +1,202 @@
"""Optional encoding detection module for CodexLens.

Provides automatic encoding detection with graceful fallback to UTF-8.
Install with: pip install codexlens[encoding]
"""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Tuple, Optional

log = logging.getLogger(__name__)

# Feature flag for encoding detection availability
ENCODING_DETECTION_AVAILABLE = False
_import_error: Optional[str] = None


def _detect_chardet_backend() -> Tuple[bool, Optional[str]]:
    """Detect if chardet or charset-normalizer is available."""
    try:
        import chardet
        return True, None
    except ImportError:
        pass

    try:
        from charset_normalizer import from_bytes
        return True, None
    except ImportError:
        pass

    return False, "chardet not available. Install with: pip install codexlens[encoding]"


# Initialize on module load
ENCODING_DETECTION_AVAILABLE, _import_error = _detect_chardet_backend()


def check_encoding_available() -> Tuple[bool, Optional[str]]:
    """Check if encoding detection dependencies are available.

    Returns:
        Tuple of (available, error_message)
    """
    return ENCODING_DETECTION_AVAILABLE, _import_error


def detect_encoding(content_bytes: bytes, confidence_threshold: float = 0.7) -> str:
    """Detect encoding from file content bytes.

    Uses chardet or charset-normalizer with a configurable confidence threshold.
    Falls back to UTF-8 if confidence is too low or detection is unavailable.

    Args:
        content_bytes: Raw file content as bytes
        confidence_threshold: Minimum confidence (0.0-1.0) to accept detection

    Returns:
        Detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'gbk').
        Returns 'utf-8' as fallback if detection fails or confidence is too low.
    """
    if not ENCODING_DETECTION_AVAILABLE:
        log.debug("Encoding detection not available, using UTF-8 fallback")
        return "utf-8"

    if not content_bytes:
        return "utf-8"

    try:
        # Try chardet first
        try:
            import chardet
            result = chardet.detect(content_bytes)
            encoding = result.get("encoding")
            confidence = result.get("confidence", 0.0)

            if encoding and confidence >= confidence_threshold:
                log.debug(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
                # Normalize encoding name: replace underscores with hyphens
                return encoding.lower().replace('_', '-')
            else:
                log.debug(
                    f"Low confidence encoding detection: {encoding} "
                    f"(confidence: {confidence:.2f}), using UTF-8 fallback"
                )
                return "utf-8"
        except ImportError:
            pass

        # Fallback to charset-normalizer
        try:
            from charset_normalizer import from_bytes
            results = from_bytes(content_bytes)
            if results:
                best = results.best()
                if best and best.encoding:
                    log.debug(f"Detected encoding via charset-normalizer: {best.encoding}")
                    # Normalize encoding name: replace underscores with hyphens
                    return best.encoding.lower().replace('_', '-')
        except ImportError:
            pass

    except Exception as e:
        log.warning(f"Encoding detection failed: {e}, using UTF-8 fallback")

    return "utf-8"


def read_file_safe(
    path: Path | str,
    confidence_threshold: float = 0.7,
    max_detection_bytes: int = 100_000
) -> Tuple[str, str]:
    """Read file with automatic encoding detection and safe decoding.

    Reads file bytes, detects encoding, and decodes with error replacement
    to preserve file structure even with encoding issues.

    Args:
        path: Path to file to read
        confidence_threshold: Minimum confidence for encoding detection
        max_detection_bytes: Maximum bytes to use for encoding detection (default 100KB)

    Returns:
        Tuple of (content, detected_encoding)
        - content: Decoded file content (with U+FFFD replacement characters for unmappable bytes)
        - detected_encoding: Detected encoding name

    Raises:
        OSError: If file cannot be read
        IsADirectoryError: If path is a directory
    """
    file_path = Path(path) if isinstance(path, str) else path

    # Read file bytes
    try:
        content_bytes = file_path.read_bytes()
    except Exception as e:
        log.error(f"Failed to read file {file_path}: {e}")
        raise

    # Detect encoding from first N bytes for performance
    detection_sample = content_bytes[:max_detection_bytes] if len(content_bytes) > max_detection_bytes else content_bytes
    encoding = detect_encoding(detection_sample, confidence_threshold)

    # Decode with error replacement to preserve structure
    try:
        content = content_bytes.decode(encoding, errors='replace')
        log.debug(f"Successfully decoded {file_path} using {encoding}")
        return content, encoding
    except Exception as e:
        # Final fallback to UTF-8 with replacement
        log.warning(f"Failed to decode {file_path} with {encoding}, using UTF-8: {e}")
        content = content_bytes.decode('utf-8', errors='replace')
        return content, 'utf-8'


def is_binary_file(path: Path | str, sample_size: int = 8192) -> bool:
    """Check if file is likely binary by sampling first bytes.

    Uses a heuristic: if >30% of sample bytes are null or >50% are
    non-printable, consider the file binary.

    Args:
        path: Path to file to check
        sample_size: Number of bytes to sample (default 8KB)

    Returns:
        True if file appears to be binary, False otherwise
    """
    file_path = Path(path) if isinstance(path, str) else path

    try:
        with file_path.open('rb') as f:
            sample = f.read(sample_size)

        if not sample:
            return False

        # Count null bytes and non-printable characters
        null_count = sample.count(b'\x00')
        non_text_count = sum(1 for byte in sample if byte < 0x20 and byte not in (0x09, 0x0a, 0x0d))

        # If >30% null bytes or >50% non-text, consider binary
        null_ratio = null_count / len(sample)
        non_text_ratio = non_text_count / len(sample)

        return null_ratio > 0.3 or non_text_ratio > 0.5

    except Exception as e:
        log.debug(f"Binary check failed for {file_path}: {e}, assuming text")
        return False


__all__ = [
    "ENCODING_DETECTION_AVAILABLE",
    "check_encoding_available",
    "detect_encoding",
    "read_file_safe",
    "is_binary_file",
]
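For readers wiring this module into an indexing pipeline, a minimal usage sketch follows; the input file path is hypothetical, and the import path is inferred from this file's location under src/.

    from pathlib import Path
    from codexlens.parsers.encoding import (
        check_encoding_available,
        is_binary_file,
        read_file_safe,
    )

    # Report whether an optional detection backend is installed
    available, err = check_encoding_available()
    if not available:
        print(f"Encoding detection disabled, falling back to UTF-8: {err}")

    path = Path("example.py")  # hypothetical input file
    if not is_binary_file(path):
        content, encoding = read_file_safe(path, confidence_threshold=0.7)
        print(f"Read {len(content)} chars as {encoding}")

Because read_file_safe decodes with errors='replace', decoding itself never raises on bad bytes; only I/O failures (missing file, directory path) propagate to the caller.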