"""Query preprocessing for CodexLens search.
Provides query expansion for better identifier matching:
- CamelCase splitting: UserAuth → User OR Auth
- snake_case splitting: user_auth → user OR auth
- Preserves original query for exact matching
"""
from __future__ import annotations
import logging
import re
from typing import Set, List
log = logging.getLogger(__name__)


class QueryParser:
    """Parser for preprocessing search queries before FTS5 execution.

    Expands identifier-style queries (CamelCase, snake_case) into OR queries
    to improve recall when searching for code symbols.

    Example transformations:
    - 'UserAuth' → 'UserAuth OR User OR Auth'
    - 'user_auth' → 'user_auth OR user OR auth'
    - 'getUserData' → 'getUserData OR get OR User OR Data'
    """

    # Patterns for identifier splitting
    CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])')
    SNAKE_CASE_PATTERN = re.compile(r'_+')
    KEBAB_CASE_PATTERN = re.compile(r'-+')

    # Minimum token length to include in expansion (avoid noise from single chars)
    MIN_TOKEN_LENGTH = 2

    # All-caps acronym pattern (e.g., HTTP, SQL, API)
    ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$')
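
    # Illustrative behavior of the patterns above (derived from the regexes):
    #   CAMEL_CASE_PATTERN.sub(r'\1 \2', 'getUserData')  -> 'get User Data'
    #   SNAKE_CASE_PATTERN.split('get__user_data')       -> ['get', 'user', 'data']
    #   ALL_CAPS_PATTERN matches 'HTTP' and 'SQL', but not 'Http' or 'A'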

    def __init__(self, enable: bool = True, min_token_length: int = MIN_TOKEN_LENGTH):
        """Initialize query parser.

        Args:
            enable: Whether to enable query preprocessing
            min_token_length: Minimum token length to include in expansion
        """
        self.enable = enable
        self.min_token_length = min_token_length

    def preprocess_query(self, query: str) -> str:
        """Preprocess query with identifier expansion.

        Args:
            query: Original search query

        Returns:
            Expanded query with the OR operator connecting original and split tokens

        Example:
            >>> parser = QueryParser()
            >>> parser.preprocess_query('UserAuth')
            'UserAuth OR User OR Auth'
            >>> parser.preprocess_query('get_user_data')
            'get_user_data OR get OR user OR data'
        """
        if not self.enable:
            return query

        query = query.strip()
        if not query:
            return query

        # Simple queries are expanded wholesale; complex FTS5 queries with
        # operators keep their structure and are passed through unchanged.
        if self._is_simple_query(query):
            return self._expand_simple_query(query)

        log.debug(f"Skipping expansion for complex FTS5 query: {query}")
        return query

    def _is_simple_query(self, query: str) -> bool:
        """Check if query is simple (no FTS5 operators).

        Args:
            query: Search query

        Returns:
            True if query is simple (safe to expand), False otherwise
        """
        # Special syntax characters anywhere in the query indicate FTS5 syntax
        if any(ch in query for ch in ('*', '^', '"')):
            return False
        # Keyword operators only count as standalone words, so identifiers
        # that merely contain them (e.g. 'ORDER', 'NOTE') stay expandable
        fts5_keywords = {'OR', 'AND', 'NOT', 'NEAR'}
        return not any(word in fts5_keywords for word in query.split())
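
    # Illustrative classifications (assuming default FTS5 operator syntax):
    #   _is_simple_query('UserAuth')       -> True   (plain identifier)
    #   _is_simple_query('user AND auth')  -> False  (keyword operator)
    #   _is_simple_query('user*')          -> False  (prefix wildcard)
    #   _is_simple_query('"exact phrase"') -> False  (quoted phrase)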

    def _expand_simple_query(self, query: str) -> str:
        """Expand a simple query with identifier splitting.

        Args:
            query: Simple search query

        Returns:
            Expanded query with OR operators
        """
        # Collect candidate tokens in first-seen order so the expansion is
        # deterministic; the original query always comes first.
        candidates: List[str] = [query]
        for word in query.split():
            candidates.extend(self._extract_tokens(word))

        # Deduplicate while preserving order, dropping short/noisy tokens
        unique_tokens: List[str] = []
        seen: Set[str] = set()
        for token in candidates:
            if len(token) >= self.min_token_length and token not in seen:
                unique_tokens.append(token)
                seen.add(token)

        # Join with the OR operator (only if we have multiple tokens)
        if len(unique_tokens) > 1:
            expanded = ' OR '.join(unique_tokens)
            log.debug(f"Expanded query: '{query}' → '{expanded}'")
            return expanded
        return query

    def _extract_tokens(self, word: str) -> List[str]:
        """Extract tokens from a single word using various splitting strategies.

        Args:
            word: Single word/identifier to split

        Returns:
            Extracted tokens in first-seen order, starting with the original word
        """
        tokens: List[str] = [word]

        # Keep all-caps acronyms (HTTP, SQL, ...) intact
        if self.ALL_CAPS_PATTERN.match(word):
            return tokens

        # Apply each splitting strategy, preserving order of appearance
        for split in (self._split_camel_case, self._split_snake_case,
                      self._split_kebab_case):
            for token in split(word):
                if token not in tokens:
                    tokens.append(token)
        return tokens

    def _split_camel_case(self, word: str) -> List[str]:
        """Split CamelCase identifier into tokens.

        Args:
            word: CamelCase identifier (e.g., 'getUserData')

        Returns:
            List of tokens (e.g., ['get', 'User', 'Data'])
        """
        # Insert a space before each uppercase letter preceded by a lowercase
        spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word)
        # Split on spaces and filter out empties
        return [t for t in spaced.split() if t]
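
    # Note: runs of capitals contain no lowercase→uppercase boundary, so
    # embedded acronyms survive, e.g. 'parseHTTPResponse' -> ['parse', 'HTTPResponse']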

    def _split_snake_case(self, word: str) -> List[str]:
        """Split snake_case identifier into tokens.

        Args:
            word: snake_case identifier (e.g., 'get_user_data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of underscores
        return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t]

    def _split_kebab_case(self, word: str) -> List[str]:
        """Split kebab-case identifier into tokens.

        Args:
            word: kebab-case identifier (e.g., 'get-user-data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of hyphens
        return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t]


# Global default parser instance
_default_parser = QueryParser(enable=True)


def preprocess_query(query: str, enable: bool = True) -> str:
    """Convenience function for query preprocessing.

    Args:
        query: Original search query
        enable: Whether to enable preprocessing

    Returns:
        Preprocessed query with identifier expansion
    """
    if not enable:
        return query
    return _default_parser.preprocess_query(query)

__all__ = [
    "QueryParser",
    "preprocess_query",
]
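

# Minimal usage sketch (assumed entry point, not part of the public API):
# running the module directly prints a few example expansions.
if __name__ == "__main__":
    demo_parser = QueryParser()
    for q in ("UserAuth", "get_user_data", "fetch-user-profile", "user AND auth"):
        print(f"{q!r} -> {demo_parser.preprocess_query(q)!r}")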