Mirror of https://github.com/catlog22/Claude-Code-Workflow.git
Synced 2026-02-12 02:37:45 +08:00
Refactor code structure and remove redundant changes
codex-lens/build/lib/codexlens/search/query_parser.py | 242 (new file)

@@ -0,0 +1,242 @@
"""Query preprocessing for CodexLens search.
|
||||
|
||||
Provides query expansion for better identifier matching:
|
||||
- CamelCase splitting: UserAuth → User OR Auth
|
||||
- snake_case splitting: user_auth → user OR auth
|
||||
- Preserves original query for exact matching
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Set, List
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QueryParser:
|
||||
"""Parser for preprocessing search queries before FTS5 execution.
|
||||
|
||||
Expands identifier-style queries (CamelCase, snake_case) into OR queries
|
||||
to improve recall when searching for code symbols.
|
||||
|
||||
Example transformations:
|
||||
- 'UserAuth' → 'UserAuth OR User OR Auth'
|
||||
- 'user_auth' → 'user_auth OR user OR auth'
|
||||
- 'getUserData' → 'getUserData OR get OR User OR Data'
|
||||
"""
|
||||
|
||||
# Patterns for identifier splitting
|
||||
CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])')
|
||||
SNAKE_CASE_PATTERN = re.compile(r'_+')
|
||||
KEBAB_CASE_PATTERN = re.compile(r'-+')
|
||||
|
||||
# Minimum token length to include in expansion (avoid noise from single chars)
|
||||
MIN_TOKEN_LENGTH = 2
|
||||
|
||||
# All-caps acronyms pattern (e.g., HTTP, SQL, API)
|
||||
ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$')
|
||||
|
||||

    def __init__(self, enable: bool = True, min_token_length: int = MIN_TOKEN_LENGTH):
        """Initialize query parser.

        Args:
            enable: Whether to enable query preprocessing
            min_token_length: Minimum token length to include in expansion
        """
        self.enable = enable
        self.min_token_length = min_token_length

    def preprocess_query(self, query: str) -> str:
        """Preprocess query with identifier expansion.

        Args:
            query: Original search query

        Returns:
            Expanded query with OR operators connecting the original query and
            its split tokens

        Example:
            >>> parser = QueryParser()
            >>> parser.preprocess_query('UserAuth')
            'UserAuth OR User OR Auth'
            >>> parser.preprocess_query('get_user_data')
            'get_user_data OR get OR user OR data'
        """
        if not self.enable:
            return query

        query = query.strip()
        if not query:
            return query

        # Expand simple queries only; complex queries that already use FTS5
        # operators are passed through unchanged to preserve their structure.
        if self._is_simple_query(query):
            return self._expand_simple_query(query)
        log.debug(f"Skipping expansion for complex FTS5 query: {query}")
        return query

    def _is_simple_query(self, query: str) -> bool:
        """Check if query is simple (no FTS5 operators).

        Args:
            query: Search query

        Returns:
            True if query is simple (safe to expand), False otherwise
        """
        # Word operators only count when they appear as whole words; a plain
        # substring check would wrongly treat queries such as 'ORDER' or
        # 'NOTify' as complex and skip expansion.
        if re.search(r'\b(OR|AND|NOT|NEAR)\b', query):
            return False
        # Special characters are FTS5 syntax wherever they appear
        return not any(ch in query for ch in ('*', '^', '"'))
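
    # Illustrative examples:
    #   _is_simple_query('getUserData')    -> True   (plain identifier)
    #   _is_simple_query('foo AND bar')    -> False  (FTS5 word operator)
    #   _is_simple_query('"exact phrase"') -> False  (quoted phrase)
    #   _is_simple_query('user*')          -> False  (prefix wildcard)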

    def _expand_simple_query(self, query: str) -> str:
        """Expand a simple query with identifier splitting.

        Args:
            query: Simple search query

        Returns:
            Expanded query with OR operators, original query first
        """
        # Collect tokens in insertion order so the expansion is deterministic
        # and the original query always comes first.
        unique_tokens: List[str] = []
        seen: Set[str] = set()

        if len(query) >= self.min_token_length:
            unique_tokens.append(query)
            seen.add(query)

        # Split on whitespace, then split each word on identifier boundaries
        for word in query.split():
            for token in self._extract_tokens(word):
                if len(token) >= self.min_token_length and token not in seen:
                    unique_tokens.append(token)
                    seen.add(token)

        # Join with the OR operator only if expansion produced extra tokens
        if len(unique_tokens) > 1:
            expanded = ' OR '.join(unique_tokens)
            log.debug(f"Expanded query: '{query}' → '{expanded}'")
            return expanded
        return query

    def _extract_tokens(self, word: str) -> List[str]:
        """Extract tokens from a single word using various splitting strategies.

        Args:
            word: Single word/identifier to split

        Returns:
            Ordered, deduplicated list of extracted tokens, original word first
        """
        # Keep the original word first, then tokens from each strategy
        tokens: List[str] = [word]

        # All-caps acronyms (HTTP, SQL, API) are left intact
        if self.ALL_CAPS_PATTERN.match(word):
            return tokens

        tokens.extend(self._split_camel_case(word))
        tokens.extend(self._split_snake_case(word))
        tokens.extend(self._split_kebab_case(word))

        # Deduplicate while preserving first-seen order
        return list(dict.fromkeys(tokens))

    def _split_camel_case(self, word: str) -> List[str]:
        """Split CamelCase identifier into tokens.

        Args:
            word: CamelCase identifier (e.g., 'getUserData')

        Returns:
            List of tokens (e.g., ['get', 'User', 'Data'])
        """
        # Insert a space before each uppercase letter preceded by a lowercase
        spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word)
        # Split on spaces and drop empty fragments
        return [t for t in spaced.split() if t]
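
    # Illustrative edge case: the lowercase→uppercase rule keeps embedded
    # acronyms attached, e.g. 'parseJSONData' -> ['parse', 'JSONData'] and
    # 'HTTPServer' -> ['HTTPServer'].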

    def _split_snake_case(self, word: str) -> List[str]:
        """Split snake_case identifier into tokens.

        Args:
            word: snake_case identifier (e.g., 'get_user_data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of underscores, dropping empty fragments
        return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t]

    def _split_kebab_case(self, word: str) -> List[str]:
        """Split kebab-case identifier into tokens.

        Args:
            word: kebab-case identifier (e.g., 'get-user-data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of hyphens, dropping empty fragments
        return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t]


# Global default parser instance
_default_parser = QueryParser(enable=True)


def preprocess_query(query: str, enable: bool = True) -> str:
    """Convenience function for query preprocessing.

    Args:
        query: Original search query
        enable: Whether to enable preprocessing

    Returns:
        Preprocessed query with identifier expansion
    """
    if not enable:
        return query
    return _default_parser.preprocess_query(query)


__all__ = [
    "QueryParser",
    "preprocess_query",
]
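

if __name__ == "__main__":
    # Illustrative usage sketch (an addition for demonstration, not original
    # module code). Prints a few expansions, then runs one expanded query
    # against a throwaway in-memory FTS5 table. The table and column names
    # below are hypothetical, and the FTS5 step requires an SQLite build with
    # the FTS5 extension compiled in.
    import sqlite3

    for q in ("UserAuth", "get_user_data", "getUserData", "ORDER"):
        print(f"{q!r} -> {preprocess_query(q)!r}")

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE VIRTUAL TABLE files_fts USING fts5(path, content)")
    conn.execute(
        "INSERT INTO files_fts VALUES (?, ?)",
        ("src/auth.py", "def get_user_data(user): ..."),
    )
    # 'getUserData' alone would not match 'get_user_data', but the expanded
    # 'getUserData OR get OR User OR Data' matches on the split tokens.
    rows = conn.execute(
        "SELECT path FROM files_fts WHERE files_fts MATCH ?",
        (preprocess_query("getUserData"),),
    ).fetchall()
    print(rows)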