Mirror of https://github.com/catlog22/Claude-Code-Workflow.git
Synced 2026-02-12 02:37:45 +08:00
Refactor code structure and remove redundant changes
codex-lens/build/lib/codexlens/search/query_parser.py | 242 (new file)

@@ -0,0 +1,242 @@
"""Query preprocessing for CodexLens search.
|
||||
|
||||
Provides query expansion for better identifier matching:
|
||||
- CamelCase splitting: UserAuth → User OR Auth
|
||||
- snake_case splitting: user_auth → user OR auth
|
||||
- Preserves original query for exact matching
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Set, List
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QueryParser:
|
||||
"""Parser for preprocessing search queries before FTS5 execution.
|
||||
|
||||
Expands identifier-style queries (CamelCase, snake_case) into OR queries
|
||||
to improve recall when searching for code symbols.
|
||||
|
||||
Example transformations:
|
||||
- 'UserAuth' → 'UserAuth OR User OR Auth'
|
||||
- 'user_auth' → 'user_auth OR user OR auth'
|
||||
- 'getUserData' → 'getUserData OR get OR User OR Data'
|
||||
"""
|
||||
|
||||
# Patterns for identifier splitting
|
||||
CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])')
|
||||
SNAKE_CASE_PATTERN = re.compile(r'_+')
|
||||
KEBAB_CASE_PATTERN = re.compile(r'-+')
|
||||
|
||||
# Minimum token length to include in expansion (avoid noise from single chars)
|
||||
MIN_TOKEN_LENGTH = 2
|
||||
|
||||
# All-caps acronyms pattern (e.g., HTTP, SQL, API)
|
||||
ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$')
|
||||
|
||||

    def __init__(self, enable: bool = True, min_token_length: int = MIN_TOKEN_LENGTH):
        """Initialize query parser.

        Args:
            enable: Whether to enable query preprocessing
            min_token_length: Minimum token length to include in expansion
        """
        self.enable = enable
        self.min_token_length = min_token_length

    def preprocess_query(self, query: str) -> str:
        """Preprocess query with identifier expansion.

        Args:
            query: Original search query

        Returns:
            Expanded query with OR operators connecting the original query and
            its split tokens

        Example:
            >>> parser = QueryParser()
            >>> parser.preprocess_query('UserAuth')
            'UserAuth OR User OR Auth'
            >>> parser.preprocess_query('get_user_data')
            'get_user_data OR get OR user OR data'
        """
        if not self.enable:
            return query

        query = query.strip()
        if not query:
            return query

        # Expand simple queries only; complex queries that already use FTS5
        # operators are passed through unchanged to preserve their structure.
        if self._is_simple_query(query):
            return self._expand_simple_query(query)
        log.debug(f"Skipping expansion for complex FTS5 query: {query}")
        return query

    def _is_simple_query(self, query: str) -> bool:
        """Check if query is simple (no FTS5 operators).

        Args:
            query: Search query

        Returns:
            True if query is simple (safe to expand), False otherwise
        """
        # Word operators only count when they appear as whole words; a plain
        # substring check would wrongly treat queries such as 'ORDER' or
        # 'NOTify' as complex and skip expansion.
        if re.search(r'\b(OR|AND|NOT|NEAR)\b', query):
            return False
        # Special characters are FTS5 syntax wherever they appear
        return not any(ch in query for ch in ('*', '^', '"'))
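
    # Illustrative examples:
    #   _is_simple_query('getUserData')    -> True   (plain identifier)
    #   _is_simple_query('foo AND bar')    -> False  (FTS5 word operator)
    #   _is_simple_query('"exact phrase"') -> False  (quoted phrase)
    #   _is_simple_query('user*')          -> False  (prefix wildcard)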

    def _expand_simple_query(self, query: str) -> str:
        """Expand a simple query with identifier splitting.

        Args:
            query: Simple search query

        Returns:
            Expanded query with OR operators, original query first
        """
        # Collect tokens in insertion order so the expansion is deterministic
        # and the original query always comes first.
        unique_tokens: List[str] = []
        seen: Set[str] = set()

        if len(query) >= self.min_token_length:
            unique_tokens.append(query)
            seen.add(query)

        # Split on whitespace, then split each word on identifier boundaries
        for word in query.split():
            for token in self._extract_tokens(word):
                if len(token) >= self.min_token_length and token not in seen:
                    unique_tokens.append(token)
                    seen.add(token)

        # Join with the OR operator only if expansion produced extra tokens
        if len(unique_tokens) > 1:
            expanded = ' OR '.join(unique_tokens)
            log.debug(f"Expanded query: '{query}' → '{expanded}'")
            return expanded
        return query

    def _extract_tokens(self, word: str) -> List[str]:
        """Extract tokens from a single word using various splitting strategies.

        Args:
            word: Single word/identifier to split

        Returns:
            Ordered, deduplicated list of extracted tokens, original word first
        """
        # Keep the original word first, then tokens from each strategy
        tokens: List[str] = [word]

        # All-caps acronyms (HTTP, SQL, API) are left intact
        if self.ALL_CAPS_PATTERN.match(word):
            return tokens

        tokens.extend(self._split_camel_case(word))
        tokens.extend(self._split_snake_case(word))
        tokens.extend(self._split_kebab_case(word))

        # Deduplicate while preserving first-seen order
        return list(dict.fromkeys(tokens))

    def _split_camel_case(self, word: str) -> List[str]:
        """Split CamelCase identifier into tokens.

        Args:
            word: CamelCase identifier (e.g., 'getUserData')

        Returns:
            List of tokens (e.g., ['get', 'User', 'Data'])
        """
        # Insert a space before each uppercase letter preceded by a lowercase
        spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word)
        # Split on spaces and drop empty fragments
        return [t for t in spaced.split() if t]
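
    # Illustrative edge case: the lowercase→uppercase rule keeps embedded
    # acronyms attached, e.g. 'parseJSONData' -> ['parse', 'JSONData'] and
    # 'HTTPServer' -> ['HTTPServer'].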

    def _split_snake_case(self, word: str) -> List[str]:
        """Split snake_case identifier into tokens.

        Args:
            word: snake_case identifier (e.g., 'get_user_data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of underscores, dropping empty fragments
        return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t]

    def _split_kebab_case(self, word: str) -> List[str]:
        """Split kebab-case identifier into tokens.

        Args:
            word: kebab-case identifier (e.g., 'get-user-data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of hyphens, dropping empty fragments
        return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t]


# Global default parser instance
_default_parser = QueryParser(enable=True)


def preprocess_query(query: str, enable: bool = True) -> str:
    """Convenience function for query preprocessing.

    Args:
        query: Original search query
        enable: Whether to enable preprocessing

    Returns:
        Preprocessed query with identifier expansion
    """
    if not enable:
        return query
    return _default_parser.preprocess_query(query)


__all__ = [
    "QueryParser",
    "preprocess_query",
]
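

if __name__ == "__main__":
    # Illustrative usage sketch (an addition for demonstration, not original
    # module code). Prints a few expansions, then runs one expanded query
    # against a throwaway in-memory FTS5 table. The table and column names
    # below are hypothetical, and the FTS5 step requires an SQLite build with
    # the FTS5 extension compiled in.
    import sqlite3

    for q in ("UserAuth", "get_user_data", "getUserData", "ORDER"):
        print(f"{q!r} -> {preprocess_query(q)!r}")

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE VIRTUAL TABLE files_fts USING fts5(path, content)")
    conn.execute(
        "INSERT INTO files_fts VALUES (?, ?)",
        ("src/auth.py", "def get_user_data(user): ..."),
    )
    # 'getUserData' alone would not match 'get_user_data', but the expanded
    # 'getUserData OR get OR User OR Data' matches on the split tokens.
    rows = conn.execute(
        "SELECT path FROM files_fts WHERE files_fts MATCH ?",
        (preprocess_query("getUserData"),),
    ).fetchall()
    print(rows)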