"""Query preprocessing for CodexLens search.
|
|
|
|
Provides query expansion for better identifier matching:
|
|
- CamelCase splitting: UserAuth → User OR Auth
|
|
- snake_case splitting: user_auth → user OR auth
|
|
- Preserves original query for exact matching
|
|
"""

from __future__ import annotations

import logging
import re
from typing import List, Set

log = logging.getLogger(__name__)

class QueryParser:
    """Parser for preprocessing search queries before FTS5 execution.

    Expands identifier-style queries (CamelCase, snake_case) into OR queries
    to improve recall when searching for code symbols.

    Example transformations:
    - 'UserAuth' → 'UserAuth OR User OR Auth'
    - 'user_auth' → 'user_auth OR user OR auth'
    - 'getUserData' → 'getUserData OR get OR User OR Data'
    """

    # Patterns for identifier splitting
    CAMEL_CASE_PATTERN = re.compile(r'([a-z])([A-Z])')
    SNAKE_CASE_PATTERN = re.compile(r'_+')
    KEBAB_CASE_PATTERN = re.compile(r'-+')

    # Minimum token length to include in expansion (avoids noise from single chars)
    MIN_TOKEN_LENGTH = 2

    # All-caps acronyms pattern (e.g., HTTP, SQL, API)
    ALL_CAPS_PATTERN = re.compile(r'^[A-Z]{2,}$')
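
    # Note (illustrative): CAMEL_CASE_PATTERN only matches lower→upper
    # transitions, so an acronym-prefixed name like 'HTTPServer' contains no
    # match and is left whole; handling that case would need an additional
    # upper→upper-lower boundary pattern.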

    def __init__(self, enable: bool = True, min_token_length: int = MIN_TOKEN_LENGTH):
        """Initialize query parser.

        Args:
            enable: Whether to enable query preprocessing
            min_token_length: Minimum token length to include in expansion
        """
        self.enable = enable
        self.min_token_length = min_token_length

    def preprocess_query(self, query: str) -> str:
        """Preprocess query with identifier expansion.

        Args:
            query: Original search query

        Returns:
            Expanded query with OR operators connecting the original query
            and its split tokens

        Example:
            >>> parser = QueryParser()
            >>> parser.preprocess_query('UserAuth')
            'UserAuth OR User OR Auth'
            >>> parser.preprocess_query('get_user_data')
            'get_user_data OR get OR user OR data'
        """
        if not self.enable:
            return query

        query = query.strip()
        if not query:
            return query

        # Simple queries are expanded; complex queries that already use FTS5
        # operators are passed through unchanged to preserve their structure.
        if self._is_simple_query(query):
            return self._expand_simple_query(query)
        else:
            log.debug(f"Skipping expansion for complex FTS5 query: {query}")
            return query

    def _is_simple_query(self, query: str) -> bool:
        """Check if query is simple (no FTS5 operators).

        Args:
            query: Search query

        Returns:
            True if query is simple (safe to expand), False otherwise
        """
        # Keyword operators must match as whole words: a plain substring test
        # would wrongly flag identifiers such as 'HANDLER' (contains 'AND').
        if re.search(r'\b(OR|AND|NOT|NEAR)\b', query):
            return False
        # Special characters with FTS5 meaning: prefix (*), initial-token (^),
        # and phrase (") syntax.
        return not any(ch in query for ch in ('*', '^', '"'))
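
    # Queries like 'User AND Auth', 'get*', or '"exact phrase"' are treated
    # as complex here and returned unexpanded by preprocess_query().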

    def _expand_simple_query(self, query: str) -> str:
        """Expand a simple query with identifier splitting.

        Args:
            query: Simple search query

        Returns:
            Expanded query with OR operators
        """
        # Collect candidate tokens in insertion order so the expansion is
        # deterministic (a set would reorder tokens under hash randomization
        # and break the documented examples).
        tokens: List[str] = []

        # Always include the original query first for exact matching
        tokens.append(query)

        # Split on whitespace, then split each word on identifier boundaries
        for word in query.split():
            tokens.extend(self._extract_tokens(word))

        # Filter out short tokens and duplicates while preserving order
        unique_tokens: List[str] = []
        seen: Set[str] = set()
        for token in tokens:
            if len(token) >= self.min_token_length and token not in seen:
                unique_tokens.append(token)
                seen.add(token)

        # Join with OR operator (only if we have multiple tokens)
        if len(unique_tokens) > 1:
            expanded = ' OR '.join(unique_tokens)
            log.debug(f"Expanded query: '{query}' → '{expanded}'")
            return expanded
        else:
            return query
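
    # Worked example (matches the class docstring): 'getUserData' yields
    # tokens ['getUserData', 'get', 'User', 'Data'] and expands to
    # 'getUserData OR get OR User OR Data'.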

    def _extract_tokens(self, word: str) -> List[str]:
        """Extract tokens from a single word using various splitting strategies.

        Args:
            word: Single word/identifier to split

        Returns:
            Extracted tokens in a stable order: the original word first, then
            CamelCase, snake_case, and kebab-case fragments (duplicates are
            removed by the caller)
        """
        # Start with the original word
        tokens: List[str] = [word]

        # Handle all-caps acronyms (don't split)
        if self.ALL_CAPS_PATTERN.match(word):
            return tokens

        # CamelCase splitting
        tokens.extend(self._split_camel_case(word))

        # snake_case splitting
        tokens.extend(self._split_snake_case(word))

        # kebab-case splitting
        tokens.extend(self._split_kebab_case(word))

        return tokens

    def _split_camel_case(self, word: str) -> List[str]:
        """Split CamelCase identifier into tokens.

        Args:
            word: CamelCase identifier (e.g., 'getUserData')

        Returns:
            List of tokens (e.g., ['get', 'User', 'Data'])
        """
        # Insert a space before each uppercase letter preceded by a lowercase
        spaced = self.CAMEL_CASE_PATTERN.sub(r'\1 \2', word)
        # Split on spaces and filter out empty strings
        return [t for t in spaced.split() if t]

    def _split_snake_case(self, word: str) -> List[str]:
        """Split snake_case identifier into tokens.

        Args:
            word: snake_case identifier (e.g., 'get_user_data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of underscores
        return [t for t in self.SNAKE_CASE_PATTERN.split(word) if t]

    def _split_kebab_case(self, word: str) -> List[str]:
        """Split kebab-case identifier into tokens.

        Args:
            word: kebab-case identifier (e.g., 'get-user-data')

        Returns:
            List of tokens (e.g., ['get', 'user', 'data'])
        """
        # Split on runs of hyphens
        return [t for t in self.KEBAB_CASE_PATTERN.split(word) if t]


# Global default parser instance
_default_parser = QueryParser(enable=True)


def preprocess_query(query: str, enable: bool = True) -> str:
    """Convenience function for query preprocessing.

    Args:
        query: Original search query
        enable: Whether to enable preprocessing

    Returns:
        Preprocessed query with identifier expansion
    """
    if not enable:
        return query

    return _default_parser.preprocess_query(query)


__all__ = [
    "QueryParser",
    "preprocess_query",
]
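

if __name__ == "__main__":
    # Minimal usage sketch: prints the expansion for a few sample queries.
    # 'HTTP' stays whole (all-caps acronym), and the others follow the
    # documented CamelCase/snake_case/kebab-case splits.
    for sample in ("UserAuth", "get_user_data", "fetch-user-data", "HTTP"):
        print(f"{sample!r} -> {preprocess_query(sample)!r}")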