mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-09 02:24:11 +08:00
386 lines
13 KiB
Python
386 lines
13 KiB
Python
"""Parser factory for CodexLens.

Python and JavaScript/TypeScript parsing use Tree-Sitter grammars when
available. Regex fallbacks are retained to preserve the existing parser
interface and behavior in minimal environments.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Protocol
|
|
|
|
from codexlens.config import Config
|
|
from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol
|
|
from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser
|
|
|
|
|
|
class Parser(Protocol):
    """Structural interface shared by every parser the factory hands out."""

    def parse(self, text: str, path: Path) -> IndexedFile:
        """Turn the source ``text`` of the file at ``path`` into an ``IndexedFile``."""
        ...
|
|
|
|
|
|
@dataclass
class SimpleRegexParser:
    """Parser for a single language: tree-sitter when usable, regex otherwise."""

    # Language identifier that selects the extraction strategy (e.g. "python").
    language_id: str

    def parse(self, text: str, path: Path) -> IndexedFile:
        """Index ``text`` and return the resulting ``IndexedFile``.

        For Python, JavaScript and TypeScript a ``TreeSitterSymbolParser`` is
        tried first; its result is returned as-is when it is available and
        yields something other than ``None``. In every other case the
        regex-based extractors of this module are used.

        Args:
            text: Full source text of the file.
            path: Location of the file; stored resolved in the result.

        Returns:
            The tree-sitter result, or an ``IndexedFile`` built from the regex
            extractors with an empty ``chunks`` list.
        """
        lang = self.language_id

        # Preferred path: a tree-sitter grammar for the supported languages.
        if lang in {"python", "javascript", "typescript"}:
            tree_sitter = TreeSitterSymbolParser(lang, path)
            if tree_sitter.is_available():
                result = tree_sitter.parse(text, path)
                if result is not None:
                    return result

        # Regex fallback: pick the extractors by language; anything unknown
        # goes through the generic symbol extractor.
        symbol_extractors = {
            "python": _parse_python_symbols_regex,
            "javascript": _parse_js_ts_symbols_regex,
            "typescript": _parse_js_ts_symbols_regex,
            "java": _parse_java_symbols,
            "go": _parse_go_symbols,
            "markdown": _parse_markdown_symbols,
            "text": _parse_text_symbols,
        }
        # Only these languages have a relationship extractor.
        relationship_extractors = {
            "python": _parse_python_relationships_regex,
            "javascript": _parse_js_ts_relationships_regex,
            "typescript": _parse_js_ts_relationships_regex,
        }

        symbols = symbol_extractors.get(lang, _parse_generic_symbols)(text)
        extract_relationships = relationship_extractors.get(lang)
        if extract_relationships is None:
            relationships = []
        else:
            relationships = extract_relationships(text, path)

        return IndexedFile(
            path=str(path.resolve()),
            language=lang,
            symbols=symbols,
            chunks=[],
            relationships=relationships,
        )
|
|
|
|
|
|
class ParserFactory:
    """Hands out one cached parser instance per language identifier."""

    def __init__(self, config: Config) -> None:
        """Keep ``config`` and start with an empty parser cache."""
        self._parsers: Dict[str, Parser] = {}
        self.config = config

    def get_parser(self, language_id: str) -> Parser:
        """Return the parser for ``language_id``, creating it on first request."""
        try:
            return self._parsers[language_id]
        except KeyError:
            # First request for this language: build the parser and cache it.
            created = SimpleRegexParser(language_id)
            self._parsers[language_id] = created
            return created
|
|
|
|
|
|
# Regex-based fallback parsers.
#
# Python ``class`` header at any indentation; group 1 is the class name.
_PY_CLASS_RE = re.compile(r"^\s*class\s+([A-Za-z_]\w*)\b")
# Python ``def`` / ``async def`` header; group 1 is the function name.
_PY_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(")

# ``import x`` / ``from pkg import x`` at the very start of the string.
# Group 1 is the ``from`` package (None for a plain import), group 2 the
# imported names exactly as written.
_PY_IMPORT_RE = re.compile(
    r"^(?:from\s+([\w.]+)\s+)?"
    r"import\s+([\w.,\s]+)"
)
# Bare identifier directly followed by "(" and not preceded by "." or a word
# character, i.e. attribute/method calls are deliberately not matched.
_PY_CALL_RE = re.compile(r"(?<![.\w])(\w+)\s*\(")
|
|
|
|
|
|
|
|
|
|
def _parse_python_symbols(text: str) -> List[Symbol]:
    """Return the Python symbols of ``text``.

    Tree-sitter is used when it is available and returns a non-``None``
    result; otherwise the regex extractor provides the answer.
    """
    tree_sitter = TreeSitterSymbolParser("python")
    found = tree_sitter.parse_symbols(text) if tree_sitter.is_available() else None
    if found is None:
        return _parse_python_symbols_regex(text)
    return found
|
|
|
|
|
|
def _parse_js_ts_symbols(
    text: str,
    language_id: str = "javascript",
    path: Optional[Path] = None,
) -> List[Symbol]:
    """Return the JavaScript/TypeScript symbols of ``text``.

    Args:
        text: Source text to scan.
        language_id: Language passed to the tree-sitter parser.
        path: Optional file location, forwarded to the tree-sitter parser.

    Returns:
        The tree-sitter symbols when that parser is available and returns a
        non-``None`` result, otherwise the regex extractor's symbols.
    """
    tree_sitter = TreeSitterSymbolParser(language_id, path)
    found = tree_sitter.parse_symbols(text) if tree_sitter.is_available() else None
    if found is None:
        return _parse_js_ts_symbols_regex(text)
    return found
|
|
|
|
|
|
def _parse_python_symbols_regex(text: str) -> List[Symbol]:
    """Extract classes, functions and methods from Python source line by line.

    Each ``class`` header yields a ``"class"`` symbol. Each ``def`` /
    ``async def`` header yields a ``"method"`` symbol when it is indented
    deeper than the most recent still-open class header, and a ``"function"``
    symbol otherwise. Every symbol's range is the single header line.

    Args:
        text: Python source text.

    Returns:
        Symbols in order of appearance.
    """
    symbols: List[Symbol] = []
    current_class_indent: Optional[int] = None

    for i, line in enumerate(text.splitlines(), start=1):
        # Count tabs as well as spaces: the header regexes accept any leading
        # whitespace, so a space-only count would report 0 for tab-indented
        # code and label every method of such a class as a function.
        indent = len(line) - len(line.lstrip(" \t"))

        class_match = _PY_CLASS_RE.match(line)
        if class_match:
            current_class_indent = indent
            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
            continue

        def_match = _PY_DEF_RE.match(line)
        if def_match:
            if current_class_indent is not None and indent <= current_class_indent:
                # A def at or left of the class header ends the class body.
                # Without this reset, functions nested inside a later
                # top-level function would be reported as methods.
                current_class_indent = None
            kind = "method" if current_class_indent is not None else "function"
            symbols.append(Symbol(name=def_match.group(1), kind=kind, range=(i, i)))
            continue

        # Any other non-blank line at or left of the class header closes it.
        if current_class_indent is not None and line.strip() and indent <= current_class_indent:
            current_class_indent = None

    return symbols
|
|
|
|
|
|
def _parse_python_relationships_regex(text: str, path: Path) -> List[CodeRelationship]:
    """Extract import and call relationships from Python source with regexes.

    The text is scanned line by line. Every ``class`` or ``def`` header
    becomes the current scope; lines before the first header are skipped.
    The scope is never closed on dedent, so later lines are attributed to the
    most recently seen header.

    Args:
        text: Python source text.
        path: Location of the file; its resolved form is stored as
            ``source_file`` on every relationship.

    Returns:
        ``IMPORTS`` and ``CALL`` relationships in order of appearance, each
        with ``target_file=None``.
    """
    # Function-scope import so the module's import block stays unchanged.
    import keyword

    relationships: List[CodeRelationship] = []
    current_scope: Optional[str] = None
    source_file = str(path.resolve())

    # Names that look like calls to the regex but are not worth recording:
    # common builtins plus every Python keyword (``except (A, B):``,
    # ``elif (x):``, ``not (x)``, ``import (`` and so on are never calls).
    # Built once rather than for every match.
    ignored_calls = frozenset(
        {
            "if",
            "for",
            "while",
            "return",
            "print",
            "len",
            "str",
            "int",
            "float",
            "list",
            "dict",
            "set",
            "tuple",
        }
    ) | frozenset(keyword.kwlist)

    for line_num, line in enumerate(text.splitlines(), start=1):
        scope_match = _PY_CLASS_RE.match(line) or _PY_DEF_RE.match(line)
        if scope_match:
            current_scope = scope_match.group(1)
            continue

        if current_scope is None:
            continue

        # The import pattern is anchored at the start of the string, so strip
        # the indentation first; otherwise imports written inside a function
        # or class body can never match.
        import_match = _PY_IMPORT_RE.match(line.lstrip())
        if import_match:
            # Prefer the ``from`` package; fall back to the imported names.
            import_target = import_match.group(1) or import_match.group(2)
            if import_target:
                relationships.append(
                    CodeRelationship(
                        source_symbol=current_scope,
                        target_symbol=import_target.strip(),
                        relationship_type=RelationshipType.IMPORTS,
                        source_file=source_file,
                        target_file=None,
                        source_line=line_num,
                    )
                )

        for call_match in _PY_CALL_RE.finditer(line):
            call_name = call_match.group(1)
            # Skip self-references as well as keywords/builtins.
            if call_name == current_scope or call_name in ignored_calls:
                continue
            relationships.append(
                CodeRelationship(
                    source_symbol=current_scope,
                    target_symbol=call_name,
                    relationship_type=RelationshipType.CALL,
                    source_file=source_file,
                    target_file=None,
                    source_line=line_num,
                )
            )

    return relationships
|
|
|
|
|
|
# ``function name(`` with optional ``export`` / ``async``; group 1 is the name.
_JS_FUNC_RE = re.compile(
    r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\("
)
# ``class Name`` with optional ``export``; group 1 is the class name.
_JS_CLASS_RE = re.compile(r"^\s*(?:export\s+)?class\s+([A-Za-z_$][\w$]*)\b")
# ``const|let|var name = (...) =>`` arrow function; group 1 is the bound name.
_JS_ARROW_RE = re.compile(
    r"^\s*(?:export\s+)?(?:const|let|var)\s+"
    r"([A-Za-z_$][\w$]*)\s*=\s*"
    r"(?:async\s*)?\(?[^)]*\)?\s*=>"
)
# Indented ``name(...) {``; group 1 is the name. Requires leading whitespace.
_JS_METHOD_RE = re.compile(r"^\s+(?:async\s+)?([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{")
# ``import ... from '<module>'``; group 1 is the module specifier.
_JS_IMPORT_RE = re.compile(r"import\s+.*\s+from\s+['\"]([^'\"]+)['\"]")
# Bare identifier followed by "(" and not preceded by "." or a word character.
_JS_CALL_RE = re.compile(r"(?<![.\w])(\w+)\s*\(")
|
|
|
|
|
|
def _parse_js_ts_symbols_regex(text: str) -> List[Symbol]:
    """Extract classes, functions and methods from JS/TS source line by line.

    Class headers yield ``"class"`` symbols, ``function`` declarations and
    arrow-function bindings yield ``"function"`` symbols, and indented
    ``name(...) {`` lines seen while a class is open yield ``"method"``
    symbols. Whether a class is open is tracked by counting ``{`` and ``}``
    characters per line (braces inside strings or comments are counted too).
    Every symbol's range is the single line it was found on.

    Args:
        text: JavaScript or TypeScript source text.

    Returns:
        Symbols in order of appearance.
    """
    # Names the method pattern matches that must not become method symbols:
    # ``constructor`` (excluded by design) and control-flow statements such as
    # ``if (x) {`` or ``for (...) {``, which have the same textual shape as a
    # method header.
    not_methods = frozenset(
        {"constructor", "if", "for", "while", "switch", "catch", "with"}
    )

    symbols: List[Symbol] = []
    in_class = False
    class_brace_depth = 0
    brace_depth = 0

    for i, line in enumerate(text.splitlines(), start=1):
        # Depth *after* this line, so a class header ending in "{" records the
        # depth of its own body.
        brace_depth += line.count("{") - line.count("}")

        class_match = _JS_CLASS_RE.match(line)
        if class_match:
            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
            in_class = True
            class_brace_depth = brace_depth
            continue

        # Dropping below the depth recorded at the header closes the class.
        if in_class and brace_depth < class_brace_depth:
            in_class = False

        func_match = _JS_FUNC_RE.match(line)
        if func_match:
            symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i)))
            continue

        arrow_match = _JS_ARROW_RE.match(line)
        if arrow_match:
            symbols.append(Symbol(name=arrow_match.group(1), kind="function", range=(i, i)))
            continue

        if in_class:
            method_match = _JS_METHOD_RE.match(line)
            if method_match:
                name = method_match.group(1)
                if name not in not_methods:
                    symbols.append(Symbol(name=name, kind="method", range=(i, i)))

    return symbols
|
|
|
|
|
|
def _parse_js_ts_relationships_regex(text: str, path: Path) -> List[CodeRelationship]:
    """Extract import and call relationships from JS/TS source with regexes.

    The text is scanned line by line. Every class header, ``function``
    declaration or arrow-function binding becomes the current scope; lines
    before the first such header are skipped. The scope is never closed, so
    later lines are attributed to the most recently seen header.

    Args:
        text: JavaScript or TypeScript source text.
        path: Location of the file; its resolved form is stored as
            ``source_file`` on every relationship.

    Returns:
        ``IMPORTS`` and ``CALL`` relationships in order of appearance, each
        with ``target_file=None``.
    """
    # Keywords that can be directly followed by "(" and would otherwise be
    # recorded as call targets (``if (x)``, ``for (...)``, ``return (y)``...).
    # The Python relationship extractor filters keywords the same way.
    ignored_calls = frozenset(
        {
            "if",
            "else",
            "for",
            "while",
            "do",
            "switch",
            "case",
            "catch",
            "with",
            "return",
            "throw",
            "function",
            "typeof",
            "instanceof",
            "delete",
            "void",
            "new",
            "in",
            "of",
            "await",
            "yield",
        }
    )

    relationships: List[CodeRelationship] = []
    current_scope: Optional[str] = None
    source_file = str(path.resolve())

    for line_num, line in enumerate(text.splitlines(), start=1):
        scope_match = (
            _JS_CLASS_RE.match(line)
            or _JS_FUNC_RE.match(line)
            or _JS_ARROW_RE.match(line)
        )
        if scope_match:
            current_scope = scope_match.group(1)
            continue

        if current_scope is None:
            continue

        import_match = _JS_IMPORT_RE.search(line)
        if import_match:
            relationships.append(
                CodeRelationship(
                    source_symbol=current_scope,
                    target_symbol=import_match.group(1),
                    relationship_type=RelationshipType.IMPORTS,
                    source_file=source_file,
                    target_file=None,
                    source_line=line_num,
                )
            )

        for call_match in _JS_CALL_RE.finditer(line):
            call_name = call_match.group(1)
            # Skip self-references and language keywords.
            if call_name == current_scope or call_name in ignored_calls:
                continue
            relationships.append(
                CodeRelationship(
                    source_symbol=current_scope,
                    target_symbol=call_name,
                    relationship_type=RelationshipType.CALL,
                    source_file=source_file,
                    target_file=None,
                    source_line=line_num,
                )
            )

    return relationships
|
|
|
|
|
|
# ``class Name`` with an optional ``public`` modifier; group 1 is the name.
_JAVA_CLASS_RE = re.compile(
    r"^\s*(?:public\s+)?class\s+([A-Za-z_]\w*)\b"
)
# Optional modifiers/whitespace, a type-like token, then ``name(``;
# group 1 is the name.
_JAVA_METHOD_RE = re.compile(
    r"^\s*(?:public|private|protected|static|\s)+"
    r"[\w<>\[\]]+\s+([A-Za-z_]\w*)\s*\("
)
|
|
|
|
|
|
def _parse_java_symbols(text: str) -> List[Symbol]:
    """Extract Java classes and methods, one candidate per source line.

    A line is tested against the class pattern first; only when that fails is
    it tested against the method pattern. Every symbol's range is the single
    line it was found on.
    """
    found: List[Symbol] = []
    for line_no, source_line in enumerate(text.splitlines(), start=1):
        hit = _JAVA_CLASS_RE.match(source_line)
        kind = "class"
        if hit is None:
            hit = _JAVA_METHOD_RE.match(source_line)
            kind = "method"
        if hit is not None:
            found.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
    return found
|
|
|
|
|
|
# ``func Name(`` or ``func (recv) Name(``; group 1 is the function name.
_GO_FUNC_RE = re.compile(
    r"^\s*func\s+(?:\([^)]+\)\s+)?([A-Za-z_]\w*)\s*\("
)
# ``type Name struct`` / ``type Name interface``; group 1 is the type name.
_GO_TYPE_RE = re.compile(
    r"^\s*type\s+([A-Za-z_]\w*)\s+(?:struct|interface)\b"
)
|
|
|
|
|
|
def _parse_go_symbols(text: str) -> List[Symbol]:
    """Extract Go struct/interface types and functions, one per source line.

    Struct and interface declarations are reported with kind ``"class"``;
    ``func`` declarations (with or without a receiver) with kind
    ``"function"``. The type pattern is tried before the function pattern.
    Every symbol's range is the single line it was found on.
    """
    found: List[Symbol] = []
    for line_no, source_line in enumerate(text.splitlines(), start=1):
        hit = _GO_TYPE_RE.match(source_line)
        kind = "class"
        if hit is None:
            hit = _GO_FUNC_RE.match(source_line)
            kind = "function"
        if hit is not None:
            found.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
    return found
|
|
|
|
|
|
# ``def`` / ``function`` / ``func`` followed by a name; group 1 is the name.
_GENERIC_DEF_RE = re.compile(
    r"^\s*(?:def|function|func)\s+([A-Za-z_]\w*)\b"
)
# ``class`` / ``struct`` / ``interface`` followed by a name; group 1 is the name.
_GENERIC_CLASS_RE = re.compile(
    r"^\s*(?:class|struct|interface)\s+([A-Za-z_]\w*)\b"
)
|
|
|
|
|
|
def _parse_generic_symbols(text: str) -> List[Symbol]:
    """Extract class-like and function-like declarations from arbitrary source.

    The class/struct/interface pattern is tried before the
    def/function/func pattern on each line; matches are reported with kind
    ``"class"`` and ``"function"`` respectively. Every symbol's range is the
    single line it was found on.
    """
    found: List[Symbol] = []
    for line_no, source_line in enumerate(text.splitlines(), start=1):
        hit = _GENERIC_CLASS_RE.match(source_line)
        kind = "class"
        if hit is None:
            hit = _GENERIC_DEF_RE.match(source_line)
            kind = "function"
        if hit is not None:
            found.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
    return found
|
|
|
|
|
|
# Markdown ATX heading: one to six "#" at column 0, whitespace, then the title.
# Group 1 is the run of "#" characters, group 2 the rest of the line.
_MD_HEADING_RE = re.compile(
    r"^(#{1,6})\s+(.+)$"
)
|
|
|
|
|
|
def _parse_markdown_symbols(text: str) -> List[Symbol]:
    """Parse Markdown headings as symbols.

    Each ``#``-style heading becomes a symbol whose name is the stripped
    heading text and whose kind is ``"h1"`` through ``"h6"`` according to the
    number of leading ``#`` characters. Lines inside fenced code blocks
    (``` or ~~~) are skipped, so shell or Python comments in code samples are
    not reported as headings. Every symbol's range is the single heading line.

    Args:
        text: Markdown source text.

    Returns:
        Heading symbols in order of appearance.
    """
    symbols: List[Symbol] = []
    # Fence string that opened the current code block, or None outside one.
    open_fence: Optional[str] = None

    for i, line in enumerate(text.splitlines(), start=1):
        stripped = line.strip()

        if open_fence is not None:
            # A closing fence uses the same character, is at least as long as
            # the opening fence, and carries nothing else on the line.
            if stripped.startswith(open_fence) and not stripped.strip(open_fence[0]):
                open_fence = None
            continue

        if stripped.startswith(("```", "~~~")):
            fence_char = stripped[0]
            fence_len = len(stripped) - len(stripped.lstrip(fence_char))
            open_fence = fence_char * fence_len
            continue

        heading_match = _MD_HEADING_RE.match(line)
        if heading_match:
            level = len(heading_match.group(1))
            title = heading_match.group(2).strip()
            symbols.append(Symbol(name=title, kind=f"h{level}", range=(i, i)))

    return symbols
|
|
|
|
|
|
def _parse_text_symbols(text: str) -> List[Symbol]:
    """Return the symbols of a plain-text file, which is always an empty list.

    Plain text has no structured symbols to extract, so ``text`` is not
    inspected here.
    """
    no_symbols: List[Symbol] = []
    return no_symbols
|