# Claude-Code-Workflow/codex-lens/build/lib/codexlens/parsers/factory.py

"""Parser factory for CodexLens.

Python and JavaScript/TypeScript parsing use Tree-Sitter grammars when
available. Regex fallbacks are retained to preserve the existing parser
interface and behavior in minimal environments.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Protocol

from codexlens.config import Config
from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol
from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser


class Parser(Protocol):
    """Structural interface shared by every parser handed out by the factory."""

    def parse(self, text: str, path: Path) -> IndexedFile:
        """Turn the source ``text`` of the file at ``path`` into an ``IndexedFile``."""


@dataclass
class SimpleRegexParser:
    """Parser for a single language: tree-sitter when possible, regexes otherwise.

    Attributes:
        language_id: Identifier of the language this instance parses
            (for example ``"python"``, ``"java"`` or ``"markdown"``).
    """

    language_id: str

    def parse(self, text: str, path: Path) -> IndexedFile:
        """Parse ``text`` and describe it as an ``IndexedFile``.

        For Python, JavaScript and TypeScript the tree-sitter parser is tried
        first; its result is returned directly when the parser is available
        and yields something other than ``None``. In every other case the
        regex helpers of this module are used.

        Args:
            text: Source code (or document text) to analyse.
            path: Location of the file the text came from.

        Returns:
            The tree-sitter result, or an ``IndexedFile`` built from the regex
            fallback with the resolved path, no chunks, and relationships only
            for Python and JavaScript/TypeScript.
        """
        lang = self.language_id

        # Preferred route: tree-sitter, for the languages that have a grammar.
        if lang in {"python", "javascript", "typescript"}:
            tree_sitter = TreeSitterSymbolParser(lang, path)
            if tree_sitter.is_available():
                parsed = tree_sitter.parse(text, path)
                if parsed is not None:
                    return parsed

        # Regex route: only Python and JS/TS also produce relationships.
        relationships = []
        if lang == "python":
            symbols = _parse_python_symbols_regex(text)
            relationships = _parse_python_relationships_regex(text, path)
        elif lang in {"javascript", "typescript"}:
            symbols = _parse_js_ts_symbols_regex(text)
            relationships = _parse_js_ts_relationships_regex(text, path)
        elif lang == "java":
            symbols = _parse_java_symbols(text)
        elif lang == "go":
            symbols = _parse_go_symbols(text)
        elif lang == "markdown":
            symbols = _parse_markdown_symbols(text)
        elif lang == "text":
            symbols = _parse_text_symbols(text)
        else:
            symbols = _parse_generic_symbols(text)

        resolved_path = str(path.resolve())
        return IndexedFile(
            path=resolved_path,
            language=lang,
            symbols=symbols,
            chunks=[],
            relationships=relationships,
        )


class ParserFactory:
    """Hands out parsers, keeping one cached instance per language id."""

    def __init__(self, config: Config) -> None:
        """Remember ``config`` and start with an empty parser cache."""
        self._parsers: Dict[str, Parser] = {}
        self.config = config

    def get_parser(self, language_id: str) -> Parser:
        """Return the parser for ``language_id``, creating it on first request."""
        try:
            return self._parsers[language_id]
        except KeyError:
            created = SimpleRegexParser(language_id)
            self._parsers[language_id] = created
            return created


# Regex-based fallback parsers
_PY_CLASS_RE = re.compile(r"^\s*class\s+([A-Za-z_]\w*)\b")
_PY_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(")

_PY_IMPORT_RE = re.compile(r"^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s]+)")
_PY_CALL_RE = re.compile(r"(?<![.\w])(\w+)\s*\(")


def _parse_python_symbols(text: str) -> List[Symbol]:
    """Return the symbols of Python source, preferring tree-sitter over regexes.

    The regex parser is used when tree-sitter is unavailable or when its
    ``parse_symbols`` call returns ``None``.
    """
    tree_sitter = TreeSitterSymbolParser("python")
    parsed = tree_sitter.parse_symbols(text) if tree_sitter.is_available() else None
    if parsed is None:
        return _parse_python_symbols_regex(text)
    return parsed


def _parse_js_ts_symbols(
    text: str,
    language_id: str = "javascript",
    path: Optional[Path] = None,
) -> List[Symbol]:
    """Return the symbols of JS/TS source, preferring tree-sitter over regexes.

    Args:
        text: JavaScript or TypeScript source code.
        language_id: Language passed to the tree-sitter parser.
        path: Optional file location, forwarded to the tree-sitter parser.

    Returns:
        The tree-sitter symbols, or the regex result when tree-sitter is
        unavailable or its ``parse_symbols`` call returns ``None``.
    """
    tree_sitter = TreeSitterSymbolParser(language_id, path)
    if not tree_sitter.is_available():
        return _parse_js_ts_symbols_regex(text)
    parsed = tree_sitter.parse_symbols(text)
    return _parse_js_ts_symbols_regex(text) if parsed is None else parsed


def _parse_python_symbols_regex(text: str) -> List[Symbol]:
    """Extract class, method and function symbols from Python source via regexes.

    The scan is line based. A ``def`` is reported as a ``"method"`` while the
    body of the most recently seen ``class`` is still open, and as a
    ``"function"`` otherwise. A class body is considered closed by the first
    non-blank line (including a ``def``) indented no deeper than the class
    header. Every ``def`` nested anywhere inside an open class body is
    reported as a method.

    Args:
        text: Python source code.

    Returns:
        Symbols in source order; each ``range`` is ``(line, line)`` for the
        1-based line of the ``class``/``def`` statement.
    """
    symbols: List[Symbol] = []
    # Indentation of the most recent ``class`` header whose body is still
    # open; ``None`` while outside any class.
    current_class_indent: Optional[int] = None
    for i, line in enumerate(text.splitlines(), start=1):
        # Count every leading whitespace character (not only spaces) so that
        # tab-indented methods are seen as deeper than their class header.
        indent = len(line) - len(line.lstrip())

        class_match = _PY_CLASS_RE.match(line)
        if class_match:
            current_class_indent = indent
            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
            continue

        # Close the class before classifying a ``def``: a dedented ``def``
        # must not leave stale class context behind, otherwise functions
        # nested inside it would be reported as methods.
        if current_class_indent is not None and line.strip() and indent <= current_class_indent:
            current_class_indent = None

        def_match = _PY_DEF_RE.match(line)
        if def_match:
            kind = "function" if current_class_indent is None else "method"
            symbols.append(Symbol(name=def_match.group(1), kind=kind, range=(i, i)))
    return symbols


def _parse_python_relationships_regex(text: str, path: Path) -> List[CodeRelationship]:
    """Extract import and call relationships from Python source using regexes.

    The scan is line based. The "current scope" is the name of the most
    recently seen ``class`` or ``def`` statement (indentation is not tracked).
    Lines before the first such statement, and the ``class``/``def`` lines
    themselves, produce no relationships.

    Args:
        text: Python source code.
        path: Location of the source file; its resolved form is stored as
            ``source_file`` on every relationship.

    Returns:
        Relationships in source order. Within one line, the IMPORTS
        relationship (if any) precedes the CALL relationships. Calls to
        Python keywords, to a few ubiquitous builtins, and to the current
        scope's own name are not reported.
    """
    # Function-scope import keeps this helper self-contained.
    import keyword

    # Builtins that are too common to be useful call targets. Keywords such as
    # ``if``/``elif``/``not``/``assert`` followed by "(" look like calls to the
    # regex and are rejected through ``keyword.iskeyword`` instead of a
    # hand-maintained list.
    ignored_builtins = frozenset(
        {"print", "len", "str", "int", "float", "list", "dict", "set", "tuple"}
    )

    relationships: List[CodeRelationship] = []
    current_scope: str | None = None
    source_file = str(path.resolve())

    for line_num, line in enumerate(text.splitlines(), start=1):
        class_match = _PY_CLASS_RE.match(line)
        if class_match:
            current_scope = class_match.group(1)
            continue

        def_match = _PY_DEF_RE.match(line)
        if def_match:
            current_scope = def_match.group(1)
            continue

        # Relationships need a source symbol; skip module-level preamble.
        if current_scope is None:
            continue

        import_match = _PY_IMPORT_RE.search(line)
        if import_match:
            # Prefer the ``from`` module; fall back to the imported names.
            import_target = import_match.group(1) or import_match.group(2)
            if import_target:
                relationships.append(
                    CodeRelationship(
                        source_symbol=current_scope,
                        target_symbol=import_target.strip(),
                        relationship_type=RelationshipType.IMPORTS,
                        source_file=source_file,
                        target_file=None,
                        source_line=line_num,
                    )
                )

        for call_match in _PY_CALL_RE.finditer(line):
            call_name = call_match.group(1)
            if (
                call_name == current_scope
                or call_name in ignored_builtins
                or keyword.iskeyword(call_name)
            ):
                continue
            relationships.append(
                CodeRelationship(
                    source_symbol=current_scope,
                    target_symbol=call_name,
                    relationship_type=RelationshipType.CALL,
                    source_file=source_file,
                    target_file=None,
                    source_line=line_num,
                )
            )

    return relationships


_JS_FUNC_RE = re.compile(r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(")
_JS_CLASS_RE = re.compile(r"^\s*(?:export\s+)?class\s+([A-Za-z_$][\w$]*)\b")
_JS_ARROW_RE = re.compile(
    r"^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?\(?[^)]*\)?\s*=>"
)
_JS_METHOD_RE = re.compile(r"^\s+(?:async\s+)?([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{")
_JS_IMPORT_RE = re.compile(r"import\s+.*\s+from\s+['\"]([^'\"]+)['\"]")
_JS_CALL_RE = re.compile(r"(?<![.\w])(\w+)\s*\(")


def _parse_js_ts_symbols_regex(text: str) -> List[Symbol]:
    """Extract class, function and method symbols from JS/TS source via regexes.

    The scan is line based. Braces are counted per line (including braces that
    happen to sit in strings or comments) to decide whether a line lies inside
    the body of the most recently seen class; only such lines are checked for
    method headers. ``constructor`` is never reported as a method.

    Args:
        text: JavaScript or TypeScript source code.

    Returns:
        Symbols in source order; each ``range`` is ``(line, line)`` for the
        1-based line on which the symbol was declared.
    """
    symbols: List[Symbol] = []
    in_class = False
    # Brace depth just outside the current class, i.e. before its header line.
    class_outer_depth = 0
    # Whether the opening "{" of the current class has been reached yet; it
    # may sit on a later line than the ``class`` keyword.
    class_body_seen = False
    brace_depth = 0

    for i, line in enumerate(text.splitlines(), start=1):
        depth_before = brace_depth
        brace_depth += line.count("{") - line.count("}")

        class_match = _JS_CLASS_RE.match(line)
        if class_match:
            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
            # Measure against the depth *before* the header so that extra
            # braces on the header line (``class A {}``, ``class A { m() {``)
            # do not skew where the class body ends.
            class_outer_depth = depth_before
            class_body_seen = "{" in line
            in_class = not class_body_seen or brace_depth > class_outer_depth
            continue

        if in_class:
            if not class_body_seen:
                class_body_seen = "{" in line
            # Back at the outer depth after the body was opened: class is over.
            if class_body_seen and brace_depth <= class_outer_depth:
                in_class = False

        func_match = _JS_FUNC_RE.match(line)
        if func_match:
            symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i)))
            continue

        arrow_match = _JS_ARROW_RE.match(line)
        if arrow_match:
            symbols.append(Symbol(name=arrow_match.group(1), kind="function", range=(i, i)))
            continue

        if in_class:
            method_match = _JS_METHOD_RE.match(line)
            if method_match:
                name = method_match.group(1)
                if name != "constructor":
                    symbols.append(Symbol(name=name, kind="method", range=(i, i)))

    return symbols


def _parse_js_ts_relationships_regex(text: str, path: Path) -> List[CodeRelationship]:
    """Extract import and call relationships from JS/TS source using regexes.

    The scan is line based. The "current scope" is the name of the most
    recently seen class, ``function`` declaration or arrow-function variable
    (brace nesting is not tracked). Lines before the first such declaration,
    and the declaration lines themselves, produce no relationships.

    Args:
        text: JavaScript or TypeScript source code.
        path: Location of the source file; its resolved form is stored as
            ``source_file`` on every relationship.

    Returns:
        Relationships in source order. Within one line, the IMPORTS
        relationship (if any) precedes the CALL relationships. Language
        keywords followed by "(" and calls to the current scope's own name
        are not reported.
    """
    # Keywords that can legally be followed by "(" and therefore look like a
    # call to ``_JS_CALL_RE``; mirrors the keyword filtering of the Python
    # relationship parser.
    non_call_keywords = frozenset(
        {
            "if",
            "for",
            "while",
            "switch",
            "catch",
            "with",
            "return",
            "function",
            "typeof",
            "instanceof",
            "in",
            "await",
            "yield",
            "delete",
            "void",
            "throw",
            "new",
            "case",
        }
    )

    relationships: List[CodeRelationship] = []
    current_scope: str | None = None
    source_file = str(path.resolve())

    for line_num, line in enumerate(text.splitlines(), start=1):
        class_match = _JS_CLASS_RE.match(line)
        if class_match:
            current_scope = class_match.group(1)
            continue

        func_match = _JS_FUNC_RE.match(line)
        if func_match:
            current_scope = func_match.group(1)
            continue

        arrow_match = _JS_ARROW_RE.match(line)
        if arrow_match:
            current_scope = arrow_match.group(1)
            continue

        # Relationships need a source symbol; skip module-level preamble.
        if current_scope is None:
            continue

        import_match = _JS_IMPORT_RE.search(line)
        if import_match:
            relationships.append(
                CodeRelationship(
                    source_symbol=current_scope,
                    target_symbol=import_match.group(1),
                    relationship_type=RelationshipType.IMPORTS,
                    source_file=source_file,
                    target_file=None,
                    source_line=line_num,
                )
            )

        for call_match in _JS_CALL_RE.finditer(line):
            call_name = call_match.group(1)
            if call_name == current_scope or call_name in non_call_keywords:
                continue
            relationships.append(
                CodeRelationship(
                    source_symbol=current_scope,
                    target_symbol=call_name,
                    relationship_type=RelationshipType.CALL,
                    source_file=source_file,
                    target_file=None,
                    source_line=line_num,
                )
            )

    return relationships


_JAVA_CLASS_RE = re.compile(r"^\s*(?:public\s+)?class\s+([A-Za-z_]\w*)\b")
_JAVA_METHOD_RE = re.compile(
    r"^\s*(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([A-Za-z_]\w*)\s*\("
)


def _parse_java_symbols(text: str) -> List[Symbol]:
    """Extract class and method symbols from Java source, one line at a time.

    Each line is tested against the class pattern first and, only if that
    fails, against the method pattern. Every symbol's ``range`` is
    ``(line, line)`` for its 1-based line number.
    """
    found: List[Symbol] = []
    for line_no, source_line in enumerate(text.splitlines(), start=1):
        hit = _JAVA_CLASS_RE.match(source_line)
        if hit is not None:
            kind = "class"
        else:
            hit = _JAVA_METHOD_RE.match(source_line)
            kind = "method"
        if hit is not None:
            found.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
    return found


# ``func Name(`` or a method ``func (receiver) Name(``; group 1 is the function name.
_GO_FUNC_RE = re.compile(r"^\s*func\s+(?:\([^)]+\)\s+)?([A-Za-z_]\w*)\s*\(")
# ``type Name struct`` or ``type Name interface``; group 1 is the type name.
_GO_TYPE_RE = re.compile(r"^\s*type\s+([A-Za-z_]\w*)\s+(?:struct|interface)\b")


def _parse_go_symbols(text: str) -> List[Symbol]:
    """Extract type and function symbols from Go source, one line at a time.

    Struct and interface declarations are reported with kind ``"class"``,
    functions (with or without a receiver) with kind ``"function"``. The type
    pattern is tried first and at most one symbol is produced per line.
    """
    # Ordered: the first pattern that matches a line decides its symbol kind.
    rules = ((_GO_TYPE_RE, "class"), (_GO_FUNC_RE, "function"))
    collected: List[Symbol] = []
    for line_no, source_line in enumerate(text.splitlines(), start=1):
        for pattern, kind in rules:
            hit = pattern.match(source_line)
            if hit is not None:
                collected.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
                break
    return collected


# ``def``/``function``/``func`` followed by a name; group 1 is that name.
_GENERIC_DEF_RE = re.compile(r"^\s*(?:def|function|func)\s+([A-Za-z_]\w*)\b")
# ``class``/``struct``/``interface`` followed by a name; group 1 is that name.
_GENERIC_CLASS_RE = re.compile(r"^\s*(?:class|struct|interface)\s+([A-Za-z_]\w*)\b")


def _parse_generic_symbols(text: str) -> List[Symbol]:
    """Extract symbols from source in a language without a dedicated parser.

    Lines matching the generic class pattern yield a ``"class"`` symbol;
    otherwise lines matching the generic definition pattern yield a
    ``"function"`` symbol. Every ``range`` is ``(line, line)``.
    """
    results: List[Symbol] = []
    for line_no, source_line in enumerate(text.splitlines(), start=1):
        hit = _GENERIC_CLASS_RE.match(source_line)
        kind = "class"
        if hit is None:
            hit = _GENERIC_DEF_RE.match(source_line)
            kind = "function"
        if hit is None:
            continue
        results.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
    return results


# Markdown heading regex: "# Heading", "## Heading", ... up to six "#" characters,
# starting at column 0. Group 1 is the run of "#" characters (its length is the
# heading level); group 2 is the heading text that follows the whitespace.
_MD_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")


def _parse_markdown_symbols(text: str) -> List[Symbol]:
    """Turn Markdown ``#`` headings into symbols.

    Each heading line becomes one symbol whose name is the stripped heading
    text and whose kind is ``"h<level>"`` (``"h1"`` to ``"h6"``), where the
    level is the number of leading ``#`` characters.
    """
    headings: List[Symbol] = []
    for line_no, source_line in enumerate(text.splitlines(), start=1):
        hit = _MD_HEADING_RE.match(source_line)
        if hit is None:
            continue
        hashes, raw_title = hit.groups()
        headings.append(
            Symbol(name=raw_title.strip(), kind=f"h{len(hashes)}", range=(line_no, line_no))
        )
    return headings


def _parse_text_symbols(text: str) -> List[Symbol]:
    """Parse plain text files - no symbols, just index content."""
    # Text files don't have structured symbols, return empty list
    # The file content will still be indexed for FTS search
    return []