Refactor code structure and remove redundant changes

2026-02-14 02:42:04 +08:00 · 2026-01-24 14:47:47 +08:00
parent cf5fecd66d
commit f2b0a5bbc9
113 changed files with 43217 additions and 235 deletions
--- a/codex-lens/build/lib/codexlens/parsers/factory.py
+++ b/codex-lens/build/lib/codexlens/parsers/factory.py
@@ -0,0 +1,385 @@
+"""Parser factory for CodexLens.
+
+Python and JavaScript/TypeScript parsing use Tree-Sitter grammars when
+available. Regex fallbacks are retained to preserve the existing parser
+interface and behavior in minimal environments.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Protocol
+
+from codexlens.config import Config
+from codexlens.entities import CodeRelationship, IndexedFile, RelationshipType, Symbol
+from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser
+
+
+class Parser(Protocol):
+    """Structural interface for anything that turns source text into an ``IndexedFile``."""
+
+    def parse(self, text: str, path: Path) -> IndexedFile:
+        """Parse ``text``, the content associated with ``path``, into an ``IndexedFile``."""
+
+
+@dataclass
+class SimpleRegexParser:
+    """Parser for a single language: tree-sitter when usable, regex heuristics otherwise.
+
+    ``language_id`` selects both the tree-sitter grammar that is attempted and
+    the regex fallback that is used when tree-sitter yields nothing.
+    """
+
+    language_id: str
+
+    def parse(self, text: str, path: Path) -> IndexedFile:
+        """Build an ``IndexedFile`` for ``text``.
+
+        Args:
+            text: Source content to analyse.
+            path: Location of the content; its resolved form is stored on the
+                result produced by the regex fallback.
+
+        Returns:
+            The tree-sitter result when one is produced, otherwise an
+            ``IndexedFile`` built from the regex fallbacks with no chunks.
+        """
+        lang = self.language_id
+
+        # Tree-sitter is only attempted for these languages; a ``None`` result
+        # falls through to the regex path below.
+        if lang in {"python", "javascript", "typescript"}:
+            tree_sitter = TreeSitterSymbolParser(lang, path)
+            if tree_sitter.is_available():
+                parsed = tree_sitter.parse(text, path)
+                if parsed is not None:
+                    return parsed
+
+        # Only Python and JS/TS have a relationship fallback; every other
+        # language reports symbols alone.
+        relationships: List[CodeRelationship] = []
+        if lang == "python":
+            symbols = _parse_python_symbols_regex(text)
+            relationships = _parse_python_relationships_regex(text, path)
+        elif lang in {"javascript", "typescript"}:
+            symbols = _parse_js_ts_symbols_regex(text)
+            relationships = _parse_js_ts_relationships_regex(text, path)
+        else:
+            symbol_only_parsers = {
+                "java": _parse_java_symbols,
+                "go": _parse_go_symbols,
+                "markdown": _parse_markdown_symbols,
+                "text": _parse_text_symbols,
+            }
+            extract = symbol_only_parsers.get(lang, _parse_generic_symbols)
+            symbols = extract(text)
+
+        return IndexedFile(
+            path=str(path.resolve()),
+            language=lang,
+            symbols=symbols,
+            chunks=[],
+            relationships=relationships,
+        )
+
+
+class ParserFactory:
+    """Hands out one cached ``Parser`` per language identifier."""
+
+    def __init__(self, config: Config) -> None:
+        """Store ``config`` and start with an empty parser cache."""
+        self.config = config
+        self._parsers: Dict[str, Parser] = {}
+
+    def get_parser(self, language_id: str) -> Parser:
+        """Return the parser for ``language_id``, creating it on first request."""
+        cached = self._parsers.get(language_id)
+        if cached is None:
+            cached = SimpleRegexParser(language_id)
+            self._parsers[language_id] = cached
+        return cached
+
+
+# Regex-based fallback parsers
+# A ``class`` header at any indentation; group 1 is the class name.
+_PY_CLASS_RE = re.compile(r"^\s*class\s+([A-Za-z_]\w*)\b")
+# A ``def`` / ``async def`` header at any indentation; group 1 is the function name.
+_PY_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(")
+
+# ``import x`` or ``from pkg import x``. The pattern is anchored with ``^`` (no
+# MULTILINE flag), so it only matches at the very start of the string it is
+# applied to. Group 1 is the ``from`` module when present; group 2 is the text
+# after ``import``.
+_PY_IMPORT_RE = re.compile(r"^(?:from\s+([\w.]+)\s+)?import\s+([\w.,\s]+)")
+# A word followed by ``(`` that is not preceded by ``.`` or a word character,
+# so attribute calls such as ``obj.method(`` are not captured. Group 1 is the name.
+_PY_CALL_RE = re.compile(r"(?<![.\w])(\w+)\s*\(")
+
+
+
+
+def _parse_python_symbols(text: str) -> List[Symbol]:
+    """Return the symbols of Python ``text``: tree-sitter first, regex as fallback.
+
+    The regex parser is used when tree-sitter is unavailable or returns ``None``.
+    """
+    tree_sitter = TreeSitterSymbolParser("python")
+    parsed = tree_sitter.parse_symbols(text) if tree_sitter.is_available() else None
+    if parsed is None:
+        return _parse_python_symbols_regex(text)
+    return parsed
+
+
+def _parse_js_ts_symbols(
+    text: str,
+    language_id: str = "javascript",
+    path: Optional[Path] = None,
+) -> List[Symbol]:
+    """Return the symbols of JS/TS ``text``: tree-sitter first, regex as fallback.
+
+    ``language_id`` and ``path`` are forwarded to ``TreeSitterSymbolParser``;
+    the regex parser is used when tree-sitter is unavailable or returns ``None``.
+    """
+    tree_sitter = TreeSitterSymbolParser(language_id, path)
+    parsed = tree_sitter.parse_symbols(text) if tree_sitter.is_available() else None
+    if parsed is None:
+        return _parse_js_ts_symbols_regex(text)
+    return parsed
+
+
+def _parse_python_symbols_regex(text: str) -> List[Symbol]:
+    """Extract class, method and function symbols from Python source with regexes.
+
+    A ``def`` is reported as a ``"method"`` when it is indented deeper than the
+    most recent ``class`` header whose body is still open, and as a
+    ``"function"`` otherwise. Every symbol spans only its header line.
+
+    Args:
+        text: Python source code.
+
+    Returns:
+        Symbols in source order, each with ``range=(line, line)`` (1-based).
+    """
+    symbols: List[Symbol] = []
+    current_class_indent: Optional[int] = None
+    for i, line in enumerate(text.splitlines(), start=1):
+        # Expand tabs to 8-column stops (as Python's tokenizer does) so that
+        # tab-indented files get a meaningful indentation width.
+        expanded = line.expandtabs()
+        indent = len(expanded) - len(expanded.lstrip(" "))
+
+        class_match = _PY_CLASS_RE.match(line)
+        if class_match:
+            current_class_indent = indent
+            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
+            continue
+
+        # Any non-blank line at or left of the class header closes the class
+        # body. This must run before the ``def`` check: otherwise a top-level
+        # ``def`` following a class leaves the class open and functions nested
+        # inside it are mislabelled as methods.
+        if current_class_indent is not None and line.strip() and indent <= current_class_indent:
+            current_class_indent = None
+
+        def_match = _PY_DEF_RE.match(line)
+        if def_match:
+            # A still-open class here implies indent > class indent.
+            kind = "method" if current_class_indent is not None else "function"
+            symbols.append(Symbol(name=def_match.group(1), kind=kind, range=(i, i)))
+    return symbols
+
+
+def _parse_python_relationships_regex(text: str, path: Path) -> List[CodeRelationship]:
+    """Extract IMPORTS and CALL relationships from Python source with regexes.
+
+    The most recent ``class`` or ``def`` header is used as the source symbol;
+    lines before the first header are ignored. Dedents are not tracked, so a
+    scope lasts until the next header.
+
+    Args:
+        text: Python source code.
+        path: Location of the source; its resolved form becomes ``source_file``.
+
+    Returns:
+        Relationships in source order, all with ``target_file=None``.
+    """
+    # Function-scope import keeps this block independent of the module header.
+    import keyword
+
+    # Common built-ins that are not recorded as call targets. Built once here
+    # instead of once per call match.
+    ignored_builtins = frozenset(
+        {"print", "len", "str", "int", "float", "list", "dict", "set", "tuple"}
+    )
+
+    relationships: List[CodeRelationship] = []
+    current_scope: Optional[str] = None
+    source_file = str(path.resolve())
+
+    for line_num, line in enumerate(text.splitlines(), start=1):
+        header_match = _PY_CLASS_RE.match(line) or _PY_DEF_RE.match(line)
+        if header_match:
+            current_scope = header_match.group(1)
+            continue
+
+        if current_scope is None:
+            continue
+
+        # _PY_IMPORT_RE is anchored at the start of the string, so strip the
+        # indentation first; otherwise imports inside function bodies (the
+        # usual case once a scope exists) can never match.
+        import_match = _PY_IMPORT_RE.match(line.lstrip())
+        if import_match:
+            import_target = import_match.group(1) or import_match.group(2)
+            if import_target:
+                relationships.append(
+                    CodeRelationship(
+                        source_symbol=current_scope,
+                        target_symbol=import_target.strip(),
+                        relationship_type=RelationshipType.IMPORTS,
+                        source_file=source_file,
+                        target_file=None,
+                        source_line=line_num,
+                    )
+                )
+
+        for call_match in _PY_CALL_RE.finditer(line):
+            call_name = call_match.group(1)
+            # Keywords followed by a parenthesis (``if (``, ``except (``,
+            # ``not (``, ``in (`` ...) are syntax, not calls; self-references
+            # and the ignored built-ins are skipped as well.
+            if (
+                call_name == current_scope
+                or call_name in ignored_builtins
+                or keyword.iskeyword(call_name)
+            ):
+                continue
+            relationships.append(
+                CodeRelationship(
+                    source_symbol=current_scope,
+                    target_symbol=call_name,
+                    relationship_type=RelationshipType.CALL,
+                    source_file=source_file,
+                    target_file=None,
+                    source_line=line_num,
+                )
+            )
+
+    return relationships
+
+
+# ``function name(`` declaration, optionally ``export``/``async``; group 1 is the name.
+_JS_FUNC_RE = re.compile(r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(")
+# ``class Name`` declaration, optionally exported; group 1 is the class name.
+_JS_CLASS_RE = re.compile(r"^\s*(?:export\s+)?class\s+([A-Za-z_$][\w$]*)\b")
+# ``const|let|var name = (...) =>`` arrow-function binding, optionally
+# exported/async; group 1 is the binding name.
+_JS_ARROW_RE = re.compile(
+    r"^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?\(?[^)]*\)?\s*=>"
+)
+# An indented ``name(args) {`` line (leading whitespace is required); group 1 is
+# the name. Any indented statement of that shape matches, including control
+# statements such as ``if (x) {``.
+_JS_METHOD_RE = re.compile(r"^\s+(?:async\s+)?([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{")
+# ``import ... from '<module>'``; group 1 is the module specifier. Bare
+# ``import 'module'`` statements have no ``from`` and do not match.
+_JS_IMPORT_RE = re.compile(r"import\s+.*\s+from\s+['\"]([^'\"]+)['\"]")
+# A word followed by ``(`` that is not preceded by ``.`` or a word character;
+# group 1 is the name.
+_JS_CALL_RE = re.compile(r"(?<![.\w])(\w+)\s*\(")
+
+
+def _parse_js_ts_symbols_regex(text: str) -> List[Symbol]:
+    """Extract class, function and method symbols from JS/TS source with regexes.
+
+    Brace counting decides whether a line is still inside the most recent
+    class; inside a class, indented ``name(args) {`` lines are reported as
+    methods. Every symbol spans only its header line.
+
+    Args:
+        text: JavaScript or TypeScript source code.
+
+    Returns:
+        Symbols in source order, each with ``range=(line, line)`` (1-based).
+    """
+    # Names that satisfy _JS_METHOD_RE but are not methods worth reporting:
+    # the constructor, plus statement keywords of the form ``keyword (...) {``.
+    non_method_names = frozenset(
+        {"constructor", "if", "for", "while", "switch", "catch", "with", "function"}
+    )
+
+    symbols: List[Symbol] = []
+    in_class = False
+    class_brace_depth = 0
+    brace_depth = 0
+
+    for i, line in enumerate(text.splitlines(), start=1):
+        # Depth is updated first, so on a ``class X {`` line it already
+        # includes the class's own opening brace.
+        brace_depth += line.count("{") - line.count("}")
+
+        class_match = _JS_CLASS_RE.match(line)
+        if class_match:
+            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
+            in_class = True
+            class_brace_depth = brace_depth
+            continue
+
+        # Dropping below the depth recorded at the class header closes the class.
+        if in_class and brace_depth < class_brace_depth:
+            in_class = False
+
+        func_match = _JS_FUNC_RE.match(line)
+        if func_match:
+            symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i)))
+            continue
+
+        arrow_match = _JS_ARROW_RE.match(line)
+        if arrow_match:
+            symbols.append(Symbol(name=arrow_match.group(1), kind="function", range=(i, i)))
+            continue
+
+        if in_class:
+            method_match = _JS_METHOD_RE.match(line)
+            if method_match:
+                name = method_match.group(1)
+                # Without this filter ``if (x) {`` inside a method body would
+                # be indexed as a method named "if".
+                if name not in non_method_names:
+                    symbols.append(Symbol(name=name, kind="method", range=(i, i)))
+
+    return symbols
+
+
+def _parse_js_ts_relationships_regex(text: str, path: Path) -> List[CodeRelationship]:
+    """Extract IMPORTS and CALL relationships from JS/TS source with regexes.
+
+    The most recent class, function or arrow-function header is used as the
+    source symbol; lines before the first header are ignored. Scope exits are
+    not tracked, so a scope lasts until the next header.
+
+    Args:
+        text: JavaScript or TypeScript source code.
+        path: Location of the source; its resolved form becomes ``source_file``.
+
+    Returns:
+        Relationships in source order, all with ``target_file=None``.
+    """
+    # Reserved words that may be followed by ``(`` without being a call. The
+    # Python fallback filters its keywords the same way.
+    non_call_keywords = frozenset(
+        {
+            "if",
+            "for",
+            "while",
+            "switch",
+            "catch",
+            "with",
+            "return",
+            "typeof",
+            "void",
+            "delete",
+            "throw",
+            "in",
+            "instanceof",
+            "new",
+            "case",
+            "function",
+            "await",
+            "yield",
+        }
+    )
+
+    relationships: List[CodeRelationship] = []
+    current_scope: Optional[str] = None
+    source_file = str(path.resolve())
+
+    for line_num, line in enumerate(text.splitlines(), start=1):
+        header_match = (
+            _JS_CLASS_RE.match(line) or _JS_FUNC_RE.match(line) or _JS_ARROW_RE.match(line)
+        )
+        if header_match:
+            current_scope = header_match.group(1)
+            continue
+
+        if current_scope is None:
+            continue
+
+        import_match = _JS_IMPORT_RE.search(line)
+        if import_match:
+            relationships.append(
+                CodeRelationship(
+                    source_symbol=current_scope,
+                    target_symbol=import_match.group(1),
+                    relationship_type=RelationshipType.IMPORTS,
+                    source_file=source_file,
+                    target_file=None,
+                    source_line=line_num,
+                )
+            )
+
+        for call_match in _JS_CALL_RE.finditer(line):
+            call_name = call_match.group(1)
+            # Skip self-references and keyword constructs such as ``if (``.
+            if call_name == current_scope or call_name in non_call_keywords:
+                continue
+            relationships.append(
+                CodeRelationship(
+                    source_symbol=current_scope,
+                    target_symbol=call_name,
+                    relationship_type=RelationshipType.CALL,
+                    source_file=source_file,
+                    target_file=None,
+                    source_line=line_num,
+                )
+            )
+
+    return relationships
+
+
+# ``class Name`` with an optional ``public`` modifier; group 1 is the class name.
+_JAVA_CLASS_RE = re.compile(r"^\s*(?:public\s+)?class\s+([A-Za-z_]\w*)\b")
+# ``<modifiers> <type> name(``; group 1 is the name. The modifier group also
+# accepts bare whitespace, so any indented ``word name(`` shape matches, not
+# only real method declarations.
+_JAVA_METHOD_RE = re.compile(
+    r"^\s*(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([A-Za-z_]\w*)\s*\("
+)
+
+
+def _parse_java_symbols(text: str) -> List[Symbol]:
+    """Extract class and method symbols from Java source with regexes.
+
+    Args:
+        text: Java source code.
+
+    Returns:
+        Symbols in source order, each with ``range=(line, line)`` (1-based).
+    """
+    # Statement keywords that _JAVA_METHOD_RE can mistake for a return type,
+    # e.g. ``return compute(x);``, ``new Foo(x);`` or ``else if (...)``.
+    non_type_keywords = frozenset(
+        {"return", "new", "else", "throw", "case", "assert", "do"}
+    )
+
+    symbols: List[Symbol] = []
+    for i, line in enumerate(text.splitlines(), start=1):
+        class_match = _JAVA_CLASS_RE.match(line)
+        if class_match:
+            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
+            continue
+        method_match = _JAVA_METHOD_RE.match(line)
+        if method_match:
+            # The last token before the captured name is what the pattern
+            # treated as the return type; a statement keyword there means the
+            # line is a statement, not a declaration.
+            tokens_before_name = line[: method_match.start(1)].split()
+            if tokens_before_name and tokens_before_name[-1] in non_type_keywords:
+                continue
+            symbols.append(Symbol(name=method_match.group(1), kind="method", range=(i, i)))
+    return symbols
+
+
+# ``func name(`` with an optional ``(receiver)`` before the name; group 1 is the name.
+_GO_FUNC_RE = re.compile(r"^\s*func\s+(?:\([^)]+\)\s+)?([A-Za-z_]\w*)\s*\(")
+# ``type Name struct`` or ``type Name interface``; group 1 is the type name.
+_GO_TYPE_RE = re.compile(r"^\s*type\s+([A-Za-z_]\w*)\s+(?:struct|interface)\b")
+
+
+def _parse_go_symbols(text: str) -> List[Symbol]:
+    """Collect Go struct/interface types (kind ``"class"``) and funcs (kind ``"function"``).
+
+    Each line yields at most one symbol; the type pattern is tried before the
+    func pattern. Every symbol spans only its own line (1-based).
+    """
+    found: List[Symbol] = []
+    for line_no, source_line in enumerate(text.splitlines(), start=1):
+        hit = _GO_TYPE_RE.match(source_line)
+        kind = "class"
+        if hit is None:
+            hit = _GO_FUNC_RE.match(source_line)
+            kind = "function"
+        if hit is not None:
+            found.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
+    return found
+
+
+# ``def``/``function``/``func`` followed by a name; group 1 is the name.
+_GENERIC_DEF_RE = re.compile(r"^\s*(?:def|function|func)\s+([A-Za-z_]\w*)\b")
+# ``class``/``struct``/``interface`` followed by a name; group 1 is the name.
+_GENERIC_CLASS_RE = re.compile(r"^\s*(?:class|struct|interface)\s+([A-Za-z_]\w*)\b")
+
+
+def _parse_generic_symbols(text: str) -> List[Symbol]:
+    """Best-effort symbol extraction for languages without a dedicated parser.
+
+    Each line yields at most one symbol: the class-like pattern is tried
+    first, then the function-like pattern. Symbols span only their own line.
+    """
+    # Order matters: the first matching pattern wins for a given line.
+    patterns = (
+        (_GENERIC_CLASS_RE, "class"),
+        (_GENERIC_DEF_RE, "function"),
+    )
+    found: List[Symbol] = []
+    for line_no, source_line in enumerate(text.splitlines(), start=1):
+        for pattern, kind in patterns:
+            hit = pattern.match(source_line)
+            if hit:
+                found.append(Symbol(name=hit.group(1), kind=kind, range=(line_no, line_no)))
+                break
+    return found
+
+
+# Markdown heading regex: # Heading, ## Heading, etc.
+# Group 1 is the run of one to six ``#`` characters, group 2 the heading text.
+# The ``#`` must be the first character of the line (no leading indentation).
+_MD_HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
+
+
+# Opening or closing line of a fenced code block: up to three spaces of
+# indentation, then at least three backticks or tildes. Group 1 is the fence
+# run, group 2 the rest of the line (the info string on an opening fence).
+_MD_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")
+
+
+def _parse_markdown_symbols(text: str) -> List[Symbol]:
+    """Parse Markdown ATX headings as symbols.
+
+    Each ``#`` heading becomes a symbol whose name is the heading text and
+    whose kind is ``"h<level>"`` (``"h1"`` to ``"h6"``). Lines inside fenced
+    code blocks are skipped, so ``#`` comments in embedded shell or Python
+    snippets are not reported as headings.
+
+    Args:
+        text: Markdown source.
+
+    Returns:
+        Heading symbols in source order, each with ``range=(line, line)`` (1-based).
+    """
+    symbols: List[Symbol] = []
+    open_fence: Optional[str] = None  # fence run that opened the current code block
+    for i, line in enumerate(text.splitlines(), start=1):
+        fence_match = _MD_FENCE_RE.match(line)
+
+        if open_fence is not None:
+            # A closing fence uses the same character, is at least as long as
+            # the opening fence, and carries no info string.
+            if (
+                fence_match
+                and fence_match.group(1)[0] == open_fence[0]
+                and len(fence_match.group(1)) >= len(open_fence)
+                and not fence_match.group(2).strip()
+            ):
+                open_fence = None
+            continue
+
+        if fence_match:
+            fence_run, info = fence_match.group(1), fence_match.group(2)
+            # A backtick run whose remainder contains another backtick is
+            # inline code, not a fence.
+            if not (fence_run[0] == "`" and "`" in info):
+                open_fence = fence_run
+                continue
+
+        heading_match = _MD_HEADING_RE.match(line)
+        if heading_match:
+            level = len(heading_match.group(1))
+            title = heading_match.group(2).strip()
+            symbols.append(Symbol(name=title, kind=f"h{level}", range=(i, i)))
+    return symbols
+
+
+def _parse_text_symbols(text: str) -> List[Symbol]:
+    """Return the symbol list for a plain-text file, which is always empty.
+
+    Plain text has no structured symbols. Per the original note, the file
+    content itself is still indexed for FTS search.
+    """
+    no_symbols: List[Symbol] = []
+    return no_symbols