mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-11 02:33:51 +08:00
Add comprehensive tests for tokenizer, performance benchmarks, and TreeSitter parser functionality
- Implemented unit tests for the Tokenizer class, covering various text inputs, edge cases, and fallback mechanisms.
- Created performance benchmarks comparing tiktoken and pure Python implementations for token counting.
- Developed extensive tests for TreeSitterSymbolParser across Python, JavaScript, and TypeScript, ensuring accurate symbol extraction and parsing.
- Added configuration documentation for MCP integration and custom prompts, enhancing usability and flexibility.
- Introduced a refactor script for GraphAnalyzer to streamline future improvements.
This commit is contained in:
@@ -10,19 +10,11 @@ from __future__ import annotations
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Protocol
|
||||
|
||||
try:
|
||||
from tree_sitter import Language as TreeSitterLanguage
|
||||
from tree_sitter import Node as TreeSitterNode
|
||||
from tree_sitter import Parser as TreeSitterParser
|
||||
except Exception: # pragma: no cover
|
||||
TreeSitterLanguage = None # type: ignore[assignment]
|
||||
TreeSitterNode = None # type: ignore[assignment]
|
||||
TreeSitterParser = None # type: ignore[assignment]
|
||||
from typing import Dict, List, Optional, Protocol
|
||||
|
||||
from codexlens.config import Config
|
||||
from codexlens.entities import IndexedFile, Symbol
|
||||
from codexlens.parsers.treesitter_parser import TreeSitterSymbolParser
|
||||
|
||||
|
||||
class Parser(Protocol):
|
||||
@@ -34,10 +26,24 @@ class SimpleRegexParser:
|
||||
language_id: str
|
||||
|
||||
def parse(self, text: str, path: Path) -> IndexedFile:
|
||||
# Try tree-sitter first for supported languages
|
||||
if self.language_id in {"python", "javascript", "typescript"}:
|
||||
ts_parser = TreeSitterSymbolParser(self.language_id, path)
|
||||
if ts_parser.is_available():
|
||||
symbols = ts_parser.parse_symbols(text)
|
||||
if symbols is not None:
|
||||
return IndexedFile(
|
||||
path=str(path.resolve()),
|
||||
language=self.language_id,
|
||||
symbols=symbols,
|
||||
chunks=[],
|
||||
)
|
||||
|
||||
# Fallback to regex parsing
|
||||
if self.language_id == "python":
|
||||
symbols = _parse_python_symbols(text)
|
||||
symbols = _parse_python_symbols_regex(text)
|
||||
elif self.language_id in {"javascript", "typescript"}:
|
||||
symbols = _parse_js_ts_symbols(text, self.language_id, path)
|
||||
symbols = _parse_js_ts_symbols_regex(text)
|
||||
elif self.language_id == "java":
|
||||
symbols = _parse_java_symbols(text)
|
||||
elif self.language_id == "go":
|
||||
@@ -64,120 +70,35 @@ class ParserFactory:
|
||||
return self._parsers[language_id]
|
||||
|
||||
|
||||
# Regex-based fallback parsers
|
||||
_PY_CLASS_RE = re.compile(r"^\s*class\s+([A-Za-z_]\w*)\b")
|
||||
_PY_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(")
|
||||
|
||||
_TREE_SITTER_LANGUAGE_CACHE: Dict[str, TreeSitterLanguage] = {}
|
||||
|
||||
|
||||
def _get_tree_sitter_language(language_id: str, path: Path | None = None) -> TreeSitterLanguage | None:
    """Return the tree-sitter grammar for ``language_id``, loading it on first use.

    Args:
        language_id: Language identifier ("python", "javascript", "typescript").
        path: Optional source path; a ``.tsx`` suffix on a TypeScript file
            selects the TSX grammar instead of the plain TypeScript one.

    Returns:
        The grammar object, or None when the tree-sitter runtime is missing,
        the language is not handled here, or loading the grammar fails.
    """
    if TreeSitterLanguage is None:
        return None

    wants_tsx = (
        language_id == "typescript"
        and path is not None
        and path.suffix.lower() == ".tsx"
    )
    key = "tsx" if wants_tsx else language_id

    hit = _TREE_SITTER_LANGUAGE_CACHE.get(key)
    if hit is not None:
        return hit

    try:
        if key == "python":
            import tree_sitter_python  # type: ignore[import-not-found]

            handle = tree_sitter_python.language()
        elif key == "javascript":
            import tree_sitter_javascript  # type: ignore[import-not-found]

            handle = tree_sitter_javascript.language()
        elif key == "typescript":
            import tree_sitter_typescript  # type: ignore[import-not-found]

            handle = tree_sitter_typescript.language_typescript()
        elif key == "tsx":
            import tree_sitter_typescript  # type: ignore[import-not-found]

            handle = tree_sitter_typescript.language_tsx()
        else:
            return None
        loaded = TreeSitterLanguage(handle)
    except Exception:
        # Any import or construction failure means "grammar not available".
        return None

    # Only successful loads are remembered, so a failure is retried next call.
    _TREE_SITTER_LANGUAGE_CACHE[key] = loaded
    return loaded
|
||||
def _parse_python_symbols(text: str) -> List[Symbol]:
    """Extract Python symbols, preferring tree-sitter over the regex parser.

    The regex parser is used when the tree-sitter parser is unavailable or
    when it returns None for this text.
    """
    extractor = TreeSitterSymbolParser("python")
    found = extractor.parse_symbols(text) if extractor.is_available() else None
    if found is None:
        return _parse_python_symbols_regex(text)
    return found
|
||||
|
||||
|
||||
def _iter_tree_sitter_nodes(root: TreeSitterNode) -> Iterable[TreeSitterNode]:
|
||||
stack: List[TreeSitterNode] = [root]
|
||||
while stack:
|
||||
node = stack.pop()
|
||||
yield node
|
||||
for child in reversed(node.children):
|
||||
stack.append(child)
|
||||
|
||||
|
||||
def _node_text(source_bytes: bytes, node: TreeSitterNode) -> str:
|
||||
return source_bytes[node.start_byte:node.end_byte].decode("utf8")
|
||||
|
||||
|
||||
def _node_range(node: TreeSitterNode) -> tuple[int, int]:
|
||||
start_line = node.start_point[0] + 1
|
||||
end_line = node.end_point[0] + 1
|
||||
return (start_line, max(start_line, end_line))
|
||||
|
||||
|
||||
def _python_kind_for_function_node(node: TreeSitterNode) -> str:
|
||||
parent = node.parent
|
||||
while parent is not None:
|
||||
if parent.type in {"function_definition", "async_function_definition"}:
|
||||
return "function"
|
||||
if parent.type == "class_definition":
|
||||
return "method"
|
||||
parent = parent.parent
|
||||
return "function"
|
||||
|
||||
|
||||
def _parse_python_symbols_tree_sitter(text: str) -> List[Symbol] | None:
    """Extract Python classes, functions and methods using tree-sitter.

    Args:
        text: Python source code.

    Returns:
        Symbols in depth-first traversal order, or None when the tree-sitter
        parser class or the Python grammar is unavailable.
    """
    if TreeSitterParser is None:
        return None

    grammar = _get_tree_sitter_language("python")
    if grammar is None:
        return None

    ts_parser = TreeSitterParser()
    # Support both binding APIs: a set_language() method when present,
    # otherwise assignment to the ``language`` attribute.
    if hasattr(ts_parser, "set_language"):
        ts_parser.set_language(grammar)  # type: ignore[attr-defined]
    else:
        ts_parser.language = grammar  # type: ignore[assignment]

    source_bytes = text.encode("utf8")
    root = ts_parser.parse(source_bytes).root_node

    function_types = {"function_definition", "async_function_definition"}
    found: List[Symbol] = []
    for node in _iter_tree_sitter_nodes(root):
        is_class = node.type == "class_definition"
        if not is_class and node.type not in function_types:
            continue

        ident = node.child_by_field_name("name")
        if ident is None:
            continue

        found.append(Symbol(
            name=_node_text(source_bytes, ident),
            kind="class" if is_class else _python_kind_for_function_node(node),
            range=_node_range(node),
        ))

    return found
|
||||
def _parse_js_ts_symbols(
    text: str,
    language_id: str = "javascript",
    path: Optional[Path] = None,
) -> List[Symbol]:
    """Extract JS/TS symbols, preferring tree-sitter over the regex parser.

    Args:
        text: Source code.
        language_id: Language identifier handed to the tree-sitter parser.
        path: Optional file path, forwarded to the tree-sitter parser.

    Returns:
        Symbols from tree-sitter when it is available and returns a result,
        otherwise the regex parser's symbols.
    """
    extractor = TreeSitterSymbolParser(language_id, path)
    found = extractor.parse_symbols(text) if extractor.is_available() else None
    if found is None:
        return _parse_js_ts_symbols_regex(text)
    return found
|
||||
|
||||
|
||||
def _parse_python_symbols_regex(text: str) -> List[Symbol]:
|
||||
@@ -202,13 +123,6 @@ def _parse_python_symbols_regex(text: str) -> List[Symbol]:
|
||||
return symbols
|
||||
|
||||
|
||||
def _parse_python_symbols(text: str) -> List[Symbol]:
    """Extract Python symbols via tree-sitter, falling back to regex parsing.

    The regex parser runs only when the tree-sitter path returns None.
    """
    found = _parse_python_symbols_tree_sitter(text)
    return found if found is not None else _parse_python_symbols_regex(text)
|
||||
|
||||
|
||||
_JS_FUNC_RE = re.compile(r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(")
|
||||
_JS_CLASS_RE = re.compile(r"^\s*(?:export\s+)?class\s+([A-Za-z_$][\w$]*)\b")
|
||||
_JS_ARROW_RE = re.compile(
|
||||
@@ -217,88 +131,6 @@ _JS_ARROW_RE = re.compile(
|
||||
_JS_METHOD_RE = re.compile(r"^\s+(?:async\s+)?([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{")
|
||||
|
||||
|
||||
def _js_has_class_ancestor(node: TreeSitterNode) -> bool:
|
||||
parent = node.parent
|
||||
while parent is not None:
|
||||
if parent.type in {"class_declaration", "class"}:
|
||||
return True
|
||||
parent = parent.parent
|
||||
return False
|
||||
|
||||
|
||||
def _parse_js_ts_symbols_tree_sitter(
    text: str,
    language_id: str,
    path: Path | None = None,
) -> List[Symbol] | None:
    """Extract JS/TS classes, functions and class methods using tree-sitter.

    Recognised nodes:
      * class declarations / class expressions with a name -> "class"
      * function and generator function declarations -> "function"
      * variable declarators whose value is an arrow function -> "function"
      * method definitions with a class ancestor, except ``constructor``
        -> "method"

    Args:
        text: Source code.
        language_id: Language identifier used to pick the grammar.
        path: Optional file path, forwarded to the grammar lookup.

    Returns:
        Symbols in depth-first traversal order, or None when the tree-sitter
        parser class or the grammar is unavailable.
    """
    if TreeSitterParser is None:
        return None

    grammar = _get_tree_sitter_language(language_id, path)
    if grammar is None:
        return None

    ts_parser = TreeSitterParser()
    # Support both binding APIs: a set_language() method when present,
    # otherwise assignment to the ``language`` attribute.
    if hasattr(ts_parser, "set_language"):
        ts_parser.set_language(grammar)  # type: ignore[attr-defined]
    else:
        ts_parser.language = grammar  # type: ignore[assignment]

    source_bytes = text.encode("utf8")
    root = ts_parser.parse(source_bytes).root_node

    class_types = {"class_declaration", "class"}
    function_types = {"function_declaration", "generator_function_declaration"}
    found: List[Symbol] = []

    def record(name: str, kind: str, owner: TreeSitterNode) -> None:
        found.append(Symbol(name=name, kind=kind, range=_node_range(owner)))

    for node in _iter_tree_sitter_nodes(root):
        node_type = node.type

        if node_type in class_types or node_type in function_types:
            ident = node.child_by_field_name("name")
            if ident is not None:
                kind = "class" if node_type in class_types else "function"
                record(_node_text(source_bytes, ident), kind, node)

        elif node_type == "variable_declarator":
            ident = node.child_by_field_name("name")
            value = node.child_by_field_name("value")
            # Only `name = (...) => ...` bindings count as functions.
            is_named_arrow = (
                ident is not None
                and value is not None
                and ident.type in {"identifier", "property_identifier"}
                and value.type == "arrow_function"
            )
            if is_named_arrow:
                record(_node_text(source_bytes, ident), "function", node)

        elif node_type == "method_definition" and _js_has_class_ancestor(node):
            ident = node.child_by_field_name("name")
            if ident is None:
                continue
            method_name = _node_text(source_bytes, ident)
            # Constructors are deliberately not reported as methods.
            if method_name != "constructor":
                record(method_name, "method", node)

    return found
|
||||
|
||||
|
||||
def _parse_js_ts_symbols_regex(text: str) -> List[Symbol]:
|
||||
symbols: List[Symbol] = []
|
||||
in_class = False
|
||||
@@ -338,17 +170,6 @@ def _parse_js_ts_symbols_regex(text: str) -> List[Symbol]:
|
||||
return symbols
|
||||
|
||||
|
||||
def _parse_js_ts_symbols(
    text: str,
    language_id: str = "javascript",
    path: Path | None = None,
) -> List[Symbol]:
    """Extract JS/TS symbols via tree-sitter, falling back to regex parsing.

    Args:
        text: Source code.
        language_id: Language identifier passed to the tree-sitter path.
        path: Optional file path passed to the tree-sitter path.

    Returns:
        The tree-sitter result when it is not None, otherwise the regex
        parser's symbols.
    """
    found = _parse_js_ts_symbols_tree_sitter(text, language_id, path)
    return found if found is not None else _parse_js_ts_symbols_regex(text)
|
||||
|
||||
|
||||
_JAVA_CLASS_RE = re.compile(r"^\s*(?:public\s+)?class\s+([A-Za-z_]\w*)\b")
|
||||
_JAVA_METHOD_RE = re.compile(
|
||||
r"^\s*(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([A-Za-z_]\w*)\s*\("
|
||||
|
||||
98
codex-lens/src/codexlens/parsers/tokenizer.py
Normal file
98
codex-lens/src/codexlens/parsers/tokenizer.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""Token counting utilities for CodexLens.
|
||||
|
||||
Provides accurate token counting using tiktoken with character count fallback.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
try:
|
||||
import tiktoken
|
||||
TIKTOKEN_AVAILABLE = True
|
||||
except ImportError:
|
||||
TIKTOKEN_AVAILABLE = False
|
||||
|
||||
|
||||
class Tokenizer:
    """Token counter with tiktoken primary and character count fallback."""

    def __init__(self, encoding_name: str = "cl100k_base") -> None:
        """Initialize tokenizer.

        Args:
            encoding_name: Tiktoken encoding name (default: cl100k_base for GPT-4)
        """
        self._encoding: Optional[object] = None
        self._encoding_name = encoding_name

        if TIKTOKEN_AVAILABLE:
            try:
                self._encoding = tiktoken.get_encoding(encoding_name)
            except Exception:
                # Fallback to character counting if the encoding cannot be loaded
                self._encoding = None

    def count_tokens(self, text: str) -> int:
        """Count tokens in text.

        Uses the tiktoken encoding when one was loaded, otherwise (or if
        encoding fails) falls back to ``len(text) // 4`` with a minimum of 1.

        Args:
            text: Text to count tokens for

        Returns:
            Token count, or an estimate when the fallback is used; 0 for
            empty text
        """
        if not text:
            return 0

        if self._encoding is not None:
            try:
                # By default tiktoken refuses text that contains special-token
                # literals (e.g. "<|endoftext|>") and raises ValueError, which
                # would silently push such input onto the rough estimate
                # below. disallowed_special=() makes them count as plain text.
                tokens = self._encoding.encode(  # type: ignore[attr-defined]
                    text, disallowed_special=()
                )
                return len(tokens)
            except Exception:
                # Fall through to character count fallback
                pass

        # Fallback: rough estimate using character count
        # Average of ~4 characters per token for English text
        return max(1, len(text) // 4)

    def is_using_tiktoken(self) -> bool:
        """Check if tiktoken is being used.

        Returns:
            True if a tiktoken encoding was successfully initialized
        """
        return self._encoding is not None
|
||||
|
||||
|
||||
# Global default tokenizer instance
|
||||
_default_tokenizer: Optional[Tokenizer] = None
|
||||
|
||||
|
||||
def get_default_tokenizer() -> Tokenizer:
    """Return the module-wide Tokenizer, creating it on first call.

    Returns:
        The shared Tokenizer instance (the same object on every call).
    """
    global _default_tokenizer
    shared = _default_tokenizer
    if shared is None:
        shared = Tokenizer()
        _default_tokenizer = shared
    return shared
|
||||
|
||||
|
||||
def count_tokens(text: str, tokenizer: Optional[Tokenizer] = None) -> int:
    """Count the tokens in ``text``.

    Args:
        text: Text to count tokens for.
        tokenizer: Tokenizer to use; when None, the shared default tokenizer
            is used.

    Returns:
        Whatever the chosen tokenizer's ``count_tokens`` returns for ``text``.
    """
    active = get_default_tokenizer() if tokenizer is None else tokenizer
    return active.count_tokens(text)
|
||||
335
codex-lens/src/codexlens/parsers/treesitter_parser.py
Normal file
335
codex-lens/src/codexlens/parsers/treesitter_parser.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Tree-sitter based parser for CodexLens.
|
||||
|
||||
Provides precise AST-level parsing with fallback to regex-based parsing.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
try:
|
||||
from tree_sitter import Language as TreeSitterLanguage
|
||||
from tree_sitter import Node as TreeSitterNode
|
||||
from tree_sitter import Parser as TreeSitterParser
|
||||
TREE_SITTER_AVAILABLE = True
|
||||
except ImportError:
|
||||
TreeSitterLanguage = None # type: ignore[assignment]
|
||||
TreeSitterNode = None # type: ignore[assignment]
|
||||
TreeSitterParser = None # type: ignore[assignment]
|
||||
TREE_SITTER_AVAILABLE = False
|
||||
|
||||
from codexlens.entities import IndexedFile, Symbol
|
||||
from codexlens.parsers.tokenizer import get_default_tokenizer
|
||||
|
||||
|
||||
class TreeSitterSymbolParser:
    """Parser using tree-sitter for AST-level symbol extraction.

    Supports Python, JavaScript and TypeScript (including TSX, selected by a
    ``.tsx`` file suffix). When tree-sitter or the grammar package is not
    installed the parser reports itself as unavailable and the parse methods
    return None so callers can fall back to another strategy.
    """

    # Grammar objects shared by every instance, keyed by grammar name
    # ("python", "javascript", "typescript", "tsx"). Instances are created
    # per parsed file, so caching avoids rebuilding the grammar each time.
    _language_cache: dict[str, object] = {}

    def __init__(self, language_id: str, path: Optional[Path] = None) -> None:
        """Initialize tree-sitter parser for a language.

        Args:
            language_id: Language identifier (python, javascript, typescript, etc.)
            path: Optional file path for language variant detection (e.g., .tsx)
        """
        self.language_id = language_id
        self.path = path
        self._parser: Optional[object] = None
        self._language: Optional[TreeSitterLanguage] = None
        self._tokenizer = get_default_tokenizer()

        if TREE_SITTER_AVAILABLE:
            self._initialize_parser()

    def _load_language(self) -> Optional[TreeSitterLanguage]:
        """Return the grammar for this parser, using the shared cache.

        Returns:
            The grammar object, or None when ``language_id`` is not supported.

        Raises:
            Exception: Whatever importing or constructing the grammar raises;
                the caller treats any failure as "unavailable".
        """
        key = self.language_id
        if (
            key == "typescript"
            and self.path is not None
            and self.path.suffix.lower() == ".tsx"
        ):
            key = "tsx"

        cached = self._language_cache.get(key)
        if cached is not None:
            return cached  # type: ignore[return-value]

        if key == "python":
            import tree_sitter_python

            language = TreeSitterLanguage(tree_sitter_python.language())
        elif key == "javascript":
            import tree_sitter_javascript

            language = TreeSitterLanguage(tree_sitter_javascript.language())
        elif key == "typescript":
            import tree_sitter_typescript

            language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())
        elif key == "tsx":
            import tree_sitter_typescript

            language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())
        else:
            return None

        # Item assignment mutates the class-level dict shared by all instances.
        self._language_cache[key] = language
        return language

    def _initialize_parser(self) -> None:
        """Initialize tree-sitter parser and language.

        Leaves the parser unavailable (both attributes None) when the
        language is unsupported or anything fails during setup.
        """
        if TreeSitterParser is None or TreeSitterLanguage is None:
            return

        try:
            language = self._load_language()
            if language is None:
                return
            self._language = language

            self._parser = TreeSitterParser()
            # Support both binding APIs: a set_language() method when present,
            # otherwise assignment to the ``language`` attribute.
            if hasattr(self._parser, "set_language"):
                self._parser.set_language(self._language)  # type: ignore[attr-defined]
            else:
                self._parser.language = self._language  # type: ignore[assignment]

        except Exception:
            # Gracefully handle missing language bindings
            self._parser = None
            self._language = None

    def is_available(self) -> bool:
        """Check if tree-sitter parser is available.

        Returns:
            True if parser is initialized and ready
        """
        return self._parser is not None and self._language is not None

    def parse_symbols(self, text: str) -> Optional[List[Symbol]]:
        """Parse source code and extract symbols without creating IndexedFile.

        Args:
            text: Source code text

        Returns:
            List of symbols if parsing succeeds, None if tree-sitter is
            unavailable or parsing raises
        """
        if not self.is_available() or self._parser is None:
            return None

        try:
            source_bytes = text.encode("utf8")
            tree = self._parser.parse(source_bytes)  # type: ignore[attr-defined]
            return self._extract_symbols(source_bytes, tree.root_node)
        except Exception:
            # Gracefully handle parsing errors
            return None

    def parse(self, text: str, path: Path) -> Optional[IndexedFile]:
        """Parse source code and extract symbols.

        Args:
            text: Source code text
            path: File path

        Returns:
            IndexedFile (with an empty ``chunks`` list) if parsing succeeds,
            None if tree-sitter is unavailable or parsing fails
        """
        if not self.is_available() or self._parser is None:
            return None

        try:
            symbols = self.parse_symbols(text)
            if symbols is None:
                return None

            return IndexedFile(
                path=str(path.resolve()),
                language=self.language_id,
                symbols=symbols,
                chunks=[],
            )
        except Exception:
            # Gracefully handle parsing errors
            return None

    def _extract_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]:
        """Extract symbols from AST.

        Args:
            source_bytes: Source code as bytes
            root: Root AST node

        Returns:
            List of extracted symbols (empty for unsupported languages)
        """
        if self.language_id == "python":
            return self._extract_python_symbols(source_bytes, root)
        if self.language_id in {"javascript", "typescript"}:
            return self._extract_js_ts_symbols(source_bytes, root)
        return []

    def _extract_python_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]:
        """Extract Python symbols from AST.

        Args:
            source_bytes: Source code as bytes
            root: Root AST node

        Returns:
            List of Python symbols (classes, functions, methods)
        """
        symbols: List[Symbol] = []

        for node in self._iter_nodes(root):
            if node.type == "class_definition":
                name_node = node.child_by_field_name("name")
                if name_node is None:
                    continue
                symbols.append(Symbol(
                    name=self._node_text(source_bytes, name_node),
                    kind="class",
                    range=self._node_range(node),
                ))
            elif node.type in {"function_definition", "async_function_definition"}:
                name_node = node.child_by_field_name("name")
                if name_node is None:
                    continue
                symbols.append(Symbol(
                    name=self._node_text(source_bytes, name_node),
                    kind=self._python_function_kind(node),
                    range=self._node_range(node),
                ))

        return symbols

    def _extract_js_ts_symbols(self, source_bytes: bytes, root: TreeSitterNode) -> List[Symbol]:
        """Extract JavaScript/TypeScript symbols from AST.

        Args:
            source_bytes: Source code as bytes
            root: Root AST node

        Returns:
            List of JS/TS symbols (classes, functions, methods)
        """
        symbols: List[Symbol] = []

        for node in self._iter_nodes(root):
            if node.type in {"class_declaration", "class"}:
                name_node = node.child_by_field_name("name")
                if name_node is None:
                    continue
                symbols.append(Symbol(
                    name=self._node_text(source_bytes, name_node),
                    kind="class",
                    range=self._node_range(node),
                ))
            elif node.type in {"function_declaration", "generator_function_declaration"}:
                name_node = node.child_by_field_name("name")
                if name_node is None:
                    continue
                symbols.append(Symbol(
                    name=self._node_text(source_bytes, name_node),
                    kind="function",
                    range=self._node_range(node),
                ))
            elif node.type == "variable_declarator":
                name_node = node.child_by_field_name("name")
                value_node = node.child_by_field_name("value")
                # Only `name = (...) => ...` bindings count as functions.
                if (
                    name_node is None
                    or value_node is None
                    or name_node.type not in {"identifier", "property_identifier"}
                    or value_node.type != "arrow_function"
                ):
                    continue
                symbols.append(Symbol(
                    name=self._node_text(source_bytes, name_node),
                    kind="function",
                    range=self._node_range(node),
                ))
            elif node.type == "method_definition" and self._has_class_ancestor(node):
                name_node = node.child_by_field_name("name")
                if name_node is None:
                    continue
                name = self._node_text(source_bytes, name_node)
                # Constructors are deliberately not reported as methods.
                if name == "constructor":
                    continue
                symbols.append(Symbol(
                    name=name,
                    kind="method",
                    range=self._node_range(node),
                ))

        return symbols

    def _python_function_kind(self, node: TreeSitterNode) -> str:
        """Determine if Python function is a method or standalone function.

        The nearest enclosing definition decides, so a function nested inside
        a method is still a plain function.

        Args:
            node: Function definition node

        Returns:
            'method' if directly inside a class, 'function' otherwise
        """
        parent = node.parent
        while parent is not None:
            if parent.type in {"function_definition", "async_function_definition"}:
                return "function"
            if parent.type == "class_definition":
                return "method"
            parent = parent.parent
        return "function"

    def _has_class_ancestor(self, node: TreeSitterNode) -> bool:
        """Check if node belongs to a class rather than an object literal.

        Walks up the parent chain. Reaching a class node first means the node
        is a class member; reaching an ``object`` node first means it is a
        member of an object literal (even one nested inside a class) and is
        therefore not a class member.

        Args:
            node: AST node to check

        Returns:
            True if the nearest enclosing class/object container is a class
        """
        parent = node.parent
        while parent is not None:
            if parent.type in {"class_declaration", "class"}:
                return True
            if parent.type == "object":
                # Object-literal shorthand method: not a method of an outer class.
                return False
            parent = parent.parent
        return False

    def _iter_nodes(self, root: TreeSitterNode):
        """Iterate over all nodes in AST.

        Args:
            root: Root node to start iteration

        Yields:
            AST nodes in depth-first (pre-order, left-to-right) order
        """
        stack = [root]
        while stack:
            node = stack.pop()
            yield node
            # Reversed so the leftmost child is popped first.
            stack.extend(reversed(node.children))

    def _node_text(self, source_bytes: bytes, node: TreeSitterNode) -> str:
        """Extract text for a node.

        Args:
            source_bytes: Source code as bytes
            node: AST node

        Returns:
            Text content of node
        """
        return source_bytes[node.start_byte:node.end_byte].decode("utf8")

    def _node_range(self, node: TreeSitterNode) -> tuple[int, int]:
        """Get line range for a node.

        Args:
            node: AST node

        Returns:
            (start_line, end_line) tuple, 1-based inclusive
        """
        start_line = node.start_point[0] + 1
        end_line = node.end_point[0] + 1
        return (start_line, max(start_line, end_line))

    def count_tokens(self, text: str) -> int:
        """Count tokens in text.

        Args:
            text: Text to count tokens for

        Returns:
            Token count
        """
        return self._tokenizer.count_tokens(text)
|
||||
Reference in New Issue
Block a user