Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-10 02:24:35 +08:00
feat: Add support for Tree-Sitter parsing and enhance SQLite storage performance
@@ -17,6 +17,9 @@ dependencies = [
     "rich>=13",
     "pydantic>=2.0",
     "tree-sitter>=0.20",
+    "tree-sitter-python>=0.25",
+    "tree-sitter-javascript>=0.25",
+    "tree-sitter-typescript>=0.23",
     "pathspec>=0.11",
 ]
 
@@ -24,6 +27,7 @@ dependencies = [
 semantic = [
     "numpy>=1.24",
     "sentence-transformers>=2.2",
+    "fastembed>=0.2",
 ]
 
 [project.urls]
@@ -31,4 +35,3 @@ Homepage = "https://github.com/openai/codex-lens"
 
 [tool.setuptools]
 package-dir = { "" = "src" }
-
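Note: the tree-sitter core binding was already a dependency; this change pins the per-language grammar wheels (and adds fastembed to the semantic extra). The grammars remain effectively optional in practice because the parser factory below only uses them when they import cleanly. A small availability probe, written here purely as an illustration and not part of the commit:

# Sketch: report which optional Tree-Sitter grammar bindings are importable.
# Package names are taken from the dependency list above.
import importlib

def available_grammars() -> dict:
    status = {}
    for name in ("tree_sitter_python", "tree_sitter_javascript", "tree_sitter_typescript"):
        try:
            importlib.import_module(name)
            status[name] = True
        except Exception:
            status[name] = False
    return status

print(available_grammars())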
@@ -62,25 +62,42 @@ def _iter_source_files(
     languages: Optional[List[str]] = None,
 ) -> Iterable[Path]:
     ignore_dirs = {".git", ".venv", "venv", "node_modules", "__pycache__", ".codexlens"}
-    ignore_patterns = _load_gitignore(base_path)
-    pathspec = None
-    if ignore_patterns:
-        try:
-            from pathspec import PathSpec
-            from pathspec.patterns.gitwildmatch import GitWildMatchPattern
-
-            pathspec = PathSpec.from_lines(GitWildMatchPattern, ignore_patterns)
-        except Exception:
-            pathspec = None
+    # Cache for PathSpec objects per directory
+    pathspec_cache: Dict[Path, Optional[Any]] = {}
+
+    def get_pathspec_for_dir(dir_path: Path) -> Optional[Any]:
+        """Get PathSpec for a directory, loading .gitignore if present."""
+        if dir_path in pathspec_cache:
+            return pathspec_cache[dir_path]
+
+        ignore_patterns = _load_gitignore(dir_path)
+        if not ignore_patterns:
+            pathspec_cache[dir_path] = None
+            return None
+
+        try:
+            from pathspec import PathSpec
+            from pathspec.patterns.gitwildmatch import GitWildMatchPattern
+
+            pathspec = PathSpec.from_lines(GitWildMatchPattern, ignore_patterns)
+            pathspec_cache[dir_path] = pathspec
+            return pathspec
+        except Exception:
+            pathspec_cache[dir_path] = None
+            return None
 
     for root, dirs, files in os.walk(base_path):
         dirs[:] = [d for d in dirs if d not in ignore_dirs and not d.startswith(".")]
         root_path = Path(root)
+
+        # Get pathspec for current directory
+        pathspec = get_pathspec_for_dir(root_path)
+
         for file in files:
             if file.startswith("."):
                 continue
             full_path = root_path / file
-            rel = full_path.relative_to(base_path)
+            rel = full_path.relative_to(root_path)
             if pathspec and pathspec.match_file(str(rel)):
                 continue
             language_id = config.language_for_path(full_path)
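Note: indexing now builds one PathSpec per directory (cached in pathspec_cache) instead of a single repo-root spec, and matches candidate files relative to the directory whose .gitignore is being applied (relative_to(root_path) instead of relative_to(base_path)). A standalone sketch of the pathspec matching this relies on; the patterns below are examples, not taken from the diff:

# Sketch: gitignore-style matching with the pathspec library, as used above.
from pathspec import PathSpec
from pathspec.patterns.gitwildmatch import GitWildMatchPattern

patterns = ["*.log", "build/"]            # e.g. lines read from a .gitignore
spec = PathSpec.from_lines(GitWildMatchPattern, patterns)

print(spec.match_file("debug.log"))       # True
print(spec.match_file("build/out.o"))     # True
print(spec.match_file("src/main.py"))     # False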
@@ -112,6 +129,25 @@ def _get_store_for_path(path: Path, use_global: bool = False) -> tuple[SQLiteSto
     return SQLiteStore(config.db_path), config.db_path
 
 
+def _is_safe_to_clean(target_dir: Path) -> bool:
+    """Verify directory is a CodexLens directory before deletion.
+
+    Checks for presence of .codexlens directory or index.db file.
+    """
+    if not target_dir.exists():
+        return True
+
+    # Check if it's the .codexlens directory itself
+    if target_dir.name == ".codexlens":
+        # Verify it contains index.db or cache directory
+        return (target_dir / "index.db").exists() or (target_dir / "cache").exists()
+
+    # Check if it contains .codexlens subdirectory
+    return (target_dir / ".codexlens").exists()
+
+
 @app.command()
 def init(
     path: Path = typer.Argument(Path("."), exists=True, file_okay=False, dir_okay=True, help="Project root to index."),
@@ -469,12 +505,16 @@ def clean(
     config = Config()
     import shutil
     if config.index_dir.exists():
+        if not _is_safe_to_clean(config.index_dir):
+            raise CodexLensError(f"Safety check failed: {config.index_dir} does not appear to be a CodexLens directory")
         shutil.rmtree(config.index_dir)
         result = {"cleaned": str(config.index_dir), "type": "global"}
     else:
         workspace = WorkspaceConfig.from_path(base_path)
         if workspace and workspace.codexlens_dir.exists():
             import shutil
+            if not _is_safe_to_clean(workspace.codexlens_dir):
+                raise CodexLensError(f"Safety check failed: {workspace.codexlens_dir} does not appear to be a CodexLens directory")
             shutil.rmtree(workspace.codexlens_dir)
             result = {"cleaned": str(workspace.codexlens_dir), "type": "workspace"}
         else:
@@ -1,8 +1,8 @@
 """Parser factory for CodexLens.
 
-The project currently ships lightweight regex-based parsers per language.
-They can be swapped for tree-sitter based parsers later without changing
-CLI or storage interfaces.
+Python and JavaScript/TypeScript parsing use Tree-Sitter grammars when
+available. Regex fallbacks are retained to preserve the existing parser
+interface and behavior in minimal environments.
 """
 
 from __future__ import annotations
@@ -10,7 +10,16 @@ from __future__ import annotations
 import re
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, List, Optional, Protocol
+from typing import Dict, Iterable, List, Optional, Protocol
+
+try:
+    from tree_sitter import Language as TreeSitterLanguage
+    from tree_sitter import Node as TreeSitterNode
+    from tree_sitter import Parser as TreeSitterParser
+except Exception:  # pragma: no cover
+    TreeSitterLanguage = None  # type: ignore[assignment]
+    TreeSitterNode = None  # type: ignore[assignment]
+    TreeSitterParser = None  # type: ignore[assignment]
 
 from codexlens.config import Config
 from codexlens.entities import IndexedFile, Symbol
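Note: together with the new module docstring, the guarded import above sets up the pattern the rest of the file follows: use Tree-Sitter when the bindings and a grammar import, otherwise fall back to the regex scanners. A self-contained sketch of that shape, with illustrative helper names rather than the ones defined later in this diff:

# Sketch of the "Tree-Sitter when available, regex otherwise" dispatch shape.
import re

def parse_with_tree_sitter(text: str):
    try:
        import tree_sitter  # noqa: F401  (a grammar package is also needed)
    except Exception:
        return None   # unavailable: tell the caller to fall back
    return []         # placeholder for the real tree-walking implementation

def parse_with_regex(text: str):
    return re.findall(r"^\s*(?:async\s+)?def\s+(\w+)", text, flags=re.M)

def parse_symbol_names(text: str):
    names = parse_with_tree_sitter(text)
    return names if names is not None else parse_with_regex(text)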
@@ -25,11 +34,10 @@ class SimpleRegexParser:
     language_id: str
 
     def parse(self, text: str, path: Path) -> IndexedFile:
-        symbols: List[Symbol] = []
         if self.language_id == "python":
             symbols = _parse_python_symbols(text)
         elif self.language_id in {"javascript", "typescript"}:
-            symbols = _parse_js_ts_symbols(text)
+            symbols = _parse_js_ts_symbols(text, self.language_id, path)
         elif self.language_id == "java":
             symbols = _parse_java_symbols(text)
         elif self.language_id == "go":
@@ -57,24 +65,135 @@ class ParserFactory:
 
 
 _PY_CLASS_RE = re.compile(r"^\s*class\s+([A-Za-z_]\w*)\b")
-_PY_DEF_RE = re.compile(r"^\s*def\s+([A-Za-z_]\w*)\s*\(")
+_PY_DEF_RE = re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(")
+
+_TREE_SITTER_LANGUAGE_CACHE: Dict[str, TreeSitterLanguage] = {}
 
 
-def _parse_python_symbols(text: str) -> List[Symbol]:
+def _get_tree_sitter_language(language_id: str, path: Path | None = None) -> TreeSitterLanguage | None:
+    if TreeSitterLanguage is None:
+        return None
+
+    cache_key = language_id
+    if language_id == "typescript" and path is not None and path.suffix.lower() == ".tsx":
+        cache_key = "tsx"
+
+    cached = _TREE_SITTER_LANGUAGE_CACHE.get(cache_key)
+    if cached is not None:
+        return cached
+
+    try:
+        if cache_key == "python":
+            import tree_sitter_python  # type: ignore[import-not-found]
+
+            language = TreeSitterLanguage(tree_sitter_python.language())
+        elif cache_key == "javascript":
+            import tree_sitter_javascript  # type: ignore[import-not-found]
+
+            language = TreeSitterLanguage(tree_sitter_javascript.language())
+        elif cache_key == "typescript":
+            import tree_sitter_typescript  # type: ignore[import-not-found]
+
+            language = TreeSitterLanguage(tree_sitter_typescript.language_typescript())
+        elif cache_key == "tsx":
+            import tree_sitter_typescript  # type: ignore[import-not-found]
+
+            language = TreeSitterLanguage(tree_sitter_typescript.language_tsx())
+        else:
+            return None
+    except Exception:
+        return None
+
+    _TREE_SITTER_LANGUAGE_CACHE[cache_key] = language
+    return language
+
+
+def _iter_tree_sitter_nodes(root: TreeSitterNode) -> Iterable[TreeSitterNode]:
+    stack: List[TreeSitterNode] = [root]
+    while stack:
+        node = stack.pop()
+        yield node
+        for child in reversed(node.children):
+            stack.append(child)
+
+
+def _node_text(source_bytes: bytes, node: TreeSitterNode) -> str:
+    return source_bytes[node.start_byte:node.end_byte].decode("utf8")
+
+
+def _node_range(node: TreeSitterNode) -> tuple[int, int]:
+    start_line = node.start_point[0] + 1
+    end_line = node.end_point[0] + 1
+    return (start_line, max(start_line, end_line))
+
+
+def _python_kind_for_function_node(node: TreeSitterNode) -> str:
+    parent = node.parent
+    while parent is not None:
+        if parent.type in {"function_definition", "async_function_definition"}:
+            return "function"
+        if parent.type == "class_definition":
+            return "method"
+        parent = parent.parent
+    return "function"
+
+
+def _parse_python_symbols_tree_sitter(text: str) -> List[Symbol] | None:
+    if TreeSitterParser is None:
+        return None
+
+    language = _get_tree_sitter_language("python")
+    if language is None:
+        return None
+
+    parser = TreeSitterParser()
+    if hasattr(parser, "set_language"):
+        parser.set_language(language)  # type: ignore[attr-defined]
+    else:
+        parser.language = language  # type: ignore[assignment]
+
+    source_bytes = text.encode("utf8")
+    tree = parser.parse(source_bytes)
+    root = tree.root_node
+
+    symbols: List[Symbol] = []
+    for node in _iter_tree_sitter_nodes(root):
+        if node.type == "class_definition":
+            name_node = node.child_by_field_name("name")
+            if name_node is None:
+                continue
+            symbols.append(Symbol(
+                name=_node_text(source_bytes, name_node),
+                kind="class",
+                range=_node_range(node),
+            ))
+        elif node.type in {"function_definition", "async_function_definition"}:
+            name_node = node.child_by_field_name("name")
+            if name_node is None:
+                continue
+            symbols.append(Symbol(
+                name=_node_text(source_bytes, name_node),
+                kind=_python_kind_for_function_node(node),
+                range=_node_range(node),
+            ))
+
+    return symbols
+
+
+def _parse_python_symbols_regex(text: str) -> List[Symbol]:
     symbols: List[Symbol] = []
     current_class_indent: Optional[int] = None
     for i, line in enumerate(text.splitlines(), start=1):
-        if _PY_CLASS_RE.match(line):
-            name = _PY_CLASS_RE.match(line).group(1)
+        class_match = _PY_CLASS_RE.match(line)
+        if class_match:
             current_class_indent = len(line) - len(line.lstrip(" "))
-            symbols.append(Symbol(name=name, kind="class", range=(i, i)))
+            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
             continue
         def_match = _PY_DEF_RE.match(line)
         if def_match:
-            name = def_match.group(1)
             indent = len(line) - len(line.lstrip(" "))
             kind = "method" if current_class_indent is not None and indent > current_class_indent else "function"
-            symbols.append(Symbol(name=name, kind=kind, range=(i, i)))
+            symbols.append(Symbol(name=def_match.group(1), kind=kind, range=(i, i)))
             continue
         if current_class_indent is not None:
             indent = len(line) - len(line.lstrip(" "))
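Note: the Tree-Sitter path above reduces to: wrap the grammar in a Language, attach it to a Parser (newer bindings set .language, older ones use set_language, hence the hasattr check), parse UTF-8 bytes, then walk root_node collecting class and function definitions. A compressed standalone version of that flow, assuming the tree-sitter and tree-sitter-python packages are installed; this is an illustration, not the committed code:

# Sketch: the parse flow used by _parse_python_symbols_tree_sitter.
import tree_sitter_python
from tree_sitter import Language, Parser

language = Language(tree_sitter_python.language())
parser = Parser()
parser.language = language  # older bindings: parser.set_language(language)

source = b"class Greeter:\n    def hello(self):\n        return 'hi'\n"
tree = parser.parse(source)

for node in tree.root_node.children:
    print(node.type, node.start_point[0] + 1, node.end_point[0] + 1)
# -> class_definition 1 3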
@@ -83,23 +202,153 @@ def _parse_python_symbols(text: str) -> List[Symbol]:
     return symbols
 
 
-_JS_FUNC_RE = re.compile(r"^\s*(?:export\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(")
+def _parse_python_symbols(text: str) -> List[Symbol]:
+    symbols = _parse_python_symbols_tree_sitter(text)
+    if symbols is not None:
+        return symbols
+    return _parse_python_symbols_regex(text)
+
+
+_JS_FUNC_RE = re.compile(r"^\s*(?:export\s+)?(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(")
 _JS_CLASS_RE = re.compile(r"^\s*(?:export\s+)?class\s+([A-Za-z_$][\w$]*)\b")
+_JS_ARROW_RE = re.compile(
+    r"^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?\(?[^)]*\)?\s*=>"
+)
+_JS_METHOD_RE = re.compile(r"^\s+(?:async\s+)?([A-Za-z_$][\w$]*)\s*\([^)]*\)\s*\{")
 
 
-def _parse_js_ts_symbols(text: str) -> List[Symbol]:
+def _js_has_class_ancestor(node: TreeSitterNode) -> bool:
+    parent = node.parent
+    while parent is not None:
+        if parent.type in {"class_declaration", "class"}:
+            return True
+        parent = parent.parent
+    return False
+
+
+def _parse_js_ts_symbols_tree_sitter(
+    text: str,
+    language_id: str,
+    path: Path | None = None,
+) -> List[Symbol] | None:
+    if TreeSitterParser is None:
+        return None
+
+    language = _get_tree_sitter_language(language_id, path)
+    if language is None:
+        return None
+
+    parser = TreeSitterParser()
+    if hasattr(parser, "set_language"):
+        parser.set_language(language)  # type: ignore[attr-defined]
+    else:
+        parser.language = language  # type: ignore[assignment]
+
+    source_bytes = text.encode("utf8")
+    tree = parser.parse(source_bytes)
+    root = tree.root_node
+
     symbols: List[Symbol] = []
+    for node in _iter_tree_sitter_nodes(root):
+        if node.type in {"class_declaration", "class"}:
+            name_node = node.child_by_field_name("name")
+            if name_node is None:
+                continue
+            symbols.append(Symbol(
+                name=_node_text(source_bytes, name_node),
+                kind="class",
+                range=_node_range(node),
+            ))
+        elif node.type in {"function_declaration", "generator_function_declaration"}:
+            name_node = node.child_by_field_name("name")
+            if name_node is None:
+                continue
+            symbols.append(Symbol(
+                name=_node_text(source_bytes, name_node),
+                kind="function",
+                range=_node_range(node),
+            ))
+        elif node.type == "variable_declarator":
+            name_node = node.child_by_field_name("name")
+            value_node = node.child_by_field_name("value")
+            if (
+                name_node is None
+                or value_node is None
+                or name_node.type not in {"identifier", "property_identifier"}
+                or value_node.type != "arrow_function"
+            ):
+                continue
+            symbols.append(Symbol(
+                name=_node_text(source_bytes, name_node),
+                kind="function",
+                range=_node_range(node),
+            ))
+        elif node.type == "method_definition" and _js_has_class_ancestor(node):
+            name_node = node.child_by_field_name("name")
+            if name_node is None:
+                continue
+            name = _node_text(source_bytes, name_node)
+            if name == "constructor":
+                continue
+            symbols.append(Symbol(
+                name=name,
+                kind="method",
+                range=_node_range(node),
+            ))
+
+    return symbols
+
+
+def _parse_js_ts_symbols_regex(text: str) -> List[Symbol]:
+    symbols: List[Symbol] = []
+    in_class = False
+    class_brace_depth = 0
+    brace_depth = 0
+
     for i, line in enumerate(text.splitlines(), start=1):
+        brace_depth += line.count("{") - line.count("}")
+
+        class_match = _JS_CLASS_RE.match(line)
+        if class_match:
+            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
+            in_class = True
+            class_brace_depth = brace_depth
+            continue
+
+        if in_class and brace_depth < class_brace_depth:
+            in_class = False
+
         func_match = _JS_FUNC_RE.match(line)
         if func_match:
             symbols.append(Symbol(name=func_match.group(1), kind="function", range=(i, i)))
             continue
-        class_match = _JS_CLASS_RE.match(line)
-        if class_match:
-            symbols.append(Symbol(name=class_match.group(1), kind="class", range=(i, i)))
+
+        arrow_match = _JS_ARROW_RE.match(line)
+        if arrow_match:
+            symbols.append(Symbol(name=arrow_match.group(1), kind="function", range=(i, i)))
+            continue
+
+        if in_class:
+            method_match = _JS_METHOD_RE.match(line)
+            if method_match:
+                name = method_match.group(1)
+                if name != "constructor":
+                    symbols.append(Symbol(name=name, kind="method", range=(i, i)))
+
     return symbols
 
 
+def _parse_js_ts_symbols(
+    text: str,
+    language_id: str = "javascript",
+    path: Path | None = None,
+) -> List[Symbol]:
+    symbols = _parse_js_ts_symbols_tree_sitter(text, language_id, path)
+    if symbols is not None:
+        return symbols
+    return _parse_js_ts_symbols_regex(text)
+
+
 _JAVA_CLASS_RE = re.compile(r"^\s*(?:public\s+)?class\s+([A-Za-z_]\w*)\b")
 _JAVA_METHOD_RE = re.compile(
     r"^\s*(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([A-Za-z_]\w*)\s*\(
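Note: the public entry point keeps its old name but gains language_id and path parameters so .tsx files can select the TSX grammar, and it falls back to the extended regex scanner (which now also catches async functions, arrow functions, and class methods) when no grammar is importable. A usage sketch based on the signature in this hunk, assuming the codexlens package is importable:

# Sketch: calling the JS/TS symbol parser with the new signature.
from pathlib import Path

from codexlens.parsers.factory import _parse_js_ts_symbols

code = "export const add = (a, b) => a + b\nclass Calc {\n  square(x) { return x * x }\n}"
for sym in _parse_js_ts_symbols(code, language_id="typescript", path=Path("calc.tsx")):
    print(sym.name, sym.kind, sym.range)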
@@ -151,4 +400,3 @@ def _parse_generic_symbols(text: str) -> List[Symbol]:
         if def_match:
             symbols.append(Symbol(name=def_match.group(1), kind="function", range=(i, i)))
     return symbols
-
@@ -118,6 +118,59 @@ class SQLiteStore:
             )
             conn.commit()
 
+    def add_files(self, files_data: List[tuple[IndexedFile, str]]) -> None:
+        """Add multiple files in a single transaction for better performance.
+
+        Args:
+            files_data: List of (indexed_file, content) tuples
+        """
+        with self._lock:
+            conn = self._get_connection()
+            try:
+                conn.execute("BEGIN")
+
+                for indexed_file, content in files_data:
+                    path = str(Path(indexed_file.path).resolve())
+                    language = indexed_file.language
+                    mtime = Path(path).stat().st_mtime if Path(path).exists() else None
+                    line_count = content.count(chr(10)) + 1
+
+                    conn.execute(
+                        """
+                        INSERT INTO files(path, language, content, mtime, line_count)
+                        VALUES(?, ?, ?, ?, ?)
+                        ON CONFLICT(path) DO UPDATE SET
+                            language=excluded.language,
+                            content=excluded.content,
+                            mtime=excluded.mtime,
+                            line_count=excluded.line_count
+                        """,
+                        (path, language, content, mtime, line_count),
+                    )
+
+                    row = conn.execute("SELECT id FROM files WHERE path=?", (path,)).fetchone()
+                    if not row:
+                        raise StorageError(f"Failed to read file id for {path}")
+                    file_id = int(row["id"])
+
+                    conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
+                    if indexed_file.symbols:
+                        conn.executemany(
+                            """
+                            INSERT INTO symbols(file_id, name, kind, start_line, end_line)
+                            VALUES(?, ?, ?, ?, ?)
+                            """,
+                            [
+                                (file_id, s.name, s.kind, s.range[0], s.range[1])
+                                for s in indexed_file.symbols
+                            ],
+                        )
+
+                conn.commit()
+            except Exception:
+                conn.rollback()
+                raise
+
     def remove_file(self, path: str | Path) -> bool:
         """Remove a file from the index."""
         with self._lock:
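Note: add_files is where the storage speed-up in the commit title comes from: the whole batch is written under a single explicit BEGIN/COMMIT instead of one commit per file. A standalone sqlite3 illustration of that pattern (not project code):

# Sketch: batching rows into one transaction instead of per-row commits.
import sqlite3

conn = sqlite3.connect(":memory:", isolation_level=None)  # explicit transaction control
conn.execute("CREATE TABLE files(path TEXT PRIMARY KEY, line_count INTEGER)")

rows = [(f"file_{i}.py", i) for i in range(1000)]

conn.execute("BEGIN")
try:
    conn.executemany(
        "INSERT INTO files(path, line_count) VALUES(?, ?) "
        "ON CONFLICT(path) DO UPDATE SET line_count=excluded.line_count",
        rows,
    )
    conn.execute("COMMIT")   # one journal flush for the whole batch
except Exception:
    conn.execute("ROLLBACK")
    raise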
@@ -178,7 +231,7 @@ class SQLiteStore:
         results: List[SearchResult] = []
         for row in rows:
             rank = float(row["rank"]) if row["rank"] is not None else 0.0
-            score = max(0.0, -rank)
+            score = abs(rank) if rank < 0 else 0.0
             results.append(
                 SearchResult(
                     path=row["path"],
codex-lens/tests/test_parsers.py (new file, 148 lines)
@@ -0,0 +1,148 @@
+"""Tests for CodexLens parsers."""
+
+from pathlib import Path
+
+import pytest
+
+from codexlens.parsers.factory import (
+    SimpleRegexParser,
+    _parse_js_ts_symbols,
+    _parse_python_symbols,
+)
+
+
+TREE_SITTER_JS_AVAILABLE = True
+try:
+    import tree_sitter_javascript  # type: ignore[import-not-found] # noqa: F401
+except Exception:
+    TREE_SITTER_JS_AVAILABLE = False
+
+
+class TestPythonParser:
+    """Tests for Python symbol parsing."""
+
+    def test_parse_function(self):
+        code = "def hello():\n    pass"
+        symbols = _parse_python_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "hello"
+        assert symbols[0].kind == "function"
+
+    def test_parse_async_function(self):
+        code = "async def fetch_data():\n    pass"
+        symbols = _parse_python_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "fetch_data"
+        assert symbols[0].kind == "function"
+
+    def test_parse_class(self):
+        code = "class MyClass:\n    pass"
+        symbols = _parse_python_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "MyClass"
+        assert symbols[0].kind == "class"
+
+    def test_parse_method(self):
+        code = "class MyClass:\n    def method(self):\n        pass"
+        symbols = _parse_python_symbols(code)
+        assert len(symbols) == 2
+        assert symbols[0].name == "MyClass"
+        assert symbols[0].kind == "class"
+        assert symbols[1].name == "method"
+        assert symbols[1].kind == "method"
+
+    def test_parse_async_method(self):
+        code = "class MyClass:\n    async def async_method(self):\n        pass"
+        symbols = _parse_python_symbols(code)
+        assert len(symbols) == 2
+        assert symbols[1].name == "async_method"
+        assert symbols[1].kind == "method"
+
+
+class TestJavaScriptParser:
+    """Tests for JavaScript/TypeScript symbol parsing."""
+
+    def test_parse_function(self):
+        code = "function hello() {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "hello"
+        assert symbols[0].kind == "function"
+
+    def test_parse_async_function(self):
+        code = "async function fetchData() {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "fetchData"
+        assert symbols[0].kind == "function"
+
+    def test_parse_arrow_function(self):
+        code = "const hello = () => {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "hello"
+        assert symbols[0].kind == "function"
+
+    def test_parse_async_arrow_function(self):
+        code = "const fetchData = async () => {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "fetchData"
+        assert symbols[0].kind == "function"
+
+    def test_parse_class(self):
+        code = "class MyClass {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "MyClass"
+        assert symbols[0].kind == "class"
+
+    def test_parse_export_function(self):
+        code = "export function hello() {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "hello"
+        assert symbols[0].kind == "function"
+
+    def test_parse_export_class(self):
+        code = "export class MyClass {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "MyClass"
+        assert symbols[0].kind == "class"
+
+    def test_parse_export_arrow_function(self):
+        code = "export const hello = () => {}"
+        symbols = _parse_js_ts_symbols(code)
+        assert len(symbols) == 1
+        assert symbols[0].name == "hello"
+        assert symbols[0].kind == "function"
+
+    @pytest.mark.skipif(not TREE_SITTER_JS_AVAILABLE, reason="tree-sitter-javascript not installed")
+    def test_parse_class_methods(self):
+        code = (
+            "class MyClass {\n"
+            "  method() {}\n"
+            "  async asyncMethod() {}\n"
+            "  static staticMethod() {}\n"
+            "  constructor() {}\n"
+            "}"
+        )
+        symbols = _parse_js_ts_symbols(code)
+        names_kinds = [(s.name, s.kind) for s in symbols]
+        assert ("MyClass", "class") in names_kinds
+        assert ("method", "method") in names_kinds
+        assert ("asyncMethod", "method") in names_kinds
+        assert ("staticMethod", "method") in names_kinds
+        assert all(name != "constructor" for name, _ in names_kinds)
+
+
+class TestParserInterface:
+    """High-level interface tests."""
+
+    def test_simple_parser_parse(self):
+        parser = SimpleRegexParser("python")
+        indexed = parser.parse("def hello():\n    pass", Path("test.py"))
+        assert indexed.language == "python"
+        assert len(indexed.symbols) == 1
+        assert indexed.symbols[0].name == "hello"
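Note: the tests exercise both parse paths, and the class-method test is skipped when tree-sitter-javascript is missing; they can be run with pytest codex-lens/tests/test_parsers.py. A quick manual check mirroring TestParserInterface, assuming the codexlens package is installed:

# Sketch: exercising the high-level parser interface covered above.
from pathlib import Path

from codexlens.parsers.factory import SimpleRegexParser

indexed = SimpleRegexParser("python").parse("async def fetch_data():\n    pass", Path("demo.py"))
print(indexed.language, [(s.name, s.kind) for s in indexed.symbols])
# -> python [('fetch_data', 'function')]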