feat: add experimental support for AST parsing and static graph indexing

- Introduced CLI options for using AST grep parsers and enabling static graph relationships during indexing.
- Updated configuration management to load new settings for AST parsing and static graph types.
- Enhanced AST grep processor to handle imports with aliases and improve relationship tracking.
- Modified TreeSitter parsers to support synthetic module scopes for better static graph persistence.
- Implemented global relationship updates in the incremental indexer for static graph expansion.
- Added new ArtifactTag and FloatingFileBrowser components to the frontend for improved terminal dashboard functionality.
- Created utility functions for detecting CCW artifacts in terminal output with associated tests.
This commit is contained in:
catlog22
2026-02-15 23:12:06 +08:00
parent 48a6a1f2aa
commit 8938c47f88
39 changed files with 2956 additions and 297 deletions

View File

@@ -125,6 +125,13 @@ CODEXLENS_DEBUG=false
"tool": "gemini",
"timeout_ms": 300000,
"batch_size": 5
},
"parsing": {
"use_astgrep": false
},
"indexing": {
"static_graph_enabled": false,
"static_graph_relationship_types": ["imports", "inherits"]
}
}
```
@@ -167,6 +174,32 @@ CODEXLENS_DEBUG=false
| `timeout_ms` | int | 超时时间 (毫秒) |
| `batch_size` | int | 批处理大小 |
### Parsing 设置
| 字段 | 类型 | 说明 |
|------|------|------|
| `use_astgrep` | bool | 优先使用 ast-grep 解析关系(实验性;当前主要用于 Python relationships) |
### Indexing 设置(静态图)
| 字段 | 类型 | 说明 |
|------|------|------|
| `static_graph_enabled` | bool | 索引时将 relationships 写入全局 `global_relationships`,用于搜索阶段静态图扩展 |
| `static_graph_relationship_types` | array | 允许持久化的关系类型:`imports` / `inherits` / `calls` |
**CLI 覆盖(单次运行,不写入 settings.json)**:
```bash
# 索引时启用静态图 relationships + 使用 ast-grep(如果可用)
codexlens index init --use-astgrep --static-graph --static-graph-types imports,inherits,calls
```
**Search staged 静态图扩展(高级)**:
```bash
codexlens search --cascade-strategy staged --staged-stage2-mode static_global_graph
```
## FastEmbed 模型配置文件
使用 `fastembed` 后端时的预定义模型:

View File

@@ -23,8 +23,8 @@ dependencies = [
"pathspec>=0.11",
"watchdog>=3.0",
# ast-grep for pattern-based AST matching (PyO3 bindings)
# Note: May have compatibility issues with Python 3.13
"ast-grep-py>=0.3.0; python_version < '3.13'",
# ast-grep-py 0.40+ supports Python 3.13
"ast-grep-py>=0.40.0",
]
[project.optional-dependencies]

View File

@@ -126,6 +126,21 @@ def index_init(
no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation (if semantic deps installed)."),
backend: Optional[str] = typer.Option(None, "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API). Defaults to settings.json config."),
model: Optional[str] = typer.Option(None, "--model", "-m", help="Embedding model: profile name for fastembed or model name for litellm. Defaults to settings.json config."),
use_astgrep: Optional[bool] = typer.Option(
None,
"--use-astgrep/--no-use-astgrep",
help="Prefer ast-grep parsers when available (experimental). Overrides settings.json config.",
),
static_graph: Optional[bool] = typer.Option(
None,
"--static-graph/--no-static-graph",
help="Persist global relationships during indexing for static graph expansion. Overrides settings.json config.",
),
static_graph_types: Optional[str] = typer.Option(
None,
"--static-graph-types",
help="Comma-separated relationship types to persist: imports,inherits,calls. Overrides settings.json config.",
),
max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls for embedding generation. Recommended: 4-8 for litellm backend."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
@@ -154,6 +169,33 @@ def index_init(
# Fallback to settings.json config if CLI params not provided
config.load_settings() # Ensure settings are loaded
# Apply CLI overrides for parsing/indexing behavior
if use_astgrep is not None:
config.use_astgrep = bool(use_astgrep)
if static_graph is not None:
config.static_graph_enabled = bool(static_graph)
if static_graph_types is not None:
allowed = {"imports", "inherits", "calls"}
parsed = [
t.strip().lower()
for t in static_graph_types.split(",")
if t.strip()
]
invalid = [t for t in parsed if t not in allowed]
if invalid:
msg = (
"Invalid --static-graph-types. Must be a comma-separated list of: "
f"{', '.join(sorted(allowed))}. Got: {invalid}"
)
if json_mode:
print_json(success=False, error=msg)
else:
console.print(f"[red]Error:[/red] {msg}")
raise typer.Exit(code=1)
if parsed:
config.static_graph_relationship_types = parsed
actual_backend = backend or config.embedding_backend
actual_model = model or config.embedding_model
@@ -412,8 +454,10 @@ def watch(
manager: WatcherManager | None = None
try:
watch_config = Config.load()
manager = WatcherManager(
root_path=base_path,
config=watch_config,
watcher_config=watcher_config,
on_indexed=on_indexed,
)
@@ -459,7 +503,7 @@ def search(
None,
"--staged-stage2-mode",
hidden=True,
help="[Advanced] Stage 2 expansion mode for cascade strategy 'staged': precomputed | realtime.",
help="[Advanced] Stage 2 expansion mode for cascade strategy 'staged': precomputed | realtime | static_global_graph.",
),
# Hidden deprecated parameter for backward compatibility
mode: Optional[str] = typer.Option(None, "--mode", hidden=True, help="[DEPRECATED] Use --method instead."),
@@ -615,8 +659,8 @@ def search(
# Optional staged cascade overrides (only meaningful for cascade strategy 'staged')
if staged_stage2_mode is not None:
stage2 = staged_stage2_mode.strip().lower()
if stage2 not in {"precomputed", "realtime"}:
msg = "Invalid --staged-stage2-mode. Must be: precomputed | realtime."
if stage2 not in {"precomputed", "realtime", "static_global_graph"}:
msg = "Invalid --staged-stage2-mode. Must be: precomputed | realtime | static_global_graph."
if json_mode:
print_json(success=False, error=msg)
else:
@@ -810,7 +854,7 @@ def inspect(
) -> None:
"""Analyze a single file and display symbols."""
_configure_logging(verbose, json_mode)
config = Config()
config = Config.load()
factory = ParserFactory(config)
file_path = file.expanduser().resolve()
@@ -3145,8 +3189,10 @@ def watch(
console.print("[dim]Press Ctrl+C to stop[/dim]\n")
# Create and start watcher manager
watch_config = Config.load()
manager = WatcherManager(
root_path=watch_path,
config=watch_config,
watcher_config=watcher_config,
on_indexed=lambda result: _display_index_result(result),
)
@@ -3681,7 +3727,7 @@ def index_update(
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
config = Config()
config = Config.load()
resolved_path = file_path.resolve()
@@ -3776,7 +3822,7 @@ def index_all(
from codexlens.config import Config
from codexlens.storage.index_tree import IndexTreeBuilder
config = Config()
config = Config.load()
languages = _parse_languages(language)
registry = RegistryStore()
registry.initialize()

View File

@@ -294,6 +294,15 @@ class Config:
"timeout_ms": self.llm_timeout_ms,
"batch_size": self.llm_batch_size,
},
"parsing": {
# Prefer ast-grep processors when available (experimental).
"use_astgrep": self.use_astgrep,
},
"indexing": {
# Persist global relationship edges during index build for static graph expansion.
"static_graph_enabled": self.static_graph_enabled,
"static_graph_relationship_types": self.static_graph_relationship_types,
},
"reranker": {
"enabled": self.enable_cross_encoder_rerank,
"backend": self.reranker_backend,
@@ -413,6 +422,34 @@ class Config:
if "fine_k" in cascade:
self.cascade_fine_k = cascade["fine_k"]
# Load parsing settings
parsing = settings.get("parsing", {})
if isinstance(parsing, dict) and "use_astgrep" in parsing:
self.use_astgrep = bool(parsing["use_astgrep"])
# Load indexing settings
indexing = settings.get("indexing", {})
if isinstance(indexing, dict):
if "static_graph_enabled" in indexing:
self.static_graph_enabled = bool(indexing["static_graph_enabled"])
if "static_graph_relationship_types" in indexing:
raw_types = indexing["static_graph_relationship_types"]
if isinstance(raw_types, list):
allowed = {"imports", "inherits", "calls"}
cleaned = []
for item in raw_types:
val = str(item).strip().lower()
if val and val in allowed:
cleaned.append(val)
if cleaned:
self.static_graph_relationship_types = cleaned
else:
log.warning(
"Invalid indexing.static_graph_relationship_types in %s: %r (expected list)",
self.settings_path,
raw_types,
)
# Load API settings
api = settings.get("api", {})
if "max_workers" in api:

View File

@@ -299,12 +299,25 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
if func_name:
all_matches.append((start_line, end_line, "func_def", func_name, node))
# Get import matches
# Get import matches (process import_with_alias first to avoid duplicates)
import_alias_positions: set = set()
# Process import with alias: import X as Y
import_alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias"))
for node in import_alias_matches:
module = self._get_match(node, "MODULE")
alias = self._get_match(node, "ALIAS")
start_line, end_line = self._get_line_range(node)
if module and alias:
import_alias_positions.add(start_line)
all_matches.append((start_line, end_line, "import_alias", f"{module}:{alias}", node))
# Process simple imports: import X (skip lines with aliases)
import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt"))
for node in import_matches:
module = self._get_match(node, "MODULE")
start_line, end_line = self._get_line_range(node)
if module:
if module and start_line not in import_alias_positions:
all_matches.append((start_line, end_line, "import", module, node))
from_matches = self.run_ast_grep(source_code, get_pattern("import_from"))
@@ -429,7 +442,7 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
))
elif match_type == "import":
# Process import statement
# Process simple import statement
module = symbol
# Simple import: add base name to alias map
base_name = module.split(".", 1)[0]
@@ -443,6 +456,22 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
source_line=start_line,
))
elif match_type == "import_alias":
# Process import with alias: import X as Y
parts = symbol.split(":", 1)
module = parts[0]
alias = parts[1] if len(parts) > 1 else ""
if alias:
update_aliases({alias: module})
relationships.append(CodeRelationship(
source_symbol=get_current_scope(),
target_symbol=module,
relationship_type=RelationshipType.IMPORTS,
source_file=source_file,
target_file=None,
source_line=start_line,
))
elif match_type == "from_import":
# Process from-import statement
parts = symbol.split(":", 1)
@@ -647,6 +676,22 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
return match.group(1).strip()
return ""
def _extract_import_names_from_text(self, import_text: str) -> str:
"""Extract imported names from from-import statement.
Args:
import_text: Full text of import statement (e.g., "from typing import List, Dict")
Returns:
Names text (e.g., "List, Dict") or empty string
"""
import re
# Match "from MODULE import NAMES" - extract NAMES
match = re.search(r'from\s+[\w.]+\s+import\s+(.+)$', import_text, re.MULTILINE)
if match:
return match.group(1).strip()
return ""
def extract_calls(
self,
source_code: str,
@@ -736,16 +781,19 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
relationships: List[CodeRelationship] = []
alias_map: Dict[str, str] = {}
# Process simple imports: import X
import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt"))
for node in import_matches:
# Track processed lines to avoid duplicates
processed_lines: set = set()
# Process import with alias FIRST: import X as Y
alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias"))
for node in alias_matches:
module = self._get_match(node, "MODULE")
alias = self._get_match(node, "ALIAS")
line = self._get_line_number(node)
if module:
# Add to alias map: first part of module
base_name = module.split(".", 1)[0]
alias_map[base_name] = module
if module and alias:
alias_map[alias] = module
processed_lines.add(line)
relationships.append(CodeRelationship(
source_symbol=source_symbol,
@@ -756,15 +804,16 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
source_line=line,
))
# Process import with alias: import X as Y
alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias"))
for node in alias_matches:
# Process simple imports: import X (skip lines already processed)
import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt"))
for node in import_matches:
module = self._get_match(node, "MODULE")
alias = self._get_match(node, "ALIAS")
line = self._get_line_number(node)
if module and alias:
alias_map[alias] = module
if module and line not in processed_lines:
# Add to alias map: first part of module
base_name = module.split(".", 1)[0]
alias_map[base_name] = module
relationships.append(CodeRelationship(
source_symbol=source_symbol,
@@ -779,7 +828,6 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
from_matches = self.run_ast_grep(source_code, get_pattern("import_from"))
for node in from_matches:
module = self._get_match(node, "MODULE")
names = self._get_match(node, "NAMES")
line = self._get_line_number(node)
if module:
@@ -793,6 +841,10 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
source_line=line,
))
# Parse names from node text (ast-grep-py 0.40+ doesn't capture $$$ multi-match)
node_text = self._binding._get_node_text(node) if self._binding else ""
names = self._extract_import_names_from_text(node_text)
# Add aliases for imported names
if names and names != "*":
for name in names.split(","):

View File

@@ -24,11 +24,16 @@ class Parser(Protocol):
@dataclass
class SimpleRegexParser:
language_id: str
config: Optional[Config] = None
def parse(self, text: str, path: Path) -> IndexedFile:
# Try tree-sitter first for supported languages
if self.language_id in {"python", "javascript", "typescript"}:
ts_parser = TreeSitterSymbolParser(self.language_id, path)
ts_parser = TreeSitterSymbolParser(
self.language_id,
path,
config=self.config,
)
if ts_parser.is_available():
indexed = ts_parser.parse(text, path)
if indexed is not None:
@@ -73,7 +78,10 @@ class ParserFactory:
def get_parser(self, language_id: str) -> Parser:
if language_id not in self._parsers:
self._parsers[language_id] = SimpleRegexParser(language_id)
self._parsers[language_id] = SimpleRegexParser(
language_id,
config=self.config,
)
return self._parsers[language_id]

View File

@@ -291,7 +291,9 @@ class TreeSitterSymbolParser:
source_file = str(path.resolve())
relationships: List[CodeRelationship] = []
scope_stack: List[str] = []
# Use a synthetic module scope so module-level imports/calls can be recorded
# (useful for static global graph persistence).
scope_stack: List[str] = ["<module>"]
alias_stack: List[Dict[str, str]] = [{}]
def record_import(target_symbol: str, source_line: int) -> None:
@@ -398,7 +400,9 @@ class TreeSitterSymbolParser:
source_file = str(path.resolve())
relationships: List[CodeRelationship] = []
scope_stack: List[str] = []
# Use a synthetic module scope so module-level imports/calls can be recorded
# (useful for static global graph persistence).
scope_stack: List[str] = ["<module>"]
alias_stack: List[Dict[str, str]] = [{}]
def record_import(target_symbol: str, source_line: int) -> None:

View File

@@ -519,6 +519,7 @@ class IndexTreeBuilder:
"global_symbol_index_enabled": self.config.global_symbol_index_enabled,
"static_graph_enabled": self.config.static_graph_enabled,
"static_graph_relationship_types": self.config.static_graph_relationship_types,
"use_astgrep": getattr(self.config, "use_astgrep", False),
}
worker_args = [
@@ -984,6 +985,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
global_symbol_index_enabled=bool(config_dict.get("global_symbol_index_enabled", True)),
static_graph_enabled=bool(config_dict.get("static_graph_enabled", False)),
static_graph_relationship_types=list(config_dict.get("static_graph_relationship_types", ["imports", "inherits"])),
use_astgrep=bool(config_dict.get("use_astgrep", False)),
)
parser_factory = ParserFactory(config)

View File

@@ -89,7 +89,18 @@ class IncrementalIndexer:
project_info = self.registry.get_project(source_root)
if project_info:
project_id = project_info.id
self._global_index = GlobalSymbolIndex(global_db_path, project_id=project_id)
try:
self._global_index = GlobalSymbolIndex(global_db_path, project_id=project_id)
# Ensure schema exists (best-effort). The DB should already be initialized
# by `codexlens index init`, but watcher/index-update should be robust.
self._global_index.initialize()
except Exception as exc:
logger.debug(
"Failed to initialize global symbol index at %s: %s",
global_db_path,
exc,
)
self._global_index = None
return self._global_index
@@ -262,6 +273,34 @@ class IncrementalIndexer:
# Update merkle root
store.update_merkle_root()
# Update global relationships for static graph expansion (best-effort).
if getattr(self.config, "static_graph_enabled", False):
try:
source_root = self.mapper.get_project_root(path) or dir_path
index_root = self.mapper.source_to_index_dir(source_root)
global_index = self._get_global_index(index_root, source_root=source_root)
if global_index is not None:
allowed_types = set(
getattr(
self.config,
"static_graph_relationship_types",
["imports", "inherits"],
)
or []
)
filtered_rels = [
r
for r in (indexed_file.relationships or [])
if r.relationship_type.value in allowed_types
]
global_index.update_file_relationships(path, filtered_rels)
except Exception as exc:
logger.debug(
"Failed to update global relationships for %s: %s",
path,
exc,
)
logger.debug("Indexed file: %s (%d symbols)", path, len(indexed_file.symbols))
return FileIndexResult(
@@ -329,6 +368,21 @@ class IncrementalIndexer:
try:
store.remove_file(str(path))
store.update_merkle_root()
# Best-effort cleanup of static graph relationships (keeps global DB consistent).
if getattr(self.config, "static_graph_enabled", False):
try:
source_root = self.mapper.get_project_root(path) or dir_path
index_root = self.mapper.source_to_index_dir(source_root)
global_index = self._get_global_index(index_root, source_root=source_root)
if global_index is not None:
global_index.delete_file_relationships(path)
except Exception as exc:
logger.debug(
"Failed to delete global relationships for %s: %s",
path,
exc,
)
logger.debug("Removed file from index: %s", path)
return True

View File

@@ -377,6 +377,43 @@ class TestParserFactory:
finally:
del os.environ["CODEXLENS_DATA_DIR"]
def test_factory_passes_config_to_treesitter(self, monkeypatch: pytest.MonkeyPatch) -> None:
    """Ensure ParserFactory config is forwarded into TreeSitterSymbolParser."""
    from codexlens.entities import IndexedFile

    seen: dict = {}

    class StubTreeSitterSymbolParser:
        def __init__(self, language_id, path=None, config=None) -> None:
            # Record the config handed over by SimpleRegexParser.
            seen["config"] = config
            self.language_id = language_id

        def is_available(self) -> bool:
            return True

        def parse(self, text: str, path: Path) -> IndexedFile:
            resolved = str(path.resolve())
            return IndexedFile(
                path=resolved,
                language=self.language_id,
                symbols=[],
                chunks=[],
                relationships=[],
            )

    monkeypatch.setattr(
        "codexlens.parsers.factory.TreeSitterSymbolParser",
        StubTreeSitterSymbolParser,
    )

    cfg = Config()
    cfg.use_astgrep = True

    parser = ParserFactory(cfg).get_parser("python")
    parser.parse("def hello():\n    pass\n", Path("test.py"))

    assert seen.get("config") is cfg
class TestParserEdgeCases:
"""Edge case tests for parsers."""