feat: add experimental support for AST parsing and static graph indexing

- Introduced CLI options for using AST grep parsers and enabling static graph relationships during indexing.
- Updated configuration management to load new settings for AST parsing and static graph types.
- Enhanced AST grep processor to handle imports with aliases and improve relationship tracking.
- Modified TreeSitter parsers to support synthetic module scopes for better static graph persistence.
- Implemented global relationship updates in the incremental indexer for static graph expansion.
- Added new ArtifactTag and FloatingFileBrowser components to the frontend for improved terminal dashboard functionality.
- Created utility functions for detecting CCW artifacts in terminal output with associated tests.
This commit is contained in:
catlog22
2026-02-15 23:12:06 +08:00
parent 48a6a1f2aa
commit 8938c47f88
39 changed files with 2956 additions and 297 deletions

View File

@@ -125,6 +125,13 @@ CODEXLENS_DEBUG=false
"tool": "gemini",
"timeout_ms": 300000,
"batch_size": 5
},
"parsing": {
"use_astgrep": false
},
"indexing": {
"static_graph_enabled": false,
"static_graph_relationship_types": ["imports", "inherits"]
}
}
```
@@ -167,6 +174,32 @@ CODEXLENS_DEBUG=false
| `timeout_ms` | int | 超时时间 (毫秒) |
| `batch_size` | int | 批处理大小 |
### Parsing 设置
| 字段 | 类型 | 说明 |
|------|------|------|
| `use_astgrep` | bool | 优先使用 ast-grep 解析关系(实验性;当前主要用于 Python relationships) |
### Indexing 设置(静态图)
| 字段 | 类型 | 说明 |
|------|------|------|
| `static_graph_enabled` | bool | 索引时将 relationships 写入全局 `global_relationships`,用于搜索阶段静态图扩展 |
| `static_graph_relationship_types` | array | 允许持久化的关系类型:`imports` / `inherits` / `calls` |
**CLI 覆盖(单次运行,不写入 settings.json)**:
```bash
# 索引时启用静态图 relationships + 使用 ast-grep(如果可用)
codexlens index init --use-astgrep --static-graph --static-graph-types imports,inherits,calls
```
**Search staged 静态图扩展(高级)**:
```bash
codexlens search --cascade-strategy staged --staged-stage2-mode static_global_graph
```
## FastEmbed 模型配置文件
使用 `fastembed` 后端时的预定义模型:

View File

@@ -23,8 +23,8 @@ dependencies = [
"pathspec>=0.11",
"watchdog>=3.0",
# ast-grep for pattern-based AST matching (PyO3 bindings)
# Note: May have compatibility issues with Python 3.13
"ast-grep-py>=0.3.0; python_version < '3.13'",
# ast-grep-py 0.40+ supports Python 3.13
"ast-grep-py>=0.40.0",
]
[project.optional-dependencies]

View File

@@ -126,6 +126,21 @@ def index_init(
no_embeddings: bool = typer.Option(False, "--no-embeddings", help="Skip automatic embedding generation (if semantic deps installed)."),
backend: Optional[str] = typer.Option(None, "--backend", "-b", help="Embedding backend: fastembed (local) or litellm (remote API). Defaults to settings.json config."),
model: Optional[str] = typer.Option(None, "--model", "-m", help="Embedding model: profile name for fastembed or model name for litellm. Defaults to settings.json config."),
use_astgrep: Optional[bool] = typer.Option(
None,
"--use-astgrep/--no-use-astgrep",
help="Prefer ast-grep parsers when available (experimental). Overrides settings.json config.",
),
static_graph: Optional[bool] = typer.Option(
None,
"--static-graph/--no-static-graph",
help="Persist global relationships during indexing for static graph expansion. Overrides settings.json config.",
),
static_graph_types: Optional[str] = typer.Option(
None,
"--static-graph-types",
help="Comma-separated relationship types to persist: imports,inherits,calls. Overrides settings.json config.",
),
max_workers: int = typer.Option(1, "--max-workers", min=1, help="Max concurrent API calls for embedding generation. Recommended: 4-8 for litellm backend."),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable debug logging."),
@@ -154,6 +169,33 @@ def index_init(
# Fallback to settings.json config if CLI params not provided
config.load_settings() # Ensure settings are loaded
# Apply CLI overrides for parsing/indexing behavior
if use_astgrep is not None:
config.use_astgrep = bool(use_astgrep)
if static_graph is not None:
config.static_graph_enabled = bool(static_graph)
if static_graph_types is not None:
allowed = {"imports", "inherits", "calls"}
parsed = [
t.strip().lower()
for t in static_graph_types.split(",")
if t.strip()
]
invalid = [t for t in parsed if t not in allowed]
if invalid:
msg = (
"Invalid --static-graph-types. Must be a comma-separated list of: "
f"{', '.join(sorted(allowed))}. Got: {invalid}"
)
if json_mode:
print_json(success=False, error=msg)
else:
console.print(f"[red]Error:[/red] {msg}")
raise typer.Exit(code=1)
if parsed:
config.static_graph_relationship_types = parsed
actual_backend = backend or config.embedding_backend
actual_model = model or config.embedding_model
@@ -412,8 +454,10 @@ def watch(
manager: WatcherManager | None = None
try:
watch_config = Config.load()
manager = WatcherManager(
root_path=base_path,
config=watch_config,
watcher_config=watcher_config,
on_indexed=on_indexed,
)
@@ -459,7 +503,7 @@ def search(
None,
"--staged-stage2-mode",
hidden=True,
help="[Advanced] Stage 2 expansion mode for cascade strategy 'staged': precomputed | realtime.",
help="[Advanced] Stage 2 expansion mode for cascade strategy 'staged': precomputed | realtime | static_global_graph.",
),
# Hidden deprecated parameter for backward compatibility
mode: Optional[str] = typer.Option(None, "--mode", hidden=True, help="[DEPRECATED] Use --method instead."),
@@ -615,8 +659,8 @@ def search(
# Optional staged cascade overrides (only meaningful for cascade strategy 'staged')
if staged_stage2_mode is not None:
stage2 = staged_stage2_mode.strip().lower()
if stage2 not in {"precomputed", "realtime"}:
msg = "Invalid --staged-stage2-mode. Must be: precomputed | realtime."
if stage2 not in {"precomputed", "realtime", "static_global_graph"}:
msg = "Invalid --staged-stage2-mode. Must be: precomputed | realtime | static_global_graph."
if json_mode:
print_json(success=False, error=msg)
else:
@@ -810,7 +854,7 @@ def inspect(
) -> None:
"""Analyze a single file and display symbols."""
_configure_logging(verbose, json_mode)
config = Config()
config = Config.load()
factory = ParserFactory(config)
file_path = file.expanduser().resolve()
@@ -3145,8 +3189,10 @@ def watch(
console.print("[dim]Press Ctrl+C to stop[/dim]\n")
# Create and start watcher manager
watch_config = Config.load()
manager = WatcherManager(
root_path=watch_path,
config=watch_config,
watcher_config=watcher_config,
on_indexed=lambda result: _display_index_result(result),
)
@@ -3681,7 +3727,7 @@ def index_update(
registry = RegistryStore()
registry.initialize()
mapper = PathMapper()
config = Config()
config = Config.load()
resolved_path = file_path.resolve()
@@ -3776,7 +3822,7 @@ def index_all(
from codexlens.config import Config
from codexlens.storage.index_tree import IndexTreeBuilder
config = Config()
config = Config.load()
languages = _parse_languages(language)
registry = RegistryStore()
registry.initialize()

View File

@@ -294,6 +294,15 @@ class Config:
"timeout_ms": self.llm_timeout_ms,
"batch_size": self.llm_batch_size,
},
"parsing": {
# Prefer ast-grep processors when available (experimental).
"use_astgrep": self.use_astgrep,
},
"indexing": {
# Persist global relationship edges during index build for static graph expansion.
"static_graph_enabled": self.static_graph_enabled,
"static_graph_relationship_types": self.static_graph_relationship_types,
},
"reranker": {
"enabled": self.enable_cross_encoder_rerank,
"backend": self.reranker_backend,
@@ -413,6 +422,34 @@ class Config:
if "fine_k" in cascade:
self.cascade_fine_k = cascade["fine_k"]
# Load parsing settings
parsing = settings.get("parsing", {})
if isinstance(parsing, dict) and "use_astgrep" in parsing:
self.use_astgrep = bool(parsing["use_astgrep"])
# Load indexing settings
indexing = settings.get("indexing", {})
if isinstance(indexing, dict):
if "static_graph_enabled" in indexing:
self.static_graph_enabled = bool(indexing["static_graph_enabled"])
if "static_graph_relationship_types" in indexing:
raw_types = indexing["static_graph_relationship_types"]
if isinstance(raw_types, list):
allowed = {"imports", "inherits", "calls"}
cleaned = []
for item in raw_types:
val = str(item).strip().lower()
if val and val in allowed:
cleaned.append(val)
if cleaned:
self.static_graph_relationship_types = cleaned
else:
log.warning(
"Invalid indexing.static_graph_relationship_types in %s: %r (expected list)",
self.settings_path,
raw_types,
)
# Load API settings
api = settings.get("api", {})
if "max_workers" in api:

View File

@@ -299,12 +299,25 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
if func_name:
all_matches.append((start_line, end_line, "func_def", func_name, node))
# Get import matches
# Get import matches (process import_with_alias first to avoid duplicates)
import_alias_positions: set = set()
# Process import with alias: import X as Y
import_alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias"))
for node in import_alias_matches:
module = self._get_match(node, "MODULE")
alias = self._get_match(node, "ALIAS")
start_line, end_line = self._get_line_range(node)
if module and alias:
import_alias_positions.add(start_line)
all_matches.append((start_line, end_line, "import_alias", f"{module}:{alias}", node))
# Process simple imports: import X (skip lines with aliases)
import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt"))
for node in import_matches:
module = self._get_match(node, "MODULE")
start_line, end_line = self._get_line_range(node)
if module:
if module and start_line not in import_alias_positions:
all_matches.append((start_line, end_line, "import", module, node))
from_matches = self.run_ast_grep(source_code, get_pattern("import_from"))
@@ -429,7 +442,7 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
))
elif match_type == "import":
# Process import statement
# Process simple import statement
module = symbol
# Simple import: add base name to alias map
base_name = module.split(".", 1)[0]
@@ -443,6 +456,22 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
source_line=start_line,
))
elif match_type == "import_alias":
# Process import with alias: import X as Y
parts = symbol.split(":", 1)
module = parts[0]
alias = parts[1] if len(parts) > 1 else ""
if alias:
update_aliases({alias: module})
relationships.append(CodeRelationship(
source_symbol=get_current_scope(),
target_symbol=module,
relationship_type=RelationshipType.IMPORTS,
source_file=source_file,
target_file=None,
source_line=start_line,
))
elif match_type == "from_import":
# Process from-import statement
parts = symbol.split(":", 1)
@@ -647,6 +676,22 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
return match.group(1).strip()
return ""
def _extract_import_names_from_text(self, import_text: str) -> str:
"""Extract imported names from from-import statement.
Args:
import_text: Full text of import statement (e.g., "from typing import List, Dict")
Returns:
Names text (e.g., "List, Dict") or empty string
"""
import re
# Match "from MODULE import NAMES" - extract NAMES
match = re.search(r'from\s+[\w.]+\s+import\s+(.+)$', import_text, re.MULTILINE)
if match:
return match.group(1).strip()
return ""
def extract_calls(
self,
source_code: str,
@@ -736,16 +781,19 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
relationships: List[CodeRelationship] = []
alias_map: Dict[str, str] = {}
# Process simple imports: import X
import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt"))
for node in import_matches:
# Track processed lines to avoid duplicates
processed_lines: set = set()
# Process import with alias FIRST: import X as Y
alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias"))
for node in alias_matches:
module = self._get_match(node, "MODULE")
alias = self._get_match(node, "ALIAS")
line = self._get_line_number(node)
if module:
# Add to alias map: first part of module
base_name = module.split(".", 1)[0]
alias_map[base_name] = module
if module and alias:
alias_map[alias] = module
processed_lines.add(line)
relationships.append(CodeRelationship(
source_symbol=source_symbol,
@@ -756,15 +804,16 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
source_line=line,
))
# Process import with alias: import X as Y
alias_matches = self.run_ast_grep(source_code, get_pattern("import_with_alias"))
for node in alias_matches:
# Process simple imports: import X (skip lines already processed)
import_matches = self.run_ast_grep(source_code, get_pattern("import_stmt"))
for node in import_matches:
module = self._get_match(node, "MODULE")
alias = self._get_match(node, "ALIAS")
line = self._get_line_number(node)
if module and alias:
alias_map[alias] = module
if module and line not in processed_lines:
# Add to alias map: first part of module
base_name = module.split(".", 1)[0]
alias_map[base_name] = module
relationships.append(CodeRelationship(
source_symbol=source_symbol,
@@ -779,7 +828,6 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
from_matches = self.run_ast_grep(source_code, get_pattern("import_from"))
for node in from_matches:
module = self._get_match(node, "MODULE")
names = self._get_match(node, "NAMES")
line = self._get_line_number(node)
if module:
@@ -793,6 +841,10 @@ class AstGrepPythonProcessor(BaseAstGrepProcessor):
source_line=line,
))
# Parse names from node text (ast-grep-py 0.40+ doesn't capture $$$ multi-match)
node_text = self._binding._get_node_text(node) if self._binding else ""
names = self._extract_import_names_from_text(node_text)
# Add aliases for imported names
if names and names != "*":
for name in names.split(","):

View File

@@ -24,11 +24,16 @@ class Parser(Protocol):
@dataclass
class SimpleRegexParser:
language_id: str
config: Optional[Config] = None
def parse(self, text: str, path: Path) -> IndexedFile:
# Try tree-sitter first for supported languages
if self.language_id in {"python", "javascript", "typescript"}:
ts_parser = TreeSitterSymbolParser(self.language_id, path)
ts_parser = TreeSitterSymbolParser(
self.language_id,
path,
config=self.config,
)
if ts_parser.is_available():
indexed = ts_parser.parse(text, path)
if indexed is not None:
@@ -73,7 +78,10 @@ class ParserFactory:
def get_parser(self, language_id: str) -> Parser:
if language_id not in self._parsers:
self._parsers[language_id] = SimpleRegexParser(language_id)
self._parsers[language_id] = SimpleRegexParser(
language_id,
config=self.config,
)
return self._parsers[language_id]

View File

@@ -291,7 +291,9 @@ class TreeSitterSymbolParser:
source_file = str(path.resolve())
relationships: List[CodeRelationship] = []
scope_stack: List[str] = []
# Use a synthetic module scope so module-level imports/calls can be recorded
# (useful for static global graph persistence).
scope_stack: List[str] = ["<module>"]
alias_stack: List[Dict[str, str]] = [{}]
def record_import(target_symbol: str, source_line: int) -> None:
@@ -398,7 +400,9 @@ class TreeSitterSymbolParser:
source_file = str(path.resolve())
relationships: List[CodeRelationship] = []
scope_stack: List[str] = []
# Use a synthetic module scope so module-level imports/calls can be recorded
# (useful for static global graph persistence).
scope_stack: List[str] = ["<module>"]
alias_stack: List[Dict[str, str]] = [{}]
def record_import(target_symbol: str, source_line: int) -> None:

View File

@@ -519,6 +519,7 @@ class IndexTreeBuilder:
"global_symbol_index_enabled": self.config.global_symbol_index_enabled,
"static_graph_enabled": self.config.static_graph_enabled,
"static_graph_relationship_types": self.config.static_graph_relationship_types,
"use_astgrep": getattr(self.config, "use_astgrep", False),
}
worker_args = [
@@ -984,6 +985,7 @@ def _build_dir_worker(args: tuple) -> DirBuildResult:
global_symbol_index_enabled=bool(config_dict.get("global_symbol_index_enabled", True)),
static_graph_enabled=bool(config_dict.get("static_graph_enabled", False)),
static_graph_relationship_types=list(config_dict.get("static_graph_relationship_types", ["imports", "inherits"])),
use_astgrep=bool(config_dict.get("use_astgrep", False)),
)
parser_factory = ParserFactory(config)

View File

@@ -89,7 +89,18 @@ class IncrementalIndexer:
project_info = self.registry.get_project(source_root)
if project_info:
project_id = project_info.id
self._global_index = GlobalSymbolIndex(global_db_path, project_id=project_id)
try:
self._global_index = GlobalSymbolIndex(global_db_path, project_id=project_id)
# Ensure schema exists (best-effort). The DB should already be initialized
# by `codexlens index init`, but watcher/index-update should be robust.
self._global_index.initialize()
except Exception as exc:
logger.debug(
"Failed to initialize global symbol index at %s: %s",
global_db_path,
exc,
)
self._global_index = None
return self._global_index
@@ -262,6 +273,34 @@ class IncrementalIndexer:
# Update merkle root
store.update_merkle_root()
# Update global relationships for static graph expansion (best-effort).
if getattr(self.config, "static_graph_enabled", False):
try:
source_root = self.mapper.get_project_root(path) or dir_path
index_root = self.mapper.source_to_index_dir(source_root)
global_index = self._get_global_index(index_root, source_root=source_root)
if global_index is not None:
allowed_types = set(
getattr(
self.config,
"static_graph_relationship_types",
["imports", "inherits"],
)
or []
)
filtered_rels = [
r
for r in (indexed_file.relationships or [])
if r.relationship_type.value in allowed_types
]
global_index.update_file_relationships(path, filtered_rels)
except Exception as exc:
logger.debug(
"Failed to update global relationships for %s: %s",
path,
exc,
)
logger.debug("Indexed file: %s (%d symbols)", path, len(indexed_file.symbols))
return FileIndexResult(
@@ -329,6 +368,21 @@ class IncrementalIndexer:
try:
store.remove_file(str(path))
store.update_merkle_root()
# Best-effort cleanup of static graph relationships (keeps global DB consistent).
if getattr(self.config, "static_graph_enabled", False):
try:
source_root = self.mapper.get_project_root(path) or dir_path
index_root = self.mapper.source_to_index_dir(source_root)
global_index = self._get_global_index(index_root, source_root=source_root)
if global_index is not None:
global_index.delete_file_relationships(path)
except Exception as exc:
logger.debug(
"Failed to delete global relationships for %s: %s",
path,
exc,
)
logger.debug("Removed file from index: %s", path)
return True

View File

@@ -377,6 +377,43 @@ class TestParserFactory:
finally:
del os.environ["CODEXLENS_DATA_DIR"]
def test_factory_passes_config_to_treesitter(self, monkeypatch: pytest.MonkeyPatch) -> None:
    """Ensure ParserFactory config is forwarded into TreeSitterSymbolParser."""
    from codexlens.entities import IndexedFile

    seen: dict = {}

    class StubTreeSitterSymbolParser:
        def __init__(self, language_id, path=None, config=None) -> None:
            # Record the config handed over by SimpleRegexParser.
            seen["config"] = config
            self.language_id = language_id

        def is_available(self) -> bool:
            return True

        def parse(self, text: str, path: Path) -> IndexedFile:
            resolved = str(path.resolve())
            return IndexedFile(
                path=resolved,
                language=self.language_id,
                symbols=[],
                chunks=[],
                relationships=[],
            )

    monkeypatch.setattr(
        "codexlens.parsers.factory.TreeSitterSymbolParser",
        StubTreeSitterSymbolParser,
    )

    cfg = Config()
    cfg.use_astgrep = True

    parser = ParserFactory(cfg).get_parser("python")
    parser.parse("def hello():\n    pass\n", Path("test.py"))

    assert seen.get("config") is cfg
class TestParserEdgeCases:
"""Edge case tests for parsers."""