Add benchmark results and tests for LSP graph builder and staged search

- Introduced a new benchmark results file for performance comparison on 2026-02-09.
- Added a test for LspGraphBuilder to ensure it does not expand nodes at maximum depth.
- Created a test for the staged search pipeline to validate fallback behavior when stage 1 returns empty results.
This commit is contained in:
catlog22
2026-02-09 21:43:13 +08:00
parent 4344e79e68
commit 362f354f1c
25 changed files with 2613 additions and 51 deletions

View File

@@ -0,0 +1,340 @@
/**
* Core AST outline parsing engine using web-tree-sitter.
*
* Parses source files into structured symbol outlines (functions, classes, methods, etc.)
* with line offsets compatible with read_file(offset, limit).
*/
import { createRequire } from 'node:module';
import { dirname, join } from 'path';
import Parser from 'web-tree-sitter';
import type { LanguageConfig } from './outline-queries.js';
export interface OutlineSymbol {
kind: 'function' | 'class' | 'method' | 'interface' | 'type' | 'enum' | 'property';
name: string;
line: number; // 0-based, compatible with read_file offset
endLine: number; // 0-based
doc: string | null;
signature: string; // truncated to 200 chars
parent: string | null;
children: number; // nested method/property count (class/interface)
}
export interface OutlineResult {
file: string;
language: string;
symbols: OutlineSymbol[];
totalSymbols: number;
}
// Singleton init guard
let initialized = false;
// Language WASM cache (Language loading is heavy IO, cache aggressively)
const languageCache = new Map<string, Parser.Language>();
// Resolve WASM paths via createRequire (works in ESM)
const _require = createRequire(import.meta.url);
function getWasmDir(): string {
return join(dirname(_require.resolve('tree-sitter-wasms/package.json')), 'out');
}
async function ensureInit(): Promise<void> {
if (initialized) return;
await Parser.init();
initialized = true;
}
async function loadLanguage(grammarName: string): Promise<Parser.Language> {
const cached = languageCache.get(grammarName);
if (cached) return cached;
const wasmPath = join(getWasmDir(), `tree-sitter-${grammarName}.wasm`);
const lang = await Parser.Language.load(wasmPath);
languageCache.set(grammarName, lang);
return lang;
}
/**
* Parse a source file into an outline of symbols.
*/
export async function parseOutline(
filePath: string,
content: string,
config: LanguageConfig
): Promise<OutlineResult> {
await ensureInit();
const language = await loadLanguage(config.grammarName);
const parser = new Parser();
parser.setLanguage(language);
const tree = parser.parse(content);
if (!tree) {
parser.delete();
return { file: filePath, language: config.grammarName, symbols: [], totalSymbols: 0 };
}
let query: Parser.Query;
try {
query = language.query(config.symbolQuery);
} catch (err) {
tree.delete();
parser.delete();
throw new Error(`Query compilation failed for ${config.grammarName}: ${(err as Error).message}`);
}
const matches = query.matches(tree.rootNode);
const contentLines = content.split('\n');
const symbols: OutlineSymbol[] = [];
for (const match of matches) {
const symbol = processMatch(match, contentLines, config.grammarName);
if (symbol) symbols.push(symbol);
}
// Sort by line position
symbols.sort((a, b) => a.line - b.line);
// Clean up native resources
query.delete();
tree.delete();
parser.delete();
return {
file: filePath,
language: config.grammarName,
symbols,
totalSymbols: symbols.length,
};
}
/**
* Process a single query match into an OutlineSymbol.
*/
function processMatch(
match: Parser.QueryMatch,
contentLines: string[],
language: string
): OutlineSymbol | null {
let nameNode: Parser.SyntaxNode | null = null;
let defNode: Parser.SyntaxNode | null = null;
let kind = 'function';
for (const capture of match.captures) {
if (capture.name === 'name') {
nameNode = capture.node;
} else if (capture.name.startsWith('definition.')) {
defNode = capture.node;
kind = capture.name.slice('definition.'.length);
}
}
if (!defNode || !nameNode) return null;
const name = nameNode.text;
const line = defNode.startPosition.row;
const endLine = defNode.endPosition.row;
const signature = extractSignature(defNode.text, language);
const doc = extractDoc(defNode, contentLines, language);
const parent = findParent(defNode);
const children = countChildren(defNode, kind);
return {
kind: kind as OutlineSymbol['kind'],
name,
line,
endLine,
doc,
signature,
parent,
children,
};
}
/**
* Extract a concise signature from the node text.
* Takes the first line, removes the body start, truncates to 200 chars.
*/
function extractSignature(nodeText: string, language: string): string {
const firstLine = nodeText.split('\n')[0].trimEnd();
let sig = firstLine;
if (language === 'python') {
// Remove trailing colon (body start)
if (sig.endsWith(':')) {
sig = sig.slice(0, -1).trimEnd();
}
} else {
// Remove opening brace and everything after
const braceIdx = sig.indexOf('{');
if (braceIdx > 0) {
sig = sig.substring(0, braceIdx).trimEnd();
}
}
if (sig.length > 200) {
sig = sig.substring(0, 200) + '...';
}
return sig;
}
/**
* Extract documentation comment for a definition node.
*/
function extractDoc(
defNode: Parser.SyntaxNode,
contentLines: string[],
language: string
): string | null {
if (language === 'python') {
return extractPythonDocstring(defNode);
}
return extractCommentDoc(defNode, contentLines);
}
/**
* Extract comment doc by looking at lines before the definition.
*/
function extractCommentDoc(
defNode: Parser.SyntaxNode,
contentLines: string[]
): string | null {
const defLine = defNode.startPosition.row;
let endIdx = defLine - 1;
if (endIdx < 0) return null;
// Skip at most one blank line
if (contentLines[endIdx].trim() === '') {
endIdx--;
if (endIdx < 0) return null;
}
const endText = contentLines[endIdx].trim();
// Block comment ending with */
if (endText.endsWith('*/')) {
let startIdx = endIdx;
while (startIdx > 0 && !contentLines[startIdx].trim().startsWith('/*')) {
startIdx--;
}
return cleanBlockComment(contentLines.slice(startIdx, endIdx + 1).join('\n'));
}
// Line comments (// or /// or #)
if (endText.startsWith('//') || endText.startsWith('#')) {
let startIdx = endIdx;
while (startIdx > 0) {
const prevText = contentLines[startIdx - 1].trim();
if (prevText.startsWith('//') || prevText.startsWith('#')) {
startIdx--;
} else {
break;
}
}
return cleanLineComments(contentLines.slice(startIdx, endIdx + 1).join('\n'));
}
return null;
}
/**
* Extract Python docstring from function/class body.
*/
function extractPythonDocstring(defNode: Parser.SyntaxNode): string | null {
const body = defNode.childForFieldName('body');
if (!body) return null;
const firstChild = body.namedChildren[0];
if (!firstChild || firstChild.type !== 'expression_statement') return null;
const expr = firstChild.namedChildren[0];
if (!expr || (expr.type !== 'string' && expr.type !== 'concatenated_string')) return null;
let text = expr.text;
// Remove triple-quote markers
for (const quote of ['"""', "'''"]) {
if (text.startsWith(quote) && text.endsWith(quote)) {
text = text.slice(3, -3);
break;
}
}
text = text.trim();
return text || null;
}
/**
* Clean block comment text.
*/
function cleanBlockComment(text: string): string | null {
let lines = text.split('\n');
// Remove /* and */
lines[0] = lines[0].replace(/^\s*\/\*\*?\s?/, '');
lines[lines.length - 1] = lines[lines.length - 1].replace(/\s*\*\/\s*$/, '');
// Remove leading * from middle lines
lines = lines.map(l => l.replace(/^\s*\*\s?/, ''));
const result = lines.join('\n').trim();
return result || null;
}
/**
* Clean line comment (// or #) text.
*/
function cleanLineComments(text: string): string | null {
const lines = text.split('\n').map(l => l.replace(/^\s*(?:\/\/\/?\s?|#\s?)/, ''));
const result = lines.join('\n').trim();
return result || null;
}
/**
* Find the parent class/interface/impl name for a definition node.
*/
function findParent(defNode: Parser.SyntaxNode): string | null {
let current = defNode.parent;
while (current) {
const type = current.type;
// Common parent types across languages
if (
type === 'class_declaration' || type === 'interface_declaration' ||
type === 'class_definition' || type === 'enum_declaration' ||
type === 'impl_item' || type === 'class_specifier' || type === 'struct_specifier'
) {
// Try 'name' field first, then 'type' field (for Rust impl_item)
const nameNode = current.childForFieldName('name') || current.childForFieldName('type');
if (nameNode) return nameNode.text;
}
current = current.parent;
}
return null;
}
/**
* Count direct children (methods/properties) for class/interface nodes.
*/
function countChildren(defNode: Parser.SyntaxNode, kind: string): number {
if (kind !== 'class' && kind !== 'interface') return 0;
// Find the body node (class_body, interface_body, block, declaration_list, etc.)
let body = defNode.childForFieldName('body');
if (!body) {
for (const child of defNode.namedChildren) {
if (
child.type === 'class_body' || child.type === 'interface_body' ||
child.type === 'declaration_list' || child.type === 'block' ||
child.type === 'enum_body' || child.type === 'field_declaration_list'
) {
body = child;
break;
}
}
}
if (!body) return 0;
return body.namedChildCount;
}

View File

@@ -0,0 +1,150 @@
/**
* Language configurations and tree-sitter query definitions for outline parsing.
*/
import { extname } from 'path';
export interface LanguageConfig {
grammarName: string;
extensions: string[];
symbolQuery: string;
}
export const LANGUAGE_CONFIGS: Record<string, LanguageConfig> = {
typescript: {
grammarName: 'typescript',
extensions: ['.ts'],
symbolQuery: [
'(function_declaration name: (identifier) @name) @definition.function',
'(class_declaration name: (type_identifier) @name) @definition.class',
'(method_definition name: (property_identifier) @name) @definition.method',
'(abstract_method_signature name: (property_identifier) @name) @definition.method',
'(interface_declaration name: (type_identifier) @name) @definition.interface',
'(type_alias_declaration name: (type_identifier) @name) @definition.type',
'(enum_declaration name: (identifier) @name) @definition.enum',
'(variable_declarator name: (identifier) @name value: (arrow_function)) @definition.function',
'(variable_declarator name: (identifier) @name value: (function_expression)) @definition.function',
].join('\n'),
},
tsx: {
grammarName: 'tsx',
extensions: ['.tsx'],
symbolQuery: [
'(function_declaration name: (identifier) @name) @definition.function',
'(class_declaration name: (type_identifier) @name) @definition.class',
'(method_definition name: (property_identifier) @name) @definition.method',
'(interface_declaration name: (type_identifier) @name) @definition.interface',
'(type_alias_declaration name: (type_identifier) @name) @definition.type',
'(enum_declaration name: (identifier) @name) @definition.enum',
'(variable_declarator name: (identifier) @name value: (arrow_function)) @definition.function',
'(variable_declarator name: (identifier) @name value: (function_expression)) @definition.function',
].join('\n'),
},
javascript: {
grammarName: 'javascript',
extensions: ['.js', '.jsx', '.mjs', '.cjs'],
symbolQuery: [
'(function_declaration name: (identifier) @name) @definition.function',
'(class_declaration name: (identifier) @name) @definition.class',
'(method_definition name: (property_identifier) @name) @definition.method',
'(variable_declarator name: (identifier) @name value: (arrow_function)) @definition.function',
'(variable_declarator name: (identifier) @name value: (function_expression)) @definition.function',
].join('\n'),
},
python: {
grammarName: 'python',
extensions: ['.py'],
symbolQuery: [
'(function_definition name: (identifier) @name) @definition.function',
'(class_definition name: (identifier) @name) @definition.class',
].join('\n'),
},
go: {
grammarName: 'go',
extensions: ['.go'],
symbolQuery: [
'(function_declaration name: (identifier) @name) @definition.function',
'(method_declaration name: (field_identifier) @name) @definition.method',
'(type_spec name: (type_identifier) @name) @definition.type',
].join('\n'),
},
rust: {
grammarName: 'rust',
extensions: ['.rs'],
symbolQuery: [
'(function_item name: (identifier) @name) @definition.function',
'(struct_item name: (type_identifier) @name) @definition.class',
'(enum_item name: (type_identifier) @name) @definition.enum',
'(trait_item name: (type_identifier) @name) @definition.interface',
'(impl_item type: (type_identifier) @name) @definition.class',
].join('\n'),
},
java: {
grammarName: 'java',
extensions: ['.java'],
symbolQuery: [
'(class_declaration name: (identifier) @name) @definition.class',
'(method_declaration name: (identifier) @name) @definition.method',
'(interface_declaration name: (identifier) @name) @definition.interface',
'(enum_declaration name: (identifier) @name) @definition.enum',
'(constructor_declaration name: (identifier) @name) @definition.method',
].join('\n'),
},
csharp: {
grammarName: 'c_sharp',
extensions: ['.cs'],
symbolQuery: [
'(class_declaration name: (identifier) @name) @definition.class',
'(method_declaration name: (identifier) @name) @definition.method',
'(interface_declaration name: (identifier) @name) @definition.interface',
'(enum_declaration name: (identifier) @name) @definition.enum',
'(constructor_declaration name: (identifier) @name) @definition.method',
].join('\n'),
},
c: {
grammarName: 'c',
extensions: ['.c', '.h'],
symbolQuery: [
'(function_definition declarator: (function_declarator declarator: (identifier) @name)) @definition.function',
'(struct_specifier name: (type_identifier) @name) @definition.class',
'(enum_specifier name: (type_identifier) @name) @definition.enum',
].join('\n'),
},
cpp: {
grammarName: 'cpp',
extensions: ['.cpp', '.hpp', '.cc', '.cxx'],
symbolQuery: [
'(function_definition declarator: (function_declarator declarator: (identifier) @name)) @definition.function',
'(function_definition declarator: (function_declarator declarator: (qualified_identifier name: (identifier) @name))) @definition.function',
'(class_specifier name: (type_identifier) @name) @definition.class',
'(struct_specifier name: (type_identifier) @name) @definition.class',
'(enum_specifier name: (type_identifier) @name) @definition.enum',
].join('\n'),
},
};
// Build extension → language name lookup map
const EXTENSION_MAP = new Map<string, string>();
for (const [lang, config] of Object.entries(LANGUAGE_CONFIGS)) {
for (const ext of config.extensions) {
EXTENSION_MAP.set(ext, lang);
}
}
/**
* Detect language config from file path extension or explicit hint.
* Returns null if language is not supported.
*/
export function detectLanguage(filePath: string, hint?: string): LanguageConfig | null {
if (hint) {
const normalized = hint.toLowerCase();
const config = LANGUAGE_CONFIGS[normalized];
if (config) return config;
}
const ext = extname(filePath).toLowerCase();
const lang = EXTENSION_MAP.get(ext);
if (lang) return LANGUAGE_CONFIGS[lang];
return null;
}