feat: 增加搜索功能的代码过滤选项,支持排除特定文件扩展名和仅返回代码文件

This commit is contained in:
catlog22
2026-01-06 23:19:47 +08:00
parent ef770ff29b
commit 1298fdd20f
5 changed files with 131 additions and 31 deletions

View File

@@ -1,22 +1,13 @@
---
name: code-reviewer
description: Comprehensive code review skill for identifying security vulnerabilities and best practices violations. Triggers on "code review", "review code", "security audit", "代码审查".
allowed-tools: Read, Glob, Grep, mcp__ace-tool__search_context, mcp__ccw-tools__smart_search
---
# Code Reviewer
Comprehensive code review skill for identifying security vulnerabilities and best practices violations.
## Metadata
```yaml
name: code-reviewer
description: 帮助审查代码的安全漏洞和最佳实践
version: 1.0.0
execution_mode: sequential
allowed-tools:
- Read
- Glob
- Grep
- mcp__ace-tool__search_context
- mcp__ccw-tools__smart_search
```
## Architecture Overview
```

View File

@@ -2,9 +2,20 @@
用于生成新 Skill 入口文件的模板。
## 模板结构
## ⚠️ 重要:YAML Front Matter 规范
> **CRITICAL**: SKILL.md 文件必须以 YAML front matter 开头,即以 `---` 作为文件第一行。
>
> **禁止**使用以下格式:
> - `# Title` 然后 `## Metadata` + yaml 代码块 ❌
> - 任何在 `---` 之前的内容 ❌
>
> **正确格式**:文件第一行必须是 `---`
## 可直接应用的模板
以下是完整的 SKILL.md 模板。生成时**直接复制应用**,将 `{{变量}}` 替换为实际值:
```markdown
---
name: {{skill_name}}
description: {{description}}. Triggers on {{triggers}}.
@@ -33,9 +44,9 @@ allowed-tools: {{allowed_tools}}
\`\`\`javascript
const timestamp = new Date().toISOString().slice(0,19).replace(/[-:T]/g, '');
const workDir = `{{output_location}}`;
const workDir = \`{{output_location}}\`;
Bash(`mkdir -p "${workDir}"`);
Bash(\`mkdir -p "\${workDir}"\`);
{{additional_dirs}}
\`\`\`
@@ -48,7 +59,8 @@ Bash(`mkdir -p "${workDir}"`);
## Reference Documents
{{reference_table}}
```
---
## 变量说明

View File

@@ -1218,7 +1218,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
* Requires index
*/
async function executeCodexLensExactMode(params: Params): Promise<SearchResult> {
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params;
if (!query) {
return {
@@ -1241,10 +1241,18 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
// Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--json'];
const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--json'];
if (enrich) {
args.push('--enrich');
}
// Add code_only filter if requested
if (codeOnly) {
args.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
args.push('--exclude-extensions', excludeExtensions.join(','));
}
const result = await executeCodexLens(args, { cwd: path });
if (!result.success) {
@@ -1278,10 +1286,18 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
// Fallback to fuzzy mode if exact returns no results
if (allResults.length === 0) {
const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--use-fuzzy', '--json'];
const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--use-fuzzy', '--json'];
if (enrich) {
fuzzyArgs.push('--enrich');
}
// Add code_only filter if requested
if (codeOnly) {
fuzzyArgs.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
fuzzyArgs.push('--exclude-extensions', excludeExtensions.join(','));
}
const fuzzyResult = await executeCodexLens(fuzzyArgs, { cwd: path });
if (fuzzyResult.success) {
@@ -1343,7 +1359,7 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
*/
async function executeHybridMode(params: Params): Promise<SearchResult> {
const timer = createTimer();
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false } = params;
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params;
if (!query) {
return {
@@ -1368,10 +1384,18 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
// Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'dense_rerank', '--json'];
const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'dense_rerank', '--json'];
if (enrich) {
args.push('--enrich');
}
// Add code_only filter if requested
if (codeOnly) {
args.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
args.push('--exclude-extensions', excludeExtensions.join(','));
}
const result = await executeCodexLens(args, { cwd: path });
timer.mark('codexlens_search');

View File

@@ -435,6 +435,8 @@ def search(
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
method: str = typer.Option("dense_rerank", "--method", "-m", help="Search method: 'dense_rerank' (semantic, default), 'fts' (exact keyword)."),
use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."),
code_only: bool = typer.Option(False, "--code-only", help="Only return code files (excludes md, txt, json, yaml, xml, etc.)."),
exclude_extensions: Optional[str] = typer.Option(None, "--exclude-extensions", help="Comma-separated list of file extensions to exclude (e.g., 'md,txt,json')."),
# Hidden advanced options for backward compatibility
weights: Optional[str] = typer.Option(
None,
@@ -654,10 +656,18 @@ def search(
else:
raise ValueError(f"Invalid method: {actual_method}")
# Parse exclude_extensions from comma-separated string
exclude_exts_list = None
if exclude_extensions:
exclude_exts_list = [ext.strip() for ext in exclude_extensions.split(',') if ext.strip()]
options = SearchOptions(
depth=depth,
total_limit=limit,
offset=offset,
files_only=files_only,
code_only=code_only,
exclude_extensions=exclude_exts_list,
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
enable_vector=enable_vector,

View File

@@ -44,9 +44,12 @@ class SearchOptions:
max_workers: Number of parallel worker threads
limit_per_dir: Maximum results per directory
total_limit: Total result limit across all directories
offset: Pagination offset - skip first N results (default 0)
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results
code_only: Only return code files (excludes md, txt, json, yaml, xml, etc.)
exclude_extensions: List of file extensions to exclude (e.g., ["md", "txt", "json"])
hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
enable_vector: Enable vector semantic search (default False)
@@ -61,9 +64,12 @@ class SearchOptions:
max_workers: int = 8
limit_per_dir: int = 10
total_limit: int = 100
offset: int = 0
include_symbols: bool = False
files_only: bool = False
include_semantic: bool = False
code_only: bool = False
exclude_extensions: Optional[List[str]] = None
hybrid_mode: bool = False
enable_fuzzy: bool = True
enable_vector: bool = False
@@ -234,8 +240,14 @@ class ChainSearchEngine:
)
stats.errors = search_stats.errors
# Step 3.5: Filter by extension if requested
if options.code_only or options.exclude_extensions:
results = self._filter_by_extension(
results, options.code_only, options.exclude_extensions
)
# Step 4: Merge and rank
final_results = self._merge_and_rank(results, options.total_limit)
final_results = self._merge_and_rank(results, options.total_limit, options.offset)
# Step 5: Optional grouping of similar results
if options.group_results:
@@ -2092,21 +2104,72 @@ class ChainSearchEngine:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []
def _filter_by_extension(self, results: List[SearchResult],
                         code_only: bool = False,
                         exclude_extensions: Optional[List[str]] = None) -> List[SearchResult]:
    """Drop search results whose file extension is excluded.

    Args:
        results: Candidate search results to filter.
        code_only: When True, additionally exclude the built-in set of
            non-code extensions (docs, data, images, office files, locks).
        exclude_extensions: Caller-supplied extensions to exclude; leading
            dots are stripped and comparison is case-insensitive
            (e.g. [".MD", "txt"] both work).

    Returns:
        Results whose paths do not end with an excluded extension. When any
        exclusion is active, results with an empty path are dropped as well.
    """
    # Non-code file extensions (same as MCP tool smart-search.ts)
    non_code_extensions = {
        'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log',
        'ini', 'cfg', 'conf', 'toml', 'env', 'properties',
        'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'lock', 'sum', 'mod',
    }

    # Assemble the full set of blocked extensions (normalized to
    # lowercase without leading dots).
    blocked = set()
    if exclude_extensions:
        blocked = {e.lower().lstrip('.') for e in exclude_extensions}
    if code_only:
        blocked |= non_code_extensions

    # Nothing to exclude — hand the list back untouched.
    if not blocked:
        return results

    def keep(entry):
        # Results without a path cannot be classified; drop them,
        # matching the original behavior.
        path_value = entry.path
        if not path_value:
            return False
        # Paths without a dot have no extension and always survive.
        if '.' not in path_value:
            return True
        return path_value.rsplit('.', 1)[-1].lower() not in blocked

    return [entry for entry in results if keep(entry)]
def _merge_and_rank(self, results: List[SearchResult],
limit: int) -> List[SearchResult]:
limit: int, offset: int = 0) -> List[SearchResult]:
"""Aggregate, deduplicate, and rank results.
Process:
1. Deduplicate by path (keep highest score)
2. Sort by score descending
3. Limit to requested count
3. Apply offset and limit for pagination
Args:
results: Raw results from all indexes
limit: Maximum results to return
offset: Number of results to skip (pagination offset)
Returns:
Deduplicated and ranked results
Deduplicated and ranked results with pagination
"""
# Deduplicate by path, keeping best score
path_to_result: Dict[str, SearchResult] = {}
@@ -2119,8 +2182,8 @@ class ChainSearchEngine:
unique_results = list(path_to_result.values())
unique_results.sort(key=lambda r: r.score, reverse=True)
# Apply limit
return unique_results[:limit]
# Apply offset and limit for pagination
return unique_results[offset:offset + limit]
def _search_symbols_parallel(self, index_paths: List[Path],
name: str,