feat: 增加搜索功能的代码过滤选项,支持排除特定文件扩展名和仅返回代码文件

This commit is contained in:
catlog22
2026-01-06 23:19:47 +08:00
parent ef770ff29b
commit 1298fdd20f
5 changed files with 131 additions and 31 deletions

View File

@@ -1,22 +1,13 @@
---
name: code-reviewer
description: Comprehensive code review skill for identifying security vulnerabilities and best practices violations. Triggers on "code review", "review code", "security audit", "代码审查".
allowed-tools: Read, Glob, Grep, mcp__ace-tool__search_context, mcp__ccw-tools__smart_search
---
# Code Reviewer
Comprehensive code review skill for identifying security vulnerabilities and best practices violations.
## Metadata
```yaml
name: code-reviewer
description: 帮助审查代码的安全漏洞和最佳实践
version: 1.0.0
execution_mode: sequential
allowed-tools:
- Read
- Glob
- Grep
- mcp__ace-tool__search_context
- mcp__ccw-tools__smart_search
```
## Architecture Overview
```

View File

@@ -2,9 +2,20 @@
用于生成新 Skill 入口文件的模板。
## 模板结构
## ⚠️ 重要:YAML Front Matter 规范
> **CRITICAL**: SKILL.md 文件必须以 YAML front matter 开头,即以 `---` 作为文件第一行。
>
> **禁止**使用以下格式:
> - `# Title` 然后 `## Metadata` + yaml 代码块 ❌
> - 任何在 `---` 之前的内容 ❌
>
> **正确格式**:文件第一行必须是 `---`
## 可直接应用的模板
以下是完整的 SKILL.md 模板。生成时**直接复制应用**,将 `{{变量}}` 替换为实际值:
```markdown
---
name: {{skill_name}}
description: {{description}}. Triggers on {{triggers}}.
@@ -33,9 +44,9 @@ allowed-tools: {{allowed_tools}}
\`\`\`javascript
const timestamp = new Date().toISOString().slice(0,19).replace(/[-:T]/g, '');
const workDir = `{{output_location}}`;
const workDir = \`{{output_location}}\`;
Bash(`mkdir -p "${workDir}"`);
Bash(\`mkdir -p "\${workDir}"\`);
{{additional_dirs}}
\`\`\`
@@ -48,7 +59,8 @@ Bash(`mkdir -p "${workDir}"`);
## Reference Documents
{{reference_table}}
```
---
## 变量说明

View File

@@ -1218,7 +1218,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
* Requires index
*/
async function executeCodexLensExactMode(params: Params): Promise<SearchResult> {
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params;
if (!query) {
return {
@@ -1241,10 +1241,18 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
// Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--json'];
const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--json'];
if (enrich) {
args.push('--enrich');
}
// Add code_only filter if requested
if (codeOnly) {
args.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
args.push('--exclude-extensions', excludeExtensions.join(','));
}
const result = await executeCodexLens(args, { cwd: path });
if (!result.success) {
@@ -1278,10 +1286,18 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
// Fallback to fuzzy mode if exact returns no results
if (allResults.length === 0) {
const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--use-fuzzy', '--json'];
const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--use-fuzzy', '--json'];
if (enrich) {
fuzzyArgs.push('--enrich');
}
// Add code_only filter if requested
if (codeOnly) {
fuzzyArgs.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
fuzzyArgs.push('--exclude-extensions', excludeExtensions.join(','));
}
const fuzzyResult = await executeCodexLens(fuzzyArgs, { cwd: path });
if (fuzzyResult.success) {
@@ -1343,7 +1359,7 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
*/
async function executeHybridMode(params: Params): Promise<SearchResult> {
const timer = createTimer();
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false } = params;
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params;
if (!query) {
return {
@@ -1368,10 +1384,18 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
// Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'dense_rerank', '--json'];
const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'dense_rerank', '--json'];
if (enrich) {
args.push('--enrich');
}
// Add code_only filter if requested
if (codeOnly) {
args.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
args.push('--exclude-extensions', excludeExtensions.join(','));
}
const result = await executeCodexLens(args, { cwd: path });
timer.mark('codexlens_search');

View File

@@ -435,6 +435,8 @@ def search(
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
method: str = typer.Option("dense_rerank", "--method", "-m", help="Search method: 'dense_rerank' (semantic, default), 'fts' (exact keyword)."),
use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."),
code_only: bool = typer.Option(False, "--code-only", help="Only return code files (excludes md, txt, json, yaml, xml, etc.)."),
exclude_extensions: Optional[str] = typer.Option(None, "--exclude-extensions", help="Comma-separated list of file extensions to exclude (e.g., 'md,txt,json')."),
# Hidden advanced options for backward compatibility
weights: Optional[str] = typer.Option(
None,
@@ -654,10 +656,18 @@ def search(
else:
raise ValueError(f"Invalid method: {actual_method}")
# Parse exclude_extensions from comma-separated string
exclude_exts_list = None
if exclude_extensions:
exclude_exts_list = [ext.strip() for ext in exclude_extensions.split(',') if ext.strip()]
options = SearchOptions(
depth=depth,
total_limit=limit,
offset=offset,
files_only=files_only,
code_only=code_only,
exclude_extensions=exclude_exts_list,
hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy,
enable_vector=enable_vector,

View File

@@ -44,9 +44,12 @@ class SearchOptions:
max_workers: Number of parallel worker threads
limit_per_dir: Maximum results per directory
total_limit: Total result limit across all directories
offset: Pagination offset - skip first N results (default 0)
include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results
code_only: Only return code files (excludes md, txt, json, yaml, xml, etc.)
exclude_extensions: List of file extensions to exclude (e.g., ["md", "txt", "json"])
hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
enable_vector: Enable vector semantic search (default False)
@@ -61,9 +64,12 @@ class SearchOptions:
max_workers: int = 8
limit_per_dir: int = 10
total_limit: int = 100
offset: int = 0
include_symbols: bool = False
files_only: bool = False
include_semantic: bool = False
code_only: bool = False
exclude_extensions: Optional[List[str]] = None
hybrid_mode: bool = False
enable_fuzzy: bool = True
enable_vector: bool = False
@@ -234,8 +240,14 @@ class ChainSearchEngine:
)
stats.errors = search_stats.errors
# Step 3.5: Filter by extension if requested
if options.code_only or options.exclude_extensions:
results = self._filter_by_extension(
results, options.code_only, options.exclude_extensions
)
# Step 4: Merge and rank
final_results = self._merge_and_rank(results, options.total_limit)
final_results = self._merge_and_rank(results, options.total_limit, options.offset)
# Step 5: Optional grouping of similar results
if options.group_results:
@@ -2092,21 +2104,72 @@ class ChainSearchEngine:
self.logger.debug(f"Search error in {index_path}: {exc}")
return []
def _filter_by_extension(self, results: List[SearchResult],
                         code_only: bool = False,
                         exclude_extensions: Optional[List[str]] = None) -> List[SearchResult]:
    """Drop search results whose file extension is excluded.

    Args:
        results: Candidate search results to filter.
        code_only: When True, additionally exclude the built-in set of
            non-code extensions (docs, data, images, office files, locks).
        exclude_extensions: Caller-supplied extensions to exclude; leading
            dots are stripped and comparison is case-insensitive
            (e.g. [".MD", "txt"] both work).

    Returns:
        Results whose paths do not end with an excluded extension. When any
        exclusion is active, results with an empty path are dropped as well.
    """
    # Non-code file extensions (same as MCP tool smart-search.ts)
    non_code_extensions = {
        'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log',
        'ini', 'cfg', 'conf', 'toml', 'env', 'properties',
        'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'lock', 'sum', 'mod',
    }

    # Assemble the full set of blocked extensions (normalized to
    # lowercase without leading dots).
    blocked = set()
    if exclude_extensions:
        blocked = {e.lower().lstrip('.') for e in exclude_extensions}
    if code_only:
        blocked |= non_code_extensions

    # Nothing to exclude — hand the list back untouched.
    if not blocked:
        return results

    def keep(entry):
        # Results without a path cannot be classified; drop them,
        # matching the original behavior.
        path_value = entry.path
        if not path_value:
            return False
        # Paths without a dot have no extension and always survive.
        if '.' not in path_value:
            return True
        return path_value.rsplit('.', 1)[-1].lower() not in blocked

    return [entry for entry in results if keep(entry)]
def _merge_and_rank(self, results: List[SearchResult],
limit: int) -> List[SearchResult]:
limit: int, offset: int = 0) -> List[SearchResult]:
"""Aggregate, deduplicate, and rank results.
Process:
1. Deduplicate by path (keep highest score)
2. Sort by score descending
3. Limit to requested count
3. Apply offset and limit for pagination
Args:
results: Raw results from all indexes
limit: Maximum results to return
offset: Number of results to skip (pagination offset)
Returns:
Deduplicated and ranked results
Deduplicated and ranked results with pagination
"""
# Deduplicate by path, keeping best score
path_to_result: Dict[str, SearchResult] = {}
@@ -2119,8 +2182,8 @@ class ChainSearchEngine:
unique_results = list(path_to_result.values())
unique_results.sort(key=lambda r: r.score, reverse=True)
# Apply limit
return unique_results[:limit]
# Apply offset and limit for pagination
return unique_results[offset:offset + limit]
def _search_symbols_parallel(self, index_paths: List[Path],
name: str,