feat: 增加搜索功能的代码过滤选项,支持排除特定文件扩展名和仅返回代码文件

This commit is contained in:
catlog22
2026-01-06 23:19:47 +08:00
parent ef770ff29b
commit 1298fdd20f
5 changed files with 131 additions and 31 deletions

View File

@@ -1,22 +1,13 @@
---
name: code-reviewer
description: Comprehensive code review skill for identifying security vulnerabilities and best practices violations. Triggers on "code review", "review code", "security audit", "代码审查".
allowed-tools: Read, Glob, Grep, mcp__ace-tool__search_context, mcp__ccw-tools__smart_search
---
# Code Reviewer # Code Reviewer
Comprehensive code review skill for identifying security vulnerabilities and best practices violations. Comprehensive code review skill for identifying security vulnerabilities and best practices violations.
## Metadata
```yaml
name: code-reviewer
description: 帮助审查代码的安全漏洞和最佳实践
version: 1.0.0
execution_mode: sequential
allowed-tools:
- Read
- Glob
- Grep
- mcp__ace-tool__search_context
- mcp__ccw-tools__smart_search
```
## Architecture Overview ## Architecture Overview
``` ```

View File

@@ -2,9 +2,20 @@
用于生成新 Skill 入口文件的模板。 用于生成新 Skill 入口文件的模板。
## 模板结构 ## ⚠️ 重要:YAML Front Matter 规范
> **CRITICAL**: SKILL.md 文件必须以 YAML front matter 开头,即以 `---` 作为文件第一行。
>
> **禁止**使用以下格式:
> - `# Title` 然后 `## Metadata` + yaml 代码块 ❌
> - 任何在 `---` 之前的内容 ❌
>
> **正确格式**:文件第一行必须是 `---`
## 可直接应用的模板
以下是完整的 SKILL.md 模板。生成时**直接复制应用**,将 `{{变量}}` 替换为实际值:
```markdown
--- ---
name: {{skill_name}} name: {{skill_name}}
description: {{description}}. Triggers on {{triggers}}. description: {{description}}. Triggers on {{triggers}}.
@@ -33,9 +44,9 @@ allowed-tools: {{allowed_tools}}
\`\`\`javascript \`\`\`javascript
const timestamp = new Date().toISOString().slice(0,19).replace(/[-:T]/g, ''); const timestamp = new Date().toISOString().slice(0,19).replace(/[-:T]/g, '');
const workDir = `{{output_location}}`; const workDir = \`{{output_location}}\`;
Bash(`mkdir -p "${workDir}"`); Bash(\`mkdir -p "\${workDir}"\`);
{{additional_dirs}} {{additional_dirs}}
\`\`\` \`\`\`
@@ -48,7 +59,8 @@ Bash(`mkdir -p "${workDir}"`);
## Reference Documents ## Reference Documents
{{reference_table}} {{reference_table}}
```
---
## 变量说明 ## 变量说明

View File

@@ -1218,7 +1218,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
* Requires index * Requires index
*/ */
async function executeCodexLensExactMode(params: Params): Promise<SearchResult> { async function executeCodexLensExactMode(params: Params): Promise<SearchResult> {
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params; const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params;
if (!query) { if (!query) {
return { return {
@@ -1241,10 +1241,18 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
// Request more results to support split (full content + extra files) // Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount; const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--json']; const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--json'];
if (enrich) { if (enrich) {
args.push('--enrich'); args.push('--enrich');
} }
// Add code_only filter if requested
if (codeOnly) {
args.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
args.push('--exclude-extensions', excludeExtensions.join(','));
}
const result = await executeCodexLens(args, { cwd: path }); const result = await executeCodexLens(args, { cwd: path });
if (!result.success) { if (!result.success) {
@@ -1278,10 +1286,18 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
// Fallback to fuzzy mode if exact returns no results // Fallback to fuzzy mode if exact returns no results
if (allResults.length === 0) { if (allResults.length === 0) {
const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--use-fuzzy', '--json']; const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--use-fuzzy', '--json'];
if (enrich) { if (enrich) {
fuzzyArgs.push('--enrich'); fuzzyArgs.push('--enrich');
} }
// Add code_only filter if requested
if (codeOnly) {
fuzzyArgs.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
fuzzyArgs.push('--exclude-extensions', excludeExtensions.join(','));
}
const fuzzyResult = await executeCodexLens(fuzzyArgs, { cwd: path }); const fuzzyResult = await executeCodexLens(fuzzyArgs, { cwd: path });
if (fuzzyResult.success) { if (fuzzyResult.success) {
@@ -1343,7 +1359,7 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
*/ */
async function executeHybridMode(params: Params): Promise<SearchResult> { async function executeHybridMode(params: Params): Promise<SearchResult> {
const timer = createTimer(); const timer = createTimer();
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false } = params; const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params;
if (!query) { if (!query) {
return { return {
@@ -1368,10 +1384,18 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
// Request more results to support split (full content + extra files) // Request more results to support split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount; const totalToFetch = maxResults + extraFilesCount;
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'dense_rerank', '--json']; const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'dense_rerank', '--json'];
if (enrich) { if (enrich) {
args.push('--enrich'); args.push('--enrich');
} }
// Add code_only filter if requested
if (codeOnly) {
args.push('--code-only');
}
// Add exclude_extensions filter if provided
if (excludeExtensions && excludeExtensions.length > 0) {
args.push('--exclude-extensions', excludeExtensions.join(','));
}
const result = await executeCodexLens(args, { cwd: path }); const result = await executeCodexLens(args, { cwd: path });
timer.mark('codexlens_search'); timer.mark('codexlens_search');

View File

@@ -435,6 +435,8 @@ def search(
files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."), files_only: bool = typer.Option(False, "--files-only", "-f", help="Return only file paths without content snippets."),
method: str = typer.Option("dense_rerank", "--method", "-m", help="Search method: 'dense_rerank' (semantic, default), 'fts' (exact keyword)."), method: str = typer.Option("dense_rerank", "--method", "-m", help="Search method: 'dense_rerank' (semantic, default), 'fts' (exact keyword)."),
use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."), use_fuzzy: bool = typer.Option(False, "--use-fuzzy", help="Enable fuzzy matching in FTS method."),
code_only: bool = typer.Option(False, "--code-only", help="Only return code files (excludes md, txt, json, yaml, xml, etc.)."),
exclude_extensions: Optional[str] = typer.Option(None, "--exclude-extensions", help="Comma-separated list of file extensions to exclude (e.g., 'md,txt,json')."),
# Hidden advanced options for backward compatibility # Hidden advanced options for backward compatibility
weights: Optional[str] = typer.Option( weights: Optional[str] = typer.Option(
None, None,
@@ -654,10 +656,18 @@ def search(
else: else:
raise ValueError(f"Invalid method: {actual_method}") raise ValueError(f"Invalid method: {actual_method}")
# Parse exclude_extensions from comma-separated string
exclude_exts_list = None
if exclude_extensions:
exclude_exts_list = [ext.strip() for ext in exclude_extensions.split(',') if ext.strip()]
options = SearchOptions( options = SearchOptions(
depth=depth, depth=depth,
total_limit=limit, total_limit=limit,
offset=offset,
files_only=files_only, files_only=files_only,
code_only=code_only,
exclude_extensions=exclude_exts_list,
hybrid_mode=hybrid_mode, hybrid_mode=hybrid_mode,
enable_fuzzy=enable_fuzzy, enable_fuzzy=enable_fuzzy,
enable_vector=enable_vector, enable_vector=enable_vector,

View File

@@ -44,9 +44,12 @@ class SearchOptions:
max_workers: Number of parallel worker threads max_workers: Number of parallel worker threads
limit_per_dir: Maximum results per directory limit_per_dir: Maximum results per directory
total_limit: Total result limit across all directories total_limit: Total result limit across all directories
offset: Pagination offset - skip first N results (default 0)
include_symbols: Whether to include symbol search results include_symbols: Whether to include symbol search results
files_only: Return only file paths without excerpts files_only: Return only file paths without excerpts
include_semantic: Whether to include semantic keyword search results include_semantic: Whether to include semantic keyword search results
code_only: Only return code files (excludes md, txt, json, yaml, xml, etc.)
exclude_extensions: List of file extensions to exclude (e.g., ["md", "txt", "json"])
hybrid_mode: Enable hybrid search with RRF fusion (default False) hybrid_mode: Enable hybrid search with RRF fusion (default False)
enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True) enable_fuzzy: Enable fuzzy FTS in hybrid mode (default True)
enable_vector: Enable vector semantic search (default False) enable_vector: Enable vector semantic search (default False)
@@ -61,9 +64,12 @@ class SearchOptions:
max_workers: int = 8 max_workers: int = 8
limit_per_dir: int = 10 limit_per_dir: int = 10
total_limit: int = 100 total_limit: int = 100
offset: int = 0
include_symbols: bool = False include_symbols: bool = False
files_only: bool = False files_only: bool = False
include_semantic: bool = False include_semantic: bool = False
code_only: bool = False
exclude_extensions: Optional[List[str]] = None
hybrid_mode: bool = False hybrid_mode: bool = False
enable_fuzzy: bool = True enable_fuzzy: bool = True
enable_vector: bool = False enable_vector: bool = False
@@ -234,8 +240,14 @@ class ChainSearchEngine:
) )
stats.errors = search_stats.errors stats.errors = search_stats.errors
# Step 3.5: Filter by extension if requested
if options.code_only or options.exclude_extensions:
results = self._filter_by_extension(
results, options.code_only, options.exclude_extensions
)
# Step 4: Merge and rank # Step 4: Merge and rank
final_results = self._merge_and_rank(results, options.total_limit) final_results = self._merge_and_rank(results, options.total_limit, options.offset)
# Step 5: Optional grouping of similar results # Step 5: Optional grouping of similar results
if options.group_results: if options.group_results:
@@ -2092,21 +2104,72 @@ class ChainSearchEngine:
self.logger.debug(f"Search error in {index_path}: {exc}") self.logger.debug(f"Search error in {index_path}: {exc}")
return [] return []
def _filter_by_extension(self, results: List[SearchResult],
                         code_only: bool = False,
                         exclude_extensions: Optional[List[str]] = None) -> List[SearchResult]:
    """Drop search results whose file extension is on the exclusion list.

    Args:
        results: Search results to filter.
        code_only: When True, also exclude the built-in set of non-code
            extensions (md, txt, json, yaml, xml, images, office docs, ...).
        exclude_extensions: Caller-supplied extensions to exclude; entries
            are case-insensitive and may carry a leading dot (".md" == "md").

    Returns:
        A new list containing only the results that survive filtering.
        Results with an empty/missing path are dropped whenever any
        filtering is active (matches the original behavior).
    """
    # Extensions treated as non-code — kept in sync with the MCP tool
    # smart-search.ts; NOTE(review): verify the two lists stay aligned.
    NON_CODE_EXTENSIONS = {
        'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log',
        'ini', 'cfg', 'conf', 'toml', 'env', 'properties',
        'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp',
        'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
        'lock', 'sum', 'mod',
    }

    # Normalize caller-supplied extensions: lowercase, strip leading dots.
    blocked = ({ext.lower().lstrip('.') for ext in exclude_extensions}
               if exclude_extensions else set())
    if code_only:
        blocked |= NON_CODE_EXTENSIONS

    # Nothing to exclude — hand back the input list unchanged.
    if not blocked:
        return results

    def _keep(item: SearchResult) -> bool:
        # Pathless results are discarded while filtering is active.
        target = item.path
        if not target:
            return False
        # rpartition: empty separator means the path has no '.' at all,
        # in which case the result is always kept.
        _, dot, tail = target.rpartition('.')
        return not (dot and tail.lower() in blocked)

    return [entry for entry in results if _keep(entry)]
def _merge_and_rank(self, results: List[SearchResult], def _merge_and_rank(self, results: List[SearchResult],
limit: int) -> List[SearchResult]: limit: int, offset: int = 0) -> List[SearchResult]:
"""Aggregate, deduplicate, and rank results. """Aggregate, deduplicate, and rank results.
Process: Process:
1. Deduplicate by path (keep highest score) 1. Deduplicate by path (keep highest score)
2. Sort by score descending 2. Sort by score descending
3. Limit to requested count 3. Apply offset and limit for pagination
Args: Args:
results: Raw results from all indexes results: Raw results from all indexes
limit: Maximum results to return limit: Maximum results to return
offset: Number of results to skip (pagination offset)
Returns: Returns:
Deduplicated and ranked results Deduplicated and ranked results with pagination
""" """
# Deduplicate by path, keeping best score # Deduplicate by path, keeping best score
path_to_result: Dict[str, SearchResult] = {} path_to_result: Dict[str, SearchResult] = {}
@@ -2119,8 +2182,8 @@ class ChainSearchEngine:
unique_results = list(path_to_result.values()) unique_results = list(path_to_result.values())
unique_results.sort(key=lambda r: r.score, reverse=True) unique_results.sort(key=lambda r: r.score, reverse=True)
# Apply limit # Apply offset and limit for pagination
return unique_results[:limit] return unique_results[offset:offset + limit]
def _search_symbols_parallel(self, index_paths: List[Path], def _search_symbols_parallel(self, index_paths: List[Path],
name: str, name: str,