Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-05 01:50:27 +08:00)
fix: Fix SmartSearch ripgrep limit and FTS tokenizer issues

- Ripgrep mode: cap the total number of results to prevent returning more than 2MB of data (a standalone sketch follows this message)
  - --max-count only limits matches per file; the total limit is now applied while collecting results
  - when the limit is reached, a warning is added to the metadata
- FTS tokenizer: add the dot (.) to tokenchars so exact search works for dotted identifiers such as PortRole.FLOW
  - update the tokenize configuration in dir_index.py and migration_004_dual_fts.py
  - the index must be rebuilt for this to take effect
- Exact mode: add a fuzzy fallback that automatically retries with fuzzy search when exact search returns no results
  - the fallback is flagged in the metadata as fallback: 'fuzzy'

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
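The first point is easiest to see outside the diff: ripgrep's --max-count caps matches per file, so a search that hits many files can still produce an unbounded amount of JSON. Below is a minimal Python sketch of the collector-side cap, assuming rg is on PATH; the 500 default and the warning text mirror the diff, the --max-count value of 50 is arbitrary, and everything else is illustrative rather than the project's actual TypeScript code.

import json
import subprocess

def ripgrep_limited(pattern: str, path: str = ".", max_results: int = 500):
    """Run rg --json and cap the *total* number of collected matches.

    --max-count only limits matches per file, so the global cap has to be
    enforced while consuming the output stream.
    """
    proc = subprocess.run(
        ["rg", "--json", "--max-count", "50", pattern, path],
        capture_output=True,
        text=True,
    )
    results, limit_reached = [], False
    for line in proc.stdout.splitlines():
        if len(results) >= max_results:
            limit_reached = True
            break
        try:
            event = json.loads(line)
        except json.JSONDecodeError:
            continue
        if event.get("type") == "match":  # rg --json emits begin/match/end/summary events
            results.append(event["data"])
    metadata = {"count": len(results)}
    if limit_reached:
        metadata["warning"] = f"Result limit reached ({max_results}). Use a more specific query or increase limit."
    return results, metadata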
@@ -774,6 +774,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {

     let stdout = '';
     let stderr = '';
+    let resultLimitReached = false;

     child.stdout.on('data', (data) => {
       stdout += data.toString();
@@ -786,8 +787,16 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
     child.on('close', (code) => {
       const results: ExactMatch[] = [];
       const lines = stdout.split('\n').filter((line) => line.trim());
+      // Limit total results to prevent memory overflow (--max-count only limits per-file)
+      const effectiveLimit = maxResults > 0 ? maxResults : 500;

       for (const line of lines) {
+        // Stop collecting if we've reached the limit
+        if (results.length >= effectiveLimit) {
+          resultLimitReached = true;
+          break;
+        }
+
         try {
           const item = JSON.parse(line);

@@ -817,6 +826,15 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
       const scoredResults = tokens.length > 1 ? scoreByTokenMatch(results, tokens) : results;

       if (code === 0 || code === 1 || (isWindowsDeviceError && scoredResults.length > 0)) {
+        // Build warning message for various conditions
+        const warnings: string[] = [];
+        if (resultLimitReached) {
+          warnings.push(`Result limit reached (${effectiveLimit}). Use a more specific query or increase limit.`);
+        }
+        if (isWindowsDeviceError) {
+          warnings.push('Some Windows device files were skipped');
+        }
+
         resolve({
           success: true,
           results: scoredResults,
@@ -827,7 +845,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
             query,
             tokens: tokens.length > 1 ? tokens : undefined, // Include tokens in metadata for debugging
             tokenized: tokens.length > 1,
-            ...(isWindowsDeviceError && { warning: 'Some Windows device files were skipped' }),
+            ...(warnings.length > 0 && { warning: warnings.join('; ') }),
           },
         });
       } else if (isWindowsDeviceError && results.length === 0) {
@@ -923,6 +941,46 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
       // Keep empty results
     }

+    // Fallback to fuzzy mode if exact returns no results
+    if (results.length === 0) {
+      const fuzzyArgs = ['search', query, '--limit', maxResults.toString(), '--mode', 'fuzzy', '--json'];
+      if (enrich) {
+        fuzzyArgs.push('--enrich');
+      }
+      const fuzzyResult = await executeCodexLens(fuzzyArgs, { cwd: path });
+
+      if (fuzzyResult.success) {
+        try {
+          const parsed = JSON.parse(stripAnsi(fuzzyResult.output || '{}'));
+          const data = parsed.result?.results || parsed.results || parsed;
+          results = (Array.isArray(data) ? data : []).map((item: any) => ({
+            file: item.path || item.file,
+            score: item.score || 0,
+            content: item.excerpt || item.content || '',
+            symbol: item.symbol || null,
+          }));
+        } catch {
+          // Keep empty results
+        }
+
+        if (results.length > 0) {
+          return {
+            success: true,
+            results,
+            metadata: {
+              mode: 'exact',
+              backend: 'codexlens',
+              count: results.length,
+              query,
+              warning: indexStatus.warning,
+              note: 'No exact matches found, showing fuzzy results',
+              fallback: 'fuzzy',
+            },
+          };
+        }
+      }
+    }
+
     return {
       success: true,
       results,
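The fallback added above is a generic pattern: run the exact search, and only if it returns nothing rerun the same query in fuzzy mode, tagging the result so callers can tell the matches are approximate. Here is a small Python sketch of that control flow, with a hypothetical run_search(query, mode, limit) helper standing in for the CodexLens CLI call; only the metadata keys (fallback, note) are taken from the diff.

from typing import Any, Callable, Dict, List

SearchFn = Callable[[str, str, int], List[Dict[str, Any]]]

def exact_with_fuzzy_fallback(query: str, run_search: SearchFn, limit: int = 50) -> Dict[str, Any]:
    # Try the precise search first.
    results = run_search(query, "exact", limit)
    metadata: Dict[str, Any] = {"mode": "exact", "query": query}
    if not results:
        # Nothing exact: retry in fuzzy mode and flag the fallback so callers
        # know these are approximate matches, not exact ones.
        results = run_search(query, "fuzzy", limit)
        if results:
            metadata["fallback"] = "fuzzy"
            metadata["note"] = "No exact matches found, showing fuzzy results"
    metadata["count"] = len(results)
    return {"success": True, "results": results, "metadata": metadata}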
In dir_index.py:

@@ -1651,16 +1651,17 @@ class DirIndexStore:
         from codexlens.storage.sqlite_utils import check_trigram_support

         has_trigram = check_trigram_support(conn)
-        fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-'"
+        fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-.'"

         # Exact FTS table with unicode61 tokenizer
+        # Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
         conn.execute(
             """
             CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
                 name, full_path UNINDEXED, content,
                 content='files',
                 content_rowid='id',
-                tokenize="unicode61 tokenchars '_-'"
+                tokenize="unicode61 tokenchars '_-.'"
             )
             """
         )
In migration_004_dual_fts.py:

@@ -45,7 +45,7 @@ def upgrade(db_conn: Connection):
             f"Trigram tokenizer not available (requires SQLite >= 3.34), "
             f"using extended unicode61 tokenizer for fuzzy matching"
         )
-        fuzzy_tokenizer = "unicode61 tokenchars '_-'"
+        fuzzy_tokenizer = "unicode61 tokenchars '_-.'"

     # Start transaction
     cursor.execute("BEGIN TRANSACTION")
@@ -122,7 +122,8 @@ def upgrade(db_conn: Connection):
     # Drop old FTS table
     cursor.execute("DROP TABLE IF EXISTS files_fts")

-    # Create exact FTS table (unicode61 with underscores/hyphens as token chars)
+    # Create exact FTS table (unicode61 with underscores/hyphens/dots as token chars)
+    # Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
     log.info("Creating files_fts_exact table with unicode61 tokenizer...")
     cursor.execute(
         """
@@ -130,7 +131,7 @@ def upgrade(db_conn: Connection):
             name, full_path UNINDEXED, content,
             content='files',
             content_rowid='id',
-            tokenize="unicode61 tokenchars '_-'"
+            tokenize="unicode61 tokenchars '_-.'"
         )
         """
     )
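The effect of adding '.' to tokenchars can be checked with nothing but the sqlite3 standard library, assuming the interpreter's SQLite ships FTS5 (CPython's bundled build does): with the old tokenizer, PortRole.FLOW splits into two tokens, so a query for FLOW alone matches it; with the dot included it stays one token and only the full qualified name matches. A self-contained sketch, not the project's code:

import sqlite3

def hits(tokenize: str, query: str) -> int:
    conn = sqlite3.connect(":memory:")
    conn.execute(f'CREATE VIRTUAL TABLE docs USING fts5(content, tokenize="{tokenize}")')
    conn.execute("INSERT INTO docs(content) VALUES ('role = PortRole.FLOW')")
    return len(conn.execute("SELECT rowid FROM docs WHERE docs MATCH ?", (query,)).fetchall())

# Old tokenizer: '.' is a separator, so the identifier is indexed as two tokens
# ('portrole', 'flow') and a bare FLOW query also hits PortRole.FLOW.
print(hits("unicode61 tokenchars '_-'", '"FLOW"'))            # 1
# New tokenizer: '.' is a token character, so PortRole.FLOW is a single token.
print(hits("unicode61 tokenchars '_-.'", '"FLOW"'))           # 0
print(hits("unicode61 tokenchars '_-.'", '"PortRole.FLOW"'))  # 1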