mirror of
https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-06 01:54:11 +08:00
feat: Enhance CodexLens search functionality with new parameters and result handling
- Added search limit, content length, and extra files input fields in the CodexLens manager UI.
- Updated API request parameters to include new fields: max_content_length and extra_files_count.
- Refactored smart-search.ts to support new parameters with default values.
- Implemented result splitting logic to return both full content and additional file paths.
- Updated CLI commands to remove worker limits and allow dynamic scaling based on endpoint count.
- Introduced EmbeddingPoolConfig for improved embedding management and auto-discovery of providers.
- Enhanced search engines to utilize new parameters for fuzzy and exact searches.
- Added support for embedding single texts in the LiteLLM embedder.
This commit is contained in:
@@ -36,10 +36,12 @@ const ParamsSchema = z.object({
|
||||
path: z.string().optional(),
|
||||
paths: z.array(z.string()).default([]),
|
||||
contextLines: z.number().default(0),
|
||||
maxResults: z.number().default(20), // Increased default
|
||||
maxResults: z.number().default(5), // Default 5 with full content
|
||||
includeHidden: z.boolean().default(false),
|
||||
languages: z.array(z.string()).optional(),
|
||||
limit: z.number().default(20), // Increased default
|
||||
limit: z.number().default(5), // Default 5 with full content
|
||||
extraFilesCount: z.number().default(10), // Additional file-only results
|
||||
maxContentLength: z.number().default(200), // Max content length for truncation (50-2000)
|
||||
offset: z.number().default(0), // NEW: Pagination offset (start_index)
|
||||
enrich: z.boolean().default(false),
|
||||
// Search modifiers for ripgrep mode
|
||||
@@ -268,6 +270,7 @@ interface SearchMetadata {
|
||||
interface SearchResult {
|
||||
success: boolean;
|
||||
results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown;
|
||||
extra_files?: string[]; // Additional file paths without content
|
||||
output?: string;
|
||||
metadata?: SearchMetadata;
|
||||
error?: string;
|
||||
@@ -301,6 +304,42 @@ function stripAnsi(str: string): string {
|
||||
return str.replace(/\x1b\[[0-9;]*m/g, '');
|
||||
}
|
||||
|
||||
/** Default maximum content length to return (avoid excessive output) */
|
||||
const DEFAULT_MAX_CONTENT_LENGTH = 200;
|
||||
|
||||
/**
|
||||
* Truncate content to specified length with ellipsis
|
||||
* @param content - The content to truncate
|
||||
* @param maxLength - Maximum length (default: 200)
|
||||
*/
|
||||
function truncateContent(content: string | null | undefined, maxLength: number = DEFAULT_MAX_CONTENT_LENGTH): string {
|
||||
if (!content) return '';
|
||||
if (content.length <= maxLength) return content;
|
||||
return content.slice(0, maxLength) + '...';
|
||||
}
|
||||
|
||||
/**
|
||||
* Split results into full content results and extra file-only results
|
||||
* Generic function supporting both SemanticMatch and ExactMatch types
|
||||
* @param allResults - All search results (must have 'file' property)
|
||||
* @param fullContentLimit - Number of results with full content (default: 5)
|
||||
* @param extraFilesCount - Number of additional file-only results (default: 10)
|
||||
*/
|
||||
function splitResultsWithExtraFiles<T extends { file: string }>(
|
||||
allResults: T[],
|
||||
fullContentLimit: number = 5,
|
||||
extraFilesCount: number = 10
|
||||
): { results: T[]; extra_files: string[] } {
|
||||
// First N results with full content
|
||||
const results = allResults.slice(0, fullContentLimit);
|
||||
|
||||
// Next M results as file paths only (deduplicated)
|
||||
const extraResults = allResults.slice(fullContentLimit, fullContentLimit + extraFilesCount);
|
||||
const extra_files = [...new Set(extraResults.map(r => r.file))];
|
||||
|
||||
return { results, extra_files };
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if CodexLens index exists for current directory
|
||||
* @param path - Directory path to check
|
||||
@@ -714,7 +753,7 @@ async function executeAutoMode(params: Params): Promise<SearchResult> {
|
||||
* Supports tokenized multi-word queries with OR matching and result ranking
|
||||
*/
|
||||
async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params;
|
||||
const { query, paths = [], contextLines = 0, maxResults = 5, extraFilesCount = 10, maxContentLength = 200, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params;
|
||||
|
||||
if (!query) {
|
||||
return {
|
||||
@@ -726,6 +765,9 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
// Check if ripgrep is available
|
||||
const hasRipgrep = checkToolAvailability('rg');
|
||||
|
||||
// Calculate total to fetch for split (full content + extra files)
|
||||
const totalToFetch = maxResults + extraFilesCount;
|
||||
|
||||
// If ripgrep not available, fall back to CodexLens exact mode
|
||||
if (!hasRipgrep) {
|
||||
const readyStatus = await ensureCodexLensReady();
|
||||
@@ -737,7 +779,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
}
|
||||
|
||||
// Use CodexLens exact mode as fallback
|
||||
const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json'];
|
||||
const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'exact', '--json'];
|
||||
const result = await executeCodexLens(args, { cwd: path });
|
||||
|
||||
if (!result.success) {
|
||||
@@ -754,23 +796,27 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
}
|
||||
|
||||
// Parse results
|
||||
let results: SemanticMatch[] = [];
|
||||
let allResults: SemanticMatch[] = [];
|
||||
try {
|
||||
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
|
||||
const data = parsed.result?.results || parsed.results || parsed;
|
||||
results = (Array.isArray(data) ? data : []).map((item: any) => ({
|
||||
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
|
||||
file: item.path || item.file,
|
||||
score: item.score || 0,
|
||||
content: item.excerpt || item.content || '',
|
||||
content: truncateContent(item.content || item.excerpt, maxContentLength),
|
||||
symbol: item.symbol || null,
|
||||
}));
|
||||
} catch {
|
||||
// Keep empty results
|
||||
}
|
||||
|
||||
// Split results: first N with full content, rest as file paths only
|
||||
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
results,
|
||||
extra_files: extra_files.length > 0 ? extra_files : undefined,
|
||||
metadata: {
|
||||
mode: 'ripgrep',
|
||||
backend: 'codexlens-fallback',
|
||||
@@ -781,12 +827,12 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
};
|
||||
}
|
||||
|
||||
// Use ripgrep
|
||||
// Use ripgrep - request more results to support split
|
||||
const { command, args, tokens } = buildRipgrepCommand({
|
||||
query,
|
||||
paths: paths.length > 0 ? paths : [path],
|
||||
contextLines,
|
||||
maxResults,
|
||||
maxResults: totalToFetch, // Fetch more to support split
|
||||
includeHidden,
|
||||
regex,
|
||||
caseSensitive,
|
||||
@@ -812,14 +858,14 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
});
|
||||
|
||||
child.on('close', (code) => {
|
||||
const results: ExactMatch[] = [];
|
||||
const allResults: ExactMatch[] = [];
|
||||
const lines = stdout.split('\n').filter((line) => line.trim());
|
||||
// Limit total results to prevent memory overflow (--max-count only limits per-file)
|
||||
const effectiveLimit = maxResults > 0 ? maxResults : 500;
|
||||
const effectiveLimit = totalToFetch > 0 ? totalToFetch : 500;
|
||||
|
||||
for (const line of lines) {
|
||||
// Stop collecting if we've reached the limit
|
||||
if (results.length >= effectiveLimit) {
|
||||
if (allResults.length >= effectiveLimit) {
|
||||
resultLimitReached = true;
|
||||
break;
|
||||
}
|
||||
@@ -837,7 +883,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
: 1,
|
||||
content: item.data.lines.text.trim(),
|
||||
};
|
||||
results.push(match);
|
||||
allResults.push(match);
|
||||
}
|
||||
} catch {
|
||||
continue;
|
||||
@@ -850,9 +896,12 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
|
||||
// Apply token-based scoring and sorting for multi-word queries
|
||||
// Results matching more tokens are ranked higher (exact matches first)
|
||||
const scoredResults = tokens.length > 1 ? scoreByTokenMatch(results, tokens) : results;
|
||||
const scoredResults = tokens.length > 1 ? scoreByTokenMatch(allResults, tokens) : allResults;
|
||||
|
||||
if (code === 0 || code === 1 || (isWindowsDeviceError && scoredResults.length > 0)) {
|
||||
// Split results: first N with full content, rest as file paths only
|
||||
const { results, extra_files } = splitResultsWithExtraFiles(scoredResults, maxResults, extraFilesCount);
|
||||
|
||||
// Build warning message for various conditions
|
||||
const warnings: string[] = [];
|
||||
if (resultLimitReached) {
|
||||
@@ -864,18 +913,19 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
|
||||
resolve({
|
||||
success: true,
|
||||
results: scoredResults,
|
||||
results,
|
||||
extra_files: extra_files.length > 0 ? extra_files : undefined,
|
||||
metadata: {
|
||||
mode: 'ripgrep',
|
||||
backend: 'ripgrep',
|
||||
count: scoredResults.length,
|
||||
count: results.length,
|
||||
query,
|
||||
tokens: tokens.length > 1 ? tokens : undefined, // Include tokens in metadata for debugging
|
||||
tokenized: tokens.length > 1,
|
||||
...(warnings.length > 0 && { warning: warnings.join('; ') }),
|
||||
},
|
||||
});
|
||||
} else if (isWindowsDeviceError && results.length === 0) {
|
||||
} else if (isWindowsDeviceError && allResults.length === 0) {
|
||||
// Windows device error but no results - might be the only issue
|
||||
resolve({
|
||||
success: true,
|
||||
@@ -912,7 +962,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
|
||||
* Requires index
|
||||
*/
|
||||
async function executeCodexLensExactMode(params: Params): Promise<SearchResult> {
|
||||
const { query, path = '.', maxResults = 10, enrich = false } = params;
|
||||
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
|
||||
|
||||
if (!query) {
|
||||
return {
|
||||
@@ -933,7 +983,9 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
|
||||
// Check index status
|
||||
const indexStatus = await checkIndexStatus(path);
|
||||
|
||||
const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json'];
|
||||
// Request more results to support split (full content + extra files)
|
||||
const totalToFetch = maxResults + extraFilesCount;
|
||||
const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'exact', '--json'];
|
||||
if (enrich) {
|
||||
args.push('--enrich');
|
||||
}
|
||||
@@ -954,14 +1006,14 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
|
||||
}
|
||||
|
||||
// Parse results
|
||||
let results: SemanticMatch[] = [];
|
||||
let allResults: SemanticMatch[] = [];
|
||||
try {
|
||||
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
|
||||
const data = parsed.result?.results || parsed.results || parsed;
|
||||
results = (Array.isArray(data) ? data : []).map((item: any) => ({
|
||||
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
|
||||
file: item.path || item.file,
|
||||
score: item.score || 0,
|
||||
content: item.excerpt || item.content || '',
|
||||
content: truncateContent(item.content || item.excerpt, maxContentLength),
|
||||
symbol: item.symbol || null,
|
||||
}));
|
||||
} catch {
|
||||
@@ -969,8 +1021,8 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
|
||||
}
|
||||
|
||||
// Fallback to fuzzy mode if exact returns no results
|
||||
if (results.length === 0) {
|
||||
const fuzzyArgs = ['search', query, '--limit', maxResults.toString(), '--mode', 'fuzzy', '--json'];
|
||||
if (allResults.length === 0) {
|
||||
const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'fuzzy', '--json'];
|
||||
if (enrich) {
|
||||
fuzzyArgs.push('--enrich');
|
||||
}
|
||||
@@ -980,20 +1032,23 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
|
||||
try {
|
||||
const parsed = JSON.parse(stripAnsi(fuzzyResult.output || '{}'));
|
||||
const data = parsed.result?.results || parsed.results || parsed;
|
||||
results = (Array.isArray(data) ? data : []).map((item: any) => ({
|
||||
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
|
||||
file: item.path || item.file,
|
||||
score: item.score || 0,
|
||||
content: item.excerpt || item.content || '',
|
||||
content: truncateContent(item.content || item.excerpt, maxContentLength),
|
||||
symbol: item.symbol || null,
|
||||
}));
|
||||
} catch {
|
||||
// Keep empty results
|
||||
}
|
||||
|
||||
if (results.length > 0) {
|
||||
if (allResults.length > 0) {
|
||||
// Split results: first N with full content, rest as file paths only
|
||||
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
|
||||
return {
|
||||
success: true,
|
||||
results,
|
||||
extra_files: extra_files.length > 0 ? extra_files : undefined,
|
||||
metadata: {
|
||||
mode: 'exact',
|
||||
backend: 'codexlens',
|
||||
@@ -1008,9 +1063,13 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
|
||||
}
|
||||
}
|
||||
|
||||
// Split results: first N with full content, rest as file paths only
|
||||
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
results,
|
||||
extra_files: extra_files.length > 0 ? extra_files : undefined,
|
||||
metadata: {
|
||||
mode: 'exact',
|
||||
backend: 'codexlens',
|
||||
@@ -1027,7 +1086,7 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
|
||||
* Requires index with embeddings
|
||||
*/
|
||||
async function executeHybridMode(params: Params): Promise<SearchResult> {
|
||||
const { query, path = '.', maxResults = 10, enrich = false } = params;
|
||||
const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
|
||||
|
||||
if (!query) {
|
||||
return {
|
||||
@@ -1048,7 +1107,9 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
|
||||
// Check index status
|
||||
const indexStatus = await checkIndexStatus(path);
|
||||
|
||||
const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'hybrid', '--json'];
|
||||
// Request more results to support split (full content + extra files)
|
||||
const totalToFetch = maxResults + extraFilesCount;
|
||||
const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'hybrid', '--json'];
|
||||
if (enrich) {
|
||||
args.push('--enrich');
|
||||
}
|
||||
@@ -1069,14 +1130,14 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
|
||||
}
|
||||
|
||||
// Parse results
|
||||
let results: SemanticMatch[] = [];
|
||||
let allResults: SemanticMatch[] = [];
|
||||
let baselineInfo: { score: number; count: number } | null = null;
|
||||
let initialCount = 0;
|
||||
|
||||
try {
|
||||
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
|
||||
const data = parsed.result?.results || parsed.results || parsed;
|
||||
results = (Array.isArray(data) ? data : []).map((item: any) => {
|
||||
allResults = (Array.isArray(data) ? data : []).map((item: any) => {
|
||||
const rawScore = item.score || 0;
|
||||
// Hybrid mode returns distance scores (lower is better).
|
||||
// Convert to similarity scores (higher is better) for consistency.
|
||||
@@ -1085,27 +1146,27 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
|
||||
return {
|
||||
file: item.path || item.file,
|
||||
score: similarityScore,
|
||||
content: item.excerpt || item.content || '',
|
||||
content: truncateContent(item.content || item.excerpt, maxContentLength),
|
||||
symbol: item.symbol || null,
|
||||
};
|
||||
});
|
||||
|
||||
initialCount = results.length;
|
||||
initialCount = allResults.length;
|
||||
|
||||
// Post-processing pipeline to improve semantic search quality
|
||||
// 0. Filter dominant baseline scores (hot spot detection)
|
||||
const baselineResult = filterDominantBaselineScores(results);
|
||||
results = baselineResult.filteredResults;
|
||||
const baselineResult = filterDominantBaselineScores(allResults);
|
||||
allResults = baselineResult.filteredResults;
|
||||
baselineInfo = baselineResult.baselineInfo;
|
||||
|
||||
// 1. Filter noisy files (coverage, node_modules, etc.)
|
||||
results = filterNoisyFiles(results);
|
||||
allResults = filterNoisyFiles(allResults);
|
||||
// 2. Boost results containing query keywords
|
||||
results = applyKeywordBoosting(results, query);
|
||||
allResults = applyKeywordBoosting(allResults, query);
|
||||
// 3. Enforce score diversity (penalize identical scores)
|
||||
results = enforceScoreDiversity(results);
|
||||
allResults = enforceScoreDiversity(allResults);
|
||||
// 4. Re-sort by adjusted scores
|
||||
results.sort((a, b) => b.score - a.score);
|
||||
allResults.sort((a, b) => b.score - a.score);
|
||||
} catch {
|
||||
return {
|
||||
success: true,
|
||||
@@ -1121,15 +1182,19 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
|
||||
};
|
||||
}
|
||||
|
||||
// Split results: first N with full content, rest as file paths only
|
||||
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
|
||||
|
||||
// Build metadata with baseline info if detected
|
||||
let note = 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results';
|
||||
if (baselineInfo) {
|
||||
note += ` | Filtered ${initialCount - results.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
|
||||
note += ` | Filtered ${initialCount - allResults.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
results,
|
||||
extra_files: extra_files.length > 0 ? extra_files : undefined,
|
||||
metadata: {
|
||||
mode: 'hybrid',
|
||||
backend: 'codexlens',
|
||||
@@ -1540,7 +1605,7 @@ export const schema: ToolSchema = {
|
||||
mode: {
|
||||
type: 'string',
|
||||
enum: SEARCH_MODES,
|
||||
description: 'Search mode: auto (default), hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback: hybrid->exact->ripgrep)',
|
||||
description: 'Search mode: auto, hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback chain)',
|
||||
default: 'auto',
|
||||
},
|
||||
output_mode: {
|
||||
@@ -1576,6 +1641,16 @@ export const schema: ToolSchema = {
|
||||
description: 'Alias for maxResults (default: 20)',
|
||||
default: 20,
|
||||
},
|
||||
extraFilesCount: {
|
||||
type: 'number',
|
||||
description: 'Number of additional file-only results (paths without content)',
|
||||
default: 10,
|
||||
},
|
||||
maxContentLength: {
|
||||
type: 'number',
|
||||
description: 'Maximum content length for truncation (50-2000)',
|
||||
default: 200,
|
||||
},
|
||||
offset: {
|
||||
type: 'number',
|
||||
description: 'Pagination offset - skip first N results (default: 0)',
|
||||
|
||||
Reference in New Issue
Block a user