Mirror of https://github.com/catlog22/Claude-Code-Workflow.git (synced 2026-02-05)
feat: Enhance CodexLens search functionality with new parameters and result handling
- Added search limit, content length, and extra files input fields in the CodexLens manager UI.
- Updated API request parameters to include new fields: max_content_length and extra_files_count.
- Refactored smart-search.ts to support new parameters with default values.
- Implemented result splitting logic to return both full content and additional file paths.
- Updated CLI commands to remove worker limits and allow dynamic scaling based on endpoint count.
- Introduced EmbeddingPoolConfig for improved embedding management and auto-discovery of providers.
- Enhanced search engines to utilize new parameters for fuzzy and exact searches.
- Added support for embedding single texts in the LiteLLM embedder.
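In practice, a caller exercises the new parameters through the search endpoint's query string. A minimal client sketch, assuming the manager API is served locally (the host, port, and /api/codexlens/search path are assumptions, not taken from this commit):

```typescript
// Minimal client sketch for the new query parameters. The host, port, and
// /api/codexlens/search path are assumptions, not taken from this commit.
interface SearchResponse {
  success: boolean;
  results: Array<{ path?: string; file?: string; score?: number; content?: string }>;
  extra_files?: string[]; // path-only tail, present only when extra hits exist
  metadata?: { total: number; limit: number; max_content_length: number; extra_files_count: number };
}

async function search(query: string): Promise<SearchResponse> {
  const params = new URLSearchParams({
    query,
    limit: '5',                // results returned with (truncated) content
    max_content_length: '200', // per-result content cap; '...' appended beyond it
    extra_files_count: '10',   // additional results returned as file paths only
    mode: 'hybrid',
  });
  const res = await fetch(`http://localhost:3000/api/codexlens/search?${params}`);
  return (await res.json()) as SearchResponse;
}
```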
@@ -466,27 +466,21 @@ RULES: $(cat ~/.claude/workflows/cli-templates/protocols/write-protocol.md) $(ca
 ```
 ---
 
-## Configuration
+## ⚙️ Execution Configuration
 
-### Timeout Allocation (Bash)
+### Dynamic Timeout Allocation
 
-controlled by external bash `timeout` command:
+**Minimum timeout: 5 minutes (300000ms)** - Never set below this threshold.
 
-**Recommended Time Allocation**:
+**Timeout Ranges**:
+- **Simple** (analysis, search): 5-10min (300000-600000ms)
+- **Medium** (refactoring, documentation): 10-20min (600000-1200000ms)
+- **Complex** (implementation, migration): 20-60min (1200000-3600000ms)
+- **Heavy** (large codebase, multi-file): 60-120min (3600000-7200000ms)
 
-- **Simple** (5-10min): Analysis, search
-- `timeout 300` ~ `timeout 600`
+**Codex Multiplier**: 3x of allocated time (minimum 15min / 900000ms)
 
-- **Medium** (10-20min): Refactoring, documentation
-- `timeout 600` ~ `timeout 1200`
+**Auto-detection**: Analyze PURPOSE and TASK fields to determine timeout
 
-- **Complex** (20-60min): Implementation, migration
-- `timeout 1200` ~ `timeout 3600`
-
-- **Heavy** (60-120min): Large codebase, multi-file
-- `timeout 3600` ~ `timeout 7200`
-
-**Codex Multiplier**: 3x allocated time (minimum 15min / 900s)
-
 ### Permission Framework
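The timeout table above translates directly into a range lookup plus the Codex multiplier rule. A sketch of how it could be applied in code, where the categories and numbers come from the template but the allocateTimeout helper itself is hypothetical:

```typescript
// A sketch (not part of the commit) applying the timeout table above.
type Complexity = 'simple' | 'medium' | 'complex' | 'heavy';

const TIMEOUT_RANGES_MS: Record<Complexity, [number, number]> = {
  simple: [300_000, 600_000],      // 5-10min: analysis, search
  medium: [600_000, 1_200_000],    // 10-20min: refactoring, documentation
  complex: [1_200_000, 3_600_000], // 20-60min: implementation, migration
  heavy: [3_600_000, 7_200_000],   // 60-120min: large codebase, multi-file
};

function allocateTimeout(category: Complexity, codex = false): number {
  const [min] = TIMEOUT_RANGES_MS[category]; // lower bound of the range
  const base = Math.max(min, 300_000);       // never below the 5min floor
  // Codex runs get 3x the allocation, with a 15min (900000ms) floor
  return codex ? Math.max(base * 3, 900_000) : base;
}

allocateTimeout('simple', true); // 900000 (3 * 300000, already at the floor)
```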
@@ -586,6 +586,8 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise<boolean>
   const query = url.searchParams.get('query') || '';
   const limit = parseInt(url.searchParams.get('limit') || '20', 10);
   const mode = url.searchParams.get('mode') || 'exact'; // exact, fuzzy, hybrid, vector
+  const maxContentLength = parseInt(url.searchParams.get('max_content_length') || '200', 10);
+  const extraFilesCount = parseInt(url.searchParams.get('extra_files_count') || '10', 10);
   const projectPath = url.searchParams.get('path') || initialPath;
 
   if (!query) {
@@ -595,15 +597,46 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise<boolean>
   }
 
   try {
-    const args = ['search', query, '--path', projectPath, '--limit', limit.toString(), '--mode', mode, '--json'];
+    // Request more results to support split (full content + extra files)
+    const totalToFetch = limit + extraFilesCount;
+    const args = ['search', query, '--path', projectPath, '--limit', totalToFetch.toString(), '--mode', mode, '--json'];
 
     const result = await executeCodexLens(args, { cwd: projectPath });
 
     if (result.success) {
       try {
         const parsed = extractJSON(result.output);
+        const allResults = parsed.result?.results || [];
+
+        // Truncate content and split results
+        const truncateContent = (content: string | null | undefined): string => {
+          if (!content) return '';
+          if (content.length <= maxContentLength) return content;
+          return content.slice(0, maxContentLength) + '...';
+        };
+
+        // Split results: first N with full content, rest as file paths only
+        const resultsWithContent = allResults.slice(0, limit).map((r: any) => ({
+          ...r,
+          content: truncateContent(r.content || r.excerpt),
+          excerpt: truncateContent(r.excerpt || r.content),
+        }));
+
+        const extraResults = allResults.slice(limit, limit + extraFilesCount);
+        const extraFiles = [...new Set(extraResults.map((r: any) => r.path || r.file))];
+
         res.writeHead(200, { 'Content-Type': 'application/json' });
-        res.end(JSON.stringify({ success: true, ...parsed.result }));
+        res.end(JSON.stringify({
+          success: true,
+          results: resultsWithContent,
+          extra_files: extraFiles.length > 0 ? extraFiles : undefined,
+          metadata: {
+            total: allResults.length,
+            limit,
+            max_content_length: maxContentLength,
+            extra_files_count: extraFilesCount,
+          },
+        }));
       } catch {
         res.writeHead(200, { 'Content-Type': 'application/json' });
         res.end(JSON.stringify({ success: true, results: [], output: result.output }));
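For reference, a response from the updated handler might look roughly like this for limit=5 and extra_files_count=10 (file names, scores, and totals are invented for illustration):

```typescript
// Illustrative response shape only; values are not from a real run.
const exampleResponse = {
  success: true,
  results: [
    // up to `limit` entries; content/excerpt truncated to max_content_length
    { path: 'src/search/engine.ts', score: 0.92, content: 'export function search(', excerpt: 'export function search(' },
  ],
  extra_files: ['src/search/ranker.ts', 'src/utils/fts.ts'], // paths only, deduplicated
  metadata: { total: 15, limit: 5, max_content_length: 200, extra_files_count: 10 },
};
```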
@@ -36,10 +36,12 @@ const ParamsSchema = z.object({
   path: z.string().optional(),
   paths: z.array(z.string()).default([]),
   contextLines: z.number().default(0),
-  maxResults: z.number().default(20), // Increased default
+  maxResults: z.number().default(5), // Default 5 with full content
   includeHidden: z.boolean().default(false),
   languages: z.array(z.string()).optional(),
-  limit: z.number().default(20), // Increased default
+  limit: z.number().default(5), // Default 5 with full content
+  extraFilesCount: z.number().default(10), // Additional file-only results
+  maxContentLength: z.number().default(200), // Max content length for truncation (50-2000)
   offset: z.number().default(0), // NEW: Pagination offset (start_index)
   enrich: z.boolean().default(false),
   // Search modifiers for ripgrep mode
@@ -268,6 +270,7 @@ interface SearchMetadata {
 interface SearchResult {
   success: boolean;
   results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown;
+  extra_files?: string[]; // Additional file paths without content
   output?: string;
   metadata?: SearchMetadata;
   error?: string;
@@ -301,6 +304,42 @@ function stripAnsi(str: string): string {
   return str.replace(/\x1b\[[0-9;]*m/g, '');
 }
 
+/** Default maximum content length to return (avoid excessive output) */
+const DEFAULT_MAX_CONTENT_LENGTH = 200;
+
+/**
+ * Truncate content to specified length with ellipsis
+ * @param content - The content to truncate
+ * @param maxLength - Maximum length (default: 200)
+ */
+function truncateContent(content: string | null | undefined, maxLength: number = DEFAULT_MAX_CONTENT_LENGTH): string {
+  if (!content) return '';
+  if (content.length <= maxLength) return content;
+  return content.slice(0, maxLength) + '...';
+}
+
+/**
+ * Split results into full content results and extra file-only results
+ * Generic function supporting both SemanticMatch and ExactMatch types
+ * @param allResults - All search results (must have 'file' property)
+ * @param fullContentLimit - Number of results with full content (default: 5)
+ * @param extraFilesCount - Number of additional file-only results (default: 10)
+ */
+function splitResultsWithExtraFiles<T extends { file: string }>(
+  allResults: T[],
+  fullContentLimit: number = 5,
+  extraFilesCount: number = 10
+): { results: T[]; extra_files: string[] } {
+  // First N results with full content
+  const results = allResults.slice(0, fullContentLimit);
+
+  // Next M results as file paths only (deduplicated)
+  const extraResults = allResults.slice(fullContentLimit, fullContentLimit + extraFilesCount);
+  const extra_files = [...new Set(extraResults.map(r => r.file))];
+
+  return { results, extra_files };
+}
+
 /**
  * Check if CodexLens index exists for current directory
  * @param path - Directory path to check
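A quick demo of the split helper introduced above (a self-contained copy of the diff's logic); note that deduplication applies only within the extra tail, so a file already present in results can reappear in extra_files:

```typescript
// Self-contained copy of the diff's split helper, plus a tiny demo.
function splitResultsWithExtraFiles<T extends { file: string }>(
  allResults: T[],
  fullContentLimit = 5,
  extraFilesCount = 10
): { results: T[]; extra_files: string[] } {
  const results = allResults.slice(0, fullContentLimit);
  const tail = allResults.slice(fullContentLimit, fullContentLimit + extraFilesCount);
  return { results, extra_files: [...new Set(tail.map(r => r.file))] };
}

const hits = [{ file: 'a.ts' }, { file: 'b.ts' }, { file: 'b.ts' }, { file: 'c.ts' }];
const { results, extra_files } = splitResultsWithExtraFiles(hits, 2);
// results     -> [{ file: 'a.ts' }, { file: 'b.ts' }]
// extra_files -> ['b.ts', 'c.ts']  (dedup via Set applies only to the tail)
```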
@@ -714,7 +753,7 @@ async function executeAutoMode(params: Params): Promise<SearchResult> {
  * Supports tokenized multi-word queries with OR matching and result ranking
  */
 async function executeRipgrepMode(params: Params): Promise<SearchResult> {
-  const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params;
+  const { query, paths = [], contextLines = 0, maxResults = 5, extraFilesCount = 10, maxContentLength = 200, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params;
 
   if (!query) {
     return {
@@ -726,6 +765,9 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
   // Check if ripgrep is available
   const hasRipgrep = checkToolAvailability('rg');
 
+  // Calculate total to fetch for split (full content + extra files)
+  const totalToFetch = maxResults + extraFilesCount;
+
   // If ripgrep not available, fall back to CodexLens exact mode
   if (!hasRipgrep) {
     const readyStatus = await ensureCodexLensReady();
@@ -737,7 +779,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
   }
 
   // Use CodexLens exact mode as fallback
-  const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json'];
+  const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'exact', '--json'];
   const result = await executeCodexLens(args, { cwd: path });
 
   if (!result.success) {
@@ -754,23 +796,27 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
   }
 
   // Parse results
-  let results: SemanticMatch[] = [];
+  let allResults: SemanticMatch[] = [];
   try {
     const parsed = JSON.parse(stripAnsi(result.output || '{}'));
     const data = parsed.result?.results || parsed.results || parsed;
-    results = (Array.isArray(data) ? data : []).map((item: any) => ({
+    allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
       file: item.path || item.file,
       score: item.score || 0,
-      content: item.excerpt || item.content || '',
+      content: truncateContent(item.content || item.excerpt, maxContentLength),
       symbol: item.symbol || null,
     }));
   } catch {
     // Keep empty results
   }
 
+  // Split results: first N with full content, rest as file paths only
+  const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
+
   return {
     success: true,
     results,
+    extra_files: extra_files.length > 0 ? extra_files : undefined,
     metadata: {
       mode: 'ripgrep',
       backend: 'codexlens-fallback',
@@ -781,12 +827,12 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
    };
  }
 
-  // Use ripgrep
+  // Use ripgrep - request more results to support split
   const { command, args, tokens } = buildRipgrepCommand({
     query,
     paths: paths.length > 0 ? paths : [path],
     contextLines,
-    maxResults,
+    maxResults: totalToFetch, // Fetch more to support split
     includeHidden,
     regex,
     caseSensitive,
@@ -812,14 +858,14 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
     });
 
     child.on('close', (code) => {
-      const results: ExactMatch[] = [];
+      const allResults: ExactMatch[] = [];
       const lines = stdout.split('\n').filter((line) => line.trim());
       // Limit total results to prevent memory overflow (--max-count only limits per-file)
-      const effectiveLimit = maxResults > 0 ? maxResults : 500;
+      const effectiveLimit = totalToFetch > 0 ? totalToFetch : 500;
 
       for (const line of lines) {
         // Stop collecting if we've reached the limit
-        if (results.length >= effectiveLimit) {
+        if (allResults.length >= effectiveLimit) {
           resultLimitReached = true;
           break;
         }
@@ -837,7 +883,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
             : 1,
           content: item.data.lines.text.trim(),
         };
-        results.push(match);
+        allResults.push(match);
       }
     } catch {
       continue;
@@ -850,9 +896,12 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
 
       // Apply token-based scoring and sorting for multi-word queries
       // Results matching more tokens are ranked higher (exact matches first)
-      const scoredResults = tokens.length > 1 ? scoreByTokenMatch(results, tokens) : results;
+      const scoredResults = tokens.length > 1 ? scoreByTokenMatch(allResults, tokens) : allResults;
 
       if (code === 0 || code === 1 || (isWindowsDeviceError && scoredResults.length > 0)) {
+        // Split results: first N with full content, rest as file paths only
+        const { results, extra_files } = splitResultsWithExtraFiles(scoredResults, maxResults, extraFilesCount);
+
         // Build warning message for various conditions
         const warnings: string[] = [];
         if (resultLimitReached) {
@@ -864,18 +913,19 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
 
         resolve({
           success: true,
-          results: scoredResults,
+          results,
+          extra_files: extra_files.length > 0 ? extra_files : undefined,
           metadata: {
             mode: 'ripgrep',
             backend: 'ripgrep',
-            count: scoredResults.length,
+            count: results.length,
             query,
             tokens: tokens.length > 1 ? tokens : undefined, // Include tokens in metadata for debugging
             tokenized: tokens.length > 1,
             ...(warnings.length > 0 && { warning: warnings.join('; ') }),
           },
         });
-      } else if (isWindowsDeviceError && results.length === 0) {
+      } else if (isWindowsDeviceError && allResults.length === 0) {
         // Windows device error but no results - might be the only issue
         resolve({
           success: true,
@@ -912,7 +962,7 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
  * Requires index
  */
 async function executeCodexLensExactMode(params: Params): Promise<SearchResult> {
-  const { query, path = '.', maxResults = 10, enrich = false } = params;
+  const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
 
   if (!query) {
     return {
@@ -933,7 +983,9 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
   // Check index status
   const indexStatus = await checkIndexStatus(path);
 
-  const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json'];
+  // Request more results to support split (full content + extra files)
+  const totalToFetch = maxResults + extraFilesCount;
+  const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'exact', '--json'];
   if (enrich) {
     args.push('--enrich');
   }
@@ -954,14 +1006,14 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
   }
 
   // Parse results
-  let results: SemanticMatch[] = [];
+  let allResults: SemanticMatch[] = [];
   try {
     const parsed = JSON.parse(stripAnsi(result.output || '{}'));
     const data = parsed.result?.results || parsed.results || parsed;
-    results = (Array.isArray(data) ? data : []).map((item: any) => ({
+    allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
       file: item.path || item.file,
       score: item.score || 0,
-      content: item.excerpt || item.content || '',
+      content: truncateContent(item.content || item.excerpt, maxContentLength),
       symbol: item.symbol || null,
     }));
   } catch {
@@ -969,8 +1021,8 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
   }
 
   // Fallback to fuzzy mode if exact returns no results
-  if (results.length === 0) {
-    const fuzzyArgs = ['search', query, '--limit', maxResults.toString(), '--mode', 'fuzzy', '--json'];
+  if (allResults.length === 0) {
+    const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'fuzzy', '--json'];
     if (enrich) {
       fuzzyArgs.push('--enrich');
     }
@@ -980,20 +1032,23 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
     try {
       const parsed = JSON.parse(stripAnsi(fuzzyResult.output || '{}'));
       const data = parsed.result?.results || parsed.results || parsed;
-      results = (Array.isArray(data) ? data : []).map((item: any) => ({
+      allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
        file: item.path || item.file,
        score: item.score || 0,
-       content: item.excerpt || item.content || '',
+       content: truncateContent(item.content || item.excerpt, maxContentLength),
        symbol: item.symbol || null,
      }));
     } catch {
       // Keep empty results
     }
 
-    if (results.length > 0) {
+    if (allResults.length > 0) {
+      // Split results: first N with full content, rest as file paths only
+      const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
       return {
         success: true,
         results,
+        extra_files: extra_files.length > 0 ? extra_files : undefined,
         metadata: {
           mode: 'exact',
           backend: 'codexlens',
@@ -1008,9 +1063,13 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
     }
   }
 
+  // Split results: first N with full content, rest as file paths only
+  const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
+
   return {
     success: true,
     results,
+    extra_files: extra_files.length > 0 ? extra_files : undefined,
     metadata: {
       mode: 'exact',
       backend: 'codexlens',
@@ -1027,7 +1086,7 @@ async function executeCodexLensExactMode(params: Params): Promise<SearchResult>
  * Requires index with embeddings
  */
 async function executeHybridMode(params: Params): Promise<SearchResult> {
-  const { query, path = '.', maxResults = 10, enrich = false } = params;
+  const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false } = params;
 
   if (!query) {
     return {
@@ -1048,7 +1107,9 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
   // Check index status
   const indexStatus = await checkIndexStatus(path);
 
-  const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'hybrid', '--json'];
+  // Request more results to support split (full content + extra files)
+  const totalToFetch = maxResults + extraFilesCount;
+  const args = ['search', query, '--limit', totalToFetch.toString(), '--mode', 'hybrid', '--json'];
   if (enrich) {
     args.push('--enrich');
   }
@@ -1069,14 +1130,14 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
   }
 
   // Parse results
-  let results: SemanticMatch[] = [];
+  let allResults: SemanticMatch[] = [];
   let baselineInfo: { score: number; count: number } | null = null;
   let initialCount = 0;
 
   try {
     const parsed = JSON.parse(stripAnsi(result.output || '{}'));
     const data = parsed.result?.results || parsed.results || parsed;
-    results = (Array.isArray(data) ? data : []).map((item: any) => {
+    allResults = (Array.isArray(data) ? data : []).map((item: any) => {
       const rawScore = item.score || 0;
       // Hybrid mode returns distance scores (lower is better).
       // Convert to similarity scores (higher is better) for consistency.
@@ -1085,27 +1146,27 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
       return {
         file: item.path || item.file,
         score: similarityScore,
-        content: item.excerpt || item.content || '',
+        content: truncateContent(item.content || item.excerpt, maxContentLength),
         symbol: item.symbol || null,
       };
     });
 
-    initialCount = results.length;
+    initialCount = allResults.length;
 
     // Post-processing pipeline to improve semantic search quality
     // 0. Filter dominant baseline scores (hot spot detection)
-    const baselineResult = filterDominantBaselineScores(results);
-    results = baselineResult.filteredResults;
+    const baselineResult = filterDominantBaselineScores(allResults);
+    allResults = baselineResult.filteredResults;
     baselineInfo = baselineResult.baselineInfo;
 
     // 1. Filter noisy files (coverage, node_modules, etc.)
-    results = filterNoisyFiles(results);
+    allResults = filterNoisyFiles(allResults);
     // 2. Boost results containing query keywords
-    results = applyKeywordBoosting(results, query);
+    allResults = applyKeywordBoosting(allResults, query);
     // 3. Enforce score diversity (penalize identical scores)
-    results = enforceScoreDiversity(results);
+    allResults = enforceScoreDiversity(allResults);
     // 4. Re-sort by adjusted scores
-    results.sort((a, b) => b.score - a.score);
+    allResults.sort((a, b) => b.score - a.score);
   } catch {
     return {
       success: true,
@@ -1121,15 +1182,19 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
    };
  }
 
+  // Split results: first N with full content, rest as file paths only
+  const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
+
   // Build metadata with baseline info if detected
   let note = 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results';
   if (baselineInfo) {
-    note += ` | Filtered ${initialCount - results.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
+    note += ` | Filtered ${initialCount - allResults.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
   }
 
   return {
     success: true,
     results,
+    extra_files: extra_files.length > 0 ? extra_files : undefined,
     metadata: {
       mode: 'hybrid',
       backend: 'codexlens',
@@ -1540,7 +1605,7 @@ export const schema: ToolSchema = {
     mode: {
       type: 'string',
       enum: SEARCH_MODES,
-      description: 'Search mode: auto (default), hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback: hybrid->exact->ripgrep)',
+      description: 'Search mode: auto, hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback chain)',
       default: 'auto',
     },
     output_mode: {
@@ -1576,6 +1641,16 @@ export const schema: ToolSchema = {
       description: 'Alias for maxResults (default: 20)',
       default: 20,
     },
+    extraFilesCount: {
+      type: 'number',
+      description: 'Number of additional file-only results (paths without content)',
+      default: 10,
+    },
+    maxContentLength: {
+      type: 'number',
+      description: 'Maximum content length for truncation (50-2000)',
+      default: 200,
+    },
     offset: {
       type: 'number',
       description: 'Pagination offset - skip first N results (default: 0)',
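An example parameter object against the updated schema (the query value is invented and the invocation mechanism is omitted):

```typescript
// Sketch of a smart-search call using the new schema fields.
const searchParams = {
  query: 'EmbeddingPoolConfig',
  mode: 'exact',
  limit: 5,              // full-content results (Zod default lowered from 20 to 5)
  extraFilesCount: 10,   // extra path-only hits surfaced as extra_files
  maxContentLength: 200, // truncation cap; intended range 50-2000
};
```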
@@ -494,9 +494,13 @@ class ChainSearchEngine:
         else:
             # Use fuzzy FTS if enable_fuzzy=True (mode="fuzzy"), otherwise exact FTS
             if enable_fuzzy:
-                fts_results = store.search_fts_fuzzy(query, limit=limit)
+                fts_results = store.search_fts_fuzzy(
+                    query, limit=limit, return_full_content=True
+                )
             else:
-                fts_results = store.search_fts(query, limit=limit)
+                fts_results = store.search_fts_exact(
+                    query, limit=limit, return_full_content=True
+                )
 
         # Optionally add semantic keyword results
         if include_semantic:
@@ -200,7 +200,9 @@ class HybridSearchEngine:
         """
         try:
             with DirIndexStore(index_path) as store:
-                return store.search_fts_exact(query, limit=limit)
+                return store.search_fts_exact(
+                    query, limit=limit, return_full_content=True
+                )
         except Exception as exc:
             self.logger.debug("Exact search error: %s", exc)
             return []
@@ -220,7 +222,9 @@ class HybridSearchEngine:
         """
         try:
             with DirIndexStore(index_path) as store:
-                return store.search_fts_fuzzy(query, limit=limit)
+                return store.search_fts_fuzzy(
+                    query, limit=limit, return_full_content=True
+                )
         except Exception as exc:
             self.logger.debug("Fuzzy search error: %s", exc)
             return []
@@ -127,3 +127,18 @@ class LiteLLMEmbedderWrapper(BaseEmbedder):
 
         # LiteLLM handles batching internally, ignore batch_size parameter
         return self._embedder.embed(texts)
+
+    def embed_single(self, text: str) -> list[float]:
+        """Generate embedding for a single text.
+
+        Args:
+            text: Text to embed.
+
+        Returns:
+            list[float]: Embedding vector as a list of floats.
+        """
+        # Sanitize text before embedding
+        sanitized = self._sanitize_text(text)
+        embedding = self._embedder.embed([sanitized])
+        return embedding[0].tolist()