/** * Smart Search Tool - Unified intelligent search with CodexLens integration * * Features: * - Fuzzy mode: FTS + ripgrep fusion with RRF ranking (default) * - Semantic mode: Dense coarse retrieval + cross-encoder reranking * - CodexLens integration (init, dense_rerank, fts) * - Ripgrep fallback for exact mode * - Index status checking and warnings * - Multi-backend search routing with RRF ranking * * Actions: * - init: Initialize CodexLens index * - search: Intelligent search with fuzzy (default) or semantic mode * - status: Check index status * - update: Incremental index update for changed files * - watch: Start file watcher for automatic updates */ import { z } from 'zod'; import type { ToolSchema, ToolResult } from '../types/tool.js'; import { spawn, execSync } from 'child_process'; import { ensureReady as ensureCodexLensReady, executeCodexLens, } from './codex-lens.js'; import type { ProgressInfo } from './codex-lens.js'; import { getProjectRoot } from '../utils/path-validator.js'; // Timing utilities for performance analysis const TIMING_ENABLED = process.env.SMART_SEARCH_TIMING === '1' || process.env.DEBUG?.includes('timing'); interface TimingData { [key: string]: number; } function createTimer(): { mark: (name: string) => void; getTimings: () => TimingData; log: () => void } { const startTime = performance.now(); const marks: { name: string; time: number }[] = []; let lastMark = startTime; return { mark(name: string) { const now = performance.now(); marks.push({ name, time: now - lastMark }); lastMark = now; }, getTimings(): TimingData { const timings: TimingData = {}; marks.forEach(m => { timings[m.name] = Math.round(m.time * 100) / 100; }); timings['_total'] = Math.round((performance.now() - startTime) * 100) / 100; return timings; }, log() { if (TIMING_ENABLED) { const timings = this.getTimings(); console.error(`[TIMING] smart-search: ${JSON.stringify(timings)}`); } } }; } // Define Zod schema for validation const ParamsSchema = z.object({ // Action: search (content), find_files (path/name pattern), init, status, update (incremental), watch // Note: search_files is deprecated, use search with output_mode='files_only' action: z.enum(['init', 'search', 'search_files', 'find_files', 'status', 'update', 'watch']).default('search'), query: z.string().optional().describe('Content search query (for action="search")'), pattern: z.string().optional().describe('Glob pattern for path matching (for action="find_files")'), mode: z.enum(['fuzzy', 'semantic']).default('fuzzy'), output_mode: z.enum(['full', 'files_only', 'count']).default('full'), path: z.string().optional(), paths: z.array(z.string()).default([]), contextLines: z.number().default(0), maxResults: z.number().default(5), // Default 5 with full content includeHidden: z.boolean().default(false), languages: z.array(z.string()).optional(), limit: z.number().default(5), // Default 5 with full content extraFilesCount: z.number().default(10), // Additional file-only results maxContentLength: z.number().default(200), // Max content length for truncation (50-2000) offset: z.number().default(0), // NEW: Pagination offset (start_index) enrich: z.boolean().default(false), // Search modifiers for ripgrep mode regex: z.boolean().default(true), // Use regex pattern matching (default: enabled) caseSensitive: z.boolean().default(true), // Case sensitivity (default: case-sensitive) tokenize: z.boolean().default(true), // Tokenize multi-word queries for OR matching (default: enabled) // File type filtering excludeExtensions: z.array(z.string()).optional().describe('File extensions to exclude from results (e.g., ["md", "txt"])'), codeOnly: z.boolean().default(false).describe('Only return code files (excludes md, txt, json, yaml, xml, etc.)'), // Watcher options debounce: z.number().default(1000).describe('Debounce interval in ms for watch action'), // Fuzzy matching is implicit in hybrid mode (RRF fusion) }); type Params = z.infer; // Search mode constants const SEARCH_MODES = ['fuzzy', 'semantic'] as const; // Classification confidence threshold const CONFIDENCE_THRESHOLD = 0.7; // File filtering configuration (ported from code-index) const FILTER_CONFIG = { exclude_directories: new Set([ '.git', '.svn', '.hg', '.bzr', 'node_modules', '__pycache__', '.venv', 'venv', 'vendor', 'bower_components', 'dist', 'build', 'target', 'out', 'bin', 'obj', '.idea', '.vscode', '.vs', '.sublime-workspace', '.pytest_cache', '.coverage', '.tox', '.nyc_output', 'coverage', 'htmlcov', '.next', '.nuxt', '.cache', '.parcel-cache', '.DS_Store', 'Thumbs.db', ]), exclude_files: new Set([ '*.tmp', '*.temp', '*.swp', '*.swo', '*.bak', '*~', '*.orig', '*.log', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'Pipfile.lock', ]), // Windows device files - must use **/ pattern to match in any directory // These cause "os error 1" on Windows when accessed windows_device_files: new Set([ 'nul', 'con', 'aux', 'prn', 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9', ]), }; function buildExcludeArgs(): string[] { const args: string[] = []; for (const dir of FILTER_CONFIG.exclude_directories) { args.push('--glob', `!**/${dir}/**`); } for (const pattern of FILTER_CONFIG.exclude_files) { args.push('--glob', `!${pattern}`); } // Windows device files need case-insensitive matching in any directory for (const device of FILTER_CONFIG.windows_device_files) { args.push('--glob', `!**/${device}`); args.push('--glob', `!**/${device.toUpperCase()}`); } return args; } /** * Tokenize query for multi-word OR matching * Splits on whitespace and common delimiters, filters stop words and short tokens * @param query - The search query * @returns Array of tokens */ function tokenizeQuery(query: string): string[] { // Stop words for filtering (common English + programming keywords) const stopWords = new Set([ 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'and', 'but', 'if', 'or', 'not', 'this', 'that', 'these', 'those', 'it', 'its', 'how', 'what', 'where', 'when', 'why', 'which', 'who', 'whom', ]); // Split on whitespace and common delimiters, keep meaningful tokens const tokens = query .split(/[\s,;:]+/) .map(token => token.trim()) .filter(token => { // Keep tokens that are: // - At least 2 characters long // - Not a stop word (case-insensitive) // - Or look like identifiers (contain underscore/camelCase) if (token.length < 2) return false; if (stopWords.has(token.toLowerCase()) && !token.includes('_') && !/[A-Z]/.test(token)) { return false; } return true; }); return tokens; } /** * Score results based on token match count for ranking * @param results - Search results * @param tokens - Query tokens * @returns Results with match scores */ function scoreByTokenMatch(results: ExactMatch[], tokens: string[]): ExactMatch[] { if (tokens.length <= 1) return results; // Create case-insensitive patterns for each token const tokenPatterns = tokens.map(t => { const escaped = t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); return new RegExp(escaped, 'i'); }); return results.map(r => { const content = r.content || ''; const file = r.file || ''; const searchText = `${file} ${content}`; // Count how many tokens match let matchCount = 0; for (const pattern of tokenPatterns) { if (pattern.test(searchText)) { matchCount++; } } // Calculate match ratio (0 to 1) const matchRatio = matchCount / tokens.length; return { ...r, matchScore: matchRatio, matchCount, }; }).sort((a, b) => { // Sort by match ratio (descending), then by line number if (b.matchScore !== a.matchScore) { return b.matchScore - a.matchScore; } return (a.line || 0) - (b.line || 0); }); } interface Classification { mode: string; confidence: number; reasoning: string; } interface ExactMatch { file: string; line: number; column: number; content: string; matchScore?: number; // Token match ratio (0-1) for multi-word queries matchCount?: number; // Number of tokens matched } interface RelationshipInfo { type: string; // 'calls', 'imports', 'called_by', 'imported_by' direction: 'outgoing' | 'incoming'; target?: string; // Target symbol name (for outgoing) source?: string; // Source symbol name (for incoming) file: string; // File path line?: number; // Line number } interface SemanticMatch { file: string; score: number; content: string; symbol: string | null; relationships?: RelationshipInfo[]; } interface GraphMatch { file: string; symbols: unknown; relationships: unknown[]; } // File match for find_files action (path-based search) interface FileMatch { path: string; type: 'file' | 'directory'; name: string; // Filename only extension?: string; // File extension (without dot) } interface PaginationInfo { offset: number; // Starting index of returned results limit: number; // Number of results requested total: number; // Total number of results found has_more: boolean; // True if more results are available } interface SearchMetadata { mode?: string; backend?: string; count?: number; query?: string; pattern?: string; // For find_files action classified_as?: string; confidence?: number; reasoning?: string; embeddings_coverage_percent?: number; warning?: string; note?: string; index_status?: 'indexed' | 'not_indexed' | 'partial'; fallback?: string; // Fallback mode used (e.g., 'fuzzy') fallback_history?: string[]; suggested_weights?: Record; // Tokenization metadata (ripgrep mode) tokens?: string[]; // Query tokens used for multi-word search tokenized?: boolean; // Whether tokenization was applied // Pagination metadata pagination?: PaginationInfo; // Performance timing data (when SMART_SEARCH_TIMING=1 or DEBUG includes 'timing') timing?: TimingData; // Init action specific action?: string; path?: string; progress?: { stage: string; message: string; percent: number; filesProcessed?: number; totalFiles?: number; }; progressHistory?: ProgressInfo[]; } interface SearchResult { success: boolean; results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown; extra_files?: string[]; // Additional file paths without content output?: string; metadata?: SearchMetadata; error?: string; status?: unknown; message?: string; } interface ModelInfo { model_profile?: string; model_name?: string; embedding_dim?: number; backend?: string; created_at?: string; updated_at?: string; } interface IndexStatus { indexed: boolean; has_embeddings: boolean; file_count?: number; embeddings_coverage_percent?: number; total_chunks?: number; model_info?: ModelInfo | null; warning?: string; } /** * Strip ANSI color codes from string (for JSON parsing) */ function stripAnsi(str: string): string { return str.replace(/\x1b\[[0-9;]*m/g, ''); } /** Default maximum content length to return (avoid excessive output) */ const DEFAULT_MAX_CONTENT_LENGTH = 200; /** * Truncate content to specified length with ellipsis * @param content - The content to truncate * @param maxLength - Maximum length (default: 200) */ function truncateContent(content: string | null | undefined, maxLength: number = DEFAULT_MAX_CONTENT_LENGTH): string { if (!content) return ''; if (content.length <= maxLength) return content; return content.slice(0, maxLength) + '...'; } /** * Split results into full content results and extra file-only results * Generic function supporting both SemanticMatch and ExactMatch types * @param allResults - All search results (must have 'file' property) * @param fullContentLimit - Number of results with full content (default: 5) * @param extraFilesCount - Number of additional file-only results (default: 10) */ function splitResultsWithExtraFiles( allResults: T[], fullContentLimit: number = 5, extraFilesCount: number = 10 ): { results: T[]; extra_files: string[] } { // First N results with full content const results = allResults.slice(0, fullContentLimit); // Next M results as file paths only (deduplicated) const extraResults = allResults.slice(fullContentLimit, fullContentLimit + extraFilesCount); const extra_files = [...new Set(extraResults.map(r => r.file))]; return { results, extra_files }; } /** * Check if CodexLens index exists for current directory * @param path - Directory path to check * @returns Index status */ async function checkIndexStatus(path: string = '.'): Promise { try { const result = await executeCodexLens(['status', '--json'], { cwd: path }); if (!result.success) { return { indexed: false, has_embeddings: false, warning: 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.', }; } // Parse status output try { // Strip ANSI color codes from JSON output const cleanOutput = stripAnsi(result.output || '{}'); const parsed = JSON.parse(cleanOutput); // Handle both direct and nested response formats (status returns {success, result: {...}}) const status = parsed.result || parsed; const indexed = status.projects_count > 0 || status.total_files > 0; // Get embeddings coverage from comprehensive status const embeddingsData = status.embeddings || {}; const embeddingsCoverage = embeddingsData.coverage_percent || 0; const has_embeddings = embeddingsCoverage >= 50; // Threshold: 50% const totalChunks = embeddingsData.total_chunks || 0; // Extract model info if available const modelInfoData = embeddingsData.model_info; const modelInfo: ModelInfo | undefined = modelInfoData ? { model_profile: modelInfoData.model_profile, model_name: modelInfoData.model_name, embedding_dim: modelInfoData.embedding_dim, backend: modelInfoData.backend, created_at: modelInfoData.created_at, updated_at: modelInfoData.updated_at, } : undefined; let warning: string | undefined; if (!indexed) { warning = 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.'; } else if (embeddingsCoverage === 0) { warning = 'Index exists but no embeddings generated. Run: codexlens embeddings-generate --recursive'; } else if (embeddingsCoverage < 50) { warning = `Embeddings coverage is ${embeddingsCoverage.toFixed(1)}% (below 50%). Hybrid search will use exact mode. Run: codexlens embeddings-generate --recursive`; } return { indexed, has_embeddings, file_count: status.total_files, embeddings_coverage_percent: embeddingsCoverage, total_chunks: totalChunks, // Ensure model_info is null instead of undefined so it's included in JSON model_info: modelInfo ?? null, warning, }; } catch { return { indexed: false, has_embeddings: false, warning: 'Failed to parse index status', }; } } catch { return { indexed: false, has_embeddings: false, warning: 'CodexLens not available', }; } } /** * Detection heuristics for intent classification */ /** * Detect literal string query (simple alphanumeric or quoted strings) */ function detectLiteral(query: string): boolean { return /^[a-zA-Z0-9_-]+$/.test(query) || /^["'].*["']$/.test(query); } /** * Detect regex pattern (contains regex metacharacters) */ function detectRegex(query: string): boolean { return /[.*+?^${}()|[\]\\]/.test(query); } /** * Detect natural language query (sentence structure, questions, multi-word phrases) */ function detectNaturalLanguage(query: string): boolean { return query.split(/\s+/).length >= 3 || /\?$/.test(query); } /** * Detect file path query (path separators, file extensions) */ function detectFilePath(query: string): boolean { return /[/\\]/.test(query) || /\.[a-z]{2,4}$/i.test(query); } /** * Detect relationship query (import, export, dependency keywords) */ function detectRelationship(query: string): boolean { return /(import|export|uses?|depends?|calls?|extends?)\s/i.test(query); } function looksLikeCodeQuery(query: string): boolean { if (/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true; if (/[:.<>\-=(){}[\]]/.test(query) && query.split(/\s+/).length <= 2) return true; if (/\.\*|\\\(|\\\[|\\s/.test(query)) return true; if (/^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true; return false; } /** * Classify query intent and recommend search mode * Simple mapping: hybrid (NL + index + embeddings) | exact (index or insufficient embeddings) | ripgrep (no index) * @param query - Search query string * @param hasIndex - Whether CodexLens index exists * @param hasSufficientEmbeddings - Whether embeddings coverage >= 50% * @returns Classification result */ function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification { const isNaturalLanguage = detectNaturalLanguage(query); const isCodeQuery = looksLikeCodeQuery(query); const isRegexPattern = detectRegex(query); let mode: string; let confidence: number; if (!hasIndex) { mode = 'ripgrep'; confidence = 1.0; } else if (isCodeQuery || isRegexPattern) { mode = 'exact'; confidence = 0.95; } else if (isNaturalLanguage && hasSufficientEmbeddings) { mode = 'hybrid'; confidence = 0.9; } else { mode = 'exact'; confidence = 0.8; } const detectedPatterns: string[] = []; if (detectLiteral(query)) detectedPatterns.push('literal'); if (detectRegex(query)) detectedPatterns.push('regex'); if (detectNaturalLanguage(query)) detectedPatterns.push('natural language'); if (detectFilePath(query)) detectedPatterns.push('file path'); if (detectRelationship(query)) detectedPatterns.push('relationship'); if (isCodeQuery) detectedPatterns.push('code identifier'); const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 'sufficient' : 'insufficient'})`; return { mode, confidence, reasoning }; } /** * Check if a tool is available in PATH * @param toolName - Tool executable name * @returns True if available */ function checkToolAvailability(toolName: string): boolean { try { const isWindows = process.platform === 'win32'; const command = isWindows ? 'where' : 'which'; execSync(`${command} ${toolName}`, { stdio: 'ignore' }); return true; } catch { return false; } } /** * Build ripgrep command arguments * Supports tokenized multi-word queries with OR matching * @param params - Search parameters * @returns Command, arguments, and tokens used */ function buildRipgrepCommand(params: { query: string; paths: string[]; contextLines: number; maxResults: number; includeHidden: boolean; regex?: boolean; caseSensitive?: boolean; tokenize?: boolean; }): { command: string; args: string[]; tokens: string[] } { const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true, tokenize = true } = params; const args = [ '-n', '--color=never', '--json', ]; // Add file filtering (unless includeHidden is true) if (!includeHidden) { args.push(...buildExcludeArgs()); } // Case sensitivity if (!caseSensitive) { args.push('--ignore-case'); } if (contextLines > 0) { args.push('-C', contextLines.toString()); } if (maxResults > 0) { args.push('--max-count', maxResults.toString()); } if (includeHidden) { args.push('--hidden'); } // Tokenize query for multi-word OR matching const tokens = tokenize ? tokenizeQuery(query) : [query]; if (tokens.length > 1) { // Multi-token: use multiple -e patterns (OR matching) // Each token is escaped for regex safety unless regex mode is enabled for (const token of tokens) { if (regex) { args.push('-e', token); } else { // Escape regex special chars for literal matching const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); args.push('-e', escaped); } } } else { // Single token or no tokenization: use original behavior if (regex) { args.push('-e', query); } else { args.push('-F', query); } } args.push(...paths); return { command: 'rg', args, tokens }; } /** * Action: init - Initialize CodexLens index (FTS only, no embeddings) * For semantic/vector search, use ccw view dashboard or codexlens CLI directly */ async function executeInitAction(params: Params): Promise { const { path = '.', languages } = params; // Check CodexLens availability const readyStatus = await ensureCodexLensReady(); if (!readyStatus.ready) { return { success: false, error: `CodexLens not available: ${readyStatus.error}. CodexLens will be auto-installed on first use.`, }; } // Build args with --no-embeddings for FTS-only index (faster) // Use 'index init' subcommand (new CLI structure) const args = ['index', 'init', path, '--no-embeddings']; if (languages && languages.length > 0) { args.push('--language', languages.join(',')); } // Track progress updates const progressUpdates: ProgressInfo[] = []; let lastProgress: ProgressInfo | null = null; const result = await executeCodexLens(args, { cwd: path, timeout: 1800000, // 30 minutes for large codebases onProgress: (progress: ProgressInfo) => { progressUpdates.push(progress); lastProgress = progress; }, }); // Build metadata with progress info const metadata: SearchMetadata = { action: 'init', path, }; if (lastProgress !== null) { const p = lastProgress as ProgressInfo; metadata.progress = { stage: p.stage, message: p.message, percent: p.percent, filesProcessed: p.filesProcessed, totalFiles: p.totalFiles, }; } if (progressUpdates.length > 0) { metadata.progressHistory = progressUpdates.slice(-5); // Keep last 5 progress updates } const successMessage = result.success ? `FTS index created for ${path}. Note: For semantic/vector search, create vector index via "ccw view" dashboard or run "codexlens init ${path}" (without --no-embeddings).` : undefined; return { success: result.success, error: result.error, message: successMessage, metadata, }; } /** * Action: status - Check CodexLens index status */ async function executeStatusAction(params: Params): Promise { const { path = '.' } = params; const indexStatus = await checkIndexStatus(path); return { success: true, status: indexStatus, message: indexStatus.warning || `Index status: ${indexStatus.indexed ? 'indexed' : 'not indexed'}, embeddings: ${indexStatus.has_embeddings ? 'available' : 'not available'}`, }; } /** * Action: update - Incremental index update * Updates index for changed files without full rebuild */ async function executeUpdateAction(params: Params): Promise { const { path = '.', languages } = params; // Check CodexLens availability const readyStatus = await ensureCodexLensReady(); if (!readyStatus.ready) { return { success: false, error: `CodexLens not available: ${readyStatus.error}`, }; } // Check if index exists first const indexStatus = await checkIndexStatus(path); if (!indexStatus.indexed) { return { success: false, error: `Directory not indexed. Run smart_search(action="init") first.`, }; } // Build args for incremental init (without --force) // Use 'index init' subcommand (new CLI structure) const args = ['index', 'init', path]; if (languages && languages.length > 0) { args.push('--language', languages.join(',')); } // Track progress updates const progressUpdates: ProgressInfo[] = []; let lastProgress: ProgressInfo | null = null; const result = await executeCodexLens(args, { cwd: path, timeout: 600000, // 10 minutes for incremental updates onProgress: (progress: ProgressInfo) => { progressUpdates.push(progress); lastProgress = progress; }, }); // Build metadata with progress info const metadata: SearchMetadata = { action: 'update', path, }; if (lastProgress !== null) { const p = lastProgress as ProgressInfo; metadata.progress = { stage: p.stage, message: p.message, percent: p.percent, filesProcessed: p.filesProcessed, totalFiles: p.totalFiles, }; } if (progressUpdates.length > 0) { metadata.progressHistory = progressUpdates.slice(-5); } return { success: result.success, error: result.error, message: result.success ? `Incremental update completed for ${path}` : undefined, metadata, }; } /** * Action: watch - Start file watcher for automatic incremental updates * Note: This starts a background process, returns immediately with status */ async function executeWatchAction(params: Params): Promise { const { path = '.', languages, debounce = 1000 } = params; // Check CodexLens availability const readyStatus = await ensureCodexLensReady(); if (!readyStatus.ready) { return { success: false, error: `CodexLens not available: ${readyStatus.error}`, }; } // Check if index exists first const indexStatus = await checkIndexStatus(path); if (!indexStatus.indexed) { return { success: false, error: `Directory not indexed. Run smart_search(action="init") first.`, }; } // Build args for watch command const args = ['watch', path, '--debounce', debounce.toString()]; if (languages && languages.length > 0) { args.push('--language', languages.join(',')); } // Start watcher in background (non-blocking) // Note: The watcher runs until manually stopped const result = await executeCodexLens(args, { cwd: path, timeout: 5000, // Short timeout for initial startup check }); return { success: true, message: `File watcher started for ${path}. Use Ctrl+C or kill the process to stop.`, metadata: { action: 'watch', path, note: 'Watcher runs in background. Changes are indexed automatically with debounce.', }, }; } /** * Mode: fuzzy - FTS + ripgrep fusion with RRF ranking * Runs both exact (FTS) and ripgrep searches in parallel, merges and ranks results */ async function executeFuzzyMode(params: Params): Promise { const { query, path = '.', maxResults = 5, extraFilesCount = 10 } = params; if (!query) { return { success: false, error: 'Query is required for search', }; } const timer = createTimer(); // Run both searches in parallel const [ftsResult, ripgrepResult] = await Promise.allSettled([ executeCodexLensExactMode(params), executeRipgrepMode(params), ]); timer.mark('parallel_search'); // Collect results from both sources const resultsMap = new Map(); // Add FTS results if successful if (ftsResult.status === 'fulfilled' && ftsResult.value.success && ftsResult.value.results) { resultsMap.set('exact', ftsResult.value.results as any[]); } // Add ripgrep results if successful if (ripgrepResult.status === 'fulfilled' && ripgrepResult.value.success && ripgrepResult.value.results) { resultsMap.set('ripgrep', ripgrepResult.value.results as any[]); } // If both failed, return error if (resultsMap.size === 0) { const errors: string[] = []; if (ftsResult.status === 'rejected') errors.push(`FTS: ${ftsResult.reason}`); if (ripgrepResult.status === 'rejected') errors.push(`Ripgrep: ${ripgrepResult.reason}`); return { success: false, error: `Both search backends failed: ${errors.join('; ')}`, }; } // Apply RRF fusion with fuzzy-optimized weights // Fuzzy mode: balanced between exact and ripgrep const fusionWeights = { exact: 0.5, ripgrep: 0.5 }; const totalToFetch = maxResults + extraFilesCount; const fusedResults = applyRRFFusion(resultsMap, fusionWeights, totalToFetch); timer.mark('rrf_fusion'); // Normalize results format const normalizedResults = fusedResults.map((item: any) => ({ file: item.file || item.path, line: item.line || 0, column: item.column || 0, content: item.content || '', score: item.fusion_score || 0, matchCount: item.matchCount, matchScore: item.matchScore, })); // Split results: first N with full content, rest as file paths only const { results, extra_files } = splitResultsWithExtraFiles(normalizedResults, maxResults, extraFilesCount); // Log timing timer.log(); const timings = timer.getTimings(); return { success: true, results, extra_files: extra_files.length > 0 ? extra_files : undefined, metadata: { mode: 'fuzzy', backend: 'fts+ripgrep', count: results.length, query, note: `Fuzzy search using RRF fusion of FTS and ripgrep (weights: exact=${fusionWeights.exact}, ripgrep=${fusionWeights.ripgrep})`, timing: TIMING_ENABLED ? timings : undefined, }, }; } /** * Mode: auto - Intent classification and mode selection * Routes to: hybrid (NL + index) | exact (index) | ripgrep (no index) */ async function executeAutoMode(params: Params): Promise { const { query, path = '.' } = params; if (!query) { return { success: false, error: 'Query is required for search action', }; } // Check index status const indexStatus = await checkIndexStatus(path); // Classify intent with index and embeddings awareness const classification = classifyIntent( query, indexStatus.indexed, indexStatus.has_embeddings // This now considers 50% threshold ); // Route to appropriate mode based on classification let result: SearchResult; switch (classification.mode) { case 'hybrid': result = await executeHybridMode(params); break; case 'exact': result = await executeCodexLensExactMode(params); break; case 'ripgrep': result = await executeRipgrepMode(params); break; default: // Fallback to ripgrep result = await executeRipgrepMode(params); break; } // Add classification metadata if (result.metadata) { result.metadata.classified_as = classification.mode; result.metadata.confidence = classification.confidence; result.metadata.reasoning = classification.reasoning; result.metadata.embeddings_coverage_percent = indexStatus.embeddings_coverage_percent; result.metadata.index_status = indexStatus.indexed ? (indexStatus.has_embeddings ? 'indexed' : 'partial') : 'not_indexed'; // Add warning if needed if (indexStatus.warning) { result.metadata.warning = indexStatus.warning; } } return result; } /** * Mode: ripgrep - Fast literal string matching using ripgrep * No index required, fallback to CodexLens if ripgrep unavailable * Supports tokenized multi-word queries with OR matching and result ranking */ async function executeRipgrepMode(params: Params): Promise { const { query, paths = [], contextLines = 0, maxResults = 5, extraFilesCount = 10, maxContentLength = 200, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params; if (!query) { return { success: false, error: 'Query is required for search', }; } // Check if ripgrep is available const hasRipgrep = checkToolAvailability('rg'); // Calculate total to fetch for split (full content + extra files) const totalToFetch = maxResults + extraFilesCount; // If ripgrep not available, fall back to CodexLens exact mode if (!hasRipgrep) { const readyStatus = await ensureCodexLensReady(); if (!readyStatus.ready) { return { success: false, error: 'Neither ripgrep nor CodexLens available. Install ripgrep (rg) or CodexLens for search functionality.', }; } // Use CodexLens fts mode as fallback const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--json']; const result = await executeCodexLens(args, { cwd: path }); if (!result.success) { return { success: false, error: result.error, metadata: { mode: 'ripgrep', backend: 'codexlens-fallback', count: 0, query, }, }; } // Parse results let allResults: SemanticMatch[] = []; try { const parsed = JSON.parse(stripAnsi(result.output || '{}')); const data = parsed.result?.results || parsed.results || parsed; allResults = (Array.isArray(data) ? data : []).map((item: any) => ({ file: item.path || item.file, score: item.score || 0, content: truncateContent(item.content || item.excerpt, maxContentLength), symbol: item.symbol || null, })); } catch { // Keep empty results } // Split results: first N with full content, rest as file paths only const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount); return { success: true, results, extra_files: extra_files.length > 0 ? extra_files : undefined, metadata: { mode: 'ripgrep', backend: 'codexlens-fallback', count: results.length, query, note: 'Using CodexLens exact mode (ripgrep not available)', }, }; } // Use ripgrep - request more results to support split const { command, args, tokens } = buildRipgrepCommand({ query, paths: paths.length > 0 ? paths : [path], contextLines, maxResults: totalToFetch, // Fetch more to support split includeHidden, regex, caseSensitive, tokenize, }); return new Promise((resolve) => { const child = spawn(command, args, { cwd: path || getProjectRoot(), stdio: ['ignore', 'pipe', 'pipe'], }); let stdout = ''; let stderr = ''; let resultLimitReached = false; child.stdout.on('data', (data) => { stdout += data.toString(); }); child.stderr.on('data', (data) => { stderr += data.toString(); }); child.on('close', (code) => { const allResults: ExactMatch[] = []; const lines = stdout.split('\n').filter((line) => line.trim()); // Limit total results to prevent memory overflow (--max-count only limits per-file) const effectiveLimit = totalToFetch > 0 ? totalToFetch : 500; for (const line of lines) { // Stop collecting if we've reached the limit if (allResults.length >= effectiveLimit) { resultLimitReached = true; break; } try { const item = JSON.parse(line); if (item.type === 'match') { const match: ExactMatch = { file: item.data.path.text, line: item.data.line_number, column: item.data.submatches && item.data.submatches[0] ? item.data.submatches[0].start + 1 : 1, content: item.data.lines.text.trim(), }; allResults.push(match); } } catch { continue; } } // Handle Windows device file errors gracefully (os error 1) // If we have results despite the error, return them as partial success const isWindowsDeviceError = stderr.includes('os error 1') || stderr.includes('函数不正确'); // Apply token-based scoring and sorting for multi-word queries // Results matching more tokens are ranked higher (exact matches first) const scoredResults = tokens.length > 1 ? scoreByTokenMatch(allResults, tokens) : allResults; if (code === 0 || code === 1 || (isWindowsDeviceError && scoredResults.length > 0)) { // Split results: first N with full content, rest as file paths only const { results, extra_files } = splitResultsWithExtraFiles(scoredResults, maxResults, extraFilesCount); // Build warning message for various conditions const warnings: string[] = []; if (resultLimitReached) { warnings.push(`Result limit reached (${effectiveLimit}). Use a more specific query or increase limit.`); } if (isWindowsDeviceError) { warnings.push('Some Windows device files were skipped'); } resolve({ success: true, results, extra_files: extra_files.length > 0 ? extra_files : undefined, metadata: { mode: 'ripgrep', backend: 'ripgrep', count: results.length, query, tokens: tokens.length > 1 ? tokens : undefined, // Include tokens in metadata for debugging tokenized: tokens.length > 1, ...(warnings.length > 0 && { warning: warnings.join('; ') }), }, }); } else if (isWindowsDeviceError && allResults.length === 0) { // Windows device error but no results - might be the only issue resolve({ success: true, results: [], metadata: { mode: 'ripgrep', backend: 'ripgrep', count: 0, query, warning: 'No matches found (some Windows device files were skipped)', }, }); } else { resolve({ success: false, error: `ripgrep execution failed with code ${code}: ${stderr}`, results: [], }); } }); child.on('error', (error) => { resolve({ success: false, error: `Failed to spawn ripgrep: ${error.message}`, results: [], }); }); }); } /** * Mode: exact - CodexLens exact/FTS search * Requires index */ async function executeCodexLensExactMode(params: Params): Promise { const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params; if (!query) { return { success: false, error: 'Query is required for search', }; } // Check CodexLens availability const readyStatus = await ensureCodexLensReady(); if (!readyStatus.ready) { return { success: false, error: `CodexLens not available: ${readyStatus.error}`, }; } // Check index status const indexStatus = await checkIndexStatus(path); // Request more results to support split (full content + extra files) const totalToFetch = maxResults + extraFilesCount; const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--json']; if (enrich) { args.push('--enrich'); } // Add code_only filter if requested if (codeOnly) { args.push('--code-only'); } // Add exclude_extensions filter if provided if (excludeExtensions && excludeExtensions.length > 0) { args.push('--exclude-extensions', excludeExtensions.join(',')); } const result = await executeCodexLens(args, { cwd: path }); if (!result.success) { return { success: false, error: result.error, metadata: { mode: 'exact', backend: 'codexlens', count: 0, query, warning: indexStatus.warning, }, }; } // Parse results let allResults: SemanticMatch[] = []; try { const parsed = JSON.parse(stripAnsi(result.output || '{}')); const data = parsed.result?.results || parsed.results || parsed; allResults = (Array.isArray(data) ? data : []).map((item: any) => ({ file: item.path || item.file, score: item.score || 0, content: truncateContent(item.content || item.excerpt, maxContentLength), symbol: item.symbol || null, })); } catch { // Keep empty results } // Fallback to fuzzy mode if exact returns no results if (allResults.length === 0) { const fuzzyArgs = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'fts', '--use-fuzzy', '--json']; if (enrich) { fuzzyArgs.push('--enrich'); } // Add code_only filter if requested if (codeOnly) { fuzzyArgs.push('--code-only'); } // Add exclude_extensions filter if provided if (excludeExtensions && excludeExtensions.length > 0) { fuzzyArgs.push('--exclude-extensions', excludeExtensions.join(',')); } const fuzzyResult = await executeCodexLens(fuzzyArgs, { cwd: path }); if (fuzzyResult.success) { try { const parsed = JSON.parse(stripAnsi(fuzzyResult.output || '{}')); const data = parsed.result?.results || parsed.results || parsed; allResults = (Array.isArray(data) ? data : []).map((item: any) => ({ file: item.path || item.file, score: item.score || 0, content: truncateContent(item.content || item.excerpt, maxContentLength), symbol: item.symbol || null, })); } catch { // Keep empty results } if (allResults.length > 0) { // Split results: first N with full content, rest as file paths only const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount); return { success: true, results, extra_files: extra_files.length > 0 ? extra_files : undefined, metadata: { mode: 'exact', backend: 'codexlens', count: results.length, query, warning: indexStatus.warning, note: 'No exact matches found, showing fuzzy results', fallback: 'fuzzy', }, }; } } } // Split results: first N with full content, rest as file paths only const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount); return { success: true, results, extra_files: extra_files.length > 0 ? extra_files : undefined, metadata: { mode: 'exact', backend: 'codexlens', count: results.length, query, warning: indexStatus.warning, }, }; } /** * Mode: hybrid - Best quality semantic search * Uses CodexLens dense_rerank method (dense coarse + cross-encoder rerank) * Requires index with embeddings */ async function executeHybridMode(params: Params): Promise { const timer = createTimer(); const { query, path = '.', maxResults = 5, extraFilesCount = 10, maxContentLength = 200, enrich = false, excludeExtensions, codeOnly = false, offset = 0 } = params; if (!query) { return { success: false, error: 'Query is required for search', }; } // Check CodexLens availability const readyStatus = await ensureCodexLensReady(); timer.mark('codexlens_ready_check'); if (!readyStatus.ready) { return { success: false, error: `CodexLens not available: ${readyStatus.error}`, }; } // Check index status const indexStatus = await checkIndexStatus(path); timer.mark('index_status_check'); // Request more results to support split (full content + extra files) const totalToFetch = maxResults + extraFilesCount; const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'dense_rerank', '--json']; if (enrich) { args.push('--enrich'); } // Add code_only filter if requested if (codeOnly) { args.push('--code-only'); } // Add exclude_extensions filter if provided if (excludeExtensions && excludeExtensions.length > 0) { args.push('--exclude-extensions', excludeExtensions.join(',')); } const result = await executeCodexLens(args, { cwd: path }); timer.mark('codexlens_search'); if (!result.success) { timer.log(); return { success: false, error: result.error, metadata: { mode: 'hybrid', backend: 'codexlens', count: 0, query, warning: indexStatus.warning, }, }; } // Parse results let allResults: SemanticMatch[] = []; let baselineInfo: { score: number; count: number } | null = null; let initialCount = 0; try { const parsed = JSON.parse(stripAnsi(result.output || '{}')); const data = parsed.result?.results || parsed.results || parsed; allResults = (Array.isArray(data) ? data : []).map((item: any) => { const rawScore = item.score || 0; // Hybrid mode returns distance scores (lower is better). // Convert to similarity scores (higher is better) for consistency. // Formula: similarity = 1 / (1 + distance) const similarityScore = rawScore > 0 ? 1 / (1 + rawScore) : 1; return { file: item.path || item.file, score: similarityScore, content: truncateContent(item.content || item.excerpt, maxContentLength), symbol: item.symbol || null, }; }); timer.mark('parse_results'); initialCount = allResults.length; // Post-processing pipeline to improve semantic search quality // 0. Filter dominant baseline scores (hot spot detection) const baselineResult = filterDominantBaselineScores(allResults); allResults = baselineResult.filteredResults; baselineInfo = baselineResult.baselineInfo; // 1. Filter noisy files (coverage, node_modules, etc.) and excluded extensions allResults = filterNoisyFiles(allResults, { excludeExtensions, codeOnly }); // 2. Boost results containing query keywords allResults = applyKeywordBoosting(allResults, query); // 3. Enforce score diversity (penalize identical scores) allResults = enforceScoreDiversity(allResults); // 4. Re-sort by adjusted scores allResults.sort((a, b) => b.score - a.score); timer.mark('post_processing'); } catch { return { success: true, results: [], output: result.output, metadata: { mode: 'hybrid', backend: 'codexlens', count: 0, query, warning: indexStatus.warning || 'Failed to parse JSON output', }, }; } // Split results: first N with full content, rest as file paths only const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount); timer.mark('split_results'); // Build metadata with baseline info if detected let note = 'Using dense_rerank (dense coarse + cross-encoder rerank) for semantic search'; if (baselineInfo) { note += ` | Filtered ${initialCount - allResults.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`; } // Log timing data timer.log(); const timings = timer.getTimings(); return { success: true, results, extra_files: extra_files.length > 0 ? extra_files : undefined, metadata: { mode: 'hybrid', backend: 'codexlens', count: results.length, query, note, warning: indexStatus.warning, suggested_weights: getRRFWeights(query), timing: TIMING_ENABLED ? timings : undefined, }, }; } /** * Query intent used to adapt RRF weights (Python parity). * * Keep this logic aligned with CodexLens Python hybrid search: * `codex-lens/src/codexlens/search/hybrid_search.py` */ export type QueryIntent = 'keyword' | 'semantic' | 'mixed'; // Python default: vector 60%, exact 30%, fuzzy 10% const DEFAULT_RRF_WEIGHTS = { exact: 0.3, fuzzy: 0.1, vector: 0.6, } as const; function normalizeWeights(weights: Record): Record { const sum = Object.values(weights).reduce((acc, v) => acc + v, 0); if (!Number.isFinite(sum) || sum <= 0) return { ...weights }; return Object.fromEntries(Object.entries(weights).map(([k, v]) => [k, v / sum])); } /** * Detect query intent using the same heuristic signals as Python: * - Code patterns: `.`, `::`, `->`, CamelCase, snake_case, common code keywords * - Natural language patterns: >5 words, question marks, interrogatives, common verbs */ export function detectQueryIntent(query: string): QueryIntent { const trimmed = query.trim(); if (!trimmed) return 'mixed'; const lower = trimmed.toLowerCase(); const wordCount = trimmed.split(/\s+/).filter(Boolean).length; const hasCodeSignals = /(::|->|\.)/.test(trimmed) || /[A-Z][a-z]+[A-Z]/.test(trimmed) || /\b\w+_\w+\b/.test(trimmed) || /\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b/i.test(lower); const hasNaturalSignals = wordCount > 5 || /\?/.test(trimmed) || /\b(how|what|why|when|where)\b/i.test(trimmed) || /\b(handle|explain|fix|implement|create|build|use|find|search|convert|parse|generate|support)\b/i.test(trimmed); if (hasCodeSignals && hasNaturalSignals) return 'mixed'; if (hasCodeSignals) return 'keyword'; if (hasNaturalSignals) return 'semantic'; return 'mixed'; } /** * Intent → weights mapping (Python parity). * - keyword: exact-heavy * - semantic: vector-heavy * - mixed: keep defaults */ export function adjustWeightsByIntent( intent: QueryIntent, baseWeights: Record, ): Record { if (intent === 'keyword') return normalizeWeights({ exact: 0.5, fuzzy: 0.1, vector: 0.4 }); if (intent === 'semantic') return normalizeWeights({ exact: 0.2, fuzzy: 0.1, vector: 0.7 }); return normalizeWeights({ ...baseWeights }); } export function getRRFWeights( query: string, baseWeights: Record = DEFAULT_RRF_WEIGHTS, ): Record { return adjustWeightsByIntent(detectQueryIntent(query), baseWeights); } /** * Post-processing: Filter noisy files from semantic search results * Uses FILTER_CONFIG patterns to remove irrelevant files. * Optimized: pre-compiled regexes, accurate path segment matching. */ // Pre-compile file exclusion regexes once (avoid recompilation in loop) const FILE_EXCLUDE_REGEXES = [...FILTER_CONFIG.exclude_files].map(pattern => new RegExp('^' + pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\\\*/g, '.*') + '$') ); // Non-code file extensions (for codeOnly filter) const NON_CODE_EXTENSIONS = new Set([ 'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log', 'ini', 'cfg', 'conf', 'toml', 'env', 'properties', 'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp', 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', 'lock', 'sum', 'mod', ]); interface FilterOptions { excludeExtensions?: string[]; codeOnly?: boolean; } function filterNoisyFiles(results: SemanticMatch[], options: FilterOptions = {}): SemanticMatch[] { const { excludeExtensions = [], codeOnly = false } = options; // Build extension filter set const excludedExtSet = new Set(excludeExtensions.map(ext => ext.toLowerCase().replace(/^\./, ''))); if (codeOnly) { NON_CODE_EXTENSIONS.forEach(ext => excludedExtSet.add(ext)); } return results.filter(r => { const filePath = r.file || ''; if (!filePath) return true; const segments = filePath.split(/[/\\]/); // Accurate directory check: segment must exactly match excluded directory if (segments.some(segment => FILTER_CONFIG.exclude_directories.has(segment))) { return false; } // Accurate file check: pattern matches filename only (not full path) const filename = segments.pop() || ''; if (FILE_EXCLUDE_REGEXES.some(regex => regex.test(filename))) { return false; } // Extension filter check if (excludedExtSet.size > 0) { const ext = filename.split('.').pop()?.toLowerCase() || ''; if (excludedExtSet.has(ext)) { return false; } } return true; }); } /** * Post-processing: Boost results containing query keywords * Extracts keywords from query and boosts matching results. * Optimized: uses whole-word matching with regex for accuracy. */ // Helper to escape regex special characters function escapeRegExp(str: string): string { return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } function applyKeywordBoosting(results: SemanticMatch[], query: string): SemanticMatch[] { // Extract meaningful keywords (ignore common words) const stopWords = new Set(['the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or', 'because', 'until', 'while', 'although', 'though', 'after', 'before', 'when', 'whenever', 'where', 'wherever', 'whether', 'which', 'who', 'whom', 'whose', 'what', 'whatever', 'whichever', 'whoever', 'whomever', 'this', 'that', 'these', 'those', 'it', 'its']); const keywords = query .toLowerCase() .split(/[\s,.;:()"{}[\]-]+/) // More robust splitting on punctuation .filter(word => word.length > 2 && !stopWords.has(word)); if (keywords.length === 0) return results; // Create case-insensitive regexes for whole-word matching const keywordRegexes = keywords.map(kw => new RegExp(`\\b${escapeRegExp(kw)}\\b`, 'i')); return results.map(r => { const content = r.content || ''; const file = r.file || ''; // Count keyword matches using whole-word regex let matchCount = 0; for (const regex of keywordRegexes) { if (regex.test(content) || regex.test(file)) { matchCount++; } } // Apply boost only if there are matches if (matchCount > 0) { const matchRatio = matchCount / keywords.length; const boost = 1 + (matchRatio * 0.3); // Up to 30% boost for full match return { ...r, score: r.score * boost, }; } return r; }); } /** * Post-processing: Enforce score diversity * Penalizes results with identical scores (indicates undifferentiated matching) */ function enforceScoreDiversity(results: SemanticMatch[]): SemanticMatch[] { if (results.length < 2) return results; // Count occurrences of each score (rounded to 3 decimal places for comparison) const scoreCounts = new Map(); for (const r of results) { const roundedScore = Math.round(r.score * 1000) / 1000; scoreCounts.set(roundedScore, (scoreCounts.get(roundedScore) || 0) + 1); } // Apply penalty to scores that appear more than twice return results.map(r => { const roundedScore = Math.round(r.score * 1000) / 1000; const count = scoreCounts.get(roundedScore) || 1; if (count > 2) { // Progressive penalty: more duplicates = bigger penalty const penalty = Math.max(0.7, 1 - (count * 0.05)); return { ...r, score: r.score * penalty }; } return r; }); } /** * Post-processing: Filter results with dominant baseline score (hot spot detection) * When backend returns default "hot spot" files with identical high scores, * this function detects and removes them. * * Detection criteria: * - A single score appears in >50% of results * - That score is suspiciously high (>0.9) * - This indicates fallback mechanism returned placeholder results */ function filterDominantBaselineScores( results: SemanticMatch[] ): { filteredResults: SemanticMatch[]; baselineInfo: { score: number; count: number } | null } { if (results.length < 4) { return { filteredResults: results, baselineInfo: null }; } // Count occurrences of each score (rounded to 4 decimal places) const scoreCounts = new Map(); results.forEach(r => { const rounded = Math.round(r.score * 10000) / 10000; scoreCounts.set(rounded, (scoreCounts.get(rounded) || 0) + 1); }); // Find the most dominant score let dominantScore: number | null = null; let dominantCount = 0; scoreCounts.forEach((count, score) => { if (count > dominantCount) { dominantCount = count; dominantScore = score; } }); // If a single score is present in >50% of results and is high (>0.9), // treat it as a suspicious baseline score and filter it out const BASELINE_THRESHOLD = 0.5; // >50% of results have same score const HIGH_SCORE_THRESHOLD = 0.9; // Score above 0.9 is suspiciously high if ( dominantScore !== null && dominantCount > results.length * BASELINE_THRESHOLD && dominantScore > HIGH_SCORE_THRESHOLD ) { const filteredResults = results.filter(r => { const rounded = Math.round(r.score * 10000) / 10000; return rounded !== dominantScore; }); return { filteredResults, baselineInfo: { score: dominantScore, count: dominantCount }, }; } return { filteredResults: results, baselineInfo: null }; } /** * TypeScript implementation of Reciprocal Rank Fusion * Reference: codex-lens/src/codexlens/search/ranking.py * Formula: score(d) = Σ weight_source / (k + rank_source(d)) */ function applyRRFFusion( resultsMap: Map, weightsOrQuery: Record | string, limit: number, k: number = 60, ): any[] { const weights = typeof weightsOrQuery === 'string' ? getRRFWeights(weightsOrQuery) : weightsOrQuery; const pathScores = new Map(); resultsMap.forEach((results, source) => { const weight = weights[source] || 0; if (weight === 0 || !results) return; results.forEach((result, rank) => { const path = result.file || result.path; if (!path) return; const rrfContribution = weight / (k + rank + 1); if (!pathScores.has(path)) { pathScores.set(path, { score: 0, result, sources: [] }); } const entry = pathScores.get(path)!; entry.score += rrfContribution; if (!entry.sources.includes(source)) { entry.sources.push(source); } }); }); // Sort by fusion score descending return Array.from(pathScores.values()) .sort((a, b) => b.score - a.score) .slice(0, limit) .map(item => ({ ...item.result, fusion_score: item.score, matched_backends: item.sources, })); } /** * Promise wrapper with timeout support * @param promise - The promise to wrap * @param ms - Timeout in milliseconds * @param modeName - Name of the mode for error message * @returns A new promise that rejects on timeout */ function withTimeout(promise: Promise, ms: number, modeName: string): Promise { return new Promise((resolve, reject) => { const timer = setTimeout(() => { reject(new Error(`'${modeName}' search timed out after ${ms}ms`)); }, ms); promise .then(resolve) .catch(reject) .finally(() => clearTimeout(timer)); }); } /** * Mode: priority - Fallback search strategy: hybrid -> exact -> ripgrep * Returns results from the first backend that succeeds and provides results. * More efficient than parallel mode - stops as soon as valid results are found. */ async function executePriorityFallbackMode(params: Params): Promise { const { query, path = '.' } = params; const fallbackHistory: string[] = []; if (!query) { return { success: false, error: 'Query is required for search' }; } // Check index status first const indexStatus = await checkIndexStatus(path); // 1. Try Hybrid search (highest priority) - 90s timeout for large indexes if (indexStatus.indexed && indexStatus.has_embeddings) { try { const hybridResult = await withTimeout(executeHybridMode(params), 90000, 'hybrid'); if (hybridResult.success && hybridResult.results && (hybridResult.results as any[]).length > 0) { fallbackHistory.push('hybrid: success'); return { ...hybridResult, metadata: { ...hybridResult.metadata, mode: 'priority', note: 'Result from hybrid search (semantic + vector).', fallback_history: fallbackHistory, }, }; } fallbackHistory.push('hybrid: no results'); } catch (error) { fallbackHistory.push(`hybrid: ${(error as Error).message}`); } } else { fallbackHistory.push(`hybrid: skipped (${!indexStatus.indexed ? 'no index' : 'no embeddings'})`); } // 2. Fallback to Exact search - 10s timeout if (indexStatus.indexed) { try { const exactResult = await withTimeout(executeCodexLensExactMode(params), 10000, 'exact'); if (exactResult.success && exactResult.results && (exactResult.results as any[]).length > 0) { fallbackHistory.push('exact: success'); return { ...exactResult, metadata: { ...exactResult.metadata, mode: 'priority', note: 'Result from exact/FTS search (fallback from hybrid).', fallback_history: fallbackHistory, }, }; } fallbackHistory.push('exact: no results'); } catch (error) { fallbackHistory.push(`exact: ${(error as Error).message}`); } } else { fallbackHistory.push('exact: skipped (no index)'); } // 3. Final fallback to Ripgrep - 5s timeout try { const ripgrepResult = await withTimeout(executeRipgrepMode(params), 5000, 'ripgrep'); fallbackHistory.push(ripgrepResult.success ? 'ripgrep: success' : 'ripgrep: failed'); return { ...ripgrepResult, metadata: { ...ripgrepResult.metadata, mode: 'priority', note: 'Result from ripgrep search (final fallback).', fallback_history: fallbackHistory, }, }; } catch (error) { fallbackHistory.push(`ripgrep: ${(error as Error).message}`); } // All modes failed return { success: false, error: 'All search backends in priority mode failed or returned no results.', metadata: { mode: 'priority', query, fallback_history: fallbackHistory, } as any, }; } // Tool schema for MCP export const schema: ToolSchema = { name: 'smart_search', description: `Unified code search tool with content search, file discovery, and semantic search capabilities. **Actions:** - search: Search file content (default) - find_files: Find files by path/name pattern (glob matching) - init: Create FTS index - status: Check index status - update: Incremental index update (for changed files) - watch: Start file watcher for automatic updates **Content Search (action="search"):** smart_search(query="authentication logic") # fuzzy mode (default) - FTS + ripgrep fusion smart_search(query="MyClass", mode="fuzzy") # fuzzy mode - fast hybrid search smart_search(query="how to auth", mode="semantic") # semantic mode - dense + reranker **File Discovery (action="find_files"):** smart_search(action="find_files", pattern="*.ts") # find all TypeScript files smart_search(action="find_files", pattern="src/**/*.js") # recursive glob pattern smart_search(action="find_files", pattern="test_*.py") # find test files smart_search(action="find_files", pattern="*.tsx", offset=20, limit=10) # pagination **Index Maintenance:** smart_search(action="update", path="/project") # incremental index update smart_search(action="watch", path="/project") # start file watcher smart_search(action="watch", debounce=2000) # custom debounce interval **Pagination:** All actions support offset/limit for paginated results: smart_search(query="auth", limit=10, offset=0) # first page smart_search(query="auth", limit=10, offset=10) # second page **Modes:** fuzzy (FTS + ripgrep fusion, default), semantic (dense + reranker)`, inputSchema: { type: 'object', properties: { action: { type: 'string', enum: ['init', 'search', 'find_files', 'status', 'search_files'], description: 'Action: search (content search), find_files (path pattern matching), init (create index), status (check index). Note: search_files is deprecated.', default: 'search', }, query: { type: 'string', description: 'Content search query (for action="search")', }, pattern: { type: 'string', description: 'Glob pattern for file discovery (for action="find_files"). Examples: "*.ts", "src/**/*.js", "test_*.py"', }, mode: { type: 'string', enum: SEARCH_MODES, description: 'Search mode: fuzzy (FTS + ripgrep fusion, default), semantic (dense + reranker for natural language queries)', default: 'fuzzy', }, output_mode: { type: 'string', enum: ['full', 'files_only', 'count'], description: 'Output format: full (default), files_only (paths only), count (per-file counts)', default: 'full', }, path: { type: 'string', description: 'Directory path for init/search actions (default: current directory)', }, paths: { type: 'array', description: 'Multiple paths to search within (for search action)', items: { type: 'string', }, default: [], }, contextLines: { type: 'number', description: 'Number of context lines around matches (exact mode only)', default: 0, }, maxResults: { type: 'number', description: 'Maximum number of results (default: 20)', default: 20, }, limit: { type: 'number', description: 'Alias for maxResults (default: 20)', default: 20, }, extraFilesCount: { type: 'number', description: 'Number of additional file-only results (paths without content)', default: 10, }, maxContentLength: { type: 'number', description: 'Maximum content length for truncation (50-2000)', default: 200, }, offset: { type: 'number', description: 'Pagination offset - skip first N results (default: 0)', default: 0, }, includeHidden: { type: 'boolean', description: 'Include hidden files/directories', default: false, }, languages: { type: 'array', items: { type: 'string' }, description: 'Languages to index (for init action). Example: ["javascript", "typescript"]', }, enrich: { type: 'boolean', description: 'Enrich search results with code graph relationships (calls, imports, called_by, imported_by).', default: false, }, regex: { type: 'boolean', description: 'Use regex pattern matching instead of literal string (ripgrep mode only). Default: enabled. Example: smart_search(query="class.*Builder")', default: true, }, caseSensitive: { type: 'boolean', description: 'Case-sensitive search (default: true). Set to false for case-insensitive matching.', default: true, }, tokenize: { type: 'boolean', description: 'Tokenize multi-word queries for OR matching (ripgrep mode). Default: true. Results are ranked by token match count (exact matches first).', default: true, }, }, required: [], }, }; /** * Action: find_files - Find files by path/name pattern (glob matching) * Unlike search which looks inside file content, find_files matches file paths */ async function executeFindFilesAction(params: Params): Promise { const { pattern, path = '.', limit = 20, offset = 0, includeHidden = false, caseSensitive = true } = params; if (!pattern) { return { success: false, error: 'Pattern is required for find_files action. Use glob patterns like "*.ts", "src/**/*.js", or "test_*.py"', }; } // Use ripgrep with --files flag for fast file listing with glob pattern const hasRipgrep = checkToolAvailability('rg'); if (!hasRipgrep) { // Fallback to CodexLens file listing if available const readyStatus = await ensureCodexLensReady(); if (!readyStatus.ready) { return { success: false, error: 'Neither ripgrep nor CodexLens available for file discovery.', }; } // Try CodexLens file list command const args = ['list-files', '--json']; const result = await executeCodexLens(args, { cwd: path }); if (!result.success) { return { success: false, error: `Failed to list files: ${result.error}`, }; } // Parse and filter results by pattern let files: string[] = []; try { const parsed = JSON.parse(stripAnsi(result.output || '[]')); files = Array.isArray(parsed) ? parsed : (parsed.files || []); } catch { return { success: false, error: 'Failed to parse file list from CodexLens', }; } // Apply glob pattern matching using minimatch-style regex const globRegex = globToRegex(pattern, caseSensitive); const matchedFiles = files.filter(f => globRegex.test(f)); // Apply pagination const total = matchedFiles.length; const paginatedFiles = matchedFiles.slice(offset, offset + limit); const results: FileMatch[] = paginatedFiles.map(filePath => { const parts = filePath.split(/[/\\]/); const name = parts[parts.length - 1] || ''; const ext = name.includes('.') ? name.split('.').pop() : undefined; return { path: filePath, type: 'file' as const, name, extension: ext, }; }); return { success: true, results, metadata: { pattern, backend: 'codexlens', count: results.length, pagination: { offset, limit, total, has_more: offset + limit < total, }, }, }; } // Use ripgrep --files with glob pattern for fast file discovery return new Promise((resolve) => { const args = ['--files']; // Add exclude patterns if (!includeHidden) { args.push(...buildExcludeArgs()); } else { args.push('--hidden'); } // Add glob pattern args.push('--glob', pattern); // Case sensitivity for glob matching if (!caseSensitive) { args.push('--iglob', pattern); // Remove the case-sensitive glob and use iglob instead const globIndex = args.indexOf('--glob'); if (globIndex !== -1) { args.splice(globIndex, 2); } } const child = spawn('rg', args, { cwd: path || getProjectRoot(), stdio: ['ignore', 'pipe', 'pipe'], }); let stdout = ''; let stderr = ''; child.stdout.on('data', (data) => { stdout += data.toString(); }); child.stderr.on('data', (data) => { stderr += data.toString(); }); child.on('close', (code) => { // ripgrep returns 1 when no matches found, which is not an error if (code !== 0 && code !== 1 && !stderr.includes('os error 1')) { resolve({ success: false, error: `ripgrep file search failed: ${stderr}`, }); return; } const allFiles = stdout.split('\n').filter(line => line.trim()); const total = allFiles.length; // Apply pagination const paginatedFiles = allFiles.slice(offset, offset + limit); const results: FileMatch[] = paginatedFiles.map(filePath => { const normalizedPath = filePath.replace(/\\/g, '/'); const parts = normalizedPath.split('/'); const name = parts[parts.length - 1] || ''; const ext = name.includes('.') ? name.split('.').pop() : undefined; return { path: normalizedPath, type: 'file' as const, name, extension: ext, }; }); resolve({ success: true, results, metadata: { pattern, backend: 'ripgrep', count: results.length, pagination: { offset, limit, total, has_more: offset + limit < total, }, }, }); }); child.on('error', (error) => { resolve({ success: false, error: `Failed to spawn ripgrep: ${error.message}`, }); }); }); } /** * Convert glob pattern to regex for file matching * Supports: *, **, ?, [abc], [!abc] */ function globToRegex(pattern: string, caseSensitive: boolean = true): RegExp { let i = 0; const out: string[] = []; const special = '.^$+{}|()'; while (i < pattern.length) { const c = pattern[i]; if (c === '*') { if (i + 1 < pattern.length && pattern[i + 1] === '*') { // ** matches any path including / out.push('.*'); i += 2; // Skip following / if present if (pattern[i] === '/') { i++; } continue; } else { // * matches any character except / out.push('[^/]*'); } } else if (c === '?') { out.push('[^/]'); } else if (c === '[') { // Character class let j = i + 1; let negated = false; if (pattern[j] === '!' || pattern[j] === '^') { negated = true; j++; } let classContent = ''; while (j < pattern.length && pattern[j] !== ']') { classContent += pattern[j]; j++; } if (negated) { out.push(`[^${classContent}]`); } else { out.push(`[${classContent}]`); } i = j; } else if (special.includes(c)) { out.push('\\' + c); } else { out.push(c); } i++; } const flags = caseSensitive ? '' : 'i'; return new RegExp('^' + out.join('') + '$', flags); } /** * Apply pagination to search results and add pagination metadata */ function applyPagination( results: T[], offset: number, limit: number ): { paginatedResults: T[]; pagination: PaginationInfo } { const total = results.length; const paginatedResults = results.slice(offset, offset + limit); return { paginatedResults, pagination: { offset, limit, total, has_more: offset + limit < total, }, }; } /** * Transform results based on output_mode */ function transformOutput( results: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown[], outputMode: 'full' | 'files_only' | 'count' ): unknown { if (!Array.isArray(results)) { return results; } switch (outputMode) { case 'files_only': { // Extract unique file paths const files = [...new Set(results.map((r: any) => r.file))].filter(Boolean); return { files, count: files.length }; } case 'count': { // Count matches per file const counts: Record = {}; for (const r of results) { const file = (r as any).file; if (file) { counts[file] = (counts[file] || 0) + 1; } } return { files: Object.entries(counts).map(([file, count]) => ({ file, count })), total: results.length, }; } case 'full': default: return results; } } // Handler function export async function handler(params: Record): Promise> { const parsed = ParamsSchema.safeParse(params); if (!parsed.success) { return { success: false, error: `Invalid params: ${parsed.error.message}` }; } const { action, mode, output_mode, offset = 0 } = parsed.data; // Sync limit and maxResults - use the larger of the two if both provided // This ensures user-provided values take precedence over defaults const effectiveLimit = Math.max(parsed.data.limit || 20, parsed.data.maxResults || 20); parsed.data.maxResults = effectiveLimit; parsed.data.limit = effectiveLimit; // Track if search_files was used (deprecated) let deprecationWarning: string | undefined; try { let result: SearchResult; // Handle actions switch (action) { case 'init': result = await executeInitAction(parsed.data); break; case 'status': result = await executeStatusAction(parsed.data); break; case 'find_files': // NEW: File path/name pattern matching (glob-based) result = await executeFindFilesAction(parsed.data); break; case 'update': // Incremental index update result = await executeUpdateAction(parsed.data); break; case 'watch': // Start file watcher (returns status, watcher runs in background) result = await executeWatchAction(parsed.data); break; case 'search_files': // DEPRECATED: Redirect to search with files_only output deprecationWarning = 'action="search_files" is deprecated. Use action="search" with output_mode="files_only" for content-to-files search, or action="find_files" for path pattern matching.'; parsed.data.output_mode = 'files_only'; // Fall through to search case 'search': default: // Handle search modes: fuzzy | semantic switch (mode) { case 'fuzzy': result = await executeFuzzyMode(parsed.data); break; case 'semantic': result = await executeHybridMode(parsed.data); break; default: throw new Error(`Unsupported mode: ${mode}. Use: fuzzy or semantic`); } break; } // Transform output based on output_mode (for search actions only) if (action === 'search' || action === 'search_files') { if (result.success && result.results && output_mode !== 'full') { result.results = transformOutput(result.results as any[], output_mode); } // Add pagination metadata for search results if not already present if (result.success && result.results && Array.isArray(result.results)) { const totalResults = (result.results as any[]).length; if (!result.metadata) { result.metadata = {}; } if (!result.metadata.pagination) { result.metadata.pagination = { offset: 0, limit: effectiveLimit, total: totalResults, has_more: false, // Already limited by backend }; } } } // Add deprecation warning if applicable if (deprecationWarning && result.metadata) { result.metadata.warning = deprecationWarning; } return result.success ? { success: true, result } : { success: false, error: result.error }; } catch (error) { return { success: false, error: (error as Error).message }; } } /** * Execute init action with external progress callback * Used by MCP server for streaming progress */ export async function executeInitWithProgress( params: Record, onProgress?: (progress: ProgressInfo) => void ): Promise { const path = (params.path as string) || '.'; const languages = params.languages as string[] | undefined; // Check CodexLens availability const readyStatus = await ensureCodexLensReady(); if (!readyStatus.ready) { return { success: false, error: `CodexLens not available: ${readyStatus.error}. CodexLens will be auto-installed on first use.`, }; } // Use 'index init' subcommand (new CLI structure) const args = ['index', 'init', path]; if (languages && languages.length > 0) { args.push('--language', languages.join(',')); } // Track progress updates const progressUpdates: ProgressInfo[] = []; let lastProgress: ProgressInfo | null = null; const result = await executeCodexLens(args, { cwd: path, timeout: 1800000, // 30 minutes for large codebases onProgress: (progress: ProgressInfo) => { progressUpdates.push(progress); lastProgress = progress; // Call external progress callback if provided if (onProgress) { onProgress(progress); } }, }); // Build metadata with progress info const metadata: SearchMetadata = { action: 'init', path, }; if (lastProgress !== null) { const p = lastProgress as ProgressInfo; metadata.progress = { stage: p.stage, message: p.message, percent: p.percent, filesProcessed: p.filesProcessed, totalFiles: p.totalFiles, }; } if (progressUpdates.length > 0) { metadata.progressHistory = progressUpdates.slice(-5); } return { success: result.success, error: result.error, message: result.success ? `CodexLens index created successfully for ${path}` : undefined, metadata, }; }