Files
Claude-Code-Workflow/ccw/src/tools/smart-search.ts

2548 lines
80 KiB
TypeScript

/**
* Smart Search Tool - Unified intelligent search with CodexLens integration
*
* Features:
* - Fuzzy mode: FTS + ripgrep fusion with RRF ranking (default)
* - Semantic mode: Dense coarse retrieval + cross-encoder reranking
* - CodexLens integration (init, dense_rerank, fts)
* - Ripgrep fallback for exact mode
* - Index status checking and warnings
* - Multi-backend search routing with RRF ranking
*
* Actions:
* - init: Initialize CodexLens index
* - search: Intelligent search with fuzzy (default) or semantic mode
* - status: Check index status
* - update: Incremental index update for changed files
* - watch: Start file watcher for automatic updates
*/
import { z } from 'zod';
import type { ToolSchema, ToolResult } from '../types/tool.js';
import { spawn, execSync } from 'child_process';
import {
ensureReady as ensureCodexLensReady,
executeCodexLens,
} from './codex-lens.js';
import type { ProgressInfo } from './codex-lens.js';
import { getProjectRoot } from '../utils/path-validator.js';
// Timing instrumentation. Logging is active when SMART_SEARCH_TIMING=1 or DEBUG contains "timing".
const TIMING_ENABLED = process.env.SMART_SEARCH_TIMING === '1' || process.env.DEBUG?.includes('timing');

/** Elapsed milliseconds keyed by mark name; `_total` holds the time since the timer was created. */
interface TimingData {
  [key: string]: number;
}

/**
 * Create a lap timer.
 *
 * - `mark(name)` records the time elapsed since the previous mark (or since creation).
 * - `getTimings()` returns every recorded lap plus `_total`, each rounded to two decimals.
 * - `log()` prints the timings to stderr, but only when TIMING_ENABLED is truthy.
 */
function createTimer(): { mark: (name: string) => void; getTimings: () => TimingData; log: () => void } {
  const origin = performance.now();
  const laps: Array<{ name: string; time: number }> = [];
  let previous = origin;

  // Round to two decimal places (hundredths of a millisecond).
  const roundMs = (ms: number): number => Math.round(ms * 100) / 100;

  const mark = (name: string): void => {
    const current = performance.now();
    laps.push({ name, time: current - previous });
    previous = current;
  };

  const getTimings = (): TimingData => {
    const snapshot: TimingData = {};
    for (const lap of laps) {
      snapshot[lap.name] = roundMs(lap.time);
    }
    snapshot['_total'] = roundMs(performance.now() - origin);
    return snapshot;
  };

  const log = (): void => {
    if (!TIMING_ENABLED) return;
    console.error(`[TIMING] smart-search: ${JSON.stringify(getTimings())}`);
  };

  return { mark, getTimings, log };
}
// Zod schema for the tool's input parameters.
// Every field is either optional or carries a default, so an empty object parses successfully.
const ParamsSchema = z.object({
// Action: search (content), find_files (path/name pattern), init, status, update (incremental), watch
// Note: search_files is deprecated, use search with output_mode='files_only'
action: z.enum(['init', 'search', 'search_files', 'find_files', 'status', 'update', 'watch']).default('search'),
query: z.string().optional().describe('Content search query (for action="search")'),
pattern: z.string().optional().describe('Glob pattern for path matching (for action="find_files")'),
mode: z.enum(['fuzzy', 'semantic']).default('fuzzy'),
output_mode: z.enum(['full', 'files_only', 'count']).default('full'),
// Single search root; the action handlers fall back to '.' when it is omitted
path: z.string().optional(),
// Explicit list of search paths; when non-empty, ripgrep mode uses it instead of `path`
paths: z.array(z.string()).default([]),
contextLines: z.number().default(0),
maxResults: z.number().default(5), // Default 5 with full content
includeHidden: z.boolean().default(false),
// Joined with ',' and passed as --language to the CodexLens init/update/watch commands
languages: z.array(z.string()).optional(),
limit: z.number().default(5), // Default 5 with full content
extraFilesCount: z.number().default(10), // Additional file-only results
// NOTE(review): the 50-2000 range below is documentation only; this schema does not enforce min/max
maxContentLength: z.number().default(200), // Max content length for truncation (50-2000)
offset: z.number().default(0), // Pagination offset (index of the first result to return)
enrich: z.boolean().default(false),
// Search modifiers for ripgrep mode
regex: z.boolean().default(true), // Use regex pattern matching (default: enabled)
caseSensitive: z.boolean().default(true), // Case sensitivity (default: case-sensitive)
tokenize: z.boolean().default(true), // Tokenize multi-word queries for OR matching (default: enabled)
// File type filtering
excludeExtensions: z.array(z.string()).optional().describe('File extensions to exclude from results (e.g., ["md", "txt"])'),
codeOnly: z.boolean().default(false).describe('Only return code files (excludes md, txt, json, yaml, xml, etc.)'),
// Watcher options
debounce: z.number().default(1000).describe('Debounce interval in ms for watch action'),
// Fuzzy matching is implicit in hybrid mode (RRF fusion)
});
// Parameter type after parsing (defaults applied), inferred from ParamsSchema
type Params = z.infer<typeof ParamsSchema>;
// Search mode constants (same values as the `mode` enum in ParamsSchema)
// NOTE(review): not referenced in the portion of the file reviewed here — confirm it is still used
const SEARCH_MODES = ['fuzzy', 'semantic'] as const;
// Classification confidence threshold
// NOTE(review): presumably compared against Classification.confidence; no usage is visible here — confirm
const CONFIDENCE_THRESHOLD = 0.7;
// File filtering configuration (ported from code-index)
// buildExcludeArgs() turns each entry into a ripgrep exclusion glob.
const FILTER_CONFIG = {
// Directory names excluded at any depth (emitted as `!**/<name>/**`)
// NOTE(review): '.DS_Store' and 'Thumbs.db' are file names, so a directory-style glob
// will not exclude them — consider moving them to exclude_files
exclude_directories: new Set([
'.git', '.svn', '.hg', '.bzr',
'node_modules', '__pycache__', '.venv', 'venv', 'vendor', 'bower_components',
'dist', 'build', 'target', 'out', 'bin', 'obj',
'.idea', '.vscode', '.vs', '.sublime-workspace',
'.pytest_cache', '.coverage', '.tox', '.nyc_output', 'coverage', 'htmlcov',
'.next', '.nuxt', '.cache', '.parcel-cache',
'.DS_Store', 'Thumbs.db',
]),
// File name patterns excluded from search (temp/backup files, logs, lock files)
exclude_files: new Set([
'*.tmp', '*.temp', '*.swp', '*.swo', '*.bak', '*~', '*.orig', '*.log',
'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'Pipfile.lock',
]),
// Windows device files - must use **/ pattern to match in any directory
// These cause "os error 1" on Windows when accessed
windows_device_files: new Set([
'nul', 'con', 'aux', 'prn',
'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
]),
};
/**
 * Build the ripgrep exclusion arguments derived from FILTER_CONFIG.
 *
 * - Excluded directories become `--glob !**&#47;<dir>/**`.
 * - Excluded file patterns become `--glob !<pattern>`.
 * - Windows device names become `--iglob !**&#47;<name>`.
 *
 * @returns Flat argument list ready to be spread into the rg command line
 */
function buildExcludeArgs(): string[] {
  const args: string[] = [];
  for (const dir of FILTER_CONFIG.exclude_directories) {
    args.push('--glob', `!**/${dir}/**`);
  }
  for (const pattern of FILTER_CONFIG.exclude_files) {
    args.push('--glob', `!${pattern}`);
  }
  // Windows device names are reserved in any letter case (nul, NUL, Nul, ...).
  // --iglob is ripgrep's case-insensitive variant of --glob, so a single entry per
  // device covers every casing instead of only all-lowercase and all-uppercase.
  for (const device of FILTER_CONFIG.windows_device_files) {
    args.push('--iglob', `!**/${device}`);
  }
  return args;
}
/**
 * Break a search query into tokens for multi-word OR matching.
 *
 * The query is split on whitespace, commas, semicolons and colons. A token is kept when it is
 * at least two characters long and is either not a stop word, or looks like an identifier
 * (contains an underscore or an uppercase letter).
 *
 * @param query - The search query
 * @returns Surviving tokens, in their original order and casing
 */
function tokenizeQuery(query: string): string[] {
  // Common English filler words that add no value as search terms.
  const fillerWords: ReadonlySet<string> = new Set([
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'must', 'can', 'to', 'of', 'in', 'for', 'on',
    'with', 'at', 'by', 'from', 'as', 'into', 'through', 'and', 'but', 'if',
    'or', 'not', 'this', 'that', 'these', 'those', 'it', 'its', 'how', 'what',
    'where', 'when', 'why', 'which', 'who', 'whom',
  ]);

  // An underscore or any uppercase letter marks the word as identifier-like.
  const looksLikeIdentifier = (word: string): boolean => word.includes('_') || /[A-Z]/.test(word);

  const kept: string[] = [];
  for (const piece of query.split(/[\s,;:]+/)) {
    const word = piece.trim();
    if (word.length < 2) continue;
    if (fillerWords.has(word.toLowerCase()) && !looksLikeIdentifier(word)) continue;
    kept.push(word);
  }
  return kept;
}
/**
 * Rank matches by how many of the query tokens they contain.
 *
 * Each token is matched case-insensitively, as a literal substring, against
 * "<file> <content>". Every result gains `matchCount` (tokens found) and `matchScore`
 * (matchCount / tokens.length). Results are ordered by score descending, then by line
 * number ascending. With zero or one token the input array is returned untouched.
 *
 * @param results - Search results
 * @param tokens - Query tokens
 * @returns Scored and sorted results (a new array when scoring is applied)
 */
function scoreByTokenMatch(results: ExactMatch[], tokens: string[]): ExactMatch[] {
  if (tokens.length <= 1) {
    return results;
  }

  // One literal, case-insensitive matcher per token (regex metacharacters escaped).
  const matchers: RegExp[] = [];
  for (const token of tokens) {
    matchers.push(new RegExp(token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'i'));
  }

  const scored = results.map(entry => {
    const haystack = `${entry.file || ''} ${entry.content || ''}`;
    const matchCount = matchers.filter(matcher => matcher.test(haystack)).length;
    return {
      ...entry,
      matchScore: matchCount / tokens.length,
      matchCount,
    };
  });

  scored.sort((left, right) => {
    const byScore = right.matchScore - left.matchScore;
    if (byScore !== 0) return byScore;
    return (left.line || 0) - (right.line || 0);
  });
  return scored;
}
// Result of classifyIntent(): the recommended search mode plus an explanation
interface Classification {
mode: string;
confidence: number;
reasoning: string;
}
// A single line-level match (the shape ripgrep results are converted into)
interface ExactMatch {
file: string;
line: number;
column: number;
content: string;
matchScore?: number; // Token match ratio (0-1) for multi-word queries
matchCount?: number; // Number of tokens matched
}
// One relationship edge attached to a semantic match
interface RelationshipInfo {
type: string; // 'calls', 'imports', 'called_by', 'imported_by'
direction: 'outgoing' | 'incoming';
target?: string; // Target symbol name (for outgoing)
source?: string; // Source symbol name (for incoming)
file: string; // File path
line?: number; // Line number
}
// A scored match parsed from CodexLens search output
interface SemanticMatch {
file: string;
score: number;
content: string;
symbol: string | null;
relationships?: RelationshipInfo[];
}
// NOTE(review): not produced anywhere in the portion of the file reviewed here — confirm usage
interface GraphMatch {
file: string;
symbols: unknown;
relationships: unknown[];
}
// File match for find_files action (path-based search)
interface FileMatch {
path: string;
type: 'file' | 'directory';
name: string; // Filename only
extension?: string; // File extension (without dot)
}
// Pagination details reported through SearchMetadata.pagination
interface PaginationInfo {
offset: number; // Starting index of returned results
limit: number; // Number of results requested
total: number; // Total number of results found
has_more: boolean; // True if more results are available
}
// Descriptive metadata returned alongside results; every field is optional and
// each action/mode fills in only the subset relevant to it
interface SearchMetadata {
mode?: string;
backend?: string;
count?: number;
query?: string;
pattern?: string; // For find_files action
classified_as?: string;
confidence?: number;
reasoning?: string;
embeddings_coverage_percent?: number;
warning?: string;
note?: string;
index_status?: 'indexed' | 'not_indexed' | 'partial';
fallback?: string; // Fallback mode used (e.g., 'fuzzy')
fallback_history?: string[];
suggested_weights?: Record<string, number>;
// Tokenization metadata (ripgrep mode)
tokens?: string[]; // Query tokens used for multi-word search
tokenized?: boolean; // Whether tokenization was applied
// Pagination metadata
pagination?: PaginationInfo;
// Performance timing data (when SMART_SEARCH_TIMING=1 or DEBUG includes 'timing')
timing?: TimingData;
// Init action specific
action?: string;
path?: string;
// Most recent progress report received while running an index command
progress?: {
stage: string;
message: string;
percent: number;
filesProcessed?: number;
totalFiles?: number;
};
// Trailing progress reports (the init/update actions keep the last 5)
progressHistory?: ProgressInfo[];
}
// Common return shape of every action/mode handler in this module
interface SearchResult {
success: boolean;
results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown;
extra_files?: string[]; // Additional file paths without content
output?: string;
metadata?: SearchMetadata;
error?: string;
status?: unknown;
message?: string;
}
// Embedding model details copied from the CodexLens status output (embeddings.model_info)
interface ModelInfo {
model_profile?: string;
model_name?: string;
embedding_dim?: number;
backend?: string;
created_at?: string;
updated_at?: string;
}
// Index state summary produced by checkIndexStatus()
interface IndexStatus {
indexed: boolean;
has_embeddings: boolean; // True when embeddings coverage is at least 50%
file_count?: number;
embeddings_coverage_percent?: number;
total_chunks?: number;
model_info?: ModelInfo | null;
warning?: string;
}
/**
 * Remove ANSI color/style escape sequences (ESC [ ... m) from a string so that
 * CLI output can be handed to JSON.parse.
 */
function stripAnsi(str: string): string {
  const colorSequence = /\x1b\[[0-9;]*m/g;
  return str.replace(colorSequence, '');
}
/** Default maximum content length to return (avoid excessive output) */
const DEFAULT_MAX_CONTENT_LENGTH = 200;

/**
 * Truncate content to at most `maxLength` UTF-16 code units, appending "..." when text was cut.
 *
 * - Empty, null or undefined content yields ''.
 * - Content that already fits is returned unchanged (no ellipsis).
 * - A negative or non-finite limit is treated as 0 instead of letting `slice` count from the end.
 * - The cut never separates a surrogate pair: if it would land between the two halves of an
 *   astral character (e.g. an emoji), the dangling high surrogate is dropped as well.
 *
 * @param content - The content to truncate
 * @param maxLength - Maximum length (default: 200)
 * @returns The original or truncated text
 */
function truncateContent(content: string | null | undefined, maxLength: number = DEFAULT_MAX_CONTENT_LENGTH): string {
  if (!content) return '';
  if (content.length <= maxLength) return content;

  let end = Number.isFinite(maxLength) ? Math.max(0, Math.trunc(maxLength)) : 0;

  if (end > 0 && end < content.length) {
    const last = content.charCodeAt(end - 1);
    const next = content.charCodeAt(end);
    const lastIsHighSurrogate = last >= 0xd800 && last <= 0xdbff;
    const nextIsLowSurrogate = next >= 0xdc00 && next <= 0xdfff;
    if (lastIsHighSurrogate && nextIsLowSurrogate) {
      end -= 1;
    }
  }
  return content.slice(0, end) + '...';
}
/**
 * Split results into full-content results and extra file-only results.
 * Generic over any result type that has a `file` property (e.g. SemanticMatch, ExactMatch).
 *
 * The first `fullContentLimit` entries are returned as-is. The following `extraFilesCount`
 * entries contribute only their file paths, de-duplicated and excluding any file that is
 * already represented in the full-content results, so `extra_files` only lists files the
 * caller has not seen yet.
 *
 * @param allResults - All search results, best first
 * @param fullContentLimit - Number of results with full content (default: 5)
 * @param extraFilesCount - Number of additional results scanned for file paths (default: 10)
 * @returns The full-content results and the additional file paths
 */
function splitResultsWithExtraFiles<T extends { file: string }>(
  allResults: T[],
  fullContentLimit: number = 5,
  extraFilesCount: number = 10
): { results: T[]; extra_files: string[] } {
  // First N results with full content
  const results = allResults.slice(0, fullContentLimit);
  const shownFiles = new Set(results.map(r => r.file));

  // Next M results as file paths only. A Set keeps first-seen order and removes duplicates.
  const extraFiles = new Set<string>();
  for (const candidate of allResults.slice(fullContentLimit, fullContentLimit + extraFilesCount)) {
    if (!shownFiles.has(candidate.file)) {
      extraFiles.add(candidate.file);
    }
  }
  return { results, extra_files: [...extraFiles] };
}
/**
 * Query CodexLens (`status --json`) for the index state of a directory.
 *
 * Never throws: any failure is reported as `{ indexed: false, has_embeddings: false }`
 * with an explanatory `warning`.
 *
 * @param path - Directory used as the working directory for the status command
 * @returns Index status, including embeddings coverage and model info when available
 */
async function checkIndexStatus(path: string = '.'): Promise<IndexStatus> {
  const noIndexWarning = 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.';

  // Shape returned whenever the status could not be determined.
  const unavailable = (warning: string): IndexStatus => ({
    indexed: false,
    has_embeddings: false,
    warning,
  });

  try {
    const result = await executeCodexLens(['status', '--json'], { cwd: path });
    if (!result.success) {
      return unavailable(noIndexWarning);
    }

    try {
      // The CLI may colorize its output; strip the escape codes before parsing.
      const parsed = JSON.parse(stripAnsi(result.output || '{}'));
      // The payload is either the status object itself or wrapped as { success, result: {...} }.
      const status = parsed.result || parsed;
      const indexed = status.projects_count > 0 || status.total_files > 0;

      const embeddings = status.embeddings || {};
      const coverage = embeddings.coverage_percent || 0;
      const rawModel = embeddings.model_info;

      let warning: string | undefined;
      if (!indexed) {
        warning = noIndexWarning;
      } else if (coverage === 0) {
        warning = 'Index exists but no embeddings generated. Run: codexlens embeddings-generate --recursive';
      } else if (coverage < 50) {
        warning = `Embeddings coverage is ${coverage.toFixed(1)}% (below 50%). Hybrid search will use exact mode. Run: codexlens embeddings-generate --recursive`;
      }

      return {
        indexed,
        // Embeddings count as available once coverage reaches 50%.
        has_embeddings: coverage >= 50,
        file_count: status.total_files,
        embeddings_coverage_percent: coverage,
        total_chunks: embeddings.total_chunks || 0,
        // null (not undefined) so the key survives JSON serialization
        model_info: rawModel
          ? {
              model_profile: rawModel.model_profile,
              model_name: rawModel.model_name,
              embedding_dim: rawModel.embedding_dim,
              backend: rawModel.backend,
              created_at: rawModel.created_at,
              updated_at: rawModel.updated_at,
            }
          : null,
        warning,
      };
    } catch {
      return unavailable('Failed to parse index status');
    }
  } catch {
    return unavailable('CodexLens not available');
  }
}
/**
* Detection heuristics for intent classification
*/
/**
 * Report whether the query is a literal: either a single bare word made of letters,
 * digits, underscores and hyphens, or text wrapped in single or double quotes.
 */
function detectLiteral(query: string): boolean {
  const bareWord = /^[a-zA-Z0-9_-]+$/;
  const quoted = /^["'].*["']$/;
  if (bareWord.test(query)) {
    return true;
  }
  return quoted.test(query);
}
/**
 * Report whether the query contains at least one regex metacharacter
 * (. * + ? ^ $ { } ( ) | [ ] or a backslash).
 */
function detectRegex(query: string): boolean {
  const metaCharacter = /[.*+?^${}()|[\]\\]/;
  return query.search(metaCharacter) !== -1;
}
/**
 * Report whether the query reads like natural language: it splits into three or more
 * whitespace-separated parts, or it ends with a question mark.
 */
function detectNaturalLanguage(query: string): boolean {
  const parts = query.split(/\s+/);
  if (parts.length >= 3) {
    return true;
  }
  return /\?$/.test(query);
}
/**
 * Report whether the query looks like a file path: it contains a forward or back slash,
 * or it ends with a dot followed by a 2-4 letter extension.
 */
function detectFilePath(query: string): boolean {
  const hasSeparator = /[/\\]/.test(query);
  const hasExtension = /\.[a-z]{2,4}$/i.test(query);
  return hasSeparator || hasExtension;
}
/**
 * Report whether the query mentions a relationship keyword (import, export, use(s),
 * depend(s), call(s), extend(s)) immediately followed by whitespace. Case-insensitive.
 */
function detectRelationship(query: string): boolean {
  const relationshipKeyword = /(import|export|uses?|depends?|calls?|extends?)\s/i;
  return relationshipKeyword.test(query);
}
/**
 * Report whether the query looks like code rather than prose. True when any of these hold:
 * - it is a single bare identifier;
 * - it contains an operator/punctuation character and has at most two whitespace-separated parts;
 * - it contains a regex fragment (".*", an escaped "(" or "[", or "\s");
 * - it is a dotted member access such as `object.member`.
 */
function looksLikeCodeQuery(query: string): boolean {
  const partCount = query.split(/\s+/).length;
  const isBareIdentifier = /^[a-zA-Z_][a-zA-Z0-9_]*$/.test(query);
  const hasOperatorChars = /[:.<>\-=(){}[\]]/.test(query) && partCount <= 2;
  const hasRegexFragment = /\.\*|\\\(|\\\[|\\s/.test(query);
  const isDottedAccess = /^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$/.test(query);
  return isBareIdentifier || hasOperatorChars || hasRegexFragment || isDottedAccess;
}
/**
 * Classify a query and recommend a search mode.
 *
 * Decision order:
 * 1. no index -> 'ripgrep' (confidence 1.0)
 * 2. code-like or regex-like query -> 'exact' (0.95)
 * 3. natural language and sufficient embeddings -> 'hybrid' (0.9)
 * 4. anything else -> 'exact' (0.8)
 *
 * @param query - Search query string
 * @param hasIndex - Whether CodexLens index exists
 * @param hasSufficientEmbeddings - Whether embeddings coverage >= 50%
 * @returns The chosen mode, its confidence, and a human-readable reasoning string
 */
function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification {
  const naturalLanguage = detectNaturalLanguage(query);
  const codeLike = looksLikeCodeQuery(query);
  const regexLike = detectRegex(query);

  const decide = (): { mode: string; confidence: number } => {
    if (!hasIndex) return { mode: 'ripgrep', confidence: 1.0 };
    if (codeLike || regexLike) return { mode: 'exact', confidence: 0.95 };
    if (naturalLanguage && hasSufficientEmbeddings) return { mode: 'hybrid', confidence: 0.9 };
    return { mode: 'exact', confidence: 0.8 };
  };
  const { mode, confidence } = decide();

  // Every heuristic that fired is listed in the reasoning, in this fixed order.
  const signals: Array<[boolean, string]> = [
    [detectLiteral(query), 'literal'],
    [regexLike, 'regex'],
    [naturalLanguage, 'natural language'],
    [detectFilePath(query), 'file path'],
    [detectRelationship(query), 'relationship'],
    [codeLike, 'code identifier'],
  ];
  const detectedPatterns = signals.filter(([fired]) => fired).map(([, label]) => label);

  const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 'sufficient' : 'insufficient'})`;
  return { mode, confidence, reasoning };
}
/**
 * Check whether an executable can be found on PATH by running `where` (Windows)
 * or `which` (other platforms) and looking at whether the command succeeds.
 *
 * @param toolName - Tool executable name
 * @returns True if the lookup command exited successfully, false otherwise
 */
function checkToolAvailability(toolName: string): boolean {
  const locator = process.platform === 'win32' ? 'where' : 'which';
  let found = true;
  try {
    // execSync throws when the lookup command exits with a non-zero status.
    execSync(`${locator} ${toolName}`, { stdio: 'ignore' });
  } catch {
    found = false;
  }
  return found;
}
/**
 * Build the ripgrep command line for a content search.
 *
 * Supports tokenized multi-word queries: when tokenization yields more than one token,
 * each token becomes its own `-e` pattern, which ripgrep treats as OR matching. Otherwise
 * the original query is searched as a whole (regex via `-e`, literal via `-F -e`).
 *
 * The pattern is always passed through `-e` and the paths are placed after `--`, so a
 * query or path that begins with "-" is never misread by ripgrep as a command-line flag.
 *
 * Note that `--max-count` limits matches per file, not the overall number of matches.
 *
 * @param params - Search parameters
 * @returns Command name, argument list, and the tokens derived from the query
 */
function buildRipgrepCommand(params: {
  query: string;
  paths: string[];
  contextLines: number;
  maxResults: number;
  includeHidden: boolean;
  regex?: boolean;
  caseSensitive?: boolean;
  tokenize?: boolean;
}): { command: string; args: string[]; tokens: string[] } {
  const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true, tokenize = true } = params;
  const args = [
    '-n',
    '--color=never',
    '--json',
  ];

  // Add file filtering (unless includeHidden is true)
  if (!includeHidden) {
    args.push(...buildExcludeArgs());
  }
  // Case sensitivity
  if (!caseSensitive) {
    args.push('--ignore-case');
  }
  if (contextLines > 0) {
    args.push('-C', contextLines.toString());
  }
  if (maxResults > 0) {
    args.push('--max-count', maxResults.toString());
  }
  if (includeHidden) {
    args.push('--hidden');
  }

  // Tokenize query for multi-word OR matching
  const tokens = tokenize ? tokenizeQuery(query) : [query];
  if (tokens.length > 1) {
    // Multi-token: one -e pattern per token (OR matching).
    // Outside regex mode each token is escaped so it matches literally.
    for (const token of tokens) {
      const pattern = regex ? token : token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      args.push('-e', pattern);
    }
  } else if (regex) {
    // Single token or no tokenization: search for the query as given
    args.push('-e', query);
  } else {
    // -F makes the pattern literal; -e keeps a leading "-" in the query from being parsed as a flag
    args.push('-F', '-e', query);
  }

  // "--" ends option parsing so that paths starting with "-" are treated as paths
  args.push('--', ...paths);
  return { command: 'rg', args, tokens };
}
/**
 * Action: init - build a CodexLens FTS index for `params.path` (no embeddings).
 *
 * Runs `index init <path> --no-embeddings`, optionally restricted to `params.languages`,
 * and reports the latest progress plus the last five progress updates in the metadata.
 * For semantic/vector search, use ccw view dashboard or codexlens CLI directly.
 */
async function executeInitAction(params: Params): Promise<SearchResult> {
  const { path = '.', languages } = params;

  const readiness = await ensureCodexLensReady();
  if (!readiness.ready) {
    return {
      success: false,
      error: `CodexLens not available: ${readiness.error}. CodexLens will be auto-installed on first use.`,
    };
  }

  // FTS-only index: skipping embeddings keeps indexing fast.
  const cliArgs = ['index', 'init', path, '--no-embeddings'];
  if (languages && languages.length > 0) {
    cliArgs.push('--language', languages.join(','));
  }

  const history: ProgressInfo[] = [];
  let latest: ProgressInfo | null = null;
  const outcome = await executeCodexLens(cliArgs, {
    cwd: path,
    timeout: 1800000, // 30 minutes for large codebases
    onProgress: (progress: ProgressInfo) => {
      history.push(progress);
      latest = progress;
    },
  });

  const metadata: SearchMetadata = { action: 'init', path };
  if (latest !== null) {
    // The cast is needed because `latest` is assigned inside the callback above.
    const { stage, message, percent, filesProcessed, totalFiles } = latest as ProgressInfo;
    metadata.progress = { stage, message, percent, filesProcessed, totalFiles };
  }
  if (history.length > 0) {
    metadata.progressHistory = history.slice(-5);
  }

  let message: string | undefined;
  if (outcome.success) {
    message = `FTS index created for ${path}. Note: For semantic/vector search, create vector index via "ccw view" dashboard or run "codexlens init ${path}" (without --no-embeddings).`;
  }

  return {
    success: outcome.success,
    error: outcome.error,
    message,
    metadata,
  };
}
/**
 * Action: status - report the CodexLens index status for `params.path`.
 *
 * Always succeeds; the message is the status warning when there is one, otherwise a
 * one-line summary of index and embeddings availability.
 */
async function executeStatusAction(params: Params): Promise<SearchResult> {
  const { path = '.' } = params;
  const status = await checkIndexStatus(path);

  const indexLabel = status.indexed ? 'indexed' : 'not indexed';
  const embeddingsLabel = status.has_embeddings ? 'available' : 'not available';
  const summary = `Index status: ${indexLabel}, embeddings: ${embeddingsLabel}`;

  return {
    success: true,
    status,
    message: status.warning || summary,
  };
}
/**
 * Action: update - incremental index update for `params.path`.
 *
 * Requires CodexLens to be ready and the directory to be indexed already. Runs
 * `index init <path>` (no --force), optionally restricted to `params.languages`, and
 * reports the latest progress plus the last five progress updates in the metadata.
 */
async function executeUpdateAction(params: Params): Promise<SearchResult> {
  const { path = '.', languages } = params;

  const readiness = await ensureCodexLensReady();
  if (!readiness.ready) {
    return {
      success: false,
      error: `CodexLens not available: ${readiness.error}`,
    };
  }

  // An incremental update only makes sense on top of an existing index.
  const currentIndex = await checkIndexStatus(path);
  if (!currentIndex.indexed) {
    return {
      success: false,
      error: `Directory not indexed. Run smart_search(action="init") first.`,
    };
  }

  const cliArgs = ['index', 'init', path];
  if (languages && languages.length > 0) {
    cliArgs.push('--language', languages.join(','));
  }

  const history: ProgressInfo[] = [];
  let latest: ProgressInfo | null = null;
  const outcome = await executeCodexLens(cliArgs, {
    cwd: path,
    timeout: 600000, // 10 minutes for incremental updates
    onProgress: (progress: ProgressInfo) => {
      history.push(progress);
      latest = progress;
    },
  });

  const metadata: SearchMetadata = { action: 'update', path };
  if (latest !== null) {
    // The cast is needed because `latest` is assigned inside the callback above.
    const { stage, message, percent, filesProcessed, totalFiles } = latest as ProgressInfo;
    metadata.progress = { stage, message, percent, filesProcessed, totalFiles };
  }
  if (history.length > 0) {
    metadata.progressHistory = history.slice(-5);
  }

  let message: string | undefined;
  if (outcome.success) {
    message = `Incremental update completed for ${path}`;
  }

  return {
    success: outcome.success,
    error: outcome.error,
    message,
    metadata,
  };
}
/**
 * Action: watch - start the CodexLens file watcher for `params.path`.
 *
 * Requires CodexLens to be ready and the directory to be indexed. Invokes
 * `watch <path> --debounce <ms>` with a 5 second timeout and then reports success.
 *
 * NOTE(review): the outcome of the watch command is not inspected, so success is
 * reported unconditionally — confirm whether failures during the startup window
 * should be surfaced to the caller.
 */
async function executeWatchAction(params: Params): Promise<SearchResult> {
  const { path = '.', languages, debounce = 1000 } = params;

  const readiness = await ensureCodexLensReady();
  if (!readiness.ready) {
    return {
      success: false,
      error: `CodexLens not available: ${readiness.error}`,
    };
  }

  const currentIndex = await checkIndexStatus(path);
  if (!currentIndex.indexed) {
    return {
      success: false,
      error: `Directory not indexed. Run smart_search(action="init") first.`,
    };
  }

  const watchArgs = ['watch', path, '--debounce', debounce.toString()];
  if (languages && languages.length > 0) {
    watchArgs.push('--language', languages.join(','));
  }

  // Short timeout: this call only covers the watcher's initial startup.
  await executeCodexLens(watchArgs, {
    cwd: path,
    timeout: 5000,
  });

  return {
    success: true,
    message: `File watcher started for ${path}. Use Ctrl+C or kill the process to stop.`,
    metadata: {
      action: 'watch',
      path,
      note: 'Watcher runs in background. Changes are indexed automatically with debounce.',
    },
  };
}
/**
 * Mode: fuzzy - FTS + ripgrep fusion with RRF ranking.
 *
 * Runs the CodexLens exact (FTS) search and the ripgrep search in parallel, fuses whatever
 * succeeded with equal weights via applyRRFFusion, normalizes the fused items to a common
 * shape, and splits them into full-content results and extra file paths.
 *
 * Fails only when neither backend produced results; in that case the error lists the
 * reason for each backend, whether it rejected or resolved with `success: false`.
 */
async function executeFuzzyMode(params: Params): Promise<SearchResult> {
  const { query, maxResults = 5, extraFilesCount = 10 } = params;
  if (!query) {
    return {
      success: false,
      error: 'Query is required for search',
    };
  }

  const timer = createTimer();

  // Run both searches in parallel; allSettled keeps one failure from hiding the other result.
  const [ftsResult, ripgrepResult] = await Promise.allSettled([
    executeCodexLensExactMode(params),
    executeRipgrepMode(params),
  ]);
  timer.mark('parallel_search');

  // Collect results from every backend that succeeded
  const resultsMap = new Map<string, any[]>();
  if (ftsResult.status === 'fulfilled' && ftsResult.value.success && ftsResult.value.results) {
    resultsMap.set('exact', ftsResult.value.results as any[]);
  }
  if (ripgrepResult.status === 'fulfilled' && ripgrepResult.value.success && ripgrepResult.value.results) {
    resultsMap.set('ripgrep', ripgrepResult.value.results as any[]);
  }

  // If both failed, return an error that explains each failure
  if (resultsMap.size === 0) {
    // A backend can fail by rejecting or by resolving with success=false; report both kinds.
    const describeFailure = (label: string, settled: PromiseSettledResult<SearchResult>): string => {
      if (settled.status === 'rejected') {
        return `${label}: ${settled.reason}`;
      }
      return `${label}: ${settled.value.error || 'no results returned'}`;
    };
    const errors = [
      describeFailure('FTS', ftsResult),
      describeFailure('Ripgrep', ripgrepResult),
    ];
    return {
      success: false,
      error: `Both search backends failed: ${errors.join('; ')}`,
    };
  }

  // Apply RRF fusion with fuzzy-optimized weights (balanced between exact and ripgrep)
  const fusionWeights = { exact: 0.5, ripgrep: 0.5 };
  const totalToFetch = maxResults + extraFilesCount;
  const fusedResults = applyRRFFusion(resultsMap, fusionWeights, totalToFetch);
  timer.mark('rrf_fusion');

  // Normalize results format (backends name the file field either `file` or `path`)
  const normalizedResults = fusedResults.map((item: any) => ({
    file: item.file || item.path,
    line: item.line || 0,
    column: item.column || 0,
    content: item.content || '',
    score: item.fusion_score || 0,
    matchCount: item.matchCount,
    matchScore: item.matchScore,
  }));

  // Split results: first N with full content, rest as file paths only
  const { results, extra_files } = splitResultsWithExtraFiles(normalizedResults, maxResults, extraFilesCount);

  // Log timing
  timer.log();
  const timings = timer.getTimings();

  return {
    success: true,
    results,
    extra_files: extra_files.length > 0 ? extra_files : undefined,
    metadata: {
      mode: 'fuzzy',
      backend: 'fts+ripgrep',
      count: results.length,
      query,
      note: `Fuzzy search using RRF fusion of FTS and ripgrep (weights: exact=${fusionWeights.exact}, ripgrep=${fusionWeights.ripgrep})`,
      timing: TIMING_ENABLED ? timings : undefined,
    },
  };
}
/**
 * Mode: auto - classify the query and dispatch to the matching search mode.
 *
 * Routes to hybrid, CodexLens exact, or ripgrep according to classifyIntent(); any other
 * classification falls back to ripgrep. When the chosen mode returns metadata, the
 * classification details and index status are added to it.
 */
async function executeAutoMode(params: Params): Promise<SearchResult> {
  const { query, path = '.' } = params;
  if (!query) {
    return {
      success: false,
      error: 'Query is required for search action',
    };
  }

  // The classification depends on whether an index exists and on embeddings coverage.
  const indexStatus = await checkIndexStatus(path);
  const classification = classifyIntent(query, indexStatus.indexed, indexStatus.has_embeddings);

  let result: SearchResult;
  if (classification.mode === 'hybrid') {
    result = await executeHybridMode(params);
  } else if (classification.mode === 'exact') {
    result = await executeCodexLensExactMode(params);
  } else {
    // 'ripgrep' and any unexpected mode
    result = await executeRipgrepMode(params);
  }

  const metadata = result.metadata;
  if (!metadata) {
    return result;
  }

  metadata.classified_as = classification.mode;
  metadata.confidence = classification.confidence;
  metadata.reasoning = classification.reasoning;
  metadata.embeddings_coverage_percent = indexStatus.embeddings_coverage_percent;
  if (!indexStatus.indexed) {
    metadata.index_status = 'not_indexed';
  } else if (indexStatus.has_embeddings) {
    metadata.index_status = 'indexed';
  } else {
    metadata.index_status = 'partial';
  }
  if (indexStatus.warning) {
    metadata.warning = indexStatus.warning;
  }
  return result;
}
/**
* Mode: ripgrep - Fast literal string matching using ripgrep
* No index required, fallback to CodexLens if ripgrep unavailable
* Supports tokenized multi-word queries with OR matching and result ranking
*/
async function executeRipgrepMode(params: Params): Promise<SearchResult> {
const { query, paths = [], contextLines = 0, maxResults = 5, extraFilesCount = 10, maxContentLength = 200, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true } = params;
if (!query) {
return {
success: false,
error: 'Query is required for search',
};
}
// Check if ripgrep is available
const hasRipgrep = checkToolAvailability('rg');
// Calculate total to fetch for split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
// If ripgrep not available, fall back to CodexLens exact mode
if (!hasRipgrep) {
const readyStatus = await ensureCodexLensReady();
if (!readyStatus.ready) {
return {
success: false,
error: 'Neither ripgrep nor CodexLens available. Install ripgrep (rg) or CodexLens for search functionality.',
};
}
// Use CodexLens fts mode as fallback
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--json'];
const result = await executeCodexLens(args, { cwd: path });
if (!result.success) {
return {
success: false,
error: result.error,
metadata: {
mode: 'ripgrep',
backend: 'codexlens-fallback',
count: 0,
query,
},
};
}
// Parse results
let allResults: SemanticMatch[] = [];
try {
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
const data = parsed.result?.results || parsed.results || parsed;
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
file: item.path || item.file,
score: item.score || 0,
content: truncateContent(item.content || item.excerpt, maxContentLength),
symbol: item.symbol || null,
}));
} catch {
// Keep empty results
}
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
return {
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'ripgrep',
backend: 'codexlens-fallback',
count: results.length,
query,
note: 'Using CodexLens exact mode (ripgrep not available)',
},
};
}
// Use ripgrep - request more results to support split
const { command, args, tokens } = buildRipgrepCommand({
query,
paths: paths.length > 0 ? paths : [path],
contextLines,
maxResults: totalToFetch, // Fetch more to support split
includeHidden,
regex,
caseSensitive,
tokenize,
});
return new Promise((resolve) => {
const child = spawn(command, args, {
cwd: path || getProjectRoot(),
stdio: ['ignore', 'pipe', 'pipe'],
});
let stdout = '';
let stderr = '';
let resultLimitReached = false;
child.stdout.on('data', (data) => {
stdout += data.toString();
});
child.stderr.on('data', (data) => {
stderr += data.toString();
});
child.on('close', (code) => {
const allResults: ExactMatch[] = [];
const lines = stdout.split('\n').filter((line) => line.trim());
// Limit total results to prevent memory overflow (--max-count only limits per-file)
const effectiveLimit = totalToFetch > 0 ? totalToFetch : 500;
for (const line of lines) {
// Stop collecting if we've reached the limit
if (allResults.length >= effectiveLimit) {
resultLimitReached = true;
break;
}
try {
const item = JSON.parse(line);
if (item.type === 'match') {
const match: ExactMatch = {
file: item.data.path.text,
line: item.data.line_number,
column:
item.data.submatches && item.data.submatches[0]
? item.data.submatches[0].start + 1
: 1,
content: item.data.lines.text.trim(),
};
allResults.push(match);
}
} catch {
continue;
}
}
// Handle Windows device file errors gracefully (os error 1)
// If we have results despite the error, return them as partial success
const isWindowsDeviceError = stderr.includes('os error 1') || stderr.includes('函数不正确');
// Apply token-based scoring and sorting for multi-word queries
// Results matching more tokens are ranked higher (exact matches first)
const scoredResults = tokens.length > 1 ? scoreByTokenMatch(allResults, tokens) : allResults;
if (code === 0 || code === 1 || (isWindowsDeviceError && scoredResults.length > 0)) {
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(scoredResults, maxResults, extraFilesCount);
// Build warning message for various conditions
const warnings: string[] = [];
if (resultLimitReached) {
warnings.push(`Result limit reached (${effectiveLimit}). Use a more specific query or increase limit.`);
}
if (isWindowsDeviceError) {
warnings.push('Some Windows device files were skipped');
}
resolve({
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'ripgrep',
backend: 'ripgrep',
count: results.length,
query,
tokens: tokens.length > 1 ? tokens : undefined, // Include tokens in metadata for debugging
tokenized: tokens.length > 1,
...(warnings.length > 0 && { warning: warnings.join('; ') }),
},
});
} else if (isWindowsDeviceError && allResults.length === 0) {
// Windows device error but no results - might be the only issue
resolve({
success: true,
results: [],
metadata: {
mode: 'ripgrep',
backend: 'ripgrep',
count: 0,
query,
warning: 'No matches found (some Windows device files were skipped)',
},
});
} else {
resolve({
success: false,
error: `ripgrep execution failed with code ${code}: ${stderr}`,
results: [],
});
}
});
child.on('error', (error) => {
resolve({
success: false,
error: `Failed to spawn ripgrep: ${error.message}`,
results: [],
});
});
});
}
/**
 * Mode: exact - CodexLens full-text (FTS) search. Requires an index.
 *
 * Runs a `--method fts` query first. If that parses to zero matches, the same
 * query is retried once with `--use-fuzzy`, and a non-empty fuzzy result is
 * returned with `fallback: 'fuzzy'` in its metadata.
 *
 * @param params - Search parameters; `query` is mandatory.
 * @returns A result whose first `maxResults` entries carry content and whose
 *          remainder (up to `extraFilesCount`) is reported as `extra_files`.
 */
async function executeCodexLensExactMode(params: Params): Promise<SearchResult> {
  const {
    query,
    path = '.',
    maxResults = 5,
    extraFilesCount = 10,
    maxContentLength = 200,
    enrich = false,
    excludeExtensions,
    codeOnly = false,
    offset = 0,
  } = params;

  if (!query) {
    return { success: false, error: 'Query is required for search' };
  }

  // CodexLens must be usable before anything else is attempted.
  const readyStatus = await ensureCodexLensReady();
  if (!readyStatus.ready) {
    return { success: false, error: `CodexLens not available: ${readyStatus.error}` };
  }

  const indexStatus = await checkIndexStatus(path);

  // Over-fetch so the tail of the list can be reported as path-only extras.
  const totalToFetch = maxResults + extraFilesCount;

  // Builds the CLI argument list; the fuzzy variant adds --use-fuzzy right before --json.
  const buildArgs = (fuzzy: boolean): string[] => {
    const cliArgs = [
      'search', query,
      '--limit', totalToFetch.toString(),
      '--offset', offset.toString(),
      '--method', 'fts',
    ];
    if (fuzzy) cliArgs.push('--use-fuzzy');
    cliArgs.push('--json');
    if (enrich) cliArgs.push('--enrich');
    if (codeOnly) cliArgs.push('--code-only');
    if (excludeExtensions && excludeExtensions.length > 0) {
      cliArgs.push('--exclude-extensions', excludeExtensions.join(','));
    }
    return cliArgs;
  };

  // Turns raw CLI output into matches; unparsable output yields an empty list.
  const parseMatches = (output: string | null | undefined): SemanticMatch[] => {
    try {
      const parsed = JSON.parse(stripAnsi(output || '{}'));
      const data = parsed.result?.results || parsed.results || parsed;
      const items: any[] = Array.isArray(data) ? data : [];
      return items.map((item: any) => ({
        file: item.path || item.file,
        score: item.score || 0,
        content: truncateContent(item.content || item.excerpt, maxContentLength),
        symbol: item.symbol || null,
      }));
    } catch {
      return [];
    }
  };

  const exactRun = await executeCodexLens(buildArgs(false), { cwd: path });
  if (!exactRun.success) {
    return {
      success: false,
      error: exactRun.error,
      metadata: {
        mode: 'exact',
        backend: 'codexlens',
        count: 0,
        query,
        warning: indexStatus.warning,
      },
    };
  }

  let allResults = parseMatches(exactRun.output);

  // Nothing matched exactly: give the fuzzy variant of the same query one try.
  if (allResults.length === 0) {
    const fuzzyRun = await executeCodexLens(buildArgs(true), { cwd: path });
    if (fuzzyRun.success) {
      allResults = parseMatches(fuzzyRun.output);
      if (allResults.length > 0) {
        const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
        return {
          success: true,
          results,
          extra_files: extra_files.length > 0 ? extra_files : undefined,
          metadata: {
            mode: 'exact',
            backend: 'codexlens',
            count: results.length,
            query,
            warning: indexStatus.warning,
            note: 'No exact matches found, showing fuzzy results',
            fallback: 'fuzzy',
          },
        };
      }
    }
  }

  // Exact matches, or an empty list when both attempts came back empty.
  const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
  return {
    success: true,
    results,
    extra_files: extra_files.length > 0 ? extra_files : undefined,
    metadata: {
      mode: 'exact',
      backend: 'codexlens',
      count: results.length,
      query,
      warning: indexStatus.warning,
    },
  };
}
/**
 * Mode: hybrid - Best quality semantic search.
 * Uses the CodexLens `dense_rerank` method (dense coarse + cross-encoder rerank).
 * Requires an index with embeddings.
 *
 * After the CLI call the raw matches go through a post-processing pipeline:
 * baseline ("hot spot") filtering, noisy-file filtering, keyword boosting,
 * score-diversity penalties, and a final sort by adjusted score.
 *
 * @param params - Search parameters; `query` is mandatory.
 * @returns Ranked matches split into full results and path-only `extra_files`.
 *          Unparsable CLI output yields `success: true` with an empty result
 *          list, the raw output, and a warning.
 */
async function executeHybridMode(params: Params): Promise<SearchResult> {
  const timer = createTimer();
  const {
    query,
    path = '.',
    maxResults = 5,
    extraFilesCount = 10,
    maxContentLength = 200,
    enrich = false,
    excludeExtensions,
    codeOnly = false,
    offset = 0,
  } = params;

  if (!query) {
    return {
      success: false,
      error: 'Query is required for search',
    };
  }

  // Check CodexLens availability
  const readyStatus = await ensureCodexLensReady();
  timer.mark('codexlens_ready_check');
  if (!readyStatus.ready) {
    return {
      success: false,
      error: `CodexLens not available: ${readyStatus.error}`,
    };
  }

  // Check index status
  const indexStatus = await checkIndexStatus(path);
  timer.mark('index_status_check');

  // Request more results to support split (full content + extra files)
  const totalToFetch = maxResults + extraFilesCount;
  const args = ['search', query, '--limit', totalToFetch.toString(), '--offset', offset.toString(), '--method', 'dense_rerank', '--json'];
  if (enrich) {
    args.push('--enrich');
  }
  // Add code_only filter if requested
  if (codeOnly) {
    args.push('--code-only');
  }
  // Add exclude_extensions filter if provided
  if (excludeExtensions && excludeExtensions.length > 0) {
    args.push('--exclude-extensions', excludeExtensions.join(','));
  }

  const result = await executeCodexLens(args, { cwd: path });
  timer.mark('codexlens_search');
  if (!result.success) {
    timer.log();
    return {
      success: false,
      error: result.error,
      metadata: {
        mode: 'hybrid',
        backend: 'codexlens',
        count: 0,
        query,
        warning: indexStatus.warning,
      },
    };
  }

  // Parse results
  let allResults: SemanticMatch[] = [];
  let baselineInfo: { score: number; count: number } | null = null;
  try {
    const parsed = JSON.parse(stripAnsi(result.output || '{}'));
    const data = parsed.result?.results || parsed.results || parsed;
    allResults = (Array.isArray(data) ? data : []).map((item: any) => {
      const rawScore = item.score || 0;
      // Hybrid mode returns distance scores (lower is better).
      // Convert to similarity scores (higher is better) for consistency.
      // Formula: similarity = 1 / (1 + distance)
      const similarityScore = rawScore > 0 ? 1 / (1 + rawScore) : 1;
      return {
        file: item.path || item.file,
        score: similarityScore,
        content: truncateContent(item.content || item.excerpt, maxContentLength),
        symbol: item.symbol || null,
      };
    });
    timer.mark('parse_results');

    // Post-processing pipeline to improve semantic search quality
    // 0. Filter dominant baseline scores (hot spot detection)
    const baselineResult = filterDominantBaselineScores(allResults);
    allResults = baselineResult.filteredResults;
    baselineInfo = baselineResult.baselineInfo;
    // 1. Filter noisy files (coverage, node_modules, etc.) and excluded extensions
    allResults = filterNoisyFiles(allResults, { excludeExtensions, codeOnly });
    // 2. Boost results containing query keywords
    allResults = applyKeywordBoosting(allResults, query);
    // 3. Enforce score diversity (penalize identical scores)
    allResults = enforceScoreDiversity(allResults);
    // 4. Re-sort by adjusted scores
    allResults.sort((a, b) => b.score - a.score);
    timer.mark('post_processing');
  } catch {
    // Keep timing output consistent with the other exits taken after the search ran.
    timer.log();
    return {
      success: true,
      results: [],
      output: result.output,
      metadata: {
        mode: 'hybrid',
        backend: 'codexlens',
        count: 0,
        query,
        warning: indexStatus.warning || 'Failed to parse JSON output',
      },
    };
  }

  // Split results: first N with full content, rest as file paths only
  const { results, extra_files } = splitResultsWithExtraFiles(allResults, maxResults, extraFilesCount);
  timer.mark('split_results');

  // Build metadata with baseline info if detected
  let note = 'Using dense_rerank (dense coarse + cross-encoder rerank) for semantic search';
  if (baselineInfo) {
    // Use the count reported by the baseline filter itself: allResults has also
    // been through filterNoisyFiles by now, so a length difference would
    // over-report the number of hot-spot results.
    note += ` | Filtered ${baselineInfo.count} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
  }

  // Log timing data
  timer.log();
  const timings = timer.getTimings();
  return {
    success: true,
    results,
    extra_files: extra_files.length > 0 ? extra_files : undefined,
    metadata: {
      mode: 'hybrid',
      backend: 'codexlens',
      count: results.length,
      query,
      note,
      warning: indexStatus.warning,
      suggested_weights: getRRFWeights(query),
      timing: TIMING_ENABLED ? timings : undefined,
    },
  };
}
/**
 * Query intent used to adapt RRF weights (Python parity).
 *
 * - `keyword`: the query looks like code or an identifier.
 * - `semantic`: the query looks like natural language.
 * - `mixed`: both kinds of signal are present, or neither is.
 *
 * Keep this logic aligned with CodexLens Python hybrid search:
 * `codex-lens/src/codexlens/search/hybrid_search.py`
 */
export type QueryIntent = 'keyword' | 'semantic' | 'mixed';
/**
 * Base per-source weights used when the caller supplies none.
 * Python default: vector 60%, exact 30%, fuzzy 10%.
 */
const DEFAULT_RRF_WEIGHTS = {
exact: 0.3,
fuzzy: 0.1,
vector: 0.6,
} as const;
/**
 * Rescale a weight table so its values add up to 1.
 *
 * @param weights - Source name → weight.
 * @returns A new table with every weight divided by the total. When the total
 *          is not a finite positive number, a plain copy is returned instead.
 */
function normalizeWeights(weights: Record<string, number>): Record<string, number> {
  let total = 0;
  for (const value of Object.values(weights)) {
    total += value;
  }

  const canScale = Number.isFinite(total) && total > 0;
  if (!canScale) {
    return { ...weights };
  }

  const scaled = Object.entries(weights).map(
    ([name, value]): [string, number] => [name, value / total],
  );
  return Object.fromEntries(scaled);
}
/**
 * Classify a query by the heuristic signals it contains (same signals as Python):
 * - Code signals: `.`, `::`, `->`, CamelCase, snake_case, common code keywords
 * - Natural-language signals: more than 5 words, a question mark,
 *   interrogatives, common verbs
 *
 * @param query - Raw user query.
 * @returns `'keyword'` when only code signals fire, `'semantic'` when only
 *          natural-language signals fire, `'mixed'` otherwise (both, neither,
 *          or a blank query).
 */
export function detectQueryIntent(query: string): QueryIntent {
  const text = query.trim();
  if (text.length === 0) return 'mixed';

  const codePatterns: RegExp[] = [
    /(::|->|\.)/,
    /[A-Z][a-z]+[A-Z]/,
    /\b\w+_\w+\b/,
  ];
  const codeKeywords = /\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b/i;
  const looksLikeCode =
    codePatterns.some((pattern) => pattern.test(text)) || codeKeywords.test(text.toLowerCase());

  const naturalPatterns: RegExp[] = [
    /\?/,
    /\b(how|what|why|when|where)\b/i,
    /\b(handle|explain|fix|implement|create|build|use|find|search|convert|parse|generate|support)\b/i,
  ];
  const words = text.split(/\s+/).filter(Boolean);
  const looksNatural =
    words.length > 5 || naturalPatterns.some((pattern) => pattern.test(text));

  // Agreement in either direction (both or neither) gives no clear lean.
  if (looksLikeCode === looksNatural) return 'mixed';
  return looksLikeCode ? 'keyword' : 'semantic';
}
/**
 * Intent → weights mapping (Python parity).
 * - keyword: exact-heavy preset
 * - semantic: vector-heavy preset
 * - mixed: the caller's base weights
 *
 * @param intent - Detected query intent.
 * @param baseWeights - Weights used for the `mixed` intent.
 * @returns The selected weight table after normalization.
 */
export function adjustWeightsByIntent(
  intent: QueryIntent,
  baseWeights: Record<string, number>,
): Record<string, number> {
  switch (intent) {
    case 'keyword':
      return normalizeWeights({ exact: 0.5, fuzzy: 0.1, vector: 0.4 });
    case 'semantic':
      return normalizeWeights({ exact: 0.2, fuzzy: 0.1, vector: 0.7 });
    default:
      return normalizeWeights({ ...baseWeights });
  }
}
/**
 * Resolve the RRF weights to use for a query.
 *
 * @param query - Raw user query; its detected intent selects the weight preset.
 * @param baseWeights - Weights used when the intent is `mixed`.
 * @returns Normalized source → weight table.
 */
export function getRRFWeights(
  query: string,
  baseWeights: Record<string, number> = DEFAULT_RRF_WEIGHTS,
): Record<string, number> {
  const intent = detectQueryIntent(query);
  return adjustWeightsByIntent(intent, baseWeights);
}
/**
* Post-processing: Filter noisy files from semantic search results
* Uses FILTER_CONFIG patterns to remove irrelevant files.
* Optimized: pre-compiled regexes, accurate path segment matching.
*/
// File-exclusion patterns compiled once at module load (not per result).
// Each pattern is regex-escaped, then every `*` becomes `.*`, and the whole
// expression is anchored so it has to match the complete string it is tested on.
const FILE_EXCLUDE_REGEXES = Array.from(FILTER_CONFIG.exclude_files, (pattern) => {
  const escaped = pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const withWildcards = escaped.replace(/\\\*/g, '.*');
  return new RegExp('^' + withWildcards + '$');
});
/**
 * Extensions treated as non-code (docs, config, markup, images, office
 * documents, lock/checksum files). Entries are lowercase with no leading dot.
 * They are added to the exclusion set when the `codeOnly` filter is enabled.
 */
const NON_CODE_EXTENSIONS = new Set([
'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log',
'ini', 'cfg', 'conf', 'toml', 'env', 'properties',
'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp',
'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
'lock', 'sum', 'mod',
]);
/** Options accepted by the noisy-file filter. */
interface FilterOptions {
/** Extensions to drop; matched case-insensitively, a leading dot is optional. */
excludeExtensions?: string[];
/** When true, every extension in NON_CODE_EXTENSIONS is excluded as well. */
codeOnly?: boolean;
}
/**
 * Remove results that point at noisy or explicitly excluded files.
 *
 * A result is dropped when any path segment is an excluded directory, when its
 * filename matches one of the pre-compiled exclusion patterns, or when its
 * extension is in the exclusion set. Results without a file path are kept.
 *
 * @param results - Matches to filter; the input array is not modified.
 * @param options - Extra extension filters (`excludeExtensions`, `codeOnly`).
 * @returns A new array with only the surviving matches.
 */
function filterNoisyFiles(results: SemanticMatch[], options: FilterOptions = {}): SemanticMatch[] {
  const { excludeExtensions = [], codeOnly = false } = options;

  // Build extension filter set (lowercase, no leading dot)
  const excludedExtSet = new Set(excludeExtensions.map(ext => ext.toLowerCase().replace(/^\./, '')));
  if (codeOnly) {
    NON_CODE_EXTENSIONS.forEach(ext => excludedExtSet.add(ext));
  }

  return results.filter(r => {
    const filePath = r.file || '';
    if (!filePath) return true;

    const segments = filePath.split(/[/\\]/);

    // Accurate directory check: segment must exactly match excluded directory
    if (segments.some(segment => FILTER_CONFIG.exclude_directories.has(segment))) {
      return false;
    }

    // Accurate file check: pattern matches filename only (not full path)
    const filename = segments[segments.length - 1] || '';
    if (FILE_EXCLUDE_REGEXES.some(regex => regex.test(filename))) {
      return false;
    }

    // Extension filter check. A name without any dot has no extension; without
    // this guard a file called e.g. `log` or `lock` would be treated as having
    // the extension "log"/"lock" and be dropped by the filter.
    if (excludedExtSet.size > 0 && filename.includes('.')) {
      const ext = filename.split('.').pop()?.toLowerCase() || '';
      if (excludedExtSet.has(ext)) {
        return false;
      }
    }

    return true;
  });
}
/**
* Post-processing: Boost results containing query keywords
* Extracts keywords from query and boosts matching results.
* Optimized: uses whole-word matching with regex for accuracy.
*/
/**
 * Escape every regex metacharacter in `str` so it can be embedded in a
 * `RegExp` source and match literally.
 */
function escapeRegExp(str: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metaChars, (ch) => `\\${ch}`);
}
/**
 * Boost the score of results whose content or file path mentions query keywords.
 *
 * Keywords are the lowercased query tokens longer than two characters that are
 * not stop words. Each keyword is matched as a whole word, case-insensitively.
 * A result matching a fraction `f` of the keywords has its score multiplied by
 * `1 + 0.3 * f`; results matching none are returned unchanged.
 *
 * @param results - Matches to re-score; the input objects are not mutated.
 * @param query - Raw user query.
 * @returns The input array itself when the query has no usable keywords,
 *          otherwise a new array of (possibly re-scored) matches.
 */
function applyKeywordBoosting(results: SemanticMatch[], query: string): SemanticMatch[] {
  // Common words that carry no search signal.
  const stopWords = new Set([
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
    'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used',
    'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as',
    'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
    'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or',
    'because', 'until', 'while', 'although', 'though', 'after', 'before', 'when', 'whenever',
    'where', 'wherever', 'whether', 'which', 'who', 'whom', 'whose', 'what', 'whatever',
    'whichever', 'whoever', 'whomever', 'this', 'that', 'these', 'those', 'it', 'its',
  ]);

  const tokens = query.toLowerCase().split(/[\s,.;:()"{}[\]-]+/);
  const keywords = tokens.filter(token => token.length > 2 && !stopWords.has(token));
  if (keywords.length === 0) return results;

  // One whole-word, case-insensitive matcher per keyword.
  const matchers = keywords.map(kw => new RegExp(`\\b${escapeRegExp(kw)}\\b`, 'i'));

  return results.map(entry => {
    const content = entry.content || '';
    const file = entry.file || '';

    const matchCount = matchers.reduce(
      (hits, matcher) => (matcher.test(content) || matcher.test(file) ? hits + 1 : hits),
      0,
    );
    if (matchCount === 0) return entry;

    const matchRatio = matchCount / keywords.length;
    const boost = 1 + (matchRatio * 0.3); // Up to 30% boost for full match
    return { ...entry, score: entry.score * boost };
  });
}
/**
 * Post-processing: Enforce score diversity.
 *
 * Scores are bucketed by rounding to 3 decimal places. Every result whose
 * bucket holds more than two results has its score multiplied by
 * `max(0.7, 1 - 0.05 * bucketSize)`; all other results pass through as-is.
 *
 * @param results - Matches to adjust; the input objects are not mutated.
 * @returns The input array itself when it has fewer than two entries,
 *          otherwise a new array.
 */
function enforceScoreDiversity(results: SemanticMatch[]): SemanticMatch[] {
  if (results.length <= 1) return results;

  const bucketOf = (score: number): number => Math.round(score * 1000) / 1000;

  // How many results share each rounded score.
  const tally = new Map<number, number>();
  results.forEach(entry => {
    const bucket = bucketOf(entry.score);
    tally.set(bucket, (tally.get(bucket) || 0) + 1);
  });

  return results.map(entry => {
    const duplicates = tally.get(bucketOf(entry.score)) || 1;
    if (duplicates <= 2) return entry;

    // Larger clusters are penalised harder, capped at a 30% reduction.
    const penalty = Math.max(0.7, 1 - (duplicates * 0.05));
    return { ...entry, score: entry.score * penalty };
  });
}
/**
 * Post-processing: Filter results with a dominant baseline score (hot spot detection).
 *
 * Scores are compared after rounding to 4 decimal places. When the most common
 * score covers more than half of the results and is above 0.9, every result
 * carrying it is removed; such a cluster is taken to be placeholder output
 * rather than real ranking. Lists with fewer than 4 results are never filtered.
 *
 * @param results - Matches to inspect; the input array is not modified.
 * @returns `filteredResults` (the original array when nothing was removed) and
 *          `baselineInfo` with the removed score and how many results had it,
 *          or `null` when no baseline was detected.
 */
function filterDominantBaselineScores(
  results: SemanticMatch[]
): { filteredResults: SemanticMatch[]; baselineInfo: { score: number; count: number } | null } {
  const BASELINE_THRESHOLD = 0.5; // >50% of results have same score
  const HIGH_SCORE_THRESHOLD = 0.9; // Score above 0.9 is suspiciously high

  if (results.length < 4) {
    return { filteredResults: results, baselineInfo: null };
  }

  const bucketOf = (score: number): number => Math.round(score * 10000) / 10000;

  // Tally results per rounded score.
  const tally = new Map<number, number>();
  for (const entry of results) {
    const bucket = bucketOf(entry.score);
    tally.set(bucket, (tally.get(bucket) || 0) + 1);
  }

  // Pick the most frequent score; on ties the one seen first wins.
  let dominantScore: number | null = null;
  let dominantCount = 0;
  for (const [score, count] of tally) {
    if (count > dominantCount) {
      dominantCount = count;
      dominantScore = score;
    }
  }

  // Negated `>` comparisons (not `<=`) so a NaN score never counts as a baseline.
  if (
    dominantScore === null ||
    !(dominantCount > results.length * BASELINE_THRESHOLD) ||
    !(dominantScore > HIGH_SCORE_THRESHOLD)
  ) {
    return { filteredResults: results, baselineInfo: null };
  }

  const baselineScore = dominantScore;
  const filteredResults = results.filter(entry => bucketOf(entry.score) !== baselineScore);
  return {
    filteredResults,
    baselineInfo: { score: baselineScore, count: dominantCount },
  };
}
/**
 * TypeScript implementation of Reciprocal Rank Fusion.
 * Reference: codex-lens/src/codexlens/search/ranking.py
 * Formula: score(d) = Σ weight_source / (k + rank_source(d)), with 1-based ranks.
 *
 * Results are keyed by `file` (falling back to `path`); entries with neither
 * are ignored, as are sources whose weight is missing or zero. The first
 * result object seen for a path is the one returned for it.
 *
 * @param resultsMap - Source name → ranked result list (best first).
 * @param weightsOrQuery - Explicit source weights, or a query string from
 *                         which weights are derived.
 * @param limit - Maximum number of fused results to return.
 * @param k - RRF smoothing constant.
 * @returns Fused results, best first, each extended with `fusion_score` and
 *          `matched_backends`.
 */
function applyRRFFusion(
  resultsMap: Map<string, any[]>,
  weightsOrQuery: Record<string, number> | string,
  limit: number,
  k: number = 60,
): any[] {
  const weights = typeof weightsOrQuery === 'string' ? getRRFWeights(weightsOrQuery) : weightsOrQuery;
  const fused = new Map<string, { score: number; result: any; sources: string[] }>();

  for (const [source, ranked] of resultsMap) {
    const weight = weights[source] || 0;
    if (weight === 0 || !ranked) continue;

    ranked.forEach((candidate, rank) => {
      const key = candidate.file || candidate.path;
      if (!key) return;

      let entry = fused.get(key);
      if (entry === undefined) {
        entry = { score: 0, result: candidate, sources: [] };
        fused.set(key, entry);
      }

      // `rank` is 0-based here, hence the +1.
      entry.score += weight / (k + rank + 1);
      if (!entry.sources.includes(source)) {
        entry.sources.push(source);
      }
    });
  }

  // Sort by fusion score descending
  const ordered = Array.from(fused.values()).sort((a, b) => b.score - a.score);
  return ordered.slice(0, limit).map(entry => ({
    ...entry.result,
    fusion_score: entry.score,
    matched_backends: entry.sources,
  }));
}
/**
 * Promise wrapper with timeout support.
 *
 * The wrapped promise keeps running after a timeout; only the returned promise
 * is rejected. The timer is cleared as soon as the wrapped promise settles.
 *
 * @param promise - The promise to wrap
 * @param ms - Timeout in milliseconds
 * @param modeName - Name of the mode for error message
 * @returns A new promise that settles like `promise`, or rejects on timeout
 */
function withTimeout<T>(promise: Promise<T>, ms: number, modeName: string): Promise<T> {
  return new Promise<T>((resolve, reject) => {
    const onTimeout = (): void => {
      reject(new Error(`'${modeName}' search timed out after ${ms}ms`));
    };
    const handle = setTimeout(onTimeout, ms);
    const cancelTimer = (): void => clearTimeout(handle);

    promise
      .then(resolve)
      .catch(reject)
      .finally(cancelTimer);
  });
}
/**
 * Mode: priority - Fallback search strategy: hybrid -> exact -> ripgrep.
 *
 * Backends are tried one after another and the first one that succeeds with a
 * non-empty result list wins. Hybrid is attempted only when the index has
 * embeddings, exact only when an index exists; ripgrep is the last resort and
 * its result is returned whatever its outcome. Every step appends a line to
 * `metadata.fallback_history`.
 *
 * @param params - Search parameters; `query` is mandatory.
 * @returns The winning backend's result with `metadata.mode` set to
 *          `'priority'`, or a failure when the ripgrep step throws or times out.
 */
async function executePriorityFallbackMode(params: Params): Promise<SearchResult> {
  const { query, path = '.' } = params;
  const fallbackHistory: string[] = [];

  if (!query) {
    return { success: false, error: 'Query is required for search' };
  }

  // Check index status first
  const indexStatus = await checkIndexStatus(path);

  // Runs one index-backed backend under a timeout. Resolves to the decorated
  // result when it produced matches, or to null (after logging why) otherwise.
  const attempt = async (
    label: string,
    run: (p: Params) => Promise<SearchResult>,
    timeoutMs: number,
    note: string,
  ): Promise<SearchResult | null> => {
    try {
      const outcome = await withTimeout(run(params), timeoutMs, label);
      if (outcome.success && outcome.results && (outcome.results as any[]).length > 0) {
        fallbackHistory.push(`${label}: success`);
        return {
          ...outcome,
          metadata: {
            ...outcome.metadata,
            mode: 'priority',
            note,
            fallback_history: fallbackHistory,
          },
        };
      }
      fallbackHistory.push(`${label}: no results`);
    } catch (error) {
      fallbackHistory.push(`${label}: ${(error as Error).message}`);
    }
    return null;
  };

  // 1. Try Hybrid search (highest priority) - 90s timeout for large indexes
  if (indexStatus.indexed && indexStatus.has_embeddings) {
    const hybridHit = await attempt(
      'hybrid',
      executeHybridMode,
      90000,
      'Result from hybrid search (semantic + vector).',
    );
    if (hybridHit) return hybridHit;
  } else {
    fallbackHistory.push(`hybrid: skipped (${!indexStatus.indexed ? 'no index' : 'no embeddings'})`);
  }

  // 2. Fallback to Exact search - 10s timeout
  if (indexStatus.indexed) {
    const exactHit = await attempt(
      'exact',
      executeCodexLensExactMode,
      10000,
      'Result from exact/FTS search (fallback from hybrid).',
    );
    if (exactHit) return exactHit;
  } else {
    fallbackHistory.push('exact: skipped (no index)');
  }

  // 3. Final fallback to Ripgrep - 5s timeout. Returned even when unsuccessful
  //    or empty, since nothing is left to try afterwards.
  try {
    const ripgrepResult = await withTimeout(executeRipgrepMode(params), 5000, 'ripgrep');
    fallbackHistory.push(ripgrepResult.success ? 'ripgrep: success' : 'ripgrep: failed');
    return {
      ...ripgrepResult,
      metadata: {
        ...ripgrepResult.metadata,
        mode: 'priority',
        note: 'Result from ripgrep search (final fallback).',
        fallback_history: fallbackHistory,
      },
    };
  } catch (error) {
    fallbackHistory.push(`ripgrep: ${(error as Error).message}`);
  }

  // All modes failed
  return {
    success: false,
    error: 'All search backends in priority mode failed or returned no results.',
    metadata: {
      mode: 'priority',
      query,
      fallback_history: fallbackHistory,
    } as any,
  };
}
// Tool schema for MCP.
// The `action` enum and the declared properties must cover everything the
// description advertises; clients that validate calls against `inputSchema`
// would otherwise reject documented invocations such as action="update",
// action="watch", or the `debounce` argument.
export const schema: ToolSchema = {
  name: 'smart_search',
  description: `Unified code search tool with content search, file discovery, and semantic search capabilities.
**Actions:**
- search: Search file content (default)
- find_files: Find files by path/name pattern (glob matching)
- init: Create FTS index
- status: Check index status
- update: Incremental index update (for changed files)
- watch: Start file watcher for automatic updates
**Content Search (action="search"):**
smart_search(query="authentication logic") # fuzzy mode (default) - FTS + ripgrep fusion
smart_search(query="MyClass", mode="fuzzy") # fuzzy mode - fast hybrid search
smart_search(query="how to auth", mode="semantic") # semantic mode - dense + reranker
**File Discovery (action="find_files"):**
smart_search(action="find_files", pattern="*.ts") # find all TypeScript files
smart_search(action="find_files", pattern="src/**/*.js") # recursive glob pattern
smart_search(action="find_files", pattern="test_*.py") # find test files
smart_search(action="find_files", pattern="*.tsx", offset=20, limit=10) # pagination
**Index Maintenance:**
smart_search(action="update", path="/project") # incremental index update
smart_search(action="watch", path="/project") # start file watcher
smart_search(action="watch", debounce=2000) # custom debounce interval
**Pagination:** All actions support offset/limit for paginated results:
smart_search(query="auth", limit=10, offset=0) # first page
smart_search(query="auth", limit=10, offset=10) # second page
**Modes:** fuzzy (FTS + ripgrep fusion, default), semantic (dense + reranker)`,
  inputSchema: {
    type: 'object',
    properties: {
      action: {
        type: 'string',
        enum: ['init', 'search', 'find_files', 'status', 'update', 'watch', 'search_files'],
        description: 'Action: search (content search), find_files (path pattern matching), init (create index), status (check index), update (incremental index update), watch (start file watcher). Note: search_files is deprecated.',
        default: 'search',
      },
      query: {
        type: 'string',
        description: 'Content search query (for action="search")',
      },
      pattern: {
        type: 'string',
        description: 'Glob pattern for file discovery (for action="find_files"). Examples: "*.ts", "src/**/*.js", "test_*.py"',
      },
      mode: {
        type: 'string',
        enum: SEARCH_MODES,
        description: 'Search mode: fuzzy (FTS + ripgrep fusion, default), semantic (dense + reranker for natural language queries)',
        default: 'fuzzy',
      },
      output_mode: {
        type: 'string',
        enum: ['full', 'files_only', 'count'],
        description: 'Output format: full (default), files_only (paths only), count (per-file counts)',
        default: 'full',
      },
      path: {
        type: 'string',
        description: 'Directory path for init/search actions (default: current directory)',
      },
      paths: {
        type: 'array',
        description: 'Multiple paths to search within (for search action)',
        items: {
          type: 'string',
        },
        default: [],
      },
      contextLines: {
        type: 'number',
        description: 'Number of context lines around matches (exact mode only)',
        default: 0,
      },
      maxResults: {
        type: 'number',
        description: 'Maximum number of results (default: 20)',
        default: 20,
      },
      limit: {
        type: 'number',
        description: 'Alias for maxResults (default: 20)',
        default: 20,
      },
      extraFilesCount: {
        type: 'number',
        description: 'Number of additional file-only results (paths without content)',
        default: 10,
      },
      maxContentLength: {
        type: 'number',
        description: 'Maximum content length for truncation (50-2000)',
        default: 200,
      },
      offset: {
        type: 'number',
        description: 'Pagination offset - skip first N results (default: 0)',
        default: 0,
      },
      includeHidden: {
        type: 'boolean',
        description: 'Include hidden files/directories',
        default: false,
      },
      languages: {
        type: 'array',
        items: { type: 'string' },
        description: 'Languages to index (for init action). Example: ["javascript", "typescript"]',
      },
      enrich: {
        type: 'boolean',
        description: 'Enrich search results with code graph relationships (calls, imports, called_by, imported_by).',
        default: false,
      },
      regex: {
        type: 'boolean',
        description: 'Use regex pattern matching instead of literal string (ripgrep mode only). Default: enabled. Example: smart_search(query="class.*Builder")',
        default: true,
      },
      caseSensitive: {
        type: 'boolean',
        description: 'Case-sensitive search (default: true). Set to false for case-insensitive matching.',
        default: true,
      },
      tokenize: {
        type: 'boolean',
        description: 'Tokenize multi-word queries for OR matching (ripgrep mode). Default: true. Results are ranked by token match count (exact matches first).',
        default: true,
      },
      debounce: {
        type: 'number',
        description: 'Debounce interval for the file watcher (for action="watch"). Example: smart_search(action="watch", debounce=2000)',
      },
    },
    required: [],
  },
};
/**
 * Action: find_files - Find files by path/name pattern (glob matching).
 * Unlike search, which looks inside file content, find_files matches file paths.
 *
 * Uses `rg --files` when ripgrep is available; otherwise falls back to the
 * CodexLens `list-files` command and filters its output with a glob-derived
 * regex. Both paths apply `offset`/`limit` pagination after matching.
 *
 * @param params - `pattern` is mandatory; `path`, `limit`, `offset`,
 *                 `includeHidden` and `caseSensitive` are optional.
 * @returns Matching files plus pagination metadata (`total`, `has_more`), or a
 *          failure result when no backend is usable or the backend errors.
 */
async function executeFindFilesAction(params: Params): Promise<SearchResult> {
  const { pattern, path = '.', limit = 20, offset = 0, includeHidden = false, caseSensitive = true } = params;

  if (!pattern) {
    return {
      success: false,
      error: 'Pattern is required for find_files action. Use glob patterns like "*.ts", "src/**/*.js", or "test_*.py"',
    };
  }

  // Describes one file path as a FileMatch; the extension is set only when the
  // file name contains a dot.
  const toFileMatch = (filePath: string): FileMatch => {
    const parts = filePath.split(/[/\\]/);
    const name = parts[parts.length - 1] || '';
    const ext = name.includes('.') ? name.split('.').pop() : undefined;
    return {
      path: filePath,
      type: 'file' as const,
      name,
      extension: ext,
    };
  };

  // Use ripgrep with --files flag for fast file listing with glob pattern
  const hasRipgrep = checkToolAvailability('rg');

  if (!hasRipgrep) {
    // Fallback to CodexLens file listing if available
    const readyStatus = await ensureCodexLensReady();
    if (!readyStatus.ready) {
      return {
        success: false,
        error: 'Neither ripgrep nor CodexLens available for file discovery.',
      };
    }

    // Try CodexLens file list command
    const listArgs = ['list-files', '--json'];
    const result = await executeCodexLens(listArgs, { cwd: path });
    if (!result.success) {
      return {
        success: false,
        error: `Failed to list files: ${result.error}`,
      };
    }

    // Parse and filter results by pattern
    let files: string[] = [];
    try {
      const parsed = JSON.parse(stripAnsi(result.output || '[]'));
      files = Array.isArray(parsed) ? parsed : (parsed.files || []);
    } catch {
      return {
        success: false,
        error: 'Failed to parse file list from CodexLens',
      };
    }

    // Apply glob pattern matching using minimatch-style regex
    const globRegex = globToRegex(pattern, caseSensitive);
    const matchedFiles = files.filter(f => globRegex.test(f));

    // Apply pagination
    const total = matchedFiles.length;
    const results: FileMatch[] = matchedFiles.slice(offset, offset + limit).map(toFileMatch);

    return {
      success: true,
      results,
      metadata: {
        pattern,
        backend: 'codexlens',
        count: results.length,
        pagination: {
          offset,
          limit,
          total,
          has_more: offset + limit < total,
        },
      },
    };
  }

  // Use ripgrep --files with glob pattern for fast file discovery
  const args = ['--files'];

  // Add exclude patterns
  if (!includeHidden) {
    args.push(...buildExcludeArgs());
  } else {
    args.push('--hidden');
  }

  // Add the user pattern exactly once with the flag matching the requested case
  // sensitivity. Pushing `--glob` and then deleting the first `--glob` found
  // (the previous approach) could remove an unrelated glob contributed by
  // buildExcludeArgs() and leave the case-sensitive pattern in place.
  args.push(caseSensitive ? '--glob' : '--iglob', pattern);

  return new Promise((resolve) => {
    const child = spawn('rg', args, {
      cwd: path || getProjectRoot(),
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    let stdout = '';
    let stderr = '';

    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });

    child.stderr.on('data', (data) => {
      stderr += data.toString();
    });

    child.on('close', (code) => {
      // ripgrep returns 1 when no matches found, which is not an error.
      // Output is also accepted when stderr mentions 'os error 1'.
      if (code !== 0 && code !== 1 && !stderr.includes('os error 1')) {
        resolve({
          success: false,
          error: `ripgrep file search failed: ${stderr}`,
        });
        return;
      }

      const allFiles = stdout.split('\n').filter(line => line.trim());
      const total = allFiles.length;

      // Apply pagination; reported paths always use forward slashes.
      const results: FileMatch[] = allFiles
        .slice(offset, offset + limit)
        .map(filePath => toFileMatch(filePath.replace(/\\/g, '/')));

      resolve({
        success: true,
        results,
        metadata: {
          pattern,
          backend: 'ripgrep',
          count: results.length,
          pagination: {
            offset,
            limit,
            total,
            has_more: offset + limit < total,
          },
        },
      });
    });

    child.on('error', (error) => {
      resolve({
        success: false,
        error: `Failed to spawn ripgrep: ${error.message}`,
      });
    });
  });
}
/**
 * Convert a glob pattern into an anchored regular expression for matching
 * forward-slash separated file paths (minimatch-style: the whole path must match).
 *
 * Supported syntax:
 * - `*` matches any run of characters inside one path segment (never `/`).
 * - `**` matches any run of characters, including `/`.
 * - `**` followed by `/` at the start of a path segment matches zero or more
 *   whole directories, so the text after it must begin a new segment.
 * - `?` matches exactly one character other than `/`.
 * - `[abc]`, `[!abc]` and `[^abc]` are character classes / negated classes.
 *
 * Every other character matches literally; regex metacharacters, including
 * backslash and a stray `]`, are escaped. A `[` with no closing `]` is treated
 * as a literal bracket.
 *
 * @param pattern - Glob pattern to convert.
 * @param caseSensitive - When false the resulting regex carries the `i` flag.
 * @returns A regex anchored with `^` and `$`.
 */
function globToRegex(pattern: string, caseSensitive: boolean = true): RegExp {
  // Regex metacharacters that must be escaped when they appear literally in the glob.
  const special = '.^$+{}|()\\]';
  const out: string[] = [];
  let i = 0;
  while (i < pattern.length) {
    const c = pattern.charAt(i);
    if (c === '*') {
      if (pattern.charAt(i + 1) === '*') {
        const startsSegment = i === 0 || pattern.charAt(i - 1) === '/';
        i += 2;
        if (pattern.charAt(i) === '/') {
          i++;
          // At a segment boundary the globstar stands for whole directories only,
          // so "**/foo.ts" accepts "foo.ts" and "a/b/foo.ts" but not "barfoo.ts".
          // Elsewhere (e.g. "a**/b") keep the lenient "anything" behaviour.
          out.push(startsSegment ? '(?:.*/)?' : '.*');
        } else {
          out.push('.*');
        }
        continue;
      }
      out.push('[^/]*');
    } else if (c === '?') {
      out.push('[^/]');
    } else if (c === '[') {
      let j = i + 1;
      const first = pattern.charAt(j);
      const negated = first === '!' || first === '^';
      if (negated) {
        j++;
      }
      const close = pattern.indexOf(']', j);
      if (close === -1) {
        // No closing bracket: not a character class, match the '[' literally.
        out.push('\\[');
      } else {
        // Escape backslashes so the class content cannot form regex escapes.
        const classContent = pattern.slice(j, close).replace(/\\/g, '\\\\');
        out.push(`[${negated ? '^' : ''}${classContent}]`);
        i = close;
      }
    } else if (special.includes(c)) {
      out.push('\\' + c);
    } else {
      out.push(c);
    }
    i++;
  }
  return new RegExp('^' + out.join('') + '$', caseSensitive ? '' : 'i');
}
/**
 * Cut one page out of a result list and describe that page.
 *
 * @param results - Complete, un-paginated result list.
 * @param offset - Index of the first item to include.
 * @param limit - Maximum number of items in the page.
 * @returns The page (`paginatedResults`) together with `pagination` metadata:
 *   the requested offset/limit, the size of the full list, and whether items
 *   remain beyond the end of this page.
 */
function applyPagination<T>(
  results: T[],
  offset: number,
  limit: number
): { paginatedResults: T[]; pagination: PaginationInfo } {
  const pageEnd = offset + limit;
  const itemCount = results.length;
  const page = results.slice(offset, pageEnd);
  // More items exist only when the page stops short of the full list.
  const moreAvailable = pageEnd < itemCount;
  return {
    paginatedResults: page,
    pagination: { offset, limit, total: itemCount, has_more: moreAvailable },
  };
}
/**
 * Reshape a list of search matches according to the requested output mode.
 *
 * - `full`: the input is returned untouched (same reference).
 * - `files_only`: `{ files, count }` with the distinct truthy `file` values in
 *   first-seen order.
 * - `count`: `{ files: [{ file, count }], total }` with the number of matches
 *   per file in first-seen order; `total` is the number of input entries,
 *   including entries that carry no `file`.
 *
 * A non-array input is returned as-is, so calling this on an already
 * transformed value is a no-op.
 *
 * @param results - Matches to transform; only each entry's `file` property is read.
 * @param outputMode - Desired output shape.
 * @returns The transformed value (shape depends on `outputMode`).
 */
function transformOutput(
  results: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown[],
  outputMode: 'full' | 'files_only' | 'count'
): unknown {
  if (!Array.isArray(results)) {
    return results;
  }
  const entries: unknown[] = results;
  // Entries are loosely typed; tolerate null/undefined instead of throwing on `.file`.
  const fileOf = (entry: unknown): string | undefined =>
    (entry as { file?: string } | null | undefined)?.file;

  switch (outputMode) {
    case 'files_only': {
      const files = [...new Set(entries.map(fileOf))].filter(Boolean);
      return { files, count: files.length };
    }
    case 'count': {
      // A Map keeps keys such as "constructor" or "toString" from colliding with
      // members inherited from Object.prototype, which corrupted the tally when
      // a plain object was used as the dictionary.
      const counts = new Map<string, number>();
      for (const entry of entries) {
        const file = fileOf(entry);
        if (file) {
          counts.set(file, (counts.get(file) ?? 0) + 1);
        }
      }
      return {
        files: Array.from(counts, ([file, count]) => ({ file, count })),
        total: entries.length,
      };
    }
    case 'full':
    default:
      return results;
  }
}
/**
 * Tool entry point: validate the raw params, dispatch on `action`, and wrap
 * the outcome in a `ToolResult`.
 *
 * Behaviour visible in this function:
 * - `limit` and `maxResults` are unified to the larger of the two, each falling
 *   back to 20 when falsy, and written back so the executors see one value.
 *   NOTE(review): because of the fallback, a value below 20 only takes effect
 *   when both fields are below 20 — confirm this is intended.
 * - `search_files` is deprecated: it runs as `search` with `output_mode`
 *   forced to `files_only`, and a warning is attached to the result metadata.
 * - For search actions the results are reshaped via `transformOutput` and, when
 *   they are still an array, default pagination metadata is added if absent.
 *
 * @param params - Untrusted tool arguments; validated against `ParamsSchema`.
 * @returns `{ success: true, result }` on success, otherwise
 *   `{ success: false, error }`. Errors thrown while dispatching are caught
 *   and reported through the same failure shape.
 */
export async function handler(params: Record<string, unknown>): Promise<ToolResult<SearchResult>> {
  const parsed = ParamsSchema.safeParse(params);
  if (!parsed.success) {
    return { success: false, error: `Invalid params: ${parsed.error.message}` };
  }
  const { action, mode } = parsed.data;

  // Sync limit and maxResults - use the larger of the two (each defaulting to 20).
  const effectiveLimit = Math.max(parsed.data.limit || 20, parsed.data.maxResults || 20);
  parsed.data.maxResults = effectiveLimit;
  parsed.data.limit = effectiveLimit;

  // Deprecated alias: behaves like action="search" with files_only output.
  let deprecationWarning: string | undefined;
  if (action === 'search_files') {
    deprecationWarning = 'action="search_files" is deprecated. Use action="search" with output_mode="files_only" for content-to-files search, or action="find_files" for path pattern matching.';
    parsed.data.output_mode = 'files_only';
  }

  try {
    let result: SearchResult;
    switch (action) {
      case 'init':
        result = await executeInitAction(parsed.data);
        break;
      case 'status':
        result = await executeStatusAction(parsed.data);
        break;
      case 'find_files':
        // File path/name pattern matching (glob-based)
        result = await executeFindFilesAction(parsed.data);
        break;
      case 'update':
        // Incremental index update
        result = await executeUpdateAction(parsed.data);
        break;
      case 'watch':
        // Start file watcher (returns status, watcher runs in background)
        result = await executeWatchAction(parsed.data);
        break;
      case 'search_files':
      case 'search':
      default:
        // Handle search modes: fuzzy | semantic
        switch (mode) {
          case 'fuzzy':
            result = await executeFuzzyMode(parsed.data);
            break;
          case 'semantic':
            result = await executeHybridMode(parsed.data);
            break;
          default:
            throw new Error(`Unsupported mode: ${mode}. Use: fuzzy or semantic`);
        }
        break;
    }

    // Transform output based on output_mode (for search actions only)
    if (action === 'search' || action === 'search_files') {
      // Read output_mode here rather than up front: the search_files redirect
      // overrides it, and a value captured before that would skip the
      // files_only transform for the deprecated action.
      const outputMode = parsed.data.output_mode;
      if (result.success && result.results && outputMode !== 'full') {
        result.results = transformOutput(result.results as unknown[], outputMode);
      }
      // Add pagination metadata for search results if not already present
      if (result.success && result.results && Array.isArray(result.results)) {
        const totalResults = (result.results as unknown[]).length;
        if (!result.metadata) {
          result.metadata = {};
        }
        if (!result.metadata.pagination) {
          result.metadata.pagination = {
            offset: 0,
            limit: effectiveLimit,
            total: totalResults,
            has_more: false, // Already limited by backend
          };
        }
      }
    }

    // Attach the deprecation warning, creating metadata if the executor returned none
    // so the warning is never silently dropped.
    if (deprecationWarning) {
      if (!result.metadata) {
        result.metadata = {};
      }
      result.metadata.warning = deprecationWarning;
    }

    return result.success ? { success: true, result } : { success: false, error: result.error };
  } catch (error) {
    // Anything may be thrown; only Error instances are guaranteed to carry a message.
    return { success: false, error: error instanceof Error ? error.message : String(error) };
  }
}
/**
 * Run the CodexLens `index init` command while forwarding progress events.
 * Used by the MCP server to stream progress to its client.
 *
 * @param params - Loose parameter bag; `path` (falls back to '.') and the
 *   optional `languages` list are read from it.
 * @param onProgress - Optional listener invoked for every progress event.
 * @returns The command outcome, with metadata carrying the most recent progress
 *   event and the last five events received.
 */
export async function executeInitWithProgress(
  params: Record<string, unknown>,
  onProgress?: (progress: ProgressInfo) => void
): Promise<SearchResult> {
  const path = (params.path as string) || '.';
  const languages = params.languages as string[] | undefined;

  // Bail out early when CodexLens cannot be used.
  const readiness = await ensureCodexLensReady();
  if (!readiness.ready) {
    return {
      success: false,
      error: `CodexLens not available: ${readiness.error}. CodexLens will be auto-installed on first use.`,
    };
  }

  // 'index init' subcommand (new CLI structure), optionally restricted by language.
  const wantsLanguages = Boolean(languages && languages.length > 0);
  const cliArgs = wantsLanguages
    ? ['index', 'init', path, '--language', (languages as string[]).join(',')]
    : ['index', 'init', path];

  // Every event is recorded so the latest one and a short history can be reported.
  const seen: ProgressInfo[] = [];
  const outcome = await executeCodexLens(cliArgs, {
    cwd: path,
    timeout: 1800000, // 30 minutes for large codebases
    onProgress: (progress: ProgressInfo) => {
      seen.push(progress);
      onProgress?.(progress);
    },
  });

  const metadata: SearchMetadata = {
    action: 'init',
    path,
  };

  const latest = seen.length > 0 ? seen[seen.length - 1] : null;
  if (latest !== null) {
    const { stage, message, percent, filesProcessed, totalFiles } = latest as ProgressInfo;
    metadata.progress = { stage, message, percent, filesProcessed, totalFiles };
  }
  if (seen.length > 0) {
    metadata.progressHistory = seen.slice(-5);
  }

  const successMessage = outcome.success
    ? `CodexLens index created successfully for ${path}`
    : undefined;

  return {
    success: outcome.success,
    error: outcome.error,
    message: successMessage,
    metadata,
  };
}