Files
Claude-Code-Workflow/ccw/src/tools/smart-search.ts

3635 lines
117 KiB
TypeScript

/**
* Smart Search Tool - Unified intelligent search powered by codexlens-search v2
*
* Features:
* - Semantic search: 2-stage vector (binary coarse + ANN fine) + FTS5 + RRF fusion + reranking
* - Ripgrep fallback for fast exact/regex matching
* - File discovery via glob patterns
* - Incremental indexing with Mark-and-Filter strategy
* - File watcher for automatic index updates
*
* Actions:
* - search: Semantic search via v2 bridge with ripgrep fallback
* - init: Initialize v2 index and sync files
* - status: Check v2 index statistics
* - update: Incremental sync for changed files
* - watch: Start file watcher for automatic updates
* - find_files: Glob-based file path matching
*/
import { z } from 'zod';
import type { ToolSchema, ToolResult } from '../types/tool.js';
import { spawn, spawnSync, type SpawnOptions } from 'child_process';
import { existsSync, readFileSync, statSync } from 'fs';
import { dirname, join, resolve } from 'path';
import {
ensureReady as ensureCodexLensReady,
checkSemanticStatus,
ensureLiteLLMEmbedderReady,
executeCodexLens,
getVenvPythonPath,
} from './codex-lens.js';
import { execFile } from 'child_process';
import type { ProgressInfo } from './codex-lens.js';
import { getProjectRoot } from '../utils/path-validator.js';
import { getCodexLensDataDir } from '../utils/codexlens-path.js';
import { EXEC_TIMEOUTS } from '../utils/exec-constants.js';
import { generateRotationEndpoints } from '../config/litellm-api-config-manager.js';
import type { RotationEndpointConfig } from '../config/litellm-api-config-manager.js';
// Timing utilities for performance analysis
const TIMING_ENABLED = process.env.SMART_SEARCH_TIMING === '1' || process.env.DEBUG?.includes('timing');
const SEARCH_OUTPUT_MODES = ['full', 'files_only', 'count', 'ace'] as const;
type SearchOutputMode = typeof SEARCH_OUTPUT_MODES[number];
interface TimingData {
[key: string]: number;
}
function createTimer(): { mark: (name: string) => void; getTimings: () => TimingData; log: () => void } {
const startTime = performance.now();
const marks: { name: string; time: number }[] = [];
let lastMark = startTime;
return {
mark(name: string) {
const now = performance.now();
marks.push({ name, time: now - lastMark });
lastMark = now;
},
getTimings(): TimingData {
const timings: TimingData = {};
marks.forEach(m => { timings[m.name] = Math.round(m.time * 100) / 100; });
timings['_total'] = Math.round((performance.now() - startTime) * 100) / 100;
return timings;
},
log() {
if (TIMING_ENABLED) {
const timings = this.getTimings();
console.error(`[TIMING] smart-search: ${JSON.stringify(timings)}`);
}
}
};
}
// Define Zod schema for validation
const ParamsSchema = z.object({
// Action: search (content), find_files (path/name pattern), init, status, update (incremental sync), watch
// Note: search_files is deprecated, use search with output_mode='files_only'
action: z.enum(['init', 'search', 'search_files', 'find_files', 'status', 'update', 'watch']).default('search'),
query: z.string().optional().describe('Content search query (for action="search")'),
pattern: z.string().optional().describe('Glob pattern for path matching (for action="find_files")'),
mode: z.enum(['fuzzy', 'semantic']).default('fuzzy'),
output_mode: z.enum(SEARCH_OUTPUT_MODES).default('ace'),
path: z.string().optional(),
paths: z.array(z.string()).default([]),
contextLines: z.number().default(0),
maxResults: z.number().default(5), // Default 5 with full content
includeHidden: z.boolean().default(false),
force: z.boolean().default(false).describe('Force full rebuild for action="init".'),
limit: z.number().default(5), // Default 5 with full content
extraFilesCount: z.number().default(10), // Additional file-only results
maxContentLength: z.number().default(200), // Max content length for truncation (50-2000)
offset: z.number().default(0), // NEW: Pagination offset (start_index)
// Search modifiers for ripgrep mode
regex: z.boolean().default(true), // Use regex pattern matching (default: enabled)
caseSensitive: z.boolean().default(true), // Case sensitivity (default: case-sensitive)
tokenize: z.boolean().default(true), // Tokenize multi-word queries for OR matching (default: enabled)
// File type filtering (default: code only)
excludeExtensions: z.array(z.string()).optional().describe('File extensions to exclude from results (e.g., ["md", "txt"])'),
codeOnly: z.boolean().default(true).describe('Only return code files (excludes md, txt, json, yaml, xml, etc.). Default: true'),
withDoc: z.boolean().default(false).describe('Include documentation files (md, txt, rst, etc.). Overrides codeOnly when true'),
// Watcher options
debounce: z.number().default(1000).describe('Debounce interval in ms for watch action'),
// Fuzzy matching is implicit in hybrid mode (RRF fusion)
});
type Params = z.infer<typeof ParamsSchema>;
// Search mode constants
const SEARCH_MODES = ['fuzzy', 'semantic'] as const;
// Classification confidence threshold
const CONFIDENCE_THRESHOLD = 0.7;
// File filtering configuration (ported from code-index)
const FILTER_CONFIG = {
exclude_directories: new Set([
'.git', '.svn', '.hg', '.bzr',
'node_modules', '__pycache__', '.venv', 'venv', 'vendor', 'bower_components',
'dist', 'build', 'target', 'out', 'bin', 'obj',
'.idea', '.vscode', '.vs', '.sublime-workspace',
'.pytest_cache', '.coverage', '.tox', '.nyc_output', 'coverage', 'htmlcov',
'.next', '.nuxt', '.cache', '.parcel-cache',
'.DS_Store', 'Thumbs.db',
]),
exclude_files: new Set([
'*.tmp', '*.temp', '*.swp', '*.swo', '*.bak', '*~', '*.orig', '*.log',
'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'Pipfile.lock',
]),
// Windows device files - must use **/ pattern to match in any directory
// These cause "os error 1" on Windows when accessed
windows_device_files: new Set([
'nul', 'con', 'aux', 'prn',
'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
]),
};
function buildExcludeArgs(): string[] {
const args: string[] = [];
for (const dir of FILTER_CONFIG.exclude_directories) {
args.push('--glob', `!**/${dir}/**`);
}
for (const pattern of FILTER_CONFIG.exclude_files) {
args.push('--glob', `!${pattern}`);
}
// Windows device files need case-insensitive matching in any directory
for (const device of FILTER_CONFIG.windows_device_files) {
args.push('--glob', `!**/${device}`);
args.push('--glob', `!**/${device.toUpperCase()}`);
}
return args;
}
/**
* Tokenize query for multi-word OR matching
* Splits on whitespace and common delimiters, filters stop words and short tokens
* @param query - The search query
* @returns Array of tokens
*/
function tokenizeQuery(query: string): string[] {
// Stop words for filtering (common English + programming keywords)
const stopWords = new Set([
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
'should', 'may', 'might', 'must', 'can', 'to', 'of', 'in', 'for', 'on',
'with', 'at', 'by', 'from', 'as', 'into', 'through', 'and', 'but', 'if',
'or', 'not', 'this', 'that', 'these', 'those', 'it', 'its', 'how', 'what',
'where', 'when', 'why', 'which', 'who', 'whom',
]);
// Split on whitespace and common delimiters, keep meaningful tokens
const tokens = query
.split(/[\s,;:]+/)
.map(token => token.trim())
.filter(token => {
// Keep tokens that are:
// - At least 2 characters long
// - Not a stop word (case-insensitive)
// - Or look like identifiers (contain underscore/camelCase)
if (token.length < 2) return false;
if (stopWords.has(token.toLowerCase()) && !token.includes('_') && !/[A-Z]/.test(token)) {
return false;
}
return true;
});
return tokens;
}
/**
* Score results based on token match count for ranking
* @param results - Search results
* @param tokens - Query tokens
* @returns Results with match scores
*/
function scoreByTokenMatch(results: ExactMatch[], tokens: string[]): ExactMatch[] {
if (tokens.length <= 1) return results;
// Create case-insensitive patterns for each token
const tokenPatterns = tokens.map(t => {
const escaped = t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
return new RegExp(escaped, 'i');
});
return results.map(r => {
const content = r.content || '';
const file = r.file || '';
const searchText = `${file} ${content}`;
// Count how many tokens match
let matchCount = 0;
for (const pattern of tokenPatterns) {
if (pattern.test(searchText)) {
matchCount++;
}
}
// Calculate match ratio (0 to 1)
const matchRatio = matchCount / tokens.length;
return {
...r,
matchScore: matchRatio,
matchCount,
};
}).sort((a, b) => {
// Sort by match ratio (descending), then by line number
if (b.matchScore !== a.matchScore) {
return b.matchScore - a.matchScore;
}
return (a.line || 0) - (b.line || 0);
});
}
interface Classification {
mode: string;
confidence: number;
reasoning: string;
}
interface ChunkLine {
line: number;
text: string;
isMatch: boolean;
}
interface ExactMatch {
file: string;
line: number;
column: number;
content: string;
endLine?: number;
chunkLines?: ChunkLine[];
matchScore?: number; // Token match ratio (0-1) for multi-word queries
matchCount?: number; // Number of tokens matched
}
interface RelationshipInfo {
type: string; // 'calls', 'imports', 'called_by', 'imported_by'
direction: 'outgoing' | 'incoming';
target?: string; // Target symbol name (for outgoing)
source?: string; // Source symbol name (for incoming)
file: string; // File path
line?: number; // Line number
}
interface SemanticMatch {
file: string;
line?: number;
column?: number;
score: number;
content: string;
symbol: string | null;
relationships?: RelationshipInfo[];
}
interface GraphMatch {
file: string;
symbols: unknown;
relationships: unknown[];
}
// File match for find_files action (path-based search)
interface FileMatch {
path: string;
type: 'file' | 'directory';
name: string; // Filename only
extension?: string; // File extension (without dot)
}
interface PaginationInfo {
offset: number; // Starting index of returned results
limit: number; // Number of results requested
total: number; // Total number of results found
has_more: boolean; // True if more results are available
}
interface SearchSuggestion {
title: string;
command: string;
reason: string;
}
interface SearchMetadata {
mode?: string;
backend?: string;
count?: number;
query?: string;
pattern?: string; // For find_files action
classified_as?: string;
confidence?: number;
reasoning?: string;
embeddings_coverage_percent?: number;
warning?: string;
note?: string;
index_status?: 'indexed' | 'not_indexed' | 'partial';
fallback?: string; // Fallback mode used (e.g., 'fuzzy')
fallback_history?: string[];
suggested_weights?: Record<string, number>;
// Tokenization metadata (ripgrep mode)
tokens?: string[]; // Query tokens used for multi-word search
tokenized?: boolean; // Whether tokenization was applied
suggestions?: SearchSuggestion[];
// Pagination metadata
pagination?: PaginationInfo;
// Performance timing data (when SMART_SEARCH_TIMING=1 or DEBUG includes 'timing')
timing?: TimingData;
// Init action specific
action?: string;
path?: string;
progress?: {
stage: string;
message: string;
percent: number;
filesProcessed?: number;
totalFiles?: number;
};
progressHistory?: ProgressInfo[];
api_max_workers?: number;
endpoint_count?: number;
use_gpu?: boolean;
reranker_enabled?: boolean;
reranker_backend?: string;
reranker_model?: string;
cascade_strategy?: string;
staged_stage2_mode?: string;
static_graph_enabled?: boolean;
preset?: string;
}
interface SearchResult {
success: boolean;
results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | AceLikeOutput | unknown;
extra_files?: string[]; // Additional file paths without content
output?: string;
metadata?: SearchMetadata;
error?: string;
status?: unknown;
message?: string;
}
interface AceLikeSection {
path: string;
line?: number;
endLine?: number;
column?: number;
score?: number;
symbol?: string | null;
snippet: string;
lines?: ChunkLine[];
}
interface AceLikeGroup {
path: string;
sections: AceLikeSection[];
total_matches: number;
}
interface AceLikeOutput {
format: 'ace';
text: string;
groups: AceLikeGroup[];
sections: AceLikeSection[];
total: number;
}
interface ModelInfo {
model_profile?: string;
model_name?: string;
embedding_dim?: number;
backend?: string;
created_at?: string;
updated_at?: string;
}
interface CodexLensConfig {
config_file?: string;
index_dir?: string;
embedding_backend?: string; // 'fastembed' (local) or 'litellm' (api)
embedding_model?: string;
embedding_auto_embed_missing?: boolean;
reranker_enabled?: boolean;
reranker_backend?: string; // 'onnx' (local) or 'api'
reranker_model?: string;
reranker_top_k?: number;
api_max_workers?: number;
api_batch_size?: number;
cascade_strategy?: string;
staged_stage2_mode?: string;
static_graph_enabled?: boolean;
}
interface IndexStatus {
indexed: boolean;
has_embeddings: boolean;
file_count?: number;
embeddings_coverage_percent?: number;
total_chunks?: number;
model_info?: ModelInfo | null;
config?: CodexLensConfig | null;
warning?: string;
}
function readCodexLensSettingsSnapshot(): Partial<CodexLensConfig> {
const settingsPath = join(getCodexLensDataDir(), 'settings.json');
if (!existsSync(settingsPath)) {
return {};
}
try {
const parsed = JSON.parse(readFileSync(settingsPath, 'utf-8')) as Record<string, any>;
const embedding = (parsed.embedding ?? {}) as Record<string, any>;
const reranker = (parsed.reranker ?? {}) as Record<string, any>;
const api = (parsed.api ?? {}) as Record<string, any>;
const cascade = (parsed.cascade ?? {}) as Record<string, any>;
const staged = (parsed.staged ?? {}) as Record<string, any>;
const indexing = (parsed.indexing ?? {}) as Record<string, any>;
return {
embedding_backend: normalizeEmbeddingBackend(typeof embedding.backend === 'string' ? embedding.backend : undefined),
embedding_model: typeof embedding.model === 'string' ? embedding.model : undefined,
embedding_auto_embed_missing: typeof embedding.auto_embed_missing === 'boolean' ? embedding.auto_embed_missing : undefined,
reranker_enabled: typeof reranker.enabled === 'boolean' ? reranker.enabled : undefined,
reranker_backend: typeof reranker.backend === 'string' ? reranker.backend : undefined,
reranker_model: typeof reranker.model === 'string' ? reranker.model : undefined,
reranker_top_k: typeof reranker.top_k === 'number' ? reranker.top_k : undefined,
api_max_workers: typeof api.max_workers === 'number' ? api.max_workers : undefined,
api_batch_size: typeof api.batch_size === 'number' ? api.batch_size : undefined,
cascade_strategy: typeof cascade.strategy === 'string' ? cascade.strategy : undefined,
staged_stage2_mode: typeof staged.stage2_mode === 'string' ? staged.stage2_mode : undefined,
static_graph_enabled: typeof indexing.static_graph_enabled === 'boolean' ? indexing.static_graph_enabled : undefined,
};
} catch {
return {};
}
}
/**
* Strip ANSI color codes from string (for JSON parsing)
*/
function stripAnsi(str: string): string {
return str.replace(/\x1b\[[0-9;]*m/g, '');
}
/** Default maximum content length to return (avoid excessive output) */
const DEFAULT_MAX_CONTENT_LENGTH = 200;
const CODEX_LENS_FTS_COMPATIBILITY_PATTERNS = [
/UsageError:\s*Got unexpected extra arguments?/i,
/Option ['"]--method['"] does not take a value/i,
/TyperArgument\.make_metavar\(\) takes 1 positional argument but 2 were given/i,
];
let codexLensFtsBackendBroken = false;
const autoInitJobs = new Map<string, { startedAt: number; languages?: string[] }>();
const autoEmbedJobs = new Map<string, { startedAt: number; backend?: string; model?: string }>();
type SmartSearchRuntimeOverrides = {
checkSemanticStatus?: typeof checkSemanticStatus;
getVenvPythonPath?: typeof getVenvPythonPath;
spawnProcess?: typeof spawn;
now?: () => number;
};
const runtimeOverrides: SmartSearchRuntimeOverrides = {};
function getSemanticStatusRuntime(): typeof checkSemanticStatus {
return runtimeOverrides.checkSemanticStatus ?? checkSemanticStatus;
}
function getVenvPythonPathRuntime(): typeof getVenvPythonPath {
return runtimeOverrides.getVenvPythonPath ?? getVenvPythonPath;
}
function getSpawnRuntime(): typeof spawn {
return runtimeOverrides.spawnProcess ?? spawn;
}
function getNowRuntime(): number {
return (runtimeOverrides.now ?? Date.now)();
}
function buildSmartSearchSpawnOptions(cwd: string, overrides: SpawnOptions = {}): SpawnOptions {
const { env, ...rest } = overrides;
return {
cwd,
shell: false,
windowsHide: true,
env: { ...process.env, PYTHONIOENCODING: 'utf-8', ...env },
...rest,
};
}
function shouldDetachBackgroundSmartSearchProcess(): boolean {
// On Windows, detached Python children can still create a transient console
// window even when windowsHide is set. Background warmup only needs to outlive
// the current request, not the MCP server process.
return process.platform !== 'win32';
}
/**
* Truncate content to specified length with ellipsis
* @param content - The content to truncate
* @param maxLength - Maximum length (default: 200)
*/
function truncateContent(content: string | null | undefined, maxLength: number = DEFAULT_MAX_CONTENT_LENGTH): string {
if (!content) return '';
if (content.length <= maxLength) return content;
return content.slice(0, maxLength) + '...';
}
/**
* Split results into full content results and extra file-only results
* Generic function supporting both SemanticMatch and ExactMatch types
* @param allResults - All search results (must have 'file' property)
* @param fullContentLimit - Number of results with full content (default: 5)
* @param extraFilesCount - Number of additional file-only results (default: 10)
*/
function splitResultsWithExtraFiles<T extends { file: string }>(
allResults: T[],
fullContentLimit: number = 5,
extraFilesCount: number = 10
): { results: T[]; extra_files: string[] } {
// First N results with full content
const results = allResults.slice(0, fullContentLimit);
// Next M results as file paths only (deduplicated)
const extraResults = allResults.slice(fullContentLimit, fullContentLimit + extraFilesCount);
const extra_files = [...new Set(extraResults.map(r => r.file))];
return { results, extra_files };
}
interface SearchScope {
workingDirectory: string;
searchPaths: string[];
targetFile?: string;
}
interface RipgrepQueryModeResolution {
regex: boolean;
tokenize: boolean;
tokens: string[];
literalFallback: boolean;
warning?: string;
}
const GENERATED_QUERY_RE = /(?<!\w)(dist|build|out|coverage|htmlcov|generated|bundle|compiled|artifact|artifacts|\.workflow)(?!\w)/i;
const ENV_STYLE_QUERY_RE = /\b[A-Z][A-Z0-9]+(?:_[A-Z0-9]+)+\b/;
const TOPIC_TOKEN_RE = /[A-Za-z][A-Za-z0-9]*/g;
const LEXICAL_PRIORITY_SURFACE_TOKENS = new Set([
'config',
'configs',
'configuration',
'configurations',
'setting',
'settings',
'backend',
'backends',
'environment',
'env',
'variable',
'variables',
'factory',
'factories',
'override',
'overrides',
'option',
'options',
'flag',
'flags',
'mode',
'modes',
]);
const LEXICAL_PRIORITY_FOCUS_TOKENS = new Set([
'embedding',
'embeddings',
'reranker',
'rerankers',
'onnx',
'api',
'litellm',
'fastembed',
'local',
'legacy',
'stage',
'stage2',
'stage3',
'stage4',
'precomputed',
'realtime',
'static',
'global',
'graph',
'selection',
'model',
'models',
]);
function sanitizeSearchQuery(query: string | undefined): string | undefined {
if (!query) {
return query;
}
return query.replace(/\r?\n\s*/g, ' ').trim();
}
function sanitizeSearchPath(pathValue: string | undefined): string | undefined {
if (!pathValue) {
return pathValue;
}
return pathValue.replace(/\r?\n\s*/g, '').trim();
}
function resolveSearchScope(pathValue: string = '.', paths: string[] = []): SearchScope {
const normalizedPath = sanitizeSearchPath(pathValue) || '.';
const normalizedPaths = paths.map((item) => sanitizeSearchPath(item) || item);
const fallbackPath = normalizedPath || getProjectRoot();
try {
const resolvedPath = resolve(fallbackPath);
const stats = statSync(resolvedPath);
if (stats.isFile()) {
return {
workingDirectory: dirname(resolvedPath),
searchPaths: normalizedPaths.length > 0 ? normalizedPaths : [resolvedPath],
targetFile: resolvedPath,
};
}
return {
workingDirectory: resolvedPath,
searchPaths: normalizedPaths.length > 0 ? normalizedPaths : ['.'],
};
} catch {
return {
workingDirectory: fallbackPath,
searchPaths: normalizedPaths.length > 0 ? normalizedPaths : [normalizedPath || '.'],
};
}
}
function normalizeResultFilePath(filePath: string, workingDirectory: string): string {
return resolve(workingDirectory, filePath).replace(/\\/g, '/');
}
function filterResultsToTargetFile<T extends { file: string }>(results: T[], scope: SearchScope): T[] {
if (!scope.targetFile) {
return results;
}
const normalizedTarget = scope.targetFile.replace(/\\/g, '/');
return results.filter((result) => normalizeResultFilePath(result.file, scope.workingDirectory) === normalizedTarget);
}
function parseCodexLensJsonOutput(output: string | undefined): any | null {
const cleanOutput = stripAnsi(output || '').trim();
if (!cleanOutput) {
return null;
}
const candidates = [
cleanOutput,
...cleanOutput.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.startsWith('{') || line.startsWith('[')),
];
const firstBrace = cleanOutput.indexOf('{');
const lastBrace = cleanOutput.lastIndexOf('}');
if (firstBrace !== -1 && lastBrace > firstBrace) {
candidates.push(cleanOutput.slice(firstBrace, lastBrace + 1));
}
const firstBracket = cleanOutput.indexOf('[');
const lastBracket = cleanOutput.lastIndexOf(']');
if (firstBracket !== -1 && lastBracket > firstBracket) {
candidates.push(cleanOutput.slice(firstBracket, lastBracket + 1));
}
for (const candidate of candidates) {
try {
return JSON.parse(candidate);
} catch {
continue;
}
}
return null;
}
function isValidRegexPattern(pattern: string): boolean {
try {
new RegExp(pattern);
return true;
} catch {
return false;
}
}
function resolveRipgrepQueryMode(query: string, regex: boolean = true, tokenize: boolean = true): RipgrepQueryModeResolution {
const tokens = tokenize ? tokenizeQuery(query) : [query];
if (!regex) {
return {
regex: false,
tokenize,
tokens,
literalFallback: false,
};
}
const invalidTokens = tokens.filter((token) => token.length > 0 && !isValidRegexPattern(token));
if (invalidTokens.length === 0) {
return {
regex: true,
tokenize,
tokens,
literalFallback: false,
};
}
const preview = truncateContent(invalidTokens[0], 40);
return {
regex: false,
tokenize,
tokens,
literalFallback: true,
warning: invalidTokens.length === 1
? `Query token "${preview}" is not a valid regular expression. Falling back to literal ripgrep matching.`
: 'Query contains invalid regular expression tokens. Falling back to literal ripgrep matching.',
};
}
function isCodexLensCliCompatibilityError(error: string | undefined): boolean {
if (!error) {
return false;
}
const cleanError = stripAnsi(error);
return CODEX_LENS_FTS_COMPATIBILITY_PATTERNS.some((pattern) => pattern.test(cleanError));
}
function noteCodexLensFtsCompatibility(error: string | undefined): boolean {
if (!isCodexLensCliCompatibilityError(error)) {
return false;
}
codexLensFtsBackendBroken = true;
return true;
}
function shouldSurfaceCodexLensFtsCompatibilityWarning(options: {
compatibilityTriggeredThisQuery: boolean;
skipExactDueToCompatibility: boolean;
ripgrepResultCount: number;
}): boolean {
if (options.ripgrepResultCount > 0) {
return false;
}
return options.compatibilityTriggeredThisQuery || options.skipExactDueToCompatibility;
}
function summarizeBackendError(error: string | undefined): string {
const cleanError = stripAnsi(error || '').trim();
if (!cleanError) {
return 'unknown error';
}
if (isCodexLensCliCompatibilityError(cleanError)) {
return 'CodexLens exact search CLI is incompatible with the current Typer/Click runtime';
}
const regexSummary = cleanError.match(/error:\s*([^\r\n]+)/i);
if (/regex parse error/i.test(cleanError) && regexSummary?.[1]) {
return `invalid regular expression (${regexSummary[1].trim()})`;
}
const usageSummary = cleanError.match(/UsageError:\s*([^\r\n]+)/i);
if (usageSummary?.[1]) {
return usageSummary[1].trim();
}
const firstMeaningfulLine = cleanError
.split(/\r?\n/)
.map((line) => line.trim())
.find((line) => line && !line.startsWith('│') && !line.startsWith('┌') && !line.startsWith('└'));
return truncateContent(firstMeaningfulLine || cleanError, 180);
}
function mapCodexLensSemanticMatches(data: any[], scope: SearchScope, maxContentLength: number): SemanticMatch[] {
return filterResultsToTargetFile(data.map((item: any) => {
const rawScore = item.score || 0;
const similarityScore = rawScore > 0 ? 1 / (1 + rawScore) : 1;
return {
file: item.path || item.file,
line: typeof item.line === 'number' ? item.line : undefined,
column: typeof item.column === 'number' ? item.column : undefined,
score: similarityScore,
content: truncateContent(item.content || item.excerpt, maxContentLength),
symbol: item.symbol || null,
};
}), scope);
}
function parsePlainTextFileMatches(output: string | undefined, scope: SearchScope): SemanticMatch[] {
const lines = stripAnsi(output || '')
.split(/\r?\n/)
.map((line) => line.trim())
.filter(Boolean);
const fileLines = lines.filter((line) => {
if (line.includes('RuntimeWarning:') || line.startsWith('warn(') || line.startsWith('Warning:')) {
return false;
}
const resolvedPath = /^[a-zA-Z]:[\\/]|^\//.test(line)
? line
: resolve(scope.workingDirectory, line);
try {
return statSync(resolvedPath).isFile();
} catch {
return false;
}
});
return filterResultsToTargetFile(
[...new Set(fileLines)].map((file, index) => ({
file,
score: Math.max(0.1, 1 - index * 0.05),
content: '',
symbol: null,
})),
scope,
);
}
function hasCentralizedVectorArtifacts(indexRoot: unknown): boolean {
if (typeof indexRoot !== 'string' || !indexRoot.trim()) {
return false;
}
const resolvedRoot = resolve(indexRoot);
return [
join(resolvedRoot, '_vectors.hnsw'),
join(resolvedRoot, '_vectors_meta.db'),
join(resolvedRoot, '_binary_vectors.mmap'),
].every((artifactPath) => existsSync(artifactPath));
}
function asObjectRecord(value: unknown): Record<string, unknown> | undefined {
if (!value || typeof value !== 'object' || Array.isArray(value)) {
return undefined;
}
return value as Record<string, unknown>;
}
function asFiniteNumber(value: unknown): number | undefined {
if (typeof value !== 'number' || !Number.isFinite(value)) {
return undefined;
}
return value;
}
function asBoolean(value: unknown): boolean | undefined {
return typeof value === 'boolean' ? value : undefined;
}
function extractEmbeddingsStatusSummary(embeddingsData: unknown): {
coveragePercent: number;
totalChunks: number;
hasEmbeddings: boolean;
} {
const embeddings = asObjectRecord(embeddingsData) ?? {};
const root = asObjectRecord(embeddings.root) ?? embeddings;
const centralized = asObjectRecord(embeddings.centralized);
const totalIndexes = asFiniteNumber(root.total_indexes)
?? asFiniteNumber(embeddings.total_indexes)
?? 0;
const indexesWithEmbeddings = asFiniteNumber(root.indexes_with_embeddings)
?? asFiniteNumber(embeddings.indexes_with_embeddings)
?? 0;
const totalChunks = asFiniteNumber(root.total_chunks)
?? asFiniteNumber(embeddings.total_chunks)
?? 0;
const coveragePercent = asFiniteNumber(root.coverage_percent)
?? asFiniteNumber(embeddings.coverage_percent)
?? (totalIndexes > 0 ? (indexesWithEmbeddings / totalIndexes) * 100 : 0);
const hasEmbeddings = asBoolean(root.has_embeddings)
?? asBoolean(centralized?.usable)
?? (totalChunks > 0 || indexesWithEmbeddings > 0 || coveragePercent > 0);
return {
coveragePercent,
totalChunks,
hasEmbeddings,
};
}
function selectEmbeddingsStatusPayload(statusData: unknown): Record<string, unknown> {
const status = asObjectRecord(statusData) ?? {};
return asObjectRecord(status.embeddings_status) ?? asObjectRecord(status.embeddings) ?? {};
}
function collectBackendError(
errors: string[],
backendName: string,
backendResult: PromiseSettledResult<SearchResult>,
): void {
if (backendResult.status === 'rejected') {
errors.push(`${backendName}: ${summarizeBackendError(String(backendResult.reason))}`);
return;
}
if (!backendResult.value.success) {
errors.push(`${backendName}: ${summarizeBackendError(backendResult.value.error)}`);
}
}
function mergeWarnings(...warnings: Array<string | undefined>): string | undefined {
const merged = [...new Set(
warnings
.filter((warning): warning is string => typeof warning === 'string' && warning.trim().length > 0)
.map((warning) => warning.trim())
)];
return merged.length > 0 ? merged.join(' | ') : undefined;
}
function mergeNotes(...notes: Array<string | undefined>): string | undefined {
const merged = [...new Set(
notes
.filter((note): note is string => typeof note === 'string' && note.trim().length > 0)
.map((note) => note.trim())
)];
return merged.length > 0 ? merged.join(' | ') : undefined;
}
function mergeSuggestions(...groups: Array<SearchSuggestion[] | undefined>): SearchSuggestion[] | undefined {
const merged = new Map<string, SearchSuggestion>();
for (const group of groups) {
for (const suggestion of group ?? []) {
if (!merged.has(suggestion.command)) {
merged.set(suggestion.command, suggestion);
}
}
}
return merged.size > 0 ? [...merged.values()] : undefined;
}
function formatSmartSearchCommand(action: string, pathValue: string, extraParams: Record<string, unknown> = {}): string {
const normalizedPath = pathValue.replace(/\\/g, '/');
const args = [`action=${JSON.stringify(action)}`, `path=${JSON.stringify(normalizedPath)}`];
for (const [key, value] of Object.entries(extraParams)) {
if (value === undefined) {
continue;
}
args.push(`${key}=${JSON.stringify(value)}`);
}
return `smart_search(${args.join(', ')})`;
}
function parseOptionalBooleanEnv(raw: string | undefined): boolean | undefined {
const normalized = raw?.trim().toLowerCase();
if (!normalized) {
return undefined;
}
if (['1', 'true', 'on', 'yes'].includes(normalized)) {
return true;
}
if (['0', 'false', 'off', 'no'].includes(normalized)) {
return false;
}
return undefined;
}
function isAutoEmbedMissingEnabled(config: CodexLensConfig | null | undefined): boolean {
const envOverride = parseOptionalBooleanEnv(process.env.CODEXLENS_AUTO_EMBED_MISSING);
if (envOverride !== undefined) {
return envOverride;
}
if (process.platform === 'win32') {
return false;
}
if (typeof config?.embedding_auto_embed_missing === 'boolean') {
return config.embedding_auto_embed_missing;
}
return true;
}
function isAutoInitMissingEnabled(): boolean {
const envOverride = parseOptionalBooleanEnv(process.env.CODEXLENS_AUTO_INIT_MISSING);
if (envOverride !== undefined) {
return envOverride;
}
return process.platform !== 'win32';
}
function getAutoEmbedMissingDisabledReason(config: CodexLensConfig | null | undefined): string {
const envOverride = parseOptionalBooleanEnv(process.env.CODEXLENS_AUTO_EMBED_MISSING);
if (envOverride === false) {
return 'Automatic embedding warmup is disabled by CODEXLENS_AUTO_EMBED_MISSING=false.';
}
if (config?.embedding_auto_embed_missing === false) {
return 'Automatic embedding warmup is disabled by embedding.auto_embed_missing=false.';
}
if (process.platform === 'win32') {
return 'Automatic embedding warmup is disabled by default on Windows even if CodexLens config resolves auto_embed_missing=true. Set CODEXLENS_AUTO_EMBED_MISSING=true to opt in.';
}
return 'Automatic embedding warmup is disabled.';
}
function getAutoInitMissingDisabledReason(): string {
const envOverride = parseOptionalBooleanEnv(process.env.CODEXLENS_AUTO_INIT_MISSING);
if (envOverride === false) {
return 'Automatic static index warmup is disabled by CODEXLENS_AUTO_INIT_MISSING=false.';
}
if (process.platform === 'win32') {
return 'Automatic static index warmup is disabled by default on Windows. Set CODEXLENS_AUTO_INIT_MISSING=true to opt in.';
}
return 'Automatic static index warmup is disabled.';
}
function buildIndexSuggestions(indexStatus: IndexStatus, scope: SearchScope): SearchSuggestion[] | undefined {
const suggestions: SearchSuggestion[] = [];
if (!indexStatus.indexed) {
suggestions.push({
title: 'Initialize index',
command: formatSmartSearchCommand('init', scope.workingDirectory),
reason: 'No CodexLens index exists for this path yet.',
});
suggestions.push({
title: 'Check index status',
command: formatSmartSearchCommand('status', scope.workingDirectory),
reason: 'Verify whether the target path is mapped to the expected CodexLens project root.',
});
return suggestions;
}
if (!indexStatus.has_embeddings) {
suggestions.push({
title: 'Generate embeddings',
command: formatSmartSearchCommand('embed', scope.workingDirectory),
reason: 'The index exists, but semantic/vector retrieval is unavailable until embeddings are generated.',
});
} else if ((indexStatus.embeddings_coverage_percent ?? 0) < 50) {
suggestions.push({
title: 'Rebuild embeddings',
command: formatSmartSearchCommand('embed', scope.workingDirectory, { force: true }),
reason: `Embedding coverage is only ${(indexStatus.embeddings_coverage_percent ?? 0).toFixed(1)}%, so semantic search quality is degraded.`,
});
}
if (indexStatus.warning?.includes('Failed to parse index status')) {
suggestions.push({
title: 'Re-check status',
command: formatSmartSearchCommand('status', scope.workingDirectory),
reason: 'The index health payload could not be parsed cleanly.',
});
}
return suggestions.length > 0 ? suggestions : undefined;
}
/**
* Check if CodexLens index exists for current directory
* @param path - Directory path to check
* @returns Index status
*/
async function checkIndexStatus(path: string = '.'): Promise<IndexStatus> {
const scope = resolveSearchScope(path);
try {
// Fetch both status and config in parallel
const [statusResult, configResult] = await Promise.all([
executeCodexLens(['index', 'status', scope.workingDirectory], { cwd: scope.workingDirectory }),
executeCodexLens(['config', '--json'], { cwd: scope.workingDirectory }),
]);
// Parse config
const settingsConfig = readCodexLensSettingsSnapshot();
let config: CodexLensConfig | null = Object.keys(settingsConfig).length > 0 ? { ...settingsConfig } : null;
if (configResult.success && configResult.output) {
try {
const cleanConfigOutput = stripAnsi(configResult.output);
const parsedConfig = JSON.parse(cleanConfigOutput);
const configData = parsedConfig.result || parsedConfig;
config = {
...settingsConfig,
config_file: configData.config_file,
index_dir: configData.index_dir,
embedding_backend: normalizeEmbeddingBackend(configData.embedding_backend) ?? settingsConfig.embedding_backend,
embedding_model: typeof configData.embedding_model === 'string' ? configData.embedding_model : settingsConfig.embedding_model,
embedding_auto_embed_missing: typeof configData.embedding_auto_embed_missing === 'boolean'
? configData.embedding_auto_embed_missing
: settingsConfig.embedding_auto_embed_missing,
reranker_enabled: typeof configData.reranker_enabled === 'boolean' ? configData.reranker_enabled : settingsConfig.reranker_enabled,
reranker_backend: typeof configData.reranker_backend === 'string' ? configData.reranker_backend : settingsConfig.reranker_backend,
reranker_model: typeof configData.reranker_model === 'string' ? configData.reranker_model : settingsConfig.reranker_model,
reranker_top_k: typeof configData.reranker_top_k === 'number' ? configData.reranker_top_k : settingsConfig.reranker_top_k,
};
} catch {
// Config parse failed, continue without it
}
}
if (!statusResult.success) {
return {
indexed: false,
has_embeddings: false,
config,
warning: 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.',
};
}
// Parse status output
try {
// Strip ANSI color codes from JSON output
const cleanOutput = stripAnsi(statusResult.output || '{}');
const parsed = JSON.parse(cleanOutput);
// Handle both direct and nested response formats (status returns {success, result: {...}})
const status = parsed.result || parsed;
// Get embeddings coverage from comprehensive status
const embeddingsData = selectEmbeddingsStatusPayload(status);
const legacyEmbeddingsData = asObjectRecord(status.embeddings) ?? {};
const embeddingsSummary = extractEmbeddingsStatusSummary(embeddingsData);
const totalIndexes = Number(legacyEmbeddingsData.total_indexes || asObjectRecord(embeddingsData)?.total_indexes || 0);
const embeddingsCoverage = embeddingsSummary.coveragePercent;
const totalChunks = embeddingsSummary.totalChunks;
const indexed = Boolean(status.projects_count > 0 || status.total_files > 0 || status.index_root || totalIndexes > 0 || totalChunks > 0);
const has_embeddings = embeddingsSummary.hasEmbeddings;
// Extract model info if available
const modelInfoData = asObjectRecord(embeddingsData.model_info);
const modelInfo: ModelInfo | undefined = modelInfoData ? {
model_profile: typeof modelInfoData.model_profile === 'string' ? modelInfoData.model_profile : undefined,
model_name: typeof modelInfoData.model_name === 'string' ? modelInfoData.model_name : undefined,
embedding_dim: typeof modelInfoData.embedding_dim === 'number' ? modelInfoData.embedding_dim : undefined,
backend: typeof modelInfoData.backend === 'string' ? modelInfoData.backend : undefined,
created_at: typeof modelInfoData.created_at === 'string' ? modelInfoData.created_at : undefined,
updated_at: typeof modelInfoData.updated_at === 'string' ? modelInfoData.updated_at : undefined,
} : undefined;
let warning: string | undefined;
if (!indexed) {
warning = 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.';
} else if (embeddingsCoverage === 0) {
warning = 'Index exists but no embeddings generated. Run smart_search(action="embed") to build the vector index.';
} else if (embeddingsCoverage < 50) {
warning = `Embeddings coverage is ${embeddingsCoverage.toFixed(1)}% (below 50%). Hybrid search will degrade. Run smart_search(action="embed") to improve vector coverage.`;
}
return {
indexed,
has_embeddings,
file_count: status.total_files,
embeddings_coverage_percent: embeddingsCoverage,
total_chunks: totalChunks,
// Ensure model_info is null instead of undefined so it's included in JSON
model_info: modelInfo ?? null,
config,
warning,
};
} catch {
return {
indexed: false,
has_embeddings: false,
config,
warning: 'Failed to parse index status',
};
}
} catch {
return {
indexed: false,
has_embeddings: false,
warning: 'CodexLens not available',
};
}
}
/**
* Detection heuristics for intent classification
*/
/**
* Detect literal string query (simple alphanumeric or quoted strings)
*/
function detectLiteral(query: string): boolean {
return /^[a-zA-Z0-9_-]+$/.test(query) || /^["'].*["']$/.test(query);
}
/**
* Detect regex pattern (contains regex metacharacters)
*/
function detectRegex(query: string): boolean {
return /[.*+?^${}()|[\]\\]/.test(query);
}
/**
* Detect natural language query (sentence structure, questions, multi-word phrases)
*/
function detectNaturalLanguage(query: string): boolean {
return query.split(/\s+/).length >= 3 || /\?$/.test(query);
}
/**
* Detect file path query (path separators, file extensions)
*/
function detectFilePath(query: string): boolean {
return /[/\\]/.test(query) || /\.[a-z]{2,4}$/i.test(query);
}
/**
* Detect relationship query (import, export, dependency keywords)
*/
function detectRelationship(query: string): boolean {
return /(import|export|uses?|depends?|calls?|extends?)\s/i.test(query);
}
function looksLikeCodeQuery(query: string): boolean {
if (/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true;
if (/[:.<>\-=(){}[\]]/.test(query) && query.split(/\s+/).length <= 2) return true;
if (/\.\*|\\\(|\\\[|\\s/.test(query)) return true;
if (/^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true;
return false;
}
function queryTargetsGeneratedFiles(query: string): boolean {
return GENERATED_QUERY_RE.test(query.trim());
}
function prefersLexicalPriorityQuery(query: string): boolean {
const trimmed = query.trim();
if (!trimmed) return false;
if (ENV_STYLE_QUERY_RE.test(trimmed)) return true;
const tokens = new Set((trimmed.match(TOPIC_TOKEN_RE) ?? []).map((token) => token.toLowerCase()));
if (tokens.size === 0) return false;
if (tokens.has('factory') || tokens.has('factories')) return true;
if ((tokens.has('environment') || tokens.has('env')) && (tokens.has('variable') || tokens.has('variables'))) {
return true;
}
if (
tokens.has('backend') &&
['embedding', 'embeddings', 'reranker', 'rerankers', 'onnx', 'api', 'litellm', 'fastembed', 'local', 'legacy']
.some((token) => tokens.has(token))
) {
return true;
}
let surfaceHit = false;
let focusHit = false;
for (const token of tokens) {
if (LEXICAL_PRIORITY_SURFACE_TOKENS.has(token)) surfaceHit = true;
if (LEXICAL_PRIORITY_FOCUS_TOKENS.has(token)) focusHit = true;
if (surfaceHit && focusHit) return true;
}
return false;
}
/**
* Classify query intent and recommend search mode
* Simple mapping: hybrid (NL + index + embeddings) | exact (index or insufficient embeddings) | ripgrep (no index)
* @param query - Search query string
* @param hasIndex - Whether CodexLens index exists
* @param hasSufficientEmbeddings - Whether embeddings coverage >= 50%
* @returns Classification result
*/
function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification {
const isNaturalLanguage = detectNaturalLanguage(query);
const isCodeQuery = looksLikeCodeQuery(query);
const isRegexPattern = detectRegex(query);
const targetsGeneratedFiles = queryTargetsGeneratedFiles(query);
const prefersLexicalPriority = prefersLexicalPriorityQuery(query);
let mode: string;
let confidence: number;
if (!hasIndex) {
mode = 'ripgrep';
confidence = 1.0;
} else if (targetsGeneratedFiles || prefersLexicalPriority || isCodeQuery || isRegexPattern) {
mode = 'exact';
confidence = targetsGeneratedFiles ? 0.97 : prefersLexicalPriority ? 0.93 : 0.95;
} else if (isNaturalLanguage && hasSufficientEmbeddings) {
mode = 'hybrid';
confidence = 0.9;
} else {
mode = 'exact';
confidence = 0.8;
}
const detectedPatterns: string[] = [];
if (detectLiteral(query)) detectedPatterns.push('literal');
if (detectRegex(query)) detectedPatterns.push('regex');
if (detectNaturalLanguage(query)) detectedPatterns.push('natural language');
if (detectFilePath(query)) detectedPatterns.push('file path');
if (detectRelationship(query)) detectedPatterns.push('relationship');
if (targetsGeneratedFiles) detectedPatterns.push('generated artifact');
if (prefersLexicalPriority) detectedPatterns.push('lexical priority');
if (isCodeQuery) detectedPatterns.push('code identifier');
const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 'sufficient' : 'insufficient'})`;
return { mode, confidence, reasoning };
}
/**
* Check if a tool is available in PATH
* @param toolName - Tool executable name
* @returns True if available
*/
function checkToolAvailability(
toolName: string,
lookupRuntime: typeof spawnSync = spawnSync,
): boolean {
try {
const isWindows = process.platform === 'win32';
const command = isWindows ? 'where' : 'which';
const result = lookupRuntime(command, [toolName], {
shell: false,
windowsHide: true,
stdio: 'ignore',
timeout: EXEC_TIMEOUTS.SYSTEM_INFO,
env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
});
return !result.error && result.status === 0;
} catch {
return false;
}
}
/**
* Build ripgrep command arguments
* Supports tokenized multi-word queries with OR matching
* @param params - Search parameters
* @returns Command, arguments, and tokens used
*/
function buildRipgrepCommand(params: {
query: string;
paths: string[];
contextLines: number;
maxResults: number;
includeHidden: boolean;
regex?: boolean;
caseSensitive?: boolean;
tokenize?: boolean;
}): { command: string; args: string[]; tokens: string[]; warning?: string; literalFallback: boolean; regex: boolean } {
const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true, tokenize = true } = params;
const queryMode = resolveRipgrepQueryMode(query, regex, tokenize);
const args = [
'-n',
'--color=never',
'--json',
];
// Add file filtering (unless includeHidden is true)
if (!includeHidden) {
args.push(...buildExcludeArgs());
}
// Case sensitivity
if (!caseSensitive) {
args.push('--ignore-case');
}
if (contextLines > 0) {
args.push('-C', contextLines.toString());
}
if (maxResults > 0) {
args.push('--max-count', maxResults.toString());
}
if (includeHidden) {
args.push('--hidden');
}
const { tokens } = queryMode;
if (tokens.length > 1) {
// Multi-token: use multiple -e patterns (OR matching)
// Each token is escaped for regex safety unless regex mode is enabled
for (const token of tokens) {
if (queryMode.regex) {
args.push('-e', token);
} else {
// Escape regex special chars for literal matching
const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
args.push('-e', escaped);
}
}
} else {
// Single token or no tokenization: use original behavior
if (queryMode.regex) {
args.push('-e', query);
} else {
args.push('-F', query);
}
}
args.push(...paths);
return {
command: 'rg',
args,
tokens,
warning: queryMode.warning,
literalFallback: queryMode.literalFallback,
regex: queryMode.regex,
};
}
interface RipgrepChunkAccumulator {
file: string;
chunkLines: ChunkLine[];
firstMatchLine?: number;
firstMatchColumn?: number;
lastLine?: number;
matchCount: number;
}
function finalizeRipgrepChunk(accumulator: RipgrepChunkAccumulator | undefined): ExactMatch | null {
if (!accumulator || accumulator.matchCount === 0 || accumulator.chunkLines.length === 0) {
return null;
}
const firstLine = accumulator.chunkLines[0]?.line ?? accumulator.firstMatchLine ?? 1;
const lastLine = accumulator.chunkLines[accumulator.chunkLines.length - 1]?.line ?? accumulator.firstMatchLine ?? firstLine;
return {
file: accumulator.file,
line: accumulator.firstMatchLine ?? firstLine,
endLine: lastLine,
column: accumulator.firstMatchColumn ?? 1,
content: accumulator.chunkLines.map((line) => line.text).join('\n').trim(),
chunkLines: [...accumulator.chunkLines],
};
}
function parseRipgrepJsonResults(stdout: string, effectiveLimit: number): { results: ExactMatch[]; resultLimitReached: boolean } {
const allResults: ExactMatch[] = [];
const activeChunks = new Map<string, RipgrepChunkAccumulator>();
const lines = stdout.split('\n').filter((line) => line.trim());
let resultLimitReached = false;
const flushChunk = (file: string) => {
const finalized = finalizeRipgrepChunk(activeChunks.get(file));
activeChunks.delete(file);
if (!finalized) {
return;
}
allResults.push(finalized);
if (allResults.length >= effectiveLimit) {
resultLimitReached = true;
}
};
for (const line of lines) {
if (resultLimitReached) {
break;
}
try {
const item = JSON.parse(line);
if (item.type !== 'match' && item.type !== 'context' && item.type !== 'end') {
continue;
}
const file = item.data?.path?.text as string | undefined;
if (!file) {
continue;
}
if (item.type === 'end') {
flushChunk(file);
continue;
}
const lineNumber = typeof item.data?.line_number === 'number' ? item.data.line_number : undefined;
const rawText = typeof item.data?.lines?.text === 'string'
? item.data.lines.text.replace(/\r?\n$/, '')
: '';
if (lineNumber === undefined) {
continue;
}
let current = activeChunks.get(file);
const isContiguous = current && current.lastLine !== undefined && lineNumber <= current.lastLine + 1;
if (!current || !isContiguous) {
if (current) {
flushChunk(file);
if (resultLimitReached) {
break;
}
}
current = {
file,
chunkLines: [],
matchCount: 0,
};
activeChunks.set(file, current);
}
const previousLine = current.chunkLines[current.chunkLines.length - 1];
const duplicateLine = previousLine && previousLine.line === lineNumber && previousLine.text === rawText;
if (!duplicateLine) {
current.chunkLines.push({
line: lineNumber,
text: rawText,
isMatch: item.type === 'match',
});
} else if (item.type === 'match') {
previousLine.isMatch = true;
}
if (item.type === 'match') {
current.matchCount += 1;
if (current.firstMatchLine === undefined) {
current.firstMatchLine = lineNumber;
current.firstMatchColumn =
item.data.submatches && item.data.submatches[0]
? item.data.submatches[0].start + 1
: 1;
}
}
current.lastLine = lineNumber;
} catch {
continue;
}
}
if (!resultLimitReached) {
for (const file of [...activeChunks.keys()]) {
flushChunk(file);
if (resultLimitReached) {
break;
}
}
}
return { results: allResults.slice(0, effectiveLimit), resultLimitReached };
}
function normalizeEmbeddingBackend(backend?: string): string | undefined {
if (!backend) {
return undefined;
}
const normalized = backend.trim().toLowerCase();
if (!normalized) {
return undefined;
}
if (normalized === 'api') {
return 'litellm';
}
if (normalized === 'local') {
return 'fastembed';
}
return normalized;
}
function buildIndexInitArgs(projectPath: string, options: { force?: boolean; languages?: string[]; noEmbeddings?: boolean } = {}): string[] {
const { force = false, languages, noEmbeddings = true } = options;
const args = ['index', 'init', projectPath];
if (noEmbeddings) {
args.push('--no-embeddings');
}
if (force) {
args.push('--force');
}
if (languages && languages.length > 0) {
args.push(...languages.flatMap((language) => ['--language', language]));
}
return args;
}
function resolveEmbeddingSelection(
requestedBackend: string | undefined,
requestedModel: string | undefined,
config: CodexLensConfig | null | undefined,
): { backend?: string; model?: string; preset: 'explicit' | 'config' | 'bulk-local-fast'; note?: string } {
const normalizedRequestedBackend = normalizeEmbeddingBackend(requestedBackend);
const normalizedRequestedModel = requestedModel?.trim() || undefined;
if (normalizedRequestedBackend) {
return {
backend: normalizedRequestedBackend,
model: normalizedRequestedModel || config?.embedding_model,
preset: 'explicit',
};
}
if (normalizedRequestedModel) {
const inferredBackend = config?.embedding_backend
|| (['fast', 'code'].includes(normalizedRequestedModel) ? 'fastembed' : undefined);
return {
backend: inferredBackend,
model: normalizedRequestedModel,
preset: inferredBackend ? 'config' : 'explicit',
};
}
return {
backend: 'fastembed',
model: 'fast',
preset: 'bulk-local-fast',
note: config?.embedding_backend && config.embedding_backend !== 'fastembed'
? `Using recommended bulk indexing preset: local-fast instead of configured ${config.embedding_backend}. Pass embeddingBackend="api" to force remote API embeddings.`
: 'Using recommended bulk indexing preset: local-fast. Pass embeddingBackend="api" to force remote API embeddings.',
};
}
const EMBED_PROGRESS_PREFIX = '__CCW_EMBED_PROGRESS__';
function resolveEmbeddingEndpoints(backend?: string): RotationEndpointConfig[] {
if (backend !== 'litellm') {
return [];
}
try {
return generateRotationEndpoints(getProjectRoot()).filter((endpoint) => {
const apiKey = endpoint.api_key?.trim() ?? '';
return Boolean(
apiKey &&
apiKey.length > 8 &&
!/^\*+$/.test(apiKey) &&
endpoint.api_base?.trim() &&
endpoint.model?.trim()
);
});
} catch {
return [];
}
}
function resolveApiWorkerCount(
requestedWorkers: number | undefined,
backend: string | undefined,
endpoints: RotationEndpointConfig[]
): number | undefined {
if (backend !== 'litellm') {
return undefined;
}
if (typeof requestedWorkers === 'number' && Number.isFinite(requestedWorkers)) {
return Math.max(1, Math.floor(requestedWorkers));
}
if (endpoints.length <= 1) {
return 4;
}
return Math.min(16, Math.max(4, endpoints.length * 2));
}
function extractEmbedJsonLine(stdout: string): string | undefined {
const lines = stdout
.split(/\r?\n/)
.map((line) => line.trim())
.filter(Boolean)
.filter((line) => !line.startsWith(EMBED_PROGRESS_PREFIX));
return [...lines].reverse().find((line) => line.startsWith('{') && line.endsWith('}'));
}
function buildEmbeddingPythonCode(params: {
projectPath: string;
backend?: string;
model?: string;
force: boolean;
maxWorkers?: number;
endpoints?: RotationEndpointConfig[];
}): string {
const { projectPath, backend, model, force, maxWorkers, endpoints = [] } = params;
return `
import json
import sys
from pathlib import Path
from codexlens.storage.path_mapper import PathMapper
from codexlens.storage.registry import RegistryStore
from codexlens.cli.embedding_manager import generate_dense_embeddings_centralized
target_path = Path(r"__PROJECT_PATH__").expanduser().resolve()
backend = __BACKEND__
model = __MODEL__
force = __FORCE__
max_workers = __MAX_WORKERS__
endpoints = json.loads(r'''__ENDPOINTS_JSON__''')
def progress_update(message: str):
print("__CCW_EMBED_PROGRESS__" + str(message), flush=True)
registry = RegistryStore()
registry.initialize()
try:
project = registry.get_project(target_path)
index_root = None
if project is not None:
index_root = Path(project.index_root)
else:
mapper = PathMapper()
index_db = mapper.source_to_index_db(target_path)
if index_db.exists():
index_root = index_db.parent
else:
nearest = registry.find_nearest_index(target_path)
if nearest is not None:
index_root = Path(nearest.index_path).parent
if index_root is None:
print(json.dumps({"success": False, "error": f"No index found for: {target_path}"}), flush=True)
sys.exit(1)
result = generate_dense_embeddings_centralized(
index_root,
embedding_backend=backend,
model_profile=model,
force=force,
use_gpu=True,
max_workers=max_workers,
endpoints=endpoints if endpoints else None,
progress_callback=progress_update,
)
print(json.dumps(result), flush=True)
if not result.get("success"):
sys.exit(1)
finally:
registry.close()
`
.replace('__PROJECT_PATH__', projectPath.replace(/\\/g, '\\\\'))
.replace('__BACKEND__', backend ? JSON.stringify(backend) : 'None')
.replace('__MODEL__', model ? JSON.stringify(model) : 'None')
.replace('__FORCE__', force ? 'True' : 'False')
.replace('__MAX_WORKERS__', typeof maxWorkers === 'number' ? String(Math.max(1, Math.floor(maxWorkers))) : 'None')
.replace('__ENDPOINTS_JSON__', JSON.stringify(endpoints).replace(/\\/g, '\\\\').replace(/'''/g, "\\'\\'\\'"));
}
function spawnBackgroundEmbeddingsViaPython(params: {
projectPath: string;
backend?: string;
model?: string;
force: boolean;
maxWorkers?: number;
endpoints?: RotationEndpointConfig[];
}): { success: boolean; error?: string } {
const { projectPath, backend, model } = params;
try {
const child = getSpawnRuntime()(
getVenvPythonPathRuntime()(),
['-c', buildEmbeddingPythonCode(params)],
buildSmartSearchSpawnOptions(projectPath, {
detached: shouldDetachBackgroundSmartSearchProcess(),
stdio: 'ignore',
}),
);
autoEmbedJobs.set(projectPath, {
startedAt: getNowRuntime(),
backend,
model,
});
const cleanup = () => {
autoEmbedJobs.delete(projectPath);
};
child.on('error', cleanup);
child.on('close', cleanup);
child.unref();
return { success: true };
} catch (error) {
return {
success: false,
error: error instanceof Error ? error.message : String(error),
};
}
}
function spawnBackgroundIndexInit(params: {
projectPath: string;
languages?: string[];
}): { success: boolean; error?: string } {
const { projectPath, languages } = params;
try {
const pythonPath = getVenvPythonPathRuntime()();
if (!existsSync(pythonPath)) {
return {
success: false,
error: 'CodexLens Python environment is not ready yet.',
};
}
const child = getSpawnRuntime()(
pythonPath,
['-m', 'codexlens', ...buildIndexInitArgs(projectPath, { languages })],
buildSmartSearchSpawnOptions(projectPath, {
detached: shouldDetachBackgroundSmartSearchProcess(),
stdio: 'ignore',
}),
);
autoInitJobs.set(projectPath, {
startedAt: getNowRuntime(),
languages,
});
const cleanup = () => {
autoInitJobs.delete(projectPath);
};
child.on('error', cleanup);
child.on('close', cleanup);
child.unref();
return { success: true };
} catch (error) {
return {
success: false,
error: error instanceof Error ? error.message : String(error),
};
}
}
async function maybeStartBackgroundAutoInit(
scope: SearchScope,
indexStatus: IndexStatus,
): Promise<{ note?: string; warning?: string }> {
if (indexStatus.indexed) {
return {};
}
if (!isAutoInitMissingEnabled()) {
return {
note: getAutoInitMissingDisabledReason(),
};
}
if (autoInitJobs.has(scope.workingDirectory)) {
return {
note: 'Background static index build is already running for this path.',
};
}
const spawned = spawnBackgroundIndexInit({
projectPath: scope.workingDirectory,
});
if (!spawned.success) {
return {
warning: `Automatic static index warmup could not start: ${spawned.error}`,
};
}
return {
note: 'Background static index build started for this path. Re-run search shortly for indexed FTS results.',
};
}
async function maybeStartBackgroundAutoEmbed(
scope: SearchScope,
indexStatus: IndexStatus,
): Promise<{ note?: string; warning?: string }> {
if (!indexStatus.indexed || indexStatus.has_embeddings) {
return {};
}
if (!isAutoEmbedMissingEnabled(indexStatus.config)) {
return {
note: getAutoEmbedMissingDisabledReason(indexStatus.config),
};
}
if (autoEmbedJobs.has(scope.workingDirectory)) {
return {
note: 'Background embedding build is already running for this path.',
};
}
const backend = normalizeEmbeddingBackend(indexStatus.config?.embedding_backend) ?? 'fastembed';
const model = indexStatus.config?.embedding_model?.trim() || undefined;
const semanticStatus = await getSemanticStatusRuntime()();
if (!semanticStatus.available) {
return {
warning: 'Automatic embedding warmup skipped because semantic dependencies are not ready.',
};
}
if (backend === 'litellm' && !semanticStatus.litellmAvailable) {
return {
warning: 'Automatic embedding warmup skipped because the LiteLLM embedder is not ready.',
};
}
const endpoints = resolveEmbeddingEndpoints(backend);
const configuredApiMaxWorkers = indexStatus.config?.api_max_workers;
const effectiveApiMaxWorkers = typeof configuredApiMaxWorkers === 'number'
? Math.max(1, Math.floor(configuredApiMaxWorkers))
: resolveApiWorkerCount(undefined, backend, endpoints);
const spawned = spawnBackgroundEmbeddingsViaPython({
projectPath: scope.workingDirectory,
backend,
model,
force: false,
maxWorkers: effectiveApiMaxWorkers,
endpoints,
});
if (!spawned.success) {
return {
warning: `Automatic embedding warmup could not start: ${spawned.error}`,
};
}
return {
note: 'Background embedding build started for this path. Re-run semantic search shortly for vector results.',
};
}
// v1 executeEmbeddingsViaPython removed — v2 uses built-in fastembed models
// v1 executeInitAction removed — replaced by executeInitActionV2
// v1 executeEmbedAction removed — v2 auto-embeds during sync
// v1 executeStatusAction removed — replaced by executeStatusActionV2
// v1 executeUpdateAction and executeWatchAction removed — replaced by V2 versions
// v1 executeFuzzyMode and executeAutoMode removed — v2 bridge handles all search
/**
* Mode: ripgrep - Fast literal string matching using ripgrep
* No index required, fallback to CodexLens if ripgrep unavailable
* Supports tokenized multi-word queries with OR matching and result ranking
*/
async function executeRipgrepMode(params: Params): Promise<SearchResult> {
const { query, paths = [], contextLines = 0, maxResults = 5, extraFilesCount = 10, maxContentLength = 200, includeHidden = false, path = '.', regex = true, caseSensitive = true, tokenize = true, codeOnly = true, withDoc = false, excludeExtensions } = params;
const scope = resolveSearchScope(path, paths);
// withDoc overrides codeOnly
const effectiveCodeOnly = withDoc ? false : codeOnly;
if (!query) {
return {
success: false,
error: 'Query is required for search',
};
}
// Check if ripgrep is available
const hasRipgrep = checkToolAvailability('rg');
// Calculate total to fetch for split (full content + extra files)
const totalToFetch = maxResults + extraFilesCount;
// If ripgrep not available, fall back to CodexLens exact mode
if (!hasRipgrep) {
const readyStatus = await ensureCodexLensReady();
if (!readyStatus.ready) {
return {
success: false,
error: 'Neither ripgrep nor CodexLens available. Install ripgrep (rg) or CodexLens for search functionality.',
};
}
// Use CodexLens fts mode as fallback
const args = ['search', query, '--limit', totalToFetch.toString(), '--method', 'fts', '--json'];
const result = await executeCodexLens(args, { cwd: scope.workingDirectory });
if (!result.success) {
noteCodexLensFtsCompatibility(result.error);
return {
success: false,
error: summarizeBackendError(result.error),
metadata: {
mode: 'ripgrep',
backend: 'codexlens-fallback',
count: 0,
query,
},
};
}
// Parse results
let allResults: SemanticMatch[] = [];
try {
const parsed = JSON.parse(stripAnsi(result.output || '{}'));
const data = parsed.result?.results || parsed.results || parsed;
allResults = (Array.isArray(data) ? data : []).map((item: any) => ({
file: item.path || item.file,
score: item.score || 0,
content: truncateContent(item.content || item.excerpt, maxContentLength),
symbol: item.symbol || null,
}));
} catch {
// Keep empty results
}
const scopedResults = filterResultsToTargetFile(allResults, scope);
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(scopedResults, maxResults, extraFilesCount);
return {
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'ripgrep',
backend: 'codexlens-fallback',
count: results.length,
query,
note: 'Using CodexLens exact mode (ripgrep not available)',
},
};
}
// Use ripgrep - request more results to support split
const { command, args, tokens, warning: queryModeWarning } = buildRipgrepCommand({
query,
paths: scope.searchPaths,
contextLines,
maxResults: totalToFetch, // Fetch more to support split
includeHidden,
regex,
caseSensitive,
tokenize,
});
return new Promise((resolve) => {
const child = getSpawnRuntime()(
command,
args,
buildSmartSearchSpawnOptions(scope.workingDirectory || getProjectRoot(), {
stdio: ['ignore', 'pipe', 'pipe'],
}),
);
let stdout = '';
let stderr = '';
let resultLimitReached = false;
child.stdout?.on('data', (data) => {
stdout += data.toString();
});
child.stderr?.on('data', (data) => {
stderr += data.toString();
});
child.on('close', (code) => {
// Limit total results to prevent memory overflow (--max-count only limits per-file)
const effectiveLimit = totalToFetch > 0 ? totalToFetch : 500;
const parsedResults = parseRipgrepJsonResults(stdout, effectiveLimit);
const allResults = parsedResults.results;
resultLimitReached = parsedResults.resultLimitReached;
// Handle Windows device file errors gracefully (os error 1)
// If we have results despite the error, return them as partial success
const isWindowsDeviceError = stderr.includes('os error 1') || stderr.includes('函数不正确');
// Apply token-based scoring and sorting for multi-word queries
// Results matching more tokens are ranked higher (exact matches first)
const scoredResults = tokens.length > 1 ? scoreByTokenMatch(allResults, tokens) : allResults;
// Apply code-only and extension filtering
const filteredResults = filterNoisyFiles(scoredResults as any[], { codeOnly: effectiveCodeOnly, excludeExtensions });
if (code === 0 || code === 1 || (isWindowsDeviceError && filteredResults.length > 0)) {
// Split results: first N with full content, rest as file paths only
const { results, extra_files } = splitResultsWithExtraFiles(filteredResults, maxResults, extraFilesCount);
// Build warning message for various conditions
const warnings: string[] = [];
if (queryModeWarning) {
warnings.push(queryModeWarning);
}
if (resultLimitReached) {
warnings.push(`Result limit reached (${effectiveLimit}). Use a more specific query or increase limit.`);
}
if (isWindowsDeviceError) {
warnings.push('Some Windows device files were skipped');
}
resolve({
success: true,
results,
extra_files: extra_files.length > 0 ? extra_files : undefined,
metadata: {
mode: 'ripgrep',
backend: 'ripgrep',
count: results.length,
query,
tokens: tokens.length > 1 ? tokens : undefined, // Include tokens in metadata for debugging
tokenized: tokens.length > 1,
...(warnings.length > 0 && { warning: warnings.join('; ') }),
},
});
} else if (isWindowsDeviceError && allResults.length === 0) {
// Windows device error but no results - might be the only issue
resolve({
success: true,
results: [],
metadata: {
mode: 'ripgrep',
backend: 'ripgrep',
count: 0,
query,
warning: 'No matches found (some Windows device files were skipped)',
},
});
} else {
resolve({
success: false,
error: `ripgrep execution failed with code ${code}: ${stderr}`,
results: [],
});
}
});
child.on('error', (error) => {
resolve({
success: false,
error: `Failed to spawn ripgrep: ${error.message}`,
results: [],
});
});
});
}
// ========================================
// codexlens-search v2 bridge integration
// ========================================
/**
* Execute search via codexlens-search (v2) bridge CLI.
* Spawns 'codexlens-search search --query X --top-k Y --db-path Z' and parses JSON output.
*
* @param query - Search query string
* @param topK - Number of results to return
* @param dbPath - Path to the v2 index database directory
* @returns Parsed search results as SemanticMatch array
*/
async function executeCodexLensV2Bridge(
query: string,
topK: number,
dbPath: string,
): Promise<SearchResult> {
return new Promise((resolve) => {
const args = [
'--db-path', dbPath,
'search',
'--query', query,
'--top-k', String(topK),
];
execFile('codexlens-search', args, {
encoding: 'utf-8',
timeout: EXEC_TIMEOUTS.PROCESS_SPAWN,
windowsHide: true,
env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
}, (error, stdout, stderr) => {
if (error) {
console.warn(`[CodexLens-v2] Bridge search failed: ${error.message}`);
resolve({
success: false,
error: `codexlens-search v2 bridge failed: ${error.message}`,
});
return;
}
try {
const parsed = JSON.parse(stdout.trim());
// Bridge outputs {"error": string} on failure
if (parsed && typeof parsed === 'object' && 'error' in parsed) {
resolve({
success: false,
error: `codexlens-search v2: ${parsed.error}`,
});
return;
}
// Bridge outputs array of {path, score, line, end_line, snippet, content}
const raw: Array<{
path?: string; score?: number; line?: number;
end_line?: number; snippet?: string; content?: string;
}> = Array.isArray(parsed) ? parsed : [];
// Build AceLike sections and group by file
const sections: AceLikeSection[] = raw.map(r => ({
path: r.path || '',
line: r.line || undefined,
endLine: r.end_line || undefined,
score: r.score || 0,
symbol: null,
snippet: r.content || r.snippet || '',
}));
const groupMap = new Map<string, AceLikeSection[]>();
for (const s of sections) {
const arr = groupMap.get(s.path) || [];
arr.push(s);
groupMap.set(s.path, arr);
}
const groups: AceLikeGroup[] = Array.from(groupMap.entries()).map(
([path, secs]) => ({ path, sections: secs, total_matches: secs.length })
);
// Render text view with line numbers
const textParts: string[] = [];
for (const s of sections) {
const lineInfo = s.line ? `:${s.line}${s.endLine ? `-${s.endLine}` : ''}` : '';
textParts.push(`Path: ${s.path}${lineInfo}\n${s.snippet}\n`);
}
const aceLikeOutput: AceLikeOutput = {
format: 'ace',
text: textParts.join('\n'),
groups,
sections,
total: sections.length,
};
resolve({
success: true,
results: aceLikeOutput,
metadata: {
mode: 'semantic' as any,
backend: 'codexlens-v2',
count: sections.length,
query,
note: 'Using codexlens-search v2 bridge (2-stage vector + reranking)',
},
});
} catch (parseErr) {
console.warn(`[CodexLens-v2] Failed to parse bridge output: ${(parseErr as Error).message}`);
resolve({
success: false,
error: `Failed to parse codexlens-search v2 output: ${(parseErr as Error).message}`,
output: stdout,
});
}
});
});
}
/**
* Execute a generic codexlens-search v2 bridge subcommand (init, status, sync, watch, etc.).
* Returns parsed JSON output from the bridge CLI.
*/
async function executeV2BridgeCommand(
subcommand: string,
args: string[],
options?: { timeout?: number; dbPath?: string },
): Promise<SearchResult> {
return new Promise((resolve) => {
// --db-path is a global arg and must come BEFORE the subcommand
const globalArgs = options?.dbPath ? ['--db-path', options.dbPath] : [];
const fullArgs = [...globalArgs, subcommand, ...args];
execFile('codexlens-search', fullArgs, {
encoding: 'utf-8',
timeout: options?.timeout ?? EXEC_TIMEOUTS.PROCESS_SPAWN,
windowsHide: true,
env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
}, (error, stdout, stderr) => {
if (error) {
resolve({
success: false,
error: `codexlens-search ${subcommand} failed: ${error.message}`,
});
return;
}
try {
const parsed = JSON.parse(stdout.trim());
if (parsed && typeof parsed === 'object' && 'error' in parsed) {
resolve({ success: false, error: `codexlens-search: ${parsed.error}` });
return;
}
resolve({ success: true, status: parsed, message: parsed.status || `${subcommand} completed`, metadata: { action: subcommand } });
} catch {
resolve({ success: false, error: `Failed to parse codexlens-search ${subcommand} output`, output: stdout });
}
});
});
}
/**
* Action: init (v2) - Initialize index and sync files.
*/
async function executeInitActionV2(params: Params): Promise<SearchResult> {
const { path = '.' } = params;
const scope = resolveSearchScope(path);
const dbPath = join(scope.workingDirectory, '.codexlens');
// Step 1: init empty index
const initResult = await executeV2BridgeCommand('init', [], { dbPath });
if (!initResult.success) return initResult;
// Step 2: sync all files
const syncResult = await executeV2BridgeCommand('sync', [
'--root', scope.workingDirectory,
], { timeout: 1800000, dbPath }); // 30 min for large codebases
return {
success: syncResult.success,
error: syncResult.error,
message: syncResult.success
? `Index initialized and synced for ${scope.workingDirectory}`
: undefined,
metadata: { action: 'init', path: scope.workingDirectory },
status: syncResult.status,
};
}
/**
* Action: status (v2) - Report index statistics.
*/
async function executeStatusActionV2(params: Params): Promise<SearchResult> {
const { path = '.' } = params;
const scope = resolveSearchScope(path);
const dbPath = join(scope.workingDirectory, '.codexlens');
return executeV2BridgeCommand('status', [], { dbPath });
}
/**
* Action: update (v2) - Incremental sync (re-sync changed files).
*/
async function executeUpdateActionV2(params: Params): Promise<SearchResult> {
const { path = '.' } = params;
const scope = resolveSearchScope(path);
const dbPath = join(scope.workingDirectory, '.codexlens');
return executeV2BridgeCommand('sync', [
'--root', scope.workingDirectory,
], { timeout: 600000, dbPath }); // 10 min
}
/**
* Action: watch (v2) - Start file watcher for auto-updates.
*/
async function executeWatchActionV2(params: Params): Promise<SearchResult> {
const { path = '.', debounce = 1000 } = params;
const scope = resolveSearchScope(path);
const dbPath = join(scope.workingDirectory, '.codexlens');
// Watch runs indefinitely — start it with a short initial timeout to confirm startup
const result = await executeV2BridgeCommand('watch', [
'--root', scope.workingDirectory,
'--debounce-ms', debounce.toString(),
], { timeout: 5000, dbPath });
return {
success: true,
message: `File watcher started for ${scope.workingDirectory}. Changes are indexed automatically.`,
metadata: { action: 'watch', path: scope.workingDirectory },
status: result.status,
};
}
// v1 executeCodexLensExactMode removed — v2 bridge handles search
// v1 executeHybridMode removed — v2 bridge handles semantic search
// v1 executeHybridMode removed — v2 bridge handles semantic search
/**
* Query intent used to adapt RRF weights (Python parity).
*
* Keep this logic aligned with CodexLens Python hybrid search:
* `codex-lens/src/codexlens/search/hybrid_search.py`
*/
export type QueryIntent = 'keyword' | 'semantic' | 'mixed';
// Python default: vector 60%, exact 30%, fuzzy 10%
const DEFAULT_RRF_WEIGHTS = {
exact: 0.3,
fuzzy: 0.1,
vector: 0.6,
} as const;
function normalizeWeights(weights: Record<string, number>): Record<string, number> {
const sum = Object.values(weights).reduce((acc, v) => acc + v, 0);
if (!Number.isFinite(sum) || sum <= 0) return { ...weights };
return Object.fromEntries(Object.entries(weights).map(([k, v]) => [k, v / sum]));
}
/**
* Detect query intent using the same heuristic signals as Python:
* - Code patterns: `.`, `::`, `->`, CamelCase, snake_case, common code keywords
* - Natural language patterns: >5 words, question marks, interrogatives, common verbs
*/
export function detectQueryIntent(query: string): QueryIntent {
const trimmed = query.trim();
if (!trimmed) return 'mixed';
const lower = trimmed.toLowerCase();
const wordCount = trimmed.split(/\s+/).filter(Boolean).length;
const hasCodeSignals =
/(::|->|\.)/.test(trimmed) ||
/[A-Z][a-z]+[A-Z]/.test(trimmed) ||
/\b\w+_\w+\b/.test(trimmed) ||
/\b(def|class|function|const|let|var|import|from|return|async|await|interface|type)\b/i.test(lower);
const hasNaturalSignals =
wordCount > 5 ||
/\?/.test(trimmed) ||
/\b(how|what|why|when|where)\b/i.test(trimmed) ||
/\b(handle|explain|fix|implement|create|build|use|find|search|convert|parse|generate|support)\b/i.test(trimmed);
if (hasCodeSignals && hasNaturalSignals) return 'mixed';
if (hasCodeSignals) return 'keyword';
if (hasNaturalSignals) return 'semantic';
return 'mixed';
}
/**
* Intent → weights mapping (Python parity).
* - keyword: exact-heavy
* - semantic: vector-heavy
* - mixed: keep defaults
*/
export function adjustWeightsByIntent(
intent: QueryIntent,
baseWeights: Record<string, number>,
): Record<string, number> {
if (intent === 'keyword') return normalizeWeights({ exact: 0.5, fuzzy: 0.1, vector: 0.4 });
if (intent === 'semantic') return normalizeWeights({ exact: 0.2, fuzzy: 0.1, vector: 0.7 });
return normalizeWeights({ ...baseWeights });
}
export function getRRFWeights(
query: string,
baseWeights: Record<string, number> = DEFAULT_RRF_WEIGHTS,
): Record<string, number> {
return adjustWeightsByIntent(detectQueryIntent(query), baseWeights);
}
/**
* Post-processing: Filter noisy files from semantic search results
* Uses FILTER_CONFIG patterns to remove irrelevant files.
* Optimized: pre-compiled regexes, accurate path segment matching.
*/
// Pre-compile file exclusion regexes once (avoid recompilation in loop)
const FILE_EXCLUDE_REGEXES = [...FILTER_CONFIG.exclude_files].map(pattern =>
new RegExp('^' + pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\\\*/g, '.*') + '$')
);
// Non-code file extensions (for codeOnly filter)
const NON_CODE_EXTENSIONS = new Set([
'md', 'txt', 'json', 'yaml', 'yml', 'xml', 'csv', 'log',
'ini', 'cfg', 'conf', 'toml', 'env', 'properties',
'html', 'htm', 'svg', 'png', 'jpg', 'jpeg', 'gif', 'ico', 'webp',
'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
'lock', 'sum', 'mod',
]);
interface FilterOptions {
excludeExtensions?: string[];
codeOnly?: boolean;
}
function filterNoisyFiles(results: SemanticMatch[], options: FilterOptions = {}): SemanticMatch[] {
const { excludeExtensions = [], codeOnly = false } = options;
// Build extension filter set
const excludedExtSet = new Set(excludeExtensions.map(ext => ext.toLowerCase().replace(/^\./, '')));
if (codeOnly) {
NON_CODE_EXTENSIONS.forEach(ext => excludedExtSet.add(ext));
}
return results.filter(r => {
// Support both 'file' and 'path' field names (different backends use different names)
const filePath = r.file || (r as any).path || '';
if (!filePath) return true;
const segments: string[] = filePath.split(/[/\\]/);
// Accurate directory check: segment must exactly match excluded directory
if (segments.some((segment: string) => FILTER_CONFIG.exclude_directories.has(segment))) {
return false;
}
// Accurate file check: pattern matches filename only (not full path)
const filename = segments.pop() || '';
if (FILE_EXCLUDE_REGEXES.some(regex => regex.test(filename))) {
return false;
}
// Extension filter check
if (excludedExtSet.size > 0) {
const ext = filename.split('.').pop()?.toLowerCase() || '';
if (excludedExtSet.has(ext)) {
return false;
}
}
return true;
});
}
/**
* Post-processing: Boost results containing query keywords
* Extracts keywords from query and boosts matching results.
* Optimized: uses whole-word matching with regex for accuracy.
*/
// Helper to escape regex special characters
function escapeRegExp(str: string): string {
return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
function applyKeywordBoosting(results: SemanticMatch[], query: string): SemanticMatch[] {
// Extract meaningful keywords (ignore common words)
const stopWords = new Set(['the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or', 'because', 'until', 'while', 'although', 'though', 'after', 'before', 'when', 'whenever', 'where', 'wherever', 'whether', 'which', 'who', 'whom', 'whose', 'what', 'whatever', 'whichever', 'whoever', 'whomever', 'this', 'that', 'these', 'those', 'it', 'its']);
const keywords = query
.toLowerCase()
.split(/[\s,.;:()"{}[\]-]+/) // More robust splitting on punctuation
.filter(word => word.length > 2 && !stopWords.has(word));
if (keywords.length === 0) return results;
// Create case-insensitive regexes for whole-word matching
const keywordRegexes = keywords.map(kw => new RegExp(`\\b${escapeRegExp(kw)}\\b`, 'i'));
return results.map(r => {
const content = r.content || '';
const file = r.file || '';
// Count keyword matches using whole-word regex
let matchCount = 0;
for (const regex of keywordRegexes) {
if (regex.test(content) || regex.test(file)) {
matchCount++;
}
}
// Apply boost only if there are matches
if (matchCount > 0) {
const matchRatio = matchCount / keywords.length;
const boost = 1 + (matchRatio * 0.3); // Up to 30% boost for full match
return {
...r,
score: r.score * boost,
};
}
return r;
});
}
/**
* Post-processing: Enforce score diversity
* Penalizes results with identical scores (indicates undifferentiated matching)
*/
function enforceScoreDiversity(results: SemanticMatch[]): SemanticMatch[] {
if (results.length < 2) return results;
// Count occurrences of each score (rounded to 3 decimal places for comparison)
const scoreCounts = new Map<number, number>();
for (const r of results) {
const roundedScore = Math.round(r.score * 1000) / 1000;
scoreCounts.set(roundedScore, (scoreCounts.get(roundedScore) || 0) + 1);
}
// Apply penalty to scores that appear more than twice
return results.map(r => {
const roundedScore = Math.round(r.score * 1000) / 1000;
const count = scoreCounts.get(roundedScore) || 1;
if (count > 2) {
// Progressive penalty: more duplicates = bigger penalty
const penalty = Math.max(0.7, 1 - (count * 0.05));
return { ...r, score: r.score * penalty };
}
return r;
});
}
/**
* Post-processing: Filter results with dominant baseline score (hot spot detection)
* When backend returns default "hot spot" files with identical high scores,
* this function detects and removes them.
*
* Detection criteria:
* - A single score appears in >50% of results
* - That score is suspiciously high (>0.9)
* - This indicates fallback mechanism returned placeholder results
*/
function filterDominantBaselineScores(
results: SemanticMatch[]
): { filteredResults: SemanticMatch[]; baselineInfo: { score: number; count: number } | null } {
if (results.length < 4) {
return { filteredResults: results, baselineInfo: null };
}
// Count occurrences of each score (rounded to 4 decimal places)
const scoreCounts = new Map<number, number>();
results.forEach(r => {
const rounded = Math.round(r.score * 10000) / 10000;
scoreCounts.set(rounded, (scoreCounts.get(rounded) || 0) + 1);
});
// Find the most dominant score
let dominantScore: number | null = null;
let dominantCount = 0;
scoreCounts.forEach((count, score) => {
if (count > dominantCount) {
dominantCount = count;
dominantScore = score;
}
});
// If a single score is present in >50% of results and is high (>0.9),
// treat it as a suspicious baseline score and filter it out
const BASELINE_THRESHOLD = 0.5; // >50% of results have same score
const HIGH_SCORE_THRESHOLD = 0.9; // Score above 0.9 is suspiciously high
if (
dominantScore !== null &&
dominantCount > results.length * BASELINE_THRESHOLD &&
dominantScore > HIGH_SCORE_THRESHOLD
) {
const filteredResults = results.filter(r => {
const rounded = Math.round(r.score * 10000) / 10000;
return rounded !== dominantScore;
});
return {
filteredResults,
baselineInfo: { score: dominantScore, count: dominantCount },
};
}
return { filteredResults: results, baselineInfo: null };
}
/**
* TypeScript implementation of Reciprocal Rank Fusion
* Reference: codex-lens/src/codexlens/search/ranking.py
* Formula: score(d) = Σ weight_source / (k + rank_source(d))
*/
function normalizeFusionSnippet(value: unknown): string | undefined {
if (typeof value !== 'string') {
return undefined;
}
const normalized = value.replace(/\s+/g, ' ').trim();
return normalized ? normalized.slice(0, 240) : undefined;
}
function buildFusionIdentity(result: any): string | null {
const path = typeof result?.file === 'string'
? result.file
: typeof result?.path === 'string'
? result.path
: undefined;
if (!path) {
return null;
}
const line = typeof result?.line === 'number' && Number.isFinite(result.line)
? result.line
: undefined;
const endLine = typeof result?.endLine === 'number' && Number.isFinite(result.endLine)
? result.endLine
: line;
const column = typeof result?.column === 'number' && Number.isFinite(result.column)
? result.column
: undefined;
if (line !== undefined) {
return `${path}#L${line}-${endLine ?? line}:C${column ?? 0}`;
}
const symbol = typeof result?.symbol === 'string' && result.symbol.trim()
? result.symbol.trim()
: undefined;
const snippet = normalizeFusionSnippet(result?.content);
if (symbol && snippet) {
return `${path}::${symbol}::${snippet}`;
}
if (snippet) {
return `${path}::${snippet}`;
}
if (symbol) {
return `${path}::${symbol}`;
}
return path;
}
function scoreFusionRepresentative(result: any): number {
let score = 0;
if (typeof result?.line === 'number' && Number.isFinite(result.line)) {
score += 1000;
}
if (typeof result?.endLine === 'number' && Number.isFinite(result.endLine)) {
score += 250;
}
if (typeof result?.column === 'number' && Number.isFinite(result.column)) {
score += 50;
}
if (Array.isArray(result?.chunkLines) && result.chunkLines.length > 0) {
score += 500 + result.chunkLines.length;
}
if (typeof result?.symbol === 'string' && result.symbol.trim()) {
score += 50;
}
if (typeof result?.content === 'string') {
score += Math.min(result.content.length, 200);
}
return score;
}
function applyRRFFusion(
resultsMap: Map<string, any[]>,
weightsOrQuery: Record<string, number> | string,
limit: number,
k: number = 60,
): any[] {
const weights = typeof weightsOrQuery === 'string' ? getRRFWeights(weightsOrQuery) : weightsOrQuery;
const fusedScores = new Map<string, { score: number; result: any; sources: string[]; representativeScore: number }>();
resultsMap.forEach((results, source) => {
const weight = weights[source] || 0;
if (weight === 0 || !results) return;
results.forEach((result, rank) => {
const identity = buildFusionIdentity(result);
if (!identity) return;
const rrfContribution = weight / (k + rank + 1);
const representativeScore = scoreFusionRepresentative(result);
if (!fusedScores.has(identity)) {
fusedScores.set(identity, { score: 0, result, sources: [], representativeScore });
}
const entry = fusedScores.get(identity)!;
entry.score += rrfContribution;
if (representativeScore > entry.representativeScore) {
entry.result = result;
entry.representativeScore = representativeScore;
}
if (!entry.sources.includes(source)) {
entry.sources.push(source);
}
});
});
// Sort by fusion score descending
return Array.from(fusedScores.values())
.sort((a, b) => b.score - a.score)
.slice(0, limit)
.map(item => ({
...item.result,
fusion_score: item.score,
matched_backends: item.sources,
}));
}
/**
* Promise wrapper with timeout support
* @param promise - The promise to wrap
* @param ms - Timeout in milliseconds
* @param modeName - Name of the mode for error message
* @returns A new promise that rejects on timeout
*/
function withTimeout<T>(promise: Promise<T>, ms: number, modeName: string): Promise<T> {
return new Promise((resolve, reject) => {
const timer = setTimeout(() => {
reject(new Error(`'${modeName}' search timed out after ${ms}ms`));
}, ms);
promise
.then(resolve)
.catch(reject)
.finally(() => clearTimeout(timer));
});
}
// v1 executePriorityFallbackMode removed — v2 bridge + ripgrep fallback handles all search
// Tool schema for MCP
export const schema: ToolSchema = {
name: 'smart_search',
description: `Unified code search tool powered by codexlens-search v2 (2-stage vector + FTS5 + reranking).
Recommended flow: use **action=\"search\"** for lookups, **action=\"init\"** to build the semantic index, and **action=\"update\"** when files change.
**Actions & Required Parameters:**
* **search** (default): Semantic code search with ripgrep fallback.
* **query** (string, **REQUIRED**): Content to search for.
* *limit* (number): Max results (default: 5).
* *path* (string): Directory or single file to search (default: current directory).
* *contextLines* (number): Context lines around matches (default: 0).
* *regex* (boolean): Use regex matching in ripgrep fallback (default: true).
* *caseSensitive* (boolean): Case-sensitive search (default: true).
* **find_files**: Find files by path/name pattern.
* **pattern** (string, **REQUIRED**): Glob pattern (e.g., "*.ts", "src/**/*.js").
* *limit* (number): Max results (default: 20).
* *offset* (number): Pagination offset (default: 0).
* *includeHidden* (boolean): Include hidden files (default: false).
* **init**: Initialize v2 semantic index and sync all files.
* *path* (string): Directory to index (default: current).
* **status**: Check v2 index statistics. (No required params)
* **update**: Incremental sync for changed files.
* *path* (string): Directory to update (default: current).
* **watch**: Start file watcher for auto-updates.
* *path* (string): Directory to watch (default: current).
**Examples:**
smart_search(query="authentication logic") # Semantic search (default)
smart_search(action="init", path="/project") # Build v2 index
smart_search(action="update", path="/project") # Sync changed files
smart_search(query="auth", limit=10, offset=0) # Paginated search`,
inputSchema: {
type: 'object',
properties: {
action: {
type: 'string',
enum: ['init', 'search', 'find_files', 'status', 'update', 'watch', 'search_files'],
description: 'Action: search (semantic search, default), find_files (path pattern matching), init (build v2 index), status (check index), update (incremental sync), watch (auto-update watcher). Note: search_files is deprecated.',
default: 'search',
},
query: {
type: 'string',
description: 'Content search query (for action="search").',
},
pattern: {
type: 'string',
description: 'Glob pattern for file discovery (for action="find_files"). Examples: "*.ts", "src/**/*.js", "test_*.py"',
},
mode: {
type: 'string',
enum: SEARCH_MODES,
description: 'Search mode: fuzzy (v2 semantic + ripgrep fallback, default) or semantic (v2 semantic search only).',
default: 'fuzzy',
},
output_mode: {
type: 'string',
enum: [...SEARCH_OUTPUT_MODES],
description: 'Output format: ace (default, ACE-style grouped code sections + rendered text), full (raw matches), files_only (paths only), count (per-file counts)',
default: 'ace',
},
path: {
type: 'string',
description: 'Directory path for init/search actions (default: current directory). For action=search, a single file path is also accepted and results are automatically scoped back to that file.',
},
paths: {
type: 'array',
description: 'Multiple paths to search within (for search action)',
items: {
type: 'string',
},
default: [],
},
contextLines: {
type: 'number',
description: 'Number of context lines around matches (exact mode only)',
default: 0,
},
maxResults: {
type: 'number',
description: 'Maximum number of full-content results (default: 5)',
default: 5,
},
limit: {
type: 'number',
description: 'Alias for maxResults (default: 5)',
default: 5,
},
extraFilesCount: {
type: 'number',
description: 'Number of additional file-only results (paths without content)',
default: 10,
},
maxContentLength: {
type: 'number',
description: 'Maximum content length for truncation (50-2000)',
default: 200,
},
offset: {
type: 'number',
description: 'Pagination offset - skip first N results (default: 0)',
default: 0,
},
includeHidden: {
type: 'boolean',
description: 'Include hidden files/directories',
default: false,
},
force: {
type: 'boolean',
description: 'Force full rebuild for action="init".',
default: false,
},
regex: {
type: 'boolean',
description: 'Use regex pattern matching instead of literal string (ripgrep mode only). Default: enabled. Example: smart_search(query="class.*Builder")',
default: true,
},
caseSensitive: {
type: 'boolean',
description: 'Case-sensitive search (default: true). Set to false for case-insensitive matching.',
default: true,
},
tokenize: {
type: 'boolean',
description: 'Tokenize multi-word queries for OR matching (ripgrep mode). Default: true. Results are ranked by token match count (exact matches first).',
default: true,
},
},
required: [],
},
};
/**
* Action: find_files - Find files by path/name pattern (glob matching)
* Unlike search which looks inside file content, find_files matches file paths
*/
async function executeFindFilesAction(params: Params): Promise<SearchResult> {
const { pattern, path = '.', limit = 20, offset = 0, includeHidden = false, caseSensitive = true } = params;
const scope = resolveSearchScope(path);
if (!pattern) {
return {
success: false,
error: 'Pattern is required for find_files action. Use glob patterns like "*.ts", "src/**/*.js", or "test_*.py"',
};
}
// Use ripgrep with --files flag for fast file listing with glob pattern
const hasRipgrep = checkToolAvailability('rg');
if (!hasRipgrep) {
// Fallback to CodexLens file listing if available
const readyStatus = await ensureCodexLensReady();
if (!readyStatus.ready) {
return {
success: false,
error: 'Neither ripgrep nor CodexLens available for file discovery.',
};
}
// Try CodexLens file list command
const args = ['list-files', '--json'];
const result = await executeCodexLens(args, { cwd: scope.workingDirectory });
if (!result.success) {
return {
success: false,
error: `Failed to list files: ${result.error}`,
};
}
// Parse and filter results by pattern
let files: string[] = [];
try {
const parsed = JSON.parse(stripAnsi(result.output || '[]'));
files = Array.isArray(parsed) ? parsed : (parsed.files || []);
} catch {
return {
success: false,
error: 'Failed to parse file list from CodexLens',
};
}
// Apply glob pattern matching using minimatch-style regex
const globRegex = globToRegex(pattern, caseSensitive);
const matchedFiles = files.filter(f => globRegex.test(f));
// Apply pagination
const total = matchedFiles.length;
const paginatedFiles = matchedFiles.slice(offset, offset + limit);
const results: FileMatch[] = paginatedFiles.map(filePath => {
const parts = filePath.split(/[/\\]/);
const name = parts[parts.length - 1] || '';
const ext = name.includes('.') ? name.split('.').pop() : undefined;
return {
path: filePath,
type: 'file' as const,
name,
extension: ext,
};
});
return {
success: true,
results,
metadata: {
pattern,
backend: 'codexlens',
count: results.length,
pagination: {
offset,
limit,
total,
has_more: offset + limit < total,
},
},
};
}
// Use ripgrep --files with glob pattern for fast file discovery
return new Promise((resolve) => {
const args = ['--files'];
// Add exclude patterns
if (!includeHidden) {
args.push(...buildExcludeArgs());
} else {
args.push('--hidden');
}
// Add glob pattern
args.push('--glob', pattern);
// Case sensitivity for glob matching
if (!caseSensitive) {
args.push('--iglob', pattern);
// Remove the case-sensitive glob and use iglob instead
const globIndex = args.indexOf('--glob');
if (globIndex !== -1) {
args.splice(globIndex, 2);
}
}
const child = getSpawnRuntime()(
'rg',
args,
buildSmartSearchSpawnOptions(scope.workingDirectory || getProjectRoot(), {
stdio: ['ignore', 'pipe', 'pipe'],
}),
);
let stdout = '';
let stderr = '';
child.stdout?.on('data', (data) => {
stdout += data.toString();
});
child.stderr?.on('data', (data) => {
stderr += data.toString();
});
child.on('close', (code) => {
// ripgrep returns 1 when no matches found, which is not an error
if (code !== 0 && code !== 1 && !stderr.includes('os error 1')) {
resolve({
success: false,
error: `ripgrep file search failed: ${stderr}`,
});
return;
}
const allFiles = stdout.split('\n').filter(line => line.trim());
const total = allFiles.length;
// Apply pagination
const paginatedFiles = allFiles.slice(offset, offset + limit);
const results: FileMatch[] = paginatedFiles.map(filePath => {
const normalizedPath = filePath.replace(/\\/g, '/');
const parts = normalizedPath.split('/');
const name = parts[parts.length - 1] || '';
const ext = name.includes('.') ? name.split('.').pop() : undefined;
return {
path: normalizedPath,
type: 'file' as const,
name,
extension: ext,
};
});
resolve({
success: true,
results,
metadata: {
pattern,
backend: 'ripgrep',
count: results.length,
pagination: {
offset,
limit,
total,
has_more: offset + limit < total,
},
},
});
});
child.on('error', (error) => {
resolve({
success: false,
error: `Failed to spawn ripgrep: ${error.message}`,
});
});
});
}
/**
* Convert glob pattern to regex for file matching
* Supports: *, **, ?, [abc], [!abc]
*/
function globToRegex(pattern: string, caseSensitive: boolean = true): RegExp {
let i = 0;
const out: string[] = [];
const special = '.^$+{}|()';
while (i < pattern.length) {
const c = pattern[i];
if (c === '*') {
if (i + 1 < pattern.length && pattern[i + 1] === '*') {
// ** matches any path including /
out.push('.*');
i += 2;
// Skip following / if present
if (pattern[i] === '/') {
i++;
}
continue;
} else {
// * matches any character except /
out.push('[^/]*');
}
} else if (c === '?') {
out.push('[^/]');
} else if (c === '[') {
// Character class
let j = i + 1;
let negated = false;
if (pattern[j] === '!' || pattern[j] === '^') {
negated = true;
j++;
}
let classContent = '';
while (j < pattern.length && pattern[j] !== ']') {
classContent += pattern[j];
j++;
}
if (negated) {
out.push(`[^${classContent}]`);
} else {
out.push(`[${classContent}]`);
}
i = j;
} else if (special.includes(c)) {
out.push('\\' + c);
} else {
out.push(c);
}
i++;
}
const flags = caseSensitive ? '' : 'i';
return new RegExp('^' + out.join('') + '$', flags);
}
/**
* Apply pagination to search results and add pagination metadata
*/
function applyPagination<T>(
results: T[],
offset: number,
limit: number
): { paginatedResults: T[]; pagination: PaginationInfo } {
const total = results.length;
const paginatedResults = results.slice(offset, offset + limit);
return {
paginatedResults,
pagination: {
offset,
limit,
total,
has_more: offset + limit < total,
},
};
}
function formatChunkRange(section: AceLikeSection): string {
if (section.lines && section.lines.length > 0) {
const start = section.lines[0]?.line;
const end = section.lines[section.lines.length - 1]?.line;
if (typeof start === 'number' && typeof end === 'number' && end > start) {
return `${start}-${end}`;
}
if (typeof start === 'number') {
return String(start);
}
}
if (section.line && section.endLine && section.endLine > section.line) {
return `${section.line}-${section.endLine}`;
}
if (section.line) {
return String(section.line);
}
return '?';
}
function renderAceSnippet(section: AceLikeSection): string[] {
if (section.lines && section.lines.length > 0) {
return section.lines.map((line) => {
const marker = line.isMatch ? '>' : ' ';
return `${marker} ${String(line.line).padStart(4, ' ')} | ${line.text}`;
});
}
return section.snippet.split(/\r?\n/).map((line) => ` ${line}`);
}
function formatAceLikeOutput(
results: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown[],
): AceLikeOutput {
const sections: AceLikeSection[] = [];
for (const result of results) {
const candidate = result as Record<string, unknown>;
const path = typeof candidate.file === 'string'
? candidate.file
: typeof candidate.path === 'string'
? candidate.path
: undefined;
if (!path) {
continue;
}
const line = typeof candidate.line === 'number' && candidate.line > 0 ? candidate.line : undefined;
const column = typeof candidate.column === 'number' && candidate.column > 0 ? candidate.column : undefined;
const score = typeof candidate.score === 'number' ? candidate.score : undefined;
const symbol = typeof candidate.symbol === 'string' ? candidate.symbol : null;
const rawSnippet = typeof candidate.content === 'string'
? candidate.content
: typeof candidate.name === 'string'
? candidate.name
: typeof candidate.type === 'string'
? `[${candidate.type}]`
: '';
sections.push({
path,
line,
endLine: typeof candidate.endLine === 'number' && candidate.endLine >= (line ?? 0) ? candidate.endLine : line,
column,
score,
symbol,
snippet: rawSnippet || '[no snippet available]',
lines: Array.isArray(candidate.chunkLines) ? candidate.chunkLines as ChunkLine[] : undefined,
});
}
const groupsMap = new Map<string, AceLikeGroup>();
for (const section of sections) {
if (!groupsMap.has(section.path)) {
groupsMap.set(section.path, {
path: section.path,
sections: [],
total_matches: 0,
});
}
const group = groupsMap.get(section.path)!;
group.sections.push(section);
group.total_matches += 1;
}
const groups = [...groupsMap.values()];
const textParts = ['The following code sections were retrieved:'];
for (const group of groups) {
textParts.push('');
textParts.push(`Path: ${group.path}`);
group.sections.forEach((section, index) => {
const chunkLabel = group.sections.length > 1 ? `Chunk ${index + 1}` : 'Chunk';
textParts.push(`${chunkLabel}: lines ${formatChunkRange(section)}${section.score !== undefined ? ` | score=${section.score.toFixed(4)}` : ''}`);
if (section.symbol) {
textParts.push(`Symbol: ${section.symbol}`);
}
for (const snippetLine of renderAceSnippet(section)) {
textParts.push(snippetLine);
}
if (index < group.sections.length - 1) {
textParts.push('');
}
});
}
return {
format: 'ace',
text: textParts.join('\n'),
groups,
sections,
total: sections.length,
};
}
/**
* Transform results based on output_mode
*/
function transformOutput(
results: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown[],
outputMode: SearchOutputMode
): unknown {
if (!Array.isArray(results)) {
return results;
}
switch (outputMode) {
case 'files_only': {
// Extract unique file paths
const files = [...new Set(results.map((r: any) => r.file))].filter(Boolean);
return { files, count: files.length };
}
case 'count': {
// Count matches per file
const counts: Record<string, number> = {};
for (const r of results) {
const file = (r as any).file;
if (file) {
counts[file] = (counts[file] || 0) + 1;
}
}
return {
files: Object.entries(counts).map(([file, count]) => ({ file, count })),
total: results.length,
};
}
case 'ace':
return formatAceLikeOutput(results);
case 'full':
default:
return results;
}
}
function enrichMetadataWithIndexStatus(
metadata: SearchMetadata | undefined,
indexStatus: IndexStatus,
scope: SearchScope,
): SearchMetadata {
const nextMetadata: SearchMetadata = { ...(metadata ?? {}) };
nextMetadata.embeddings_coverage_percent = indexStatus.embeddings_coverage_percent;
nextMetadata.index_status = indexStatus.indexed
? (indexStatus.has_embeddings ? 'indexed' : 'partial')
: 'not_indexed';
nextMetadata.reranker_enabled = indexStatus.config?.reranker_enabled;
nextMetadata.reranker_backend = indexStatus.config?.reranker_backend;
nextMetadata.reranker_model = indexStatus.config?.reranker_model;
nextMetadata.cascade_strategy = indexStatus.config?.cascade_strategy;
nextMetadata.staged_stage2_mode = indexStatus.config?.staged_stage2_mode;
nextMetadata.static_graph_enabled = indexStatus.config?.static_graph_enabled;
nextMetadata.warning = mergeWarnings(nextMetadata.warning, indexStatus.warning);
nextMetadata.suggestions = mergeSuggestions(nextMetadata.suggestions, buildIndexSuggestions(indexStatus, scope));
return nextMetadata;
}
// Handler function
export async function handler(params: Record<string, unknown>): Promise<ToolResult<SearchResult>> {
const parsed = ParamsSchema.safeParse(params);
if (!parsed.success) {
return { success: false, error: `Invalid params: ${parsed.error.message}` };
}
parsed.data.query = sanitizeSearchQuery(parsed.data.query);
parsed.data.pattern = sanitizeSearchPath(parsed.data.pattern);
parsed.data.path = sanitizeSearchPath(parsed.data.path);
parsed.data.paths = parsed.data.paths.map((item) => sanitizeSearchPath(item) || item);
const { action, mode, output_mode, offset = 0 } = parsed.data;
// Sync limit and maxResults while preserving explicit small values.
// If both are provided, use the larger one. If only one is provided, honor it.
const rawLimit = typeof params.limit === 'number' ? params.limit : undefined;
const rawMaxResults = typeof params.maxResults === 'number' ? params.maxResults : undefined;
const effectiveLimit = rawLimit !== undefined && rawMaxResults !== undefined
? Math.max(rawLimit, rawMaxResults)
: rawMaxResults ?? rawLimit ?? parsed.data.maxResults ?? parsed.data.limit ?? 5;
parsed.data.maxResults = effectiveLimit;
parsed.data.limit = effectiveLimit;
// Track if search_files was used (deprecated)
let deprecationWarning: string | undefined;
try {
let result: SearchResult;
// Handle actions — all routed through codexlens-search v2 bridge
switch (action) {
case 'init':
result = await executeInitActionV2(parsed.data);
break;
case 'status':
result = await executeStatusActionV2(parsed.data);
break;
case 'find_files':
result = await executeFindFilesAction(parsed.data);
break;
case 'update':
result = await executeUpdateActionV2(parsed.data);
break;
case 'watch':
result = await executeWatchActionV2(parsed.data);
break;
case 'search_files':
// DEPRECATED: Redirect to search with files_only output
deprecationWarning = 'action="search_files" is deprecated. Use action="search" with output_mode="files_only" for content-to-files search, or action="find_files" for path pattern matching.';
parsed.data.output_mode = 'files_only';
// Fall through to search
case 'search':
default: {
// v2 bridge for semantic search
const scope = resolveSearchScope(parsed.data.path ?? '.');
const dbPath = join(scope.workingDirectory, '.codexlens');
const topK = (parsed.data.maxResults || 5) + (parsed.data.extraFilesCount || 10);
const v2Result = await executeCodexLensV2Bridge(parsed.data.query || '', topK, dbPath);
if (v2Result.success) {
result = v2Result;
break;
}
// v2 failed — fall back to ripgrep-only search
console.warn(`[CodexLens-v2] Bridge failed, falling back to ripgrep: ${v2Result.error}`);
result = await executeRipgrepMode(parsed.data);
break;
}
}
let backgroundNote: string | undefined;
// Transform output based on output_mode (for search actions only)
if (action === 'search' || action === 'search_files') {
// Add pagination metadata for search results if not already present
if (result.success && result.results && Array.isArray(result.results)) {
const totalResults = (result.results as any[]).length;
if (!result.metadata) {
result.metadata = {};
}
if (!result.metadata.pagination) {
result.metadata.pagination = {
offset: 0,
limit: effectiveLimit,
total: totalResults,
has_more: false, // Already limited by backend
};
}
}
if (result.success && result.results && output_mode !== 'full') {
result.results = transformOutput(result.results as any[], output_mode);
if (
output_mode === 'ace'
&& result.results
&& typeof result.results === 'object'
&& 'format' in result.results
&& result.results.format === 'ace'
) {
const advisoryLines: string[] = [];
if (result.metadata?.warning) {
advisoryLines.push('', 'Warnings:', `- ${result.metadata.warning}`);
}
if (backgroundNote) {
advisoryLines.push('', 'Notes:', `- ${backgroundNote}`);
}
if (result.metadata?.suggestions && result.metadata.suggestions.length > 0) {
advisoryLines.push('', 'Suggestions:');
for (const suggestion of result.metadata.suggestions) {
advisoryLines.push(`- ${suggestion.title}: ${suggestion.command}`);
advisoryLines.push(` ${suggestion.reason}`);
}
}
const aceResults = result.results as AceLikeOutput;
if (advisoryLines.length > 0) {
aceResults.text += `\n${advisoryLines.join('\n')}`;
}
}
}
}
// Add deprecation warning if applicable
if (deprecationWarning && result.metadata) {
result.metadata.warning = deprecationWarning;
}
return result.success ? { success: true, result } : { success: false, error: result.error };
} catch (error) {
return { success: false, error: (error as Error).message };
}
}
/**
* Execute init action with external progress callback
* Used by MCP server for streaming progress
* @param params - Search parameters (path, languages, force)
* @param onProgress - Optional callback for progress updates
*/
export const __testables = {
isCodexLensCliCompatibilityError,
shouldSurfaceCodexLensFtsCompatibilityWarning,
buildSmartSearchSpawnOptions,
shouldDetachBackgroundSmartSearchProcess,
checkToolAvailability,
parseCodexLensJsonOutput,
parsePlainTextFileMatches,
hasCentralizedVectorArtifacts,
extractEmbeddingsStatusSummary,
selectEmbeddingsStatusPayload,
resolveRipgrepQueryMode,
queryTargetsGeneratedFiles,
prefersLexicalPriorityQuery,
classifyIntent,
resolveEmbeddingSelection,
parseOptionalBooleanEnv,
isAutoInitMissingEnabled,
isAutoEmbedMissingEnabled,
getAutoInitMissingDisabledReason,
getAutoEmbedMissingDisabledReason,
buildIndexSuggestions,
maybeStartBackgroundAutoInit,
maybeStartBackgroundAutoEmbed,
__setRuntimeOverrides(overrides: Partial<SmartSearchRuntimeOverrides>) {
Object.assign(runtimeOverrides, overrides);
},
__resetRuntimeOverrides() {
for (const key of Object.keys(runtimeOverrides) as Array<keyof SmartSearchRuntimeOverrides>) {
delete runtimeOverrides[key];
}
},
__resetBackgroundJobs() {
autoInitJobs.clear();
autoEmbedJobs.clear();
},
};
export async function executeInitWithProgress(
params: Record<string, unknown>,
onProgress?: (progress: ProgressInfo) => void
): Promise<SearchResult> {
const path = (params.path as string) || '.';
const scope = resolveSearchScope(path);
const dbPath = join(scope.workingDirectory, '.codexlens');
// Notify progress start
if (onProgress) {
onProgress({ stage: 'init', message: 'Initializing v2 index...', percent: 0 } as ProgressInfo);
}
// Step 1: init empty index
const initResult = await executeV2BridgeCommand('init', [], { dbPath });
if (!initResult.success) return initResult;
if (onProgress) {
onProgress({ stage: 'sync', message: 'Syncing files...', percent: 10 } as ProgressInfo);
}
// Step 2: sync all files
const syncResult = await executeV2BridgeCommand('sync', [
'--root', scope.workingDirectory,
], { timeout: 1800000, dbPath });
if (onProgress) {
onProgress({ stage: 'complete', message: 'Index build complete', percent: 100 } as ProgressInfo);
}
return {
success: syncResult.success,
error: syncResult.error,
message: syncResult.success
? `v2 index created and synced for ${scope.workingDirectory}`
: undefined,
metadata: { action: 'init', path: scope.workingDirectory },
status: syncResult.status,
};
}