From fd4a15c84ef2cf604b85267b34cf39d024a910f2 Mon Sep 17 00:00:00 2001 From: catlog22 Date: Sat, 20 Dec 2025 21:44:15 +0800 Subject: [PATCH] fix: improve chunking logic in Chunker class and enhance smart search tool with comprehensive features - Updated the Chunker class to adjust the window movement logic, ensuring proper handling of overlap lines. - Introduced a new smart search tool with features including intent classification, CodexLens integration, multi-backend search routing, and index status checking. - Implemented various search modes (auto, hybrid, exact, ripgrep, priority) with detailed metadata and error handling. - Added support for progress tracking during index initialization and enhanced output transformation based on user-defined modes. - Included comprehensive documentation for usage and parameters in the smart search tool. --- ccw/src/core/routes/codexlens-routes.ts | 5 +- ccw/src/templates/dashboard-js/i18n.js | 16 + .../dashboard-js/views/cli-manager.js | 22 +- .../dashboard-js/views/codexlens-manager.js | 84 +- ccw/src/tools/codex-lens.ts | 118 +- ccw/src/tools/smart-search.ts | 786 ++++++++++- ccw/src/tools/smart-search.ts.backup | 1233 +++++++++++++++++ .../src/codexlens/cli/embedding_manager.py | 234 ++-- codex-lens/src/codexlens/semantic/chunker.py | 9 +- 9 files changed, 2289 insertions(+), 218 deletions(-) create mode 100644 ccw/src/tools/smart-search.ts.backup diff --git a/ccw/src/core/routes/codexlens-routes.ts b/ccw/src/core/routes/codexlens-routes.ts index e91f198b..012f7f14 100644 --- a/ccw/src/core/routes/codexlens-routes.ts +++ b/ccw/src/core/routes/codexlens-routes.ts @@ -384,13 +384,16 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise // API: CodexLens Init (Initialize workspace index) if (pathname === '/api/codexlens/init' && req.method === 'POST') { handlePostRequest(req, res, async (body) => { - const { path: projectPath, indexType = 'vector' } = body; + const { path: projectPath, indexType = 'vector', embeddingModel = 'code' } = body; const targetPath = projectPath || initialPath; // Build CLI arguments based on index type const args = ['init', targetPath, '--json']; if (indexType === 'normal') { args.push('--no-embeddings'); + } else { + // Add embedding model selection for vector index + args.push('--embedding-model', embeddingModel); } // Broadcast start event diff --git a/ccw/src/templates/dashboard-js/i18n.js b/ccw/src/templates/dashboard-js/i18n.js index 5dcc4a5f..b438740a 100644 --- a/ccw/src/templates/dashboard-js/i18n.js +++ b/ccw/src/templates/dashboard-js/i18n.js @@ -275,6 +275,7 @@ const i18n = { 'codexlens.semanticInstalled': 'Semantic dependencies installed', 'codexlens.semanticNotInstalled': 'Semantic dependencies not installed', 'codexlens.installDeps': 'Install Dependencies', + 'codexlens.installDepsPrompt': 'Would you like to install them now? 
(This may take a few minutes)\n\nClick "Cancel" to create FTS index only.', 'codexlens.installingDeps': 'Installing dependencies...', 'codexlens.depsInstalled': 'Dependencies installed successfully', 'codexlens.depsInstallFailed': 'Failed to install dependencies', @@ -324,8 +325,15 @@ const i18n = { 'index.cleanAllSuccess': 'All indexes cleaned', 'index.vectorIndex': 'Vector', 'index.normalIndex': 'FTS', + 'index.fullIndex': 'Full Index', 'index.vectorDesc': 'Semantic search with embeddings', 'index.normalDesc': 'Fast full-text search only', + 'index.fullDesc': 'FTS + Semantic search (recommended)', + 'index.selectModel': 'Select embedding model', + 'index.modelCode': 'Code (768d)', + 'index.modelFast': 'Fast (384d)', + 'index.modelMultilingual': 'Multilingual (1024d)', + 'index.modelBalanced': 'Balanced (1024d)', // Semantic Search Configuration 'semantic.settings': 'Semantic Search Settings', @@ -1596,6 +1604,7 @@ const i18n = { 'codexlens.semanticInstalled': '语义搜索依赖已安装', 'codexlens.semanticNotInstalled': '语义搜索依赖未安装', 'codexlens.installDeps': '安装依赖', + 'codexlens.installDepsPrompt': '是否立即安装?(可能需要几分钟)\n\n点击"取消"将只创建 FTS 索引。', 'codexlens.installingDeps': '安装依赖中...', 'codexlens.depsInstalled': '依赖安装成功', 'codexlens.depsInstallFailed': '依赖安装失败', @@ -1645,8 +1654,15 @@ const i18n = { 'index.cleanAllSuccess': '所有索引已清理', 'index.vectorIndex': '向量索引', 'index.normalIndex': 'FTS索引', + 'index.fullIndex': '全部索引', 'index.vectorDesc': '语义搜索(含嵌入向量)', 'index.normalDesc': '快速全文搜索', + 'index.fullDesc': 'FTS + 语义搜索(推荐)', + 'index.selectModel': '选择嵌入模型', + 'index.modelCode': '代码优化 (768维)', + 'index.modelFast': '快速轻量 (384维)', + 'index.modelMultilingual': '多语言 (1024维)', + 'index.modelBalanced': '高精度 (1024维)', // Semantic Search 配置 'semantic.settings': '语义搜索设置', diff --git a/ccw/src/templates/dashboard-js/views/cli-manager.js b/ccw/src/templates/dashboard-js/views/cli-manager.js index 859c684d..9f79b82c 100644 --- a/ccw/src/templates/dashboard-js/views/cli-manager.js +++ b/ccw/src/templates/dashboard-js/views/cli-manager.js @@ -338,6 +338,17 @@ async function renderCliManager() { if (window.lucide) lucide.createIcons(); } +// ========== Helper Functions ========== + +/** + * Get selected embedding model from dropdown + * @returns {string} Selected model profile (code, fast, multilingual, balanced) + */ +function getSelectedModel() { + var select = document.getElementById('codexlensModelSelect'); + return select ? select.value : 'code'; +} + // ========== Tools Section (Left Column) ========== function renderToolsSection() { var container = document.getElementById('tools-section'); @@ -392,8 +403,15 @@ function renderToolsSection() { '
' + (codexLensStatus.ready ? ' v' + (codexLensStatus.version || 'installed') + '' + - '' + - '' + + '' + + '' + + '' + + '' + '' : ' ' + t('cli.notInstalled') + '' + '') + diff --git a/ccw/src/templates/dashboard-js/views/codexlens-manager.js b/ccw/src/templates/dashboard-js/views/codexlens-manager.js index 18c665b2..6e20c09d 100644 --- a/ccw/src/templates/dashboard-js/views/codexlens-manager.js +++ b/ccw/src/templates/dashboard-js/views/codexlens-manager.js @@ -554,10 +554,54 @@ async function deleteModel(profile) { /** * Initialize CodexLens index with bottom floating progress bar - * @param {string} indexType - 'vector' (with embeddings) or 'normal' (FTS only) + * @param {string} indexType - 'vector' (with embeddings), 'normal' (FTS only), or 'full' (FTS + Vector) + * @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced' */ -function initCodexLensIndex(indexType) { +async function initCodexLensIndex(indexType, embeddingModel) { indexType = indexType || 'vector'; + embeddingModel = embeddingModel || 'code'; + + // For vector or full index, check if semantic dependencies are available + if (indexType === 'vector' || indexType === 'full') { + try { + var semanticResponse = await fetch('/api/codexlens/semantic/status'); + var semanticStatus = await semanticResponse.json(); + + if (!semanticStatus.available) { + // Semantic deps not installed - show confirmation dialog + var installDeps = confirm( + (t('codexlens.semanticNotInstalled') || 'Semantic search dependencies are not installed.') + '\n\n' + + (t('codexlens.installDepsPrompt') || 'Would you like to install them now? (This may take a few minutes)\n\nClick "Cancel" to create FTS index only.') + ); + + if (installDeps) { + // Install semantic dependencies first + showRefreshToast(t('codexlens.installingDeps') || 'Installing semantic dependencies...', 'info'); + try { + var installResponse = await fetch('/api/codexlens/semantic/install', { method: 'POST' }); + var installResult = await installResponse.json(); + + if (!installResult.success) { + showRefreshToast((t('codexlens.depsInstallFailed') || 'Failed to install dependencies') + ': ' + installResult.error, 'error'); + // Fall back to FTS only + indexType = 'normal'; + } else { + showRefreshToast(t('codexlens.depsInstalled') || 'Dependencies installed successfully', 'success'); + } + } catch (err) { + showRefreshToast((t('common.error') || 'Error') + ': ' + err.message, 'error'); + indexType = 'normal'; + } + } else { + // User chose to skip - create FTS only + indexType = 'normal'; + } + } + } catch (err) { + console.warn('[CodexLens] Could not check semantic status:', err); + // Continue with requested type, backend will handle fallback + } + } // Remove existing progress bar if any closeCodexLensIndexModal(); @@ -566,7 +610,24 @@ function initCodexLensIndex(indexType) { var progressBar = document.createElement('div'); progressBar.id = 'codexlensIndexFloating'; progressBar.className = 'fixed bottom-0 left-0 right-0 z-50 bg-card border-t border-border shadow-lg transform transition-transform duration-300'; - var indexTypeLabel = indexType === 'vector' ? 
'Vector' : 'FTS'; + + // Determine display label + var indexTypeLabel; + if (indexType === 'full') { + indexTypeLabel = 'FTS + Vector'; + } else if (indexType === 'vector') { + indexTypeLabel = 'Vector'; + } else { + indexTypeLabel = 'FTS'; + } + + // Add model info for vector indexes + var modelLabel = ''; + if (indexType !== 'normal') { + var modelNames = { code: 'Code', fast: 'Fast', multilingual: 'Multi', balanced: 'Balanced' }; + modelLabel = ' [' + (modelNames[embeddingModel] || embeddingModel) + ']'; + } + progressBar.innerHTML = '
' + '
' + @@ -574,7 +635,7 @@ function initCodexLensIndex(indexType) { '
' + '
' + '
' + - '' + t('codexlens.indexing') + ' (' + indexTypeLabel + ')' + + '' + t('codexlens.indexing') + ' (' + indexTypeLabel + modelLabel + ')' + '0%' + '
' + '
' + t('codexlens.preparingIndex') + '
' + @@ -594,16 +655,21 @@ function initCodexLensIndex(indexType) { document.body.appendChild(progressBar); if (window.lucide) lucide.createIcons(); - // Start indexing with specified type - startCodexLensIndexing(indexType); + // For 'full' type, use 'vector' in the API (it creates FTS + embeddings) + var apiIndexType = (indexType === 'full') ? 'vector' : indexType; + + // Start indexing with specified type and model + startCodexLensIndexing(apiIndexType, embeddingModel); } /** * Start the indexing process * @param {string} indexType - 'vector' or 'normal' + * @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced' */ -async function startCodexLensIndexing(indexType) { +async function startCodexLensIndexing(indexType, embeddingModel) { indexType = indexType || 'vector'; + embeddingModel = embeddingModel || 'code'; var statusText = document.getElementById('codexlensIndexStatus'); var progressBar = document.getElementById('codexlensIndexProgressBar'); var percentText = document.getElementById('codexlensIndexPercent'); @@ -635,11 +701,11 @@ async function startCodexLensIndexing(indexType) { } try { - console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType); + console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType, 'model:', embeddingModel); var response = await fetch('/api/codexlens/init', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ path: projectPath, indexType: indexType }) + body: JSON.stringify({ path: projectPath, indexType: indexType, embeddingModel: embeddingModel }) }); var result = await response.json(); diff --git a/ccw/src/tools/codex-lens.ts b/ccw/src/tools/codex-lens.ts index 6d171fd4..d6229b6e 100644 --- a/ccw/src/tools/codex-lens.ts +++ b/ccw/src/tools/codex-lens.ts @@ -429,7 +429,7 @@ function parseProgressLine(line: string): ProgressInfo | null { } /** - * Execute CodexLens CLI command + * Execute CodexLens CLI command with real-time progress updates * @param args - CLI arguments * @param options - Execution options * @returns Execution result @@ -463,34 +463,110 @@ async function executeCodexLens(args: string[], options: ExecuteOptions = {}): P fullCmd = `${quotedPython} -m codexlens ${cmdArgs.join(' ')}`; } - // Use exec with shell option for cross-platform compatibility - exec(fullCmd, { - cwd: process.platform === 'win32' ? undefined : cwd, // Don't use cwd on Windows, use cd command instead + // Use spawn with shell for real-time progress updates + // spawn streams output in real-time, unlike exec which buffers until completion + const child = spawn(fullCmd, [], { + cwd: process.platform === 'win32' ? undefined : cwd, + shell: process.platform === 'win32' ? process.env.ComSpec || true : true, timeout, - maxBuffer: 50 * 1024 * 1024, // 50MB buffer for large outputs - shell: process.platform === 'win32' ? 
process.env.ComSpec : undefined, - }, (error, stdout, stderr) => { - if (error) { - if (error.killed) { - resolve({ success: false, error: 'Command timed out' }); - } else { - resolve({ success: false, error: stderr || error.message }); - } - return; - } + }); - // Report final progress if callback provided - if (onProgress && stdout) { - const lines = stdout.split('\n'); - for (const line of lines) { - const progress = parseProgressLine(line.trim()); + let stdout = ''; + let stderr = ''; + let stdoutLineBuffer = ''; + let stderrLineBuffer = ''; + let timeoutHandle: NodeJS.Timeout | null = null; + let resolved = false; + + // Helper to safely resolve only once + const safeResolve = (result: ExecuteResult) => { + if (resolved) return; + resolved = true; + if (timeoutHandle) { + clearTimeout(timeoutHandle); + timeoutHandle = null; + } + resolve(result); + }; + + // Set up timeout handler + if (timeout > 0) { + timeoutHandle = setTimeout(() => { + if (!resolved) { + child.kill('SIGTERM'); + // Give it a moment to die gracefully, then force kill + setTimeout(() => { + if (!resolved) { + child.kill('SIGKILL'); + } + }, 5000); + safeResolve({ success: false, error: 'Command timed out' }); + } + }, timeout); + } + + // Process stdout line by line for real-time progress + child.stdout?.on('data', (data: Buffer) => { + const chunk = data.toString(); + stdoutLineBuffer += chunk; + stdout += chunk; + + // Process complete lines + const lines = stdoutLineBuffer.split('\n'); + stdoutLineBuffer = lines.pop() || ''; // Keep incomplete line in buffer + + for (const line of lines) { + const trimmedLine = line.trim(); + if (trimmedLine && onProgress) { + const progress = parseProgressLine(trimmedLine); if (progress) { onProgress(progress); } } } + }); - resolve({ success: true, output: stdout.trim() }); + // Collect stderr + child.stderr?.on('data', (data: Buffer) => { + const chunk = data.toString(); + stderrLineBuffer += chunk; + stderr += chunk; + + // Also check stderr for progress (some tools output progress to stderr) + const lines = stderrLineBuffer.split('\n'); + stderrLineBuffer = lines.pop() || ''; + + for (const line of lines) { + const trimmedLine = line.trim(); + if (trimmedLine && onProgress) { + const progress = parseProgressLine(trimmedLine); + if (progress) { + onProgress(progress); + } + } + } + }); + + // Handle process errors (spawn failure) + child.on('error', (err) => { + safeResolve({ success: false, error: `Failed to start process: ${err.message}` }); + }); + + // Handle process completion + child.on('close', (code) => { + // Process any remaining buffered content + if (stdoutLineBuffer.trim() && onProgress) { + const progress = parseProgressLine(stdoutLineBuffer.trim()); + if (progress) { + onProgress(progress); + } + } + + if (code === 0) { + safeResolve({ success: true, output: stdout.trim() }); + } else { + safeResolve({ success: false, error: stderr.trim() || `Process exited with code ${code}` }); + } }); }); } diff --git a/ccw/src/tools/smart-search.ts b/ccw/src/tools/smart-search.ts index 173ce0fa..dfae27cb 100644 --- a/ccw/src/tools/smart-search.ts +++ b/ccw/src/tools/smart-search.ts @@ -25,18 +25,26 @@ import type { ProgressInfo } from './codex-lens.js'; // Define Zod schema for validation const ParamsSchema = z.object({ - action: z.enum(['init', 'search', 'search_files', 'status']).default('search'), - query: z.string().optional(), + // Action: search (content), find_files (path/name pattern), init, status + // Note: search_files is deprecated, use search with 
output_mode='files_only' + action: z.enum(['init', 'search', 'search_files', 'find_files', 'status']).default('search'), + query: z.string().optional().describe('Content search query (for action="search")'), + pattern: z.string().optional().describe('Glob pattern for path matching (for action="find_files")'), mode: z.enum(['auto', 'hybrid', 'exact', 'ripgrep', 'priority']).default('auto'), output_mode: z.enum(['full', 'files_only', 'count']).default('full'), path: z.string().optional(), paths: z.array(z.string()).default([]), contextLines: z.number().default(0), - maxResults: z.number().default(10), + maxResults: z.number().default(20), // Increased default includeHidden: z.boolean().default(false), languages: z.array(z.string()).optional(), - limit: z.number().default(10), + limit: z.number().default(20), // Increased default + offset: z.number().default(0), // NEW: Pagination offset (start_index) enrich: z.boolean().default(false), + // Search modifiers for ripgrep mode + regex: z.boolean().default(true), // Use regex pattern matching (default: enabled) + caseSensitive: z.boolean().default(true), // Case sensitivity (default: case-sensitive) + // Fuzzy matching is implicit in hybrid mode (RRF fusion) }); type Params = z.infer; @@ -47,6 +55,46 @@ const SEARCH_MODES = ['auto', 'hybrid', 'exact', 'ripgrep', 'priority'] as const // Classification confidence threshold const CONFIDENCE_THRESHOLD = 0.7; +// File filtering configuration (ported from code-index) +const FILTER_CONFIG = { + exclude_directories: new Set([ + '.git', '.svn', '.hg', '.bzr', + 'node_modules', '__pycache__', '.venv', 'venv', 'vendor', 'bower_components', + 'dist', 'build', 'target', 'out', 'bin', 'obj', + '.idea', '.vscode', '.vs', '.sublime-workspace', + '.pytest_cache', '.coverage', '.tox', '.nyc_output', 'coverage', 'htmlcov', + '.next', '.nuxt', '.cache', '.parcel-cache', + '.DS_Store', 'Thumbs.db', + ]), + exclude_files: new Set([ + '*.tmp', '*.temp', '*.swp', '*.swo', '*.bak', '*~', '*.orig', '*.log', + 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'Pipfile.lock', + ]), + // Windows device files - must use **/ pattern to match in any directory + // These cause "os error 1" on Windows when accessed + windows_device_files: new Set([ + 'nul', 'con', 'aux', 'prn', + 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', + 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9', + ]), +}; + +function buildExcludeArgs(): string[] { + const args: string[] = []; + for (const dir of FILTER_CONFIG.exclude_directories) { + args.push('--glob', `!**/${dir}/**`); + } + for (const pattern of FILTER_CONFIG.exclude_files) { + args.push('--glob', `!${pattern}`); + } + // Windows device files need case-insensitive matching in any directory + for (const device of FILTER_CONFIG.windows_device_files) { + args.push('--glob', `!**/${device}`); + args.push('--glob', `!**/${device.toUpperCase()}`); + } + return args; +} + interface Classification { mode: string; confidence: number; @@ -83,11 +131,27 @@ interface GraphMatch { relationships: unknown[]; } +// File match for find_files action (path-based search) +interface FileMatch { + path: string; + type: 'file' | 'directory'; + name: string; // Filename only + extension?: string; // File extension (without dot) +} + +interface PaginationInfo { + offset: number; // Starting index of returned results + limit: number; // Number of results requested + total: number; // Total number of results found + has_more: boolean; // True if more results are available 
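+  // Example (illustrative, not from the original code): with total = 45, offset = 20
+  // and limit = 20, a page covers results 20-39 and has_more is true (20 + 20 < 45).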
+} + interface SearchMetadata { mode?: string; backend?: string; count?: number; query?: string; + pattern?: string; // For find_files action classified_as?: string; confidence?: number; reasoning?: string; @@ -96,6 +160,9 @@ interface SearchMetadata { note?: string; index_status?: 'indexed' | 'not_indexed' | 'partial'; fallback_history?: string[]; + suggested_weights?: Record; + // Pagination metadata + pagination?: PaginationInfo; // Init action specific action?: string; path?: string; @@ -111,7 +178,7 @@ interface SearchMetadata { interface SearchResult { success: boolean; - results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown; + results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown; output?: string; metadata?: SearchMetadata; error?: string; @@ -236,6 +303,14 @@ function detectRelationship(query: string): boolean { return /(import|export|uses?|depends?|calls?|extends?)\s/i.test(query); } +function looksLikeCodeQuery(query: string): boolean { + if (/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true; + if (/[:.<>\-=(){}[\]]/.test(query) && query.split(/\s+/).length <= 2) return true; + if (/\.\*|\\\(|\\\[|\\s/.test(query)) return true; + if (/^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true; + return false; +} + /** * Classify query intent and recommend search mode * Simple mapping: hybrid (NL + index + embeddings) | exact (index or insufficient embeddings) | ripgrep (no index) @@ -245,34 +320,34 @@ function detectRelationship(query: string): boolean { * @returns Classification result */ function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification { - // Detect query patterns const isNaturalLanguage = detectNaturalLanguage(query); + const isCodeQuery = looksLikeCodeQuery(query); + const isRegexPattern = detectRegex(query); - // Simple decision tree let mode: string; let confidence: number; if (!hasIndex) { - // No index: use ripgrep mode = 'ripgrep'; confidence = 1.0; + } else if (isCodeQuery || isRegexPattern) { + mode = 'exact'; + confidence = 0.95; } else if (isNaturalLanguage && hasSufficientEmbeddings) { - // Natural language + sufficient embeddings: use hybrid mode = 'hybrid'; confidence = 0.9; } else { - // Simple query OR insufficient embeddings: use exact mode = 'exact'; confidence = 0.8; } - // Build reasoning string const detectedPatterns: string[] = []; if (detectLiteral(query)) detectedPatterns.push('literal'); if (detectRegex(query)) detectedPatterns.push('regex'); if (detectNaturalLanguage(query)) detectedPatterns.push('natural language'); if (detectFilePath(query)) detectedPatterns.push('file path'); if (detectRelationship(query)) detectedPatterns.push('relationship'); + if (isCodeQuery) detectedPatterns.push('code identifier'); const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 
'sufficient' : 'insufficient'})`; @@ -306,34 +381,46 @@ function buildRipgrepCommand(params: { contextLines: number; maxResults: number; includeHidden: boolean; + regex?: boolean; + caseSensitive?: boolean; }): { command: string; args: string[] } { - const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false } = params; + const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true } = params; const args = [ - '-n', // Show line numbers - '--color=never', // Disable color output - '--json', // Output in JSON format + '-n', + '--color=never', + '--json', ]; - // Add context lines if specified + // Add file filtering (unless includeHidden is true) + if (!includeHidden) { + args.push(...buildExcludeArgs()); + } + + // Case sensitivity + if (!caseSensitive) { + args.push('--ignore-case'); + } + if (contextLines > 0) { args.push('-C', contextLines.toString()); } - // Add max results limit if (maxResults > 0) { args.push('--max-count', maxResults.toString()); } - // Include hidden files if specified if (includeHidden) { args.push('--hidden'); } - // Use literal/fixed string matching for exact mode - args.push('-F', query); + // Regex mode (-e) vs fixed string mode (-F) + if (regex) { + args.push('-e', query); + } else { + args.push('-F', query); + } - // Add search paths args.push(...paths); return { command: 'rg', args }; @@ -492,7 +579,7 @@ async function executeAutoMode(params: Params): Promise { * No index required, fallback to CodexLens if ripgrep unavailable */ async function executeRipgrepMode(params: Params): Promise { - const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.' } = params; + const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true } = params; if (!query) { return { @@ -566,6 +653,8 @@ async function executeRipgrepMode(params: Params): Promise { contextLines, maxResults, includeHidden, + regex, + caseSensitive, }); return new Promise((resolve) => { @@ -587,31 +676,34 @@ async function executeRipgrepMode(params: Params): Promise { child.on('close', (code) => { const results: ExactMatch[] = []; + const lines = stdout.split('\n').filter((line) => line.trim()); - if (code === 0 || (code === 1 && stdout.trim())) { - const lines = stdout.split('\n').filter((line) => line.trim()); + for (const line of lines) { + try { + const item = JSON.parse(line); - for (const line of lines) { - try { - const item = JSON.parse(line); - - if (item.type === 'match') { - const match: ExactMatch = { - file: item.data.path.text, - line: item.data.line_number, - column: - item.data.submatches && item.data.submatches[0] - ? item.data.submatches[0].start + 1 - : 1, - content: item.data.lines.text.trim(), - }; - results.push(match); - } - } catch { - continue; + if (item.type === 'match') { + const match: ExactMatch = { + file: item.data.path.text, + line: item.data.line_number, + column: + item.data.submatches && item.data.submatches[0] + ? 
item.data.submatches[0].start + 1 + : 1, + content: item.data.lines.text.trim(), + }; + results.push(match); } + } catch { + continue; } + } + // Handle Windows device file errors gracefully (os error 1) + // If we have results despite the error, return them as partial success + const isWindowsDeviceError = stderr.includes('os error 1') || stderr.includes('函数不正确'); + + if (code === 0 || code === 1 || (isWindowsDeviceError && results.length > 0)) { resolve({ success: true, results, @@ -620,6 +712,20 @@ async function executeRipgrepMode(params: Params): Promise { backend: 'ripgrep', count: results.length, query, + ...(isWindowsDeviceError && { warning: 'Some Windows device files were skipped' }), + }, + }); + } else if (isWindowsDeviceError && results.length === 0) { + // Windows device error but no results - might be the only issue + resolve({ + success: true, + results: [], + metadata: { + mode: 'ripgrep', + backend: 'ripgrep', + count: 0, + query, + warning: 'No matches found (some Windows device files were skipped)', }, }); } else { @@ -764,15 +870,42 @@ async function executeHybridMode(params: Params): Promise { // Parse results let results: SemanticMatch[] = []; + let baselineInfo: { score: number; count: number } | null = null; + let initialCount = 0; + try { const parsed = JSON.parse(stripAnsi(result.output || '{}')); const data = parsed.result?.results || parsed.results || parsed; - results = (Array.isArray(data) ? data : []).map((item: any) => ({ - file: item.path || item.file, - score: item.score || 0, - content: item.excerpt || item.content || '', - symbol: item.symbol || null, - })); + results = (Array.isArray(data) ? data : []).map((item: any) => { + const rawScore = item.score || 0; + // Hybrid mode returns distance scores (lower is better). + // Convert to similarity scores (higher is better) for consistency. + // Formula: similarity = 1 / (1 + distance) + const similarityScore = rawScore > 0 ? 1 / (1 + rawScore) : 1; + return { + file: item.path || item.file, + score: similarityScore, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + }; + }); + + initialCount = results.length; + + // Post-processing pipeline to improve semantic search quality + // 0. Filter dominant baseline scores (hot spot detection) + const baselineResult = filterDominantBaselineScores(results); + results = baselineResult.filteredResults; + baselineInfo = baselineResult.baselineInfo; + + // 1. Filter noisy files (coverage, node_modules, etc.) + results = filterNoisyFiles(results); + // 2. Boost results containing query keywords + results = applyKeywordBoosting(results, query); + // 3. Enforce score diversity (penalize identical scores) + results = enforceScoreDiversity(results); + // 4. 
Re-sort by adjusted scores + results.sort((a, b) => b.score - a.score); } catch { return { success: true, @@ -788,6 +921,12 @@ async function executeHybridMode(params: Params): Promise { }; } + // Build metadata with baseline info if detected + let note = 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results'; + if (baselineInfo) { + note += ` | Filtered ${initialCount - results.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`; + } + return { success: true, results, @@ -796,12 +935,195 @@ async function executeHybridMode(params: Params): Promise { backend: 'codexlens', count: results.length, query, - note: 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results', + note, warning: indexStatus.warning, + suggested_weights: getRRFWeights(query), }, }; } +const RRF_WEIGHTS = { + code: { exact: 0.7, fuzzy: 0.2, vector: 0.1 }, + natural: { exact: 0.4, fuzzy: 0.2, vector: 0.4 }, + default: { exact: 0.5, fuzzy: 0.2, vector: 0.3 }, +}; + +function getRRFWeights(query: string): Record { + const isCode = looksLikeCodeQuery(query); + const isNatural = detectNaturalLanguage(query); + if (isCode) return RRF_WEIGHTS.code; + if (isNatural) return RRF_WEIGHTS.natural; + return RRF_WEIGHTS.default; +} + +/** + * Post-processing: Filter noisy files from semantic search results + * Uses FILTER_CONFIG patterns to remove irrelevant files. + * Optimized: pre-compiled regexes, accurate path segment matching. + */ +// Pre-compile file exclusion regexes once (avoid recompilation in loop) +const FILE_EXCLUDE_REGEXES = [...FILTER_CONFIG.exclude_files].map(pattern => + new RegExp('^' + pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\\\*/g, '.*') + '$') +); + +function filterNoisyFiles(results: SemanticMatch[]): SemanticMatch[] { + return results.filter(r => { + const filePath = r.file || ''; + if (!filePath) return true; + + const segments = filePath.split(/[/\\]/); + + // Accurate directory check: segment must exactly match excluded directory + if (segments.some(segment => FILTER_CONFIG.exclude_directories.has(segment))) { + return false; + } + + // Accurate file check: pattern matches filename only (not full path) + const filename = segments.pop() || ''; + if (FILE_EXCLUDE_REGEXES.some(regex => regex.test(filename))) { + return false; + } + + return true; + }); +} + +/** + * Post-processing: Boost results containing query keywords + * Extracts keywords from query and boosts matching results. + * Optimized: uses whole-word matching with regex for accuracy. 
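 * Example (illustrative): for the query "refresh auth token" the extracted
 * keywords are ["refresh", "auth", "token"]; a result matching 2 of the 3
 * keywords gets a boost of 1 + (2/3 * 0.3) = 1.2 applied to its score.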
+ */ +// Helper to escape regex special characters +function escapeRegExp(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function applyKeywordBoosting(results: SemanticMatch[], query: string): SemanticMatch[] { + // Extract meaningful keywords (ignore common words) + const stopWords = new Set(['the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or', 'because', 'until', 'while', 'although', 'though', 'after', 'before', 'when', 'whenever', 'where', 'wherever', 'whether', 'which', 'who', 'whom', 'whose', 'what', 'whatever', 'whichever', 'whoever', 'whomever', 'this', 'that', 'these', 'those', 'it', 'its']); + + const keywords = query + .toLowerCase() + .split(/[\s,.;:()"{}[\]-]+/) // More robust splitting on punctuation + .filter(word => word.length > 2 && !stopWords.has(word)); + + if (keywords.length === 0) return results; + + // Create case-insensitive regexes for whole-word matching + const keywordRegexes = keywords.map(kw => new RegExp(`\\b${escapeRegExp(kw)}\\b`, 'i')); + + return results.map(r => { + const content = r.content || ''; + const file = r.file || ''; + + // Count keyword matches using whole-word regex + let matchCount = 0; + for (const regex of keywordRegexes) { + if (regex.test(content) || regex.test(file)) { + matchCount++; + } + } + + // Apply boost only if there are matches + if (matchCount > 0) { + const matchRatio = matchCount / keywords.length; + const boost = 1 + (matchRatio * 0.3); // Up to 30% boost for full match + return { + ...r, + score: r.score * boost, + }; + } + + return r; + }); +} + +/** + * Post-processing: Enforce score diversity + * Penalizes results with identical scores (indicates undifferentiated matching) + */ +function enforceScoreDiversity(results: SemanticMatch[]): SemanticMatch[] { + if (results.length < 2) return results; + + // Count occurrences of each score (rounded to 3 decimal places for comparison) + const scoreCounts = new Map(); + for (const r of results) { + const roundedScore = Math.round(r.score * 1000) / 1000; + scoreCounts.set(roundedScore, (scoreCounts.get(roundedScore) || 0) + 1); + } + + // Apply penalty to scores that appear more than twice + return results.map(r => { + const roundedScore = Math.round(r.score * 1000) / 1000; + const count = scoreCounts.get(roundedScore) || 1; + + if (count > 2) { + // Progressive penalty: more duplicates = bigger penalty + const penalty = Math.max(0.7, 1 - (count * 0.05)); + return { ...r, score: r.score * penalty }; + } + return r; + }); +} + +/** + * Post-processing: Filter results with dominant baseline score (hot spot detection) + * When backend returns default "hot spot" files with identical high scores, + * this function detects and removes them. 
+ * + * Detection criteria: + * - A single score appears in >50% of results + * - That score is suspiciously high (>0.9) + * - This indicates fallback mechanism returned placeholder results + */ +function filterDominantBaselineScores( + results: SemanticMatch[] +): { filteredResults: SemanticMatch[]; baselineInfo: { score: number; count: number } | null } { + if (results.length < 4) { + return { filteredResults: results, baselineInfo: null }; + } + + // Count occurrences of each score (rounded to 4 decimal places) + const scoreCounts = new Map(); + results.forEach(r => { + const rounded = Math.round(r.score * 10000) / 10000; + scoreCounts.set(rounded, (scoreCounts.get(rounded) || 0) + 1); + }); + + // Find the most dominant score + let dominantScore: number | null = null; + let dominantCount = 0; + scoreCounts.forEach((count, score) => { + if (count > dominantCount) { + dominantCount = count; + dominantScore = score; + } + }); + + // If a single score is present in >50% of results and is high (>0.9), + // treat it as a suspicious baseline score and filter it out + const BASELINE_THRESHOLD = 0.5; // >50% of results have same score + const HIGH_SCORE_THRESHOLD = 0.9; // Score above 0.9 is suspiciously high + + if ( + dominantScore !== null && + dominantCount > results.length * BASELINE_THRESHOLD && + dominantScore > HIGH_SCORE_THRESHOLD + ) { + const filteredResults = results.filter(r => { + const rounded = Math.round(r.score * 10000) / 10000; + return rounded !== dominantScore; + }); + + return { + filteredResults, + baselineInfo: { score: dominantScore, count: dominantCount }, + }; + } + + return { filteredResults: results, baselineInfo: null }; +} + /** * TypeScript implementation of Reciprocal Rank Fusion * Reference: codex-lens/src/codexlens/search/ranking.py @@ -963,34 +1285,52 @@ async function executePriorityFallbackMode(params: Params): Promise { + const { pattern, path = '.', limit = 20, offset = 0, includeHidden = false, caseSensitive = true } = params; + + if (!pattern) { + return { + success: false, + error: 'Pattern is required for find_files action. Use glob patterns like "*.ts", "src/**/*.js", or "test_*.py"', + }; + } + + // Use ripgrep with --files flag for fast file listing with glob pattern + const hasRipgrep = checkToolAvailability('rg'); + + if (!hasRipgrep) { + // Fallback to CodexLens file listing if available + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: 'Neither ripgrep nor CodexLens available for file discovery.', + }; + } + + // Try CodexLens file list command + const args = ['list-files', '--json']; + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: `Failed to list files: ${result.error}`, + }; + } + + // Parse and filter results by pattern + let files: string[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '[]')); + files = Array.isArray(parsed) ? 
parsed : (parsed.files || []); + } catch { + return { + success: false, + error: 'Failed to parse file list from CodexLens', + }; + } + + // Apply glob pattern matching using minimatch-style regex + const globRegex = globToRegex(pattern, caseSensitive); + const matchedFiles = files.filter(f => globRegex.test(f)); + + // Apply pagination + const total = matchedFiles.length; + const paginatedFiles = matchedFiles.slice(offset, offset + limit); + + const results: FileMatch[] = paginatedFiles.map(filePath => { + const parts = filePath.split(/[/\\]/); + const name = parts[parts.length - 1] || ''; + const ext = name.includes('.') ? name.split('.').pop() : undefined; + return { + path: filePath, + type: 'file' as const, + name, + extension: ext, + }; + }); + + return { + success: true, + results, + metadata: { + pattern, + backend: 'codexlens', + count: results.length, + pagination: { + offset, + limit, + total, + has_more: offset + limit < total, + }, + }, + }; + } + + // Use ripgrep --files with glob pattern for fast file discovery + return new Promise((resolve) => { + const args = ['--files']; + + // Add exclude patterns + if (!includeHidden) { + args.push(...buildExcludeArgs()); + } else { + args.push('--hidden'); + } + + // Add glob pattern + args.push('--glob', pattern); + + // Case sensitivity for glob matching + if (!caseSensitive) { + args.push('--iglob', pattern); + // Remove the case-sensitive glob and use iglob instead + const globIndex = args.indexOf('--glob'); + if (globIndex !== -1) { + args.splice(globIndex, 2); + } + } + + const child = spawn('rg', args, { + cwd: path || process.cwd(), + stdio: ['ignore', 'pipe', 'pipe'], + }); + + let stdout = ''; + let stderr = ''; + + child.stdout.on('data', (data) => { + stdout += data.toString(); + }); + + child.stderr.on('data', (data) => { + stderr += data.toString(); + }); + + child.on('close', (code) => { + // ripgrep returns 1 when no matches found, which is not an error + if (code !== 0 && code !== 1 && !stderr.includes('os error 1')) { + resolve({ + success: false, + error: `ripgrep file search failed: ${stderr}`, + }); + return; + } + + const allFiles = stdout.split('\n').filter(line => line.trim()); + const total = allFiles.length; + + // Apply pagination + const paginatedFiles = allFiles.slice(offset, offset + limit); + + const results: FileMatch[] = paginatedFiles.map(filePath => { + const normalizedPath = filePath.replace(/\\/g, '/'); + const parts = normalizedPath.split('/'); + const name = parts[parts.length - 1] || ''; + const ext = name.includes('.') ? 
name.split('.').pop() : undefined; + return { + path: normalizedPath, + type: 'file' as const, + name, + extension: ext, + }; + }); + + resolve({ + success: true, + results, + metadata: { + pattern, + backend: 'ripgrep', + count: results.length, + pagination: { + offset, + limit, + total, + has_more: offset + limit < total, + }, + }, + }); + }); + + child.on('error', (error) => { + resolve({ + success: false, + error: `Failed to spawn ripgrep: ${error.message}`, + }); + }); + }); +} + +/** + * Convert glob pattern to regex for file matching + * Supports: *, **, ?, [abc], [!abc] + */ +function globToRegex(pattern: string, caseSensitive: boolean = true): RegExp { + let i = 0; + const out: string[] = []; + const special = '.^$+{}|()'; + + while (i < pattern.length) { + const c = pattern[i]; + + if (c === '*') { + if (i + 1 < pattern.length && pattern[i + 1] === '*') { + // ** matches any path including / + out.push('.*'); + i += 2; + // Skip following / if present + if (pattern[i] === '/') { + i++; + } + continue; + } else { + // * matches any character except / + out.push('[^/]*'); + } + } else if (c === '?') { + out.push('[^/]'); + } else if (c === '[') { + // Character class + let j = i + 1; + let negated = false; + if (pattern[j] === '!' || pattern[j] === '^') { + negated = true; + j++; + } + let classContent = ''; + while (j < pattern.length && pattern[j] !== ']') { + classContent += pattern[j]; + j++; + } + if (negated) { + out.push(`[^${classContent}]`); + } else { + out.push(`[${classContent}]`); + } + i = j; + } else if (special.includes(c)) { + out.push('\\' + c); + } else { + out.push(c); + } + i++; + } + + const flags = caseSensitive ? '' : 'i'; + return new RegExp('^' + out.join('') + '$', flags); +} + +/** + * Apply pagination to search results and add pagination metadata + */ +function applyPagination( + results: T[], + offset: number, + limit: number +): { paginatedResults: T[]; pagination: PaginationInfo } { + const total = results.length; + const paginatedResults = results.slice(offset, offset + limit); + + return { + paginatedResults, + pagination: { + offset, + limit, + total, + has_more: offset + limit < total, + }, + }; +} + /** * Transform results based on output_mode */ @@ -1095,14 +1713,17 @@ export async function handler(params: Record): Promise): Promise): Promise; + +// Search mode constants +const SEARCH_MODES = ['auto', 'hybrid', 'exact', 'ripgrep', 'priority'] as const; + +// Classification confidence threshold +const CONFIDENCE_THRESHOLD = 0.7; + +interface Classification { + mode: string; + confidence: number; + reasoning: string; +} + +interface ExactMatch { + file: string; + line: number; + column: number; + content: string; +} + +interface RelationshipInfo { + type: string; // 'calls', 'imports', 'called_by', 'imported_by' + direction: 'outgoing' | 'incoming'; + target?: string; // Target symbol name (for outgoing) + source?: string; // Source symbol name (for incoming) + file: string; // File path + line?: number; // Line number +} + +interface SemanticMatch { + file: string; + score: number; + content: string; + symbol: string | null; + relationships?: RelationshipInfo[]; +} + +interface GraphMatch { + file: string; + symbols: unknown; + relationships: unknown[]; +} + +interface SearchMetadata { + mode?: string; + backend?: string; + count?: number; + query?: string; + classified_as?: string; + confidence?: number; + reasoning?: string; + embeddings_coverage_percent?: number; + warning?: string; + note?: string; + index_status?: 'indexed' | 'not_indexed' 
| 'partial'; + fallback_history?: string[]; + // Init action specific + action?: string; + path?: string; + progress?: { + stage: string; + message: string; + percent: number; + filesProcessed?: number; + totalFiles?: number; + }; + progressHistory?: ProgressInfo[]; +} + +interface SearchResult { + success: boolean; + results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown; + output?: string; + metadata?: SearchMetadata; + error?: string; + status?: unknown; + message?: string; +} + +interface IndexStatus { + indexed: boolean; + has_embeddings: boolean; + file_count?: number; + embeddings_coverage_percent?: number; + warning?: string; +} + +/** + * Strip ANSI color codes from string (for JSON parsing) + */ +function stripAnsi(str: string): string { + return str.replace(/\x1b\[[0-9;]*m/g, ''); +} + +/** + * Check if CodexLens index exists for current directory + * @param path - Directory path to check + * @returns Index status + */ +async function checkIndexStatus(path: string = '.'): Promise { + try { + const result = await executeCodexLens(['status', '--json'], { cwd: path }); + + if (!result.success) { + return { + indexed: false, + has_embeddings: false, + warning: 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.', + }; + } + + // Parse status output + try { + // Strip ANSI color codes from JSON output + const cleanOutput = stripAnsi(result.output || '{}'); + const parsed = JSON.parse(cleanOutput); + // Handle both direct and nested response formats (status returns {success, result: {...}}) + const status = parsed.result || parsed; + const indexed = status.projects_count > 0 || status.total_files > 0; + + // Get embeddings coverage from comprehensive status + const embeddingsData = status.embeddings || {}; + const embeddingsCoverage = embeddingsData.coverage_percent || 0; + const has_embeddings = embeddingsCoverage >= 50; // Threshold: 50% + + let warning: string | undefined; + if (!indexed) { + warning = 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.'; + } else if (embeddingsCoverage === 0) { + warning = 'Index exists but no embeddings generated. Run: codexlens embeddings-generate --recursive'; + } else if (embeddingsCoverage < 50) { + warning = `Embeddings coverage is ${embeddingsCoverage.toFixed(1)}% (below 50%). Hybrid search will use exact mode. 
Run: codexlens embeddings-generate --recursive`; + } + + return { + indexed, + has_embeddings, + file_count: status.total_files, + embeddings_coverage_percent: embeddingsCoverage, + warning, + }; + } catch { + return { + indexed: false, + has_embeddings: false, + warning: 'Failed to parse index status', + }; + } + } catch { + return { + indexed: false, + has_embeddings: false, + warning: 'CodexLens not available', + }; + } +} + +/** + * Detection heuristics for intent classification + */ + +/** + * Detect literal string query (simple alphanumeric or quoted strings) + */ +function detectLiteral(query: string): boolean { + return /^[a-zA-Z0-9_-]+$/.test(query) || /^["'].*["']$/.test(query); +} + +/** + * Detect regex pattern (contains regex metacharacters) + */ +function detectRegex(query: string): boolean { + return /[.*+?^${}()|[\]\\]/.test(query); +} + +/** + * Detect natural language query (sentence structure, questions, multi-word phrases) + */ +function detectNaturalLanguage(query: string): boolean { + return query.split(/\s+/).length >= 3 || /\?$/.test(query); +} + +/** + * Detect file path query (path separators, file extensions) + */ +function detectFilePath(query: string): boolean { + return /[/\\]/.test(query) || /\.[a-z]{2,4}$/i.test(query); +} + +/** + * Detect relationship query (import, export, dependency keywords) + */ +function detectRelationship(query: string): boolean { + return /(import|export|uses?|depends?|calls?|extends?)\s/i.test(query); +} + +/** + * Classify query intent and recommend search mode + * Simple mapping: hybrid (NL + index + embeddings) | exact (index or insufficient embeddings) | ripgrep (no index) + * @param query - Search query string + * @param hasIndex - Whether CodexLens index exists + * @param hasSufficientEmbeddings - Whether embeddings coverage >= 50% + * @returns Classification result + */ +function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification { + // Detect query patterns + const isNaturalLanguage = detectNaturalLanguage(query); + + // Simple decision tree + let mode: string; + let confidence: number; + + if (!hasIndex) { + // No index: use ripgrep + mode = 'ripgrep'; + confidence = 1.0; + } else if (isNaturalLanguage && hasSufficientEmbeddings) { + // Natural language + sufficient embeddings: use hybrid + mode = 'hybrid'; + confidence = 0.9; + } else { + // Simple query OR insufficient embeddings: use exact + mode = 'exact'; + confidence = 0.8; + } + + // Build reasoning string + const detectedPatterns: string[] = []; + if (detectLiteral(query)) detectedPatterns.push('literal'); + if (detectRegex(query)) detectedPatterns.push('regex'); + if (detectNaturalLanguage(query)) detectedPatterns.push('natural language'); + if (detectFilePath(query)) detectedPatterns.push('file path'); + if (detectRelationship(query)) detectedPatterns.push('relationship'); + + const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 'sufficient' : 'insufficient'})`; + + return { mode, confidence, reasoning }; +} + +/** + * Check if a tool is available in PATH + * @param toolName - Tool executable name + * @returns True if available + */ +function checkToolAvailability(toolName: string): boolean { + try { + const isWindows = process.platform === 'win32'; + const command = isWindows ? 
'where' : 'which'; + execSync(`${command} ${toolName}`, { stdio: 'ignore' }); + return true; + } catch { + return false; + } +} + +/** + * Build ripgrep command arguments + * @param params - Search parameters + * @returns Command and arguments + */ +function buildRipgrepCommand(params: { + query: string; + paths: string[]; + contextLines: number; + maxResults: number; + includeHidden: boolean; +}): { command: string; args: string[] } { + const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false } = params; + + const args = [ + '-n', // Show line numbers + '--color=never', // Disable color output + '--json', // Output in JSON format + ]; + + // Add context lines if specified + if (contextLines > 0) { + args.push('-C', contextLines.toString()); + } + + // Add max results limit + if (maxResults > 0) { + args.push('--max-count', maxResults.toString()); + } + + // Include hidden files if specified + if (includeHidden) { + args.push('--hidden'); + } + + // Use literal/fixed string matching for exact mode + args.push('-F', query); + + // Add search paths + args.push(...paths); + + return { command: 'rg', args }; +} + +/** + * Action: init - Initialize CodexLens index (FTS only, no embeddings) + * For semantic/vector search, use ccw view dashboard or codexlens CLI directly + */ +async function executeInitAction(params: Params): Promise { + const { path = '.', languages } = params; + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}. CodexLens will be auto-installed on first use.`, + }; + } + + // Build args with --no-embeddings for FTS-only index (faster) + const args = ['init', path, '--no-embeddings']; + if (languages && languages.length > 0) { + args.push('--languages', languages.join(',')); + } + + // Track progress updates + const progressUpdates: ProgressInfo[] = []; + let lastProgress: ProgressInfo | null = null; + + const result = await executeCodexLens(args, { + cwd: path, + timeout: 1800000, // 30 minutes for large codebases + onProgress: (progress: ProgressInfo) => { + progressUpdates.push(progress); + lastProgress = progress; + }, + }); + + // Build metadata with progress info + const metadata: SearchMetadata = { + action: 'init', + path, + }; + + if (lastProgress !== null) { + const p = lastProgress as ProgressInfo; + metadata.progress = { + stage: p.stage, + message: p.message, + percent: p.percent, + filesProcessed: p.filesProcessed, + totalFiles: p.totalFiles, + }; + } + + if (progressUpdates.length > 0) { + metadata.progressHistory = progressUpdates.slice(-5); // Keep last 5 progress updates + } + + const successMessage = result.success + ? `FTS index created for ${path}. Note: For semantic/vector search, create vector index via "ccw view" dashboard or run "codexlens init ${path}" (without --no-embeddings).` + : undefined; + + return { + success: result.success, + error: result.error, + message: successMessage, + metadata, + }; +} + +/** + * Action: status - Check CodexLens index status + */ +async function executeStatusAction(params: Params): Promise { + const { path = '.' } = params; + + const indexStatus = await checkIndexStatus(path); + + return { + success: true, + status: indexStatus, + message: indexStatus.warning || `Index status: ${indexStatus.indexed ? 'indexed' : 'not indexed'}, embeddings: ${indexStatus.has_embeddings ? 
'available' : 'not available'}`, + }; +} + +/** + * Mode: auto - Intent classification and mode selection + * Routes to: hybrid (NL + index) | exact (index) | ripgrep (no index) + */ +async function executeAutoMode(params: Params): Promise { + const { query, path = '.' } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search action', + }; + } + + // Check index status + const indexStatus = await checkIndexStatus(path); + + // Classify intent with index and embeddings awareness + const classification = classifyIntent( + query, + indexStatus.indexed, + indexStatus.has_embeddings // This now considers 50% threshold + ); + + // Route to appropriate mode based on classification + let result: SearchResult; + + switch (classification.mode) { + case 'hybrid': + result = await executeHybridMode(params); + break; + + case 'exact': + result = await executeCodexLensExactMode(params); + break; + + case 'ripgrep': + result = await executeRipgrepMode(params); + break; + + default: + // Fallback to ripgrep + result = await executeRipgrepMode(params); + break; + } + + // Add classification metadata + if (result.metadata) { + result.metadata.classified_as = classification.mode; + result.metadata.confidence = classification.confidence; + result.metadata.reasoning = classification.reasoning; + result.metadata.embeddings_coverage_percent = indexStatus.embeddings_coverage_percent; + result.metadata.index_status = indexStatus.indexed + ? (indexStatus.has_embeddings ? 'indexed' : 'partial') + : 'not_indexed'; + + // Add warning if needed + if (indexStatus.warning) { + result.metadata.warning = indexStatus.warning; + } + } + + return result; +} + +/** + * Mode: ripgrep - Fast literal string matching using ripgrep + * No index required, fallback to CodexLens if ripgrep unavailable + */ +async function executeRipgrepMode(params: Params): Promise { + const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.' } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search', + }; + } + + // Check if ripgrep is available + const hasRipgrep = checkToolAvailability('rg'); + + // If ripgrep not available, fall back to CodexLens exact mode + if (!hasRipgrep) { + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: 'Neither ripgrep nor CodexLens available. Install ripgrep (rg) or CodexLens for search functionality.', + }; + } + + // Use CodexLens exact mode as fallback + const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json']; + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: result.error, + metadata: { + mode: 'ripgrep', + backend: 'codexlens-fallback', + count: 0, + query, + }, + }; + } + + // Parse results + let results: SemanticMatch[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '{}')); + const data = parsed.result?.results || parsed.results || parsed; + results = (Array.isArray(data) ? 
data : []).map((item: any) => ({ + file: item.path || item.file, + score: item.score || 0, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + })); + } catch { + // Keep empty results + } + + return { + success: true, + results, + metadata: { + mode: 'ripgrep', + backend: 'codexlens-fallback', + count: results.length, + query, + note: 'Using CodexLens exact mode (ripgrep not available)', + }, + }; + } + + // Use ripgrep + const { command, args } = buildRipgrepCommand({ + query, + paths: paths.length > 0 ? paths : [path], + contextLines, + maxResults, + includeHidden, + }); + + return new Promise((resolve) => { + const child = spawn(command, args, { + cwd: path || process.cwd(), + stdio: ['ignore', 'pipe', 'pipe'], + }); + + let stdout = ''; + let stderr = ''; + + child.stdout.on('data', (data) => { + stdout += data.toString(); + }); + + child.stderr.on('data', (data) => { + stderr += data.toString(); + }); + + child.on('close', (code) => { + const results: ExactMatch[] = []; + + if (code === 0 || (code === 1 && stdout.trim())) { + const lines = stdout.split('\n').filter((line) => line.trim()); + + for (const line of lines) { + try { + const item = JSON.parse(line); + + if (item.type === 'match') { + const match: ExactMatch = { + file: item.data.path.text, + line: item.data.line_number, + column: + item.data.submatches && item.data.submatches[0] + ? item.data.submatches[0].start + 1 + : 1, + content: item.data.lines.text.trim(), + }; + results.push(match); + } + } catch { + continue; + } + } + + resolve({ + success: true, + results, + metadata: { + mode: 'ripgrep', + backend: 'ripgrep', + count: results.length, + query, + }, + }); + } else { + resolve({ + success: false, + error: `ripgrep execution failed with code ${code}: ${stderr}`, + results: [], + }); + } + }); + + child.on('error', (error) => { + resolve({ + success: false, + error: `Failed to spawn ripgrep: ${error.message}`, + results: [], + }); + }); + }); +} + +/** + * Mode: exact - CodexLens exact/FTS search + * Requires index + */ +async function executeCodexLensExactMode(params: Params): Promise { + const { query, path = '.', maxResults = 10, enrich = false } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search', + }; + } + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}`, + }; + } + + // Check index status + const indexStatus = await checkIndexStatus(path); + + const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json']; + if (enrich) { + args.push('--enrich'); + } + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: result.error, + metadata: { + mode: 'exact', + backend: 'codexlens', + count: 0, + query, + warning: indexStatus.warning, + }, + }; + } + + // Parse results + let results: SemanticMatch[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '{}')); + const data = parsed.result?.results || parsed.results || parsed; + results = (Array.isArray(data) ? 
data : []).map((item: any) => ({ + file: item.path || item.file, + score: item.score || 0, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + })); + } catch { + // Keep empty results + } + + return { + success: true, + results, + metadata: { + mode: 'exact', + backend: 'codexlens', + count: results.length, + query, + warning: indexStatus.warning, + }, + }; +} + +/** + * Mode: hybrid - Best quality search with RRF fusion + * Uses CodexLens hybrid mode (exact + fuzzy + vector) + * Requires index with embeddings + */ +async function executeHybridMode(params: Params): Promise { + const { query, path = '.', maxResults = 10, enrich = false } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search', + }; + } + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}`, + }; + } + + // Check index status + const indexStatus = await checkIndexStatus(path); + + const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'hybrid', '--json']; + if (enrich) { + args.push('--enrich'); + } + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: result.error, + metadata: { + mode: 'hybrid', + backend: 'codexlens', + count: 0, + query, + warning: indexStatus.warning, + }, + }; + } + + // Parse results + let results: SemanticMatch[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '{}')); + const data = parsed.result?.results || parsed.results || parsed; + results = (Array.isArray(data) ? data : []).map((item: any) => ({ + file: item.path || item.file, + score: item.score || 0, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + })); + } catch { + return { + success: true, + results: [], + output: result.output, + metadata: { + mode: 'hybrid', + backend: 'codexlens', + count: 0, + query, + warning: indexStatus.warning || 'Failed to parse JSON output', + }, + }; + } + + return { + success: true, + results, + metadata: { + mode: 'hybrid', + backend: 'codexlens', + count: results.length, + query, + note: 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results', + warning: indexStatus.warning, + }, + }; +} + +/** + * TypeScript implementation of Reciprocal Rank Fusion + * Reference: codex-lens/src/codexlens/search/ranking.py + * Formula: score(d) = Σ weight_source / (k + rank_source(d)) + */ +function applyRRFFusion( + resultsMap: Map, + weights: Record, + limit: number, + k: number = 60, +): any[] { + const pathScores = new Map(); + + resultsMap.forEach((results, source) => { + const weight = weights[source] || 0; + if (weight === 0 || !results) return; + + results.forEach((result, rank) => { + const path = result.file || result.path; + if (!path) return; + + const rrfContribution = weight / (k + rank + 1); + + if (!pathScores.has(path)) { + pathScores.set(path, { score: 0, result, sources: [] }); + } + const entry = pathScores.get(path)!; + entry.score += rrfContribution; + if (!entry.sources.includes(source)) { + entry.sources.push(source); + } + }); + }); + + // Sort by fusion score descending + return Array.from(pathScores.values()) + .sort((a, b) => b.score - a.score) + .slice(0, limit) + .map(item => ({ + ...item.result, + fusion_score: item.score, + matched_backends: item.sources, + })); +} + +/** + * Promise wrapper with timeout support + * @param 
promise - The promise to wrap + * @param ms - Timeout in milliseconds + * @param modeName - Name of the mode for error message + * @returns A new promise that rejects on timeout + */ +function withTimeout(promise: Promise, ms: number, modeName: string): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout(() => { + reject(new Error(`'${modeName}' search timed out after ${ms}ms`)); + }, ms); + + promise + .then(resolve) + .catch(reject) + .finally(() => clearTimeout(timer)); + }); +} + +/** + * Mode: priority - Fallback search strategy: hybrid -> exact -> ripgrep + * Returns results from the first backend that succeeds and provides results. + * More efficient than parallel mode - stops as soon as valid results are found. + */ +async function executePriorityFallbackMode(params: Params): Promise { + const { query, path = '.' } = params; + const fallbackHistory: string[] = []; + + if (!query) { + return { success: false, error: 'Query is required for search' }; + } + + // Check index status first + const indexStatus = await checkIndexStatus(path); + + // 1. Try Hybrid search (highest priority) - 90s timeout for large indexes + if (indexStatus.indexed && indexStatus.has_embeddings) { + try { + const hybridResult = await withTimeout(executeHybridMode(params), 90000, 'hybrid'); + if (hybridResult.success && hybridResult.results && (hybridResult.results as any[]).length > 0) { + fallbackHistory.push('hybrid: success'); + return { + ...hybridResult, + metadata: { + ...hybridResult.metadata, + mode: 'priority', + note: 'Result from hybrid search (semantic + vector).', + fallback_history: fallbackHistory, + }, + }; + } + fallbackHistory.push('hybrid: no results'); + } catch (error) { + fallbackHistory.push(`hybrid: ${(error as Error).message}`); + } + } else { + fallbackHistory.push(`hybrid: skipped (${!indexStatus.indexed ? 'no index' : 'no embeddings'})`); + } + + // 2. Fallback to Exact search - 10s timeout + if (indexStatus.indexed) { + try { + const exactResult = await withTimeout(executeCodexLensExactMode(params), 10000, 'exact'); + if (exactResult.success && exactResult.results && (exactResult.results as any[]).length > 0) { + fallbackHistory.push('exact: success'); + return { + ...exactResult, + metadata: { + ...exactResult.metadata, + mode: 'priority', + note: 'Result from exact/FTS search (fallback from hybrid).', + fallback_history: fallbackHistory, + }, + }; + } + fallbackHistory.push('exact: no results'); + } catch (error) { + fallbackHistory.push(`exact: ${(error as Error).message}`); + } + } else { + fallbackHistory.push('exact: skipped (no index)'); + } + + // 3. Final fallback to Ripgrep - 5s timeout + try { + const ripgrepResult = await withTimeout(executeRipgrepMode(params), 5000, 'ripgrep'); + fallbackHistory.push(ripgrepResult.success ? 'ripgrep: success' : 'ripgrep: failed'); + return { + ...ripgrepResult, + metadata: { + ...ripgrepResult.metadata, + mode: 'priority', + note: 'Result from ripgrep search (final fallback).', + fallback_history: fallbackHistory, + }, + }; + } catch (error) { + fallbackHistory.push(`ripgrep: ${(error as Error).message}`); + } + + // All modes failed + return { + success: false, + error: 'All search backends in priority mode failed or returned no results.', + metadata: { + mode: 'priority', + query, + fallback_history: fallbackHistory, + } as any, + }; +} + +// Tool schema for MCP +export const schema: ToolSchema = { + name: 'smart_search', + description: `Intelligent code search with five modes. 
Use "auto" mode (default) for intelligent routing. + +**Usage:** + smart_search(query="authentication logic") # auto mode - routes to best backend + smart_search(query="MyClass", mode="exact") # exact mode - precise FTS matching + smart_search(query="auth", mode="ripgrep") # ripgrep mode - fast literal search (no index) + smart_search(query="how to auth", mode="hybrid") # hybrid mode - semantic search (requires index) + +**Index Management:** + smart_search(action="init") # Create FTS index for current directory + smart_search(action="status") # Check index and embedding status + +**Graph Enrichment:** + smart_search(query="func", enrich=true) # Enrich results with code relationships (calls, imports, called_by, imported_by) + +**Modes:** auto (intelligent routing), hybrid (semantic, needs index), exact (FTS), ripgrep (fast, no index), priority (fallback: hybrid→exact→ripgrep)`, + inputSchema: { + type: 'object', + properties: { + action: { + type: 'string', + enum: ['init', 'search', 'search_files', 'status'], + description: 'Action to perform: init (create FTS index, no embeddings), search (default), search_files (paths only), status (check index)', + default: 'search', + }, + query: { + type: 'string', + description: 'Search query (required for search/search_files actions)', + }, + mode: { + type: 'string', + enum: SEARCH_MODES, + description: 'Search mode: auto (default), hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback: hybrid->exact->ripgrep)', + default: 'auto', + }, + output_mode: { + type: 'string', + enum: ['full', 'files_only', 'count'], + description: 'Output format: full (default), files_only (paths only), count (per-file counts)', + default: 'full', + }, + path: { + type: 'string', + description: 'Directory path for init/search actions (default: current directory)', + }, + paths: { + type: 'array', + description: 'Multiple paths to search within (for search action)', + items: { + type: 'string', + }, + default: [], + }, + contextLines: { + type: 'number', + description: 'Number of context lines around matches (exact mode only)', + default: 0, + }, + maxResults: { + type: 'number', + description: 'Maximum number of results (default: 10)', + default: 10, + }, + limit: { + type: 'number', + description: 'Alias for maxResults', + default: 10, + }, + includeHidden: { + type: 'boolean', + description: 'Include hidden files/directories', + default: false, + }, + languages: { + type: 'array', + items: { type: 'string' }, + description: 'Languages to index (for init action). 
Example: ["javascript", "typescript"]', + }, + enrich: { + type: 'boolean', + description: 'Enrich search results with code graph relationships (calls, imports, called_by, imported_by).', + default: false, + }, + }, + required: [], + }, +}; + +/** + * Transform results based on output_mode + */ +function transformOutput( + results: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown[], + outputMode: 'full' | 'files_only' | 'count' +): unknown { + if (!Array.isArray(results)) { + return results; + } + + switch (outputMode) { + case 'files_only': { + // Extract unique file paths + const files = [...new Set(results.map((r: any) => r.file))].filter(Boolean); + return { files, count: files.length }; + } + case 'count': { + // Count matches per file + const counts: Record = {}; + for (const r of results) { + const file = (r as any).file; + if (file) { + counts[file] = (counts[file] || 0) + 1; + } + } + return { + files: Object.entries(counts).map(([file, count]) => ({ file, count })), + total: results.length, + }; + } + case 'full': + default: + return results; + } +} + +// Handler function +export async function handler(params: Record): Promise> { + const parsed = ParamsSchema.safeParse(params); + if (!parsed.success) { + return { success: false, error: `Invalid params: ${parsed.error.message}` }; + } + + const { action, mode, output_mode } = parsed.data; + + // Sync limit and maxResults - use the larger of the two if both provided + // This ensures user-provided values take precedence over defaults + const effectiveLimit = Math.max(parsed.data.limit || 10, parsed.data.maxResults || 10); + parsed.data.maxResults = effectiveLimit; + parsed.data.limit = effectiveLimit; + + try { + let result: SearchResult; + + // Handle actions + switch (action) { + case 'init': + result = await executeInitAction(parsed.data); + break; + + case 'status': + result = await executeStatusAction(parsed.data); + break; + + case 'search_files': + // For search_files, use search mode but force files_only output + parsed.data.output_mode = 'files_only'; + // Fall through to search + + case 'search': + default: + // Handle search modes: auto | hybrid | exact | ripgrep | priority + switch (mode) { + case 'auto': + result = await executeAutoMode(parsed.data); + break; + case 'hybrid': + result = await executeHybridMode(parsed.data); + break; + case 'exact': + result = await executeCodexLensExactMode(parsed.data); + break; + case 'ripgrep': + result = await executeRipgrepMode(parsed.data); + break; + case 'priority': + result = await executePriorityFallbackMode(parsed.data); + break; + default: + throw new Error(`Unsupported mode: ${mode}. Use: auto, hybrid, exact, ripgrep, or priority`); + } + break; + } + + // Transform output based on output_mode (for search actions only) + if (action === 'search' || action === 'search_files') { + if (result.success && result.results && output_mode !== 'full') { + result.results = transformOutput(result.results as any[], output_mode); + } + } + + return result.success ? 
{ success: true, result } : { success: false, error: result.error }; + } catch (error) { + return { success: false, error: (error as Error).message }; + } +} + +/** + * Execute init action with external progress callback + * Used by MCP server for streaming progress + */ +export async function executeInitWithProgress( + params: Record, + onProgress?: (progress: ProgressInfo) => void +): Promise { + const path = (params.path as string) || '.'; + const languages = params.languages as string[] | undefined; + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}. CodexLens will be auto-installed on first use.`, + }; + } + + const args = ['init', path]; + if (languages && languages.length > 0) { + args.push('--languages', languages.join(',')); + } + + // Track progress updates + const progressUpdates: ProgressInfo[] = []; + let lastProgress: ProgressInfo | null = null; + + const result = await executeCodexLens(args, { + cwd: path, + timeout: 1800000, // 30 minutes for large codebases + onProgress: (progress: ProgressInfo) => { + progressUpdates.push(progress); + lastProgress = progress; + // Call external progress callback if provided + if (onProgress) { + onProgress(progress); + } + }, + }); + + // Build metadata with progress info + const metadata: SearchMetadata = { + action: 'init', + path, + }; + + if (lastProgress !== null) { + const p = lastProgress as ProgressInfo; + metadata.progress = { + stage: p.stage, + message: p.message, + percent: p.percent, + filesProcessed: p.filesProcessed, + totalFiles: p.totalFiles, + }; + } + + if (progressUpdates.length > 0) { + metadata.progressHistory = progressUpdates.slice(-5); + } + + return { + success: result.success, + error: result.error, + message: result.success + ? `CodexLens index created successfully for ${path}` + : undefined, + metadata, + }; +} diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py index 5adb3ca9..f4067840 100644 --- a/codex-lens/src/codexlens/cli/embedding_manager.py +++ b/codex-lens/src/codexlens/cli/embedding_manager.py @@ -18,6 +18,27 @@ except ImportError: logger = logging.getLogger(__name__) +def _get_path_column(conn: sqlite3.Connection) -> str: + """Detect whether files table uses 'path' or 'full_path' column. + + Args: + conn: SQLite connection to the index database + + Returns: + Column name ('path' or 'full_path') + + Raises: + ValueError: If neither column exists in files table + """ + cursor = conn.execute("PRAGMA table_info(files)") + columns = {row[1] for row in cursor.fetchall()} + if 'full_path' in columns: + return 'full_path' + elif 'path' in columns: + return 'path' + raise ValueError("files table has neither 'path' nor 'full_path' column") + + def check_index_embeddings(index_path: Path) -> Dict[str, any]: """Check if an index has embeddings and return statistics. 
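# --- Illustrative note (not part of the patch) ---
# A minimal sketch of how _get_path_column keeps queries schema-agnostic across
# older indexes that expose 'path' and newer ones that expose 'full_path'. The
# files/semantic_chunks tables and the NOT IN query shape come from the hunks in
# this file; the wrapper function below and the import path (inferred from the
# repository layout) are hypothetical.
import sqlite3
from pathlib import Path

from codexlens.cli.embedding_manager import _get_path_column

def count_files_without_chunks(index_path: Path) -> int:
    """Count indexed files that have no semantic chunks yet, whatever the schema."""
    with sqlite3.connect(index_path) as conn:
        path_column = _get_path_column(conn)
        row = conn.execute(
            f"""
            SELECT COUNT(*)
            FROM files
            WHERE {path_column} NOT IN (
                SELECT DISTINCT file_path FROM semantic_chunks
            )
            """
        ).fetchone()
        return int(row[0])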
@@ -75,10 +96,11 @@ def check_index_embeddings(index_path: Path) -> Dict[str, any]: files_with_chunks = cursor.fetchone()[0] # Get a sample of files without embeddings - cursor = conn.execute(""" - SELECT full_path + path_column = _get_path_column(conn) + cursor = conn.execute(f""" + SELECT {path_column} FROM files - WHERE full_path NOT IN ( + WHERE {path_column} NOT IN ( SELECT DISTINCT file_path FROM semantic_chunks ) LIMIT 5 @@ -113,7 +135,10 @@ def generate_embeddings( chunk_size: int = 2000, progress_callback: Optional[callable] = None, ) -> Dict[str, any]: - """Generate embeddings for an index. + """Generate embeddings for an index using memory-efficient batch processing. + + This function processes files in small batches to keep memory usage under 2GB, + regardless of the total project size. Args: index_path: Path to _index.db file @@ -181,126 +206,107 @@ def generate_embeddings( "error": f"Failed to initialize components: {str(e)}", } - # Read files from index + # --- MEMORY-OPTIMIZED STREAMING PROCESSING --- + # Process files in small batches to control memory usage + # This keeps peak memory under 2GB regardless of project size + start_time = time.time() + failed_files = [] + total_chunks_created = 0 + total_files_processed = 0 + FILE_BATCH_SIZE = 100 # Process 100 files at a time + EMBEDDING_BATCH_SIZE = 8 # jina-embeddings-v2-base-code needs small batches + try: with sqlite3.connect(index_path) as conn: conn.row_factory = sqlite3.Row - cursor = conn.execute("SELECT full_path, content, language FROM files") - files = cursor.fetchall() + path_column = _get_path_column(conn) + + # Get total file count for progress reporting + total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0] + if total_files == 0: + return {"success": False, "error": "No files found in index"} + + if progress_callback: + progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...") + + cursor = conn.execute(f"SELECT {path_column}, content, language FROM files") + batch_number = 0 + + while True: + # Fetch a batch of files (streaming, not fetchall) + file_batch = cursor.fetchmany(FILE_BATCH_SIZE) + if not file_batch: + break + + batch_number += 1 + batch_chunks_with_paths = [] + files_in_batch_with_chunks = set() + + # Step 1: Chunking for the current file batch + for file_row in file_batch: + file_path = file_row[path_column] + content = file_row["content"] + language = file_row["language"] or "python" + + try: + chunks = chunker.chunk_sliding_window( + content, + file_path=file_path, + language=language + ) + if chunks: + for chunk in chunks: + batch_chunks_with_paths.append((chunk, file_path)) + files_in_batch_with_chunks.add(file_path) + except Exception as e: + logger.error(f"Failed to chunk {file_path}: {e}") + failed_files.append((file_path, str(e))) + + if not batch_chunks_with_paths: + continue + + batch_chunk_count = len(batch_chunks_with_paths) + if progress_callback: + progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks") + + # Step 2: Generate embeddings for this batch + batch_embeddings = [] + try: + for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE): + batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count) + batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]] + embeddings = embedder.embed(batch_contents) + batch_embeddings.extend(embeddings) + except Exception as e: + logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}") + 
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch]) + continue + + # Step 3: Assign embeddings to chunks + for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings): + chunk.embedding = embedding + + # Step 4: Store this batch to database immediately (releases memory) + try: + vector_store.add_chunks_batch(batch_chunks_with_paths) + total_chunks_created += batch_chunk_count + total_files_processed += len(files_in_batch_with_chunks) + except Exception as e: + logger.error(f"Failed to store batch {batch_number}: {str(e)}") + failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch]) + + # Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope + except Exception as e: - return { - "success": False, - "error": f"Failed to read files: {str(e)}", - } - - if len(files) == 0: - return { - "success": False, - "error": "No files found in index", - } - - if progress_callback: - progress_callback(f"Processing {len(files)} files...") - - # Process all files using batch operations for optimal performance - start_time = time.time() - failed_files = [] - - # --- OPTIMIZATION Step 1: Collect all chunks from all files --- - if progress_callback: - progress_callback(f"Step 1/4: Chunking {len(files)} files...") - - all_chunks_with_paths = [] # List of (chunk, file_path) tuples - files_with_chunks = set() - - for idx, file_row in enumerate(files, 1): - file_path = file_row["full_path"] - content = file_row["content"] - language = file_row["language"] or "python" - - try: - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language=language - ) - if chunks: - for chunk in chunks: - all_chunks_with_paths.append((chunk, file_path)) - files_with_chunks.add(file_path) - except Exception as e: - logger.error(f"Failed to chunk {file_path}: {e}") - failed_files.append((file_path, str(e))) - - if not all_chunks_with_paths: - elapsed_time = time.time() - start_time - return { - "success": True, - "result": { - "chunks_created": 0, - "files_processed": len(files) - len(failed_files), - "files_failed": len(failed_files), - "elapsed_time": elapsed_time, - "model_profile": model_profile, - "model_name": embedder.model_name, - "failed_files": failed_files[:5], - "index_path": str(index_path), - }, - } - - total_chunks = len(all_chunks_with_paths) - - # --- OPTIMIZATION Step 2: Batch generate embeddings with memory-safe batching --- - # Use smaller batches to avoid OOM errors while still benefiting from batch processing - # jina-embeddings-v2-base-code with long chunks needs small batches - BATCH_SIZE = 8 # Conservative batch size for memory efficiency - - if progress_callback: - num_batches = (total_chunks + BATCH_SIZE - 1) // BATCH_SIZE - progress_callback(f"Step 2/4: Generating embeddings for {total_chunks} chunks ({num_batches} batches)...") - - try: - all_embeddings = [] - for batch_start in range(0, total_chunks, BATCH_SIZE): - batch_end = min(batch_start + BATCH_SIZE, total_chunks) - batch_contents = [chunk.content for chunk, _ in all_chunks_with_paths[batch_start:batch_end]] - batch_embeddings = embedder.embed(batch_contents) - all_embeddings.extend(batch_embeddings) - - if progress_callback and total_chunks > BATCH_SIZE: - progress_callback(f" Batch {batch_start // BATCH_SIZE + 1}/{(total_chunks + BATCH_SIZE - 1) // BATCH_SIZE}: {len(batch_embeddings)} embeddings") - except Exception as e: - return { - "success": False, - "error": f"Failed to generate embeddings: {str(e)}", - } - - # --- 
OPTIMIZATION Step 3: Assign embeddings back to chunks --- - if progress_callback: - progress_callback(f"Step 3/4: Assigning {len(all_embeddings)} embeddings...") - - for (chunk, _), embedding in zip(all_chunks_with_paths, all_embeddings): - chunk.embedding = embedding - - # --- OPTIMIZATION Step 4: Batch store all chunks in single transaction --- - if progress_callback: - progress_callback(f"Step 4/4: Storing {total_chunks} chunks to database...") - - try: - vector_store.add_chunks_batch(all_chunks_with_paths) - except Exception as e: - return { - "success": False, - "error": f"Failed to store chunks: {str(e)}", - } + return {"success": False, "error": f"Failed to read or process files: {str(e)}"} elapsed_time = time.time() - start_time return { "success": True, "result": { - "chunks_created": total_chunks, - "files_processed": len(files_with_chunks), + "chunks_created": total_chunks_created, + "files_processed": total_files_processed, "files_failed": len(failed_files), "elapsed_time": elapsed_time, "model_profile": model_profile, diff --git a/codex-lens/src/codexlens/semantic/chunker.py b/codex-lens/src/codexlens/semantic/chunker.py index 38366dfb..a1df4686 100644 --- a/codex-lens/src/codexlens/semantic/chunker.py +++ b/codex-lens/src/codexlens/semantic/chunker.py @@ -150,8 +150,13 @@ class Chunker: chunk_idx += 1 # Move window, accounting for overlap - start = end - overlap_lines - if start >= len(lines) - overlap_lines: + step = lines_per_chunk - overlap_lines + if step <= 0: + step = 1 # Failsafe to prevent infinite loop + start += step + + # Break if we've reached the end + if end >= len(lines): break return chunks
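Note on the chunker change: a quick way to sanity-check the new window advance in Chunker.chunk_sliding_window is to trace the bounds it produces. The sketch below is a standalone simulation of the patched arithmetic only (it does not build chunk objects), using lines_per_chunk=50 and overlap_lines=10 as illustrative values rather than project defaults:

    # Simulate the patched sliding-window advance: step = lines_per_chunk - overlap_lines,
    # clamped to at least 1, stopping once a window reaches the end of the file.
    from typing import List, Tuple

    def window_bounds(total_lines: int, lines_per_chunk: int = 50, overlap_lines: int = 10) -> List[Tuple[int, int]]:
        bounds = []
        start = 0
        step = lines_per_chunk - overlap_lines
        if step <= 0:
            step = 1  # same failsafe as the patch: never loop forever
        while start < total_lines:
            end = min(start + lines_per_chunk, total_lines)
            bounds.append((start, end))
            if end >= total_lines:
                break
            start += step
        return bounds

    # window_bounds(120) -> [(0, 50), (40, 90), (80, 120)]
    # The step clamp also guards the case overlap_lines >= lines_per_chunk, which
    # could previously stop the window from advancing.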
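Note on result fusion: the scores produced by applyRRFFusion follow the formula quoted in its doc comment, score(d) = Σ weight_source / (k + rank_source(d)), with 0-based ranks and k = 60. A small hand-checkable sketch of the same arithmetic in Python (the backend names and weights below are made-up illustrative values, not CodexLens defaults):

    from typing import Dict, List

    def rrf_scores(ranked_lists: Dict[str, List[str]], weights: Dict[str, float], k: int = 60) -> Dict[str, float]:
        """Reciprocal Rank Fusion: each source contributes weight / (k + rank + 1)."""
        scores: Dict[str, float] = {}
        for source, paths in ranked_lists.items():
            w = weights.get(source, 0.0)
            for rank, path in enumerate(paths):
                scores[path] = scores.get(path, 0.0) + w / (k + rank + 1)
        return scores

    # A file returned by both backends roughly doubles its score versus a file
    # seen by only one of them:
    # rrf_scores({"vector": ["a.py", "b.py"], "exact": ["a.py", "c.py"]},
    #            {"vector": 1.0, "exact": 1.0})
    # -> {"a.py": 0.0328, "b.py": 0.0161, "c.py": 0.0161} (approximately)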