Mirror of https://github.com/catlog22/Claude-Code-Workflow.git, synced 2026-02-11 02:33:51 +08:00
fix: improve chunking logic in Chunker class and enhance smart search tool with comprehensive features
- Updated the Chunker class to adjust the window movement logic, ensuring proper handling of overlap lines.
- Introduced a new smart search tool with intent classification, CodexLens integration, multi-backend search routing, and index status checking.
- Implemented search modes (auto, hybrid, exact, ripgrep, priority) with detailed metadata and error handling.
- Added progress tracking during index initialization and output transformation based on user-defined modes.
- Included documentation for usage and parameters in the smart search tool.
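The Chunker change itself is not part of this excerpt. As a rough illustration of the overlap rule the first bullet describes, a window of `size` lines must advance by `size - overlap` lines so consecutive chunks share exactly the overlap; this is a hypothetical sketch, not the repo's Chunker class:

```ts
// Hypothetical sketch of overlap-aware chunk windows (the actual Chunker is not shown in this diff).
// With size=5 and overlap=2, the window advances by size - overlap = 3 lines per step.
function* chunkLines(lines: string[], size = 5, overlap = 2): Generator<string[]> {
  const step = Math.max(1, size - overlap); // guard: always move forward, even if overlap >= size
  for (let start = 0; start < lines.length; start += step) {
    yield lines.slice(start, start + size);
    if (start + size >= lines.length) break; // final window covers the tail
  }
}
```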
@@ -384,13 +384,16 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise<boolean>
   // API: CodexLens Init (Initialize workspace index)
   if (pathname === '/api/codexlens/init' && req.method === 'POST') {
     handlePostRequest(req, res, async (body) => {
-      const { path: projectPath, indexType = 'vector' } = body;
+      const { path: projectPath, indexType = 'vector', embeddingModel = 'code' } = body;
       const targetPath = projectPath || initialPath;

       // Build CLI arguments based on index type
       const args = ['init', targetPath, '--json'];
       if (indexType === 'normal') {
         args.push('--no-embeddings');
+      } else {
+        // Add embedding model selection for vector index
+        args.push('--embedding-model', embeddingModel);
       }

       // Broadcast start event
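For orientation, the branch above maps the request body onto CodexLens CLI flags. A minimal sketch of that mapping (`buildInitArgs` is a hypothetical name; the flags are the ones used in the diff):

```ts
// Sketch of the arg-building branch above, assuming the same request body shape.
function buildInitArgs(body: { path?: string; indexType?: string; embeddingModel?: string }): string[] {
  const { path = '.', indexType = 'vector', embeddingModel = 'code' } = body;
  const args = ['init', path, '--json'];
  if (indexType === 'normal') {
    args.push('--no-embeddings');                    // FTS-only index
  } else {
    args.push('--embedding-model', embeddingModel);  // vector index with the chosen model
  }
  return args;
}

// buildInitArgs({ indexType: 'vector', embeddingModel: 'fast' })
//   -> ['init', '.', '--json', '--embedding-model', 'fast']
```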
@@ -275,6 +275,7 @@ const i18n = {
     'codexlens.semanticInstalled': 'Semantic dependencies installed',
     'codexlens.semanticNotInstalled': 'Semantic dependencies not installed',
     'codexlens.installDeps': 'Install Dependencies',
+    'codexlens.installDepsPrompt': 'Would you like to install them now? (This may take a few minutes)\n\nClick "Cancel" to create FTS index only.',
     'codexlens.installingDeps': 'Installing dependencies...',
     'codexlens.depsInstalled': 'Dependencies installed successfully',
     'codexlens.depsInstallFailed': 'Failed to install dependencies',
@@ -324,8 +325,15 @@ const i18n = {
     'index.cleanAllSuccess': 'All indexes cleaned',
     'index.vectorIndex': 'Vector',
     'index.normalIndex': 'FTS',
+    'index.fullIndex': 'Full Index',
     'index.vectorDesc': 'Semantic search with embeddings',
     'index.normalDesc': 'Fast full-text search only',
+    'index.fullDesc': 'FTS + Semantic search (recommended)',
+    'index.selectModel': 'Select embedding model',
+    'index.modelCode': 'Code (768d)',
+    'index.modelFast': 'Fast (384d)',
+    'index.modelMultilingual': 'Multilingual (1024d)',
+    'index.modelBalanced': 'Balanced (1024d)',

     // Semantic Search Configuration
     'semantic.settings': 'Semantic Search Settings',
@@ -1596,6 +1604,7 @@ const i18n = {
     'codexlens.semanticInstalled': '语义搜索依赖已安装',
     'codexlens.semanticNotInstalled': '语义搜索依赖未安装',
     'codexlens.installDeps': '安装依赖',
+    'codexlens.installDepsPrompt': '是否立即安装?(可能需要几分钟)\n\n点击"取消"将只创建 FTS 索引。',
     'codexlens.installingDeps': '安装依赖中...',
     'codexlens.depsInstalled': '依赖安装成功',
     'codexlens.depsInstallFailed': '依赖安装失败',
@@ -1645,8 +1654,15 @@ const i18n = {
     'index.cleanAllSuccess': '所有索引已清理',
     'index.vectorIndex': '向量索引',
     'index.normalIndex': 'FTS索引',
+    'index.fullIndex': '全部索引',
     'index.vectorDesc': '语义搜索(含嵌入向量)',
     'index.normalDesc': '快速全文搜索',
+    'index.fullDesc': 'FTS + 语义搜索(推荐)',
+    'index.selectModel': '选择嵌入模型',
+    'index.modelCode': '代码优化 (768维)',
+    'index.modelFast': '快速轻量 (384维)',
+    'index.modelMultilingual': '多语言 (1024维)',
+    'index.modelBalanced': '高精度 (1024维)',

     // Semantic Search 配置
     'semantic.settings': '语义搜索设置',
@@ -338,6 +338,17 @@ async function renderCliManager() {
   if (window.lucide) lucide.createIcons();
 }

+// ========== Helper Functions ==========
+
+/**
+ * Get selected embedding model from dropdown
+ * @returns {string} Selected model profile (code, fast, multilingual, balanced)
+ */
+function getSelectedModel() {
+  var select = document.getElementById('codexlensModelSelect');
+  return select ? select.value : 'code';
+}
+
 // ========== Tools Section (Left Column) ==========
 function renderToolsSection() {
   var container = document.getElementById('tools-section');
@@ -392,8 +403,15 @@ function renderToolsSection() {
         '<div class="tool-item-right">' +
         (codexLensStatus.ready
           ? '<span class="tool-status-text success"><i data-lucide="check-circle" class="w-3.5 h-3.5"></i> v' + (codexLensStatus.version || 'installed') + '</span>' +
-            '<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'vector\')" title="' + (t('index.vectorDesc') || 'Semantic search with embeddings') + '"><i data-lucide="sparkles" class="w-3 h-3"></i> ' + (t('index.vectorIndex') || 'Vector') + '</button>' +
-            '<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'normal\')" title="' + (t('index.normalDesc') || 'Fast full-text search only') + '"><i data-lucide="file-text" class="w-3 h-3"></i> ' + (t('index.normalIndex') || 'FTS') + '</button>' +
+            '<select id="codexlensModelSelect" class="btn-sm bg-muted border border-border rounded text-xs" onclick="event.stopPropagation()" title="' + (t('index.selectModel') || 'Select embedding model') + '">' +
+            '<option value="code">' + (t('index.modelCode') || 'Code (768d)') + '</option>' +
+            '<option value="fast">' + (t('index.modelFast') || 'Fast (384d)') + '</option>' +
+            '<option value="multilingual">' + (t('index.modelMultilingual') || 'Multilingual (1024d)') + '</option>' +
+            '<option value="balanced">' + (t('index.modelBalanced') || 'Balanced (1024d)') + '</option>' +
+            '</select>' +
+            '<button class="btn-sm btn-primary" onclick="event.stopPropagation(); initCodexLensIndex(\'full\', getSelectedModel())" title="' + (t('index.fullDesc') || 'FTS + Semantic search (recommended)') + '"><i data-lucide="layers" class="w-3 h-3"></i> ' + (t('index.fullIndex') || '全部索引') + '</button>' +
+            '<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'vector\', getSelectedModel())" title="' + (t('index.vectorDesc') || 'Semantic search with embeddings') + '"><i data-lucide="sparkles" class="w-3 h-3"></i> ' + (t('index.vectorIndex') || '向量索引') + '</button>' +
+            '<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'normal\')" title="' + (t('index.normalDesc') || 'Fast full-text search only') + '"><i data-lucide="file-text" class="w-3 h-3"></i> ' + (t('index.normalIndex') || 'FTS索引') + '</button>' +
             '<button class="btn-sm btn-outline btn-danger" onclick="event.stopPropagation(); uninstallCodexLens()"><i data-lucide="trash-2" class="w-3 h-3"></i> ' + t('cli.uninstall') + '</button>'
           : '<span class="tool-status-text muted"><i data-lucide="circle-dashed" class="w-3.5 h-3.5"></i> ' + t('cli.notInstalled') + '</span>' +
             '<button class="btn-sm btn-primary" onclick="event.stopPropagation(); installCodexLens()"><i data-lucide="download" class="w-3 h-3"></i> ' + t('cli.install') + '</button>') +
@@ -554,10 +554,54 @@ async function deleteModel(profile) {

 /**
  * Initialize CodexLens index with bottom floating progress bar
- * @param {string} indexType - 'vector' (with embeddings) or 'normal' (FTS only)
+ * @param {string} indexType - 'vector' (with embeddings), 'normal' (FTS only), or 'full' (FTS + Vector)
+ * @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced'
  */
-function initCodexLensIndex(indexType) {
+async function initCodexLensIndex(indexType, embeddingModel) {
   indexType = indexType || 'vector';
+  embeddingModel = embeddingModel || 'code';
+
+  // For vector or full index, check if semantic dependencies are available
+  if (indexType === 'vector' || indexType === 'full') {
+    try {
+      var semanticResponse = await fetch('/api/codexlens/semantic/status');
+      var semanticStatus = await semanticResponse.json();
+
+      if (!semanticStatus.available) {
+        // Semantic deps not installed - show confirmation dialog
+        var installDeps = confirm(
+          (t('codexlens.semanticNotInstalled') || 'Semantic search dependencies are not installed.') + '\n\n' +
+          (t('codexlens.installDepsPrompt') || 'Would you like to install them now? (This may take a few minutes)\n\nClick "Cancel" to create FTS index only.')
+        );
+
+        if (installDeps) {
+          // Install semantic dependencies first
+          showRefreshToast(t('codexlens.installingDeps') || 'Installing semantic dependencies...', 'info');
+          try {
+            var installResponse = await fetch('/api/codexlens/semantic/install', { method: 'POST' });
+            var installResult = await installResponse.json();
+
+            if (!installResult.success) {
+              showRefreshToast((t('codexlens.depsInstallFailed') || 'Failed to install dependencies') + ': ' + installResult.error, 'error');
+              // Fall back to FTS only
+              indexType = 'normal';
+            } else {
+              showRefreshToast(t('codexlens.depsInstalled') || 'Dependencies installed successfully', 'success');
+            }
+          } catch (err) {
+            showRefreshToast((t('common.error') || 'Error') + ': ' + err.message, 'error');
+            indexType = 'normal';
+          }
+        } else {
+          // User chose to skip - create FTS only
+          indexType = 'normal';
+        }
+      }
+    } catch (err) {
+      console.warn('[CodexLens] Could not check semantic status:', err);
+      // Continue with requested type, backend will handle fallback
+    }
+  }

   // Remove existing progress bar if any
   closeCodexLensIndexModal();
@@ -566,7 +610,24 @@ function initCodexLensIndex(indexType) {
   var progressBar = document.createElement('div');
   progressBar.id = 'codexlensIndexFloating';
   progressBar.className = 'fixed bottom-0 left-0 right-0 z-50 bg-card border-t border-border shadow-lg transform transition-transform duration-300';
-  var indexTypeLabel = indexType === 'vector' ? 'Vector' : 'FTS';
+
+  // Determine display label
+  var indexTypeLabel;
+  if (indexType === 'full') {
+    indexTypeLabel = 'FTS + Vector';
+  } else if (indexType === 'vector') {
+    indexTypeLabel = 'Vector';
+  } else {
+    indexTypeLabel = 'FTS';
+  }
+
+  // Add model info for vector indexes
+  var modelLabel = '';
+  if (indexType !== 'normal') {
+    var modelNames = { code: 'Code', fast: 'Fast', multilingual: 'Multi', balanced: 'Balanced' };
+    modelLabel = ' [' + (modelNames[embeddingModel] || embeddingModel) + ']';
+  }
+
   progressBar.innerHTML =
     '<div class="max-w-4xl mx-auto px-4 py-3">' +
     '<div class="flex items-center justify-between gap-4">' +
@@ -574,7 +635,7 @@ function initCodexLensIndex(indexType) {
     '<div class="animate-spin w-5 h-5 border-2 border-primary border-t-transparent rounded-full flex-shrink-0" id="codexlensIndexSpinner"></div>' +
     '<div class="flex-1 min-w-0">' +
     '<div class="flex items-center gap-2">' +
-    '<span class="font-medium text-sm">' + t('codexlens.indexing') + ' (' + indexTypeLabel + ')</span>' +
+    '<span class="font-medium text-sm">' + t('codexlens.indexing') + ' (' + indexTypeLabel + modelLabel + ')</span>' +
     '<span class="text-xs text-muted-foreground" id="codexlensIndexPercent">0%</span>' +
     '</div>' +
     '<div class="text-xs text-muted-foreground truncate" id="codexlensIndexStatus">' + t('codexlens.preparingIndex') + '</div>' +
@@ -594,16 +655,21 @@ function initCodexLensIndex(indexType) {
   document.body.appendChild(progressBar);
   if (window.lucide) lucide.createIcons();

-  // Start indexing with specified type
-  startCodexLensIndexing(indexType);
+  // For 'full' type, use 'vector' in the API (it creates FTS + embeddings)
+  var apiIndexType = (indexType === 'full') ? 'vector' : indexType;
+
+  // Start indexing with specified type and model
+  startCodexLensIndexing(apiIndexType, embeddingModel);
 }

 /**
  * Start the indexing process
  * @param {string} indexType - 'vector' or 'normal'
+ * @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced'
  */
-async function startCodexLensIndexing(indexType) {
+async function startCodexLensIndexing(indexType, embeddingModel) {
   indexType = indexType || 'vector';
+  embeddingModel = embeddingModel || 'code';
   var statusText = document.getElementById('codexlensIndexStatus');
   var progressBar = document.getElementById('codexlensIndexProgressBar');
   var percentText = document.getElementById('codexlensIndexPercent');
@@ -635,11 +701,11 @@ async function startCodexLensIndexing(indexType) {
   }

   try {
-    console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType);
+    console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType, 'model:', embeddingModel);
     var response = await fetch('/api/codexlens/init', {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ path: projectPath, indexType: indexType })
+      body: JSON.stringify({ path: projectPath, indexType: indexType, embeddingModel: embeddingModel })
     });

     var result = await response.json();
@@ -429,7 +429,7 @@ function parseProgressLine(line: string): ProgressInfo | null {
 }

 /**
- * Execute CodexLens CLI command
+ * Execute CodexLens CLI command with real-time progress updates
  * @param args - CLI arguments
  * @param options - Execution options
  * @returns Execution result
@@ -463,34 +463,110 @@ async function executeCodexLens(args: string[], options: ExecuteOptions = {}): P
     fullCmd = `${quotedPython} -m codexlens ${cmdArgs.join(' ')}`;
   }

-  // Use exec with shell option for cross-platform compatibility
-  exec(fullCmd, {
-    cwd: process.platform === 'win32' ? undefined : cwd, // Don't use cwd on Windows, use cd command instead
-    timeout,
-    maxBuffer: 50 * 1024 * 1024, // 50MB buffer for large outputs
-    shell: process.platform === 'win32' ? process.env.ComSpec : undefined,
-  }, (error, stdout, stderr) => {
-    if (error) {
-      if (error.killed) {
-        resolve({ success: false, error: 'Command timed out' });
-      } else {
-        resolve({ success: false, error: stderr || error.message });
-      }
-      return;
-    }
-
-    // Report final progress if callback provided
-    if (onProgress && stdout) {
-      const lines = stdout.split('\n');
-      for (const line of lines) {
-        const progress = parseProgressLine(line.trim());
-        if (progress) {
-          onProgress(progress);
-        }
-      }
-    }
-
-    resolve({ success: true, output: stdout.trim() });
+  // Use spawn with shell for real-time progress updates
+  // spawn streams output in real-time, unlike exec which buffers until completion
+  const child = spawn(fullCmd, [], {
+    cwd: process.platform === 'win32' ? undefined : cwd,
+    shell: process.platform === 'win32' ? process.env.ComSpec || true : true,
+    timeout,
+  });
+
+  let stdout = '';
+  let stderr = '';
+  let stdoutLineBuffer = '';
+  let stderrLineBuffer = '';
+  let timeoutHandle: NodeJS.Timeout | null = null;
+  let resolved = false;
+
+  // Helper to safely resolve only once
+  const safeResolve = (result: ExecuteResult) => {
+    if (resolved) return;
+    resolved = true;
+    if (timeoutHandle) {
+      clearTimeout(timeoutHandle);
+      timeoutHandle = null;
+    }
+    resolve(result);
+  };
+
+  // Set up timeout handler
+  if (timeout > 0) {
+    timeoutHandle = setTimeout(() => {
+      if (!resolved) {
+        child.kill('SIGTERM');
+        // Give it a moment to die gracefully, then force kill
+        setTimeout(() => {
+          if (!resolved) {
+            child.kill('SIGKILL');
+          }
+        }, 5000);
+        safeResolve({ success: false, error: 'Command timed out' });
+      }
+    }, timeout);
+  }
+
+  // Process stdout line by line for real-time progress
+  child.stdout?.on('data', (data: Buffer) => {
+    const chunk = data.toString();
+    stdoutLineBuffer += chunk;
+    stdout += chunk;
+
+    // Process complete lines
+    const lines = stdoutLineBuffer.split('\n');
+    stdoutLineBuffer = lines.pop() || ''; // Keep incomplete line in buffer
+
+    for (const line of lines) {
+      const trimmedLine = line.trim();
+      if (trimmedLine && onProgress) {
+        const progress = parseProgressLine(trimmedLine);
+        if (progress) {
+          onProgress(progress);
+        }
+      }
+    }
+  });
+
+  // Collect stderr
+  child.stderr?.on('data', (data: Buffer) => {
+    const chunk = data.toString();
+    stderrLineBuffer += chunk;
+    stderr += chunk;
+
+    // Also check stderr for progress (some tools output progress to stderr)
+    const lines = stderrLineBuffer.split('\n');
+    stderrLineBuffer = lines.pop() || '';

+    for (const line of lines) {
+      const trimmedLine = line.trim();
+      if (trimmedLine && onProgress) {
+        const progress = parseProgressLine(trimmedLine);
+        if (progress) {
+          onProgress(progress);
+        }
+      }
+    }
+  });
+
+  // Handle process errors (spawn failure)
+  child.on('error', (err) => {
+    safeResolve({ success: false, error: `Failed to start process: ${err.message}` });
+  });
+
+  // Handle process completion
+  child.on('close', (code) => {
+    // Process any remaining buffered content
+    if (stdoutLineBuffer.trim() && onProgress) {
+      const progress = parseProgressLine(stdoutLineBuffer.trim());
+      if (progress) {
+        onProgress(progress);
+      }
+    }
+
+    if (code === 0) {
+      safeResolve({ success: true, output: stdout.trim() });
+    } else {
+      safeResolve({ success: false, error: stderr.trim() || `Process exited with code ${code}` });
+    }
   });
   });
 }
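The stdout handler above relies on a standard line-buffering pattern: a chunk boundary can split a line, so the incomplete tail is carried into the next 'data' event. A minimal standalone sketch of just that pattern (names are illustrative, not the repo's API):

```ts
// Minimal line-buffering over a chunked stream.
function makeLineSplitter(onLine: (line: string) => void): (chunk: string) => void {
  let buffer = '';
  return (chunk: string) => {
    buffer += chunk;
    const lines = buffer.split('\n');
    buffer = lines.pop() || ''; // keep the incomplete tail for the next chunk
    for (const line of lines) onLine(line);
  };
}

const feed = makeLineSplitter((line) => console.log('line:', line));
feed('{"progress": 5'); // no callback yet - line is incomplete
feed('0}\n');           // emits: line: {"progress": 50}
```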
@@ -25,18 +25,26 @@ import type { ProgressInfo } from './codex-lens.js';

 // Define Zod schema for validation
 const ParamsSchema = z.object({
-  action: z.enum(['init', 'search', 'search_files', 'status']).default('search'),
-  query: z.string().optional(),
+  // Action: search (content), find_files (path/name pattern), init, status
+  // Note: search_files is deprecated, use search with output_mode='files_only'
+  action: z.enum(['init', 'search', 'search_files', 'find_files', 'status']).default('search'),
+  query: z.string().optional().describe('Content search query (for action="search")'),
+  pattern: z.string().optional().describe('Glob pattern for path matching (for action="find_files")'),
   mode: z.enum(['auto', 'hybrid', 'exact', 'ripgrep', 'priority']).default('auto'),
   output_mode: z.enum(['full', 'files_only', 'count']).default('full'),
   path: z.string().optional(),
   paths: z.array(z.string()).default([]),
   contextLines: z.number().default(0),
-  maxResults: z.number().default(10),
+  maxResults: z.number().default(20), // Increased default
   includeHidden: z.boolean().default(false),
   languages: z.array(z.string()).optional(),
-  limit: z.number().default(10),
+  limit: z.number().default(20), // Increased default
+  offset: z.number().default(0), // NEW: Pagination offset (start_index)
   enrich: z.boolean().default(false),
+  // Search modifiers for ripgrep mode
+  regex: z.boolean().default(true), // Use regex pattern matching (default: enabled)
+  caseSensitive: z.boolean().default(true), // Case sensitivity (default: case-sensitive)
+  // Fuzzy matching is implicit in hybrid mode (RRF fusion)
 });

 type Params = z.infer<typeof ParamsSchema>;
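Since every field in the schema is optional or defaulted, a bare call parses into a fully-populated Params object. A quick sketch using Zod's standard `.parse` / `.default` API on a trimmed-down version of the schema above:

```ts
import { z } from 'zod';

// Trimmed-down version of ParamsSchema, just to show how defaults apply.
const Demo = z.object({
  mode: z.enum(['auto', 'hybrid', 'exact', 'ripgrep', 'priority']).default('auto'),
  limit: z.number().default(20),
  offset: z.number().default(0),
  regex: z.boolean().default(true),
});

console.log(Demo.parse({}));
// -> { mode: 'auto', limit: 20, offset: 0, regex: true }
console.log(Demo.parse({ mode: 'ripgrep', offset: 40 }));
// -> { mode: 'ripgrep', limit: 20, offset: 40, regex: true }
```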
@@ -47,6 +55,46 @@ const SEARCH_MODES = ['auto', 'hybrid', 'exact', 'ripgrep', 'priority'] as const
 // Classification confidence threshold
 const CONFIDENCE_THRESHOLD = 0.7;

+// File filtering configuration (ported from code-index)
+const FILTER_CONFIG = {
+  exclude_directories: new Set([
+    '.git', '.svn', '.hg', '.bzr',
+    'node_modules', '__pycache__', '.venv', 'venv', 'vendor', 'bower_components',
+    'dist', 'build', 'target', 'out', 'bin', 'obj',
+    '.idea', '.vscode', '.vs', '.sublime-workspace',
+    '.pytest_cache', '.coverage', '.tox', '.nyc_output', 'coverage', 'htmlcov',
+    '.next', '.nuxt', '.cache', '.parcel-cache',
+    '.DS_Store', 'Thumbs.db',
+  ]),
+  exclude_files: new Set([
+    '*.tmp', '*.temp', '*.swp', '*.swo', '*.bak', '*~', '*.orig', '*.log',
+    'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'Pipfile.lock',
+  ]),
+  // Windows device files - must use **/ pattern to match in any directory
+  // These cause "os error 1" on Windows when accessed
+  windows_device_files: new Set([
+    'nul', 'con', 'aux', 'prn',
+    'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9',
+    'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9',
+  ]),
+};
+
+function buildExcludeArgs(): string[] {
+  const args: string[] = [];
+  for (const dir of FILTER_CONFIG.exclude_directories) {
+    args.push('--glob', `!**/${dir}/**`);
+  }
+  for (const pattern of FILTER_CONFIG.exclude_files) {
+    args.push('--glob', `!${pattern}`);
+  }
+  // Windows device files need case-insensitive matching in any directory
+  for (const device of FILTER_CONFIG.windows_device_files) {
+    args.push('--glob', `!**/${device}`);
+    args.push('--glob', `!**/${device.toUpperCase()}`);
+  }
+  return args;
+}
+
 interface Classification {
   mode: string;
   confidence: number;
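For a sense of what buildExcludeArgs produces: each entry becomes a negated ripgrep --glob, so the argument vector starts like this (first few entries shown; order follows Set insertion order):

```ts
// Illustrative: the start of the argument list produced by buildExcludeArgs().
const args = buildExcludeArgs();
console.log(args.slice(0, 6));
// -> ['--glob', '!**/.git/**', '--glob', '!**/.svn/**', '--glob', '!**/.hg/**']
// ...followed by file patterns like '!*.tmp' and device-file globs
// such as '!**/nul' and '!**/NUL' (lower- and upper-case variants).
```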
@@ -83,11 +131,27 @@ interface GraphMatch {
   relationships: unknown[];
 }

+// File match for find_files action (path-based search)
+interface FileMatch {
+  path: string;
+  type: 'file' | 'directory';
+  name: string; // Filename only
+  extension?: string; // File extension (without dot)
+}
+
+interface PaginationInfo {
+  offset: number; // Starting index of returned results
+  limit: number; // Number of results requested
+  total: number; // Total number of results found
+  has_more: boolean; // True if more results are available
+}
+
 interface SearchMetadata {
   mode?: string;
   backend?: string;
   count?: number;
   query?: string;
+  pattern?: string; // For find_files action
   classified_as?: string;
   confidence?: number;
   reasoning?: string;
@@ -96,6 +160,9 @@ interface SearchMetadata {
   note?: string;
   index_status?: 'indexed' | 'not_indexed' | 'partial';
   fallback_history?: string[];
+  suggested_weights?: Record<string, number>;
+  // Pagination metadata
+  pagination?: PaginationInfo;
   // Init action specific
   action?: string;
   path?: string;
@@ -111,7 +178,7 @@ interface SearchMetadata {

 interface SearchResult {
   success: boolean;
-  results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown;
+  results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown;
   output?: string;
   metadata?: SearchMetadata;
   error?: string;
@@ -236,6 +303,14 @@ function detectRelationship(query: string): boolean {
   return /(import|export|uses?|depends?|calls?|extends?)\s/i.test(query);
 }

+function looksLikeCodeQuery(query: string): boolean {
+  if (/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true;
+  if (/[:.<>\-=(){}[\]]/.test(query) && query.split(/\s+/).length <= 2) return true;
+  if (/\.\*|\\\(|\\\[|\\s/.test(query)) return true;
+  if (/^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true;
+  return false;
+}
+
 /**
  * Classify query intent and recommend search mode
  * Simple mapping: hybrid (NL + index + embeddings) | exact (index or insufficient embeddings) | ripgrep (no index)
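Tracing the heuristics in looksLikeCodeQuery against a few inputs; each outcome follows directly from the regexes above:

```ts
looksLikeCodeQuery('handleRequest');         // true  - bare identifier (first rule)
looksLikeCodeQuery('config.timeout');        // true  - symbol char '.' and <= 2 words
looksLikeCodeQuery('class.*Builder');        // true  - symbol chars, plus the '.*' regex rule
looksLikeCodeQuery('foo->bar');              // true  - '-' and '>' symbol chars, one word
looksLikeCodeQuery('how do I authenticate'); // false - plain natural language, no symbol chars
```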
@@ -245,34 +320,34 @@
  * @returns Classification result
  */
 function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification {
-  // Detect query patterns
   const isNaturalLanguage = detectNaturalLanguage(query);
+  const isCodeQuery = looksLikeCodeQuery(query);
+  const isRegexPattern = detectRegex(query);

-  // Simple decision tree
   let mode: string;
   let confidence: number;

   if (!hasIndex) {
-    // No index: use ripgrep
     mode = 'ripgrep';
     confidence = 1.0;
+  } else if (isCodeQuery || isRegexPattern) {
+    mode = 'exact';
+    confidence = 0.95;
   } else if (isNaturalLanguage && hasSufficientEmbeddings) {
-    // Natural language + sufficient embeddings: use hybrid
     mode = 'hybrid';
     confidence = 0.9;
   } else {
-    // Simple query OR insufficient embeddings: use exact
     mode = 'exact';
     confidence = 0.8;
   }

-  // Build reasoning string
   const detectedPatterns: string[] = [];
   if (detectLiteral(query)) detectedPatterns.push('literal');
   if (detectRegex(query)) detectedPatterns.push('regex');
   if (detectNaturalLanguage(query)) detectedPatterns.push('natural language');
   if (detectFilePath(query)) detectedPatterns.push('file path');
   if (detectRelationship(query)) detectedPatterns.push('relationship');
+  if (isCodeQuery) detectedPatterns.push('code identifier');

   const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 'sufficient' : 'insufficient'})`;

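The resulting decision tree, traced for a few representative inputs (outcomes follow mechanically from the branches above, assuming detectNaturalLanguage matches the example phrase):

```ts
// No index always wins:
classifyIntent('anything', false, false);           // -> { mode: 'ripgrep', confidence: 1.0, ... }
// Code-shaped query with an index routes to exact FTS:
classifyIntent('handleRequest', true, true);        // -> { mode: 'exact', confidence: 0.95, ... }
// Natural language with index + embeddings goes hybrid:
classifyIntent('how is auth handled', true, true);  // -> { mode: 'hybrid', confidence: 0.9, ... }
// Natural language without enough embeddings falls back to exact:
classifyIntent('how is auth handled', true, false); // -> { mode: 'exact', confidence: 0.8, ... }
```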
@@ -306,34 +381,46 @@ function buildRipgrepCommand(params: {
   contextLines: number;
   maxResults: number;
   includeHidden: boolean;
+  regex?: boolean;
+  caseSensitive?: boolean;
 }): { command: string; args: string[] } {
-  const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false } = params;
+  const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true } = params;

   const args = [
-    '-n', // Show line numbers
-    '--color=never', // Disable color output
-    '--json', // Output in JSON format
+    '-n',
+    '--color=never',
+    '--json',
   ];

-  // Add context lines if specified
+  // Add file filtering (unless includeHidden is true)
+  if (!includeHidden) {
+    args.push(...buildExcludeArgs());
+  }
+
+  // Case sensitivity
+  if (!caseSensitive) {
+    args.push('--ignore-case');
+  }
+
   if (contextLines > 0) {
     args.push('-C', contextLines.toString());
   }

-  // Add max results limit
   if (maxResults > 0) {
     args.push('--max-count', maxResults.toString());
   }

-  // Include hidden files if specified
   if (includeHidden) {
     args.push('--hidden');
   }

-  // Use literal/fixed string matching for exact mode
+  // Regex mode (-e) vs fixed string mode (-F)
+  if (regex) {
+    args.push('-e', query);
+  } else {
     args.push('-F', query);
+  }

-  // Add search paths
   args.push(...paths);

   return { command: 'rg', args };
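So a call with the new modifiers emits a fixed flag set plus the exclusion globs. For example (flags taken from the function above; the glob list is abbreviated):

```ts
const { command, args } = buildRipgrepCommand({
  query: 'class.*Builder',
  paths: ['src'],
  contextLines: 2,
  maxResults: 20,
  includeHidden: false,
  regex: true,
  caseSensitive: false,
});
// command: 'rg'
// args: ['-n', '--color=never', '--json',
//        '--glob', '!**/.git/**', /* ...more exclusions from buildExcludeArgs()... */
//        '--ignore-case', '-C', '2', '--max-count', '20',
//        '-e', 'class.*Builder', 'src']
```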
@@ -492,7 +579,7 @@ async function executeAutoMode(params: Params): Promise<SearchResult> {
  * No index required, fallback to CodexLens if ripgrep unavailable
  */
 async function executeRipgrepMode(params: Params): Promise<SearchResult> {
-  const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.' } = params;
+  const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true } = params;

   if (!query) {
     return {
@@ -566,6 +653,8 @@ async function executeRipgrepMode(params: Params): Promise<SearchResult> {
     contextLines,
     maxResults,
     includeHidden,
+    regex,
+    caseSensitive,
   });

   return new Promise((resolve) => {
@@ -587,8 +676,6 @@

     child.on('close', (code) => {
       const results: ExactMatch[] = [];

-      if (code === 0 || (code === 1 && stdout.trim())) {
       const lines = stdout.split('\n').filter((line) => line.trim());

       for (const line of lines) {
@@ -612,6 +699,11 @@
         }
       }

+      // Handle Windows device file errors gracefully (os error 1)
+      // If we have results despite the error, return them as partial success
+      const isWindowsDeviceError = stderr.includes('os error 1') || stderr.includes('函数不正确');
+
+      if (code === 0 || code === 1 || (isWindowsDeviceError && results.length > 0)) {
       resolve({
         success: true,
         results,
@@ -620,6 +712,20 @@
           backend: 'ripgrep',
           count: results.length,
           query,
+          ...(isWindowsDeviceError && { warning: 'Some Windows device files were skipped' }),
+        },
+      });
+      } else if (isWindowsDeviceError && results.length === 0) {
+        // Windows device error but no results - might be the only issue
+        resolve({
+          success: true,
+          results: [],
+          metadata: {
+            mode: 'ripgrep',
+            backend: 'ripgrep',
+            count: 0,
+            query,
+            warning: 'No matches found (some Windows device files were skipped)',
         },
       });
       } else {
@@ -764,15 +870,42 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {

   // Parse results
   let results: SemanticMatch[] = [];
+  let baselineInfo: { score: number; count: number } | null = null;
+  let initialCount = 0;
+
   try {
     const parsed = JSON.parse(stripAnsi(result.output || '{}'));
     const data = parsed.result?.results || parsed.results || parsed;
-    results = (Array.isArray(data) ? data : []).map((item: any) => ({
+    results = (Array.isArray(data) ? data : []).map((item: any) => {
+      const rawScore = item.score || 0;
+      // Hybrid mode returns distance scores (lower is better).
+      // Convert to similarity scores (higher is better) for consistency.
+      // Formula: similarity = 1 / (1 + distance)
+      const similarityScore = rawScore > 0 ? 1 / (1 + rawScore) : 1;
+      return {
       file: item.path || item.file,
-      score: item.score || 0,
+      score: similarityScore,
       content: item.excerpt || item.content || '',
       symbol: item.symbol || null,
-    }));
+      };
+    });
+
+    initialCount = results.length;
+
+    // Post-processing pipeline to improve semantic search quality
+    // 0. Filter dominant baseline scores (hot spot detection)
+    const baselineResult = filterDominantBaselineScores(results);
+    results = baselineResult.filteredResults;
+    baselineInfo = baselineResult.baselineInfo;
+
+    // 1. Filter noisy files (coverage, node_modules, etc.)
+    results = filterNoisyFiles(results);
+    // 2. Boost results containing query keywords
+    results = applyKeywordBoosting(results, query);
+    // 3. Enforce score diversity (penalize identical scores)
+    results = enforceScoreDiversity(results);
+    // 4. Re-sort by adjusted scores
+    results.sort((a, b) => b.score - a.score);
   } catch {
     return {
       success: true,
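The distance-to-similarity mapping is monotone and bounded in (0, 1]: a distance of 0 maps to 1, and larger distances shrink toward 0. A couple of worked values, straight from similarity = 1 / (1 + distance):

```ts
const toSimilarity = (distance: number) => (distance > 0 ? 1 / (1 + distance) : 1);

toSimilarity(0);    // 1      (identical)
toSimilarity(0.25); // 0.8
toSimilarity(1);    // 0.5
toSimilarity(3);    // 0.25   (far away -> low similarity)
```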
@@ -788,6 +921,12 @@ async function executeHybridMode(params: Params): Promise<SearchResult> {
     };
   }

+  // Build metadata with baseline info if detected
+  let note = 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results';
+  if (baselineInfo) {
+    note += ` | Filtered ${initialCount - results.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`;
+  }
+
   return {
     success: true,
     results,
@@ -796,12 +935,195 @@
       backend: 'codexlens',
       count: results.length,
       query,
-      note: 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results',
+      note,
       warning: indexStatus.warning,
+      suggested_weights: getRRFWeights(query),
     },
   };
 }

+const RRF_WEIGHTS = {
+  code: { exact: 0.7, fuzzy: 0.2, vector: 0.1 },
+  natural: { exact: 0.4, fuzzy: 0.2, vector: 0.4 },
+  default: { exact: 0.5, fuzzy: 0.2, vector: 0.3 },
+};
+
+function getRRFWeights(query: string): Record<string, number> {
+  const isCode = looksLikeCodeQuery(query);
+  const isNatural = detectNaturalLanguage(query);
+  if (isCode) return RRF_WEIGHTS.code;
+  if (isNatural) return RRF_WEIGHTS.natural;
+  return RRF_WEIGHTS.default;
+}
+
+/**
+ * Post-processing: Filter noisy files from semantic search results
+ * Uses FILTER_CONFIG patterns to remove irrelevant files.
+ * Optimized: pre-compiled regexes, accurate path segment matching.
+ */
+// Pre-compile file exclusion regexes once (avoid recompilation in loop)
+const FILE_EXCLUDE_REGEXES = [...FILTER_CONFIG.exclude_files].map(pattern =>
+  new RegExp('^' + pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\\\*/g, '.*') + '$')
+);
+
+function filterNoisyFiles(results: SemanticMatch[]): SemanticMatch[] {
+  return results.filter(r => {
+    const filePath = r.file || '';
+    if (!filePath) return true;
+
+    const segments = filePath.split(/[/\\]/);
+
+    // Accurate directory check: segment must exactly match excluded directory
+    if (segments.some(segment => FILTER_CONFIG.exclude_directories.has(segment))) {
+      return false;
+    }
+
+    // Accurate file check: pattern matches filename only (not full path)
+    const filename = segments.pop() || '';
+    if (FILE_EXCLUDE_REGEXES.some(regex => regex.test(filename))) {
+      return false;
+    }
+
+    return true;
+  });
+}
+
+/**
+ * Post-processing: Boost results containing query keywords
+ * Extracts keywords from query and boosts matching results.
+ * Optimized: uses whole-word matching with regex for accuracy.
+ */
+// Helper to escape regex special characters
+function escapeRegExp(str: string): string {
+  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
+
+function applyKeywordBoosting(results: SemanticMatch[], query: string): SemanticMatch[] {
+  // Extract meaningful keywords (ignore common words)
+  const stopWords = new Set(['the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or', 'because', 'until', 'while', 'although', 'though', 'after', 'before', 'when', 'whenever', 'where', 'wherever', 'whether', 'which', 'who', 'whom', 'whose', 'what', 'whatever', 'whichever', 'whoever', 'whomever', 'this', 'that', 'these', 'those', 'it', 'its']);
+
+  const keywords = query
+    .toLowerCase()
+    .split(/[\s,.;:()"{}[\]-]+/) // More robust splitting on punctuation
+    .filter(word => word.length > 2 && !stopWords.has(word));
+
+  if (keywords.length === 0) return results;
+
+  // Create case-insensitive regexes for whole-word matching
+  const keywordRegexes = keywords.map(kw => new RegExp(`\\b${escapeRegExp(kw)}\\b`, 'i'));
+
+  return results.map(r => {
+    const content = r.content || '';
+    const file = r.file || '';
+
+    // Count keyword matches using whole-word regex
+    let matchCount = 0;
+    for (const regex of keywordRegexes) {
+      if (regex.test(content) || regex.test(file)) {
+        matchCount++;
+      }
+    }
+
+    // Apply boost only if there are matches
+    if (matchCount > 0) {
+      const matchRatio = matchCount / keywords.length;
+      const boost = 1 + (matchRatio * 0.3); // Up to 30% boost for full match
+      return {
+        ...r,
+        score: r.score * boost,
+      };
+    }
+
+    return r;
+  });
+}
+
+/**
+ * Post-processing: Enforce score diversity
+ * Penalizes results with identical scores (indicates undifferentiated matching)
+ */
+function enforceScoreDiversity(results: SemanticMatch[]): SemanticMatch[] {
+  if (results.length < 2) return results;
+
+  // Count occurrences of each score (rounded to 3 decimal places for comparison)
+  const scoreCounts = new Map<number, number>();
+  for (const r of results) {
+    const roundedScore = Math.round(r.score * 1000) / 1000;
+    scoreCounts.set(roundedScore, (scoreCounts.get(roundedScore) || 0) + 1);
+  }
+
+  // Apply penalty to scores that appear more than twice
+  return results.map(r => {
+    const roundedScore = Math.round(r.score * 1000) / 1000;
+    const count = scoreCounts.get(roundedScore) || 1;
+
+    if (count > 2) {
+      // Progressive penalty: more duplicates = bigger penalty
+      const penalty = Math.max(0.7, 1 - (count * 0.05));
+      return { ...r, score: r.score * penalty };
+    }
+    return r;
+  });
+}
+
+/**
+ * Post-processing: Filter results with dominant baseline score (hot spot detection)
+ * When backend returns default "hot spot" files with identical high scores,
+ * this function detects and removes them.
+ *
+ * Detection criteria:
+ * - A single score appears in >50% of results
+ * - That score is suspiciously high (>0.9)
+ * - This indicates fallback mechanism returned placeholder results
+ */
+function filterDominantBaselineScores(
+  results: SemanticMatch[]
+): { filteredResults: SemanticMatch[]; baselineInfo: { score: number; count: number } | null } {
+  if (results.length < 4) {
+    return { filteredResults: results, baselineInfo: null };
+  }
+
+  // Count occurrences of each score (rounded to 4 decimal places)
+  const scoreCounts = new Map<number, number>();
+  results.forEach(r => {
+    const rounded = Math.round(r.score * 10000) / 10000;
+    scoreCounts.set(rounded, (scoreCounts.get(rounded) || 0) + 1);
+  });
+
+  // Find the most dominant score
+  let dominantScore: number | null = null;
+  let dominantCount = 0;
+  scoreCounts.forEach((count, score) => {
+    if (count > dominantCount) {
+      dominantCount = count;
+      dominantScore = score;
+    }
+  });
+
+  // If a single score is present in >50% of results and is high (>0.9),
+  // treat it as a suspicious baseline score and filter it out
+  const BASELINE_THRESHOLD = 0.5; // >50% of results have same score
+  const HIGH_SCORE_THRESHOLD = 0.9; // Score above 0.9 is suspiciously high
+
+  if (
+    dominantScore !== null &&
+    dominantCount > results.length * BASELINE_THRESHOLD &&
+    dominantScore > HIGH_SCORE_THRESHOLD
+  ) {
+    const filteredResults = results.filter(r => {
+      const rounded = Math.round(r.score * 10000) / 10000;
+      return rounded !== dominantScore;
+    });
+
+    return {
+      filteredResults,
+      baselineInfo: { score: dominantScore, count: dominantCount },
+    };
+  }
+
+  return { filteredResults: results, baselineInfo: null };
+}
+
 /**
  * TypeScript implementation of Reciprocal Rank Fusion
  * Reference: codex-lens/src/codexlens/search/ranking.py
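A quick check of the diversity penalty: for a score shared by `count` results, penalty = max(0.7, 1 - count * 0.05), so the penalty ramps down with duplication and floors at 0.7:

```ts
const penaltyFor = (count: number) => Math.max(0.7, 1 - count * 0.05);

penaltyFor(3);  // 0.85 - a score shared three times is scaled by 0.85
penaltyFor(4);  // 0.8
penaltyFor(6);  // 0.7  - floor reached
penaltyFor(10); // 0.7  - never drops below the floor
```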
@@ -963,34 +1285,52 @@ async function executePriorityFallbackMode(params: Params): Promise<SearchResult
|
|||||||
// Tool schema for MCP
|
// Tool schema for MCP
|
||||||
export const schema: ToolSchema = {
|
export const schema: ToolSchema = {
|
||||||
   name: 'smart_search',
-  description: `Intelligent code search with five modes. Use "auto" mode (default) for intelligent routing.
+  description: `Unified code search tool with content search, file discovery, and semantic search capabilities.
 
-**Usage:**
+**Actions:**
+- search: Search file content (default)
+- find_files: Find files by path/name pattern (glob matching)
+- init: Create FTS index
+- status: Check index status
+
+**Content Search (action="search"):**
 smart_search(query="authentication logic") # auto mode - routes to best backend
 smart_search(query="MyClass", mode="exact") # exact mode - precise FTS matching
-smart_search(query="auth", mode="ripgrep") # ripgrep mode - fast literal search (no index)
+smart_search(query="auth", mode="ripgrep") # ripgrep mode - fast literal search
-smart_search(query="how to auth", mode="hybrid") # hybrid mode - semantic search (requires index)
+smart_search(query="how to auth", mode="hybrid") # hybrid mode - semantic + fuzzy search
 
-**Index Management:**
+**File Discovery (action="find_files"):**
-smart_search(action="init") # Create FTS index for current directory
+smart_search(action="find_files", pattern="*.ts") # find all TypeScript files
-smart_search(action="status") # Check index and embedding status
+smart_search(action="find_files", pattern="src/**/*.js") # recursive glob pattern
+smart_search(action="find_files", pattern="test_*.py") # find test files
+smart_search(action="find_files", pattern="*.tsx", offset=20, limit=10) # pagination
+
-**Graph Enrichment:**
+**Pagination:** All actions support offset/limit for paginated results:
-smart_search(query="func", enrich=true) # Enrich results with code relationships (calls, imports, called_by, imported_by)
+smart_search(query="auth", limit=10, offset=0) # first page
+smart_search(query="auth", limit=10, offset=10) # second page
+
-**Modes:** auto (intelligent routing), hybrid (semantic, needs index), exact (FTS), ripgrep (fast, no index), priority (fallback: hybrid→exact→ripgrep)`,
+**Regex Search (ripgrep mode):**
+smart_search(query="class.*Builder") # auto-detects regex pattern
+smart_search(query="def.*\\(.*\\):") # find function definitions
+smart_search(query="import.*from", caseSensitive=false) # case-insensitive
+
+**Modes:** auto (intelligent routing), hybrid (semantic+fuzzy), exact (FTS), ripgrep (fast), priority (fallback chain)`,
   inputSchema: {
     type: 'object',
     properties: {
       action: {
         type: 'string',
-        enum: ['init', 'search', 'search_files', 'status'],
+        enum: ['init', 'search', 'find_files', 'status', 'search_files'],
-        description: 'Action to perform: init (create FTS index, no embeddings), search (default), search_files (paths only), status (check index)',
+        description: 'Action: search (content search), find_files (path pattern matching), init (create index), status (check index). Note: search_files is deprecated.',
         default: 'search',
       },
       query: {
         type: 'string',
-        description: 'Search query (required for search/search_files actions)',
+        description: 'Content search query (for action="search")',
+      },
+      pattern: {
+        type: 'string',
+        description: 'Glob pattern for file discovery (for action="find_files"). Examples: "*.ts", "src/**/*.js", "test_*.py"',
       },
       mode: {
         type: 'string',
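
The priority mode documented above is a fallback chain (hybrid → exact → ripgrep). A minimal sketch of that routing, in Python for illustration; the backend callables are hypothetical stand-ins, not the tool's real API:

# Sketch of the priority fallback chain: try semantic search first, then FTS,
# then plain ripgrep. Backends are passed in as (name, callable) pairs.
def priority_search(query, backends):
    for name, backend in backends:
        try:
            results = backend(query)
        except RuntimeError:
            continue  # backend unavailable (e.g. no index built) -> fall through
        if results:
            return {"backend": name, "results": results}
    return {"backend": None, "results": []}

# Usage (with stub backends):
#   priority_search("how to auth", [("hybrid", hybrid_fn), ("exact", fts_fn), ("ripgrep", rg_fn)])
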
@@ -1023,13 +1363,18 @@ export const schema: ToolSchema = {
       },
       maxResults: {
         type: 'number',
-        description: 'Maximum number of results (default: 10)',
+        description: 'Maximum number of results (default: 20)',
-        default: 10,
+        default: 20,
       },
       limit: {
         type: 'number',
-        description: 'Alias for maxResults',
+        description: 'Alias for maxResults (default: 20)',
-        default: 10,
+        default: 20,
+      },
+      offset: {
+        type: 'number',
+        description: 'Pagination offset - skip first N results (default: 0)',
+        default: 0,
       },
       includeHidden: {
         type: 'boolean',
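
The new offset/limit parameters implement plain slice-based pagination. A small sketch of the arithmetic, mirroring the applyPagination helper added later in this diff (Python used for illustration):

# Slice-based pagination: has_more is true exactly when the slice stops short
# of the full result list.
def paginate(results, offset=0, limit=20):
    total = len(results)
    return {
        "results": results[offset:offset + limit],
        "pagination": {"offset": offset, "limit": limit, "total": total,
                       "has_more": offset + limit < total},
    }

# paginate(list(range(45)), offset=20, limit=10) returns items 20..29 with has_more=True
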
@@ -1046,11 +1391,284 @@ export const schema: ToolSchema = {
         description: 'Enrich search results with code graph relationships (calls, imports, called_by, imported_by).',
         default: false,
       },
+      regex: {
+        type: 'boolean',
+        description: 'Use regex pattern matching instead of literal string (ripgrep mode only). Default: enabled. Example: smart_search(query="class.*Builder")',
+        default: true,
+      },
+      caseSensitive: {
+        type: 'boolean',
+        description: 'Case-sensitive search (default: true). Set to false for case-insensitive matching.',
+        default: true,
+      },
     },
     required: [],
   },
 };
+
+/**
+ * Action: find_files - Find files by path/name pattern (glob matching)
+ * Unlike search which looks inside file content, find_files matches file paths
+ */
+async function executeFindFilesAction(params: Params): Promise<SearchResult> {
+  const { pattern, path = '.', limit = 20, offset = 0, includeHidden = false, caseSensitive = true } = params;
+
+  if (!pattern) {
+    return {
+      success: false,
+      error: 'Pattern is required for find_files action. Use glob patterns like "*.ts", "src/**/*.js", or "test_*.py"',
+    };
+  }
+
+  // Use ripgrep with --files flag for fast file listing with glob pattern
+  const hasRipgrep = checkToolAvailability('rg');
+
+  if (!hasRipgrep) {
+    // Fallback to CodexLens file listing if available
+    const readyStatus = await ensureCodexLensReady();
+    if (!readyStatus.ready) {
+      return {
+        success: false,
+        error: 'Neither ripgrep nor CodexLens available for file discovery.',
+      };
+    }
+
+    // Try CodexLens file list command
+    const args = ['list-files', '--json'];
+    const result = await executeCodexLens(args, { cwd: path });
+
+    if (!result.success) {
+      return {
+        success: false,
+        error: `Failed to list files: ${result.error}`,
+      };
+    }
+
+    // Parse and filter results by pattern
+    let files: string[] = [];
+    try {
+      const parsed = JSON.parse(stripAnsi(result.output || '[]'));
+      files = Array.isArray(parsed) ? parsed : (parsed.files || []);
+    } catch {
+      return {
+        success: false,
+        error: 'Failed to parse file list from CodexLens',
+      };
+    }
+
+    // Apply glob pattern matching using minimatch-style regex
+    const globRegex = globToRegex(pattern, caseSensitive);
+    const matchedFiles = files.filter(f => globRegex.test(f));
+
+    // Apply pagination
+    const total = matchedFiles.length;
+    const paginatedFiles = matchedFiles.slice(offset, offset + limit);
+
+    const results: FileMatch[] = paginatedFiles.map(filePath => {
+      const parts = filePath.split(/[/\\]/);
+      const name = parts[parts.length - 1] || '';
+      const ext = name.includes('.') ? name.split('.').pop() : undefined;
+      return {
+        path: filePath,
+        type: 'file' as const,
+        name,
+        extension: ext,
+      };
+    });
+
+    return {
+      success: true,
+      results,
+      metadata: {
+        pattern,
+        backend: 'codexlens',
+        count: results.length,
+        pagination: {
+          offset,
+          limit,
+          total,
+          has_more: offset + limit < total,
+        },
+      },
+    };
+  }
+
+  // Use ripgrep --files with glob pattern for fast file discovery
+  return new Promise((resolve) => {
+    const args = ['--files'];
+
+    // Add exclude patterns
+    if (!includeHidden) {
+      args.push(...buildExcludeArgs());
+    } else {
+      args.push('--hidden');
+    }
+
+    // Add glob pattern
+    args.push('--glob', pattern);
+
+    // Case sensitivity for glob matching
+    if (!caseSensitive) {
+      args.push('--iglob', pattern);
+      // Remove the case-sensitive glob and use iglob instead
+      const globIndex = args.indexOf('--glob');
+      if (globIndex !== -1) {
+        args.splice(globIndex, 2);
+      }
+    }
+
+    const child = spawn('rg', args, {
+      cwd: path || process.cwd(),
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+
+    let stdout = '';
+    let stderr = '';
+
+    child.stdout.on('data', (data) => {
+      stdout += data.toString();
+    });
+
+    child.stderr.on('data', (data) => {
+      stderr += data.toString();
+    });
+
+    child.on('close', (code) => {
+      // ripgrep returns 1 when no matches found, which is not an error
+      if (code !== 0 && code !== 1 && !stderr.includes('os error 1')) {
+        resolve({
+          success: false,
+          error: `ripgrep file search failed: ${stderr}`,
+        });
+        return;
+      }
+
+      const allFiles = stdout.split('\n').filter(line => line.trim());
+      const total = allFiles.length;
+
+      // Apply pagination
+      const paginatedFiles = allFiles.slice(offset, offset + limit);
+
+      const results: FileMatch[] = paginatedFiles.map(filePath => {
+        const normalizedPath = filePath.replace(/\\/g, '/');
+        const parts = normalizedPath.split('/');
+        const name = parts[parts.length - 1] || '';
+        const ext = name.includes('.') ? name.split('.').pop() : undefined;
+        return {
+          path: normalizedPath,
+          type: 'file' as const,
+          name,
+          extension: ext,
+        };
+      });
+
+      resolve({
+        success: true,
+        results,
+        metadata: {
+          pattern,
+          backend: 'ripgrep',
+          count: results.length,
+          pagination: {
+            offset,
+            limit,
+            total,
+            has_more: offset + limit < total,
+          },
+        },
+      });
+    });
+
+    child.on('error', (error) => {
+      resolve({
+        success: false,
+        error: `Failed to spawn ripgrep: ${error.message}`,
+      });
+    });
+  });
+}
+
+/**
+ * Convert glob pattern to regex for file matching
+ * Supports: *, **, ?, [abc], [!abc]
+ */
+function globToRegex(pattern: string, caseSensitive: boolean = true): RegExp {
+  let i = 0;
+  const out: string[] = [];
+  const special = '.^$+{}|()';
+
+  while (i < pattern.length) {
+    const c = pattern[i];
+
+    if (c === '*') {
+      if (i + 1 < pattern.length && pattern[i + 1] === '*') {
+        // ** matches any path including /
+        out.push('.*');
+        i += 2;
+        // Skip following / if present
+        if (pattern[i] === '/') {
+          i++;
+        }
+        continue;
+      } else {
+        // * matches any character except /
+        out.push('[^/]*');
+      }
+    } else if (c === '?') {
+      out.push('[^/]');
+    } else if (c === '[') {
+      // Character class
+      let j = i + 1;
+      let negated = false;
+      if (pattern[j] === '!' || pattern[j] === '^') {
+        negated = true;
+        j++;
+      }
+      let classContent = '';
+      while (j < pattern.length && pattern[j] !== ']') {
+        classContent += pattern[j];
+        j++;
+      }
+      if (negated) {
+        out.push(`[^${classContent}]`);
+      } else {
+        out.push(`[${classContent}]`);
+      }
+      i = j;
+    } else if (special.includes(c)) {
+      out.push('\\' + c);
+    } else {
+      out.push(c);
+    }
+    i++;
+  }
+
+  const flags = caseSensitive ? '' : 'i';
+  return new RegExp('^' + out.join('') + '$', flags);
+}
+
+/**
+ * Apply pagination to search results and add pagination metadata
+ */
+function applyPagination<T>(
+  results: T[],
+  offset: number,
+  limit: number
+): { paginatedResults: T[]; pagination: PaginationInfo } {
+  const total = results.length;
+  const paginatedResults = results.slice(offset, offset + limit);
+
+  return {
+    paginatedResults,
+    pagination: {
+      offset,
+      limit,
+      total,
+      has_more: offset + limit < total,
+    },
+  };
+}
+
 /**
  * Transform results based on output_mode
  */
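
globToRegex above compiles a glob into an anchored regular expression, with * confined to one path segment and ** allowed to cross separators. A Python re-implementation of the same translation, handy for sanity-checking the patterns in the tool description (a sketch, not the shipped code):

import re

def glob_to_regex(pattern, case_sensitive=True):
    """Translate a glob (supporting *, **, ?, [abc], [!abc]) to an anchored regex."""
    i, out = 0, []
    while i < len(pattern):
        c = pattern[i]
        if c == '*':
            if pattern[i + 1:i + 2] == '*':      # ** crosses directory separators
                out.append('.*')
                i += 2
                if pattern[i:i + 1] == '/':       # swallow the following '/'
                    i += 1
                continue
            out.append('[^/]*')                   # * stays within one path segment
        elif c == '?':
            out.append('[^/]')
        elif c == '[':
            j = i + 1
            negated = pattern[j:j + 1] in ('!', '^')
            if negated:
                j += 1
            body = ''
            while j < len(pattern) and pattern[j] != ']':
                body += pattern[j]
                j += 1
            out.append(('[^' if negated else '[') + body + ']')
            i = j
        else:
            out.append(re.escape(c))              # escapes more than the TS version, which is safe
        i += 1
    return re.compile('^' + ''.join(out) + '$', 0 if case_sensitive else re.IGNORECASE)

# glob_to_regex('src/**/*.js').match('src/a/b/c.js') -> match
# glob_to_regex('*.ts').match('src/a.ts')            -> None (no directory crossing)
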
@@ -1095,14 +1713,17 @@ export async function handler(params: Record<string, unknown>): Promise<ToolResu
     return { success: false, error: `Invalid params: ${parsed.error.message}` };
   }
 
-  const { action, mode, output_mode } = parsed.data;
+  const { action, mode, output_mode, offset = 0 } = parsed.data;
 
   // Sync limit and maxResults - use the larger of the two if both provided
   // This ensures user-provided values take precedence over defaults
-  const effectiveLimit = Math.max(parsed.data.limit || 10, parsed.data.maxResults || 10);
+  const effectiveLimit = Math.max(parsed.data.limit || 20, parsed.data.maxResults || 20);
   parsed.data.maxResults = effectiveLimit;
   parsed.data.limit = effectiveLimit;
+
+  // Track if search_files was used (deprecated)
+  let deprecationWarning: string | undefined;
 
   try {
     let result: SearchResult;
 
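
Because limit and maxResults are aliases, the handler reconciles them by taking the larger of the two; a one-line sketch of the rule (Python for illustration):

# Reconcile the limit/maxResults aliases: an explicit value on either wins
# over the (now 20) default, and the larger explicit value wins overall.
def effective_limit(limit=None, max_results=None, default=20):
    return max(limit or default, max_results or default)

# effective_limit(limit=50) == 50; effective_limit() == 20
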
@@ -1116,8 +1737,14 @@ export async function handler(params: Record<string, unknown>): Promise<ToolResu
         result = await executeStatusAction(parsed.data);
         break;
 
+      case 'find_files':
+        // NEW: File path/name pattern matching (glob-based)
+        result = await executeFindFilesAction(parsed.data);
+        break;
+
       case 'search_files':
-        // For search_files, use search mode but force files_only output
+        // DEPRECATED: Redirect to search with files_only output
+        deprecationWarning = 'action="search_files" is deprecated. Use action="search" with output_mode="files_only" for content-to-files search, or action="find_files" for path pattern matching.';
         parsed.data.output_mode = 'files_only';
         // Fall through to search
 
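
The switch above adds a find_files case and keeps search_files only as a deprecated alias that falls through to search. A compact sketch of that dispatch (Python; the handlers registry is a hypothetical stand-in):

# Dispatch with a deprecated alias: search_files is rewritten to search with
# files_only output, and a warning is attached to the result metadata.
def dispatch(action, params, handlers):
    warning = None
    if action == "search_files":  # deprecated alias
        warning = 'action="search_files" is deprecated; use action="search" or action="find_files".'
        params["output_mode"] = "files_only"
        action = "search"
    result = handlers[action](params)
    if warning:
        result.setdefault("metadata", {})["warning"] = warning
    return result
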
@@ -1151,6 +1778,27 @@ export async function handler(params: Record<string, unknown>): Promise<ToolResu
     if (result.success && result.results && output_mode !== 'full') {
       result.results = transformOutput(result.results as any[], output_mode);
     }
+
+      // Add pagination metadata for search results if not already present
+      if (result.success && result.results && Array.isArray(result.results)) {
+        const totalResults = (result.results as any[]).length;
+        if (!result.metadata) {
+          result.metadata = {};
+        }
+        if (!result.metadata.pagination) {
+          result.metadata.pagination = {
+            offset: 0,
+            limit: effectiveLimit,
+            total: totalResults,
+            has_more: false, // Already limited by backend
+          };
+        }
+      }
+    }
+
+    // Add deprecation warning if applicable
+    if (deprecationWarning && result.metadata) {
+      result.metadata.warning = deprecationWarning;
     }
 
     return result.success ? { success: true, result } : { success: false, error: result.error };
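
The block above backfills pagination metadata when a backend returned results without it; has_more is hard-coded to false because the backend already truncated to the limit. Equivalent logic as a Python sketch:

# Backfill default pagination metadata on results that lack it.
def ensure_pagination(result, limit):
    meta = result.setdefault("metadata", {})
    meta.setdefault("pagination", {
        "offset": 0,
        "limit": limit,
        "total": len(result.get("results", [])),
        "has_more": False,  # backend already limited the result set
    })
    return result
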

ccw/src/tools/smart-search.ts.backup (new file, 1233 lines; diff suppressed because it is too large)

@@ -18,6 +18,27 @@ except ImportError:
 
 logger = logging.getLogger(__name__)
 
+
+def _get_path_column(conn: sqlite3.Connection) -> str:
+    """Detect whether files table uses 'path' or 'full_path' column.
+
+    Args:
+        conn: SQLite connection to the index database
+
+    Returns:
+        Column name ('path' or 'full_path')
+
+    Raises:
+        ValueError: If neither column exists in files table
+    """
+    cursor = conn.execute("PRAGMA table_info(files)")
+    columns = {row[1] for row in cursor.fetchall()}
+    if 'full_path' in columns:
+        return 'full_path'
+    elif 'path' in columns:
+        return 'path'
+    raise ValueError("files table has neither 'path' nor 'full_path' column")
+
+
 def check_index_embeddings(index_path: Path) -> Dict[str, any]:
     """Check if an index has embeddings and return statistics.
 
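
_get_path_column lets the module work against both old and new index schemas. A quick, runnable check of the detection rule against a throwaway in-memory database:

# PRAGMA table_info returns one row per column; row[1] is the column name.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE files (id INTEGER PRIMARY KEY, full_path TEXT, content TEXT)")
columns = {row[1] for row in conn.execute("PRAGMA table_info(files)")}
path_column = "full_path" if "full_path" in columns else "path"  # same precedence as _get_path_column
print(path_column)  # -> full_path
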
@@ -75,10 +96,11 @@ def check_index_embeddings(index_path: Path) -> Dict[str, any]:
         files_with_chunks = cursor.fetchone()[0]
 
         # Get a sample of files without embeddings
-        cursor = conn.execute("""
-            SELECT full_path
+        path_column = _get_path_column(conn)
+        cursor = conn.execute(f"""
+            SELECT {path_column}
             FROM files
-            WHERE full_path NOT IN (
+            WHERE {path_column} NOT IN (
                 SELECT DISTINCT file_path FROM semantic_chunks
             )
             LIMIT 5
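
The column name is spliced in with an f-string because SQLite parameter binding works only for values, not identifiers; this is safe here since _get_path_column can only return 'path' or 'full_path'. A hedged sketch of the same query, binding the value that can be bound:

# Identifiers cannot be bound with '?' placeholders, so the (whitelisted)
# column name is interpolated; the LIMIT value could be bound normally.
path_column = "full_path"  # from _get_path_column(conn); only 'path' or 'full_path'
sql = f"""
    SELECT {path_column}
    FROM files
    WHERE {path_column} NOT IN (SELECT DISTINCT file_path FROM semantic_chunks)
    LIMIT ?
"""
# conn.execute(sql, (5,))
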
@@ -113,7 +135,10 @@ def generate_embeddings(
     chunk_size: int = 2000,
     progress_callback: Optional[callable] = None,
 ) -> Dict[str, any]:
-    """Generate embeddings for an index.
+    """Generate embeddings for an index using memory-efficient batch processing.
+
+    This function processes files in small batches to keep memory usage under 2GB,
+    regardless of the total project size.
 
     Args:
         index_path: Path to _index.db file
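
The memory claim rests on streaming: fetchmany() holds only one batch of rows at a time, where the old fetchall() loaded every file's content at once. A minimal sketch of the pattern (column name assumed; the real code resolves it via _get_path_column):

# Stream rows in fixed-size batches instead of loading the whole table.
import sqlite3

def iter_file_batches(conn, batch_size=100):
    cursor = conn.execute("SELECT full_path, content FROM files")  # column name assumed
    while True:
        batch = cursor.fetchmany(batch_size)
        if not batch:
            break
        yield batch
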
@@ -181,40 +206,45 @@ def generate_embeddings(
             "error": f"Failed to initialize components: {str(e)}",
         }
 
-    # Read files from index
+    # --- MEMORY-OPTIMIZED STREAMING PROCESSING ---
+    # Process files in small batches to control memory usage
+    # This keeps peak memory under 2GB regardless of project size
+    start_time = time.time()
+    failed_files = []
+    total_chunks_created = 0
+    total_files_processed = 0
+    FILE_BATCH_SIZE = 100  # Process 100 files at a time
+    EMBEDDING_BATCH_SIZE = 8  # jina-embeddings-v2-base-code needs small batches
+
     try:
         with sqlite3.connect(index_path) as conn:
             conn.row_factory = sqlite3.Row
-            cursor = conn.execute("SELECT full_path, content, language FROM files")
+            path_column = _get_path_column(conn)
-            files = cursor.fetchall()
-    except Exception as e:
-        return {
-            "success": False,
-            "error": f"Failed to read files: {str(e)}",
-        }
 
-    if len(files) == 0:
-        return {
-            "success": False,
-            "error": "No files found in index",
-        }
+            # Get total file count for progress reporting
+            total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
+            if total_files == 0:
+                return {"success": False, "error": "No files found in index"}
 
             if progress_callback:
-                progress_callback(f"Processing {len(files)} files...")
+                progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")
 
-    # Process all files using batch operations for optimal performance
-    start_time = time.time()
-    failed_files = []
+            cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
+            batch_number = 0
 
-    # --- OPTIMIZATION Step 1: Collect all chunks from all files ---
-    if progress_callback:
-        progress_callback(f"Step 1/4: Chunking {len(files)} files...")
+            while True:
+                # Fetch a batch of files (streaming, not fetchall)
+                file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
+                if not file_batch:
+                    break
+
-    all_chunks_with_paths = []  # List of (chunk, file_path) tuples
-    files_with_chunks = set()
+                batch_number += 1
+                batch_chunks_with_paths = []
+                files_in_batch_with_chunks = set()
+
-    for idx, file_row in enumerate(files, 1):
-        file_path = file_row["full_path"]
+                # Step 1: Chunking for the current file batch
+                for file_row in file_batch:
+                    file_path = file_row[path_column]
                     content = file_row["content"]
                     language = file_row["language"] or "python"
 
@@ -226,81 +256,57 @@ def generate_embeddings(
                         )
                         if chunks:
                             for chunk in chunks:
-                                all_chunks_with_paths.append((chunk, file_path))
-                            files_with_chunks.add(file_path)
+                                batch_chunks_with_paths.append((chunk, file_path))
+                            files_in_batch_with_chunks.add(file_path)
                     except Exception as e:
                         logger.error(f"Failed to chunk {file_path}: {e}")
                         failed_files.append((file_path, str(e)))
 
-    if not all_chunks_with_paths:
-        elapsed_time = time.time() - start_time
-        return {
-            "success": True,
-            "result": {
-                "chunks_created": 0,
-                "files_processed": len(files) - len(failed_files),
-                "files_failed": len(failed_files),
-                "elapsed_time": elapsed_time,
-                "model_profile": model_profile,
-                "model_name": embedder.model_name,
-                "failed_files": failed_files[:5],
-                "index_path": str(index_path),
-            },
-        }
-
-    total_chunks = len(all_chunks_with_paths)
-
-    # --- OPTIMIZATION Step 2: Batch generate embeddings with memory-safe batching ---
-    # Use smaller batches to avoid OOM errors while still benefiting from batch processing
-    # jina-embeddings-v2-base-code with long chunks needs small batches
-    BATCH_SIZE = 8  # Conservative batch size for memory efficiency
-
+                if not batch_chunks_with_paths:
+                    continue
+
+                batch_chunk_count = len(batch_chunks_with_paths)
                 if progress_callback:
-        num_batches = (total_chunks + BATCH_SIZE - 1) // BATCH_SIZE
-        progress_callback(f"Step 2/4: Generating embeddings for {total_chunks} chunks ({num_batches} batches)...")
+                    progress_callback(f"  Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")
 
+                # Step 2: Generate embeddings for this batch
+                batch_embeddings = []
                 try:
-        all_embeddings = []
-        for batch_start in range(0, total_chunks, BATCH_SIZE):
-            batch_end = min(batch_start + BATCH_SIZE, total_chunks)
-            batch_contents = [chunk.content for chunk, _ in all_chunks_with_paths[batch_start:batch_end]]
-            batch_embeddings = embedder.embed(batch_contents)
-            all_embeddings.extend(batch_embeddings)
-
-            if progress_callback and total_chunks > BATCH_SIZE:
-                progress_callback(f"  Batch {batch_start // BATCH_SIZE + 1}/{(total_chunks + BATCH_SIZE - 1) // BATCH_SIZE}: {len(batch_embeddings)} embeddings")
+                    for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
+                        batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
+                        batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
+                        embeddings = embedder.embed(batch_contents)
+                        batch_embeddings.extend(embeddings)
                 except Exception as e:
-        return {
-            "success": False,
-            "error": f"Failed to generate embeddings: {str(e)}",
-        }
+                    logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
+                    failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+                    continue
 
-    # --- OPTIMIZATION Step 3: Assign embeddings back to chunks ---
-    if progress_callback:
-        progress_callback(f"Step 3/4: Assigning {len(all_embeddings)} embeddings...")
-
-    for (chunk, _), embedding in zip(all_chunks_with_paths, all_embeddings):
+                # Step 3: Assign embeddings to chunks
+                for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
                     chunk.embedding = embedding
 
-    # --- OPTIMIZATION Step 4: Batch store all chunks in single transaction ---
-    if progress_callback:
-        progress_callback(f"Step 4/4: Storing {total_chunks} chunks to database...")
-
+                # Step 4: Store this batch to database immediately (releases memory)
                 try:
-        vector_store.add_chunks_batch(all_chunks_with_paths)
+                    vector_store.add_chunks_batch(batch_chunks_with_paths)
+                    total_chunks_created += batch_chunk_count
+                    total_files_processed += len(files_in_batch_with_chunks)
                 except Exception as e:
-        return {
-            "success": False,
-            "error": f"Failed to store chunks: {str(e)}",
-        }
+                    logger.error(f"Failed to store batch {batch_number}: {str(e)}")
+                    failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
+
+                # Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope
+
+    except Exception as e:
+        return {"success": False, "error": f"Failed to read or process files: {str(e)}"}
 
     elapsed_time = time.time() - start_time
 
     return {
         "success": True,
         "result": {
-            "chunks_created": total_chunks,
+            "chunks_created": total_chunks_created,
-            "files_processed": len(files_with_chunks),
+            "files_processed": total_files_processed,
             "files_failed": len(failed_files),
             "elapsed_time": elapsed_time,
             "model_profile": model_profile,
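
Within each 100-file batch, chunk contents are embedded eight at a time (EMBEDDING_BATCH_SIZE), bounding the embedding model's peak memory. The inner loop reduces to this pattern (embed_fn stands in for embedder.embed):

# Embed a list of texts in small sub-batches to bound model memory use.
def embed_in_batches(contents, embed_fn, batch_size=8):
    embeddings = []
    for i in range(0, len(contents), batch_size):
        embeddings.extend(embed_fn(contents[i:i + batch_size]))
    return embeddings
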
@@ -150,8 +150,13 @@ class Chunker:
                 chunk_idx += 1
 
             # Move window, accounting for overlap
-            start = end - overlap_lines
-            if start >= len(lines) - overlap_lines:
+            step = lines_per_chunk - overlap_lines
+            if step <= 0:
+                step = 1  # Failsafe to prevent infinite loop
+            start += step
+
+            # Break if we've reached the end
+            if end >= len(lines):
                 break
 
         return chunks
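
The Chunker fix replaces the end-relative move (start = end - overlap_lines), which could stall or step backwards whenever overlap_lines >= lines_per_chunk, with a fixed positive step and an explicit end-of-input break. A standalone sketch of the corrected sliding window, with an assumed chunk representation:

# Corrected sliding-window chunking: the window advances by a fixed positive
# step, so overlap >= chunk size can no longer cause an infinite loop.
def sliding_windows(lines, lines_per_chunk=50, overlap_lines=10):
    chunks = []
    step = lines_per_chunk - overlap_lines
    if step <= 0:
        step = 1  # failsafe, as in the patched Chunker
    start = 0
    while start < len(lines):
        end = min(start + lines_per_chunk, len(lines))
        chunks.append(lines[start:end])
        if end >= len(lines):
            break
        start += step
    return chunks

# sliding_windows(list("abcdefghij"), lines_per_chunk=4, overlap_lines=2)
# -> ['a'..'d'], ['c'..'f'], ['e'..'h'], ['g'..'j']
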