mirror of https://github.com/catlog22/Claude-Code-Workflow.git
synced 2026-02-05 01:50:27 +08:00
fix: improve chunking logic in Chunker class and enhance smart search tool with comprehensive features
- Updated the Chunker class to adjust the window movement logic, ensuring proper handling of overlap lines.
- Introduced a new smart search tool with features including intent classification, CodexLens integration, multi-backend search routing, and index status checking.
- Implemented various search modes (auto, hybrid, exact, ripgrep, priority) with detailed metadata and error handling.
- Added support for progress tracking during index initialization and enhanced output transformation based on user-defined modes.
- Included comprehensive documentation for usage and parameters in the smart search tool.
@@ -384,13 +384,16 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise<boolean>
// API: CodexLens Init (Initialize workspace index)
if (pathname === '/api/codexlens/init' && req.method === 'POST') {
handlePostRequest(req, res, async (body) => {
const { path: projectPath, indexType = 'vector' } = body;
const { path: projectPath, indexType = 'vector', embeddingModel = 'code' } = body;
const targetPath = projectPath || initialPath;

// Build CLI arguments based on index type
const args = ['init', targetPath, '--json'];
if (indexType === 'normal') {
args.push('--no-embeddings');
} else {
// Add embedding model selection for vector index
args.push('--embedding-model', embeddingModel);
}

// Broadcast start event

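The handler above maps the two index families onto CLI flags. A minimal standalone sketch of that mapping, for reference (the function name is illustrative; the real handler inlines this logic):

function buildInitArgs(targetPath: string, indexType: string, embeddingModel: string): string[] {
  const args = ['init', targetPath, '--json'];
  if (indexType === 'normal') {
    args.push('--no-embeddings'); // FTS-only index, skip embeddings entirely
  } else {
    args.push('--embedding-model', embeddingModel); // vector index with the chosen profile
  }
  return args;
}

// buildInitArgs('/path/to/project', 'normal', 'code')
//   -> ['init', '/path/to/project', '--json', '--no-embeddings']
// buildInitArgs('/path/to/project', 'vector', 'code')
//   -> ['init', '/path/to/project', '--json', '--embedding-model', 'code']
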
@@ -275,6 +275,7 @@ const i18n = {
'codexlens.semanticInstalled': 'Semantic dependencies installed',
'codexlens.semanticNotInstalled': 'Semantic dependencies not installed',
'codexlens.installDeps': 'Install Dependencies',
'codexlens.installDepsPrompt': 'Would you like to install them now? (This may take a few minutes)\n\nClick "Cancel" to create FTS index only.',
'codexlens.installingDeps': 'Installing dependencies...',
'codexlens.depsInstalled': 'Dependencies installed successfully',
'codexlens.depsInstallFailed': 'Failed to install dependencies',
@@ -324,8 +325,15 @@ const i18n = {
'index.cleanAllSuccess': 'All indexes cleaned',
'index.vectorIndex': 'Vector',
'index.normalIndex': 'FTS',
'index.fullIndex': 'Full Index',
'index.vectorDesc': 'Semantic search with embeddings',
'index.normalDesc': 'Fast full-text search only',
'index.fullDesc': 'FTS + Semantic search (recommended)',
'index.selectModel': 'Select embedding model',
'index.modelCode': 'Code (768d)',
'index.modelFast': 'Fast (384d)',
'index.modelMultilingual': 'Multilingual (1024d)',
'index.modelBalanced': 'Balanced (1024d)',

// Semantic Search Configuration
'semantic.settings': 'Semantic Search Settings',
@@ -1596,6 +1604,7 @@ const i18n = {
'codexlens.semanticInstalled': '语义搜索依赖已安装',
'codexlens.semanticNotInstalled': '语义搜索依赖未安装',
'codexlens.installDeps': '安装依赖',
'codexlens.installDepsPrompt': '是否立即安装?(可能需要几分钟)\n\n点击"取消"将只创建 FTS 索引。',
'codexlens.installingDeps': '安装依赖中...',
'codexlens.depsInstalled': '依赖安装成功',
'codexlens.depsInstallFailed': '依赖安装失败',
@@ -1645,8 +1654,15 @@ const i18n = {
'index.cleanAllSuccess': '所有索引已清理',
'index.vectorIndex': '向量索引',
'index.normalIndex': 'FTS索引',
'index.fullIndex': '全部索引',
'index.vectorDesc': '语义搜索(含嵌入向量)',
'index.normalDesc': '快速全文搜索',
'index.fullDesc': 'FTS + 语义搜索(推荐)',
'index.selectModel': '选择嵌入模型',
'index.modelCode': '代码优化 (768维)',
'index.modelFast': '快速轻量 (384维)',
'index.modelMultilingual': '多语言 (1024维)',
'index.modelBalanced': '高精度 (1024维)',

// Semantic Search 配置
'semantic.settings': '语义搜索设置',

@@ -338,6 +338,17 @@ async function renderCliManager() {
if (window.lucide) lucide.createIcons();
}

// ========== Helper Functions ==========

/**
* Get selected embedding model from dropdown
* @returns {string} Selected model profile (code, fast, multilingual, balanced)
*/
function getSelectedModel() {
var select = document.getElementById('codexlensModelSelect');
return select ? select.value : 'code';
}

// ========== Tools Section (Left Column) ==========
function renderToolsSection() {
var container = document.getElementById('tools-section');
@@ -392,8 +403,15 @@ function renderToolsSection() {
'<div class="tool-item-right">' +
(codexLensStatus.ready
? '<span class="tool-status-text success"><i data-lucide="check-circle" class="w-3.5 h-3.5"></i> v' + (codexLensStatus.version || 'installed') + '</span>' +
'<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'vector\')" title="' + (t('index.vectorDesc') || 'Semantic search with embeddings') + '"><i data-lucide="sparkles" class="w-3 h-3"></i> ' + (t('index.vectorIndex') || 'Vector') + '</button>' +
'<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'normal\')" title="' + (t('index.normalDesc') || 'Fast full-text search only') + '"><i data-lucide="file-text" class="w-3 h-3"></i> ' + (t('index.normalIndex') || 'FTS') + '</button>' +
'<select id="codexlensModelSelect" class="btn-sm bg-muted border border-border rounded text-xs" onclick="event.stopPropagation()" title="' + (t('index.selectModel') || 'Select embedding model') + '">' +
'<option value="code">' + (t('index.modelCode') || 'Code (768d)') + '</option>' +
'<option value="fast">' + (t('index.modelFast') || 'Fast (384d)') + '</option>' +
'<option value="multilingual">' + (t('index.modelMultilingual') || 'Multilingual (1024d)') + '</option>' +
'<option value="balanced">' + (t('index.modelBalanced') || 'Balanced (1024d)') + '</option>' +
'</select>' +
'<button class="btn-sm btn-primary" onclick="event.stopPropagation(); initCodexLensIndex(\'full\', getSelectedModel())" title="' + (t('index.fullDesc') || 'FTS + Semantic search (recommended)') + '"><i data-lucide="layers" class="w-3 h-3"></i> ' + (t('index.fullIndex') || '全部索引') + '</button>' +
'<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'vector\', getSelectedModel())" title="' + (t('index.vectorDesc') || 'Semantic search with embeddings') + '"><i data-lucide="sparkles" class="w-3 h-3"></i> ' + (t('index.vectorIndex') || '向量索引') + '</button>' +
'<button class="btn-sm btn-outline" onclick="event.stopPropagation(); initCodexLensIndex(\'normal\')" title="' + (t('index.normalDesc') || 'Fast full-text search only') + '"><i data-lucide="file-text" class="w-3 h-3"></i> ' + (t('index.normalIndex') || 'FTS索引') + '</button>' +
'<button class="btn-sm btn-outline btn-danger" onclick="event.stopPropagation(); uninstallCodexLens()"><i data-lucide="trash-2" class="w-3 h-3"></i> ' + t('cli.uninstall') + '</button>'
: '<span class="tool-status-text muted"><i data-lucide="circle-dashed" class="w-3.5 h-3.5"></i> ' + t('cli.notInstalled') + '</span>' +
'<button class="btn-sm btn-primary" onclick="event.stopPropagation(); installCodexLens()"><i data-lucide="download" class="w-3 h-3"></i> ' + t('cli.install') + '</button>') +

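The dropdown above offers four embedding profiles; gathered in one place for reference (dimensions are taken from the i18n labels, the constant name is illustrative):

// Profile values the dropdown submits, with embedding sizes from the labels above.
const EMBEDDING_PROFILES: Record<string, number> = {
  code: 768,          // code-optimized, the default
  fast: 384,          // lightweight
  multilingual: 1024,
  balanced: 1024,     // higher accuracy
};
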
@@ -554,10 +554,54 @@ async function deleteModel(profile) {

/**
* Initialize CodexLens index with bottom floating progress bar
* @param {string} indexType - 'vector' (with embeddings) or 'normal' (FTS only)
* @param {string} indexType - 'vector' (with embeddings), 'normal' (FTS only), or 'full' (FTS + Vector)
* @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced'
*/
function initCodexLensIndex(indexType) {
async function initCodexLensIndex(indexType, embeddingModel) {
indexType = indexType || 'vector';
embeddingModel = embeddingModel || 'code';

// For vector or full index, check if semantic dependencies are available
if (indexType === 'vector' || indexType === 'full') {
try {
var semanticResponse = await fetch('/api/codexlens/semantic/status');
var semanticStatus = await semanticResponse.json();

if (!semanticStatus.available) {
// Semantic deps not installed - show confirmation dialog
var installDeps = confirm(
(t('codexlens.semanticNotInstalled') || 'Semantic search dependencies are not installed.') + '\n\n' +
(t('codexlens.installDepsPrompt') || 'Would you like to install them now? (This may take a few minutes)\n\nClick "Cancel" to create FTS index only.')
);

if (installDeps) {
// Install semantic dependencies first
showRefreshToast(t('codexlens.installingDeps') || 'Installing semantic dependencies...', 'info');
try {
var installResponse = await fetch('/api/codexlens/semantic/install', { method: 'POST' });
var installResult = await installResponse.json();

if (!installResult.success) {
showRefreshToast((t('codexlens.depsInstallFailed') || 'Failed to install dependencies') + ': ' + installResult.error, 'error');
// Fall back to FTS only
indexType = 'normal';
} else {
showRefreshToast(t('codexlens.depsInstalled') || 'Dependencies installed successfully', 'success');
}
} catch (err) {
showRefreshToast((t('common.error') || 'Error') + ': ' + err.message, 'error');
indexType = 'normal';
}
} else {
// User chose to skip - create FTS only
indexType = 'normal';
}
}
} catch (err) {
console.warn('[CodexLens] Could not check semantic status:', err);
// Continue with requested type, backend will handle fallback
}
}

// Remove existing progress bar if any
closeCodexLensIndexModal();
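In short, a 'vector' or 'full' request degrades to 'normal' whenever semantic dependencies are missing and the install is declined or fails; if the status check itself errors, the requested type is kept and the backend handles any fallback. A compact sketch of that rule (names are illustrative, and the status-check-error path is omitted):

function effectiveIndexType(requested: string, depsAvailable: boolean,
                            userConfirms: boolean, installOk: boolean): string {
  if (requested === 'normal' || depsAvailable) return requested;
  return userConfirms && installOk ? requested : 'normal'; // otherwise fall back to FTS
}
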
@@ -566,7 +610,24 @@ function initCodexLensIndex(indexType) {
var progressBar = document.createElement('div');
progressBar.id = 'codexlensIndexFloating';
progressBar.className = 'fixed bottom-0 left-0 right-0 z-50 bg-card border-t border-border shadow-lg transform transition-transform duration-300';
var indexTypeLabel = indexType === 'vector' ? 'Vector' : 'FTS';

// Determine display label
var indexTypeLabel;
if (indexType === 'full') {
indexTypeLabel = 'FTS + Vector';
} else if (indexType === 'vector') {
indexTypeLabel = 'Vector';
} else {
indexTypeLabel = 'FTS';
}

// Add model info for vector indexes
var modelLabel = '';
if (indexType !== 'normal') {
var modelNames = { code: 'Code', fast: 'Fast', multilingual: 'Multi', balanced: 'Balanced' };
modelLabel = ' [' + (modelNames[embeddingModel] || embeddingModel) + ']';
}

progressBar.innerHTML =
'<div class="max-w-4xl mx-auto px-4 py-3">' +
'<div class="flex items-center justify-between gap-4">' +
@@ -574,7 +635,7 @@ function initCodexLensIndex(indexType) {
'<div class="animate-spin w-5 h-5 border-2 border-primary border-t-transparent rounded-full flex-shrink-0" id="codexlensIndexSpinner"></div>' +
'<div class="flex-1 min-w-0">' +
'<div class="flex items-center gap-2">' +
'<span class="font-medium text-sm">' + t('codexlens.indexing') + ' (' + indexTypeLabel + ')</span>' +
'<span class="font-medium text-sm">' + t('codexlens.indexing') + ' (' + indexTypeLabel + modelLabel + ')</span>' +
'<span class="text-xs text-muted-foreground" id="codexlensIndexPercent">0%</span>' +
'</div>' +
'<div class="text-xs text-muted-foreground truncate" id="codexlensIndexStatus">' + t('codexlens.preparingIndex') + '</div>' +
@@ -594,16 +655,21 @@ function initCodexLensIndex(indexType) {
document.body.appendChild(progressBar);
if (window.lucide) lucide.createIcons();

// Start indexing with specified type
startCodexLensIndexing(indexType);
// For 'full' type, use 'vector' in the API (it creates FTS + embeddings)
var apiIndexType = (indexType === 'full') ? 'vector' : indexType;

// Start indexing with specified type and model
startCodexLensIndexing(apiIndexType, embeddingModel);
}

/**
* Start the indexing process
* @param {string} indexType - 'vector' or 'normal'
* @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced'
*/
async function startCodexLensIndexing(indexType) {
async function startCodexLensIndexing(indexType, embeddingModel) {
indexType = indexType || 'vector';
embeddingModel = embeddingModel || 'code';
var statusText = document.getElementById('codexlensIndexStatus');
var progressBar = document.getElementById('codexlensIndexProgressBar');
var percentText = document.getElementById('codexlensIndexPercent');
@@ -635,11 +701,11 @@ async function startCodexLensIndexing(indexType) {
}

try {
console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType);
console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType, 'model:', embeddingModel);
var response = await fetch('/api/codexlens/init', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ path: projectPath, indexType: indexType })
body: JSON.stringify({ path: projectPath, indexType: indexType, embeddingModel: embeddingModel })
});

var result = await response.json();

@@ -429,7 +429,7 @@ function parseProgressLine(line: string): ProgressInfo | null {
}

/**
* Execute CodexLens CLI command
* Execute CodexLens CLI command with real-time progress updates
* @param args - CLI arguments
* @param options - Execution options
* @returns Execution result
@@ -463,34 +463,110 @@ async function executeCodexLens(args: string[], options: ExecuteOptions = {}): P
fullCmd = `${quotedPython} -m codexlens ${cmdArgs.join(' ')}`;
}

// Use exec with shell option for cross-platform compatibility
exec(fullCmd, {
cwd: process.platform === 'win32' ? undefined : cwd, // Don't use cwd on Windows, use cd command instead
// Use spawn with shell for real-time progress updates
// spawn streams output in real-time, unlike exec which buffers until completion
const child = spawn(fullCmd, [], {
cwd: process.platform === 'win32' ? undefined : cwd,
shell: process.platform === 'win32' ? process.env.ComSpec || true : true,
timeout,
maxBuffer: 50 * 1024 * 1024, // 50MB buffer for large outputs
shell: process.platform === 'win32' ? process.env.ComSpec : undefined,
}, (error, stdout, stderr) => {
if (error) {
if (error.killed) {
resolve({ success: false, error: 'Command timed out' });
} else {
resolve({ success: false, error: stderr || error.message });
}
return;
}
});

// Report final progress if callback provided
if (onProgress && stdout) {
const lines = stdout.split('\n');
for (const line of lines) {
const progress = parseProgressLine(line.trim());
let stdout = '';
let stderr = '';
let stdoutLineBuffer = '';
let stderrLineBuffer = '';
let timeoutHandle: NodeJS.Timeout | null = null;
let resolved = false;

// Helper to safely resolve only once
const safeResolve = (result: ExecuteResult) => {
if (resolved) return;
resolved = true;
if (timeoutHandle) {
clearTimeout(timeoutHandle);
timeoutHandle = null;
}
resolve(result);
};

// Set up timeout handler
if (timeout > 0) {
timeoutHandle = setTimeout(() => {
if (!resolved) {
child.kill('SIGTERM');
// Give it a moment to die gracefully, then force kill
setTimeout(() => {
if (!resolved) {
child.kill('SIGKILL');
}
}, 5000);
safeResolve({ success: false, error: 'Command timed out' });
}
}, timeout);
}

// Process stdout line by line for real-time progress
child.stdout?.on('data', (data: Buffer) => {
const chunk = data.toString();
stdoutLineBuffer += chunk;
stdout += chunk;

// Process complete lines
const lines = stdoutLineBuffer.split('\n');
stdoutLineBuffer = lines.pop() || ''; // Keep incomplete line in buffer

for (const line of lines) {
const trimmedLine = line.trim();
if (trimmedLine && onProgress) {
const progress = parseProgressLine(trimmedLine);
if (progress) {
onProgress(progress);
}
}
}
});

resolve({ success: true, output: stdout.trim() });
// Collect stderr
child.stderr?.on('data', (data: Buffer) => {
const chunk = data.toString();
stderrLineBuffer += chunk;
stderr += chunk;

// Also check stderr for progress (some tools output progress to stderr)
const lines = stderrLineBuffer.split('\n');
stderrLineBuffer = lines.pop() || '';

for (const line of lines) {
const trimmedLine = line.trim();
if (trimmedLine && onProgress) {
const progress = parseProgressLine(trimmedLine);
if (progress) {
onProgress(progress);
}
}
}
});

// Handle process errors (spawn failure)
child.on('error', (err) => {
safeResolve({ success: false, error: `Failed to start process: ${err.message}` });
});

// Handle process completion
child.on('close', (code) => {
// Process any remaining buffered content
if (stdoutLineBuffer.trim() && onProgress) {
const progress = parseProgressLine(stdoutLineBuffer.trim());
if (progress) {
onProgress(progress);
}
}

if (code === 0) {
safeResolve({ success: true, output: stdout.trim() });
} else {
safeResolve({ success: false, error: stderr.trim() || `Process exited with code ${code}` });
}
});
});
}

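The core technique in this rewrite is line-buffered streaming: each 'data' event can end mid-line, so the trailing fragment is held back until the next chunk completes it. A standalone sketch of that pattern (the helper name is illustrative):

function makeLineSplitter(onLine: (line: string) => void): (data: Buffer) => void {
  let buffer = '';
  return (data: Buffer) => {
    buffer += data.toString();
    const lines = buffer.split('\n');
    buffer = lines.pop() || ''; // keep the incomplete trailing line for the next event
    for (const line of lines) {
      const trimmed = line.trim();
      if (trimmed) onLine(trimmed);
    }
  };
}

// Usage sketch: child.stdout?.on('data', makeLineSplitter(line => { /* parse progress */ }));

The timeout path pairs with this: SIGTERM first, then SIGKILL five seconds later if the process lingers, with safeResolve guaranteeing the promise settles exactly once.
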
ccw/src/tools/smart-search.ts.backup (new file, 1233 lines): file diff suppressed because it is too large.
@@ -18,6 +18,27 @@ except ImportError:
logger = logging.getLogger(__name__)


def _get_path_column(conn: sqlite3.Connection) -> str:
"""Detect whether files table uses 'path' or 'full_path' column.

Args:
conn: SQLite connection to the index database

Returns:
Column name ('path' or 'full_path')

Raises:
ValueError: If neither column exists in files table
"""
cursor = conn.execute("PRAGMA table_info(files)")
columns = {row[1] for row in cursor.fetchall()}
if 'full_path' in columns:
return 'full_path'
elif 'path' in columns:
return 'path'
raise ValueError("files table has neither 'path' nor 'full_path' column")


def check_index_embeddings(index_path: Path) -> Dict[str, any]:
"""Check if an index has embeddings and return statistics.

@@ -75,10 +96,11 @@ def check_index_embeddings(index_path: Path) -> Dict[str, any]:
files_with_chunks = cursor.fetchone()[0]

# Get a sample of files without embeddings
cursor = conn.execute("""
SELECT full_path
path_column = _get_path_column(conn)
cursor = conn.execute(f"""
SELECT {path_column}
FROM files
WHERE full_path NOT IN (
WHERE {path_column} NOT IN (
SELECT DISTINCT file_path FROM semantic_chunks
)
LIMIT 5
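This schema probe is what lets the embeddings code run against both index generations. For comparison, a sketch of the same probe from the Node side, assuming the better-sqlite3 package (an assumption for illustration; the repo does this in Python):

import Database from 'better-sqlite3';

function getPathColumn(db: Database.Database): string {
  // PRAGMA table_info returns one row per column; 'name' is the column name.
  const rows = db.pragma('table_info(files)') as Array<{ name: string }>;
  const names = new Set(rows.map(r => r.name));
  if (names.has('full_path')) return 'full_path'; // newer index schema
  if (names.has('path')) return 'path';           // legacy index schema
  throw new Error("files table has neither 'path' nor 'full_path' column");
}
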
@@ -113,7 +135,10 @@ def generate_embeddings(
chunk_size: int = 2000,
progress_callback: Optional[callable] = None,
) -> Dict[str, any]:
"""Generate embeddings for an index.
"""Generate embeddings for an index using memory-efficient batch processing.

This function processes files in small batches to keep memory usage under 2GB,
regardless of the total project size.

Args:
index_path: Path to _index.db file
@@ -181,126 +206,107 @@ def generate_embeddings(
"error": f"Failed to initialize components: {str(e)}",
}

# Read files from index
# --- MEMORY-OPTIMIZED STREAMING PROCESSING ---
# Process files in small batches to control memory usage
# This keeps peak memory under 2GB regardless of project size
start_time = time.time()
failed_files = []
total_chunks_created = 0
total_files_processed = 0
FILE_BATCH_SIZE = 100 # Process 100 files at a time
EMBEDDING_BATCH_SIZE = 8 # jina-embeddings-v2-base-code needs small batches

try:
with sqlite3.connect(index_path) as conn:
conn.row_factory = sqlite3.Row
cursor = conn.execute("SELECT full_path, content, language FROM files")
files = cursor.fetchall()
path_column = _get_path_column(conn)

# Get total file count for progress reporting
total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0]
if total_files == 0:
return {"success": False, "error": "No files found in index"}

if progress_callback:
progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...")

cursor = conn.execute(f"SELECT {path_column}, content, language FROM files")
batch_number = 0

while True:
# Fetch a batch of files (streaming, not fetchall)
file_batch = cursor.fetchmany(FILE_BATCH_SIZE)
if not file_batch:
break

batch_number += 1
batch_chunks_with_paths = []
files_in_batch_with_chunks = set()

# Step 1: Chunking for the current file batch
for file_row in file_batch:
file_path = file_row[path_column]
content = file_row["content"]
language = file_row["language"] or "python"

try:
chunks = chunker.chunk_sliding_window(
content,
file_path=file_path,
language=language
)
if chunks:
for chunk in chunks:
batch_chunks_with_paths.append((chunk, file_path))
files_in_batch_with_chunks.add(file_path)
except Exception as e:
logger.error(f"Failed to chunk {file_path}: {e}")
failed_files.append((file_path, str(e)))

if not batch_chunks_with_paths:
continue

batch_chunk_count = len(batch_chunks_with_paths)
if progress_callback:
progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks")

# Step 2: Generate embeddings for this batch
batch_embeddings = []
try:
for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE):
batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count)
batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]]
embeddings = embedder.embed(batch_contents)
batch_embeddings.extend(embeddings)
except Exception as e:
logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}")
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])
continue

# Step 3: Assign embeddings to chunks
for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings):
chunk.embedding = embedding

# Step 4: Store this batch to database immediately (releases memory)
try:
vector_store.add_chunks_batch(batch_chunks_with_paths)
total_chunks_created += batch_chunk_count
total_files_processed += len(files_in_batch_with_chunks)
except Exception as e:
logger.error(f"Failed to store batch {batch_number}: {str(e)}")
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch])

# Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope

except Exception as e:
return {
"success": False,
"error": f"Failed to read files: {str(e)}",
}

if len(files) == 0:
return {
"success": False,
"error": "No files found in index",
}

if progress_callback:
progress_callback(f"Processing {len(files)} files...")

# Process all files using batch operations for optimal performance
start_time = time.time()
failed_files = []

# --- OPTIMIZATION Step 1: Collect all chunks from all files ---
if progress_callback:
progress_callback(f"Step 1/4: Chunking {len(files)} files...")

all_chunks_with_paths = [] # List of (chunk, file_path) tuples
files_with_chunks = set()

for idx, file_row in enumerate(files, 1):
file_path = file_row["full_path"]
content = file_row["content"]
language = file_row["language"] or "python"

try:
chunks = chunker.chunk_sliding_window(
content,
file_path=file_path,
language=language
)
if chunks:
for chunk in chunks:
all_chunks_with_paths.append((chunk, file_path))
files_with_chunks.add(file_path)
except Exception as e:
logger.error(f"Failed to chunk {file_path}: {e}")
failed_files.append((file_path, str(e)))

if not all_chunks_with_paths:
elapsed_time = time.time() - start_time
return {
"success": True,
"result": {
"chunks_created": 0,
"files_processed": len(files) - len(failed_files),
"files_failed": len(failed_files),
"elapsed_time": elapsed_time,
"model_profile": model_profile,
"model_name": embedder.model_name,
"failed_files": failed_files[:5],
"index_path": str(index_path),
},
}

total_chunks = len(all_chunks_with_paths)

# --- OPTIMIZATION Step 2: Batch generate embeddings with memory-safe batching ---
# Use smaller batches to avoid OOM errors while still benefiting from batch processing
# jina-embeddings-v2-base-code with long chunks needs small batches
BATCH_SIZE = 8 # Conservative batch size for memory efficiency

if progress_callback:
num_batches = (total_chunks + BATCH_SIZE - 1) // BATCH_SIZE
progress_callback(f"Step 2/4: Generating embeddings for {total_chunks} chunks ({num_batches} batches)...")

try:
all_embeddings = []
for batch_start in range(0, total_chunks, BATCH_SIZE):
batch_end = min(batch_start + BATCH_SIZE, total_chunks)
batch_contents = [chunk.content for chunk, _ in all_chunks_with_paths[batch_start:batch_end]]
batch_embeddings = embedder.embed(batch_contents)
all_embeddings.extend(batch_embeddings)

if progress_callback and total_chunks > BATCH_SIZE:
progress_callback(f" Batch {batch_start // BATCH_SIZE + 1}/{(total_chunks + BATCH_SIZE - 1) // BATCH_SIZE}: {len(batch_embeddings)} embeddings")
except Exception as e:
return {
"success": False,
"error": f"Failed to generate embeddings: {str(e)}",
}

# --- OPTIMIZATION Step 3: Assign embeddings back to chunks ---
if progress_callback:
progress_callback(f"Step 3/4: Assigning {len(all_embeddings)} embeddings...")

for (chunk, _), embedding in zip(all_chunks_with_paths, all_embeddings):
chunk.embedding = embedding

# --- OPTIMIZATION Step 4: Batch store all chunks in single transaction ---
if progress_callback:
progress_callback(f"Step 4/4: Storing {total_chunks} chunks to database...")

try:
vector_store.add_chunks_batch(all_chunks_with_paths)
except Exception as e:
return {
"success": False,
"error": f"Failed to store chunks: {str(e)}",
}
return {"success": False, "error": f"Failed to read or process files: {str(e)}"}

elapsed_time = time.time() - start_time

return {
"success": True,
"result": {
"chunks_created": total_chunks,
"files_processed": len(files_with_chunks),
"chunks_created": total_chunks_created,
"files_processed": total_files_processed,
"files_failed": len(failed_files),
"elapsed_time": elapsed_time,
"model_profile": model_profile,

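The streaming rewrite bounds memory with two knobs: files are pulled from SQLite 100 at a time via fetchmany, and each batch's chunks are embedded 8 at a time before being flushed to the vector store, so nothing accumulates for the whole project. A generic sketch of the fixed-size batching step (the helper name is illustrative):

function* batched<T>(items: T[], size: number): Generator<T[]> {
  for (let i = 0; i < items.length; i += size) {
    yield items.slice(i, i + size); // the final batch may be smaller
  }
}

// Usage sketch, mirroring EMBEDDING_BATCH_SIZE = 8 above:
// for (const group of batched(chunkContents, 8)) { embeddings.push(...embed(group)); }
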
@@ -150,8 +150,13 @@ class Chunker:
chunk_idx += 1

# Move window, accounting for overlap
start = end - overlap_lines
if start >= len(lines) - overlap_lines:
step = lines_per_chunk - overlap_lines
if step <= 0:
step = 1 # Failsafe to prevent infinite loop
start += step

# Break if we've reached the end
if end >= len(lines):
break

return chunks

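The fix replaces the old end-relative reposition with an explicit step of lines_per_chunk - overlap_lines, clamped to at least 1 so a pathological overlap can never stall the loop. A standalone sketch of the stepping, with a worked example (the function and parameters are illustrative):

function windowStarts(totalLines: number, linesPerChunk: number, overlapLines: number): number[] {
  const step = Math.max(linesPerChunk - overlapLines, 1); // failsafe prevents an infinite loop
  const starts: number[] = [];
  for (let start = 0; start < totalLines; start += step) {
    starts.push(start);
    if (start + linesPerChunk >= totalLines) break; // stop once the window reaches the end
  }
  return starts;
}

// windowStarts(10, 4, 1) -> [0, 3, 6]: windows cover lines 0-3, 3-6, and 6-9, each sharing one line with the next.
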