From fd4a15c84ef2cf604b85267b34cf39d024a910f2 Mon Sep 17 00:00:00 2001 From: catlog22 Date: Sat, 20 Dec 2025 21:44:15 +0800 Subject: [PATCH] fix: improve chunking logic in Chunker class and enhance smart search tool with comprehensive features - Updated the Chunker class to adjust the window movement logic, ensuring proper handling of overlap lines. - Introduced a new smart search tool with features including intent classification, CodexLens integration, multi-backend search routing, and index status checking. - Implemented various search modes (auto, hybrid, exact, ripgrep, priority) with detailed metadata and error handling. - Added support for progress tracking during index initialization and enhanced output transformation based on user-defined modes. - Included comprehensive documentation for usage and parameters in the smart search tool. --- ccw/src/core/routes/codexlens-routes.ts | 5 +- ccw/src/templates/dashboard-js/i18n.js | 16 + .../dashboard-js/views/cli-manager.js | 22 +- .../dashboard-js/views/codexlens-manager.js | 84 +- ccw/src/tools/codex-lens.ts | 118 +- ccw/src/tools/smart-search.ts | 786 ++++++++++- ccw/src/tools/smart-search.ts.backup | 1233 +++++++++++++++++ .../src/codexlens/cli/embedding_manager.py | 234 ++-- codex-lens/src/codexlens/semantic/chunker.py | 9 +- 9 files changed, 2289 insertions(+), 218 deletions(-) create mode 100644 ccw/src/tools/smart-search.ts.backup diff --git a/ccw/src/core/routes/codexlens-routes.ts b/ccw/src/core/routes/codexlens-routes.ts index e91f198b..012f7f14 100644 --- a/ccw/src/core/routes/codexlens-routes.ts +++ b/ccw/src/core/routes/codexlens-routes.ts @@ -384,13 +384,16 @@ export async function handleCodexLensRoutes(ctx: RouteContext): Promise // API: CodexLens Init (Initialize workspace index) if (pathname === '/api/codexlens/init' && req.method === 'POST') { handlePostRequest(req, res, async (body) => { - const { path: projectPath, indexType = 'vector' } = body; + const { path: projectPath, indexType = 'vector', embeddingModel = 'code' } = body; const targetPath = projectPath || initialPath; // Build CLI arguments based on index type const args = ['init', targetPath, '--json']; if (indexType === 'normal') { args.push('--no-embeddings'); + } else { + // Add embedding model selection for vector index + args.push('--embedding-model', embeddingModel); } // Broadcast start event diff --git a/ccw/src/templates/dashboard-js/i18n.js b/ccw/src/templates/dashboard-js/i18n.js index 5dcc4a5f..b438740a 100644 --- a/ccw/src/templates/dashboard-js/i18n.js +++ b/ccw/src/templates/dashboard-js/i18n.js @@ -275,6 +275,7 @@ const i18n = { 'codexlens.semanticInstalled': 'Semantic dependencies installed', 'codexlens.semanticNotInstalled': 'Semantic dependencies not installed', 'codexlens.installDeps': 'Install Dependencies', + 'codexlens.installDepsPrompt': 'Would you like to install them now? 
(This may take a few minutes)\n\nClick "Cancel" to create FTS index only.', 'codexlens.installingDeps': 'Installing dependencies...', 'codexlens.depsInstalled': 'Dependencies installed successfully', 'codexlens.depsInstallFailed': 'Failed to install dependencies', @@ -324,8 +325,15 @@ const i18n = { 'index.cleanAllSuccess': 'All indexes cleaned', 'index.vectorIndex': 'Vector', 'index.normalIndex': 'FTS', + 'index.fullIndex': 'Full Index', 'index.vectorDesc': 'Semantic search with embeddings', 'index.normalDesc': 'Fast full-text search only', + 'index.fullDesc': 'FTS + Semantic search (recommended)', + 'index.selectModel': 'Select embedding model', + 'index.modelCode': 'Code (768d)', + 'index.modelFast': 'Fast (384d)', + 'index.modelMultilingual': 'Multilingual (1024d)', + 'index.modelBalanced': 'Balanced (1024d)', // Semantic Search Configuration 'semantic.settings': 'Semantic Search Settings', @@ -1596,6 +1604,7 @@ const i18n = { 'codexlens.semanticInstalled': '语义搜索依赖已安装', 'codexlens.semanticNotInstalled': '语义搜索依赖未安装', 'codexlens.installDeps': '安装依赖', + 'codexlens.installDepsPrompt': '是否立即安装?(可能需要几分钟)\n\n点击"取消"将只创建 FTS 索引。', 'codexlens.installingDeps': '安装依赖中...', 'codexlens.depsInstalled': '依赖安装成功', 'codexlens.depsInstallFailed': '依赖安装失败', @@ -1645,8 +1654,15 @@ const i18n = { 'index.cleanAllSuccess': '所有索引已清理', 'index.vectorIndex': '向量索引', 'index.normalIndex': 'FTS索引', + 'index.fullIndex': '全部索引', 'index.vectorDesc': '语义搜索(含嵌入向量)', 'index.normalDesc': '快速全文搜索', + 'index.fullDesc': 'FTS + 语义搜索(推荐)', + 'index.selectModel': '选择嵌入模型', + 'index.modelCode': '代码优化 (768维)', + 'index.modelFast': '快速轻量 (384维)', + 'index.modelMultilingual': '多语言 (1024维)', + 'index.modelBalanced': '高精度 (1024维)', // Semantic Search 配置 'semantic.settings': '语义搜索设置', diff --git a/ccw/src/templates/dashboard-js/views/cli-manager.js b/ccw/src/templates/dashboard-js/views/cli-manager.js index 859c684d..9f79b82c 100644 --- a/ccw/src/templates/dashboard-js/views/cli-manager.js +++ b/ccw/src/templates/dashboard-js/views/cli-manager.js @@ -338,6 +338,17 @@ async function renderCliManager() { if (window.lucide) lucide.createIcons(); } +// ========== Helper Functions ========== + +/** + * Get selected embedding model from dropdown + * @returns {string} Selected model profile (code, fast, multilingual, balanced) + */ +function getSelectedModel() { + var select = document.getElementById('codexlensModelSelect'); + return select ? select.value : 'code'; +} + // ========== Tools Section (Left Column) ========== function renderToolsSection() { var container = document.getElementById('tools-section'); @@ -392,8 +403,15 @@ function renderToolsSection() { '
' + (codexLensStatus.ready ? ' v' + (codexLensStatus.version || 'installed') + '' + - '' + - '' + + '' + + '' + + '' + + '' + '' : ' ' + t('cli.notInstalled') + '' + '') + diff --git a/ccw/src/templates/dashboard-js/views/codexlens-manager.js b/ccw/src/templates/dashboard-js/views/codexlens-manager.js index 18c665b2..6e20c09d 100644 --- a/ccw/src/templates/dashboard-js/views/codexlens-manager.js +++ b/ccw/src/templates/dashboard-js/views/codexlens-manager.js @@ -554,10 +554,54 @@ async function deleteModel(profile) { /** * Initialize CodexLens index with bottom floating progress bar - * @param {string} indexType - 'vector' (with embeddings) or 'normal' (FTS only) + * @param {string} indexType - 'vector' (with embeddings), 'normal' (FTS only), or 'full' (FTS + Vector) + * @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced' */ -function initCodexLensIndex(indexType) { +async function initCodexLensIndex(indexType, embeddingModel) { indexType = indexType || 'vector'; + embeddingModel = embeddingModel || 'code'; + + // For vector or full index, check if semantic dependencies are available + if (indexType === 'vector' || indexType === 'full') { + try { + var semanticResponse = await fetch('/api/codexlens/semantic/status'); + var semanticStatus = await semanticResponse.json(); + + if (!semanticStatus.available) { + // Semantic deps not installed - show confirmation dialog + var installDeps = confirm( + (t('codexlens.semanticNotInstalled') || 'Semantic search dependencies are not installed.') + '\n\n' + + (t('codexlens.installDepsPrompt') || 'Would you like to install them now? (This may take a few minutes)\n\nClick "Cancel" to create FTS index only.') + ); + + if (installDeps) { + // Install semantic dependencies first + showRefreshToast(t('codexlens.installingDeps') || 'Installing semantic dependencies...', 'info'); + try { + var installResponse = await fetch('/api/codexlens/semantic/install', { method: 'POST' }); + var installResult = await installResponse.json(); + + if (!installResult.success) { + showRefreshToast((t('codexlens.depsInstallFailed') || 'Failed to install dependencies') + ': ' + installResult.error, 'error'); + // Fall back to FTS only + indexType = 'normal'; + } else { + showRefreshToast(t('codexlens.depsInstalled') || 'Dependencies installed successfully', 'success'); + } + } catch (err) { + showRefreshToast((t('common.error') || 'Error') + ': ' + err.message, 'error'); + indexType = 'normal'; + } + } else { + // User chose to skip - create FTS only + indexType = 'normal'; + } + } + } catch (err) { + console.warn('[CodexLens] Could not check semantic status:', err); + // Continue with requested type, backend will handle fallback + } + } // Remove existing progress bar if any closeCodexLensIndexModal(); @@ -566,7 +610,24 @@ function initCodexLensIndex(indexType) { var progressBar = document.createElement('div'); progressBar.id = 'codexlensIndexFloating'; progressBar.className = 'fixed bottom-0 left-0 right-0 z-50 bg-card border-t border-border shadow-lg transform transition-transform duration-300'; - var indexTypeLabel = indexType === 'vector' ? 
'Vector' : 'FTS'; + + // Determine display label + var indexTypeLabel; + if (indexType === 'full') { + indexTypeLabel = 'FTS + Vector'; + } else if (indexType === 'vector') { + indexTypeLabel = 'Vector'; + } else { + indexTypeLabel = 'FTS'; + } + + // Add model info for vector indexes + var modelLabel = ''; + if (indexType !== 'normal') { + var modelNames = { code: 'Code', fast: 'Fast', multilingual: 'Multi', balanced: 'Balanced' }; + modelLabel = ' [' + (modelNames[embeddingModel] || embeddingModel) + ']'; + } + progressBar.innerHTML = '
' + '
' + @@ -574,7 +635,7 @@ function initCodexLensIndex(indexType) { '
' + '
' + '
' + - '' + t('codexlens.indexing') + ' (' + indexTypeLabel + ')' + + '' + t('codexlens.indexing') + ' (' + indexTypeLabel + modelLabel + ')' + '0%' + '
' + '
' + t('codexlens.preparingIndex') + '
' + @@ -594,16 +655,21 @@ function initCodexLensIndex(indexType) { document.body.appendChild(progressBar); if (window.lucide) lucide.createIcons(); - // Start indexing with specified type - startCodexLensIndexing(indexType); + // For 'full' type, use 'vector' in the API (it creates FTS + embeddings) + var apiIndexType = (indexType === 'full') ? 'vector' : indexType; + + // Start indexing with specified type and model + startCodexLensIndexing(apiIndexType, embeddingModel); } /** * Start the indexing process * @param {string} indexType - 'vector' or 'normal' + * @param {string} embeddingModel - Model profile: 'code', 'fast', 'multilingual', 'balanced' */ -async function startCodexLensIndexing(indexType) { +async function startCodexLensIndexing(indexType, embeddingModel) { indexType = indexType || 'vector'; + embeddingModel = embeddingModel || 'code'; var statusText = document.getElementById('codexlensIndexStatus'); var progressBar = document.getElementById('codexlensIndexProgressBar'); var percentText = document.getElementById('codexlensIndexPercent'); @@ -635,11 +701,11 @@ async function startCodexLensIndexing(indexType) { } try { - console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType); + console.log('[CodexLens] Starting index for:', projectPath, 'type:', indexType, 'model:', embeddingModel); var response = await fetch('/api/codexlens/init', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ path: projectPath, indexType: indexType }) + body: JSON.stringify({ path: projectPath, indexType: indexType, embeddingModel: embeddingModel }) }); var result = await response.json(); diff --git a/ccw/src/tools/codex-lens.ts b/ccw/src/tools/codex-lens.ts index 6d171fd4..d6229b6e 100644 --- a/ccw/src/tools/codex-lens.ts +++ b/ccw/src/tools/codex-lens.ts @@ -429,7 +429,7 @@ function parseProgressLine(line: string): ProgressInfo | null { } /** - * Execute CodexLens CLI command + * Execute CodexLens CLI command with real-time progress updates * @param args - CLI arguments * @param options - Execution options * @returns Execution result @@ -463,34 +463,110 @@ async function executeCodexLens(args: string[], options: ExecuteOptions = {}): P fullCmd = `${quotedPython} -m codexlens ${cmdArgs.join(' ')}`; } - // Use exec with shell option for cross-platform compatibility - exec(fullCmd, { - cwd: process.platform === 'win32' ? undefined : cwd, // Don't use cwd on Windows, use cd command instead + // Use spawn with shell for real-time progress updates + // spawn streams output in real-time, unlike exec which buffers until completion + const child = spawn(fullCmd, [], { + cwd: process.platform === 'win32' ? undefined : cwd, + shell: process.platform === 'win32' ? process.env.ComSpec || true : true, timeout, - maxBuffer: 50 * 1024 * 1024, // 50MB buffer for large outputs - shell: process.platform === 'win32' ? 
process.env.ComSpec : undefined, - }, (error, stdout, stderr) => { - if (error) { - if (error.killed) { - resolve({ success: false, error: 'Command timed out' }); - } else { - resolve({ success: false, error: stderr || error.message }); - } - return; - } + }); - // Report final progress if callback provided - if (onProgress && stdout) { - const lines = stdout.split('\n'); - for (const line of lines) { - const progress = parseProgressLine(line.trim()); + let stdout = ''; + let stderr = ''; + let stdoutLineBuffer = ''; + let stderrLineBuffer = ''; + let timeoutHandle: NodeJS.Timeout | null = null; + let resolved = false; + + // Helper to safely resolve only once + const safeResolve = (result: ExecuteResult) => { + if (resolved) return; + resolved = true; + if (timeoutHandle) { + clearTimeout(timeoutHandle); + timeoutHandle = null; + } + resolve(result); + }; + + // Set up timeout handler + if (timeout > 0) { + timeoutHandle = setTimeout(() => { + if (!resolved) { + child.kill('SIGTERM'); + // Give it a moment to die gracefully, then force kill + setTimeout(() => { + if (!resolved) { + child.kill('SIGKILL'); + } + }, 5000); + safeResolve({ success: false, error: 'Command timed out' }); + } + }, timeout); + } + + // Process stdout line by line for real-time progress + child.stdout?.on('data', (data: Buffer) => { + const chunk = data.toString(); + stdoutLineBuffer += chunk; + stdout += chunk; + + // Process complete lines + const lines = stdoutLineBuffer.split('\n'); + stdoutLineBuffer = lines.pop() || ''; // Keep incomplete line in buffer + + for (const line of lines) { + const trimmedLine = line.trim(); + if (trimmedLine && onProgress) { + const progress = parseProgressLine(trimmedLine); if (progress) { onProgress(progress); } } } + }); - resolve({ success: true, output: stdout.trim() }); + // Collect stderr + child.stderr?.on('data', (data: Buffer) => { + const chunk = data.toString(); + stderrLineBuffer += chunk; + stderr += chunk; + + // Also check stderr for progress (some tools output progress to stderr) + const lines = stderrLineBuffer.split('\n'); + stderrLineBuffer = lines.pop() || ''; + + for (const line of lines) { + const trimmedLine = line.trim(); + if (trimmedLine && onProgress) { + const progress = parseProgressLine(trimmedLine); + if (progress) { + onProgress(progress); + } + } + } + }); + + // Handle process errors (spawn failure) + child.on('error', (err) => { + safeResolve({ success: false, error: `Failed to start process: ${err.message}` }); + }); + + // Handle process completion + child.on('close', (code) => { + // Process any remaining buffered content + if (stdoutLineBuffer.trim() && onProgress) { + const progress = parseProgressLine(stdoutLineBuffer.trim()); + if (progress) { + onProgress(progress); + } + } + + if (code === 0) { + safeResolve({ success: true, output: stdout.trim() }); + } else { + safeResolve({ success: false, error: stderr.trim() || `Process exited with code ${code}` }); + } }); }); } diff --git a/ccw/src/tools/smart-search.ts b/ccw/src/tools/smart-search.ts index 173ce0fa..dfae27cb 100644 --- a/ccw/src/tools/smart-search.ts +++ b/ccw/src/tools/smart-search.ts @@ -25,18 +25,26 @@ import type { ProgressInfo } from './codex-lens.js'; // Define Zod schema for validation const ParamsSchema = z.object({ - action: z.enum(['init', 'search', 'search_files', 'status']).default('search'), - query: z.string().optional(), + // Action: search (content), find_files (path/name pattern), init, status + // Note: search_files is deprecated, use search with 
output_mode='files_only' + action: z.enum(['init', 'search', 'search_files', 'find_files', 'status']).default('search'), + query: z.string().optional().describe('Content search query (for action="search")'), + pattern: z.string().optional().describe('Glob pattern for path matching (for action="find_files")'), mode: z.enum(['auto', 'hybrid', 'exact', 'ripgrep', 'priority']).default('auto'), output_mode: z.enum(['full', 'files_only', 'count']).default('full'), path: z.string().optional(), paths: z.array(z.string()).default([]), contextLines: z.number().default(0), - maxResults: z.number().default(10), + maxResults: z.number().default(20), // Increased default includeHidden: z.boolean().default(false), languages: z.array(z.string()).optional(), - limit: z.number().default(10), + limit: z.number().default(20), // Increased default + offset: z.number().default(0), // NEW: Pagination offset (start_index) enrich: z.boolean().default(false), + // Search modifiers for ripgrep mode + regex: z.boolean().default(true), // Use regex pattern matching (default: enabled) + caseSensitive: z.boolean().default(true), // Case sensitivity (default: case-sensitive) + // Fuzzy matching is implicit in hybrid mode (RRF fusion) }); type Params = z.infer; @@ -47,6 +55,46 @@ const SEARCH_MODES = ['auto', 'hybrid', 'exact', 'ripgrep', 'priority'] as const // Classification confidence threshold const CONFIDENCE_THRESHOLD = 0.7; +// File filtering configuration (ported from code-index) +const FILTER_CONFIG = { + exclude_directories: new Set([ + '.git', '.svn', '.hg', '.bzr', + 'node_modules', '__pycache__', '.venv', 'venv', 'vendor', 'bower_components', + 'dist', 'build', 'target', 'out', 'bin', 'obj', + '.idea', '.vscode', '.vs', '.sublime-workspace', + '.pytest_cache', '.coverage', '.tox', '.nyc_output', 'coverage', 'htmlcov', + '.next', '.nuxt', '.cache', '.parcel-cache', + '.DS_Store', 'Thumbs.db', + ]), + exclude_files: new Set([ + '*.tmp', '*.temp', '*.swp', '*.swo', '*.bak', '*~', '*.orig', '*.log', + 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', 'Pipfile.lock', + ]), + // Windows device files - must use **/ pattern to match in any directory + // These cause "os error 1" on Windows when accessed + windows_device_files: new Set([ + 'nul', 'con', 'aux', 'prn', + 'com1', 'com2', 'com3', 'com4', 'com5', 'com6', 'com7', 'com8', 'com9', + 'lpt1', 'lpt2', 'lpt3', 'lpt4', 'lpt5', 'lpt6', 'lpt7', 'lpt8', 'lpt9', + ]), +}; + +function buildExcludeArgs(): string[] { + const args: string[] = []; + for (const dir of FILTER_CONFIG.exclude_directories) { + args.push('--glob', `!**/${dir}/**`); + } + for (const pattern of FILTER_CONFIG.exclude_files) { + args.push('--glob', `!${pattern}`); + } + // Windows device files need case-insensitive matching in any directory + for (const device of FILTER_CONFIG.windows_device_files) { + args.push('--glob', `!**/${device}`); + args.push('--glob', `!**/${device.toUpperCase()}`); + } + return args; +} + interface Classification { mode: string; confidence: number; @@ -83,11 +131,27 @@ interface GraphMatch { relationships: unknown[]; } +// File match for find_files action (path-based search) +interface FileMatch { + path: string; + type: 'file' | 'directory'; + name: string; // Filename only + extension?: string; // File extension (without dot) +} + +interface PaginationInfo { + offset: number; // Starting index of returned results + limit: number; // Number of results requested + total: number; // Total number of results found + has_more: boolean; // True if more results are available 
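+  // Example (illustrative, not from the original code): with total = 45, offset = 20
+  // and limit = 20, a page covers results 20-39 and has_more is true (20 + 20 < 45).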
+} + interface SearchMetadata { mode?: string; backend?: string; count?: number; query?: string; + pattern?: string; // For find_files action classified_as?: string; confidence?: number; reasoning?: string; @@ -96,6 +160,9 @@ interface SearchMetadata { note?: string; index_status?: 'indexed' | 'not_indexed' | 'partial'; fallback_history?: string[]; + suggested_weights?: Record; + // Pagination metadata + pagination?: PaginationInfo; // Init action specific action?: string; path?: string; @@ -111,7 +178,7 @@ interface SearchMetadata { interface SearchResult { success: boolean; - results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown; + results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | FileMatch[] | unknown; output?: string; metadata?: SearchMetadata; error?: string; @@ -236,6 +303,14 @@ function detectRelationship(query: string): boolean { return /(import|export|uses?|depends?|calls?|extends?)\s/i.test(query); } +function looksLikeCodeQuery(query: string): boolean { + if (/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true; + if (/[:.<>\-=(){}[\]]/.test(query) && query.split(/\s+/).length <= 2) return true; + if (/\.\*|\\\(|\\\[|\\s/.test(query)) return true; + if (/^[a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*$/.test(query)) return true; + return false; +} + /** * Classify query intent and recommend search mode * Simple mapping: hybrid (NL + index + embeddings) | exact (index or insufficient embeddings) | ripgrep (no index) @@ -245,34 +320,34 @@ function detectRelationship(query: string): boolean { * @returns Classification result */ function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification { - // Detect query patterns const isNaturalLanguage = detectNaturalLanguage(query); + const isCodeQuery = looksLikeCodeQuery(query); + const isRegexPattern = detectRegex(query); - // Simple decision tree let mode: string; let confidence: number; if (!hasIndex) { - // No index: use ripgrep mode = 'ripgrep'; confidence = 1.0; + } else if (isCodeQuery || isRegexPattern) { + mode = 'exact'; + confidence = 0.95; } else if (isNaturalLanguage && hasSufficientEmbeddings) { - // Natural language + sufficient embeddings: use hybrid mode = 'hybrid'; confidence = 0.9; } else { - // Simple query OR insufficient embeddings: use exact mode = 'exact'; confidence = 0.8; } - // Build reasoning string const detectedPatterns: string[] = []; if (detectLiteral(query)) detectedPatterns.push('literal'); if (detectRegex(query)) detectedPatterns.push('regex'); if (detectNaturalLanguage(query)) detectedPatterns.push('natural language'); if (detectFilePath(query)) detectedPatterns.push('file path'); if (detectRelationship(query)) detectedPatterns.push('relationship'); + if (isCodeQuery) detectedPatterns.push('code identifier'); const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 
'sufficient' : 'insufficient'})`; @@ -306,34 +381,46 @@ function buildRipgrepCommand(params: { contextLines: number; maxResults: number; includeHidden: boolean; + regex?: boolean; + caseSensitive?: boolean; }): { command: string; args: string[] } { - const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false } = params; + const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false, regex = false, caseSensitive = true } = params; const args = [ - '-n', // Show line numbers - '--color=never', // Disable color output - '--json', // Output in JSON format + '-n', + '--color=never', + '--json', ]; - // Add context lines if specified + // Add file filtering (unless includeHidden is true) + if (!includeHidden) { + args.push(...buildExcludeArgs()); + } + + // Case sensitivity + if (!caseSensitive) { + args.push('--ignore-case'); + } + if (contextLines > 0) { args.push('-C', contextLines.toString()); } - // Add max results limit if (maxResults > 0) { args.push('--max-count', maxResults.toString()); } - // Include hidden files if specified if (includeHidden) { args.push('--hidden'); } - // Use literal/fixed string matching for exact mode - args.push('-F', query); + // Regex mode (-e) vs fixed string mode (-F) + if (regex) { + args.push('-e', query); + } else { + args.push('-F', query); + } - // Add search paths args.push(...paths); return { command: 'rg', args }; @@ -492,7 +579,7 @@ async function executeAutoMode(params: Params): Promise { * No index required, fallback to CodexLens if ripgrep unavailable */ async function executeRipgrepMode(params: Params): Promise { - const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.' } = params; + const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.', regex = true, caseSensitive = true } = params; if (!query) { return { @@ -566,6 +653,8 @@ async function executeRipgrepMode(params: Params): Promise { contextLines, maxResults, includeHidden, + regex, + caseSensitive, }); return new Promise((resolve) => { @@ -587,31 +676,34 @@ async function executeRipgrepMode(params: Params): Promise { child.on('close', (code) => { const results: ExactMatch[] = []; + const lines = stdout.split('\n').filter((line) => line.trim()); - if (code === 0 || (code === 1 && stdout.trim())) { - const lines = stdout.split('\n').filter((line) => line.trim()); + for (const line of lines) { + try { + const item = JSON.parse(line); - for (const line of lines) { - try { - const item = JSON.parse(line); - - if (item.type === 'match') { - const match: ExactMatch = { - file: item.data.path.text, - line: item.data.line_number, - column: - item.data.submatches && item.data.submatches[0] - ? item.data.submatches[0].start + 1 - : 1, - content: item.data.lines.text.trim(), - }; - results.push(match); - } - } catch { - continue; + if (item.type === 'match') { + const match: ExactMatch = { + file: item.data.path.text, + line: item.data.line_number, + column: + item.data.submatches && item.data.submatches[0] + ? 
item.data.submatches[0].start + 1 + : 1, + content: item.data.lines.text.trim(), + }; + results.push(match); } + } catch { + continue; } + } + // Handle Windows device file errors gracefully (os error 1) + // If we have results despite the error, return them as partial success + const isWindowsDeviceError = stderr.includes('os error 1') || stderr.includes('函数不正确'); + + if (code === 0 || code === 1 || (isWindowsDeviceError && results.length > 0)) { resolve({ success: true, results, @@ -620,6 +712,20 @@ async function executeRipgrepMode(params: Params): Promise { backend: 'ripgrep', count: results.length, query, + ...(isWindowsDeviceError && { warning: 'Some Windows device files were skipped' }), + }, + }); + } else if (isWindowsDeviceError && results.length === 0) { + // Windows device error but no results - might be the only issue + resolve({ + success: true, + results: [], + metadata: { + mode: 'ripgrep', + backend: 'ripgrep', + count: 0, + query, + warning: 'No matches found (some Windows device files were skipped)', }, }); } else { @@ -764,15 +870,42 @@ async function executeHybridMode(params: Params): Promise { // Parse results let results: SemanticMatch[] = []; + let baselineInfo: { score: number; count: number } | null = null; + let initialCount = 0; + try { const parsed = JSON.parse(stripAnsi(result.output || '{}')); const data = parsed.result?.results || parsed.results || parsed; - results = (Array.isArray(data) ? data : []).map((item: any) => ({ - file: item.path || item.file, - score: item.score || 0, - content: item.excerpt || item.content || '', - symbol: item.symbol || null, - })); + results = (Array.isArray(data) ? data : []).map((item: any) => { + const rawScore = item.score || 0; + // Hybrid mode returns distance scores (lower is better). + // Convert to similarity scores (higher is better) for consistency. + // Formula: similarity = 1 / (1 + distance) + const similarityScore = rawScore > 0 ? 1 / (1 + rawScore) : 1; + return { + file: item.path || item.file, + score: similarityScore, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + }; + }); + + initialCount = results.length; + + // Post-processing pipeline to improve semantic search quality + // 0. Filter dominant baseline scores (hot spot detection) + const baselineResult = filterDominantBaselineScores(results); + results = baselineResult.filteredResults; + baselineInfo = baselineResult.baselineInfo; + + // 1. Filter noisy files (coverage, node_modules, etc.) + results = filterNoisyFiles(results); + // 2. Boost results containing query keywords + results = applyKeywordBoosting(results, query); + // 3. Enforce score diversity (penalize identical scores) + results = enforceScoreDiversity(results); + // 4. 
Re-sort by adjusted scores + results.sort((a, b) => b.score - a.score); } catch { return { success: true, @@ -788,6 +921,12 @@ async function executeHybridMode(params: Params): Promise { }; } + // Build metadata with baseline info if detected + let note = 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results'; + if (baselineInfo) { + note += ` | Filtered ${initialCount - results.length} hot-spot results with baseline score ~${baselineInfo.score.toFixed(4)}`; + } + return { success: true, results, @@ -796,12 +935,195 @@ async function executeHybridMode(params: Params): Promise { backend: 'codexlens', count: results.length, query, - note: 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results', + note, warning: indexStatus.warning, + suggested_weights: getRRFWeights(query), }, }; } +const RRF_WEIGHTS = { + code: { exact: 0.7, fuzzy: 0.2, vector: 0.1 }, + natural: { exact: 0.4, fuzzy: 0.2, vector: 0.4 }, + default: { exact: 0.5, fuzzy: 0.2, vector: 0.3 }, +}; + +function getRRFWeights(query: string): Record { + const isCode = looksLikeCodeQuery(query); + const isNatural = detectNaturalLanguage(query); + if (isCode) return RRF_WEIGHTS.code; + if (isNatural) return RRF_WEIGHTS.natural; + return RRF_WEIGHTS.default; +} + +/** + * Post-processing: Filter noisy files from semantic search results + * Uses FILTER_CONFIG patterns to remove irrelevant files. + * Optimized: pre-compiled regexes, accurate path segment matching. + */ +// Pre-compile file exclusion regexes once (avoid recompilation in loop) +const FILE_EXCLUDE_REGEXES = [...FILTER_CONFIG.exclude_files].map(pattern => + new RegExp('^' + pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\\\*/g, '.*') + '$') +); + +function filterNoisyFiles(results: SemanticMatch[]): SemanticMatch[] { + return results.filter(r => { + const filePath = r.file || ''; + if (!filePath) return true; + + const segments = filePath.split(/[/\\]/); + + // Accurate directory check: segment must exactly match excluded directory + if (segments.some(segment => FILTER_CONFIG.exclude_directories.has(segment))) { + return false; + } + + // Accurate file check: pattern matches filename only (not full path) + const filename = segments.pop() || ''; + if (FILE_EXCLUDE_REGEXES.some(regex => regex.test(filename))) { + return false; + } + + return true; + }); +} + +/** + * Post-processing: Boost results containing query keywords + * Extracts keywords from query and boosts matching results. + * Optimized: uses whole-word matching with regex for accuracy. 
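 * Example (illustrative): for the query "refresh auth token" the extracted
 * keywords are ["refresh", "auth", "token"]; a result matching 2 of the 3
 * keywords gets a boost of 1 + (2/3 * 0.3) = 1.2 applied to its score.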
+ */ +// Helper to escape regex special characters +function escapeRegExp(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function applyKeywordBoosting(results: SemanticMatch[], query: string): SemanticMatch[] { + // Extract meaningful keywords (ignore common words) + const stopWords = new Set(['the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need', 'dare', 'ought', 'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'between', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but', 'if', 'or', 'because', 'until', 'while', 'although', 'though', 'after', 'before', 'when', 'whenever', 'where', 'wherever', 'whether', 'which', 'who', 'whom', 'whose', 'what', 'whatever', 'whichever', 'whoever', 'whomever', 'this', 'that', 'these', 'those', 'it', 'its']); + + const keywords = query + .toLowerCase() + .split(/[\s,.;:()"{}[\]-]+/) // More robust splitting on punctuation + .filter(word => word.length > 2 && !stopWords.has(word)); + + if (keywords.length === 0) return results; + + // Create case-insensitive regexes for whole-word matching + const keywordRegexes = keywords.map(kw => new RegExp(`\\b${escapeRegExp(kw)}\\b`, 'i')); + + return results.map(r => { + const content = r.content || ''; + const file = r.file || ''; + + // Count keyword matches using whole-word regex + let matchCount = 0; + for (const regex of keywordRegexes) { + if (regex.test(content) || regex.test(file)) { + matchCount++; + } + } + + // Apply boost only if there are matches + if (matchCount > 0) { + const matchRatio = matchCount / keywords.length; + const boost = 1 + (matchRatio * 0.3); // Up to 30% boost for full match + return { + ...r, + score: r.score * boost, + }; + } + + return r; + }); +} + +/** + * Post-processing: Enforce score diversity + * Penalizes results with identical scores (indicates undifferentiated matching) + */ +function enforceScoreDiversity(results: SemanticMatch[]): SemanticMatch[] { + if (results.length < 2) return results; + + // Count occurrences of each score (rounded to 3 decimal places for comparison) + const scoreCounts = new Map(); + for (const r of results) { + const roundedScore = Math.round(r.score * 1000) / 1000; + scoreCounts.set(roundedScore, (scoreCounts.get(roundedScore) || 0) + 1); + } + + // Apply penalty to scores that appear more than twice + return results.map(r => { + const roundedScore = Math.round(r.score * 1000) / 1000; + const count = scoreCounts.get(roundedScore) || 1; + + if (count > 2) { + // Progressive penalty: more duplicates = bigger penalty + const penalty = Math.max(0.7, 1 - (count * 0.05)); + return { ...r, score: r.score * penalty }; + } + return r; + }); +} + +/** + * Post-processing: Filter results with dominant baseline score (hot spot detection) + * When backend returns default "hot spot" files with identical high scores, + * this function detects and removes them. 
+ * + * Detection criteria: + * - A single score appears in >50% of results + * - That score is suspiciously high (>0.9) + * - This indicates fallback mechanism returned placeholder results + */ +function filterDominantBaselineScores( + results: SemanticMatch[] +): { filteredResults: SemanticMatch[]; baselineInfo: { score: number; count: number } | null } { + if (results.length < 4) { + return { filteredResults: results, baselineInfo: null }; + } + + // Count occurrences of each score (rounded to 4 decimal places) + const scoreCounts = new Map(); + results.forEach(r => { + const rounded = Math.round(r.score * 10000) / 10000; + scoreCounts.set(rounded, (scoreCounts.get(rounded) || 0) + 1); + }); + + // Find the most dominant score + let dominantScore: number | null = null; + let dominantCount = 0; + scoreCounts.forEach((count, score) => { + if (count > dominantCount) { + dominantCount = count; + dominantScore = score; + } + }); + + // If a single score is present in >50% of results and is high (>0.9), + // treat it as a suspicious baseline score and filter it out + const BASELINE_THRESHOLD = 0.5; // >50% of results have same score + const HIGH_SCORE_THRESHOLD = 0.9; // Score above 0.9 is suspiciously high + + if ( + dominantScore !== null && + dominantCount > results.length * BASELINE_THRESHOLD && + dominantScore > HIGH_SCORE_THRESHOLD + ) { + const filteredResults = results.filter(r => { + const rounded = Math.round(r.score * 10000) / 10000; + return rounded !== dominantScore; + }); + + return { + filteredResults, + baselineInfo: { score: dominantScore, count: dominantCount }, + }; + } + + return { filteredResults: results, baselineInfo: null }; +} + /** * TypeScript implementation of Reciprocal Rank Fusion * Reference: codex-lens/src/codexlens/search/ranking.py @@ -963,34 +1285,52 @@ async function executePriorityFallbackMode(params: Params): Promise { + const { pattern, path = '.', limit = 20, offset = 0, includeHidden = false, caseSensitive = true } = params; + + if (!pattern) { + return { + success: false, + error: 'Pattern is required for find_files action. Use glob patterns like "*.ts", "src/**/*.js", or "test_*.py"', + }; + } + + // Use ripgrep with --files flag for fast file listing with glob pattern + const hasRipgrep = checkToolAvailability('rg'); + + if (!hasRipgrep) { + // Fallback to CodexLens file listing if available + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: 'Neither ripgrep nor CodexLens available for file discovery.', + }; + } + + // Try CodexLens file list command + const args = ['list-files', '--json']; + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: `Failed to list files: ${result.error}`, + }; + } + + // Parse and filter results by pattern + let files: string[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '[]')); + files = Array.isArray(parsed) ? 
parsed : (parsed.files || []); + } catch { + return { + success: false, + error: 'Failed to parse file list from CodexLens', + }; + } + + // Apply glob pattern matching using minimatch-style regex + const globRegex = globToRegex(pattern, caseSensitive); + const matchedFiles = files.filter(f => globRegex.test(f)); + + // Apply pagination + const total = matchedFiles.length; + const paginatedFiles = matchedFiles.slice(offset, offset + limit); + + const results: FileMatch[] = paginatedFiles.map(filePath => { + const parts = filePath.split(/[/\\]/); + const name = parts[parts.length - 1] || ''; + const ext = name.includes('.') ? name.split('.').pop() : undefined; + return { + path: filePath, + type: 'file' as const, + name, + extension: ext, + }; + }); + + return { + success: true, + results, + metadata: { + pattern, + backend: 'codexlens', + count: results.length, + pagination: { + offset, + limit, + total, + has_more: offset + limit < total, + }, + }, + }; + } + + // Use ripgrep --files with glob pattern for fast file discovery + return new Promise((resolve) => { + const args = ['--files']; + + // Add exclude patterns + if (!includeHidden) { + args.push(...buildExcludeArgs()); + } else { + args.push('--hidden'); + } + + // Add glob pattern + args.push('--glob', pattern); + + // Case sensitivity for glob matching + if (!caseSensitive) { + args.push('--iglob', pattern); + // Remove the case-sensitive glob and use iglob instead + const globIndex = args.indexOf('--glob'); + if (globIndex !== -1) { + args.splice(globIndex, 2); + } + } + + const child = spawn('rg', args, { + cwd: path || process.cwd(), + stdio: ['ignore', 'pipe', 'pipe'], + }); + + let stdout = ''; + let stderr = ''; + + child.stdout.on('data', (data) => { + stdout += data.toString(); + }); + + child.stderr.on('data', (data) => { + stderr += data.toString(); + }); + + child.on('close', (code) => { + // ripgrep returns 1 when no matches found, which is not an error + if (code !== 0 && code !== 1 && !stderr.includes('os error 1')) { + resolve({ + success: false, + error: `ripgrep file search failed: ${stderr}`, + }); + return; + } + + const allFiles = stdout.split('\n').filter(line => line.trim()); + const total = allFiles.length; + + // Apply pagination + const paginatedFiles = allFiles.slice(offset, offset + limit); + + const results: FileMatch[] = paginatedFiles.map(filePath => { + const normalizedPath = filePath.replace(/\\/g, '/'); + const parts = normalizedPath.split('/'); + const name = parts[parts.length - 1] || ''; + const ext = name.includes('.') ? 
name.split('.').pop() : undefined; + return { + path: normalizedPath, + type: 'file' as const, + name, + extension: ext, + }; + }); + + resolve({ + success: true, + results, + metadata: { + pattern, + backend: 'ripgrep', + count: results.length, + pagination: { + offset, + limit, + total, + has_more: offset + limit < total, + }, + }, + }); + }); + + child.on('error', (error) => { + resolve({ + success: false, + error: `Failed to spawn ripgrep: ${error.message}`, + }); + }); + }); +} + +/** + * Convert glob pattern to regex for file matching + * Supports: *, **, ?, [abc], [!abc] + */ +function globToRegex(pattern: string, caseSensitive: boolean = true): RegExp { + let i = 0; + const out: string[] = []; + const special = '.^$+{}|()'; + + while (i < pattern.length) { + const c = pattern[i]; + + if (c === '*') { + if (i + 1 < pattern.length && pattern[i + 1] === '*') { + // ** matches any path including / + out.push('.*'); + i += 2; + // Skip following / if present + if (pattern[i] === '/') { + i++; + } + continue; + } else { + // * matches any character except / + out.push('[^/]*'); + } + } else if (c === '?') { + out.push('[^/]'); + } else if (c === '[') { + // Character class + let j = i + 1; + let negated = false; + if (pattern[j] === '!' || pattern[j] === '^') { + negated = true; + j++; + } + let classContent = ''; + while (j < pattern.length && pattern[j] !== ']') { + classContent += pattern[j]; + j++; + } + if (negated) { + out.push(`[^${classContent}]`); + } else { + out.push(`[${classContent}]`); + } + i = j; + } else if (special.includes(c)) { + out.push('\\' + c); + } else { + out.push(c); + } + i++; + } + + const flags = caseSensitive ? '' : 'i'; + return new RegExp('^' + out.join('') + '$', flags); +} + +/** + * Apply pagination to search results and add pagination metadata + */ +function applyPagination( + results: T[], + offset: number, + limit: number +): { paginatedResults: T[]; pagination: PaginationInfo } { + const total = results.length; + const paginatedResults = results.slice(offset, offset + limit); + + return { + paginatedResults, + pagination: { + offset, + limit, + total, + has_more: offset + limit < total, + }, + }; +} + /** * Transform results based on output_mode */ @@ -1095,14 +1713,17 @@ export async function handler(params: Record): Promise): Promise): Promise; + +// Search mode constants +const SEARCH_MODES = ['auto', 'hybrid', 'exact', 'ripgrep', 'priority'] as const; + +// Classification confidence threshold +const CONFIDENCE_THRESHOLD = 0.7; + +interface Classification { + mode: string; + confidence: number; + reasoning: string; +} + +interface ExactMatch { + file: string; + line: number; + column: number; + content: string; +} + +interface RelationshipInfo { + type: string; // 'calls', 'imports', 'called_by', 'imported_by' + direction: 'outgoing' | 'incoming'; + target?: string; // Target symbol name (for outgoing) + source?: string; // Source symbol name (for incoming) + file: string; // File path + line?: number; // Line number +} + +interface SemanticMatch { + file: string; + score: number; + content: string; + symbol: string | null; + relationships?: RelationshipInfo[]; +} + +interface GraphMatch { + file: string; + symbols: unknown; + relationships: unknown[]; +} + +interface SearchMetadata { + mode?: string; + backend?: string; + count?: number; + query?: string; + classified_as?: string; + confidence?: number; + reasoning?: string; + embeddings_coverage_percent?: number; + warning?: string; + note?: string; + index_status?: 'indexed' | 'not_indexed' 
| 'partial'; + fallback_history?: string[]; + // Init action specific + action?: string; + path?: string; + progress?: { + stage: string; + message: string; + percent: number; + filesProcessed?: number; + totalFiles?: number; + }; + progressHistory?: ProgressInfo[]; +} + +interface SearchResult { + success: boolean; + results?: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown; + output?: string; + metadata?: SearchMetadata; + error?: string; + status?: unknown; + message?: string; +} + +interface IndexStatus { + indexed: boolean; + has_embeddings: boolean; + file_count?: number; + embeddings_coverage_percent?: number; + warning?: string; +} + +/** + * Strip ANSI color codes from string (for JSON parsing) + */ +function stripAnsi(str: string): string { + return str.replace(/\x1b\[[0-9;]*m/g, ''); +} + +/** + * Check if CodexLens index exists for current directory + * @param path - Directory path to check + * @returns Index status + */ +async function checkIndexStatus(path: string = '.'): Promise { + try { + const result = await executeCodexLens(['status', '--json'], { cwd: path }); + + if (!result.success) { + return { + indexed: false, + has_embeddings: false, + warning: 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.', + }; + } + + // Parse status output + try { + // Strip ANSI color codes from JSON output + const cleanOutput = stripAnsi(result.output || '{}'); + const parsed = JSON.parse(cleanOutput); + // Handle both direct and nested response formats (status returns {success, result: {...}}) + const status = parsed.result || parsed; + const indexed = status.projects_count > 0 || status.total_files > 0; + + // Get embeddings coverage from comprehensive status + const embeddingsData = status.embeddings || {}; + const embeddingsCoverage = embeddingsData.coverage_percent || 0; + const has_embeddings = embeddingsCoverage >= 50; // Threshold: 50% + + let warning: string | undefined; + if (!indexed) { + warning = 'No CodexLens index found. Run smart_search(action="init") to create index for better search results.'; + } else if (embeddingsCoverage === 0) { + warning = 'Index exists but no embeddings generated. Run: codexlens embeddings-generate --recursive'; + } else if (embeddingsCoverage < 50) { + warning = `Embeddings coverage is ${embeddingsCoverage.toFixed(1)}% (below 50%). Hybrid search will use exact mode. 
Run: codexlens embeddings-generate --recursive`; + } + + return { + indexed, + has_embeddings, + file_count: status.total_files, + embeddings_coverage_percent: embeddingsCoverage, + warning, + }; + } catch { + return { + indexed: false, + has_embeddings: false, + warning: 'Failed to parse index status', + }; + } + } catch { + return { + indexed: false, + has_embeddings: false, + warning: 'CodexLens not available', + }; + } +} + +/** + * Detection heuristics for intent classification + */ + +/** + * Detect literal string query (simple alphanumeric or quoted strings) + */ +function detectLiteral(query: string): boolean { + return /^[a-zA-Z0-9_-]+$/.test(query) || /^["'].*["']$/.test(query); +} + +/** + * Detect regex pattern (contains regex metacharacters) + */ +function detectRegex(query: string): boolean { + return /[.*+?^${}()|[\]\\]/.test(query); +} + +/** + * Detect natural language query (sentence structure, questions, multi-word phrases) + */ +function detectNaturalLanguage(query: string): boolean { + return query.split(/\s+/).length >= 3 || /\?$/.test(query); +} + +/** + * Detect file path query (path separators, file extensions) + */ +function detectFilePath(query: string): boolean { + return /[/\\]/.test(query) || /\.[a-z]{2,4}$/i.test(query); +} + +/** + * Detect relationship query (import, export, dependency keywords) + */ +function detectRelationship(query: string): boolean { + return /(import|export|uses?|depends?|calls?|extends?)\s/i.test(query); +} + +/** + * Classify query intent and recommend search mode + * Simple mapping: hybrid (NL + index + embeddings) | exact (index or insufficient embeddings) | ripgrep (no index) + * @param query - Search query string + * @param hasIndex - Whether CodexLens index exists + * @param hasSufficientEmbeddings - Whether embeddings coverage >= 50% + * @returns Classification result + */ +function classifyIntent(query: string, hasIndex: boolean = false, hasSufficientEmbeddings: boolean = false): Classification { + // Detect query patterns + const isNaturalLanguage = detectNaturalLanguage(query); + + // Simple decision tree + let mode: string; + let confidence: number; + + if (!hasIndex) { + // No index: use ripgrep + mode = 'ripgrep'; + confidence = 1.0; + } else if (isNaturalLanguage && hasSufficientEmbeddings) { + // Natural language + sufficient embeddings: use hybrid + mode = 'hybrid'; + confidence = 0.9; + } else { + // Simple query OR insufficient embeddings: use exact + mode = 'exact'; + confidence = 0.8; + } + + // Build reasoning string + const detectedPatterns: string[] = []; + if (detectLiteral(query)) detectedPatterns.push('literal'); + if (detectRegex(query)) detectedPatterns.push('regex'); + if (detectNaturalLanguage(query)) detectedPatterns.push('natural language'); + if (detectFilePath(query)) detectedPatterns.push('file path'); + if (detectRelationship(query)) detectedPatterns.push('relationship'); + + const reasoning = `Query classified as ${mode} (confidence: ${confidence.toFixed(2)}, detected: ${detectedPatterns.join(', ')}, index: ${hasIndex ? 'available' : 'not available'}, embeddings: ${hasSufficientEmbeddings ? 'sufficient' : 'insufficient'})`; + + return { mode, confidence, reasoning }; +} + +/** + * Check if a tool is available in PATH + * @param toolName - Tool executable name + * @returns True if available + */ +function checkToolAvailability(toolName: string): boolean { + try { + const isWindows = process.platform === 'win32'; + const command = isWindows ? 
'where' : 'which'; + execSync(`${command} ${toolName}`, { stdio: 'ignore' }); + return true; + } catch { + return false; + } +} + +/** + * Build ripgrep command arguments + * @param params - Search parameters + * @returns Command and arguments + */ +function buildRipgrepCommand(params: { + query: string; + paths: string[]; + contextLines: number; + maxResults: number; + includeHidden: boolean; +}): { command: string; args: string[] } { + const { query, paths = ['.'], contextLines = 0, maxResults = 10, includeHidden = false } = params; + + const args = [ + '-n', // Show line numbers + '--color=never', // Disable color output + '--json', // Output in JSON format + ]; + + // Add context lines if specified + if (contextLines > 0) { + args.push('-C', contextLines.toString()); + } + + // Add max results limit + if (maxResults > 0) { + args.push('--max-count', maxResults.toString()); + } + + // Include hidden files if specified + if (includeHidden) { + args.push('--hidden'); + } + + // Use literal/fixed string matching for exact mode + args.push('-F', query); + + // Add search paths + args.push(...paths); + + return { command: 'rg', args }; +} + +/** + * Action: init - Initialize CodexLens index (FTS only, no embeddings) + * For semantic/vector search, use ccw view dashboard or codexlens CLI directly + */ +async function executeInitAction(params: Params): Promise { + const { path = '.', languages } = params; + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}. CodexLens will be auto-installed on first use.`, + }; + } + + // Build args with --no-embeddings for FTS-only index (faster) + const args = ['init', path, '--no-embeddings']; + if (languages && languages.length > 0) { + args.push('--languages', languages.join(',')); + } + + // Track progress updates + const progressUpdates: ProgressInfo[] = []; + let lastProgress: ProgressInfo | null = null; + + const result = await executeCodexLens(args, { + cwd: path, + timeout: 1800000, // 30 minutes for large codebases + onProgress: (progress: ProgressInfo) => { + progressUpdates.push(progress); + lastProgress = progress; + }, + }); + + // Build metadata with progress info + const metadata: SearchMetadata = { + action: 'init', + path, + }; + + if (lastProgress !== null) { + const p = lastProgress as ProgressInfo; + metadata.progress = { + stage: p.stage, + message: p.message, + percent: p.percent, + filesProcessed: p.filesProcessed, + totalFiles: p.totalFiles, + }; + } + + if (progressUpdates.length > 0) { + metadata.progressHistory = progressUpdates.slice(-5); // Keep last 5 progress updates + } + + const successMessage = result.success + ? `FTS index created for ${path}. Note: For semantic/vector search, create vector index via "ccw view" dashboard or run "codexlens init ${path}" (without --no-embeddings).` + : undefined; + + return { + success: result.success, + error: result.error, + message: successMessage, + metadata, + }; +} + +/** + * Action: status - Check CodexLens index status + */ +async function executeStatusAction(params: Params): Promise { + const { path = '.' } = params; + + const indexStatus = await checkIndexStatus(path); + + return { + success: true, + status: indexStatus, + message: indexStatus.warning || `Index status: ${indexStatus.indexed ? 'indexed' : 'not indexed'}, embeddings: ${indexStatus.has_embeddings ? 
'available' : 'not available'}`, + }; +} + +/** + * Mode: auto - Intent classification and mode selection + * Routes to: hybrid (NL + index) | exact (index) | ripgrep (no index) + */ +async function executeAutoMode(params: Params): Promise { + const { query, path = '.' } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search action', + }; + } + + // Check index status + const indexStatus = await checkIndexStatus(path); + + // Classify intent with index and embeddings awareness + const classification = classifyIntent( + query, + indexStatus.indexed, + indexStatus.has_embeddings // This now considers 50% threshold + ); + + // Route to appropriate mode based on classification + let result: SearchResult; + + switch (classification.mode) { + case 'hybrid': + result = await executeHybridMode(params); + break; + + case 'exact': + result = await executeCodexLensExactMode(params); + break; + + case 'ripgrep': + result = await executeRipgrepMode(params); + break; + + default: + // Fallback to ripgrep + result = await executeRipgrepMode(params); + break; + } + + // Add classification metadata + if (result.metadata) { + result.metadata.classified_as = classification.mode; + result.metadata.confidence = classification.confidence; + result.metadata.reasoning = classification.reasoning; + result.metadata.embeddings_coverage_percent = indexStatus.embeddings_coverage_percent; + result.metadata.index_status = indexStatus.indexed + ? (indexStatus.has_embeddings ? 'indexed' : 'partial') + : 'not_indexed'; + + // Add warning if needed + if (indexStatus.warning) { + result.metadata.warning = indexStatus.warning; + } + } + + return result; +} + +/** + * Mode: ripgrep - Fast literal string matching using ripgrep + * No index required, fallback to CodexLens if ripgrep unavailable + */ +async function executeRipgrepMode(params: Params): Promise { + const { query, paths = [], contextLines = 0, maxResults = 10, includeHidden = false, path = '.' } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search', + }; + } + + // Check if ripgrep is available + const hasRipgrep = checkToolAvailability('rg'); + + // If ripgrep not available, fall back to CodexLens exact mode + if (!hasRipgrep) { + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: 'Neither ripgrep nor CodexLens available. Install ripgrep (rg) or CodexLens for search functionality.', + }; + } + + // Use CodexLens exact mode as fallback + const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json']; + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: result.error, + metadata: { + mode: 'ripgrep', + backend: 'codexlens-fallback', + count: 0, + query, + }, + }; + } + + // Parse results + let results: SemanticMatch[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '{}')); + const data = parsed.result?.results || parsed.results || parsed; + results = (Array.isArray(data) ? 
data : []).map((item: any) => ({ + file: item.path || item.file, + score: item.score || 0, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + })); + } catch { + // Keep empty results + } + + return { + success: true, + results, + metadata: { + mode: 'ripgrep', + backend: 'codexlens-fallback', + count: results.length, + query, + note: 'Using CodexLens exact mode (ripgrep not available)', + }, + }; + } + + // Use ripgrep + const { command, args } = buildRipgrepCommand({ + query, + paths: paths.length > 0 ? paths : [path], + contextLines, + maxResults, + includeHidden, + }); + + return new Promise((resolve) => { + const child = spawn(command, args, { + cwd: path || process.cwd(), + stdio: ['ignore', 'pipe', 'pipe'], + }); + + let stdout = ''; + let stderr = ''; + + child.stdout.on('data', (data) => { + stdout += data.toString(); + }); + + child.stderr.on('data', (data) => { + stderr += data.toString(); + }); + + child.on('close', (code) => { + const results: ExactMatch[] = []; + + if (code === 0 || (code === 1 && stdout.trim())) { + const lines = stdout.split('\n').filter((line) => line.trim()); + + for (const line of lines) { + try { + const item = JSON.parse(line); + + if (item.type === 'match') { + const match: ExactMatch = { + file: item.data.path.text, + line: item.data.line_number, + column: + item.data.submatches && item.data.submatches[0] + ? item.data.submatches[0].start + 1 + : 1, + content: item.data.lines.text.trim(), + }; + results.push(match); + } + } catch { + continue; + } + } + + resolve({ + success: true, + results, + metadata: { + mode: 'ripgrep', + backend: 'ripgrep', + count: results.length, + query, + }, + }); + } else { + resolve({ + success: false, + error: `ripgrep execution failed with code ${code}: ${stderr}`, + results: [], + }); + } + }); + + child.on('error', (error) => { + resolve({ + success: false, + error: `Failed to spawn ripgrep: ${error.message}`, + results: [], + }); + }); + }); +} + +/** + * Mode: exact - CodexLens exact/FTS search + * Requires index + */ +async function executeCodexLensExactMode(params: Params): Promise { + const { query, path = '.', maxResults = 10, enrich = false } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search', + }; + } + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}`, + }; + } + + // Check index status + const indexStatus = await checkIndexStatus(path); + + const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'exact', '--json']; + if (enrich) { + args.push('--enrich'); + } + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: result.error, + metadata: { + mode: 'exact', + backend: 'codexlens', + count: 0, + query, + warning: indexStatus.warning, + }, + }; + } + + // Parse results + let results: SemanticMatch[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '{}')); + const data = parsed.result?.results || parsed.results || parsed; + results = (Array.isArray(data) ? 
data : []).map((item: any) => ({ + file: item.path || item.file, + score: item.score || 0, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + })); + } catch { + // Keep empty results + } + + return { + success: true, + results, + metadata: { + mode: 'exact', + backend: 'codexlens', + count: results.length, + query, + warning: indexStatus.warning, + }, + }; +} + +/** + * Mode: hybrid - Best quality search with RRF fusion + * Uses CodexLens hybrid mode (exact + fuzzy + vector) + * Requires index with embeddings + */ +async function executeHybridMode(params: Params): Promise { + const { query, path = '.', maxResults = 10, enrich = false } = params; + + if (!query) { + return { + success: false, + error: 'Query is required for search', + }; + } + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}`, + }; + } + + // Check index status + const indexStatus = await checkIndexStatus(path); + + const args = ['search', query, '--limit', maxResults.toString(), '--mode', 'hybrid', '--json']; + if (enrich) { + args.push('--enrich'); + } + const result = await executeCodexLens(args, { cwd: path }); + + if (!result.success) { + return { + success: false, + error: result.error, + metadata: { + mode: 'hybrid', + backend: 'codexlens', + count: 0, + query, + warning: indexStatus.warning, + }, + }; + } + + // Parse results + let results: SemanticMatch[] = []; + try { + const parsed = JSON.parse(stripAnsi(result.output || '{}')); + const data = parsed.result?.results || parsed.results || parsed; + results = (Array.isArray(data) ? data : []).map((item: any) => ({ + file: item.path || item.file, + score: item.score || 0, + content: item.excerpt || item.content || '', + symbol: item.symbol || null, + })); + } catch { + return { + success: true, + results: [], + output: result.output, + metadata: { + mode: 'hybrid', + backend: 'codexlens', + count: 0, + query, + warning: indexStatus.warning || 'Failed to parse JSON output', + }, + }; + } + + return { + success: true, + results, + metadata: { + mode: 'hybrid', + backend: 'codexlens', + count: results.length, + query, + note: 'Hybrid mode uses RRF fusion (exact + fuzzy + vector) for best results', + warning: indexStatus.warning, + }, + }; +} + +/** + * TypeScript implementation of Reciprocal Rank Fusion + * Reference: codex-lens/src/codexlens/search/ranking.py + * Formula: score(d) = Σ weight_source / (k + rank_source(d)) + */ +function applyRRFFusion( + resultsMap: Map, + weights: Record, + limit: number, + k: number = 60, +): any[] { + const pathScores = new Map(); + + resultsMap.forEach((results, source) => { + const weight = weights[source] || 0; + if (weight === 0 || !results) return; + + results.forEach((result, rank) => { + const path = result.file || result.path; + if (!path) return; + + const rrfContribution = weight / (k + rank + 1); + + if (!pathScores.has(path)) { + pathScores.set(path, { score: 0, result, sources: [] }); + } + const entry = pathScores.get(path)!; + entry.score += rrfContribution; + if (!entry.sources.includes(source)) { + entry.sources.push(source); + } + }); + }); + + // Sort by fusion score descending + return Array.from(pathScores.values()) + .sort((a, b) => b.score - a.score) + .slice(0, limit) + .map(item => ({ + ...item.result, + fusion_score: item.score, + matched_backends: item.sources, + })); +} + +/** + * Promise wrapper with timeout support + * @param 
promise - The promise to wrap + * @param ms - Timeout in milliseconds + * @param modeName - Name of the mode for error message + * @returns A new promise that rejects on timeout + */ +function withTimeout(promise: Promise, ms: number, modeName: string): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout(() => { + reject(new Error(`'${modeName}' search timed out after ${ms}ms`)); + }, ms); + + promise + .then(resolve) + .catch(reject) + .finally(() => clearTimeout(timer)); + }); +} + +/** + * Mode: priority - Fallback search strategy: hybrid -> exact -> ripgrep + * Returns results from the first backend that succeeds and provides results. + * More efficient than parallel mode - stops as soon as valid results are found. + */ +async function executePriorityFallbackMode(params: Params): Promise { + const { query, path = '.' } = params; + const fallbackHistory: string[] = []; + + if (!query) { + return { success: false, error: 'Query is required for search' }; + } + + // Check index status first + const indexStatus = await checkIndexStatus(path); + + // 1. Try Hybrid search (highest priority) - 90s timeout for large indexes + if (indexStatus.indexed && indexStatus.has_embeddings) { + try { + const hybridResult = await withTimeout(executeHybridMode(params), 90000, 'hybrid'); + if (hybridResult.success && hybridResult.results && (hybridResult.results as any[]).length > 0) { + fallbackHistory.push('hybrid: success'); + return { + ...hybridResult, + metadata: { + ...hybridResult.metadata, + mode: 'priority', + note: 'Result from hybrid search (semantic + vector).', + fallback_history: fallbackHistory, + }, + }; + } + fallbackHistory.push('hybrid: no results'); + } catch (error) { + fallbackHistory.push(`hybrid: ${(error as Error).message}`); + } + } else { + fallbackHistory.push(`hybrid: skipped (${!indexStatus.indexed ? 'no index' : 'no embeddings'})`); + } + + // 2. Fallback to Exact search - 10s timeout + if (indexStatus.indexed) { + try { + const exactResult = await withTimeout(executeCodexLensExactMode(params), 10000, 'exact'); + if (exactResult.success && exactResult.results && (exactResult.results as any[]).length > 0) { + fallbackHistory.push('exact: success'); + return { + ...exactResult, + metadata: { + ...exactResult.metadata, + mode: 'priority', + note: 'Result from exact/FTS search (fallback from hybrid).', + fallback_history: fallbackHistory, + }, + }; + } + fallbackHistory.push('exact: no results'); + } catch (error) { + fallbackHistory.push(`exact: ${(error as Error).message}`); + } + } else { + fallbackHistory.push('exact: skipped (no index)'); + } + + // 3. Final fallback to Ripgrep - 5s timeout + try { + const ripgrepResult = await withTimeout(executeRipgrepMode(params), 5000, 'ripgrep'); + fallbackHistory.push(ripgrepResult.success ? 'ripgrep: success' : 'ripgrep: failed'); + return { + ...ripgrepResult, + metadata: { + ...ripgrepResult.metadata, + mode: 'priority', + note: 'Result from ripgrep search (final fallback).', + fallback_history: fallbackHistory, + }, + }; + } catch (error) { + fallbackHistory.push(`ripgrep: ${(error as Error).message}`); + } + + // All modes failed + return { + success: false, + error: 'All search backends in priority mode failed or returned no results.', + metadata: { + mode: 'priority', + query, + fallback_history: fallbackHistory, + } as any, + }; +} + +// Tool schema for MCP +export const schema: ToolSchema = { + name: 'smart_search', + description: `Intelligent code search with five modes. 
Use "auto" mode (default) for intelligent routing. + +**Usage:** + smart_search(query="authentication logic") # auto mode - routes to best backend + smart_search(query="MyClass", mode="exact") # exact mode - precise FTS matching + smart_search(query="auth", mode="ripgrep") # ripgrep mode - fast literal search (no index) + smart_search(query="how to auth", mode="hybrid") # hybrid mode - semantic search (requires index) + +**Index Management:** + smart_search(action="init") # Create FTS index for current directory + smart_search(action="status") # Check index and embedding status + +**Graph Enrichment:** + smart_search(query="func", enrich=true) # Enrich results with code relationships (calls, imports, called_by, imported_by) + +**Modes:** auto (intelligent routing), hybrid (semantic, needs index), exact (FTS), ripgrep (fast, no index), priority (fallback: hybrid→exact→ripgrep)`, + inputSchema: { + type: 'object', + properties: { + action: { + type: 'string', + enum: ['init', 'search', 'search_files', 'status'], + description: 'Action to perform: init (create FTS index, no embeddings), search (default), search_files (paths only), status (check index)', + default: 'search', + }, + query: { + type: 'string', + description: 'Search query (required for search/search_files actions)', + }, + mode: { + type: 'string', + enum: SEARCH_MODES, + description: 'Search mode: auto (default), hybrid (best quality), exact (CodexLens FTS), ripgrep (fast, no index), priority (fallback: hybrid->exact->ripgrep)', + default: 'auto', + }, + output_mode: { + type: 'string', + enum: ['full', 'files_only', 'count'], + description: 'Output format: full (default), files_only (paths only), count (per-file counts)', + default: 'full', + }, + path: { + type: 'string', + description: 'Directory path for init/search actions (default: current directory)', + }, + paths: { + type: 'array', + description: 'Multiple paths to search within (for search action)', + items: { + type: 'string', + }, + default: [], + }, + contextLines: { + type: 'number', + description: 'Number of context lines around matches (exact mode only)', + default: 0, + }, + maxResults: { + type: 'number', + description: 'Maximum number of results (default: 10)', + default: 10, + }, + limit: { + type: 'number', + description: 'Alias for maxResults', + default: 10, + }, + includeHidden: { + type: 'boolean', + description: 'Include hidden files/directories', + default: false, + }, + languages: { + type: 'array', + items: { type: 'string' }, + description: 'Languages to index (for init action). 
Example: ["javascript", "typescript"]', + }, + enrich: { + type: 'boolean', + description: 'Enrich search results with code graph relationships (calls, imports, called_by, imported_by).', + default: false, + }, + }, + required: [], + }, +}; + +/** + * Transform results based on output_mode + */ +function transformOutput( + results: ExactMatch[] | SemanticMatch[] | GraphMatch[] | unknown[], + outputMode: 'full' | 'files_only' | 'count' +): unknown { + if (!Array.isArray(results)) { + return results; + } + + switch (outputMode) { + case 'files_only': { + // Extract unique file paths + const files = [...new Set(results.map((r: any) => r.file))].filter(Boolean); + return { files, count: files.length }; + } + case 'count': { + // Count matches per file + const counts: Record = {}; + for (const r of results) { + const file = (r as any).file; + if (file) { + counts[file] = (counts[file] || 0) + 1; + } + } + return { + files: Object.entries(counts).map(([file, count]) => ({ file, count })), + total: results.length, + }; + } + case 'full': + default: + return results; + } +} + +// Handler function +export async function handler(params: Record): Promise> { + const parsed = ParamsSchema.safeParse(params); + if (!parsed.success) { + return { success: false, error: `Invalid params: ${parsed.error.message}` }; + } + + const { action, mode, output_mode } = parsed.data; + + // Sync limit and maxResults - use the larger of the two if both provided + // This ensures user-provided values take precedence over defaults + const effectiveLimit = Math.max(parsed.data.limit || 10, parsed.data.maxResults || 10); + parsed.data.maxResults = effectiveLimit; + parsed.data.limit = effectiveLimit; + + try { + let result: SearchResult; + + // Handle actions + switch (action) { + case 'init': + result = await executeInitAction(parsed.data); + break; + + case 'status': + result = await executeStatusAction(parsed.data); + break; + + case 'search_files': + // For search_files, use search mode but force files_only output + parsed.data.output_mode = 'files_only'; + // Fall through to search + + case 'search': + default: + // Handle search modes: auto | hybrid | exact | ripgrep | priority + switch (mode) { + case 'auto': + result = await executeAutoMode(parsed.data); + break; + case 'hybrid': + result = await executeHybridMode(parsed.data); + break; + case 'exact': + result = await executeCodexLensExactMode(parsed.data); + break; + case 'ripgrep': + result = await executeRipgrepMode(parsed.data); + break; + case 'priority': + result = await executePriorityFallbackMode(parsed.data); + break; + default: + throw new Error(`Unsupported mode: ${mode}. Use: auto, hybrid, exact, ripgrep, or priority`); + } + break; + } + + // Transform output based on output_mode (for search actions only) + if (action === 'search' || action === 'search_files') { + if (result.success && result.results && output_mode !== 'full') { + result.results = transformOutput(result.results as any[], output_mode); + } + } + + return result.success ? 
{ success: true, result } : { success: false, error: result.error }; + } catch (error) { + return { success: false, error: (error as Error).message }; + } +} + +/** + * Execute init action with external progress callback + * Used by MCP server for streaming progress + */ +export async function executeInitWithProgress( + params: Record, + onProgress?: (progress: ProgressInfo) => void +): Promise { + const path = (params.path as string) || '.'; + const languages = params.languages as string[] | undefined; + + // Check CodexLens availability + const readyStatus = await ensureCodexLensReady(); + if (!readyStatus.ready) { + return { + success: false, + error: `CodexLens not available: ${readyStatus.error}. CodexLens will be auto-installed on first use.`, + }; + } + + const args = ['init', path]; + if (languages && languages.length > 0) { + args.push('--languages', languages.join(',')); + } + + // Track progress updates + const progressUpdates: ProgressInfo[] = []; + let lastProgress: ProgressInfo | null = null; + + const result = await executeCodexLens(args, { + cwd: path, + timeout: 1800000, // 30 minutes for large codebases + onProgress: (progress: ProgressInfo) => { + progressUpdates.push(progress); + lastProgress = progress; + // Call external progress callback if provided + if (onProgress) { + onProgress(progress); + } + }, + }); + + // Build metadata with progress info + const metadata: SearchMetadata = { + action: 'init', + path, + }; + + if (lastProgress !== null) { + const p = lastProgress as ProgressInfo; + metadata.progress = { + stage: p.stage, + message: p.message, + percent: p.percent, + filesProcessed: p.filesProcessed, + totalFiles: p.totalFiles, + }; + } + + if (progressUpdates.length > 0) { + metadata.progressHistory = progressUpdates.slice(-5); + } + + return { + success: result.success, + error: result.error, + message: result.success + ? `CodexLens index created successfully for ${path}` + : undefined, + metadata, + }; +} diff --git a/codex-lens/src/codexlens/cli/embedding_manager.py b/codex-lens/src/codexlens/cli/embedding_manager.py index 5adb3ca9..f4067840 100644 --- a/codex-lens/src/codexlens/cli/embedding_manager.py +++ b/codex-lens/src/codexlens/cli/embedding_manager.py @@ -18,6 +18,27 @@ except ImportError: logger = logging.getLogger(__name__) +def _get_path_column(conn: sqlite3.Connection) -> str: + """Detect whether files table uses 'path' or 'full_path' column. + + Args: + conn: SQLite connection to the index database + + Returns: + Column name ('path' or 'full_path') + + Raises: + ValueError: If neither column exists in files table + """ + cursor = conn.execute("PRAGMA table_info(files)") + columns = {row[1] for row in cursor.fetchall()} + if 'full_path' in columns: + return 'full_path' + elif 'path' in columns: + return 'path' + raise ValueError("files table has neither 'path' nor 'full_path' column") + + def check_index_embeddings(index_path: Path) -> Dict[str, any]: """Check if an index has embeddings and return statistics. 
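# --- Illustrative note (not part of the patch) ---
# A minimal sketch of how _get_path_column keeps queries schema-agnostic across
# older indexes that expose 'path' and newer ones that expose 'full_path'. The
# files/semantic_chunks tables and the NOT IN query shape come from the hunks in
# this file; the wrapper function below and the import path (inferred from the
# repository layout) are hypothetical.
import sqlite3
from pathlib import Path

from codexlens.cli.embedding_manager import _get_path_column

def count_files_without_chunks(index_path: Path) -> int:
    """Count indexed files that have no semantic chunks yet, whatever the schema."""
    with sqlite3.connect(index_path) as conn:
        path_column = _get_path_column(conn)
        row = conn.execute(
            f"""
            SELECT COUNT(*)
            FROM files
            WHERE {path_column} NOT IN (
                SELECT DISTINCT file_path FROM semantic_chunks
            )
            """
        ).fetchone()
        return int(row[0])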
@@ -75,10 +96,11 @@ def check_index_embeddings(index_path: Path) -> Dict[str, any]: files_with_chunks = cursor.fetchone()[0] # Get a sample of files without embeddings - cursor = conn.execute(""" - SELECT full_path + path_column = _get_path_column(conn) + cursor = conn.execute(f""" + SELECT {path_column} FROM files - WHERE full_path NOT IN ( + WHERE {path_column} NOT IN ( SELECT DISTINCT file_path FROM semantic_chunks ) LIMIT 5 @@ -113,7 +135,10 @@ def generate_embeddings( chunk_size: int = 2000, progress_callback: Optional[callable] = None, ) -> Dict[str, any]: - """Generate embeddings for an index. + """Generate embeddings for an index using memory-efficient batch processing. + + This function processes files in small batches to keep memory usage under 2GB, + regardless of the total project size. Args: index_path: Path to _index.db file @@ -181,126 +206,107 @@ def generate_embeddings( "error": f"Failed to initialize components: {str(e)}", } - # Read files from index + # --- MEMORY-OPTIMIZED STREAMING PROCESSING --- + # Process files in small batches to control memory usage + # This keeps peak memory under 2GB regardless of project size + start_time = time.time() + failed_files = [] + total_chunks_created = 0 + total_files_processed = 0 + FILE_BATCH_SIZE = 100 # Process 100 files at a time + EMBEDDING_BATCH_SIZE = 8 # jina-embeddings-v2-base-code needs small batches + try: with sqlite3.connect(index_path) as conn: conn.row_factory = sqlite3.Row - cursor = conn.execute("SELECT full_path, content, language FROM files") - files = cursor.fetchall() + path_column = _get_path_column(conn) + + # Get total file count for progress reporting + total_files = conn.execute("SELECT COUNT(*) FROM files").fetchone()[0] + if total_files == 0: + return {"success": False, "error": "No files found in index"} + + if progress_callback: + progress_callback(f"Processing {total_files} files in batches of {FILE_BATCH_SIZE}...") + + cursor = conn.execute(f"SELECT {path_column}, content, language FROM files") + batch_number = 0 + + while True: + # Fetch a batch of files (streaming, not fetchall) + file_batch = cursor.fetchmany(FILE_BATCH_SIZE) + if not file_batch: + break + + batch_number += 1 + batch_chunks_with_paths = [] + files_in_batch_with_chunks = set() + + # Step 1: Chunking for the current file batch + for file_row in file_batch: + file_path = file_row[path_column] + content = file_row["content"] + language = file_row["language"] or "python" + + try: + chunks = chunker.chunk_sliding_window( + content, + file_path=file_path, + language=language + ) + if chunks: + for chunk in chunks: + batch_chunks_with_paths.append((chunk, file_path)) + files_in_batch_with_chunks.add(file_path) + except Exception as e: + logger.error(f"Failed to chunk {file_path}: {e}") + failed_files.append((file_path, str(e))) + + if not batch_chunks_with_paths: + continue + + batch_chunk_count = len(batch_chunks_with_paths) + if progress_callback: + progress_callback(f" Batch {batch_number}: {len(file_batch)} files, {batch_chunk_count} chunks") + + # Step 2: Generate embeddings for this batch + batch_embeddings = [] + try: + for i in range(0, batch_chunk_count, EMBEDDING_BATCH_SIZE): + batch_end = min(i + EMBEDDING_BATCH_SIZE, batch_chunk_count) + batch_contents = [chunk.content for chunk, _ in batch_chunks_with_paths[i:batch_end]] + embeddings = embedder.embed(batch_contents) + batch_embeddings.extend(embeddings) + except Exception as e: + logger.error(f"Failed to generate embeddings for batch {batch_number}: {str(e)}") + 
failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch]) + continue + + # Step 3: Assign embeddings to chunks + for (chunk, _), embedding in zip(batch_chunks_with_paths, batch_embeddings): + chunk.embedding = embedding + + # Step 4: Store this batch to database immediately (releases memory) + try: + vector_store.add_chunks_batch(batch_chunks_with_paths) + total_chunks_created += batch_chunk_count + total_files_processed += len(files_in_batch_with_chunks) + except Exception as e: + logger.error(f"Failed to store batch {batch_number}: {str(e)}") + failed_files.extend([(file_row[path_column], str(e)) for file_row in file_batch]) + + # Memory is released here as batch_chunks_with_paths and batch_embeddings go out of scope + except Exception as e: - return { - "success": False, - "error": f"Failed to read files: {str(e)}", - } - - if len(files) == 0: - return { - "success": False, - "error": "No files found in index", - } - - if progress_callback: - progress_callback(f"Processing {len(files)} files...") - - # Process all files using batch operations for optimal performance - start_time = time.time() - failed_files = [] - - # --- OPTIMIZATION Step 1: Collect all chunks from all files --- - if progress_callback: - progress_callback(f"Step 1/4: Chunking {len(files)} files...") - - all_chunks_with_paths = [] # List of (chunk, file_path) tuples - files_with_chunks = set() - - for idx, file_row in enumerate(files, 1): - file_path = file_row["full_path"] - content = file_row["content"] - language = file_row["language"] or "python" - - try: - chunks = chunker.chunk_sliding_window( - content, - file_path=file_path, - language=language - ) - if chunks: - for chunk in chunks: - all_chunks_with_paths.append((chunk, file_path)) - files_with_chunks.add(file_path) - except Exception as e: - logger.error(f"Failed to chunk {file_path}: {e}") - failed_files.append((file_path, str(e))) - - if not all_chunks_with_paths: - elapsed_time = time.time() - start_time - return { - "success": True, - "result": { - "chunks_created": 0, - "files_processed": len(files) - len(failed_files), - "files_failed": len(failed_files), - "elapsed_time": elapsed_time, - "model_profile": model_profile, - "model_name": embedder.model_name, - "failed_files": failed_files[:5], - "index_path": str(index_path), - }, - } - - total_chunks = len(all_chunks_with_paths) - - # --- OPTIMIZATION Step 2: Batch generate embeddings with memory-safe batching --- - # Use smaller batches to avoid OOM errors while still benefiting from batch processing - # jina-embeddings-v2-base-code with long chunks needs small batches - BATCH_SIZE = 8 # Conservative batch size for memory efficiency - - if progress_callback: - num_batches = (total_chunks + BATCH_SIZE - 1) // BATCH_SIZE - progress_callback(f"Step 2/4: Generating embeddings for {total_chunks} chunks ({num_batches} batches)...") - - try: - all_embeddings = [] - for batch_start in range(0, total_chunks, BATCH_SIZE): - batch_end = min(batch_start + BATCH_SIZE, total_chunks) - batch_contents = [chunk.content for chunk, _ in all_chunks_with_paths[batch_start:batch_end]] - batch_embeddings = embedder.embed(batch_contents) - all_embeddings.extend(batch_embeddings) - - if progress_callback and total_chunks > BATCH_SIZE: - progress_callback(f" Batch {batch_start // BATCH_SIZE + 1}/{(total_chunks + BATCH_SIZE - 1) // BATCH_SIZE}: {len(batch_embeddings)} embeddings") - except Exception as e: - return { - "success": False, - "error": f"Failed to generate embeddings: {str(e)}", - } - - # --- 
OPTIMIZATION Step 3: Assign embeddings back to chunks --- - if progress_callback: - progress_callback(f"Step 3/4: Assigning {len(all_embeddings)} embeddings...") - - for (chunk, _), embedding in zip(all_chunks_with_paths, all_embeddings): - chunk.embedding = embedding - - # --- OPTIMIZATION Step 4: Batch store all chunks in single transaction --- - if progress_callback: - progress_callback(f"Step 4/4: Storing {total_chunks} chunks to database...") - - try: - vector_store.add_chunks_batch(all_chunks_with_paths) - except Exception as e: - return { - "success": False, - "error": f"Failed to store chunks: {str(e)}", - } + return {"success": False, "error": f"Failed to read or process files: {str(e)}"} elapsed_time = time.time() - start_time return { "success": True, "result": { - "chunks_created": total_chunks, - "files_processed": len(files_with_chunks), + "chunks_created": total_chunks_created, + "files_processed": total_files_processed, "files_failed": len(failed_files), "elapsed_time": elapsed_time, "model_profile": model_profile, diff --git a/codex-lens/src/codexlens/semantic/chunker.py b/codex-lens/src/codexlens/semantic/chunker.py index 38366dfb..a1df4686 100644 --- a/codex-lens/src/codexlens/semantic/chunker.py +++ b/codex-lens/src/codexlens/semantic/chunker.py @@ -150,8 +150,13 @@ class Chunker: chunk_idx += 1 # Move window, accounting for overlap - start = end - overlap_lines - if start >= len(lines) - overlap_lines: + step = lines_per_chunk - overlap_lines + if step <= 0: + step = 1 # Failsafe to prevent infinite loop + start += step + + # Break if we've reached the end + if end >= len(lines): break return chunks
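Note on the chunker change: a quick way to sanity-check the new window advance in Chunker.chunk_sliding_window is to trace the bounds it produces. The sketch below is a standalone simulation of the patched arithmetic only (it does not build chunk objects), using lines_per_chunk=50 and overlap_lines=10 as illustrative values rather than project defaults:

    # Simulate the patched sliding-window advance: step = lines_per_chunk - overlap_lines,
    # clamped to at least 1, stopping once a window reaches the end of the file.
    from typing import List, Tuple

    def window_bounds(total_lines: int, lines_per_chunk: int = 50, overlap_lines: int = 10) -> List[Tuple[int, int]]:
        bounds = []
        start = 0
        step = lines_per_chunk - overlap_lines
        if step <= 0:
            step = 1  # same failsafe as the patch: never loop forever
        while start < total_lines:
            end = min(start + lines_per_chunk, total_lines)
            bounds.append((start, end))
            if end >= total_lines:
                break
            start += step
        return bounds

    # window_bounds(120) -> [(0, 50), (40, 90), (80, 120)]
    # The step clamp also guards the case overlap_lines >= lines_per_chunk, which
    # could previously stop the window from advancing.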
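Note on result fusion: the scores produced by applyRRFFusion follow the formula quoted in its doc comment, score(d) = Σ weight_source / (k + rank_source(d)), with 0-based ranks and k = 60. A small hand-checkable sketch of the same arithmetic in Python (the backend names and weights below are made-up illustrative values, not CodexLens defaults):

    from typing import Dict, List

    def rrf_scores(ranked_lists: Dict[str, List[str]], weights: Dict[str, float], k: int = 60) -> Dict[str, float]:
        """Reciprocal Rank Fusion: each source contributes weight / (k + rank + 1)."""
        scores: Dict[str, float] = {}
        for source, paths in ranked_lists.items():
            w = weights.get(source, 0.0)
            for rank, path in enumerate(paths):
                scores[path] = scores.get(path, 0.0) + w / (k + rank + 1)
        return scores

    # A file returned by both backends roughly doubles its score versus a file
    # seen by only one of them:
    # rrf_scores({"vector": ["a.py", "b.py"], "exact": ["a.py", "c.py"]},
    #            {"vector": 1.0, "exact": 1.0})
    # -> {"a.py": 0.0328, "b.py": 0.0161, "c.py": 0.0161} (approximately)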