feat: Enhance LiteLLM integration and CLI management

- Added token estimation and token-aware batching in LiteLLMEmbedder so large text inputs are split across multiple embedding API calls.
- Updated the embed method to accept a max_tokens_per_batch parameter (default 30000) for finer control over API call size (usage sketch below).
- Introduced new API routes for managing custom CLI endpoints, including GET, POST, PUT, and DELETE methods.
- Enhanced CLI history component to support source directory context for native session content.
- Improved error handling and logging in various components for better debugging and user feedback.
- Added English and Chinese i18n strings for the new API endpoint features.
- Updated the CodexLens embeddings generation command to support concurrent API calls via a --max-workers option.
- Enhanced embedding manager to track model information and handle embeddings generation more robustly.
- Added entry points for CLI commands in the package configuration.
author catlog22
date 2025-12-24 18:01:26 +08:00
parent dfca4d60ee
commit e3e61bcae9
13 changed files with 575 additions and 107 deletions
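
As a quick illustration of the first two bullets, a minimal usage sketch of the batched embed call (not part of the commit): the embedder construction is elided because its configuration is not shown in this diff, and make_embedder is a hypothetical helper.

# Minimal sketch, assuming an already-configured LiteLLMEmbedder instance;
# make_embedder() is a hypothetical placeholder, only embed()'s signature
# comes from the diff below.
embedder = make_embedder()
vectors = embedder.embed(
    ["def foo(): ...", "class Bar: ..."],
    max_tokens_per_batch=30000,   # new: cap on estimated tokens per API call
)
print(vectors.shape)              # (2, embedder.dimensions)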

View File

@@ -1,8 +1,11 @@
README.md
pyproject.toml
src/ccw_litellm/__init__.py
src/ccw_litellm/cli.py
src/ccw_litellm.egg-info/PKG-INFO
src/ccw_litellm.egg-info/SOURCES.txt
src/ccw_litellm.egg-info/dependency_links.txt
src/ccw_litellm.egg-info/entry_points.txt
src/ccw_litellm.egg-info/requires.txt
src/ccw_litellm.egg-info/top_level.txt
src/ccw_litellm/clients/__init__.py

View File

@@ -0,0 +1,2 @@
[console_scripts]
ccw-litellm = ccw_litellm.cli:main

View File

@@ -102,18 +102,75 @@ class LiteLLMEmbedder(AbstractEmbedder):
"""Embedding vector size."""
return self._model_config.dimensions
def _estimate_tokens(self, text: str) -> int:
"""Estimate token count for a text using fast heuristic.
Args:
text: Text to estimate tokens for
Returns:
Estimated token count (len/4 is a reasonable approximation)
"""
return len(text) // 4
def _create_batches(
self,
texts: list[str],
max_tokens: int = 30000
) -> list[list[str]]:
"""Split texts into batches that fit within token limits.
Args:
texts: List of texts to batch
max_tokens: Maximum tokens per batch (default: 30000, safe margin for 40960 limit)
Returns:
List of text batches
"""
batches = []
current_batch = []
current_tokens = 0
for text in texts:
text_tokens = self._estimate_tokens(text)
# If single text exceeds limit, truncate it
if text_tokens > max_tokens:
logger.warning(f"Text with {text_tokens} estimated tokens exceeds limit, truncating")
# Truncate to fit (rough estimate: 4 chars per token)
max_chars = max_tokens * 4
text = text[:max_chars]
text_tokens = self._estimate_tokens(text)
# Start new batch if current would exceed limit
if current_tokens + text_tokens > max_tokens and current_batch:
batches.append(current_batch)
current_batch = []
current_tokens = 0
current_batch.append(text)
current_tokens += text_tokens
# Add final batch
if current_batch:
batches.append(current_batch)
return batches
def embed(
self,
texts: str | Sequence[str],
*,
batch_size: int | None = None,
max_tokens_per_batch: int = 30000,
**kwargs: Any,
) -> NDArray[np.floating]:
"""Embed one or more texts.
Args:
texts: Single text or sequence of texts
batch_size: Batch size for processing (currently unused, LiteLLM handles batching)
batch_size: Batch size for processing (deprecated, use max_tokens_per_batch)
max_tokens_per_batch: Maximum estimated tokens per API call (default: 30000)
**kwargs: Additional arguments for litellm.embedding()
Returns:
@@ -125,10 +182,8 @@ class LiteLLMEmbedder(AbstractEmbedder):
# Normalize input to list
if isinstance(texts, str):
text_list = [texts]
single_input = True
else:
text_list = list(texts)
single_input = False
if not text_list:
# Return empty array with correct shape
@@ -137,36 +192,53 @@ class LiteLLMEmbedder(AbstractEmbedder):
# Merge kwargs
embedding_kwargs = {**self._litellm_kwargs, **kwargs}
try:
# For OpenAI-compatible endpoints, ensure encoding_format is set
if self._provider_config.api_base and "encoding_format" not in embedding_kwargs:
embedding_kwargs["encoding_format"] = "float"
# For OpenAI-compatible endpoints, ensure encoding_format is set
if self._provider_config.api_base and "encoding_format" not in embedding_kwargs:
embedding_kwargs["encoding_format"] = "float"
# Call LiteLLM embedding
response = litellm.embedding(
model=self._format_model_name(),
input=text_list,
**embedding_kwargs,
)
# Split into token-aware batches
batches = self._create_batches(text_list, max_tokens_per_batch)
# Extract embeddings
embeddings = [item["embedding"] for item in response.data]
if len(batches) > 1:
logger.info(f"Split {len(text_list)} texts into {len(batches)} batches for embedding")
# Convert to numpy array
result = np.array(embeddings, dtype=np.float32)
all_embeddings = []
# Validate dimensions
if result.shape[1] != self.dimensions:
logger.warning(
f"Expected {self.dimensions} dimensions, got {result.shape[1]}. "
f"Configuration may be incorrect."
for batch_idx, batch in enumerate(batches):
try:
# Build call kwargs with explicit api_base
call_kwargs = {**embedding_kwargs}
if self._provider_config.api_base:
call_kwargs["api_base"] = self._provider_config.api_base
if self._provider_config.api_key:
call_kwargs["api_key"] = self._provider_config.api_key
# Call LiteLLM embedding for this batch
response = litellm.embedding(
model=self._format_model_name(),
input=batch,
**call_kwargs,
)
return result
# Extract embeddings
batch_embeddings = [item["embedding"] for item in response.data]
all_embeddings.extend(batch_embeddings)
except Exception as e:
logger.error(f"LiteLLM embedding failed: {e}")
raise
except Exception as e:
logger.error(f"LiteLLM embedding failed for batch {batch_idx + 1}/{len(batches)}: {e}")
raise
# Convert to numpy array
result = np.array(all_embeddings, dtype=np.float32)
# Validate dimensions
if result.shape[1] != self.dimensions:
logger.warning(
f"Expected {self.dimensions} dimensions, got {result.shape[1]}. "
f"Configuration may be incorrect."
)
return result
@property
def model_name(self) -> str:
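
A back-of-the-envelope check of the len // 4 heuristic and the 30000-token default above (the chunk sizes are made up; 40960 is the limit mentioned in the docstring):

# Rough arithmetic only; no API calls involved.
texts = ["x" * 2000] * 100                  # 100 chunks of ~2,000 chars each
per_chunk = [len(t) // 4 for t in texts]    # ~500 estimated tokens per chunk
total = sum(per_chunk)                      # 50,000 estimated tokens overall
api_calls = -(-total // 30000)              # ceil division -> 2 batched calls
print(total, api_calls)                     # 50000 2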

View File

@@ -38,7 +38,9 @@ import {
saveClaudeCliTools,
updateClaudeToolEnabled,
updateClaudeCacheSettings,
getClaudeCliToolsInfo
getClaudeCliToolsInfo,
addClaudeCustomEndpoint,
removeClaudeCustomEndpoint
} from '../../tools/claude-cli-tools.js';
export interface RouteContext {
@@ -211,6 +213,93 @@ export async function handleCliRoutes(ctx: RouteContext): Promise<boolean> {
}
}
// API: Get all custom endpoints
if (pathname === '/api/cli/endpoints' && req.method === 'GET') {
try {
const config = loadClaudeCliTools(initialPath);
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ endpoints: config.customEndpoints || [] }));
} catch (err) {
res.writeHead(500, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: (err as Error).message }));
}
return true;
}
// API: Add/Update custom endpoint
if (pathname === '/api/cli/endpoints' && req.method === 'POST') {
handlePostRequest(req, res, async (body: unknown) => {
try {
const { id, name, enabled } = body as { id: string; name: string; enabled: boolean };
if (!id || !name) {
return { error: 'id and name are required', status: 400 };
}
const config = addClaudeCustomEndpoint(initialPath, { id, name, enabled: enabled !== false });
broadcastToClients({
type: 'CLI_ENDPOINT_UPDATED',
payload: { endpoint: { id, name, enabled }, timestamp: new Date().toISOString() }
});
return { success: true, endpoints: config.customEndpoints };
} catch (err) {
return { error: (err as Error).message, status: 500 };
}
});
return true;
}
// API: Update custom endpoint enabled status
if (pathname.match(/^\/api\/cli\/endpoints\/[^/]+$/) && req.method === 'PUT') {
const endpointId = pathname.split('/').pop() || '';
handlePostRequest(req, res, async (body: unknown) => {
try {
const { enabled, name } = body as { enabled?: boolean; name?: string };
const config = loadClaudeCliTools(initialPath);
const endpoint = config.customEndpoints.find(e => e.id === endpointId);
if (!endpoint) {
return { error: 'Endpoint not found', status: 404 };
}
if (typeof enabled === 'boolean') endpoint.enabled = enabled;
if (name) endpoint.name = name;
saveClaudeCliTools(initialPath, config);
broadcastToClients({
type: 'CLI_ENDPOINT_UPDATED',
payload: { endpoint, timestamp: new Date().toISOString() }
});
return { success: true, endpoint };
} catch (err) {
return { error: (err as Error).message, status: 500 };
}
});
return true;
}
// API: Delete custom endpoint
if (pathname.match(/^\/api\/cli\/endpoints\/[^/]+$/) && req.method === 'DELETE') {
const endpointId = pathname.split('/').pop() || '';
try {
const config = removeClaudeCustomEndpoint(initialPath, endpointId);
broadcastToClients({
type: 'CLI_ENDPOINT_DELETED',
payload: { endpointId, timestamp: new Date().toISOString() }
});
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ success: true, endpoints: config.customEndpoints }));
} catch (err) {
res.writeHead(500, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ error: (err as Error).message }));
}
return true;
}
// API: CLI Execution History
if (pathname === '/api/cli/history') {
const projectPath = url.searchParams.get('path') || initialPath;
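
A minimal sketch of driving the new /api/cli/endpoints routes from Python's standard library; the base URL and port are assumptions, while the paths, methods, and payload fields come from the handlers above.

# Sketch only: BASE is a hypothetical dev-server address.
import json
import urllib.request

BASE = "http://localhost:3000"

def call(method, path, body=None):
    data = json.dumps(body).encode() if body is not None else None
    req = urllib.request.Request(
        BASE + path, data=data, method=method,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

call("POST", "/api/cli/endpoints", {"id": "my-endpoint", "name": "My Endpoint", "enabled": True})
call("PUT", "/api/cli/endpoints/my-endpoint", {"enabled": False})
print(call("GET", "/api/cli/endpoints")["endpoints"])
call("DELETE", "/api/cli/endpoints/my-endpoint")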

View File

@@ -529,27 +529,38 @@ export async function handleLiteLLMApiRoutes(ctx: RouteContext): Promise<boolean
// GET /api/litellm-api/ccw-litellm/status - Check ccw-litellm installation status
if (pathname === '/api/litellm-api/ccw-litellm/status' && req.method === 'GET') {
try {
const { spawn } = await import('child_process');
const result = await new Promise<{ installed: boolean; version?: string }>((resolve) => {
const proc = spawn('python', ['-c', 'import ccw_litellm; print(ccw_litellm.__version__ if hasattr(ccw_litellm, "__version__") else "installed")'], {
shell: true,
timeout: 10000
});
const { execSync } = await import('child_process');
let output = '';
proc.stdout?.on('data', (data) => { output += data.toString(); });
proc.on('close', (code) => {
if (code === 0) {
resolve({ installed: true, version: output.trim() || 'unknown' });
} else {
resolve({ installed: false });
// Try multiple Python executables
const pythonExecutables = ['python', 'python3', 'py'];
// Use single quotes inside Python code for Windows compatibility
const pythonCode = "import ccw_litellm; print(getattr(ccw_litellm, '__version__', 'installed'))";
let installed = false;
let version = '';
let lastError = '';
for (const pythonExe of pythonExecutables) {
try {
const output = execSync(`${pythonExe} -c "${pythonCode}"`, {
encoding: 'utf-8',
timeout: 10000,
windowsHide: true
});
version = output.trim();
if (version) {
installed = true;
console.log(`[ccw-litellm status] Found with ${pythonExe}: ${version}`);
break;
}
});
proc.on('error', () => resolve({ installed: false }));
});
} catch (err) {
lastError = (err as Error).message;
console.log(`[ccw-litellm status] ${pythonExe} failed:`, lastError.substring(0, 100));
}
}
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify(result));
res.end(JSON.stringify(installed ? { installed: true, version } : { installed: false, error: lastError }));
} catch (err) {
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ installed: false, error: (err as Error).message }));
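
Roughly the same probe expressed with Python's subprocess module, for reference; the interpreter names and the import check mirror the route above, the rest is a sketch.

# Try several interpreter names and report the first that can import ccw_litellm.
import subprocess

CODE = "import ccw_litellm; print(getattr(ccw_litellm, '__version__', 'installed'))"

def probe():
    for exe in ("python", "python3", "py"):
        try:
            out = subprocess.run([exe, "-c", CODE], capture_output=True,
                                 text=True, timeout=10)
        except (OSError, subprocess.TimeoutExpired):
            continue
        if out.returncode == 0 and out.stdout.strip():
            return {"installed": True, "version": out.stdout.strip()}
    return {"installed": False}

print(probe())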

View File

@@ -33,9 +33,13 @@ async function loadCliHistory(options = {}) {
}
// Load native session content for a specific execution
async function loadNativeSessionContent(executionId) {
async function loadNativeSessionContent(executionId, sourceDir) {
try {
const url = `/api/cli/native-session?path=${encodeURIComponent(projectPath)}&id=${encodeURIComponent(executionId)}`;
// If sourceDir provided, use it to build the correct path
const basePath = sourceDir && sourceDir !== '.'
? projectPath + '/' + sourceDir
: projectPath;
const url = `/api/cli/native-session?path=${encodeURIComponent(basePath)}&id=${encodeURIComponent(executionId)}`;
const response = await fetch(url);
if (!response.ok) return null;
return await response.json();
@@ -133,9 +137,12 @@ function renderCliHistory() {
</span>`
: '';
// Escape sourceDir for use in onclick
const sourceDirEscaped = exec.sourceDir ? exec.sourceDir.replace(/'/g, "\\'") : '';
return `
<div class="cli-history-item ${hasNative ? 'has-native' : ''}">
<div class="cli-history-item-content" onclick="showExecutionDetail('${exec.id}')">
<div class="cli-history-item-content" onclick="showExecutionDetail('${exec.id}', '${sourceDirEscaped}')">
<div class="cli-history-item-header">
<span class="cli-tool-tag cli-tool-${exec.tool}">${exec.tool.toUpperCase()}</span>
<span class="cli-mode-tag">${exec.mode || 'analysis'}</span>
@@ -154,14 +161,14 @@ function renderCliHistory() {
</div>
<div class="cli-history-actions">
${hasNative ? `
<button class="btn-icon" onclick="event.stopPropagation(); showNativeSessionDetail('${exec.id}')" title="View Native Session">
<button class="btn-icon" onclick="event.stopPropagation(); showNativeSessionDetail('${exec.id}', '${sourceDirEscaped}')" title="View Native Session">
<i data-lucide="file-json" class="w-3.5 h-3.5"></i>
</button>
` : ''}
<button class="btn-icon" onclick="event.stopPropagation(); showExecutionDetail('${exec.id}')" title="View Details">
<button class="btn-icon" onclick="event.stopPropagation(); showExecutionDetail('${exec.id}', '${sourceDirEscaped}')" title="View Details">
<i data-lucide="eye" class="w-3.5 h-3.5"></i>
</button>
<button class="btn-icon btn-danger" onclick="event.stopPropagation(); confirmDeleteExecution('${exec.id}')" title="Delete">
<button class="btn-icon btn-danger" onclick="event.stopPropagation(); confirmDeleteExecution('${exec.id}', '${sourceDirEscaped}')" title="Delete">
<i data-lucide="trash-2" class="w-3.5 h-3.5"></i>
</button>
</div>
@@ -650,9 +657,9 @@ async function copyConcatenatedPrompt(executionId) {
/**
* Show native session detail modal with full conversation content
*/
async function showNativeSessionDetail(executionId) {
async function showNativeSessionDetail(executionId, sourceDir) {
// Load native session content
const nativeSession = await loadNativeSessionContent(executionId);
const nativeSession = await loadNativeSessionContent(executionId, sourceDir);
if (!nativeSession) {
showRefreshToast('Native session not found', 'error');

View File

@@ -228,6 +228,11 @@ const i18n = {
'cli.codexLensDescFull': 'Full-text code search engine',
'cli.semanticDesc': 'AI-powered code understanding',
'cli.semanticDescFull': 'Natural language code search',
'cli.apiEndpoints': 'API Endpoints',
'cli.configured': 'configured',
'cli.addToCli': 'Add to CLI',
'cli.enabled': 'Enabled',
'cli.disabled': 'Disabled',
// CodexLens Configuration
'codexlens.config': 'CodexLens Configuration',
@@ -378,6 +383,8 @@ const i18n = {
'codexlens.indexComplete': 'Index complete',
'codexlens.indexSuccess': 'Index created successfully',
'codexlens.indexFailed': 'Indexing failed',
'codexlens.embeddingsFailed': 'Embeddings generation failed',
'codexlens.ftsSuccessEmbeddingsFailed': 'FTS index created, but embeddings failed',
// CodexLens Install
'codexlens.installDesc': 'Python-based code indexing engine',
@@ -1880,6 +1887,11 @@ const i18n = {
'cli.codexLensDescFull': '全文代码搜索引擎',
'cli.semanticDesc': 'AI 驱动的代码理解',
'cli.semanticDescFull': '自然语言代码搜索',
'cli.apiEndpoints': 'API 端点',
'cli.configured': '已配置',
'cli.addToCli': '添加到 CLI',
'cli.enabled': '已启用',
'cli.disabled': '已禁用',
// CodexLens 配置
'codexlens.config': 'CodexLens 配置',
@@ -2031,6 +2043,8 @@ const i18n = {
'codexlens.indexComplete': '索引完成',
'codexlens.indexSuccess': '索引创建成功',
'codexlens.indexFailed': '索引失败',
'codexlens.embeddingsFailed': '嵌入生成失败',
'codexlens.ftsSuccessEmbeddingsFailed': 'FTS 索引已创建,但嵌入生成失败',
// CodexLens 安装
'codexlens.installDesc': '基于 Python 的代码索引引擎',

View File

@@ -2739,8 +2739,11 @@ function toggleKeyVisibility(btn) {
*/
async function checkCcwLitellmStatus() {
try {
console.log('[API Settings] Checking ccw-litellm status...');
var response = await fetch('/api/litellm-api/ccw-litellm/status');
console.log('[API Settings] Status response:', response.status);
var status = await response.json();
console.log('[API Settings] ccw-litellm status:', status);
window.ccwLitellmStatus = status;
return status;
} catch (e) {

View File

@@ -59,6 +59,91 @@ async function loadCcwEndpointTools() {
}
}
// ========== LiteLLM API Endpoints ==========
var litellmApiEndpoints = [];
var cliCustomEndpoints = [];
async function loadLitellmApiEndpoints() {
try {
var response = await fetch('/api/litellm-api/config');
if (!response.ok) throw new Error('Failed to load LiteLLM endpoints');
var data = await response.json();
litellmApiEndpoints = data.endpoints || [];
window.litellmApiConfig = data;
return litellmApiEndpoints;
} catch (err) {
console.error('Failed to load LiteLLM endpoints:', err);
litellmApiEndpoints = [];
return [];
}
}
async function loadCliCustomEndpoints() {
try {
var response = await fetch('/api/cli/endpoints');
if (!response.ok) throw new Error('Failed to load CLI custom endpoints');
var data = await response.json();
cliCustomEndpoints = data.endpoints || [];
return cliCustomEndpoints;
} catch (err) {
console.error('Failed to load CLI custom endpoints:', err);
cliCustomEndpoints = [];
return [];
}
}
async function toggleEndpointEnabled(endpointId, enabled) {
try {
var response = await fetch('/api/cli/endpoints/' + endpointId, {
method: 'PUT',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ enabled: enabled })
});
if (!response.ok) throw new Error('Failed to update endpoint');
var data = await response.json();
if (data.success) {
// Update local state
var idx = cliCustomEndpoints.findIndex(function(e) { return e.id === endpointId; });
if (idx >= 0) {
cliCustomEndpoints[idx].enabled = enabled;
}
showRefreshToast((enabled ? 'Enabled' : 'Disabled') + ' endpoint: ' + endpointId, 'success');
}
return data;
} catch (err) {
showRefreshToast('Failed to update endpoint: ' + err.message, 'error');
throw err;
}
}
async function syncEndpointToCliTools(endpoint) {
try {
var response = await fetch('/api/cli/endpoints', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
id: endpoint.id,
name: endpoint.name,
enabled: true
})
});
if (!response.ok) throw new Error('Failed to sync endpoint');
var data = await response.json();
if (data.success) {
cliCustomEndpoints = data.endpoints;
showRefreshToast('Endpoint synced to CLI tools: ' + endpoint.id, 'success');
renderToolsSection();
}
return data;
} catch (err) {
showRefreshToast('Failed to sync endpoint: ' + err.message, 'error');
throw err;
}
}
window.toggleEndpointEnabled = toggleEndpointEnabled;
window.syncEndpointToCliTools = syncEndpointToCliTools;
// ========== CLI Tool Configuration ==========
async function loadCliToolConfig() {
try {
@@ -322,7 +407,9 @@ async function renderCliManager() {
loadCliToolStatus(),
loadCodexLensStatus(),
loadCcwInstallations(),
loadCcwEndpointTools()
loadCcwEndpointTools(),
loadLitellmApiEndpoints(),
loadCliCustomEndpoints()
]);
container.innerHTML = '<div class="status-manager">' +
@@ -487,6 +574,51 @@ function renderToolsSection() {
'</div>';
}
// API Endpoints section
var apiEndpointsHtml = '';
if (litellmApiEndpoints.length > 0) {
var endpointItems = litellmApiEndpoints.map(function(endpoint) {
// Check if endpoint is synced to CLI tools
var cliEndpoint = cliCustomEndpoints.find(function(e) { return e.id === endpoint.id; });
var isSynced = !!cliEndpoint;
var isEnabled = cliEndpoint ? cliEndpoint.enabled : false;
// Find provider info
var provider = (window.litellmApiConfig?.providers || []).find(function(p) { return p.id === endpoint.providerId; });
var providerName = provider ? provider.name : endpoint.providerId;
return '<div class="tool-item ' + (isSynced && isEnabled ? 'available' : 'unavailable') + '">' +
'<div class="tool-item-left">' +
'<span class="tool-status-dot ' + (isSynced && isEnabled ? 'status-available' : 'status-unavailable') + '"></span>' +
'<div class="tool-item-info">' +
'<div class="tool-item-name">' + endpoint.id + ' <span class="tool-type-badge">API</span></div>' +
'<div class="tool-item-desc">' + endpoint.model + ' (' + providerName + ')</div>' +
'</div>' +
'</div>' +
'<div class="tool-item-right">' +
(isSynced
? '<label class="toggle-switch" onclick="event.stopPropagation()">' +
'<input type="checkbox" ' + (isEnabled ? 'checked' : '') + ' onchange="toggleEndpointEnabled(\'' + endpoint.id + '\', this.checked); renderToolsSection();">' +
'<span class="toggle-slider"></span>' +
'</label>'
: '<button class="btn-sm btn-primary" onclick="event.stopPropagation(); syncEndpointToCliTools({id: \'' + endpoint.id + '\', name: \'' + endpoint.name + '\'})">' +
'<i data-lucide="plus" class="w-3 h-3"></i> ' + (t('cli.addToCli') || 'Add to CLI') +
'</button>') +
'</div>' +
'</div>';
}).join('');
apiEndpointsHtml = '<div class="tools-subsection" style="margin-top: 1rem; padding-top: 1rem; border-top: 1px solid var(--border);">' +
'<div class="section-header-left" style="margin-bottom: 0.5rem;">' +
'<h4 style="font-size: 0.875rem; font-weight: 600; display: flex; align-items: center; gap: 0.5rem;">' +
'<i data-lucide="cloud" class="w-4 h-4"></i> ' + (t('cli.apiEndpoints') || 'API Endpoints') +
'</h4>' +
'<span class="section-count">' + litellmApiEndpoints.length + ' ' + (t('cli.configured') || 'configured') + '</span>' +
'</div>' +
'<div class="tools-list">' + endpointItems + '</div>' +
'</div>';
}
container.innerHTML = '<div class="section-header">' +
'<div class="section-header-left">' +
'<h3><i data-lucide="terminal" class="w-4 h-4"></i> ' + t('cli.tools') + '</h3>' +
@@ -500,7 +632,8 @@ function renderToolsSection() {
toolsHtml +
codexLensHtml +
semanticHtml +
'</div>';
'</div>' +
apiEndpointsHtml;
if (window.lucide) lucide.createIcons();
}

View File

@@ -383,7 +383,7 @@ async function loadSemanticDepsStatus() {
acceleratorIcon = 'zap';
acceleratorClass = 'bg-green-500/20 text-green-600';
} else if (accelerator === 'DirectML') {
acceleratorIcon = 'gpu-card';
acceleratorIcon = 'cpu';
acceleratorClass = 'bg-blue-500/20 text-blue-600';
} else if (accelerator === 'ROCm') {
acceleratorIcon = 'flame';
@@ -450,7 +450,7 @@ function buildGpuModeSelector(gpuInfo) {
id: 'directml',
label: 'DirectML',
desc: t('codexlens.directmlModeDesc') || 'Windows GPU (NVIDIA/AMD/Intel)',
icon: 'gpu-card',
icon: 'cpu',
available: gpuInfo.available.includes('directml'),
recommended: gpuInfo.mode === 'directml'
},
@@ -1331,7 +1331,15 @@ async function startCodexLensIndexing(indexType, embeddingModel, embeddingBacken
// Check if completed successfully (WebSocket might have already reported)
if (result.success) {
handleIndexComplete(true, t('codexlens.indexComplete'));
// For vector index, check if embeddings were actually generated
var embeddingsResult = result.result && result.result.embeddings;
if (indexType === 'vector' && embeddingsResult && !embeddingsResult.generated) {
// FTS succeeded but embeddings failed - show partial success
var errorMsg = embeddingsResult.error || t('codexlens.embeddingsFailed');
handleIndexComplete(false, t('codexlens.ftsSuccessEmbeddingsFailed') || 'FTS index created, but embeddings failed: ' + errorMsg);
} else {
handleIndexComplete(true, t('codexlens.indexComplete'));
}
} else if (!result.success) {
handleIndexComplete(false, result.error || t('common.unknownError'));
}
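
Written out as standalone logic, the partial-success handling above amounts to the following sketch; the result shape (result.result.embeddings.generated / .error) is inferred from this handler and should be treated as an assumption.

# FTS index built but embeddings not generated is reported as a failure
# with a specific message rather than as a full success.
def classify_index_result(index_type, result):
    embeddings = (result.get("result") or {}).get("embeddings")
    if result.get("success"):
        if index_type == "vector" and embeddings and not embeddings.get("generated"):
            return "partial", embeddings.get("error") or "Embeddings generation failed"
        return "ok", None
    return "failed", result.get("error") or "Unknown error"

print(classify_index_result("vector", {"success": True, "result": {"embeddings": {"generated": False}}}))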

View File

@@ -275,11 +275,22 @@ interface SearchResult {
message?: string;
}
interface ModelInfo {
model_profile?: string;
model_name?: string;
embedding_dim?: number;
backend?: string;
created_at?: string;
updated_at?: string;
}
interface IndexStatus {
indexed: boolean;
has_embeddings: boolean;
file_count?: number;
embeddings_coverage_percent?: number;
total_chunks?: number;
model_info?: ModelInfo;
warning?: string;
}
@@ -320,6 +331,18 @@ async function checkIndexStatus(path: string = '.'): Promise<IndexStatus> {
const embeddingsData = status.embeddings || {};
const embeddingsCoverage = embeddingsData.coverage_percent || 0;
const has_embeddings = embeddingsCoverage >= 50; // Threshold: 50%
const totalChunks = embeddingsData.total_chunks || 0;
// Extract model info if available
const modelInfoData = embeddingsData.model_info;
const modelInfo: ModelInfo | undefined = modelInfoData ? {
model_profile: modelInfoData.model_profile,
model_name: modelInfoData.model_name,
embedding_dim: modelInfoData.embedding_dim,
backend: modelInfoData.backend,
created_at: modelInfoData.created_at,
updated_at: modelInfoData.updated_at,
} : undefined;
let warning: string | undefined;
if (!indexed) {
@@ -335,6 +358,8 @@ async function checkIndexStatus(path: string = '.'): Promise<IndexStatus> {
has_embeddings,
file_count: status.total_files,
embeddings_coverage_percent: embeddingsCoverage,
total_chunks: totalChunks,
model_info: modelInfo,
warning,
};
} catch {
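
For reference, the embeddings status payload these fields are read from has roughly the following shape; the key names come from the code above, the concrete values are invented.

# Illustrative payload only; values are made up.
example_status = {
    "total_files": 1240,
    "embeddings": {
        "coverage_percent": 87.5,
        "total_chunks": 53210,
        "model_info": {
            "model_profile": "default",
            "model_name": "example-embedding-model",
            "embedding_dim": 1024,
            "backend": "litellm",
            "created_at": "2025-12-20T10:00:00Z",
            "updated_at": "2025-12-24T09:30:00Z",
        },
    },
}
has_embeddings = example_status["embeddings"]["coverage_percent"] >= 50  # same 50% threshold as above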

View File

@@ -1831,6 +1831,14 @@ def embeddings_generate(
"-r",
help="Recursively process all _index.db files in directory tree.",
),
max_workers: int = typer.Option(
1,
"--max-workers",
"-w",
min=1,
max=16,
help="Max concurrent API calls. Recommended: 4-8 for litellm backend. Default: 1 (sequential).",
),
json_mode: bool = typer.Option(False, "--json", help="Output JSON response."),
verbose: bool = typer.Option(False, "--verbose", "-v", help="Enable verbose output."),
) -> None:
@@ -1927,7 +1935,10 @@ def embeddings_generate(
else:
console.print(f"Index: [dim]{index_path}[/dim]")
console.print(f"Backend: [cyan]{backend}[/cyan]")
console.print(f"Model: [cyan]{model}[/cyan]\n")
console.print(f"Model: [cyan]{model}[/cyan]")
if max_workers > 1:
console.print(f"Concurrency: [cyan]{max_workers} workers[/cyan]")
console.print()
if use_recursive:
result = generate_embeddings_recursive(
@@ -1937,6 +1948,7 @@ def embeddings_generate(
force=force,
chunk_size=chunk_size,
progress_callback=progress_update,
max_workers=max_workers,
)
else:
result = generate_embeddings(
@@ -1946,6 +1958,7 @@ def embeddings_generate(
force=force,
chunk_size=chunk_size,
progress_callback=progress_update,
max_workers=max_workers,
)
if json_mode:

View File

@@ -7,7 +7,6 @@ import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import islice
from pathlib import Path
from threading import Lock
from typing import Dict, Generator, List, Optional, Tuple
try:
@@ -441,82 +440,133 @@ def generate_embeddings(
batch_number = 0
files_seen = set()
# Thread-safe counters for concurrent processing
counter_lock = Lock()
def process_batch(batch_data: Tuple[int, List[Tuple]]) -> Tuple[int, set, Optional[str]]:
"""Process a single batch: generate embeddings and store.
def compute_embeddings_only(batch_data: Tuple[int, List[Tuple]]):
"""Compute embeddings for a batch (no DB write).
Args:
batch_data: Tuple of (batch_number, chunk_batch)
Returns:
Tuple of (chunks_created, files_in_batch, error_message)
Tuple of (batch_num, chunk_batch, embeddings_numpy, batch_files, error)
"""
batch_num, chunk_batch = batch_data
batch_files = set()
try:
# Track files in this batch
for _, file_path in chunk_batch:
batch_files.add(file_path)
# Generate embeddings
batch_contents = [chunk.content for chunk, _ in chunk_batch]
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
# Store embeddings (thread-safe via SQLite's serialized mode)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
return len(chunk_batch), batch_files, None
return batch_num, chunk_batch, embeddings_numpy, batch_files, None
except Exception as e:
error_msg = f"Batch {batch_num}: {str(e)}"
logger.error(f"Failed to process embedding batch {batch_num}: {str(e)}")
return 0, batch_files, error_msg
logger.error(f"Failed to compute embeddings for batch {batch_num}: {str(e)}")
return batch_num, chunk_batch, None, batch_files, error_msg
# Collect batches for concurrent processing
all_batches = []
for chunk_batch in batch_generator:
batch_number += 1
all_batches.append((batch_number, chunk_batch))
# Process batches (sequential or concurrent based on max_workers)
# Process batches based on max_workers setting
if max_workers <= 1:
# Sequential processing (original behavior)
for batch_num, chunk_batch in all_batches:
chunks_created, batch_files, error = process_batch((batch_num, chunk_batch))
files_seen.update(batch_files)
total_chunks_created += chunks_created
total_files_processed = len(files_seen)
# Sequential processing - stream directly from generator (no pre-materialization)
for chunk_batch in batch_generator:
batch_number += 1
if progress_callback and batch_num % 10 == 0:
progress_callback(f" Batch {batch_num}: {total_chunks_created} chunks, {total_files_processed} files")
# Track files in this batch
batch_files = set()
for _, file_path in chunk_batch:
batch_files.add(file_path)
try:
# Generate embeddings
batch_contents = [chunk.content for chunk, _ in chunk_batch]
embeddings_numpy = embedder.embed_to_numpy(batch_contents, batch_size=EMBEDDING_BATCH_SIZE)
# Store embeddings
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
files_seen.update(batch_files)
total_chunks_created += len(chunk_batch)
total_files_processed = len(files_seen)
if progress_callback and batch_number % 10 == 0:
progress_callback(f" Batch {batch_number}: {total_chunks_created} chunks, {total_files_processed} files")
except Exception as e:
logger.error(f"Failed to process batch {batch_number}: {str(e)}")
files_seen.update(batch_files)
else:
# Concurrent processing for API backends
# Concurrent processing with producer-consumer pattern
# Workers compute embeddings (parallel), main thread writes to DB (serial)
from queue import Queue
from threading import Thread
result_queue = Queue(maxsize=max_workers * 2) # Bounded queue to limit memory
batch_counter = [0] # Mutable counter for producer thread
producer_done = [False]
def producer():
"""Submit batches to executor, put results in queue."""
with ThreadPoolExecutor(max_workers=max_workers) as executor:
pending_futures = []
for chunk_batch in batch_generator:
batch_counter[0] += 1
batch_num = batch_counter[0]
# Submit compute task
future = executor.submit(compute_embeddings_only, (batch_num, chunk_batch))
pending_futures.append(future)
# Check for completed futures and add to queue
for f in list(pending_futures):
if f.done():
try:
result_queue.put(f.result())
except Exception as e:
logger.error(f"Future raised exception: {e}")
pending_futures.remove(f)
# Wait for remaining futures
for future in as_completed(pending_futures):
try:
result_queue.put(future.result())
except Exception as e:
logger.error(f"Future raised exception: {e}")
producer_done[0] = True
result_queue.put(None) # Sentinel to signal completion
# Start producer thread
producer_thread = Thread(target=producer, daemon=True)
producer_thread.start()
if progress_callback:
progress_callback(f"Processing {len(all_batches)} batches with {max_workers} concurrent workers...")
progress_callback(f"Processing with {max_workers} concurrent embedding workers...")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = {executor.submit(process_batch, batch): batch[0] for batch in all_batches}
# Consumer: main thread writes to DB (serial, no contention)
completed = 0
while True:
result = result_queue.get()
if result is None: # Sentinel
break
completed = 0
for future in as_completed(futures):
batch_num = futures[future]
try:
chunks_created, batch_files, error = future.result()
batch_num, chunk_batch, embeddings_numpy, batch_files, error = result
with counter_lock:
files_seen.update(batch_files)
total_chunks_created += chunks_created
total_files_processed = len(files_seen)
completed += 1
if embeddings_numpy is not None and error is None:
# Write to DB in main thread (no contention)
vector_store.add_chunks_batch_numpy(chunk_batch, embeddings_numpy)
total_chunks_created += len(chunk_batch)
if progress_callback and completed % 10 == 0:
progress_callback(f" Completed {completed}/{len(all_batches)} batches: {total_chunks_created} chunks")
files_seen.update(batch_files)
total_files_processed = len(files_seen)
completed += 1
except Exception as e:
logger.error(f"Batch {batch_num} raised exception: {str(e)}")
if progress_callback and completed % 10 == 0:
progress_callback(f" Completed {completed} batches: {total_chunks_created} chunks")
producer_thread.join()
batch_number = batch_counter[0]
# Notify before ANN index finalization (happens when bulk_insert context exits)
if progress_callback:
@@ -718,7 +768,7 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
index_root: Root index directory
Returns:
Aggregated status with coverage statistics
Aggregated status with coverage statistics, model info, and timestamps
"""
index_files = discover_all_index_dbs(index_root)
@@ -734,6 +784,7 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
"coverage_percent": 0.0,
"indexes_with_embeddings": 0,
"indexes_without_embeddings": 0,
"model_info": None,
},
}
@@ -741,6 +792,8 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
files_with_embeddings = 0
total_chunks = 0
indexes_with_embeddings = 0
model_info = None
latest_updated_at = None
for index_path in index_files:
status = check_index_embeddings(index_path)
@@ -752,6 +805,40 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
if result["has_embeddings"]:
indexes_with_embeddings += 1
# Get model config from first index with embeddings (they should all match)
if model_info is None:
try:
from codexlens.semantic.vector_store import VectorStore
with VectorStore(index_path) as vs:
config = vs.get_model_config()
if config:
model_info = {
"model_profile": config.get("model_profile"),
"model_name": config.get("model_name"),
"embedding_dim": config.get("embedding_dim"),
"backend": config.get("backend"),
"created_at": config.get("created_at"),
"updated_at": config.get("updated_at"),
}
latest_updated_at = config.get("updated_at")
except Exception:
pass
else:
# Track the latest updated_at across all indexes
try:
from codexlens.semantic.vector_store import VectorStore
with VectorStore(index_path) as vs:
config = vs.get_model_config()
if config and config.get("updated_at"):
if latest_updated_at is None or config["updated_at"] > latest_updated_at:
latest_updated_at = config["updated_at"]
except Exception:
pass
# Update model_info with latest timestamp
if model_info and latest_updated_at:
model_info["updated_at"] = latest_updated_at
return {
"success": True,
"result": {
@@ -763,6 +850,7 @@ def get_embeddings_status(index_root: Path) -> Dict[str, any]:
"coverage_percent": round((files_with_embeddings / total_files * 100) if total_files > 0 else 0, 1),
"indexes_with_embeddings": indexes_with_embeddings,
"indexes_without_embeddings": len(index_files) - indexes_with_embeddings,
"model_info": model_info,
},
}
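
The concurrency change in generate_embeddings follows a producer-consumer split: worker threads compute embeddings in parallel while the main thread remains the only writer to the SQLite-backed vector store. A stripped-down, self-contained sketch of that pattern, with dummy work standing in for embedder.embed_to_numpy and vector_store.add_chunks_batch_numpy:

# Minimal producer-consumer sketch: parallel compute, single serial writer.
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from threading import Thread

def compute(batch_num, batch):
    # stand-in for the embedding call
    return batch_num, [len(item) for item in batch]

def run(batches, max_workers=4):
    results = Queue(maxsize=max_workers * 2)  # bounded queue to limit memory

    def producer():
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(compute, i, b) for i, b in enumerate(batches, 1)]
            for future in as_completed(futures):
                results.put(future.result())
        results.put(None)  # sentinel: all batches done

    Thread(target=producer, daemon=True).start()

    written = 0
    while True:
        item = results.get()
        if item is None:
            break
        _batch_num, embeddings = item
        written += len(embeddings)  # stand-in for the serial DB write
    return written

print(run([["aa", "bbb"], ["cccc"], ["d", "ee", "fff"]]))  # 6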