/**
 * LiteLLM Executor - Execute LiteLLM endpoints with context caching
 * Integrates with context-cache for file packing and LiteLLM client for API calls
 */
import { getLiteLLMClient, getCodexLensVenvPython } from './litellm-client.js';
import { handler as contextCacheHandler } from './context-cache.js';
import {
  findEndpointById,
  getProviderWithResolvedEnvVars,
} from '../config/litellm-api-config-manager.js';
import type { CustomEndpoint, ProviderCredential } from '../types/litellm-api-config.js';
import type { CliOutputUnit } from './cli-output-converter.js';

export interface LiteLLMExecutionOptions {
  prompt: string;
  endpointId: string; // Custom endpoint ID (e.g., "my-gpt4o")
  baseDir: string; // Project base directory
  cwd?: string; // Working directory for file resolution
  includeDirs?: string[]; // Additional directories for @patterns
  enableCache?: boolean; // Override endpoint cache setting
  onOutput?: (unit: CliOutputUnit) => void;
  /** Number of retries after the initial attempt (default: 0) */
  maxRetries?: number;
  /** Base delay for exponential backoff in milliseconds (default: 1000) */
  retryBaseDelayMs?: number;
}

export interface LiteLLMExecutionResult {
  success: boolean;
  output: string;
  model: string;
  provider: string;
  cacheUsed: boolean;
  cachedFiles?: string[];
  error?: string;
}

/**
 * Extract @patterns from prompt text
 */
export function extractPatterns(prompt: string): string[] {
  // Match @path patterns: @src/**/*.ts, @CLAUDE.md, @../shared/**/*
  const regex = /@([^\s]+)/g;
  const patterns: string[] = [];
  let match;
  while ((match = regex.exec(prompt)) !== null) {
    patterns.push('@' + match[1]);
  }
  return patterns;
}
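// Example (illustrative only; the prompt text is hypothetical):
//   extractPatterns('Review @src/**/*.ts and @CLAUDE.md')
//   // => ['@src/**/*.ts', '@CLAUDE.md']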
/**
 * Execute LiteLLM endpoint with optional context caching
 */
export async function executeLiteLLMEndpoint(
  options: LiteLLMExecutionOptions
): Promise<LiteLLMExecutionResult> {
  const { prompt, endpointId, baseDir, cwd, includeDirs, enableCache, onOutput } = options;

  // 1. Find endpoint configuration
  const endpoint = findEndpointById(baseDir, endpointId);
  if (!endpoint) {
    return {
      success: false,
      output: '',
      model: '',
      provider: '',
      cacheUsed: false,
      error: `Endpoint not found: ${endpointId}`,
    };
  }

  // 2. Get provider with resolved env vars
  const provider = getProviderWithResolvedEnvVars(baseDir, endpoint.providerId);
  if (!provider) {
    return {
      success: false,
      output: '',
      model: '',
      provider: '',
      cacheUsed: false,
      error: `Provider not found: ${endpoint.providerId}`,
    };
  }

  // Verify API key is available
  if (!provider.resolvedApiKey) {
    return {
      success: false,
      output: '',
      model: endpoint.model,
      provider: provider.type,
      cacheUsed: false,
      error: `API key not configured for provider: ${provider.name}`,
    };
  }

  // 3. Process context cache if enabled
  let finalPrompt = prompt;
  let cacheUsed = false;
  let cachedFiles: string[] = [];
  const shouldCache = enableCache ?? endpoint.cacheStrategy.enabled;

  if (shouldCache) {
    const patterns = extractPatterns(prompt);
    if (patterns.length > 0) {
      if (onOutput) {
        onOutput({
          type: 'stderr',
          content: `[Context cache: Found ${patterns.length} @patterns]\n`,
          timestamp: new Date().toISOString(),
        });
      }

      // Pack files into cache
      const packResult = await contextCacheHandler({
        operation: 'pack',
        patterns,
        cwd: cwd || process.cwd(),
        include_dirs: includeDirs,
        ttl: endpoint.cacheStrategy.ttlMinutes * 60 * 1000,
        max_file_size: endpoint.cacheStrategy.maxSizeKB * 1024,
      });

      if (packResult.success && packResult.result) {
        const pack = packResult.result as any;
        if (onOutput) {
          onOutput({
            type: 'stderr',
            content: `[Context cache: Packed ${pack.files_packed} files, ${pack.total_bytes} bytes]\n`,
            timestamp: new Date().toISOString(),
          });
        }

        // Read cached content
        const readResult = await contextCacheHandler({
          operation: 'read',
          session_id: pack.session_id,
          limit: endpoint.cacheStrategy.maxSizeKB * 1024,
        });

        if (readResult.success && readResult.result) {
          const read = readResult.result as any;
          // Prepend cached content to prompt
          finalPrompt = `${read.content}\n\n---\n\n${prompt}`;
          cacheUsed = true;
          cachedFiles = pack.files_packed ? Array(pack.files_packed).fill('...') : [];

          if (onOutput) {
            onOutput({
              type: 'stderr',
              content: `[Context cache: Applied to prompt]\n`,
              timestamp: new Date().toISOString(),
            });
          }
        }
      } else if (packResult.error) {
        if (onOutput) {
          onOutput({
            type: 'stderr',
            content: `[Context cache warning: ${packResult.error}]\n`,
            timestamp: new Date().toISOString(),
          });
        }
      }
    }
  }

  // 4. Call LiteLLM
  try {
    if (onOutput) {
      onOutput({
        type: 'stderr',
        content: `[LiteLLM: Calling ${provider.type}/${endpoint.model}]\n`,
        timestamp: new Date().toISOString(),
      });
    }

    const client = getLiteLLMClient({
      pythonPath: getCodexLensVenvPython(),
      timeout: 120000, // 2 minutes
    });

    // Configure provider credentials via environment
    // LiteLLM uses standard env vars like OPENAI_API_KEY, ANTHROPIC_API_KEY
    const envVarName = getProviderEnvVarName(provider.type);
    if (envVarName) {
      process.env[envVarName] = provider.resolvedApiKey;
    }

    // Set base URL if custom
    if (provider.apiBase) {
      const baseUrlEnvVar = getProviderBaseUrlEnvVarName(provider.type);
      if (baseUrlEnvVar) {
        process.env[baseUrlEnvVar] = provider.apiBase;
      }
    }

    // Set custom headers from provider advanced settings
    if (provider.advancedSettings?.customHeaders) {
      process.env['CCW_LITELLM_EXTRA_HEADERS'] = JSON.stringify(provider.advancedSettings.customHeaders);
    } else {
      // Clear any previous custom headers
      delete process.env['CCW_LITELLM_EXTRA_HEADERS'];
    }

    // Use litellm-client to call chat
    const response = await callWithRetries(
      () => client.chat(finalPrompt, endpoint.model),
      {
        maxRetries: options.maxRetries ?? 0,
        baseDelayMs: options.retryBaseDelayMs ?? 1000,
        onOutput,
        rateLimitKey: `${provider.type}:${endpoint.model}`,
      },
    );

    if (onOutput) {
      onOutput({
        type: 'stdout',
        content: response,
        timestamp: new Date().toISOString(),
      });
    }

    return {
      success: true,
      output: response,
      model: endpoint.model,
      provider: provider.type,
      cacheUsed,
      cachedFiles,
    };
  } catch (error) {
    const errorMsg = (error as Error).message;
    if (onOutput) {
      onOutput({
        type: 'stderr',
        content: `[LiteLLM error: ${errorMsg}]\n`,
        timestamp: new Date().toISOString(),
      });
    }

    return {
      success: false,
      output: '',
      model: endpoint.model,
      provider: provider.type,
      cacheUsed,
      error: errorMsg,
    };
  }
}
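/*
 * Usage sketch (illustrative; "my-gpt4o" echoes the example endpoint ID above,
 * and the prompt/@pattern are hypothetical):
 *
 *   const result = await executeLiteLLMEndpoint({
 *     prompt: 'Summarize the exports in @src/index.ts',
 *     endpointId: 'my-gpt4o',
 *     baseDir: process.cwd(),
 *     maxRetries: 2,
 *     onOutput: (unit) => process.stderr.write(unit.content),
 *   });
 *   if (result.success) {
 *     console.log(result.output);
 *   }
 */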
/**
 * Get environment variable name for provider API key
 */
function getProviderEnvVarName(providerType: string): string | null {
  const envVarMap: Record<string, string> = {
    openai: 'OPENAI_API_KEY',
    anthropic: 'ANTHROPIC_API_KEY',
    google: 'GOOGLE_API_KEY',
    azure: 'AZURE_API_KEY',
    mistral: 'MISTRAL_API_KEY',
    deepseek: 'DEEPSEEK_API_KEY',
  };
  return envVarMap[providerType] || null;
}

/**
 * Get environment variable name for provider base URL
 */
function getProviderBaseUrlEnvVarName(providerType: string): string | null {
  const envVarMap: Record<string, string> = {
    openai: 'OPENAI_API_BASE',
    anthropic: 'ANTHROPIC_API_BASE',
    azure: 'AZURE_API_BASE',
  };
  return envVarMap[providerType] || null;
}

// Per provider/model timestamps used to serialize rate-limited retries
const rateLimitRetryQueueNextAt = new Map<string, number>();

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

function isRateLimitError(errorMessage: string): boolean {
  return /429|rate limit|too many requests/i.test(errorMessage);
}

function isRetryableError(errorMessage: string): boolean {
  // Never retry auth/config errors
  if (/401|403|unauthorized|forbidden/i.test(errorMessage)) {
    return false;
  }
  // Retry rate limits, transient server errors, and network timeouts
  return /(429|500|502|503|504|timeout|timed out|econnreset|enotfound|econnrefused|socket hang up)/i.test(
    errorMessage,
  );
}

async function callWithRetries<T>(
  call: () => Promise<T>,
  options: {
    maxRetries: number;
    baseDelayMs: number;
    onOutput?: (unit: CliOutputUnit) => void;
    rateLimitKey: string;
  },
): Promise<T> {
  const { maxRetries, baseDelayMs, onOutput, rateLimitKey } = options;
  let attempt = 0;

  while (true) {
    try {
      return await call();
    } catch (err) {
      const errorMessage = err instanceof Error ? err.message : String(err);
      if (attempt >= maxRetries || !isRetryableError(errorMessage)) {
        throw err;
      }

      const delayMs = baseDelayMs * 2 ** attempt;
      if (onOutput) {
        onOutput({
          type: 'stderr',
          content: `[LiteLLM retry ${attempt + 1}/${maxRetries}: waiting ${delayMs}ms] ${errorMessage}\n`,
          timestamp: new Date().toISOString(),
        });
      }

      attempt += 1;

      if (isRateLimitError(errorMessage)) {
        // Schedule rate-limited retries per provider/model key so concurrent callers back off in sequence
        const now = Date.now();
        const earliestAt = now + delayMs;
        const queuedAt = rateLimitRetryQueueNextAt.get(rateLimitKey) ?? 0;
        const scheduledAt = Math.max(queuedAt, earliestAt);
        rateLimitRetryQueueNextAt.set(rateLimitKey, scheduledAt + delayMs);
        await sleep(scheduledAt - now);
        continue;
      }

      await sleep(delayMs);
    }
  }
}
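/*
 * Illustrative retry timing (assuming maxRetries: 3 and the default baseDelayMs of 1000):
 * failed attempts wait ~1s, 2s, then 4s before the final error is rethrown.
 * For 429-style errors, concurrent callers sharing the same
 * `${provider.type}:${endpoint.model}` rateLimitKey are additionally staggered via
 * rateLimitRetryQueueNextAt so they do not all retry at the same instant.
 */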