/**
 * Health Check Service
 * Singleton service that periodically checks API key health for providers
 * with health check enabled. Updates key health status and filters unhealthy
 * keys from rotation endpoints.
 */

import {
  getAllProviders,
  getProvider,
  updateProvider,
  resolveEnvVar,
} from '../../config/litellm-api-config-manager.js';
import { testApiKeyConnection, getDefaultApiBase } from './api-key-tester.js';
import type { ProviderCredential, ApiKeyEntry, HealthCheckConfig } from '../../types/litellm-api-config.js';

/**
 * Internal state for tracking consecutive failures per key
 */
interface KeyHealthState {
  /** Number of failed checks since the last successful one */
  consecutiveFailures: number;
  /** While set and in the future, checks for the key are skipped */
  cooldownUntil?: Date;
}

/**
 * Health Check Service - Singleton
 * Manages periodic health checks for API keys across all providers
 */
export class HealthCheckService {
  private static instance: HealthCheckService;

  /** Timer handles for each provider's health check interval */
  private timers: Map<string, ReturnType<typeof setInterval>> = new Map();

  /** Track consecutive failures and cooldown per key (providerId:keyId -> state) */
  private keyStates: Map<string, KeyHealthState> = new Map();

  /** Base directory for config operations (set by startAllHealthChecks) */
  private baseDir: string = '';

  /** Lock to prevent concurrent checks on same provider */
  private checkingProviders: Set<string> = new Set();

  private constructor() {}

  /**
   * Get singleton instance
   * @returns The shared service instance, created on first call
   */
  static getInstance(): HealthCheckService {
    if (!HealthCheckService.instance) {
      HealthCheckService.instance = new HealthCheckService();
    }
    return HealthCheckService.instance;
  }

  /**
   * Start health check for a specific provider.
   * Any existing timer for the provider is stopped first. Nothing is started
   * when the provider is missing, disabled, or has health checks disabled.
   * @param providerId - The provider ID to start health checks for
   */
  startHealthCheck(providerId: string): void {
    // Stop existing timer if any
    this.stopHealthCheck(providerId);

    const provider = getProvider(this.baseDir, providerId);
    if (!provider) {
      console.warn(`[HealthCheck] Provider not found: ${providerId}`);
      return;
    }

    if (!provider.enabled) {
      console.log(`[HealthCheck] Provider ${providerId} is disabled, skipping`);
      return;
    }

    const healthConfig = provider.healthCheck;
    if (!healthConfig || !healthConfig.enabled) {
      console.log(`[HealthCheck] Health check not enabled for provider ${providerId}`);
      return;
    }

    // Fall back to 300 seconds when the interval is unset (or 0), and log the
    // value that is actually used for the timer.
    const intervalSeconds = healthConfig.intervalSeconds || 300;
    const intervalMs = intervalSeconds * 1000;

    console.log(`[HealthCheck] Starting health check for ${provider.name} (${providerId}), interval: ${intervalSeconds}s`);

    // Run initial check immediately
    this.runCheckInBackground(providerId);

    // Set up interval
    const timer = setInterval(() => {
      this.runCheckInBackground(providerId);
    }, intervalMs);

    this.timers.set(providerId, timer);
  }

  /**
   * Stop health check for a specific provider
   * @param providerId - The provider ID to stop health checks for
   */
  stopHealthCheck(providerId: string): void {
    const timer = this.timers.get(providerId);
    if (timer) {
      clearInterval(timer);
      this.timers.delete(providerId);
      console.log(`[HealthCheck] Stopped health check for provider ${providerId}`);
    }
  }

  /**
   * Start health checks for all providers that have health check enabled.
   * Also records baseDir for every later config operation of this service.
   * @param baseDir - Base directory for config operations
   */
  startAllHealthChecks(baseDir: string): void {
    this.baseDir = baseDir;

    const providers = getAllProviders(baseDir);
    let startedCount = 0;

    for (const provider of providers) {
      if (provider.enabled && provider.healthCheck?.enabled) {
        this.startHealthCheck(provider.id);
        startedCount++;
      }
    }

    if (startedCount > 0) {
      console.log(`[HealthCheck] Started health checks for ${startedCount} provider(s)`);
    } else {
      console.log('[HealthCheck] No providers with health check enabled');
    }
  }

  /**
   * Stop all active health checks and forget all per-key failure state
   */
  stopAllHealthChecks(): void {
    // Copy the ids first: stopHealthCheck mutates the timers map.
    const providerIds = Array.from(this.timers.keys());
    for (const providerId of providerIds) {
      this.stopHealthCheck(providerId);
    }
    this.keyStates.clear();
    console.log('[HealthCheck] Stopped all health checks');
  }

  /**
   * Manually trigger a health check for a provider.
   * Returns without checking when a check for the same provider is already
   * running, or when the provider is missing, disabled, or has no health
   * check config. Keys are checked one after another.
   * @param providerId - The provider ID to check
   * @returns Promise that resolves once all keys were checked and statuses persisted
   */
  async checkProviderNow(providerId: string): Promise<void> {
    // Prevent concurrent checks on same provider
    if (this.checkingProviders.has(providerId)) {
      console.log(`[HealthCheck] Already checking provider ${providerId}, skipping`);
      return;
    }

    this.checkingProviders.add(providerId);

    try {
      const provider = getProvider(this.baseDir, providerId);
      if (!provider) {
        console.warn(`[HealthCheck] Provider not found: ${providerId}`);
        return;
      }

      if (!provider.enabled) {
        return;
      }

      const healthConfig = provider.healthCheck;
      if (!healthConfig) {
        return;
      }

      const apiBase = provider.apiBase || getDefaultApiBase(provider.type);
      const apiKeys = provider.apiKeys || [];

      if (apiKeys.length === 0 && provider.apiKey) {
        // Single key mode - create virtual key entry.
        // The result is only logged and tracked in memory: updateKeyHealth
        // finds no 'default' entry in provider.apiKeys to write to.
        await this.checkSingleKey(provider, 'default', provider.apiKey, apiBase, healthConfig);
      } else {
        // Multi-key mode
        for (const keyEntry of apiKeys) {
          if (!keyEntry.enabled) continue;
          await this.checkSingleKey(provider, keyEntry.id, keyEntry.key, apiBase, healthConfig);
        }
      }

      // Persist updated health statuses.
      // NOTE(review): this writes back the apiKeys snapshot read before the
      // awaits above; edits made to the keys while checks were in flight may
      // be overwritten - confirm against updateProvider's merge semantics.
      const updatedApiKeys = provider.apiKeys;
      if (updatedApiKeys && updatedApiKeys.length > 0) {
        try {
          updateProvider(this.baseDir, providerId, { apiKeys: updatedApiKeys });
        } catch (err) {
          console.error(`[HealthCheck] Failed to persist health status for ${providerId}:`, err);
        }
      }
    } finally {
      // Always release the lock, including on early return or error.
      this.checkingProviders.delete(providerId);
    }
  }

  /**
   * Run checkProviderNow as fire-and-forget work for the timer callbacks.
   * A rejection is logged here instead of surfacing as an unhandled promise
   * rejection.
   */
  private runCheckInBackground(providerId: string): void {
    this.checkProviderNow(providerId).catch((err: unknown) => {
      console.error(`[HealthCheck] Health check failed for provider ${providerId}:`, err);
    });
  }

  /**
   * Check a single API key's health.
   * Skips keys that are in cooldown; otherwise tests the key and records the
   * outcome on the provider's key entry and in the in-memory failure state.
   * @param provider - Provider owning the key; its apiKeys entries are mutated in place
   * @param keyId - ID of the key ('default' in single key mode)
   * @param keyValue - Configured key value, resolved through resolveEnvVar
   * @param apiBase - API base URL passed to the connection test
   * @param healthConfig - Thresholds and cooldown settings of the provider
   */
  private async checkSingleKey(
    provider: ProviderCredential,
    keyId: string,
    keyValue: string,
    apiBase: string,
    healthConfig: HealthCheckConfig
  ): Promise<void> {
    const stateKey = `${provider.id}:${keyId}`;
    let state = this.keyStates.get(stateKey);
    if (!state) {
      state = { consecutiveFailures: 0 };
      this.keyStates.set(stateKey, state);
    }

    // Check if in cooldown
    if (state.cooldownUntil && new Date() < state.cooldownUntil) {
      console.log(`[HealthCheck] Key ${keyId} for ${provider.name} is in cooldown until ${state.cooldownUntil.toISOString()}`);
      return;
    }

    // Resolve environment variables
    const resolvedKey = resolveEnvVar(keyValue);
    if (!resolvedKey) {
      console.warn(`[HealthCheck] Key ${keyId} for ${provider.name} has empty value (env var not set?)`);
      this.updateKeyHealth(provider, keyId, 'unhealthy', 'API key is empty or environment variable not set');
      return;
    }

    // Test the key
    const result = await testApiKeyConnection(provider.type, apiBase, resolvedKey);
    const now = new Date().toISOString();

    if (result.valid) {
      // Reset failure count on success
      state.consecutiveFailures = 0;
      state.cooldownUntil = undefined;
      this.updateKeyHealth(provider, keyId, 'healthy', undefined, now, result.latencyMs);
      console.log(`[HealthCheck] Key ${keyId} for ${provider.name}: healthy (${result.latencyMs}ms)`);
    } else {
      // Increment failure count
      state.consecutiveFailures++;

      // NOTE(review): unlike the interval and cooldown, failureThreshold has
      // no default; if it can be unset this comparison is never true and the
      // key is never marked unhealthy - confirm against HealthCheckConfig.
      if (state.consecutiveFailures >= healthConfig.failureThreshold) {
        // Mark as unhealthy and enter cooldown
        const cooldownMs = (healthConfig.cooldownSeconds || 60) * 1000;
        state.cooldownUntil = new Date(Date.now() + cooldownMs);
        this.updateKeyHealth(provider, keyId, 'unhealthy', result.error, now);
        console.warn(`[HealthCheck] Key ${keyId} for ${provider.name}: UNHEALTHY after ${state.consecutiveFailures} failures. Cooldown until ${state.cooldownUntil.toISOString()}`);
      } else {
        // Still unknown/degraded, not yet unhealthy
        this.updateKeyHealth(provider, keyId, 'unknown', result.error, now);
        console.log(`[HealthCheck] Key ${keyId} for ${provider.name}: failed (${state.consecutiveFailures}/${healthConfig.failureThreshold}): ${result.error}`);
      }
    }
  }

  /**
   * Update the health status of a key in the provider's apiKeys array.
   * Does nothing when the provider has no apiKeys or no entry with the given ID.
   * @param provider - Provider whose key entry is mutated in place
   * @param keyId - ID of the key entry to update
   * @param status - New health status
   * @param error - Error text to store; when empty/omitted any stored error is removed
   * @param timestamp - ISO timestamp of the check; defaults to the current time
   * @param latencyMs - Latency to store; the stored value is left untouched when omitted
   */
  private updateKeyHealth(
    provider: ProviderCredential,
    keyId: string,
    status: 'healthy' | 'unhealthy' | 'unknown',
    error?: string,
    timestamp?: string,
    latencyMs?: number
  ): void {
    if (!provider.apiKeys) return;

    const keyEntry = provider.apiKeys.find(k => k.id === keyId);
    if (keyEntry) {
      keyEntry.healthStatus = status;
      keyEntry.lastHealthCheck = timestamp || new Date().toISOString();
      if (error) {
        keyEntry.lastError = error;
      } else {
        delete keyEntry.lastError;
      }
      // Save latency if provided (only on successful checks)
      if (latencyMs !== undefined) {
        keyEntry.lastLatencyMs = latencyMs;
      }
    }
  }

  /**
   * Get the current health status of all keys for a provider.
   * Combines the persisted fields of each key entry with the in-memory
   * failure/cooldown state.
   * @param providerId - The provider ID to report on
   * @returns One entry per key in provider.apiKeys; empty when the provider or its apiKeys is missing
   */
  getProviderHealthStatus(providerId: string): Array<{
    keyId: string;
    status: 'healthy' | 'unhealthy' | 'unknown';
    lastCheck?: string;
    lastError?: string;
    consecutiveFailures: number;
    inCooldown: boolean;
  }> {
    const provider = getProvider(this.baseDir, providerId);
    if (!provider || !provider.apiKeys) return [];

    return provider.apiKeys.map(key => {
      const stateKey = `${providerId}:${key.id}`;
      // Keys that were never checked report zero failures and no cooldown.
      const state: KeyHealthState = this.keyStates.get(stateKey) || { consecutiveFailures: 0 };
      return {
        keyId: key.id,
        status: key.healthStatus || 'unknown',
        lastCheck: key.lastHealthCheck,
        lastError: key.lastError,
        consecutiveFailures: state.consecutiveFailures,
        inCooldown: state.cooldownUntil ? new Date() < state.cooldownUntil : false,
      };
    });
  }

  /**
   * Check if the service is running health checks for any provider
   * @returns true when at least one provider timer is active
   */
  isRunning(): boolean {
    return this.timers.size > 0;
  }

  /**
   * Get list of provider IDs currently being monitored
   * @returns IDs of all providers with an active timer
   */
  getMonitoredProviders(): string[] {
    return Array.from(this.timers.keys());
  }

  /**
   * Clean up all state for a deleted provider
   * Call this when a provider is deleted to prevent memory leaks
   * @param providerId - The provider ID to clean up
   */
  cleanupProvider(providerId: string): void {
    // Stop health check timer
    this.stopHealthCheck(providerId);

    // Remove all key states for this provider
    const keysToRemove: string[] = [];
    for (const key of this.keyStates.keys()) {
      if (key.startsWith(`${providerId}:`)) {
        keysToRemove.push(key);
      }
    }

    for (const key of keysToRemove) {
      this.keyStates.delete(key);
    }

    if (keysToRemove.length > 0) {
      console.log(`[HealthCheck] Cleaned up ${keysToRemove.length} key state(s) for deleted provider ${providerId}`);
    }
  }
}

/**
 * Get the singleton health check service instance
 * @returns The shared HealthCheckService
 */
export function getHealthCheckService(): HealthCheckService {
  return HealthCheckService.getInstance();
}