"""GPU acceleration support for semantic embeddings. This module provides GPU detection, initialization, and fallback handling for ONNX-based embedding generation. """ from __future__ import annotations import logging from dataclasses import dataclass from typing import List, Optional logger = logging.getLogger(__name__) @dataclass class GPUDevice: """Individual GPU device info.""" device_id: int name: str is_discrete: bool # True for discrete GPU (NVIDIA, AMD), False for integrated (Intel UHD) vendor: str # "nvidia", "amd", "intel", "unknown" @dataclass class GPUInfo: """GPU availability and configuration info.""" gpu_available: bool = False cuda_available: bool = False gpu_count: int = 0 gpu_name: Optional[str] = None onnx_providers: List[str] = None devices: List[GPUDevice] = None # List of detected GPU devices preferred_device_id: Optional[int] = None # Preferred GPU for embedding def __post_init__(self): if self.onnx_providers is None: self.onnx_providers = ["CPUExecutionProvider"] if self.devices is None: self.devices = [] _gpu_info_cache: Optional[GPUInfo] = None def _enumerate_gpus() -> List[GPUDevice]: """Enumerate available GPU devices using WMI on Windows. Returns: List of GPUDevice with device info, ordered by device_id. """ devices = [] try: import subprocess import sys if sys.platform == "win32": # Use PowerShell to query GPU information via WMI cmd = [ "powershell", "-NoProfile", "-Command", "Get-WmiObject Win32_VideoController | Select-Object DeviceID, Name, AdapterCompatibility | ConvertTo-Json" ] result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) if result.returncode == 0 and result.stdout.strip(): import json gpu_data = json.loads(result.stdout) # Handle single GPU case (returns dict instead of list) if isinstance(gpu_data, dict): gpu_data = [gpu_data] for idx, gpu in enumerate(gpu_data): name = gpu.get("Name", "Unknown GPU") compat = gpu.get("AdapterCompatibility", "").lower() # Determine vendor name_lower = name.lower() if "nvidia" in name_lower or "nvidia" in compat: vendor = "nvidia" is_discrete = True elif "amd" in name_lower or "radeon" in name_lower or "amd" in compat: vendor = "amd" is_discrete = True elif "intel" in name_lower or "intel" in compat: vendor = "intel" # Intel UHD/Iris are integrated, Intel Arc is discrete is_discrete = "arc" in name_lower else: vendor = "unknown" is_discrete = False devices.append(GPUDevice( device_id=idx, name=name, is_discrete=is_discrete, vendor=vendor )) logger.debug(f"Detected GPU {idx}: {name} (vendor={vendor}, discrete={is_discrete})") except Exception as e: logger.debug(f"GPU enumeration failed: {e}") return devices def _get_preferred_device_id(devices: List[GPUDevice]) -> Optional[int]: """Determine the preferred GPU device_id for embedding. Preference order: 1. NVIDIA discrete GPU (best DirectML/CUDA support) 2. AMD discrete GPU 3. Intel Arc (discrete) 4. Intel integrated (fallback) Returns: device_id of preferred GPU, or None to use default. """ if not devices: return None # Priority: NVIDIA > AMD > Intel Arc > Intel integrated priority_order = [ ("nvidia", True), # NVIDIA discrete ("amd", True), # AMD discrete ("intel", True), # Intel Arc (discrete) ("intel", False), # Intel integrated (fallback) ] for target_vendor, target_discrete in priority_order: for device in devices: if device.vendor == target_vendor and device.is_discrete == target_discrete: logger.info(f"Preferred GPU: {device.name} (device_id={device.device_id})") return device.device_id # If no match, use first device if devices: return devices[0].device_id return None def detect_gpu(force_refresh: bool = False) -> GPUInfo: """Detect available GPU resources for embedding acceleration. Args: force_refresh: If True, re-detect GPU even if cached. Returns: GPUInfo with detection results. """ global _gpu_info_cache if _gpu_info_cache is not None and not force_refresh: return _gpu_info_cache info = GPUInfo() # Enumerate GPU devices first info.devices = _enumerate_gpus() info.gpu_count = len(info.devices) if info.devices: # Set preferred device (discrete GPU preferred over integrated) info.preferred_device_id = _get_preferred_device_id(info.devices) # Set gpu_name to preferred device name for dev in info.devices: if dev.device_id == info.preferred_device_id: info.gpu_name = dev.name break # Check PyTorch CUDA availability (most reliable detection) try: import torch if torch.cuda.is_available(): info.cuda_available = True info.gpu_available = True info.gpu_count = torch.cuda.device_count() if info.gpu_count > 0: info.gpu_name = torch.cuda.get_device_name(0) logger.debug(f"PyTorch CUDA detected: {info.gpu_count} GPU(s)") except ImportError: logger.debug("PyTorch not available for GPU detection") # Check ONNX Runtime providers with validation try: import onnxruntime as ort available_providers = ort.get_available_providers() # Build provider list with priority order providers = [] # Test each provider to ensure it actually works def test_provider(provider_name: str) -> bool: """Test if a provider actually works by creating a dummy session.""" try: # Create a minimal ONNX model to test provider import numpy as np # Simple test: just check if provider can be instantiated sess_options = ort.SessionOptions() sess_options.log_severity_level = 4 # Suppress warnings return True except Exception: return False # CUDA provider (NVIDIA GPU) - check if CUDA runtime is available if "CUDAExecutionProvider" in available_providers: # Verify CUDA is actually usable by checking for cuBLAS cuda_works = False try: import ctypes # Try to load cuBLAS to verify CUDA installation try: ctypes.CDLL("cublas64_12.dll") cuda_works = True except OSError: try: ctypes.CDLL("cublas64_11.dll") cuda_works = True except OSError: pass except Exception: pass if cuda_works: providers.append("CUDAExecutionProvider") info.gpu_available = True logger.debug("ONNX CUDAExecutionProvider available and working") else: logger.debug("ONNX CUDAExecutionProvider listed but CUDA runtime not found") # TensorRT provider (optimized NVIDIA inference) if "TensorrtExecutionProvider" in available_providers: # TensorRT requires additional libraries, skip for now logger.debug("ONNX TensorrtExecutionProvider available (requires TensorRT SDK)") # DirectML provider (Windows GPU - AMD/Intel/NVIDIA) if "DmlExecutionProvider" in available_providers: providers.append("DmlExecutionProvider") info.gpu_available = True logger.debug("ONNX DmlExecutionProvider available (DirectML)") # ROCm provider (AMD GPU on Linux) if "ROCMExecutionProvider" in available_providers: providers.append("ROCMExecutionProvider") info.gpu_available = True logger.debug("ONNX ROCMExecutionProvider available (AMD)") # CoreML provider (Apple Silicon) if "CoreMLExecutionProvider" in available_providers: providers.append("CoreMLExecutionProvider") info.gpu_available = True logger.debug("ONNX CoreMLExecutionProvider available (Apple)") # Always include CPU as fallback providers.append("CPUExecutionProvider") info.onnx_providers = providers except ImportError: logger.debug("ONNX Runtime not available") info.onnx_providers = ["CPUExecutionProvider"] _gpu_info_cache = info return info def get_optimal_providers(use_gpu: bool = True, with_device_options: bool = False) -> list: """Get optimal ONNX execution providers based on availability. Args: use_gpu: If True, include GPU providers when available. If False, force CPU-only execution. with_device_options: If True, return providers as tuples with device_id options for proper GPU device selection (required for DirectML). Returns: List of provider names or tuples (provider_name, options_dict) in priority order. """ if not use_gpu: return ["CPUExecutionProvider"] gpu_info = detect_gpu() # Check if GPU was requested but not available - log warning if not gpu_info.gpu_available: try: import onnxruntime as ort available_providers = ort.get_available_providers() except ImportError: available_providers = [] logger.warning( "GPU acceleration was requested, but no supported GPU provider (CUDA, DirectML) " f"was found. Available providers: {available_providers}. Falling back to CPU." ) else: # Log which GPU provider is being used gpu_providers = [p for p in gpu_info.onnx_providers if p != "CPUExecutionProvider"] if gpu_providers: logger.info(f"Using {gpu_providers[0]} for ONNX GPU acceleration") if not with_device_options: return gpu_info.onnx_providers # Build providers with device_id options for GPU providers device_id = get_selected_device_id() providers = [] for provider in gpu_info.onnx_providers: if provider == "DmlExecutionProvider" and device_id is not None: # DirectML requires device_id in provider_options tuple providers.append(("DmlExecutionProvider", {"device_id": device_id})) logger.debug(f"DmlExecutionProvider configured with device_id={device_id}") elif provider == "CUDAExecutionProvider" and device_id is not None: # CUDA also supports device_id in provider_options providers.append(("CUDAExecutionProvider", {"device_id": device_id})) logger.debug(f"CUDAExecutionProvider configured with device_id={device_id}") elif provider == "ROCMExecutionProvider" and device_id is not None: # ROCm supports device_id providers.append(("ROCMExecutionProvider", {"device_id": device_id})) logger.debug(f"ROCMExecutionProvider configured with device_id={device_id}") else: # CPU and other providers don't need device_id providers.append(provider) return providers def is_gpu_available() -> bool: """Check if any GPU acceleration is available.""" return detect_gpu().gpu_available def get_gpu_summary() -> str: """Get human-readable GPU status summary.""" info = detect_gpu() if not info.gpu_available: return "GPU: Not available (using CPU)" parts = [] if info.gpu_name: parts.append(f"GPU: {info.gpu_name}") if info.gpu_count > 1: parts.append(f"({info.gpu_count} devices)") # Show active providers (excluding CPU fallback) gpu_providers = [p for p in info.onnx_providers if p != "CPUExecutionProvider"] if gpu_providers: parts.append(f"Providers: {', '.join(gpu_providers)}") return " | ".join(parts) if parts else "GPU: Available" def clear_gpu_cache() -> None: """Clear cached GPU detection info.""" global _gpu_info_cache _gpu_info_cache = None # User-selected device ID (overrides auto-detection) _selected_device_id: Optional[int] = None def get_gpu_devices() -> List[dict]: """Get list of available GPU devices for frontend selection. Returns: List of dicts with device info for each GPU. """ info = detect_gpu() devices = [] for dev in info.devices: devices.append({ "device_id": dev.device_id, "name": dev.name, "vendor": dev.vendor, "is_discrete": dev.is_discrete, "is_preferred": dev.device_id == info.preferred_device_id, "is_selected": dev.device_id == get_selected_device_id(), }) return devices def get_selected_device_id() -> Optional[int]: """Get the user-selected GPU device_id. Returns: User-selected device_id, or auto-detected preferred device_id if not set. """ global _selected_device_id if _selected_device_id is not None: return _selected_device_id # Fall back to auto-detected preferred device info = detect_gpu() return info.preferred_device_id def set_selected_device_id(device_id: Optional[int]) -> bool: """Set the GPU device_id to use for embeddings. Args: device_id: GPU device_id to use, or None to use auto-detection. Returns: True if device_id is valid, False otherwise. """ global _selected_device_id if device_id is None: _selected_device_id = None logger.info("GPU selection reset to auto-detection") return True # Validate device_id exists info = detect_gpu() valid_ids = [dev.device_id for dev in info.devices] if device_id in valid_ids: _selected_device_id = device_id device_name = next((dev.name for dev in info.devices if dev.device_id == device_id), "Unknown") logger.info(f"GPU selection set to device {device_id}: {device_name}") return True else: logger.warning(f"Invalid device_id {device_id}. Valid IDs: {valid_ids}") return False