| | """Device detection utilities. |
| | |
| | Provides a robust way to detect available accelerators (CUDA, MPS) and returns |
| | useful metadata that can be used to pick the best compute backend. |
| | |
| | When possible we rely on PyTorch for accurate information, falling back to |
| | ``nvidia-smi`` for a lightweight probe so that we can still inform the user |
| | about available GPUs even when PyTorch is not ready to use them. |
| | """ |

from __future__ import annotations

from dataclasses import dataclass, field
from contextlib import contextmanager
import importlib
import math
import shutil
import subprocess
import warnings
import sys
from typing import List, Optional, Tuple


_GIB = 1024 ** 3

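# Rough VRAM requirements (in GiB of GPU memory) per model size and canonical
# compute type. These are conservative estimates used only to decide whether a
# detected GPU is likely to fit the model; they are not exact measurements.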
_MODEL_VRAM_REQUIREMENTS_GB = {
    "tiny": {"int8": 0.4, "int8_float16": 0.6, "float16": 1.0, "float32": 1.8},
    "tiny.en": {"int8": 0.4, "int8_float16": 0.6, "float16": 1.0, "float32": 1.8},
    "base": {"int8": 0.5, "int8_float16": 0.8, "float16": 1.1, "float32": 2.0},
    "base.en": {"int8": 0.5, "int8_float16": 0.8, "float16": 1.1, "float32": 2.0},
    "small": {"int8": 0.9, "int8_float16": 1.3, "float16": 2.0, "float32": 3.5},
    "small.en": {"int8": 0.9, "int8_float16": 1.3, "float16": 2.0, "float32": 3.5},
    "medium": {"int8": 2.2, "int8_float16": 3.0, "float16": 5.0, "float32": 9.0},
    "medium.en": {"int8": 2.2, "int8_float16": 3.0, "float16": 5.0, "float32": 9.0},
    "large": {"int8": 3.5, "int8_float16": 4.5, "float16": 10.0, "float32": 18.0},
    "large-v1": {"int8": 3.5, "int8_float16": 4.5, "float16": 10.0, "float32": 18.0},
    "large-v2": {"int8": 3.5, "int8_float16": 4.5, "float16": 10.0, "float32": 18.0},
    "large-v3": {"int8": 3.5, "int8_float16": 4.5, "float16": 10.0, "float32": 18.0},
    "large-v3-turbo": {"int8": 2.0, "int8_float16": 2.8, "float16": 6.0, "float32": 10.0},
    "turbo": {"int8": 2.0, "int8_float16": 2.8, "float16": 6.0, "float32": 10.0},
    "distil-large-v2": {"int8": 2.0, "int8_float16": 2.8, "float16": 6.0, "float32": 10.0},
    "distil-medium.en": {"int8": 1.1, "int8_float16": 1.6, "float16": 3.0, "float32": 5.5},
    "distil-small.en": {"int8": 0.5, "int8_float16": 0.8, "float16": 1.5, "float32": 2.5},
}
_DEFAULT_MODEL_VRAM_GB = {
    "int8": 1.0,
    "int8_float16": 1.5,
    "float16": 6.0,
    "float32": 10.0,
}
_FALLBACK_MODEL_VRAM_GB = 6.0

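# Compute types that require an accelerator, and those that rely on half-precision
# (FP16/BF16) hardware support. The two sets currently contain the same members
# but guard different checks below.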
_GPU_ONLY_COMPUTE_TYPES = {"float16", "fp16", "bfloat16", "int8_float16", "int8_bfloat16"}
_FP16_COMPUTE_TYPES = {"float16", "fp16", "bfloat16", "int8_float16", "int8_bfloat16"}

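# Map every accepted compute-type spelling onto the canonical bucket used for
# the VRAM lookup tables above.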
_COMPUTE_CANONICAL = {
    "int8": "int8",
    "int8_float16": "int8_float16",
    "int8_bfloat16": "int8_float16",
    "int8_float32": "float32",
    "float16": "float16",
    "fp16": "float16",
    "bfloat16": "float16",
    "float32": "float32",
    "int16": "float32",
    "default": "float16",
    "auto": "float16",
}

_TORCH_WARNINGS_CONFIGURED = False


@dataclass
class DeviceInfo:
    """Structured information about the selected compute backend."""

    backend: str
    n_gpus: int = 0
    gpu_names: List[str] = field(default_factory=list)
    gpu_vram_bytes: List[int] = field(default_factory=list)
    gpu_capabilities: List[Tuple[int, int]] = field(default_factory=list)
    cuda_version: Optional[str] = None
    driver_version: Optional[str] = None
    messages: List[str] = field(default_factory=list)
    issues: List[str] = field(default_factory=list)
    notes: List[str] = field(default_factory=list)
    resolved_compute_type: Optional[str] = None
    selected_gpu_index: Optional[int] = None

    def primary_gpu_name(self) -> Optional[str]:
        if self.selected_gpu_index is not None and 0 <= self.selected_gpu_index < len(self.gpu_names):
            return self.gpu_names[self.selected_gpu_index]
        return self.gpu_names[0] if self.gpu_names else None


def _format_gib(byte_count: int | float | None) -> str:
    if byte_count is None:
        return "unknown"
    return f"{byte_count / _GIB:.1f} GiB"


def _normalize_model_name(model_name: Optional[str]) -> str:
    if not model_name:
        return ""
    return str(model_name).strip().lower()


def _canonical_compute(compute_type: Optional[str]) -> str:
    key = (compute_type or "float16").lower()
    return _COMPUTE_CANONICAL.get(key, "float16")


def _estimate_required_vram_bytes(model_name: Optional[str], compute_type: Optional[str]) -> int:
    normalized = _normalize_model_name(model_name)
    canonical = _canonical_compute(compute_type)
    model_table = _MODEL_VRAM_REQUIREMENTS_GB.get(normalized)
    if model_table is not None:
        requirement_gb = model_table.get(canonical)
        if requirement_gb is None:
            requirement_gb = _DEFAULT_MODEL_VRAM_GB.get(canonical, _FALLBACK_MODEL_VRAM_GB)
    else:
        requirement_gb = _DEFAULT_MODEL_VRAM_GB.get(canonical, _FALLBACK_MODEL_VRAM_GB)
    return int(math.ceil(requirement_gb * _GIB))


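# Native FP16 arithmetic requires NVIDIA compute capability 5.3 or newer;
# anything older (or unknown) is treated as FP16-incapable.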
def _gpu_supports_fp16(capability: Optional[Tuple[int, int]]) -> bool:
    if capability is None:
        return False
    major, minor = capability
    if major is None or minor is None:
        return False
    return (major > 5) or (major == 5 and minor >= 3)


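# With ``--format=csv,noheader,nounits`` each GPU is reported on one line, e.g.
# (illustrative values): "NVIDIA GeForce RTX 3060, 12288, 535.154.05", where the
# memory.total column is in MiB.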
def _probe_nvidia_smi() -> Optional[DeviceInfo]:
    """Try to query ``nvidia-smi`` for GPU information."""

    if shutil.which("nvidia-smi") is None:
        return None

    try:
        out = subprocess.check_output(
            [
                "nvidia-smi",
                "--query-gpu=name,memory.total,driver_version",
                "--format=csv,noheader,nounits",
            ],
            text=True,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        return None

    lines = [line.strip() for line in out.splitlines() if line.strip()]
    if not lines:
        return None

    gpu_names: List[str] = []
    gpu_vram: List[int] = []
    driver_version: Optional[str] = None

    for line in lines:
        parts = [part.strip() for part in line.split(",")]
        if not parts:
            continue
        gpu_names.append(parts[0])
        if len(parts) > 1:
            try:
                gpu_vram.append(int(float(parts[1])) * 1024 * 1024)
            except (TypeError, ValueError):
                gpu_vram.append(0)
        else:
            # Keep the VRAM list aligned with the name list even on malformed rows.
            gpu_vram.append(0)
        if len(parts) > 2 and driver_version is None:
            driver_version = parts[2]

    return DeviceInfo(
        backend="cuda",
        n_gpus=len(gpu_names),
        gpu_names=gpu_names,
        gpu_vram_bytes=gpu_vram,
        driver_version=driver_version,
    )


def _suppress_known_torch_warnings() -> None:
    """Silence noisy torch.cuda capability warnings on older GPUs."""

    global _TORCH_WARNINGS_CONFIGURED
    if _TORCH_WARNINGS_CONFIGURED:
        return

    patterns = [
        r"torch\.cuda",
        r"torch\._C",
    ]
    for module_pattern in patterns:
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=module_pattern,
        )

    message_patterns = [
        r"Found GPU\d+ .*cuda capability",
        r"Please install PyTorch with a following CUDA",
        r"not compatible with the current PyTorch installation",
    ]
    for message_pattern in message_patterns:
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            message=message_pattern,
        )

    _TORCH_WARNINGS_CONFIGURED = True


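# Context-manager variant of the above: mutes torch.cuda / torch._C UserWarnings
# only for the duration of a single probing call.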
@contextmanager
def _suppress_torch_cuda_calls():
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"torch\.cuda",
        )
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"torch\._C",
        )
        yield


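# Import torch while capturing its import-time warnings: torch.cuda capability
# noise is swallowed, everything else is re-emitted so real problems stay visible.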
def _load_torch_module():
    existing = sys.modules.get("torch")
    if existing is not None:
        return existing

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        module = importlib.import_module("torch")

    for warning_msg in caught:
        filename = getattr(warning_msg, "filename", "") or ""
        normalized = filename.replace("\\", "/")
        if isinstance(warning_msg.message, UserWarning) and "torch/cuda" in normalized:
            continue
        warnings.showwarning(
            warning_msg.message,
            warning_msg.category,
            warning_msg.filename,
            warning_msg.lineno,
        )

    return module


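# Resolve the requested compute type against the chosen backend. Returns the
# compute type to use plus any warnings; "auto"/"default" resolve to float16 on
# GPU backends and float32 on CPU.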
def _resolve_compute_type(
    backend: str,
    requested: Optional[str],
    supports_fp16: bool,
    auto_mode: bool,
) -> Tuple[str, List[str]]:
    req = (requested or "auto").lower()
    issues: List[str] = []

    if req in {"auto", "default"}:
        if backend == "cuda":
            if not supports_fp16:
                issues.append(
                    "FP16 may be unsupported on this GPU; it could run slower or fail. "
                    "Consider int8_float16 if issues occur."
                )
            return "float16", issues
        if backend == "mps":
            return "float16", issues
        return "float32", issues

    if backend == "cpu" and req in _GPU_ONLY_COMPUTE_TYPES:
        if auto_mode:
            issues.append(f"{req} requires a GPU; using float32.")
            return "float32", issues
        issues.append(f"{req} requires a GPU; CPU execution may fail or be slow.")
        return req, issues

    if backend in {"cuda", "mps"} and req in _FP16_COMPUTE_TYPES and not supports_fp16:
        issues.append(
            f"{req} may be unsupported on detected GPU; it could run slower or fail. "
            "Consider int8_float16 or float32 if problems occur."
        )
        return req, issues

    return req, issues


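# Overall detection flow: probe CUDA through PyTorch (falling back to nvidia-smi
# for reporting only), honour an explicit device preference, then sanity-check
# the requested compute type and estimated VRAM needs before settling on a backend.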
def select_torch_device(
    preferred: str = "auto",
    *,
    model_name: Optional[str] = None,
    compute_type: Optional[str] = None,
) -> DeviceInfo:
    """Select and validate the best compute device."""

    pref = (preferred or "auto").lower()
    auto_mode = pref == "auto"
    requested_compute = (compute_type or "auto").lower()

    info = DeviceInfo(backend="cpu")

    torch_module = None
    torch_import_error = None
    try:
        _suppress_known_torch_warnings()
        torch_module = _load_torch_module()
    except Exception as exc:
        torch_import_error = exc

    cuda_available = False
    cuda_device_count = 0
    cuda_names: List[str] = []
    cuda_vram: List[int] = []
    cuda_capabilities: List[Tuple[int, int]] = []

    if torch_module is not None and getattr(torch_module, "cuda", None) is not None:
        try:
            with _suppress_torch_cuda_calls():
                cuda_available = bool(torch_module.cuda.is_available())
        except Exception:
            cuda_available = False

        try:
            with _suppress_torch_cuda_calls():
                cuda_device_count = int(torch_module.cuda.device_count())
        except Exception:
            cuda_device_count = 0

        if cuda_available and cuda_device_count:
            for idx in range(cuda_device_count):
                name = None
                total_mem = None
                capability = None
                try:
                    with _suppress_torch_cuda_calls():
                        props = torch_module.cuda.get_device_properties(idx)
                except Exception:
                    props = None

                if props is not None:
                    name = getattr(props, "name", None)
                    total_mem = getattr(props, "total_memory", None)
                    capability = (
                        getattr(props, "major", None),
                        getattr(props, "minor", None),
                    )

                if name is None:
                    try:
                        name = torch_module.cuda.get_device_name(idx)
                    except Exception:
                        name = f"CUDA GPU {idx}"

                cuda_names.append(str(name))
                cuda_vram.append(int(total_mem) if total_mem is not None else 0)
                if capability is not None and capability[0] is not None and capability[1] is not None:
                    cuda_capabilities.append((int(capability[0]), int(capability[1])))
                else:
                    cuda_capabilities.append((0, 0))

    mps_available = False
    if torch_module is not None:
        mps_backend = getattr(getattr(torch_module, "backends", None), "mps", None)
        if mps_backend is not None and hasattr(mps_backend, "is_available"):
            try:
                mps_available = bool(mps_backend.is_available())
            except Exception:
                mps_available = False

    smi_info = _probe_nvidia_smi()
    if not cuda_names and smi_info is not None:
        cuda_names = smi_info.gpu_names
        cuda_vram = smi_info.gpu_vram_bytes
        info.driver_version = smi_info.driver_version
        info.n_gpus = smi_info.n_gpus

    if torch_module is not None:
        info.cuda_version = getattr(getattr(torch_module, "version", None), "cuda", None)

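    # Apply an explicit device preference first; "auto" falls through to the best
    # available backend below.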
    if pref == "cuda" and cuda_available and cuda_device_count:
        info.backend = "cuda"
        info.n_gpus = cuda_device_count
        info.gpu_names = cuda_names
        info.gpu_vram_bytes = cuda_vram
        info.gpu_capabilities = cuda_capabilities
        info.selected_gpu_index = 0
    elif pref == "cuda":
        info.backend = "cpu"
        info.issues.append("CUDA backend unavailable in PyTorch; using CPU.")
        if torch_import_error is not None:
            info.notes.append(f"PyTorch import failed: {torch_import_error}")
    elif pref == "mps":
        if mps_available:
            info.backend = "mps"
        else:
            info.backend = "cpu"
            info.issues.append("MPS backend unavailable; using CPU.")
    elif pref == "rocm":
        info.backend = "cpu"
        info.issues.append("ROCm backend not implemented; using CPU.")
    elif pref == "cpu":
        info.backend = "cpu"
    else:
        if cuda_available and cuda_device_count:
            info.backend = "cuda"
            info.n_gpus = cuda_device_count
            info.gpu_names = cuda_names
            info.gpu_vram_bytes = cuda_vram
            info.gpu_capabilities = cuda_capabilities
            info.selected_gpu_index = 0
        elif mps_available:
            info.backend = "mps"
        else:
            info.backend = "cpu"

    primary_gpu = info.primary_gpu_name()
    if primary_gpu is not None:
        info.messages.append(f"Detected {primary_gpu} GPU")

    if info.backend == "cuda" and primary_gpu is None and cuda_names:
        info.messages.append(f"Detected {cuda_names[0]} GPU")

    initial_backend_for_compute = info.backend

    available_vram = None
    if initial_backend_for_compute == "cuda" and info.selected_gpu_index is not None:
        if 0 <= info.selected_gpu_index < len(info.gpu_vram_bytes):
            available_vram = info.gpu_vram_bytes[info.selected_gpu_index]
    if available_vram is None and initial_backend_for_compute == "cuda" and info.gpu_vram_bytes:
        available_vram = info.gpu_vram_bytes[0]

    supports_fp16 = False
    if initial_backend_for_compute == "cuda" and info.selected_gpu_index is not None:
        idx = info.selected_gpu_index
        capability = None
        if 0 <= idx < len(info.gpu_capabilities):
            capability = info.gpu_capabilities[idx]
        supports_fp16 = _gpu_supports_fp16(capability)
    elif initial_backend_for_compute == "mps":
        supports_fp16 = True

    resolved_compute_candidate, compute_issues = _resolve_compute_type(
        initial_backend_for_compute,
        requested_compute,
        supports_fp16,
        auto_mode,
    )

    requirement_bytes = None
    compute_label = _canonical_compute(resolved_compute_candidate)
    if initial_backend_for_compute == "cuda":
        requirement_bytes = _estimate_required_vram_bytes(model_name, resolved_compute_candidate)

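    # If the selected GPU clearly lacks the VRAM estimated for this model and
    # compute type, warn the user and, in auto mode, fall back to CPU.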
    if (
        initial_backend_for_compute == "cuda"
        and available_vram is not None
        and requirement_bytes is not None
        and available_vram < requirement_bytes
    ):
        message = (
            f"VRAM too low for model {model_name or 'selected model'} using compute type "
            f"{compute_label} (~{_format_gib(requirement_bytes)} required, found "
            f"{_format_gib(available_vram)})."
        )
        if auto_mode:
            message += " Falling back to CPU."
        else:
            message += " GPU execution may fail."
        message += (
            " Consider lowering the compute type (e.g. --transcription_compute_type=int8_float16)"
            " or selecting a smaller model via --transcription_model."
        )
        if auto_mode:
            message += " To force GPU usage, rerun with --transcription_device=cuda."
        info.issues.append(message)
        if auto_mode:
            info.backend = "cpu"
            info.selected_gpu_index = None

    if info.backend != "cuda" and auto_mode and not primary_gpu and smi_info is not None and smi_info.gpu_names:
        info.messages.append("No compatible GPU ready; using CPU.")

    if info.backend in {"cuda", "mps"}:
        info.resolved_compute_type = resolved_compute_candidate
        info.issues.extend(compute_issues)
    else:
        info.resolved_compute_type = None

    if info.backend == "cpu" and primary_gpu is None and not info.messages:
        info.messages.append("Using CPU for transcription.")

    if (
        info.backend == "cuda"
        and not supports_fp16
        and info.resolved_compute_type == "float32"
    ):
        info.notes.append("Consider reinstalling PyTorch with newer CUDA support for FP16 acceleration.")

    return info


def select_torch_device_str(
    preferred: str = "auto",
    *,
    model_name: Optional[str] = None,
    compute_type: Optional[str] = None,
) -> str:
    """Compatibility helper returning just the backend string."""

    info = select_torch_device(preferred=preferred, model_name=model_name, compute_type=compute_type)
    return info.backend
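

# Minimal manual smoke test (a sketch, not part of the public API): running this
# module directly prints what the detector would choose on the current machine.
# The model name below is only an example value.
if __name__ == "__main__":
    _info = select_torch_device("auto", model_name="large-v3", compute_type="auto")
    print(f"backend={_info.backend} compute_type={_info.resolved_compute_type}")
    for _line in _info.messages + _info.issues + _info.notes:
        print(f"  - {_line}")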
|