import torch
from collections import defaultdict, abc
import warnings
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
from .common import amp_definitely_not_available


__all__ = ["OptState", "GradScaler"]


class _MultiDeviceReplicator(object):
    """
    Lazily serves copies of a tensor to requested devices.  Copies are cached per-device.
    """
    def __init__(self, master_tensor: torch.Tensor) -> None:
        assert master_tensor.is_cuda or master_tensor.device.type == 'xla'
        self.master = master_tensor
        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}

    def get(self, device) -> torch.Tensor:
        retval = self._per_device_tensors.get(device, None)
        if retval is None:
            retval = self.master.to(device=device, non_blocking=True, copy=True)
            self._per_device_tensors[device] = retval
        return retval


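# The per-optimizer "stage" values and their state factory are defined at module level
# (as an Enum and a plain function rather than a lambda) so that GradScaler instances,
# whose state holds a defaultdict built from this factory, remain picklable.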
class OptState(Enum):
    READY = 0
    UNSCALED = 1
    STEPPED = 2


def _refresh_per_optimizer_state():
    return {"stage": OptState.READY, "found_inf_per_device": {}}


class GradScaler(object):
    _scale: Optional[torch.Tensor]
    _growth_tracker: Optional[torch.Tensor]
    _per_optimizer_states: Dict[int, Dict[str, Any]]
						|  | """ | 
					
						
						|  | An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling | 
					
						
						|  | conveniently. | 
					
						
						|  |  | 
					
						
						|  | * ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor. | 
					
						
						|  | * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``. | 
					
						
						|  | * ``scaler.update()`` updates ``scaler``'s scale factor. | 
					
						
						|  |  | 
					
						
						|  | Example:: | 
					
						
						|  |  | 
					
						
						|  | # Creates a GradScaler once at the beginning of training. | 
					
						
						|  | scaler = GradScaler() | 
					
						
						|  |  | 
					
						
						|  | for epoch in epochs: | 
					
						
						|  | for input, target in data: | 
					
						
						|  | optimizer.zero_grad() | 
					
						
						|  | output = model(input) | 
					
						
						|  | loss = loss_fn(output, target) | 
					
						
						|  |  | 
					
						
						|  | # Scales loss.  Calls backward() on scaled loss to create scaled gradients. | 
					
						
						|  | scaler.scale(loss).backward() | 
					
						
						|  |  | 
					
						
						|  | # scaler.step() first unscales gradients of the optimizer's params. | 
					
						
						|  | # If gradients don't contain infs/NaNs, optimizer.step() is then called, | 
					
						
						|  | # otherwise, optimizer.step() is skipped. | 
					
						
						|  | scaler.step(optimizer) | 
					
						
						|  |  | 
					
						
						|  | # Updates the scale for next iteration. | 
					
						
						|  | scaler.update() | 
					
						
						|  |  | 
					
						
						|  | See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage | 
					
						
						|  | (along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty, | 
					
						
						|  | and multiple losses/optimizers. | 
					
						
						|  |  | 
					
						
						|  | ``scaler`` dynamically estimates the scale factor each iteration.  To minimize gradient underflow, | 
					
						
						|  | a large scale factor should be used.  However, ``float16`` values can "overflow" (become inf or NaN) if | 
					
						
						|  | the scale factor is too large.  Therefore, the optimal scale factor is the largest factor that can be used | 
					
						
						|  | without incurring inf or NaN gradient values. | 
					
						
						|  | ``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every | 
					
						
						|  | ``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`). | 
					
						
						|  |  | 
					
						
						|  | * If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params | 
					
						
						|  | themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``. | 
					
						
						|  |  | 
					
						
						|  | * If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual. | 
					
						
						|  | If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by | 
					
						
						|  | ``growth_factor``. | 
					
						
						|  |  | 
					
						
						|  | The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its | 
					
						
						|  | value calibrates.  ``scaler.step`` will skip the underlying ``optimizer.step()`` for these | 
					
						
						|  | iterations.  After that, step skipping should occur rarely (once every few hundred or thousand iterations). | 
					
						
						|  |  | 
					
						
						|  | Args: | 
					
						
						|  | init_scale (float, optional, default=2.**16):  Initial scale factor. | 
					
						
						|  | growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during | 
					
						
						|  | :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations. | 
					
						
						|  | backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during | 
					
						
						|  | :meth:`update` if inf/NaN gradients occur in an iteration. | 
					
						
						|  | growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients | 
					
						
						|  | that must occur for the scale to be multiplied by ``growth_factor``. | 
					
						
						|  | enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply | 
					
						
						|  | invokes the underlying ``optimizer.step()``, and other methods become no-ops. | 
					
						
						|  | Default: ``True`` | 
					
						
						|  | """ | 
					
						
    def __init__(self,
                 init_scale=2.**16,
                 growth_factor=2.0,
                 backoff_factor=0.5,
                 growth_interval=2000,
                 enabled=True):
        if enabled and amp_definitely_not_available():
            warnings.warn("torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
            self._enabled = False
        else:
            self._enabled = enabled

        if self._enabled:
            assert growth_factor > 1.0, "The growth factor must be > 1.0."
            assert backoff_factor < 1.0, "The backoff factor must be < 1.0."

            self._init_scale = init_scale
            self._scale = None
            self._growth_factor = growth_factor
            self._backoff_factor = backoff_factor
            self._growth_interval = growth_interval
            self._init_growth_tracker = 0
            self._growth_tracker = None
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    def _check_scale_growth_tracker(self, funcname) -> Tuple[torch.Tensor, torch.Tensor]:
        fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
        assert self._scale is not None, "Attempted {} but _scale is None.  ".format(funcname) + fix
        assert self._growth_tracker is not None, "Attempted {} but _growth_tracker is None.  ".format(funcname) + fix
        return (self._scale, self._growth_tracker)

    def _lazy_init_scale_growth_tracker(self, dev):
        assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
        self._scale = torch.full((1,), self._init_scale, dtype=torch.float32, device=dev)
        self._growth_tracker = torch.full((1,), self._init_growth_tracker, dtype=torch.int32, device=dev)

    def scale(self, outputs):
        """
        Multiplies ('scales') a tensor or list of tensors by the scale factor.

        Returns scaled outputs.  If this instance of :class:`GradScaler` is not enabled, outputs are returned
        unmodified.

        Args:
            outputs (Tensor or iterable of Tensors):  Outputs to scale.
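
        Example (illustrative sketch; assumes ``loss0`` and ``loss1`` are scalar CUDA tensors)::

            # Scales both losses by the same scale factor; returns a tuple of scaled tensors.
            scaled_loss0, scaled_loss1 = scaler.scale((loss0, loss1))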
						|  | """ | 
					
						
						|  | if not self._enabled: | 
					
						
						|  | return outputs | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
        if isinstance(outputs, torch.Tensor):
            assert outputs.is_cuda or outputs.device.type == 'xla'
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            return outputs * self._scale.to(device=outputs.device, non_blocking=True)

        # A one-element list lets the nested apply_scale closure lazily create a single
        # _MultiDeviceReplicator for self._scale and reuse it across all outputs/devices.
        stash: List[_MultiDeviceReplicator] = []

        def apply_scale(val):
            if isinstance(val, torch.Tensor):
                assert val.is_cuda or val.device.type == 'xla'
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_MultiDeviceReplicator(self._scale))
                return val * stash[0].get(val.device)
            elif isinstance(val, abc.Iterable):
                iterable = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterable)
                else:
                    return iterable
            else:
                raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
        per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
        per_device_found_inf = _MultiDeviceReplicator(found_inf)

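        # To set up the fused _amp_foreach_non_finite_check_and_unscale_ call, gradients are
        # grouped by device and dtype so the kernel is invoked once per (device, dtype) group
        # rather than once per gradient.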
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
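                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
                        # coalesce() deduplicates indices and sums values that share an index.  For scaled
                        # fp16 values, that summation can overflow, so the inf/NaN check below is run on
                        # the coalesced _values().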
                        if param.grad.dtype is torch.float16:
                            param.grad = param.grad.coalesce()
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(to_unscale)

        for device, per_dtype_grads in per_device_and_dtype_grads.items():
            for grads in per_dtype_grads.values():
                torch._amp_foreach_non_finite_check_and_unscale_(grads,
                                                                 per_device_found_inf.get(device),
                                                                 per_device_inv_scale.get(device))

        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer):
        """
        Divides ("unscales") the optimizer's gradient tensors by the scale factor.

        :meth:`unscale_` is optional, serving cases where you need to
        :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
        between the backward pass(es) and :meth:`step`.
        If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.

        Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::

            ...
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            scaler.step(optimizer)
            scaler.update()

        Args:
            optimizer (torch.optim.Optimizer):  Optimizer that owns the gradients to be unscaled.

        .. note::
            :meth:`unscale_` does not incur a CPU-GPU sync.

        .. warning::
            :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
            and only after all gradients for that optimizer's assigned parameters have been accumulated.
            Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.

        .. warning::
            :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
        """
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

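        # The reciprocal of the scale is computed in FP64 and cast back to FP32, sidestepping
        # the imprecision FP32 division can show under certain compile options.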
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
        optimizer_state["stage"] = OptState.UNSCALED

    def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs):
        retval = None
        if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
            retval = optimizer.step(*args, **kwargs)
        return retval

    def step(self, optimizer, *args, **kwargs):
        """
        :meth:`step` carries out the following two operations:

        1.  Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
            earlier in the iteration).  As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
        2.  If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
            gradients.  Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.

        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.

        Returns the return value of ``optimizer.step(*args, **kwargs)``.

        Args:
            optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
            args:  Any arguments.
            kwargs:  Any keyword arguments.

        .. warning::
            Closure use is not currently supported.
        """
        if not self._enabled:
            return optimizer.step(*args, **kwargs)

        if "closure" in kwargs:
            raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.")

        self._check_scale_growth_tracker("step")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("step() has already been called since the last update().")

        retval = None

        if (hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling):
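            # This optimizer advertises its own scale-aware step, so the scaler is handed to it
            # directly and it takes responsibility for unscaling and inf/NaN checking.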
            retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self))
            optimizer_state["stage"] = OptState.STEPPED
            return retval

        if optimizer_state["stage"] is OptState.READY:
            self.unscale_(optimizer)

        assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."

        retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)

        optimizer_state["stage"] = OptState.STEPPED

        return retval

    def update(self, new_scale=None):
        """
        Updates the scale factor.

        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.

        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)

        Args:
            new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None):  New scale factor.

        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)
            else:
                reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor with requires_grad=False."
                assert isinstance(new_scale, torch.cuda.FloatTensor), reason
                assert new_scale.numel() == 1, reason
                assert new_scale.requires_grad is False, reason
                self._scale.copy_(new_scale)
        else:
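            # Consume the found_inf tensors collected from every optimizer stepped this iteration.
            # If they all live on the same device as self._scale, the reduction below stays
            # asynchronous with respect to the host.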
            found_infs = [found_inf.to(device=_scale.device, non_blocking=True)
                          for state in self._per_optimizer_states.values()
                          for found_inf in state["found_inf_per_device"].values()]

            assert len(found_infs) > 0, "No inf checks were recorded prior to update."

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            torch._amp_update_scale_(_scale,
                                     _growth_tracker,
                                     found_inf_combined,
                                     self._growth_factor,
                                     self._backoff_factor,
                                     self._growth_interval)

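        # To prepare for the next iteration, clear the per-optimizer state collected during this one.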
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    def _get_scale_async(self):
        return self._scale

    def get_scale(self):
        """
        Returns a Python float containing the current scale, or 1.0 if scaling is disabled.

        .. warning::
            :meth:`get_scale` incurs a CPU-GPU sync.
        """
        if self._enabled:
            return self._init_scale if self._scale is None else self._get_scale_async().item()
        else:
            return 1.0

    def get_growth_factor(self):
        r"""
        Returns a Python float containing the scale growth factor.
        """
        return self._growth_factor

    def set_growth_factor(self, new_factor):
        r"""
        Args:
            new_factor (float):  Value to use as the new scale growth factor.
        """
        self._growth_factor = new_factor

    def get_backoff_factor(self):
        r"""
        Returns a Python float containing the scale backoff factor.
        """
        return self._backoff_factor

    def set_backoff_factor(self, new_factor):
        r"""
        Args:
            new_factor (float):  Value to use as the new scale backoff factor.
        """
        self._backoff_factor = new_factor

    def get_growth_interval(self):
        r"""
        Returns a Python int containing the growth interval.
        """
        return self._growth_interval

    def set_growth_interval(self, new_interval):
        r"""
        Args:
            new_interval (int):  Value to use as the new growth interval.
        """
        self._growth_interval = new_interval

    def _get_growth_tracker(self):
        if self._enabled:
            return self._init_growth_tracker if self._growth_tracker is None else self._growth_tracker.item()
        else:
            return 0

    def is_enabled(self):
        r"""
        Returns a bool indicating whether this instance is enabled.
        """
        return self._enabled

    def state_dict(self):
        r"""
        Returns the state of the scaler as a :class:`dict`.  It contains five entries:

        * ``"scale"`` - a Python float containing the current scale
        * ``"growth_factor"`` - a Python float containing the current growth factor
        * ``"backoff_factor"`` - a Python float containing the current backoff factor
        * ``"growth_interval"`` - a Python int containing the current growth interval
        * ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.

        If this instance is not enabled, returns an empty dict.

        .. note::
           If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
           should be called after :meth:`update`.
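
        Example (illustrative sketch; assumes ``model``, ``optimizer``, and a path ``PATH`` are defined)::

            checkpoint = {"model": model.state_dict(),
                          "optimizer": optimizer.state_dict(),
                          "scaler": scaler.state_dict()}
            torch.save(checkpoint, PATH)
            # Later, after reconstructing model, optimizer, and scaler:
            # scaler.load_state_dict(torch.load(PATH)["scaler"])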
						|  | """ | 
					
						
						|  | return {"scale": self.get_scale(), | 
					
						
						|  | "growth_factor": self._growth_factor, | 
					
						
						|  | "backoff_factor": self._backoff_factor, | 
					
						
						|  | "growth_interval": self._growth_interval, | 
					
						
						|  | "_growth_tracker": self._get_growth_tracker()} if self._enabled else {} | 
					
						
						|  |  | 
					
						
    def load_state_dict(self, state_dict):
        r"""
        Loads the scaler state.  If this instance is disabled, :meth:`load_state_dict` is a no-op.

        Args:
            state_dict(dict): scaler state.  Should be an object returned from a call to :meth:`state_dict`.
        """
        if not self._enabled:
            return

        if len(state_dict) == 0:
            raise RuntimeError("The source state dict is empty, possibly because it was saved "
                               "from a disabled instance of GradScaler.")

        self._init_scale = state_dict["scale"]
        if self._scale is not None:
            self._scale.fill_(state_dict["scale"])
        self._growth_factor = state_dict["growth_factor"]
        self._backoff_factor = state_dict["backoff_factor"]
        self._growth_interval = state_dict["growth_interval"]
        self._init_growth_tracker = state_dict["_growth_tracker"]
        if self._growth_tracker is not None:
            self._growth_tracker.fill_(state_dict["_growth_tracker"])

    def __getstate__(self):
        state = self.__dict__.copy()
        if self._enabled:
            assert len(self._per_optimizer_states) == 0, "A GradScaler instance may only be pickled at the beginning "\
                "of an iteration, or at the end after scaler.update()."
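            # The CUDA scale and growth-tracker tensors are not pickled directly; instead, their
            # current values are recorded so the unpickled instance re-creates both tensors
            # lazily, just as a freshly constructed GradScaler would.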
            state['_init_scale'] = self.get_scale()
            state['_init_growth_tracker'] = self._get_growth_tracker()
            state['_scale'] = None
            state['_growth_tracker'] = None
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)

    def _check_inf_per_device(self, optimizer):
        _scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")

        dummy_inv_scale = torch.full((1,), 1.0, dtype=torch.float32, device=_scale.device)
        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=_scale.device)

        self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = \
            self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)

        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]

    def _found_inf_per_device(self, optimizer):
        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]