lengyue233's picture
Init hf space integration
0a3525d verified
raw
history blame
3.44 kB
from collections.abc import Iterable
from typing import Optional, Union

import lightning.pytorch as pl
import torch
from lightning import LightningModule, Trainer
from lightning.pytorch.callbacks import Callback
from torch import Tensor, nn
from torch.utils._foreach_utils import (
    _group_tensors_by_device_and_dtype,
    _has_foreach_support,
)
@torch.no_grad()
def grad_norm(
parameters: Union[Tensor, list[Tensor]],
norm_type: float = 2.0,
) -> float:
"""
Returns the norm of the gradients of the given parameters.
Args:
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
single Tensor that will have gradients normalized
norm_type (float): type of the used p-norm.
Returns:
Total norm of the parameter gradients (viewed as a single vector).
""" # noqa: E501
if isinstance(parameters, Tensor):
parameters = [parameters]
grads = [p.grad for p in parameters if p.grad is not None]
if len(grads) == 0:
return None
first_device = grads[0].device
grouped_grads: dict[
tuple[torch.device, torch.dtype], list[list[Tensor]]
] = _group_tensors_by_device_and_dtype(
[[g.detach() for g in grads]]
) # type: ignore[assignment]
norms = []
for (device, _), ([grads], _) in grouped_grads.items():
if _has_foreach_support(grads, device=device):
norms.extend(torch._foreach_norm(grads, norm_type))
else:
norms.extend([torch.norm(g, norm_type) for g in grads])
return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
class GradNormMonitor(Callback):
    """
    Callback that computes the gradient norm of the model parameters and
    logs it through the LightningModule after each backward pass.
    """

    def __init__(
        self,
        norm_type: float = 2.0,
        logging_interval: str = "step",
        sub_module: Optional[Union[str, list[str]]] = None,
    ) -> None:
        """
        Args:
            norm_type (float): type of the used p-norm.
            logging_interval (str): "step" logs on every step; any other
                value (e.g. "epoch") logs on epoch instead.
            sub_module (str or list[str], optional): attribute name(s) of the
                sub-module(s) whose gradient norm should be logged. A dotted
                path such as "encoder.layers" selects a nested sub-module.
                When None, the norm over the whole model is logged.
        """
        super().__init__()

        self.norm_type = norm_type
        self.logging_interval = logging_interval
        self.sub_module = sub_module

    def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
        """
        Computes the gradient norm of the model parameters and logs it to the logger.

        NOTE(review): when mixed-precision loss scaling is active, gradients
        seen in this hook may still be scaled -- confirm against the
        Lightning hook documentation before comparing values across runs.

        Args:
            trainer (Trainer): The trainer object
            model (LightningModule): The current lightningModule
        """
        if self.sub_module is None:
            # Whole-model norm, logged under "train/grad_norm".
            self.log_sub_module_grad_norm(model, model, "")
            return

        names = (
            [self.sub_module]
            if isinstance(self.sub_module, str)
            else self.sub_module
        )

        for name in names:
            # Plain names keep the original getattr lookup; dotted paths
            # (which getattr cannot resolve) go through get_submodule.
            target = model.get_submodule(name) if "." in name else getattr(model, name)
            self.log_sub_module_grad_norm(model, target, f"/{name}")

    def log_sub_module_grad_norm(
        self, lightning_model: LightningModule, model: nn.Module, path: str
    ) -> None:
        """
        Log the gradient norm of ``model`` under ``train{path}/grad_norm``.

        Args:
            lightning_model (LightningModule): module whose ``log`` is used.
            model (nn.Module): module whose parameter gradients are measured.
            path (str): suffix inserted after "train" in the logged key.
        """
        grad_norm_val = grad_norm(model.parameters(), self.norm_type)
        if grad_norm_val is None:
            # No parameter has a gradient yet; nothing to log.
            return

        on_step = self.logging_interval == "step"
        lightning_model.log(
            f"train{path}/grad_norm",
            grad_norm_val,
            on_step=on_step,
            on_epoch=not on_step,
        )