Spaces:

unpairedelectron07
/

Text-to-Music-Generator

Running

App Files Files Community

unpairedelectron07 committed on Jan 21, 2024

Commit

034e769

verified ·

1 Parent(s): fbc2435

Upload 4 files

Browse files

Files changed (4) hide show

audiocraft/losses/balancer.py +136 -0
audiocraft/losses/sisnr.py +97 -0
audiocraft/losses/specloss.py +149 -0
audiocraft/losses/stftloss.py +207 -0

audiocraft/losses/balancer.py ADDED Viewed

	@@ -0,0 +1,136 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import typing as tp
+import flashy
+import torch
+from torch import autograd
class Balancer:
    """Loss balancer.

    The loss balancer combines losses together to compute gradients for the backward.
    Given `y = f(...)`, and a number of losses `l1(y, ...)`, `l2(y, ...)`, with `...`
    not having any dependence on `f`, the balancer can efficiently normalize the partial gradients
    `d l1 / d y`, `d l2 / dy` before summing them in order to achieve a desired ratio between
    the losses. For instance if `weights = {'l1': 2, 'l2': 1}`, 66% of the gradient
    going into `f(...)` will come from `l1` on average, and 33% from `l2`. This allows for an easy
    interpretation of the weights even if the intrinsic scale of `l1`, `l2` ... is unknown.

    Noting `g1 = d l1 / dy`, etc., the balanced gradient `G` will be
    (with `avg` an exponential moving average over the updates),

        G = sum_i total_norm * g_i / avg(||g_i||) * w_i / sum(w_i)

    If `balance_grads` is False, this is deactivated, and instead the gradient will just be the
    standard sum of the partial gradients with the given weights.

    A call to the backward method of the balancer will compute the partial gradients,
    combining all the losses and potentially rescaling the gradients,
    which can help stabilize the training and reason about multiple losses with varying scales.
    The obtained gradient with respect to `y` is then back-propagated to `f(...)`.

    Expected usage:

        weights = {'loss_a': 1, 'loss_b': 4}
        balancer = Balancer(weights, ...)
        losses: dict = {}
        losses['loss_a'] = compute_loss_a(x, y)
        losses['loss_b'] = compute_loss_b(x, y)
        if model.training:
            effective_loss = balancer.backward(losses, x)

    Args:
        weights (dict[str, float]): Weight coefficient for each loss. The balancer expects the losses keys
            from the backward method to match the weights keys to assign weight to each of the provided loss.
        balance_grads (bool): Whether to rescale gradients so that weights reflect the fraction of the
            overall gradient, rather than a constant multiplier.
        total_norm (float): Reference norm when rescaling gradients, ignored otherwise.
            A falsy value (0 or None) falls back to 1.
        ema_decay (float): EMA decay for averaging the norms. A falsy value falls back to 1.
        per_batch_item (bool): Whether to compute the averaged norm per batch item or not. This only holds
            when rescaling the gradients.
        epsilon (float): Epsilon value for numerical stability.
        monitor (bool): If True, stores in `self.metrics` the relative ratio between the norm of the gradients
            coming from each loss, when calling `backward()`.
    """

    def __init__(self, weights: tp.Dict[str, float], balance_grads: bool = True, total_norm: float = 1.,
                 ema_decay: float = 0.999, per_batch_item: bool = True, epsilon: float = 1e-12,
                 monitor: bool = False):
        self.weights = weights
        self.per_batch_item = per_batch_item
        self.total_norm = total_norm or 1.
        self.averager = flashy.averager(ema_decay or 1.)
        self.epsilon = epsilon
        self.monitor = monitor
        self.balance_grads = balance_grads
        self._metrics: tp.Dict[str, tp.Any] = {}

    @property
    def metrics(self):
        """Metrics stored by the last call to `backward()` (empty unless `monitor` is True)."""
        return self._metrics

    def backward(self, losses: tp.Dict[str, torch.Tensor], input: torch.Tensor) -> torch.Tensor:
        """Compute the backward and return the effective train loss, e.g. the loss obtained from
        computing the effective weights. If `balance_grads` is True, the effective weights
        are the one that needs to be applied to each gradient to respect the desired relative
        scale of gradients coming from each loss.

        Args:
            losses (Dict[str, torch.Tensor]): dictionary with the same keys as `self.weights`.
            input (torch.Tensor): the input of the losses, typically the output of the model.
                This should be the single point of dependence between the losses
                and the model being trained.
        Returns:
            torch.Tensor: Sum of the detached losses, each multiplied by the scale applied to its gradient.
        Raises:
            ValueError: If `losses` is empty.
        """
        if not losses:
            # Fail early with a clear message instead of an obscure error further down.
            raise ValueError("Balancer.backward() requires at least one loss.")

        norms = {}
        grads = {}
        for name, loss in losses.items():
            # Compute partial derivative of the loss with respect to the input.
            # The graph is retained because every loss shares it up to `input`.
            grad, = autograd.grad(loss, [input], retain_graph=True)
            if self.per_batch_item:
                # We do not average the gradient over the batch dimension.
                dims = tuple(range(1, grad.dim()))
                norm = grad.norm(dim=dims, p=2).mean()
            else:
                norm = grad.norm(p=2)
            norms[name] = norm
            grads[name] = grad

        # Each gradient has the shape of `input`, so the batch size can be read from `input`
        # directly rather than from whichever gradient the loop happened to compute last.
        count = len(input) if self.per_batch_item else 1
        # Average norms across workers. Theoretically we should average the
        # squared norm, then take the sqrt, but it worked fine like that.
        avg_norms = flashy.distrib.average_metrics(self.averager(norms), count)
        # We approximate the total norm of the gradient as the sums of the norms.
        # Obviously this can be very incorrect if all gradients are aligned, but it works fine.
        total = sum(avg_norms.values())

        self._metrics = {}
        if self.monitor:
            # Store the ratio of the total gradient represented by each loss.
            for k, v in avg_norms.items():
                self._metrics[f'ratio_{k}'] = v / total

        total_weights = sum(self.weights[k] for k in avg_norms)
        assert total_weights > 0.
        desired_ratios = {k: w / total_weights for k, w in self.weights.items()}

        out_grad = torch.zeros_like(input)
        effective_loss = torch.tensor(0., device=input.device, dtype=input.dtype)
        for name, avg_norm in avg_norms.items():
            if self.balance_grads:
                # g_balanced = g / avg(||g||) * total_norm * desired_ratio
                scale = desired_ratios[name] * self.total_norm / (self.epsilon + avg_norm)
            else:
                # We just do regular weighted sum of the gradients.
                scale = self.weights[name]
            out_grad.add_(grads[name], alpha=scale)
            effective_loss += scale * losses[name].detach()
        # Send the computed partial derivative with respect to the output of the model to the model.
        input.backward(out_grad)
        return effective_loss

audiocraft/losses/sisnr.py ADDED Viewed

	@@ -0,0 +1,97 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+import typing as tp
+import torch
+from torch import nn
+from torch.nn import functional as F
+def _unfold(a: torch.Tensor, kernel_size: int, stride: int) -> torch.Tensor:
+    """Given input of size [*OT, T], output Tensor of size [*OT, F, K]
+    with K the kernel size, by extracting frames with the given stride.
+    This will pad the input so that `F = ceil(T / K)`.
+    see https://github.com/pytorch/pytorch/issues/60466
+    """
+    *shape, length = a.shape
+    n_frames = math.ceil(length / stride)
+    tgt_length = (n_frames - 1) * stride + kernel_size
+    a = F.pad(a, (0, tgt_length - length))
+    strides = list(a.stride())
+    assert strides[-1] == 1, "data should be contiguous"
+    strides = strides[:-1] + [stride, 1]
+    return a.as_strided([*shape, n_frames, kernel_size], strides)
+def _center(x: torch.Tensor) -> torch.Tensor:
+    return x - x.mean(-1, True)
+def _norm2(x: torch.Tensor) -> torch.Tensor:
+    return x.pow(2).sum(-1, True)
class SISNR(nn.Module):
    """SISNR loss.

    Input should be [B, C, T], output is scalar.

    ..Warning:: This function returns the opposite of the SI-SNR (e.g. `-1 * regular_SI_SNR`).
        Consequently, lower scores are better in terms of reconstruction quality,
        in particular, it should be negative if training goes well. This is done this way so
        that this module can also be used as a loss function for training model.

    Args:
        sample_rate (int): Sample rate.
        segment (float or None): Evaluate on chunks of that many seconds. If None, evaluate on
            entire audio only.
        overlap (float): Overlap between chunks, i.e. 0.5 = 50 % overlap.
        epsilon (float): Epsilon value for numerical stability.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        segment: tp.Optional[float] = 20,
        overlap: float = 0.5,
        epsilon: float = torch.finfo(torch.float32).eps,
    ):
        super().__init__()
        self.sample_rate = sample_rate
        self.segment = segment
        self.overlap = overlap
        self.epsilon = epsilon

    def forward(self, out_sig: torch.Tensor, ref_sig: torch.Tensor) -> torch.Tensor:
        """Compute the negated SI-SNR between an estimate and its reference.

        Args:
            out_sig (torch.Tensor): Estimated signal of shape [B, C, T].
            ref_sig (torch.Tensor): Reference signal, same shape as `out_sig`.
        Returns:
            torch.Tensor: Scalar, `-1 *` the SI-SNR in dB averaged over batch, channels and chunks.
        Raises:
            ValueError: If the configured segment/overlap (or an empty signal) yields a
                non-positive stride between chunks.
        """
        B, C, T = ref_sig.shape
        assert ref_sig.shape == out_sig.shape

        if self.segment is None:
            # A single chunk spanning the whole signal.
            frame = T
            stride = T
        else:
            frame = int(self.segment * self.sample_rate)
            stride = int(frame * (1 - self.overlap))
        if stride <= 0:
            raise ValueError(
                f"SISNR needs a positive stride between chunks, got {stride} "
                f"(frame={frame}, overlap={self.overlap}).")

        epsilon = self.epsilon * frame  # make epsilon prop to frame size.

        # Both are [B, C, n_frames, frame].
        gt = _unfold(ref_sig, frame, stride)
        est = _unfold(out_sig, frame, stride)
        if self.segment is None:
            # The frame axis is the second to last one; the last axis holds the samples.
            assert gt.shape[-2] == 1

        gt = _center(gt)
        est = _center(est)
        dot = torch.einsum("bcft,bcft->bcf", gt, est)

        # Projection of the estimate on the reference; what remains is counted as noise.
        proj = dot[:, :, :, None] * gt / (epsilon + _norm2(gt))
        noise = est - proj

        sisnr = 10 * (
            torch.log10(epsilon + _norm2(proj)) - torch.log10(epsilon + _norm2(noise))
        )
        return -1 * sisnr[..., 0].mean()

audiocraft/losses/specloss.py ADDED Viewed

	@@ -0,0 +1,149 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import typing as tp
+import numpy as np
+from torchaudio.transforms import MelSpectrogram
+import torch
+from torch import nn
+from torch.nn import functional as F
+from ..modules import pad_for_conv1d
class MelSpectrogramWrapper(nn.Module):
    """Wrapper around MelSpectrogram torchaudio transform providing proper padding
    and additional post-processing including log scaling.

    Args:
        n_fft (int): Number of fft.
        hop_length (int): Hop size, cast to `int` on construction.
        win_length (int or None): Window length.
        n_mels (int): Number of mel bins.
        sample_rate (int): Sample rate.
        f_min (float or None): Minimum frequency.
        f_max (float or None): Maximum frequency.
        log (bool): Whether to scale with log.
        normalized (bool): Whether to normalize the melspectrogram.
        floor_level (float): Floor level based on human perception (default=1e-5).
    """

    def __init__(self, n_fft: int = 1024, hop_length: int = 256, win_length: tp.Optional[int] = None,
                 n_mels: int = 80, sample_rate: float = 22050, f_min: float = 0.0, f_max: tp.Optional[float] = None,
                 log: bool = True, normalized: bool = False, floor_level: float = 1e-5):
        super().__init__()
        self.n_fft = n_fft
        self.hop_length = int(hop_length)
        # `center=False`: the padding is handled explicitly in `forward`.
        transform_kwargs = dict(
            n_mels=n_mels, sample_rate=sample_rate, n_fft=n_fft, hop_length=self.hop_length,
            win_length=win_length, f_min=f_min, f_max=f_max, normalized=normalized,
            window_fn=torch.hann_window, center=False,
        )
        self.mel_transform = MelSpectrogram(**transform_kwargs)
        self.floor_level = floor_level
        self.log = log

    def forward(self, x):
        """Return the (optionally log10 scaled) mel spectrogram of `x`, with the channel
        and mel-bin axes merged into one: [B, C * n_bins, frames].

        A 2-dim input gets a channel axis inserted at position 1.
        """
        if x.dim() == 2:
            x = x.unsqueeze(1)
        side_padding = int((self.n_fft - self.hop_length) // 2)
        x = F.pad(x, (side_padding, side_padding), "reflect")
        # Make sure that all the frames are full.
        # The combination of `pad_for_conv1d` and the above padding
        # will make the output of size ceil(T / hop).
        x = pad_for_conv1d(x, self.n_fft, self.hop_length)
        self.mel_transform.to(x.device)
        spec = self.mel_transform(x)
        batch, channels, n_bins, n_frames = spec.shape
        if self.log:
            # The floor keeps the log finite where the spectrogram is zero.
            spec = torch.log10(self.floor_level + spec)
        return spec.reshape(batch, channels * n_bins, n_frames)
class MelSpectrogramL1Loss(torch.nn.Module):
    """L1 Loss on MelSpectrogram.

    Args:
        sample_rate (int): Sample rate.
        n_fft (int): Number of fft.
        hop_length (int): Hop size.
        win_length (int): Window length.
        n_mels (int): Number of mel bins.
        f_min (float or None): Minimum frequency.
        f_max (float or None): Maximum frequency.
        log (bool): Whether to scale with log.
        normalized (bool): Whether to normalize the melspectrogram.
        floor_level (float): Floor level value based on human perception (default=1e-5).
    """

    def __init__(self, sample_rate: int, n_fft: int = 1024, hop_length: int = 256, win_length: int = 1024,
                 n_mels: int = 80, f_min: float = 0.0, f_max: tp.Optional[float] = None,
                 log: bool = True, normalized: bool = False, floor_level: float = 1e-5):
        super().__init__()
        self.l1 = torch.nn.L1Loss()
        spectrogram_kwargs = dict(
            n_fft=n_fft, hop_length=hop_length, win_length=win_length,
            n_mels=n_mels, sample_rate=sample_rate, f_min=f_min, f_max=f_max,
            log=log, normalized=normalized, floor_level=floor_level,
        )
        self.melspec = MelSpectrogramWrapper(**spectrogram_kwargs)

    def forward(self, x, y):
        """Return the L1 distance between the mel spectrograms of `x` and `y`."""
        self.melspec.to(x.device)
        return self.l1(self.melspec(x), self.melspec(y))
class MultiScaleMelSpectrogramLoss(nn.Module):
    """Multi-Scale spectrogram loss (msspec).

    One pair of mel spectrograms (linear and log scaled) is built for every exponent `i`
    in `range(range_start, range_end)`, using `n_fft = win_length = 2 ** i` and a hop
    of a quarter of that size.

    Args:
        sample_rate (int): Sample rate.
        range_start (int): Power of 2 to use for the first scale.
        range_end (int): Power of 2 at which to stop, this last value being excluded.
        n_mels (int): Number of mel bins.
        f_min (float): Minimum frequency.
        f_max (float or None): Maximum frequency.
        normalized (bool): Whether to normalize the melspectrogram.
        alphas (bool): Whether to use alphas as coefficients or not.
        floor_level (float): Floor level value based on human perception (default=1e-5).
    """

    def __init__(self, sample_rate: int, range_start: int = 6, range_end: int = 11,
                 n_mels: int = 64, f_min: float = 0.0, f_max: tp.Optional[float] = None,
                 normalized: bool = False, alphas: bool = True, floor_level: float = 1e-5):
        super().__init__()
        linear_specs = []
        log_specs = []
        self.alphas = []
        self.total = 0
        self.normalized = normalized
        for exponent in range(range_start, range_end):
            size = 2 ** exponent
            # Settings shared by the linear and log spectrograms of this scale.
            # The hop is handed over as a float; MelSpectrogramWrapper casts it to int.
            scale_kwargs = dict(
                n_fft=size, hop_length=size / 4, win_length=size,
                n_mels=n_mels, sample_rate=sample_rate, f_min=f_min, f_max=f_max,
                normalized=normalized, floor_level=floor_level,
            )
            linear_specs.append(MelSpectrogramWrapper(log=False, **scale_kwargs))
            log_specs.append(MelSpectrogramWrapper(log=True, **scale_kwargs))
            alpha = np.sqrt(size - 1) if alphas else 1
            self.alphas.append(alpha)
            # Sum of all the coefficients (1 for the L1 term, alpha for the MSE term).
            self.total += alpha + 1
        self.l1s = nn.ModuleList(linear_specs)
        self.l2s = nn.ModuleList(log_specs)

    def forward(self, x, y):
        """Sum over the scales of the L1 distance between linear mel spectrograms plus
        `alpha` times the MSE between log mel spectrograms, divided by the sum of the
        coefficients when `normalized` is set.
        """
        loss = 0.0
        self.l1s.to(x.device)
        self.l2s.to(x.device)
        for alpha, linear_spec, log_spec in zip(self.alphas, self.l1s, self.l2s):
            linear_x = linear_spec(x)
            linear_y = linear_spec(y)
            log_x = log_spec(x)
            log_y = log_spec(y)
            loss += F.l1_loss(linear_x, linear_y) + alpha * F.mse_loss(log_x, log_y)
        if self.normalized:
            loss = loss / self.total
        return loss

audiocraft/losses/stftloss.py ADDED Viewed

	@@ -0,0 +1,207 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# Adapted from MIT code under the original license
+# Copyright 2019 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+import typing as tp
+import torch
+from torch import nn
+from torch.nn import functional as F
+# TODO: Replace with torchaudio.STFT?
+def _stft(x: torch.Tensor, fft_size: int, hop_length: int, win_length: int,
+          window: tp.Optional[torch.Tensor], normalized: bool) -> torch.Tensor:
+    """Perform STFT and convert to magnitude spectrogram.
+    Args:
+        x: Input signal tensor (B, C, T).
+        fft_size (int): FFT size.
+        hop_length (int): Hop size.
+        win_length (int): Window length.
+        window (torch.Tensor or None): Window function type.
+        normalized (bool): Whether to normalize the STFT or not.
+    Returns:
+        torch.Tensor: Magnitude spectrogram (B, C, #frames, fft_size // 2 + 1).
+    """
+    B, C, T = x.shape
+    x_stft = torch.stft(
+        x.view(-1, T), fft_size, hop_length, win_length, window,
+        normalized=normalized, return_complex=True,
+    )
+    x_stft = x_stft.view(B, C, *x_stft.shape[1:])
+    real = x_stft.real
+    imag = x_stft.imag
+    # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+    return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
class SpectralConvergenceLoss(nn.Module):
    """Spectral convergence loss.

    Args:
        epsilon (float): Epsilon value for numerical stability.
    """

    def __init__(self, epsilon: float = torch.finfo(torch.float32).eps):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, x_mag: torch.Tensor, y_mag: torch.Tensor):
        """Calculate forward propagation.

        Args:
            x_mag: Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag: Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns:
            torch.Tensor: Spectral convergence loss value, i.e. the Frobenius norm of the
                error divided by the Frobenius norm of the groundtruth.
        """
        error_norm = torch.norm(y_mag - x_mag, p="fro")
        reference_norm = torch.norm(y_mag, p="fro")
        return error_norm / (reference_norm + self.epsilon)
class LogSTFTMagnitudeLoss(nn.Module):
    """Log STFT magnitude loss.

    Args:
        epsilon (float): Epsilon value for numerical stability.
    """

    def __init__(self, epsilon: float = torch.finfo(torch.float32).eps):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, x_mag: torch.Tensor, y_mag: torch.Tensor):
        """Calculate forward propagation.

        Args:
            x_mag (torch.Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (torch.Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns:
            torch.Tensor: Log STFT magnitude loss value, i.e. the mean absolute difference
                between the log magnitudes.
        """
        # Epsilon is added before the log so that a zero magnitude stays finite.
        log_reference = torch.log(self.epsilon + y_mag)
        log_estimate = torch.log(self.epsilon + x_mag)
        return F.l1_loss(log_reference, log_estimate)
class STFTLosses(nn.Module):
    """STFT losses.

    Computes, for one STFT resolution, both the spectral convergence loss and the
    log STFT magnitude loss.

    Args:
        n_fft (int): Size of FFT.
        hop_length (int): Hop length.
        win_length (int): Window length.
        window (str): Window function type, given as the name of a `torch` function
            (e.g. "hann_window").
        normalized (bool): Whether to use normalized STFT or not.
        epsilon (float): Epsilon for numerical stability.
    """

    def __init__(self, n_fft: int = 1024, hop_length: int = 120, win_length: int = 600,
                 window: str = "hann_window", normalized: bool = False,
                 epsilon: float = torch.finfo(torch.float32).eps):
        super().__init__()
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.normalized = normalized
        # Registered as a buffer so that the window is moved along with the module.
        window_fn = getattr(torch, window)
        self.register_buffer("window", window_fn(win_length))
        self.spectral_convergenge_loss = SpectralConvergenceLoss(epsilon)
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss(epsilon)

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
        """Calculate forward propagation.

        Args:
            x (torch.Tensor): Predicted signal (B, C, T).
            y (torch.Tensor): Groundtruth signal (B, C, T).
        Returns:
            torch.Tensor: Spectral convergence loss value.
            torch.Tensor: Log STFT magnitude loss value.
        """
        stft_args = (self.n_fft, self.hop_length, self.win_length, self.window, self.normalized)
        predicted_mag = _stft(x, *stft_args)  # type: ignore
        reference_mag = _stft(y, *stft_args)  # type: ignore
        sc_loss = self.spectral_convergenge_loss(predicted_mag, reference_mag)
        mag_loss = self.log_stft_magnitude_loss(predicted_mag, reference_mag)
        return sc_loss, mag_loss
class STFTLoss(nn.Module):
    """Single Resolution STFT loss.

    Args:
        n_fft (int): Nb of FFT.
        hop_length (int): Hop length.
        win_length (int): Window length.
        window (str): Window function type.
        normalized (bool): Whether to use normalized STFT or not.
        epsilon (float): Epsilon for numerical stability.
        factor_sc (float): Coefficient for the spectral loss.
        factor_mag (float): Coefficient for the magnitude loss.
    """

    def __init__(self, n_fft: int = 1024, hop_length: int = 120, win_length: int = 600,
                 window: str = "hann_window", normalized: bool = False,
                 factor_sc: float = 0.1, factor_mag: float = 0.1,
                 epsilon: float = torch.finfo(torch.float32).eps):
        super().__init__()
        self.loss = STFTLosses(n_fft, hop_length, win_length, window, normalized, epsilon)
        self.factor_sc = factor_sc
        self.factor_mag = factor_mag

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Calculate forward propagation.

        Args:
            x (torch.Tensor): Predicted signal (B, C, T).
            y (torch.Tensor): Groundtruth signal (B, C, T).
        Returns:
            torch.Tensor: Single resolution STFT loss, i.e. the weighted sum
                `factor_sc * spectral_convergence + factor_mag * log_magnitude`.
        """
        sc_loss, mag_loss = self.loss(x, y)
        return self.factor_sc * sc_loss + self.factor_mag * mag_loss
class MRSTFTLoss(nn.Module):
    """Multi resolution STFT loss.

    Args:
        n_ffts (Sequence[int]): Sequence of FFT sizes.
        hop_lengths (Sequence[int]): Sequence of hop sizes.
        win_lengths (Sequence[int]): Sequence of window lengths.
        window (str): Window function type.
        factor_sc (float): Coefficient for the spectral loss.
        factor_mag (float): Coefficient for the magnitude loss.
        normalized (bool): Whether to use normalized STFT or not.
        epsilon (float): Epsilon for numerical stability.
    """

    # Defaults are tuples rather than lists so that they cannot be mutated between instances.
    def __init__(self, n_ffts: tp.Sequence[int] = (1024, 2048, 512), hop_lengths: tp.Sequence[int] = (120, 240, 50),
                 win_lengths: tp.Sequence[int] = (600, 1200, 240), window: str = "hann_window",
                 factor_sc: float = 0.1, factor_mag: float = 0.1,
                 normalized: bool = False, epsilon: float = torch.finfo(torch.float32).eps):
        super().__init__()
        assert len(n_ffts) == len(hop_lengths) == len(win_lengths)
        self.stft_losses = torch.nn.ModuleList()
        for fs, ss, wl in zip(n_ffts, hop_lengths, win_lengths):
            self.stft_losses.append(STFTLosses(fs, ss, wl, window, normalized, epsilon))
        self.factor_sc = factor_sc
        self.factor_mag = factor_mag

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Calculate forward propagation.

        Args:
            x (torch.Tensor): Predicted signal (B, C, T).
            y (torch.Tensor): Groundtruth signal (B, C, T).
        Returns:
            torch.Tensor: Multi resolution STFT loss, of shape (1,): the weighted sum of
                the spectral convergence and log magnitude losses, each averaged over
                the resolutions.
        """
        # The accumulators live on the device of the input, so that the in-place
        # additions below also work when the signals are not on CPU.
        sc_loss = torch.zeros(1, device=x.device)
        mag_loss = torch.zeros(1, device=x.device)
        for f in self.stft_losses:
            sc_l, mag_l = f(x, y)
            sc_loss += sc_l
            mag_loss += mag_l
        n_resolutions = len(self.stft_losses)
        sc_loss /= n_resolutions
        mag_loss /= n_resolutions
        return self.factor_sc * sc_loss + self.factor_mag * mag_loss