unpairedelectron07 committed
Commit 8788873
Parent(s): 652ff96
Upload 7 files

Files changed:
- audiocraft/optim/cosine_lr_scheduler.py +48 -0
- audiocraft/optim/dadam.py +248 -0
- audiocraft/optim/ema.py +85 -0
- audiocraft/optim/fsdp.py +195 -0
- audiocraft/optim/inverse_sqrt_lr_scheduler.py +38 -0
- audiocraft/optim/linear_warmup_lr_scheduler.py +35 -0
- audiocraft/optim/polynomial_decay_lr_scheduler.py +47 -0
audiocraft/optim/cosine_lr_scheduler.py
ADDED
@@ -0,0 +1,48 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math

from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler


class CosineLRScheduler(_LRScheduler):
    """Cosine LR scheduler.

    Args:
        optimizer (Optimizer): Torch optimizer.
        warmup_steps (int): Number of warmup steps.
        total_steps (int): Total number of steps.
        lr_min_ratio (float): Minimum learning rate.
        cycle_length (float): Cycle length.
    """
    def __init__(self, optimizer: Optimizer, total_steps: int, warmup_steps: int,
                 lr_min_ratio: float = 0.0, cycle_length: float = 1.0):
        self.warmup_steps = warmup_steps
        assert self.warmup_steps >= 0
        self.total_steps = total_steps
        assert self.total_steps >= 0
        self.lr_min_ratio = lr_min_ratio
        self.cycle_length = cycle_length
        super().__init__(optimizer)

    def _get_sched_lr(self, lr: float, step: int):
        if step < self.warmup_steps:
            lr_ratio = step / self.warmup_steps
            lr = lr_ratio * lr
        elif step <= self.total_steps:
            s = (step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
            lr_ratio = self.lr_min_ratio + 0.5 * (1 - self.lr_min_ratio) * \
                (1. + math.cos(math.pi * s / self.cycle_length))
            lr = lr_ratio * lr
        else:
            lr_ratio = self.lr_min_ratio
            lr = lr_ratio * lr
        return lr

    def get_lr(self):
        return [self._get_sched_lr(lr, self.last_epoch) for lr in self.base_lrs]
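A minimal usage sketch, not part of this commit, showing how such a scheduler is typically driven from a training loop; the model, optimizer, and step counts below are illustrative assumptions.

# Illustrative only: any optimizer works; the scheduler scales each group's base lr.
import torch
from audiocraft.optim.cosine_lr_scheduler import CosineLRScheduler

model = torch.nn.Linear(16, 16)  # placeholder model (assumption)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = CosineLRScheduler(optimizer, total_steps=1000, warmup_steps=100, lr_min_ratio=0.1)

for step in range(1000):
    optimizer.step()   # forward/backward elided in this sketch
    scheduler.step()   # advances last_epoch and recomputes lr per param group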
audiocraft/optim/dadam.py
ADDED
@@ -0,0 +1,248 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
from typing import Any

import torch
import torch.optim
import torch.distributed as dist


logger = logging.getLogger(__name__)
_params_t = Any


def to_real(x):
    if torch.is_complex(x):
        return x.real
    else:
        return x


class DAdaptAdam(torch.optim.Optimizer):
    """Adam with D-Adaptation automatic step-sizes.
    Leave LR set to 1 unless you encounter instability.

    Args:
        params (iterable):
            Iterable of parameters to optimize or dicts defining parameter groups.
        lr (float):
            Learning rate adjustment parameter. Increases or decreases the D-adapted learning rate.
        betas (tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        momentum (float):
            Momentum value in the range [0,1) (default: 0.9).
        eps (float):
            Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-8).
        weight_decay (float):
            Weight decay, i.e. a L2 penalty (default: 0).
        log_every (int):
            Log using print every k steps, default 0 (no logging).
        decouple (boolean):
            Use AdamW style decoupled weight decay
        d0 (float):
            Initial D estimate for D-adaptation (default 1e-6). Rarely needs changing.
        growth_rate (float):
            prevent the D estimate from growing faster than this multiplicative rate.
            Default is inf, for unrestricted. Values like 1.02 give a kind of learning
            rate warmup effect.
        fsdp_in_use (bool):
            If you're using sharded parameters, this should be set to True. The optimizer
            will attempt to auto-detect this, but if you're using an implementation other
            than PyTorch's builtin version, the auto-detection won't work.
    """
    def __init__(self, params, lr=1.0,
                 betas=(0.9, 0.999),
                 eps=1e-8,
                 weight_decay=0,
                 log_every=0,
                 decouple=True,
                 d0=1e-6,
                 growth_rate=float('inf')):
        if not 0.0 < d0:
            raise ValueError("Invalid d0 value: {}".format(d0))
        if not 0.0 < lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 < eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))

        if decouple:
            logger.info("Using decoupled weight decay")

        from .fsdp import is_fsdp_used
        fsdp_in_use = is_fsdp_used()
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay,
                        d=d0,
                        k=0,
                        gsq_weighted=0.0,
                        log_every=log_every,
                        decouple=decouple,
                        growth_rate=growth_rate,
                        fsdp_in_use=fsdp_in_use)

        super().__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self):
        return False

    @property
    def supports_flat_params(self):
        return True

    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        g_sq = 0.0
        sksq_weighted = 0.0
        sk_l1 = 0.0

        lr = max(group['lr'] for group in self.param_groups)

        group = self.param_groups[0]
        gsq_weighted = group['gsq_weighted']
        d = group['d']
        dlr = d*lr

        growth_rate = group['growth_rate']
        decouple = group['decouple']
        fsdp_in_use = group['fsdp_in_use']
        log_every = group['log_every']

        beta1, beta2 = group['betas']

        for group in self.param_groups:
            group_lr = group['lr']
            decay = group['weight_decay']
            k = group['k']
            eps = group['eps']

            if group_lr not in [lr, 0.0]:
                raise RuntimeError("Setting different lr values in different parameter "
                                   "groups is only supported for values of 0")

            for p in group['params']:
                if p.grad is None:
                    continue
                if hasattr(p, "_fsdp_flattened"):
                    fsdp_in_use = True
                grad = p.grad.data

                # Apply weight decay (coupled variant)
                if decay != 0 and not decouple:
                    grad.add_(p.data, alpha=decay)

                state = self.state[p]

                # State initialization
                if 'step' not in state:
                    state['step'] = 0
                    state['s'] = torch.zeros_like(p.data, memory_format=torch.preserve_format).detach()
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data, memory_format=torch.preserve_format).detach()
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(
                        to_real(p.data), memory_format=torch.preserve_format).detach()

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                grad_grad = to_real(grad * grad.conj())

                # Adam EMA updates
                if group_lr > 0:
                    exp_avg.mul_(beta1).add_(grad, alpha=dlr*(1-beta1))
                    exp_avg_sq.mul_(beta2).add_(grad_grad, alpha=1-beta2)

                    denom = exp_avg_sq.sqrt().add_(eps)

                    g_sq += grad_grad.div_(denom).sum().item()

                    s = state['s']
                    s.mul_(beta2).add_(grad, alpha=dlr*(1-beta2))
                    sksq_weighted += to_real(s * s.conj()).div_(denom).sum().item()
                    sk_l1 += s.abs().sum().item()

        ######

        gsq_weighted = beta2*gsq_weighted + g_sq*(dlr**2)*(1-beta2)
        d_hat = d

        # if we have not done any progress, return
        # if we have any gradients available, will have sk_l1 > 0 (unless \|g\|=0)
        if sk_l1 == 0:
            return loss

        if lr > 0.0:
            if fsdp_in_use:
                dist_tensor = torch.zeros(3, device='cuda')
                dist_tensor[0] = sksq_weighted
                dist_tensor[1] = gsq_weighted
                dist_tensor[2] = sk_l1
                dist.all_reduce(dist_tensor, op=dist.ReduceOp.SUM)
                global_sksq_weighted = dist_tensor[0]
                global_gsq_weighted = dist_tensor[1]
                global_sk_l1 = dist_tensor[2]
            else:
                global_sksq_weighted = sksq_weighted
                global_gsq_weighted = gsq_weighted
                global_sk_l1 = sk_l1

            d_hat = (global_sksq_weighted/(1-beta2) - global_gsq_weighted)/global_sk_l1
            d = max(d, min(d_hat, d*growth_rate))

        if log_every > 0 and k % log_every == 0:
            logger.info(
                f"(k={k}) dlr: {dlr:1.1e} d_hat: {d_hat:1.1e}, d: {d:1.8}. "
                f"sksq_weighted={global_sksq_weighted:1.1e} gsq_weighted={global_gsq_weighted:1.1e} "
                f"sk_l1={global_sk_l1:1.1e}{' (FSDP)' if fsdp_in_use else ''}")

        for group in self.param_groups:
            group['gsq_weighted'] = gsq_weighted
            group['d'] = d

            group_lr = group['lr']
            decay = group['weight_decay']
            k = group['k']
            eps = group['eps']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data

                state = self.state[p]

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                state['step'] += 1

                denom = exp_avg_sq.sqrt().add_(eps)
                denom = denom.type(p.type())

                # Apply weight decay (decoupled variant)
                if decay != 0 and decouple and group_lr > 0:
                    p.data.add_(p.data, alpha=-decay * dlr)

                # Take step
                p.data.addcdiv_(exp_avg, denom, value=-1)

            group['k'] = k + 1

        return loss
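A minimal usage sketch, not part of this commit, assuming the audiocraft package (and its dora dependency, pulled in via the fsdp auto-detection) is importable; per the docstring above, lr is left at 1.0 so D-Adaptation controls the effective step size. The model and data are placeholder assumptions.

# Illustrative only: tiny regression loop driving DAdaptAdam.
import torch
from audiocraft.optim.dadam import DAdaptAdam

model = torch.nn.Linear(8, 1)
opt = DAdaptAdam(model.parameters(), lr=1.0, weight_decay=0.01, log_every=100)

x, y = torch.randn(32, 8), torch.randn(32, 1)
for _ in range(5):
    opt.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    opt.step()  # updates the D estimate, then applies the Adam-style update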
audiocraft/optim/ema.py
ADDED
@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# ModelEMA implementation is taken from
# https://github.com/facebookresearch/demucs

from collections import defaultdict
import typing as tp

import torch
import torch.nn as nn


def _get_all_non_persistent_buffers_set(module: nn.Module, root: str = "") -> set:
    names: set = set()
    for (name, sub_module) in module.named_modules():
        if name == '':
            buffer_names = module._non_persistent_buffers_set
            buffer_names = {f"{root}.{buff_name}" if len(root) > 0 else buff_name
                            for buff_name in buffer_names}
            names.update(buffer_names)
        else:
            sub_name = f"{root}.{name}" if len(root) > 0 else name
            sub_buffer_names = _get_all_non_persistent_buffers_set(sub_module, sub_name)
            names.update(sub_buffer_names)
    return names


def _get_named_tensors(module: nn.Module):
    non_persistent_buffers_set = _get_all_non_persistent_buffers_set(module)
    named_buffers = [(name, buffer) for (name, buffer) in module.named_buffers()
                     if name not in non_persistent_buffers_set]
    named_parameters = list(module.named_parameters())
    return named_parameters + named_buffers


class ModuleDictEMA:
    """Exponential Moving Average over a nn.ModuleDict.

    You can switch to the EMA weights temporarily.
    """
    def __init__(self, module_dict: nn.ModuleDict, decay: float = 0.999,
                 unbias: bool = True, device: tp.Union[torch.device, str] = 'cpu'):
        self.decay = decay
        self.module_dict = module_dict
        self.state: dict = defaultdict(dict)
        self.count = 0
        self.device = device
        self.unbias = unbias
        self._init()

    def _init(self):
        for module_name, module in self.module_dict.items():
            for key, val in _get_named_tensors(module):
                if not val.is_floating_point():
                    continue
                device = self.device or val.device
                if key not in self.state[module_name]:
                    self.state[module_name][key] = val.detach().to(device, copy=True)

    def step(self):
        if self.unbias:
            self.count = self.count * self.decay + 1
            w = 1 / self.count
        else:
            w = 1 - self.decay
        for module_name, module in self.module_dict.items():
            for key, val in _get_named_tensors(module):
                if not val.is_floating_point():
                    continue
                device = self.device or val.device
                self.state[module_name][key].mul_(1 - w)
                self.state[module_name][key].add_(val.detach().to(device), alpha=w)

    def state_dict(self):
        return {'state': self.state, 'count': self.count}

    def load_state_dict(self, state):
        self.count = state['count']
        for module_name, module in state['state'].items():
            for key, val in module.items():
                self.state[module_name][key].copy_(val)
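A minimal usage sketch, not part of this commit; the module names and shapes are illustrative assumptions. The EMA tensors live in ema.state[name] and can be loaded into a copy of the module for evaluation.

# Illustrative only: keep an EMA copy of the weights and load it for evaluation.
import torch.nn as nn
from audiocraft.optim.ema import ModuleDictEMA

modules = nn.ModuleDict({'model': nn.Linear(4, 4)})
ema = ModuleDictEMA(modules, decay=0.999, device='cpu')

# ... after each optimizer update:
ema.step()  # state <- (1 - w) * state + w * current weights, bias-corrected when unbias=True

# Load the averaged weights into a separate copy before evaluation.
eval_model = nn.Linear(4, 4)
eval_model.load_state_dict(ema.state['model'])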
audiocraft/optim/fsdp.py
ADDED
@@ -0,0 +1,195 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""
Wrapper around FSDP for more convenient use in the training loops.
"""

from contextlib import contextmanager
import typing as tp
import dora
import torch

from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import (
    MixedPrecision, ShardingStrategy, FullStateDictConfig, StateDictType)
from torch.distributed._shard.sharded_tensor.api import ShardedTensor


def is_fsdp_used() -> bool:
    """Return whether we are using FSDP."""
    # A bit of a hack but should work from anywhere.
    if dora.is_xp():
        cfg = dora.get_xp().cfg
        if hasattr(cfg, 'fsdp'):
            return cfg.fsdp.use
    return False


def is_sharded_tensor(x: tp.Any) -> bool:
    return isinstance(x, ShardedTensor)


@contextmanager
def switch_to_full_state_dict(models: tp.List[FSDP]):
    # Another bug in FSDP makes it that we cannot use the `state_dict_type` API,
    # so let's do things manually.
    for model in models:
        FSDP.set_state_dict_type(  # type: ignore
            model, StateDictType.FULL_STATE_DICT,
            FullStateDictConfig(offload_to_cpu=True, rank0_only=True))
    try:
        yield
    finally:
        for model in models:
            FSDP.set_state_dict_type(model, StateDictType.LOCAL_STATE_DICT)  # type: ignore


def wrap_with_fsdp(cfg, model: torch.nn.Module,
                   block_classes: tp.Optional[tp.Set[tp.Type]] = None) -> FSDP:
    """Wraps a model with FSDP."""
    # Some of the typing is disabled until this gets integrated
    # into the stable version of PyTorch.
    from torch.distributed.fsdp.wrap import ModuleWrapPolicy  # type: ignore

    # we import this here to prevent circular import.
    from ..modules.transformer import StreamingTransformerLayer
    from ..modules.conditioners import ConditioningProvider

    _fix_post_backward_hook()

    assert cfg.use
    sharding_strategy_dict = {
        "no_shard": ShardingStrategy.NO_SHARD,
        "shard_grad_op": ShardingStrategy.SHARD_GRAD_OP,
        "full_shard": ShardingStrategy.FULL_SHARD,
    }

    dtype_dict = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }

    mixed_precision_config = MixedPrecision(
        param_dtype=dtype_dict[cfg.param_dtype],
        reduce_dtype=dtype_dict[cfg.reduce_dtype],
        buffer_dtype=dtype_dict[cfg.buffer_dtype],
    )

    sharding_strategy_config = sharding_strategy_dict[cfg.sharding_strategy]
    # The following is going to require being a bit smart
    # when doing LM, because this would flush the weights for every time step
    # during generation. One possibility is to use hybrid sharding:
    # See: https://pytorch.org/docs/master/fsdp.html#torch.distributed.fsdp.ShardingStrategy
    assert sharding_strategy_config != ShardingStrategy.FULL_SHARD, \
        "Not supported at the moment, requires a bit more work."

    local_rank = dora.distrib.get_distrib_spec().local_rank
    assert local_rank < torch.cuda.device_count(), "Please upgrade Dora!"

    auto_wrap_policy = None
    if block_classes is None:
        block_classes = {StreamingTransformerLayer, ConditioningProvider}
    if cfg.per_block:
        auto_wrap_policy = ModuleWrapPolicy(block_classes)
    wrapped = _FSDPFixStateDict(
        model,
        sharding_strategy=sharding_strategy_config,
        mixed_precision=mixed_precision_config,
        device_id=local_rank,
        sync_module_states=True,
        use_orig_params=True,
        auto_wrap_policy=auto_wrap_policy,
    )  # type: ignore
    FSDP.set_state_dict_type(wrapped, StateDictType.LOCAL_STATE_DICT)  # type: ignore

    # Let the wrapped model know about the wrapping!
    # We use __dict__ to avoid it going into the state dict.
    # This is a bit dirty, but needed during generation, as otherwise
    # the wrapped model would call itself and bypass FSDP.
    for module in FSDP.fsdp_modules(wrapped):
        original = module._fsdp_wrapped_module
        original.__dict__['_fsdp'] = module
    return wrapped


def purge_fsdp(model: FSDP):
    """Purge the FSDP cached shard inside the model. This should
    allow setting the best state or switching to the EMA.
    """
    from torch.distributed.fsdp._runtime_utils import _reshard  # type: ignore
    for module in FSDP.fsdp_modules(model):
        handles = module._handles
        if not handles:
            continue
        handle = handles[0]
        unsharded_flat_param = handle._get_padded_unsharded_flat_param()
        storage_size: int = unsharded_flat_param._typed_storage()._size()  # type: ignore
        if storage_size == 0:
            continue
        true_list = [True for h in handles]
        _reshard(module, handles, true_list)


class _FSDPFixStateDict(FSDP):
    @staticmethod
    def _name_without_fsdp_prefix(name: str) -> str:
        from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE  # type: ignore
        parts = name.split('.')
        new_parts = [part for part in parts if part != FSDP_WRAPPED_MODULE]
        return '.'.join(new_parts)

    def state_dict(self, *args, **kwargs) -> tp.Dict[str, tp.Any]:  # type: ignore
        state = dict(super().state_dict(*args, **kwargs))
        for key, value in list(state.items()):
            if is_sharded_tensor(value):
                del state[key]
        return state

    def load_state_dict(self, state: tp.Dict[str, tp.Any]):  # type: ignore
        if self._state_dict_type is StateDictType.FULL_STATE_DICT:
            super().load_state_dict(state)
            purge_fsdp(self)
            return
        # Fix FSDP load state dict in all situation.
        # Use this only with LOCAL_STATE_DICT !!!
        current_state = dict(super().state_dict())
        for key, value in state.items():
            key = _FSDPFixStateDict._name_without_fsdp_prefix(key)
            if key not in current_state:
                # Emulate strict loading manually.
                raise RuntimeError(f"Unknown state key {key}")
            current_state[key].copy_(value)

        # Purging cached weights from previous forward.
        purge_fsdp(self)


_hook_fixed = False


def _fix_post_backward_hook():
    global _hook_fixed
    if _hook_fixed:
        return
    _hook_fixed = True

    from torch.distributed.fsdp import _runtime_utils
    from torch.distributed.fsdp._common_utils import TrainingState, HandleTrainingState
    old_hook = _runtime_utils._post_backward_hook

    def _post_backward_hook(state, handle, *args, **kwargs):
        checkpointed = getattr(state._fsdp_wrapped_module, '_audiocraft_checkpointed', False)
        if checkpointed:
            # there will be one more forward in the backward with checkpointing and that will
            # massively confuse FSDP, so we have to make it think everything
            # is going according to the plan.
            state.training_state = TrainingState.FORWARD_BACKWARD
            handle._training_state = HandleTrainingState.BACKWARD_PRE
        old_hook(state, handle, *args, **kwargs)

    _runtime_utils._post_backward_hook = _post_backward_hook
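A hedged sketch, not part of this commit, of exporting a consolidated checkpoint with switch_to_full_state_dict. It assumes an initialized torch.distributed job and an `fsdp_model` previously produced by wrap_with_fsdp; neither is defined here.

# Illustrative only: gather full (unsharded) weights on rank 0 and save them.
import torch
import torch.distributed as dist
from audiocraft.optim.fsdp import switch_to_full_state_dict

with switch_to_full_state_dict([fsdp_model]):  # fsdp_model: hypothetical FSDP-wrapped model
    # Inside the context, state_dict() returns full weights, offloaded to CPU,
    # materialized only on rank 0 per FullStateDictConfig above.
    full_state = fsdp_model.state_dict()
if dist.get_rank() == 0:
    torch.save(full_state, 'consolidated.pt')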
audiocraft/optim/inverse_sqrt_lr_scheduler.py
ADDED
@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import typing as tp

from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler


class InverseSquareRootLRScheduler(_LRScheduler):
    """Inverse square root LR scheduler.

    Args:
        optimizer (Optimizer): Torch optimizer.
        warmup_steps (int): Number of warmup steps.
        warmup_init_lr (tp.Optional[float]): Initial learning rate
            during warmup phase. When not set, use the provided learning rate.
    """
    def __init__(self, optimizer: Optimizer, warmup_steps: int, warmup_init_lr: tp.Optional[float] = 0):
        self.warmup_steps = warmup_steps
        self.warmup_init_lr = warmup_init_lr
        super().__init__(optimizer)

    def _get_sched_lr(self, lr: float, step: int):
        if step < self.warmup_steps:
            warmup_init_lr = self.warmup_init_lr or 0
            lr_step = (lr - warmup_init_lr) / self.warmup_steps
            lr = warmup_init_lr + step * lr_step
        else:
            decay_factor = lr * self.warmup_steps**0.5
            lr = decay_factor * step**-0.5
        return lr

    def get_lr(self):
        return [self._get_sched_lr(base_lr, self._step_count) for base_lr in self.base_lrs]
audiocraft/optim/linear_warmup_lr_scheduler.py
ADDED
@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import typing as tp

from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler


class LinearWarmupLRScheduler(_LRScheduler):
    """Linear warmup LR scheduler.

    Args:
        optimizer (Optimizer): Torch optimizer.
        warmup_steps (int): Number of warmup steps.
        warmup_init_lr (tp.Optional[float]): Initial learning rate
            during warmup phase. When not set, use the provided learning rate.
    """
    def __init__(self, optimizer: Optimizer, warmup_steps: int, warmup_init_lr: tp.Optional[float] = 0):
        self.warmup_steps = warmup_steps
        self.warmup_init_lr = warmup_init_lr
        super().__init__(optimizer)

    def _get_sched_lr(self, lr: float, step: int):
        if step < self.warmup_steps:
            warmup_init_lr = self.warmup_init_lr or 0
            lr_step = (lr - warmup_init_lr) / self.warmup_steps
            lr = warmup_init_lr + step * lr_step
        return lr

    def get_lr(self):
        return [self._get_sched_lr(base_lr, self.last_epoch) for base_lr in self.base_lrs]
audiocraft/optim/polynomial_decay_lr_scheduler.py
ADDED
@@ -0,0 +1,47 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler


class PolynomialDecayLRScheduler(_LRScheduler):
    """Polynomial decay LR scheduler.

    Args:
        optimizer (Optimizer): Torch optimizer.
        warmup_steps (int): Number of warmup steps.
        total_steps (int): Total number of steps.
        end_lr (float): Final learning rate to achieve over total number of steps.
        zero_lr_warmup_steps (int): Number of steps with a learning rate of value 0.
        power (float): Decay exponent.
    """
    def __init__(self, optimizer: Optimizer, warmup_steps: int, total_steps: int,
                 end_lr: float = 0., zero_lr_warmup_steps: int = 0, power: float = 1.):
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.end_lr = end_lr
        self.zero_lr_warmup_steps = zero_lr_warmup_steps
        self.power = power
        super().__init__(optimizer)

    def _get_sched_lr(self, lr: float, step: int):
        if self.zero_lr_warmup_steps > 0 and step <= self.zero_lr_warmup_steps:
            lr = 0
        elif self.warmup_steps > 0 and step <= self.warmup_steps + self.zero_lr_warmup_steps:
            lr_ratio = (step - self.zero_lr_warmup_steps) / float(self.warmup_steps)
            lr = lr_ratio * lr
        elif step >= self.total_steps:
            lr = self.end_lr
        else:
            total_warmup_steps = self.warmup_steps + self.zero_lr_warmup_steps
            lr_range = lr - self.end_lr
            pct_remaining = 1 - (step - total_warmup_steps) / (self.total_steps - total_warmup_steps)
            lr = lr_range * pct_remaining ** self.power + self.end_lr
        return lr

    def get_lr(self):
        return [self._get_sched_lr(base_lr, self.last_epoch) for base_lr in self.base_lrs]
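A minimal sketch, not part of this commit, of the three phases this scheduler produces (zero-lr hold, linear warmup, polynomial decay); the step counts and base lr below are illustrative assumptions.

# Illustrative only: print the lr as it moves through the hold, warmup and decay phases.
import torch
from audiocraft.optim.polynomial_decay_lr_scheduler import PolynomialDecayLRScheduler

opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-3)
sched = PolynomialDecayLRScheduler(opt, warmup_steps=10, total_steps=100,
                                   end_lr=1e-5, zero_lr_warmup_steps=5, power=1.0)
for step in range(100):
    opt.step()   # training step elided
    sched.step()
    if step % 20 == 0:
        print(step, opt.param_groups[0]['lr'])  # 0.0 during the hold, then ramps to 1e-3, then decays toward end_lr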