# @package __global__

# This is the training loop solver
# for the base AudioGen model (text-to-sound)
# on monophonic audio sampled at 16 kHz
# using a similar EnCodec+LM setup to MusicGen
defaults:
  - audiogen/default
  - /model: lm/audiogen_lm
  - override /dset: audio/default
  - _self_

autocast: true
autocast_dtype: float16

# EnCodec large trained on mono-channel music audio sampled at 16 kHz
# with a total stride of 320, leading to 50 frames/s.
# rvq.n_q=4, rvq.bins=2048, no quantization dropout
# (transformer_lm card and n_q must be compatible)
compression_model_checkpoint: //reference/bd44a852/checkpoint.th

channels: 1
sample_rate: 16000
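
# Note (illustrative arithmetic, not part of the original config): at 16 kHz with a
# total stride of 320, the codec produces 16000 / 320 = 50 frames/s; with rvq.n_q=4
# codebooks, a 10 s training segment therefore spans about 500 positions per codebook,
# i.e. roughly 2000 discrete tokens for the LM to model.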

deadlock:
  use: true  # deadlock detection

dataset:
  batch_size: 128  # matching AudioGen paper setup (256 * mix_p=0.5 = 128)
  num_workers: 10
  segment_duration: 10
  min_segment_ratio: 1.0
  sample_on_weight: false  # Uniform sampling all the way
  sample_on_duration: false  # Uniform sampling all the way
  external_metadata_source: null
  # sample mixing augmentation at train time
  train:
    batch_size: 256  # matching AudioGen paper setup
    aug_p: 0.5  # perform audio mixing 50% of the time
    mix_p: 0.5  # proportion of batch items mixed together
    # important: note that this will reduce the
    # actual batch size used at train time,
    # which will be equal to mix_p * batch_size
    mix_snr_low: -5
    mix_snr_high: 5
    mix_min_overlap: 0.5
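
# Note (worked example plus assumptions not stated in the original config): the loader
# above draws 256 segments per batch; mixing is applied with probability aug_p=0.5, and
# when it is, segments are combined in pairs, so the effective train batch size becomes
# mix_p * 256 = 128, matching dataset.batch_size. mix_snr_low/mix_snr_high presumably
# bound the mixing signal-to-noise ratio in dB (here -5 to 5 dB), and mix_min_overlap
# the minimum fraction of temporal overlap between the two mixed segments.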

generate:
  lm:
    use_sampling: true
    top_k: 250
    top_p: 0.0
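
# Note (assumption about the sampling convention, not stated here): top_p: 0.0 is
# commonly treated as "nucleus sampling disabled", so generation would fall back to
# plain top-k sampling with k=250.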

optim:
  epochs: 100
  optimizer: adamw
  lr: 5e-4
  ema:
    use: true
    updates: 10
    device: cuda
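
# Note (assumption about the EMA fields above, not stated in this file): updates: 10 is
# typically read as refreshing the exponential moving average of the weights every 10
# optimizer steps, with the EMA copy kept on the CUDA device.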

logging:
  log_tensorboard: true

schedule:
  lr_scheduler: inverse_sqrt
  inverse_sqrt:
    warmup: 3000
    warmup_init_lr: 0.0
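
# Note (sketch of a common inverse-sqrt formulation; the scheduler's exact form is an
# assumption): the learning rate ramps linearly from warmup_init_lr=0.0 to lr=5e-4 over
# the first 3000 updates, then decays as 1/sqrt(step), e.g.
#   lr_t = 5e-4 * sqrt(3000 / t)   for t > 3000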