# @package __global__

# This is the training loop solver
# for the base AudioGen model (text-to-sound)
# on monophonic audio sampled at 16 kHz
# using a similar EnCodec+LM setup to MusicGen.
#
# NOTE(review): keep the package directive above alone on the very first
# line of the file; the config loader is expected to read it from the
# leading comment header - confirm before adding anything ahead of it.
defaults:
  - audiogen/default
  - /model: lm/audiogen_lm
  - override /dset: audio/default
  - _self_

autocast: true
autocast_dtype: float16

# EnCodec large trained on mono-channel music audio sampled at 16 kHz
# with a total stride of 320 leading to 50 frames/s.
# rvq.n_q=4, rvq.bins=2048, no quantization dropout
# (transformer_lm card and n_q must be compatible)
compression_model_checkpoint: //reference/bd44a852/checkpoint.th

# Mono audio at 16 kHz, matching the compression model described above.
channels: 1
sample_rate: 16000

deadlock:
  use: true  # deadlock detection

dataset:
  batch_size: 128  # matching AudioGen paper setup (256 * mix_p=0.5 = 128)
  num_workers: 10
  segment_duration: 10
  min_segment_ratio: 1.0
  sample_on_weight: false  # Uniform sampling all the way
  sample_on_duration: false  # Uniform sampling all the way
  external_metadata_source: null
  # sample mixing augmentation at train time
  train:
    batch_size: 256  # matching AudioGen paper setup
    aug_p: 0.5  # perform audio mixing 50% of the time
    # mix_p: proportion of batch items mixed together.
    # Important: note that this will reduce the
    # actual batch size used at train time,
    # which will be equal to mix_p * batch_size.
    mix_p: 0.5
    mix_snr_low: -5
    mix_snr_high: 5
    mix_min_overlap: 0.5

generate:
  lm:
    use_sampling: true
    top_k: 250
    # NOTE(review): presumably 0.0 disables top-p so that only top_k
    # applies - confirm against the LM generation code.
    top_p: 0.0

optim:
  epochs: 100
  optimizer: adamw
  lr: 5e-4
  ema:
    use: true
    updates: 10
    device: cuda

logging:
  log_tensorboard: true

schedule:
  lr_scheduler: inverse_sqrt
  # Settings for the scheduler selected by lr_scheduler above.
  inverse_sqrt:
    warmup: 3000
    warmup_init_lr: 0.0