# tortoise-filtering-models / train_voice_voice_clip.yml
#### general settings
name: train_voice_voice_clip
use_tb_logger: true
gpu_ids: [0]
start_step: 0
fp16: false
checkpointing_enabled: true
wandb: false
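
# Both dataset entries below load raw audio clips (mode: unsupervised_audio). Length
# values are in samples at sampling_rate 22050: pad_to_samples 80000 is ~3.6s per clip,
# and min_length 40000 (~1.8s) presumably drops clips shorter than that.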
datasets:
  train:
    name: clips
    n_workers: 4
    batch_size: 512
    mode: unsupervised_audio
    path: [/y/clips,
           /y/bigasr_dataset/libritts/train-clean-100, /y/bigasr_dataset/libritts/train-clean-360,
           /y/bigasr_dataset/libritts/train-other-500, /y/bigasr_dataset/ljspeech/wavs]
    exclusions: [/y/clips/books1-hifreq.txt, /y/clips/podcasts-0-hifreq.txt,
                 /y/clips/books2-hifreq.txt, /y/bigasr_dataset/libritts-hifreq.txt]
    cache_path: /y/clips-cache-hifreq.pth
    sampling_rate: 22050
    do_augmentation: false
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false
  val:
    name: clips_val
    n_workers: 1
    batch_size: 512
    mode: unsupervised_audio
    path: [/h/libritts/test-clean]
    cache_path: /h/libritts/test-clean/cache.pth
    sampling_rate: 22050
    do_augmentation: false
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false
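
# Network being trained: a voice-to-voice CLIP model (selected in DLAS via
# which_model_G). encoder_output: 512 is presumably the dimensionality of the
# voice embeddings it produces.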
networks:
  clip:
    type: generator
    which_model_G: voice_to_voice_clip
    kwargs:
      encoder_output: 512
#### path
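# pretrain_model_clip loads initial weights for the 'clip' network; uncomment
# resume_state instead to continue a previous run with its optimizer/scheduler state.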
path:
  strict_load: true
  #resume_state: ../experiments/train_voice_voice_clip/training_state/56000.state
  pretrain_model_clip: voice_voice_clip.pth
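
# A single training step optimizes the 'clip' network: each iteration runs the
# injectors in order and then applies the listed losses.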
steps:
  clip_train:
    training: clip
    loss_log_buffer: 250

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw
    optimizer_params:
      lr: !!float 1e-4
      weight_decay: 0
      beta1: 0.9
      beta2: 0.99
    clip_grad_eps: 4  # TODO: remove clipping after warmup steps.
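
    # Pipeline: speech_to_mel turns the input clip into a (normalized) mel spectrogram,
    # then the forward injector runs the clip network on [speech_mel, clip_lengths] and
    # emits clip_loss, which the 'direct' loss below uses as the training objective.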
    injectors:
      # Speech only
      speech_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: clip
        out: speech_mel
      forward:
        type: generator
        generator: clip
        in: [speech_mel, clip_lengths]
        out: clip_loss
    losses:
      clip_loss_ce:
        type: direct
        weight: 1
        key: clip_loss
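
# mega_batch_factor: 1 presumably means no batch chunking / gradient accumulation.
# MultiStepLR scales the LR by lr_gamma (0.2) at each milestone in gen_lr_steps,
# after a 1000-step warmup.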
train:
  niter: 500000
  warmup_iter: -1
  mega_batch_factor: 1
  ema_rate: .999
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [ 20000, 40000, 60000 ]
  lr_gamma: 0.2
  warmup_steps: 1000
  #force_lr: !!float 4e-5
eval:
  pure: true
logger:
  print_freq: 10
  save_checkpoint_freq: 500
  visuals: []
  is_mel_spectrogram: true
  visual_debug_rate: 100
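
# Illustrative launch command (assumes a standard DL Art School checkout):
#   python train.py -opt train_voice_voice_clip.yml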