---
# Training configuration for the `train_voice_voice_clip` experiment.
#
# NOTE(review): this file was restored from a copy whose line breaks had been
# lost (everything sat on two lines, so the leading `#` commented out almost
# the entire document). The nesting below was reconstructed from key order;
# confirm it against the schema expected by the training code.

#### general settings
name: train_voice_voice_clip
use_tb_logger: true
gpu_ids: [0]
start_step: 0
fp16: false
checkpointing_enabled: true
wandb: false

#### datasets
# Both splits use the `unsupervised_audio` mode with identical sampling and
# padding settings; they differ in source paths, worker count and the
# train-only `exclusions` list.
datasets:
  train:
    name: clips
    n_workers: 4
    batch_size: 512
    mode: unsupervised_audio
    path:
      - /y/clips
      - /y/bigasr_dataset/libritts/train-clean-100
      - /y/bigasr_dataset/libritts/train-clean-360
      - /y/bigasr_dataset/libritts/train-other-500
      - /y/bigasr_dataset/ljspeech/wavs
    exclusions:
      - /y/clips/books1-hifreq.txt
      - /y/clips/podcasts-0-hifreq.txt
      - /y/clips/books2-hifreq.txt
      - /y/bigasr_dataset/libritts-hifreq.txt
    cache_path: /y/clips-cache-hifreq.pth
    sampling_rate: 22050
    do_augmentation: false
    # presumably sample counts at `sampling_rate` - TODO confirm units
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false
  val:
    name: clips_val
    n_workers: 1
    batch_size: 512
    mode: unsupervised_audio
    path: [/h/libritts/test-clean]
    cache_path: /h/libritts/test-clean/cache.pth
    sampling_rate: 22050
    do_augmentation: false
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false

#### networks
# A single network, `clip`; it is referenced by name from `path`
# (pretrain_model_clip) and from the `clip_train` step below.
networks:
  clip:
    type: generator
    which_model_G: voice_to_voice_clip
    kwargs:
      encoder_output: 512

#### path
path:
  strict_load: true
  # Commented out: would point at a saved training state to resume from.
  # resume_state: ../experiments/train_voice_voice_clip/training_state/56000.state
  pretrain_model_clip: voice_voice_clip.pth

#### steps
steps:
  clip_train:
    training: clip
    loss_log_buffer: 250

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw
    optimizer_params:
      # The explicit tag forces a float; some YAML 1.1 loaders (e.g. PyYAML)
      # would otherwise read a bare `1e-4` as a string.
      lr: !!float 1e-4
      weight_decay: 0
      beta1: 0.9
      beta2: 0.99
    clip_grad_eps: 4  # TODO: remove clipping after warmup steps.

    # Two injectors chained by key name: `speech_to_mel` reads `clip` and
    # writes `speech_mel`; `forward` feeds `speech_mel` (plus `clip_lengths`)
    # to the `clip` generator and writes `clip_loss`.
    injectors:  # Speech only
      speech_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: clip
        out: speech_mel
      forward:
        type: generator
        generator: clip
        in: [speech_mel, clip_lengths]
        out: clip_loss

    # The only loss reads the `clip_loss` key written by the `forward`
    # injector above.
    losses:
      clip_loss_ce:
        type: direct
        weight: 1
        key: clip_loss

#### train
train:
  niter: 500000
  warmup_iter: -1
  mega_batch_factor: 1
  ema_rate: 0.999
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [20000, 40000, 60000]
  lr_gamma: 0.2
  warmup_steps: 1000
  # force_lr: !!float 4e-5

#### eval
eval:
  pure: true

#### logger
logger:
  print_freq: 10
  save_checkpoint_freq: 500
  visuals: []
  is_mel_spectrogram: true
  visual_debug_rate: 100