#### general settings
# Run-level options. Their exact semantics are defined by the training
# framework that reads this file; notes below are hedged where this file alone
# does not establish the meaning.
# `name` matches the experiment directory used in the commented-out
# `resume_state` path under `path:`.
name: train_voice_voice_clip
# Presumably toggles TensorBoard logging — confirm against the trainer.
use_tb_logger: true
# Single-element list; presumably the GPU device indices to use.
gpu_ids: [0]
start_step: 0
# Presumably the mixed-precision switch; disabled here — confirm.
fp16: false
# NOTE(review): likely refers to gradient/activation checkpointing rather than
# checkpoint saving (save cadence is set by logger.save_checkpoint_freq) —
# confirm.
checkpointing_enabled: true
# Presumably toggles Weights & Biases logging; disabled here — confirm.
wandb: false

# Dataset definitions: a `train` split and a `val` split. Both use
# `mode: unsupervised_audio` and share the same sampling_rate, pad_to_samples,
# min_length and batch_size values; they differ in source paths, cache file,
# worker count and the train-only `exclusions` list.
datasets:
  train:
    name: clips
    n_workers: 4
    batch_size: 512
    mode: unsupervised_audio
    # Source directories, one per entry.
    path:
      - /y/clips
      - /y/bigasr_dataset/libritts/train-clean-100
      - /y/bigasr_dataset/libritts/train-clean-360
      - /y/bigasr_dataset/libritts/train-other-500
      - /y/bigasr_dataset/ljspeech/wavs
    # Text files, presumably listing clips to leave out of training — confirm
    # the expected file format against the dataset loader.
    exclusions:
      - /y/clips/books1-hifreq.txt
      - /y/clips/podcasts-0-hifreq.txt
      - /y/clips/books2-hifreq.txt
      - /y/bigasr_dataset/libritts-hifreq.txt
    cache_path: /y/clips-cache-hifreq.pth
    sampling_rate: 22050
    do_augmentation: false
    # If min_length is also counted in samples (TODO confirm), then at
    # 22050 Hz: 80000 samples ~= 3.63 s and 40000 samples ~= 1.81 s.
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false
  val:
    name: clips_val
    n_workers: 1
    batch_size: 512
    mode: unsupervised_audio
    path:
      - /h/libritts/test-clean
    # Note: the cache file lives inside the dataset directory listed in `path`.
    cache_path: /h/libritts/test-clean/cache.pth
    sampling_rate: 22050
    do_augmentation: false
    pad_to_samples: 80000
    resample_clip: false
    min_length: 40000
    debug_loading_failures: false

# Network definitions. The key `clip` is the name referenced by
# `steps.clip_train.training` and by the `forward` injector's `generator`
# field, so renaming it requires updating both places.
networks:
  clip:
    type: generator
    which_model_G: voice_to_voice_clip
    # Presumably forwarded as keyword arguments to the model constructor —
    # confirm against the model registry.
    kwargs:
      encoder_output: 512

#### path
# Model/state loading options.
path:
  strict_load: true
  # Disabled. Uncomment to point at a saved training state (this example
  # refers to step 56000 of the experiment named in `name`).
  #resume_state: ../experiments/train_voice_voice_clip/training_state/56000.state
  # The `_clip` suffix matches the `clip` key under `networks`; presumably the
  # initial weights for that network — confirm. The value is a bare filename,
  # so it resolves relative to whatever directory the trainer uses as its base.
  pretrain_model_clip: voice_voice_clip.pth

# Training steps. There is a single step, `clip_train`, which names the
# network keyed `clip` as the one being trained.
#
# Data flow declared below:
#   clip --(speech_to_mel)--> speech_mel
#   speech_mel, clip_lengths --(forward, generator `clip`)--> clip_loss
#   clip_loss --(losses.clip_loss_ce, type direct, weight 1)--> training loss
#
# NOTE(review): `clip` is used both as the network key and as the input key of
# `speech_to_mel`; the latter is presumably the audio clip supplied by the
# dataset. `clip_lengths` is not produced by any injector in this file, so it
# is presumably supplied by the dataset as well — confirm.
steps:
  clip_train:
    training: clip
    loss_log_buffer: 250

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw
    optimizer_params:
      # The explicit !!float tag forces a float; some resolvers would
      # otherwise read the dot-less exponent form `1e-4` as a string.
      lr: !!float 1e-4
      weight_decay: 0
      beta1: 0.9
      beta2: 0.99
    clip_grad_eps: 4  # TODO: remove clipping after warmup steps.

    # Keep the injector order: `forward` consumes the `speech_mel` key that
    # `speech_to_mel` writes.
    injectors:
      # Speech only
      speech_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: clip
        out: speech_mel
      forward:
        type: generator
        generator: clip
        in: [speech_mel, clip_lengths]
        out: clip_loss
    losses:
      clip_loss_ce:
        type: direct
        weight: 1
        key: clip_loss

# Training schedule: iteration budget, EMA rate, validation cadence and the
# learning-rate schedule.
train:
  niter: 500000
  # NOTE(review): both `warmup_iter` (-1) and `warmup_steps` (1000) are set in
  # this section; which one the trainer honours is not visible from this file
  # — confirm, and drop the unused key if one is dead.
  warmup_iter: -1
  mega_batch_factor: 1
  # Written with a leading zero so every YAML resolver reads it as a float.
  ema_rate: 0.999
  val_freq: 500

  # MultiStepLR with milestones at 20k/40k/60k and gamma 0.2. Assuming the
  # usual multiplicative decay (TODO confirm), the base lr is scaled by
  # 0.2, 0.04 and 0.008 after each milestone respectively.
  default_lr_scheme: MultiStepLR
  gen_lr_steps: [20000, 40000, 60000]
  lr_gamma: 0.2
  warmup_steps: 1000
  # Disabled override; uncomment to set `force_lr`.
  # force_lr: !!float 4e-5

# Evaluation options.
# NOTE(review): the meaning of `pure` is not established anywhere in this
# file — confirm against the trainer's evaluation code.
eval:
  pure: true

# Logging and checkpoint cadence. The `*_freq` / `*_rate` values are
# presumably counted in training steps — confirm.
logger:
  print_freq: 10
  # Same value as train.val_freq (500).
  save_checkpoint_freq: 500
  # Explicit empty list: no visual keys are configured.
  visuals: []
  is_mel_spectrogram: true
  visual_debug_rate: 100