name: james-finetune
model: extensibletrainer
scale: 1
gpu_ids: [0] # redundant; the way you launch the training script determines which GPUs are actually used
start_step: 0
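# presumably activation/gradient checkpointing: slower steps in exchange for lower VRAM use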
checkpointing_enabled: true 
fp16: false
wandb: false 
use_tb_logger: true

datasets:
  train:
    name: james-train
    n_workers: 2
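    # keep batch_size an integer multiple of train.mega_batch_factor below
    # (which appears to split each batch into gradient-accumulation chunks); 68 / 34 = 2 here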
    batch_size: 68
    mode: paired_voice_audio
    path: ./training/james/train.txt
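    # train.txt is assumed to be LJSpeech-style metadata, i.e. pipe-delimited
    # "audio_path|transcript" lines, matching the 'lj' fetcher_mode below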
    fetcher_mode: ['lj']
    phase: train
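    # 255995 samples ≈ 11.6 s at the 22050 Hz sample rate below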
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
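    # 44000 samples ≈ 2 s of reference audio per conditioning clip at 22050 Hz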
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False
  val: # validation isn't a priority right now, so it just reuses the training list
    name: james-val
    n_workers: 1
    batch_size: 1
    mode: paired_voice_audio
    path: ./training/james/train.txt
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    tokenizer_vocab: ./models/tortoise/bpe_lowercase_asr_256.json
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float 1e-05 # originally: 1e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors:
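      # rough data flow per step: wav -> mel spectrogram -> discrete mel codes
      # (via the DVAE referenced in dvae_config) -> GPT forward pass -> CE losses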
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./models/tortoise/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]      
    losses:
      text_ce:
        type: direct
        weight: 0.01
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce
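    # effective training objective: 0.01 * loss_text_ce + 1 * loss_mel_ce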

networks:
  gpt:
    type: generator 
    which_model_G: unified_voice2 # none of the unified_voice*.py files exactly match the tortoise inference code: unified_voice3/4 add an "alignment_head" parameter (purpose unclear), while unified_voice2 lacks the types=1 parameter.
    kwargs:
      layers: 30 # originally: 8
      model_dim: 1024 # originally: 512
      heads: 16 # originally: 8
      max_text_tokens: 402 # originally: 120
      max_mel_tokens: 604 # originally: 250
      max_conditioning_inputs: 2 # originally: 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files 
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      #types: 1 # missing from unified_voice2; in my analysis, setting it to 1 is equivalent to omitting it
      #only_alignment_head: False  # uv3/4

path:
  # pretrain_model_gpt: './models/tortoise/autoregressive.pth' 
  strict_load: true
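  # resume from the trainer state saved at iteration 500 of a previous run; for a fresh
  # finetune, comment this out and re-enable pretrain_model_gpt above instead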
  resume_state: './training/james-finetune/training_state/500.state'

train:
  niter: 2000
  warmup_iter: -1
  mega_batch_factor: 34
  val_freq: 2000
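  # val_freq equals niter, so validation fires at most once, at the very end of training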

  ema_enabled: false # I really don't think EMA matters

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [9, 18, 25, 33] # originally: [50000, 100000, 140000, 180000]
  lr_gamma: 0.5
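  # MultiStepLR multiplies the LR by lr_gamma at each milestone above, so from iteration 33
  # onward the effective LR is 1e-5 * 0.5^4 ≈ 6.3e-7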

eval:
  output_state: gen
  injectors:
    gen_inj_eval:
      type: generator
      generator: generator
      in: hq
      out: [gen, codebook_commitment_loss]
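  # note: this eval injector looks carried over from a VQVAE-style recipe (hq input,
  # codebook_commitment_loss output) and is unlikely to be exercised with val_freq set as above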

logger: 
  print_freq: 5
  save_checkpoint_freq: 50
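  # saving every 50 of 2000 iterations yields up to 40 checkpoints (model + training state);
  # keep an eye on disk usage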
  visuals: [gen, mel]
  visual_debug_rate: 5
  is_mel_spectrogram: true