23-04-24 02:08:26.826 - INFO:   name: louise
  model: extensibletrainer
  scale: 1
  gpu_ids: [0]
  start_step: 0
  checkpointing_enabled: True
  fp16: True
  bitsandbytes: True
  gpus: 1
  datasets:[
    train:[
      name: training
      n_workers: 2
      batch_size: 32
      mode: paired_voice_audio
      path: ./training/louise/train.txt
      fetcher_mode: ['lj']
      phase: train
      max_wav_length: 255995
      max_text_length: 200
      sample_rate: 22050
      load_conditioning: True
      num_conditioning_candidates: 2
      conditioning_length: 44000
      use_bpe_tokenizer: True
      tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json
      load_aligned_codes: False
      data_type: img
    ]
    val:[
      name: validation
      n_workers: 2
      batch_size: 8
      mode: paired_voice_audio
      path: ./training/louise/validation.txt
      fetcher_mode: ['lj']
      phase: val
      max_wav_length: 255995
      max_text_length: 200
      sample_rate: 22050
      load_conditioning: True
      num_conditioning_candidates: 2
      conditioning_length: 44000
      use_bpe_tokenizer: True
      tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json
      load_aligned_codes: False
      data_type: img
    ]
  ]
  steps:[
    gpt_train:[
      training: gpt
      loss_log_buffer: 500
      optimizer: adamw
      optimizer_params:[
        lr: 1e-05
        weight_decay: 0.01
        beta1: 0.9
        beta2: 0.96
      ]
      clip_grad_eps: 4
      injectors:[
        paired_to_mel:[
          type: torch_mel_spectrogram
          mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
          in: wav
          out: paired_mel
        ]
        paired_cond_to_mel:[
          type: for_each
          subtype: torch_mel_spectrogram
          mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
          in: conditioning
          out: paired_conditioning_mel
        ]
        to_codes:[
          type: discrete_token
          in: paired_mel
          out: paired_mel_codes
          dvae_config: ./models/tortoise/train_diffusion_vocoder_22k_level.yml
        ]
        paired_fwd_text:[
          type: generator
          generator: gpt
          in: ['paired_conditioning_mel', 'padded_text', 'text_lengths', 'paired_mel_codes', 'wav_lengths']
          out: ['loss_text_ce', 'loss_mel_ce', 'logits']
        ]
      ]
      losses:[
        text_ce:[
          type: direct
          weight: 0.01
          key: loss_text_ce
        ]
        mel_ce:[
          type: direct
          weight: 1
          key: loss_mel_ce
        ]
      ]
    ]
  ]
  networks:[
    gpt:[
      type: generator
      which_model_G: unified_voice2
      kwargs:[
        layers: 30
        model_dim: 1024
        heads: 16
        max_text_tokens: 402
        max_mel_tokens: 604
        max_conditioning_inputs: 2
        mel_length_compression: 1024
        number_text_tokens: 256
        number_mel_codes: 8194
        start_mel_token: 8192
        stop_mel_token: 8193
        start_text_token: 255
        train_solo_embeddings: False
        use_mel_codes_as_input: True
        checkpointing: True
        tortoise_compat: True
      ]
    ]
  ]
  path:[
    strict_load: True
    pretrain_model_gpt: ./models/tortoise/autoregressive.pth
    root: ./
    experiments_root: ./training/louise/finetune
    models: ./training/louise/finetune/models
    training_state: ./training/louise/finetune/training_state
    log: ./training/louise/finetune
    val_images: ./training/louise/finetune/val_images
  ]
  train:[
    niter: 4700
    warmup_iter: -1
    mega_batch_factor: 4
    val_freq: 100
    ema_enabled: False
    default_lr_scheme: MultiStepLR
    gen_lr_steps: [2, 4, 9, 18, 25, 33, 50, 59]
    lr_gamma: 0.5
  ]
  eval:[
    pure: False
    output_state: gen
  ]
  logger:[
    save_checkpoint_freq: 100
    visuals: ['gen', 'mel']
    visual_debug_rate: 900
    is_mel_spectrogram: True
  ]
  is_train: True
  dist: False
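
For reference, the audio-length fields above are in samples; converting them with the configured sample_rate (22050 Hz) gives the clip durations the loader enforces. A minimal sketch of that arithmetic, using only values taken from the config dump:

    sample_rate = 22050          # datasets.*.sample_rate
    max_wav_length = 255995      # samples per training clip
    conditioning_length = 44000  # samples per conditioning candidate

    print(max_wav_length / sample_rate)       # ~11.61 s maximum clip length
    print(conditioning_length / sample_rate)  # ~2.00 s of conditioning audio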

23-04-24 02:08:26.826 - INFO: Random seed: 3594
23-04-24 02:08:27.626 - INFO: Number of training data elements: 293, iters: 10
23-04-24 02:08:27.626 - INFO: Total epochs needed: 470 for iters 4,700
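
The two lines above follow from the dataset and batch settings in the config: 293 elements at batch_size 32 give 10 iterations per epoch (rounding the partial final batch up, as the logged figure implies), and 4,700 total iterations therefore require 470 epochs. A minimal sketch of that arithmetic:

    import math

    dataset_size = 293   # training data elements (logged above)
    batch_size = 32      # datasets.train.batch_size
    total_iters = 4700   # train.niter

    iters_per_epoch = math.ceil(dataset_size / batch_size)    # 10
    epochs_needed = math.ceil(total_iters / iters_per_epoch)  # 470
    print(iters_per_epoch, epochs_needed)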
23-04-24 02:08:52.848 - INFO: Loading model for [./models/tortoise/autoregressive.pth]
23-04-24 02:08:58.715 - INFO: Start training from epoch: 0, iter: 0
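
For context on the learning-rate settings above (lr 1e-05, default_lr_scheme MultiStepLR, gen_lr_steps [2, 4, 9, 18, 25, 33, 50, 59], lr_gamma 0.5): they describe a step-decay schedule that halves the rate at each milestone. A minimal sketch with PyTorch's stock MultiStepLR, assuming the trainer maps these fields onto it directly and counts the milestones in scheduler steps; the dummy parameter and SGD optimizer exist only to drive the scheduler for illustration:

    import torch

    # Placeholder parameter/optimizer purely to illustrate the schedule.
    param = torch.nn.Parameter(torch.zeros(1))
    opt = torch.optim.SGD([param], lr=1e-05)
    sched = torch.optim.lr_scheduler.MultiStepLR(
        opt, milestones=[2, 4, 9, 18, 25, 33, 50, 59], gamma=0.5)

    for _ in range(60):
        opt.step()    # learning rate halves after milestones 2, 4, 9, ...
        sched.step()

    print(opt.param_groups[0]["lr"])  # 1e-05 * 0.5**8 ≈ 3.9e-08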