File size: 4,384 Bytes
2c7c7ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
23-04-24 02:08:26.826 - INFO: name: louise
model: extensibletrainer
scale: 1
gpu_ids: [0]
start_step: 0
checkpointing_enabled: True
fp16: True
bitsandbytes: True
gpus: 1
datasets:[
train:[
name: training
n_workers: 2
batch_size: 32
mode: paired_voice_audio
path: ./training/louise/train.txt
fetcher_mode: ['lj']
phase: train
max_wav_length: 255995
max_text_length: 200
sample_rate: 22050
load_conditioning: True
num_conditioning_candidates: 2
conditioning_length: 44000
use_bpe_tokenizer: True
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json
load_aligned_codes: False
data_type: img
]
val:[
name: validation
n_workers: 2
batch_size: 8
mode: paired_voice_audio
path: ./training/louise/validation.txt
fetcher_mode: ['lj']
phase: val
max_wav_length: 255995
max_text_length: 200
sample_rate: 22050
load_conditioning: True
num_conditioning_candidates: 2
conditioning_length: 44000
use_bpe_tokenizer: True
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json
load_aligned_codes: False
data_type: img
]
]
steps:[
gpt_train:[
training: gpt
loss_log_buffer: 500
optimizer: adamw
optimizer_params:[
lr: 1e-05
weight_decay: 0.01
beta1: 0.9
beta2: 0.96
]
clip_grad_eps: 4
injectors:[
paired_to_mel:[
type: torch_mel_spectrogram
mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
in: wav
out: paired_mel
]
paired_cond_to_mel:[
type: for_each
subtype: torch_mel_spectrogram
mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
in: conditioning
out: paired_conditioning_mel
]
to_codes:[
type: discrete_token
in: paired_mel
out: paired_mel_codes
dvae_config: ./models/tortoise/train_diffusion_vocoder_22k_level.yml
]
paired_fwd_text:[
type: generator
generator: gpt
in: ['paired_conditioning_mel', 'padded_text', 'text_lengths', 'paired_mel_codes', 'wav_lengths']
out: ['loss_text_ce', 'loss_mel_ce', 'logits']
]
]
losses:[
text_ce:[
type: direct
weight: 0.01
key: loss_text_ce
]
mel_ce:[
type: direct
weight: 1
key: loss_mel_ce
]
]
]
]
networks:[
gpt:[
type: generator
which_model_G: unified_voice2
kwargs:[
layers: 30
model_dim: 1024
heads: 16
max_text_tokens: 402
max_mel_tokens: 604
max_conditioning_inputs: 2
mel_length_compression: 1024
number_text_tokens: 256
number_mel_codes: 8194
start_mel_token: 8192
stop_mel_token: 8193
start_text_token: 255
train_solo_embeddings: False
use_mel_codes_as_input: True
checkpointing: True
tortoise_compat: True
]
]
]
path:[
strict_load: True
pretrain_model_gpt: ./models/tortoise/autoregressive.pth
root: ./
experiments_root: ./training/louise/finetune
models: ./training/louise/finetune/models
training_state: ./training/louise/finetune/training_state
log: ./training/louise/finetune
val_images: ./training/louise/finetune/val_images
]
train:[
niter: 4700
warmup_iter: -1
mega_batch_factor: 4
val_freq: 100
ema_enabled: False
default_lr_scheme: MultiStepLR
gen_lr_steps: [2, 4, 9, 18, 25, 33, 50, 59]
lr_gamma: 0.5
]
eval:[
pure: False
output_state: gen
]
logger:[
save_checkpoint_freq: 100
visuals: ['gen', 'mel']
visual_debug_rate: 900
is_mel_spectrogram: True
]
is_train: True
dist: False
23-04-24 02:08:26.826 - INFO: Random seed: 3594
23-04-24 02:08:27.626 - INFO: Number of training data elements: 293, iters: 10
23-04-24 02:08:27.626 - INFO: Total epochs needed: 470 for iters 4,700
23-04-24 02:08:52.848 - INFO: Loading model for [./models/tortoise/autoregressive.pth]
23-04-24 02:08:58.715 - INFO: Start training from epoch: 0, iter: 0
|