File size: 4,085 Bytes
2c7c7ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
name: 'louise'
model: extensibletrainer
scale: 1
gpu_ids: [0] # Manually edit this if the GPU you want to train on is not your primary, as this will set the env var that exposes CUDA devices
start_step: 0
checkpointing_enabled: true
fp16: True
bitsandbytes: True
gpus: 1
datasets:
train:
name: training
n_workers: 2
batch_size: 28
mode: paired_voice_audio
path: ./training/louise/train.txt
fetcher_mode: ['lj']
phase: train
max_wav_length: 255995 # ~11.6 seconds
max_text_length: 200
sample_rate: 22050
load_conditioning: True
num_conditioning_candidates: 2
conditioning_length: 44000
use_bpe_tokenizer: True
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
load_aligned_codes: False
val:
name: validation
n_workers: 2
batch_size: 7
mode: paired_voice_audio
path: ./training/louise/validation.txt
fetcher_mode: ['lj']
phase: val
max_wav_length: 255995
max_text_length: 200
sample_rate: 22050
load_conditioning: True
num_conditioning_candidates: 2
conditioning_length: 44000
use_bpe_tokenizer: True
tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json # ./models/tortoise/bpe_lowercase_asr_256.json
load_aligned_codes: False
steps:
gpt_train:
training: gpt
loss_log_buffer: 500
# Generally follows the recipe from the DALLE paper.
optimizer: adamw # this should be adamw_zero if you're using distributed training
optimizer_params:
lr: !!float 1e-05 # originally: 1e-4
weight_decay: !!float 1e-2
beta1: 0.9
beta2: 0.96
clip_grad_eps: 4
injectors:
paired_to_mel:
type: torch_mel_spectrogram
mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
in: wav
out: paired_mel
paired_cond_to_mel:
type: for_each
subtype: torch_mel_spectrogram
mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth # ./models/tortoise/clips_mel_norms.pth
in: conditioning
out: paired_conditioning_mel
to_codes:
type: discrete_token
in: paired_mel
out: paired_mel_codes
dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
paired_fwd_text:
type: generator
generator: gpt
in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
out: [loss_text_ce, loss_mel_ce, logits]
losses:
text_ce:
type: direct
weight: 0.01
key: loss_text_ce
mel_ce:
type: direct
weight: 1
key: loss_mel_ce
networks:
gpt:
type: generator
which_model_G: unified_voice2
kwargs:
layers: 30 # originally: 8
model_dim: 1024 # originally: 512
heads: 16 # originally: 8
max_text_tokens: 402 # originally: 120
max_mel_tokens: 604 # originally: 250
max_conditioning_inputs: 2 # originally: 1
mel_length_compression: 1024
number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
number_mel_codes: 8194
start_mel_token: 8192
stop_mel_token: 8193
start_text_token: 255
train_solo_embeddings: False # missing in uv3/4
use_mel_codes_as_input: True # ditto
checkpointing: True
tortoise_compat: True
# freeze_everything_but_position_embeddings: True
path:
strict_load: true
# pretrain_model_gpt: './models/tortoise/autoregressive.pth'
resume_state: './training/louise/finetune/training_state//2550.state'
train:
niter: 2750
warmup_iter: -1
mega_batch_factor: 4
val_freq: 100
ema_enabled: false # I really don't think EMA matters
default_lr_scheme: MultiStepLR
gen_lr_steps: [2, 4, 9, 18, 25, 33, 50, 59]
lr_gamma: 0.5
eval:
pure: False
output_state: gen
logger:
save_checkpoint_freq: 100
visuals: [gen, mel]
visual_debug_rate: 1100
is_mel_spectrogram: true
|