---
# Top-level run settings for the 'louise' fine-tuning job.
name: 'louise'
model: extensibletrainer
scale: 1
gpu_ids: [0]
start_step: 0
checkpointing_enabled: true
fp16: true
bitsandbytes: true
gpus: 1

# Dataset definitions. `train` and `val` share every setting except
# name, batch_size, path and phase.
datasets:
  train:
    name: training
    n_workers: 2
    batch_size: 28
    mode: paired_voice_audio
    path: ./training/louise/train.txt
    fetcher_mode: ['lj']
    phase: train
    # 255995 / 22050 is roughly 11.6 s, assuming the limit is counted in
    # samples at `sample_rate` (TODO confirm the unit).
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: true
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: true
    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json
    load_aligned_codes: false
  val:
    name: validation
    n_workers: 2
    batch_size: 7
    mode: paired_voice_audio
    path: ./training/louise/validation.txt
    fetcher_mode: ['lj']
    phase: val
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: true
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: true
    tokenizer_vocab: ./modules/tortoise-tts/tortoise/data/tokenizer.json
    load_aligned_codes: false

# Training steps. The single step `gpt_train` names `gpt` as the network
# it trains (see the `networks` section of this file).
steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500

    optimizer: adamw
    optimizer_params:
      # Keep the explicit !!float tags: YAML 1.1 loaders such as PyYAML
      # resolve an exponent literal without a decimal point (1e-05) as a
      # string rather than a float.
      lr: !!float 1e-05
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    # NOTE(review): placed at step level, not under optimizer_params;
    # verify against the trainer's config reader.
    clip_grad_eps: 4

    # Injectors are chained through their in/out keys:
    #   wav -> paired_mel -> paired_mel_codes, and
    #   conditioning -> paired_conditioning_mel;
    # `paired_fwd_text` then consumes those tensors and emits the two
    # loss values referenced under `losses`.
    injectors:
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ./modules/tortoise-tts/tortoise/data/mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "./models/tortoise/train_diffusion_vocoder_22k_level.yml"
      paired_fwd_text:
        type: generator
        generator: gpt
        in:
          - paired_conditioning_mel
          - padded_text
          - text_lengths
          - paired_mel_codes
          - wav_lengths
        out: [loss_text_ce, loss_mel_ce, logits]
    # The mel loss is weighted 100x the text loss (1 vs 0.01).
    losses:
      text_ce:
        type: direct
        weight: 0.01
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

# Network definitions. `gpt` is the model referenced by steps.gpt_train.
networks:
  gpt:
    type: generator
    which_model_G: unified_voice2
    kwargs:
      layers: 30
      model_dim: 1024
      heads: 16
      max_text_tokens: 402
      max_mel_tokens: 604
      # Matches num_conditioning_candidates in the datasets section.
      max_conditioning_inputs: 2
      mel_length_compression: 1024
      number_text_tokens: 256
      # start/stop mel tokens (8192, 8193) are the two highest ids of the
      # 8194-entry code table, assuming 0-based ids.
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: false
      use_mel_codes_as_input: true
      checkpointing: true
      tortoise_compat: true

# Checkpoint loading. Only a resume state is given; no pretrained model
# path is listed here.
path:
  strict_load: true
  # The doubled slash is kept verbatim from the original value.
  # NOTE(review): harmless on POSIX path resolution; confirm before
  # normalising it.
  resume_state: './training/louise/finetune/training_state//2560.state'

# Training-loop schedule.
train:
  niter: 2950
  # NOTE(review): -1 presumably disables warmup; confirm with the trainer.
  warmup_iter: -1
  mega_batch_factor: 4
  val_freq: 100

  ema_enabled: false

  default_lr_scheme: MultiStepLR
  # NOTE(review): every milestone is <= 59 while niter is 2950. If these
  # are iteration counts, the learning rate has been multiplied by
  # lr_gamma eight times (0.5^8 = 1/256) by step 59; confirm that this
  # is the intended schedule.
  gen_lr_steps: [2, 4, 9, 18, 25, 33, 50, 59]
  lr_gamma: 0.5

# Evaluation settings.
eval:
  pure: false
  output_state: gen

# Logging and checkpoint cadence. save_checkpoint_freq equals
# train.val_freq (both 100).
logger:
  save_checkpoint_freq: 100
  visuals: [gen, mel]
  visual_debug_rate: 1100
  is_mel_spectrogram: true