# Stage 1 (s1) GPT training configuration
output_dir: "logs/s1"   # where training logs and checkpoints are written
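# Training loop and checkpointing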
train:
  seed: 1234
  epochs: 15
  batch_size: 8
  save_every_n_epoch: 5        # write a checkpoint every 5 epochs
  precision: 32                # full fp32 training (no mixed precision)
  if_save_latest: true         # keep only the latest full checkpoint
  if_save_every_weights: true  # also export inference weights at each save point
  exp_name: "gpt_training"
  half_weights_save_dir: "weights/s1"  # output directory for the exported (half-precision) weights
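# Weights & Biases experiment tracking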
wandb:
  project: "gpt-sovits-hindi"
  name: "stage1_training"
  entity: null         # null = use the default W&B entity of the logged-in user
  log_interval: 100    # log metrics every 100 training steps
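# Optimizer and learning-rate schedule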
optimizer:
  lr_init: 0.0001      # starting LR (warmed up from this value)
  lr: 0.0004           # peak LR reached after warmup
  lr_end: 0.00001      # floor the LR decays toward
  warmup_steps: 500
  decay_steps: 1000
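# Dataset, feature extraction, and loader settings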
data:
  training_files: "data8"
  max_sec: 60              # maximum utterance length in seconds
  max_frames: 60
  # STFT / mel-spectrogram extraction
  filter_length: 2048      # FFT size
  hop_length: 640
  win_length: 2048
  mel_channels: 128
  mel_fmin: 0.0
  mel_fmax: null           # null = up to the Nyquist frequency (sampling_rate / 2)
  cleaned_text: true
  num_workers: 4           # DataLoader worker processes
  batch_size: 8
  pad_val: 1024            # padding token id for semantic sequences
  # Data paths
  train_semantic_path: "data8/semantic.tsv"
  train_phoneme_path: "data8/phoneme.txt"
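# Model architecture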
model:
  hidden_dim: 768
  embedding_dim: 768
  n_layer: 12              # transformer decoder layers
  head: 12                 # attention heads
  n_embd: 768
  vocab_size: 2048         # semantic token vocabulary size
  block_size: 1000         # maximum context length in tokens
  embd_pdrop: 0.1          # embedding dropout
  resid_pdrop: 0.1         # residual dropout
  attn_pdrop: 0.1          # attention dropout
  semantic_dim: 1024
  num_layers: 6
  ffn_hidden: 3072         # feed-forward inner dimension
  dropout: 0.1
  attention_dropout: 0.1
  hidden_dropout: 0.1
  max_text_positions: 2048
  max_mel_positions: 8000
  prenet_dim: 384
  postnet_dim: 384
  prenet_layers: 3
  postnet_layers: 3
  phoneme_vocab_size: 2048
  EOS: 2047                # end-of-sequence token id (vocab_size - 1)
  pad_val: 1024            # padding token id; matches data.pad_val
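# Usage sketch (assumptions: the GPT-SoVITS stage-1 trainer script name and its
# --config_file flag; adjust both to match your checkout):
#   python GPT_SoVITS/s1_train.py --config_file path/to/this_config.yaml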