{
"base_config": "config/ns2.json",
"model_type": "NaturalSpeech2",
"dataset": [
"libritts"
],
"preprocess": {
"use_mel": false,
"use_code": true,
"use_spkid": true,
"use_pitch": true,
"use_duration": true,
"use_phone": true,
"use_len": true,
"use_cross_reference": true,
"train_file": "train.json",
"valid_file": "test.json",
"melspec_dir": "mel",
"code_dir": "code",
"pitch_dir": "pitch",
"duration_dir": "duration",
"metadata_dir": "metadata",
"read_metadata": true,
"clip_mode": "start"
},
"model": {
"latent_dim": 128,
"prior_encoder": {
"vocab_size": 100,
"pitch_min": 50,
"pitch_max": 1100,
"pitch_bins_num": 512,
"encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": true
},
"duration_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 3,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
},
"pitch_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 5,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
}
},
"diffusion": {
"wavenet": {
"input_size": 128,
"hidden_size": 512,
"out_size": 128,
"num_layers": 40,
"cross_attn_per_layer": 3,
"dilation_cycle": 2,
"attn_head": 8,
"drop_out": 0.2
},
"beta_min": 0.05,
"beta_max": 20,
"sigma": 1.0,
"noise_factor": 1.0,
"ode_solver": "euler",
"diffusion_type": "diffusion"
},
"prompt_encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": false
},
"query_emb": {
"query_token_num": 32,
"hidden_size": 512,
"head_num": 8
},
"inference_step": 500
},
"train": {
"use_dynamic_batchsize": true,
"max_tokens": 7500,
"max_sentences": 32,
"lr_warmup_steps": 5000,
"lr_scheduler": "cosine",
"num_train_steps": 800000,
"adam": {
"lr": 7.5e-5
},
"diff_ce_loss_lambda": 0.5,
"diff_noise_loss_lambda": 1.0,
"ddp": false,
"random_seed": 114,
"batch_size": 32,
"epochs": 5000,
"max_steps": 1000000,
"total_training_steps": 800000,
"save_summary_steps": 500,
"save_checkpoints_steps": 2000,
"valid_interval": 2000,
"keep_checkpoint_max": 100
}
}