Spaces:
Runtime error
Runtime error
File size: 4,166 Bytes
271e316 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
{
"framework": "PyTorch",
"task" : "text-to-speech",
"model" : {
"type" : "sambert-hifigan",
"lang_type" : "zhcn",
"sample_rate" : 16000,
"custom_ckpt": {
"voice_name" : "F7",
"am_ckpt" : "basemodel_16k/sambert/ckpt",
"am_config" : "basemodel_16k/sambert/config.yaml",
"voc_ckpt" : "basemodel_16k/hifigan/ckpt",
"voc_config" : "basemodel_16k/hifigan/config.yaml",
"audio_config" : "basemodel_16k/audio_config_se_16k.yaml",
"se_model" : "basemodel_16k/speaker_embedding/se.onnx"
},
"am": {
"am": {
"max_len": 800,
"embedding_dim": 512,
"encoder_num_layers": 8,
"encoder_num_heads": 8,
"encoder_num_units": 128,
"encoder_ffn_inner_dim": 1024,
"encoder_dropout": 0.1,
"encoder_attention_dropout": 0.1,
"encoder_relu_dropout": 0.1,
"encoder_projection_units": 32,
"speaker_units": 512,
"emotion_units": 32,
"predictor_filter_size": 41,
"predictor_fsmn_num_layers": 3,
"predictor_num_memory_units": 128,
"predictor_ffn_inner_dim": 256,
"predictor_dropout": 0.1,
"predictor_shift": 0,
"predictor_lstm_units": 128,
"dur_pred_prenet_units": [128, 128],
"dur_pred_lstm_units": 128,
"decoder_prenet_units": [256, 256],
"decoder_num_layers": 12,
"decoder_num_heads": 8,
"decoder_num_units": 128,
"decoder_ffn_inner_dim": 1024,
"decoder_dropout": 0.1,
"decoder_attention_dropout": 0.1,
"decoder_relu_dropout": 0.1,
"outputs_per_step": 3,
"num_mels": 82,
"postnet_filter_size": 41,
"postnet_fsmn_num_layers": 4,
"postnet_num_memory_units": 256,
"postnet_ffn_inner_dim": 512,
"postnet_dropout": 0.1,
"postnet_shift": 17,
"postnet_lstm_units": 128,
"nsf_f0_global_maximum": 730.0,
"nsf_f0_global_minimum": 30.0,
"nsf_norm_type": "global"
},
"audio": {
"frame_shift_ms": 12.5
},
"linguistic_unit": {
"cleaners": "english_cleaners",
"lfeat_type_list": "sy,tone,syllable_flag,word_segment,emo_category,speaker_category",
"sy": "dict/sy_dict.txt",
"tone": "dict/tone_dict.txt",
"syllable_flag": "dict/syllable_flag_dict.txt",
"word_segment": "dict/word_segment_dict.txt",
"emo_category": "dict/emo_category_dict.txt",
"speaker_category": "dict/speaker_dict.txt"
},
"num_gpus": 1,
"batch_size": 32,
"group_size": 1024,
"learning_rate": 0.001,
"adam_b1": 0.9,
"adam_b2": 0.98,
"seed": 1234,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:11111",
"world_size": 1
}
},
"vocoder" : {
"resblock": "1",
"num_gpus": 1,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,
"bias": true,
"causal": false,
"nsf_params" : {
"nb_harmonics": 7,
"nsf_f0_global_maximum": 730.0,
"nsf_f0_global_minimum": 30.0,
"nsf_norm_type": "global",
"sampling_rate": 16000
},
"upsample_rates": [10,5,2,2],
"upsample_kernel_sizes": [20,11,4,4],
"upsample_initial_channel": 256,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5,7], [1,3,5,7], [1,3,5,7]],
"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 2048,
"hop_size": 200,
"win_size": 1000,
"sampling_rate": 16000,
"fmin": 0,
"fmax": 8000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54312",
"world_size": 1
}
}
},
"train": {
},
"pipeline": {
"type": "sambert-hifigan-tts"
}
}
|