{
  "framework": "PyTorch",
  "task" : "text-to-speech",
  "model" : {
    "type" : "sambert-hifigan",
    "lang_type" : "zhcn",
    "sample_rate" : 16000,
    "custom_ckpt": {
      "voice_name" : "F7",
      "am_ckpt" : "basemodel_16k/sambert/ckpt",
      "am_config" : "basemodel_16k/sambert/config.yaml",
      "voc_ckpt" : "basemodel_16k/hifigan/ckpt",
      "voc_config" : "basemodel_16k/hifigan/config.yaml",
      "audio_config" : "basemodel_16k/audio_config_se_16k.yaml",
      "se_model" : "basemodel_16k/speaker_embedding/se.onnx"
    },
    "am": {
      "am": {
        "max_len": 800,
        "embedding_dim": 512,
        "encoder_num_layers": 8,
        "encoder_num_heads": 8,
        "encoder_num_units": 128,
        "encoder_ffn_inner_dim": 1024,
        "encoder_dropout": 0.1,
        "encoder_attention_dropout": 0.1,
        "encoder_relu_dropout": 0.1,
        "encoder_projection_units": 32,
        "speaker_units": 512,
        "emotion_units": 32,
        "predictor_filter_size": 41,
        "predictor_fsmn_num_layers": 3,
        "predictor_num_memory_units": 128,
        "predictor_ffn_inner_dim": 256,
        "predictor_dropout": 0.1,
        "predictor_shift": 0,
        "predictor_lstm_units": 128,
        "dur_pred_prenet_units": [128, 128],
        "dur_pred_lstm_units": 128,
        "decoder_prenet_units": [256, 256],
        "decoder_num_layers": 12,
        "decoder_num_heads": 8,
        "decoder_num_units": 128,
        "decoder_ffn_inner_dim": 1024,
        "decoder_dropout": 0.1,
        "decoder_attention_dropout": 0.1,
        "decoder_relu_dropout": 0.1,
        "outputs_per_step": 3,
        "num_mels": 82,
        "postnet_filter_size": 41,
        "postnet_fsmn_num_layers": 4,
        "postnet_num_memory_units": 256,
        "postnet_ffn_inner_dim": 512,
        "postnet_dropout": 0.1,
        "postnet_shift": 17,
        "postnet_lstm_units": 128,
        "nsf_f0_global_maximum": 730.0,
        "nsf_f0_global_minimum": 30.0,
        "nsf_norm_type": "global"
      },
      "audio": {
        "frame_shift_ms": 12.5
      },
      "linguistic_unit": {
        "cleaners": "english_cleaners",
        "lfeat_type_list": "sy,tone,syllable_flag,word_segment,emo_category,speaker_category",
        "sy": "dict/sy_dict.txt",
        "tone": "dict/tone_dict.txt",
        "syllable_flag": "dict/syllable_flag_dict.txt",
        "word_segment": "dict/word_segment_dict.txt",
        "emo_category": "dict/emo_category_dict.txt",
        "speaker_category": "dict/speaker_dict.txt"
      },
      "num_gpus": 1,
      "batch_size": 32,
      "group_size": 1024,
      "learning_rate": 0.001,
      "adam_b1": 0.9,
      "adam_b2": 0.98,
      "seed": 1234,
      "num_workers": 4,
      "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:11111",
        "world_size": 1
      }
    },
    "vocoder" : {
      "resblock": "1",
      "num_gpus": 1,
      "batch_size": 16,
      "learning_rate": 0.0002,
      "adam_b1": 0.8,
      "adam_b2": 0.99,
      "lr_decay": 0.999,
      "seed": 1234,
      "bias": true,
      "causal": false,
      "nsf_params" : {
        "nb_harmonics": 7,
        "nsf_f0_global_maximum": 730.0,
        "nsf_f0_global_minimum": 30.0,
        "nsf_norm_type": "global",
        "sampling_rate": 16000
      },
      "upsample_rates": [10,5,2,2],
      "upsample_kernel_sizes": [20,11,4,4],
      "upsample_initial_channel": 256,
      "resblock_kernel_sizes": [3,7,11],
      "resblock_dilation_sizes": [[1,3,5,7], [1,3,5,7], [1,3,5,7]],
      "segment_size": 6400,
      "num_mels": 80,
      "num_freq": 1025,
      "n_fft": 2048,
      "hop_size": 200,
      "win_size": 1000,
      "sampling_rate": 16000,
      "fmin": 0,
      "fmax": 8000,
      "fmax_for_loss": null,
      "num_workers": 4,
      "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54312",
        "world_size": 1
      }
    }
  },
  "train": {
  },
  "pipeline": {
    "type": "sambert-hifigan-tts"
  }
}