emofs2-base / config.json
hahunavth's picture
Upload model
d7ff923
{
"_name_or_path": "hahunavth/emofs2-base",
"architectures": [
"ESSModelForPretraining"
],
"freeze": [],
"model_config": {
"conformer": {
"attention_dropout_p": 0.2,
"conv_dropout_p": 0.2,
"conv_expansion_factor": 2,
"conv_kernel_size": 7,
"decoder_dim": 256,
"encoder_dim": 256,
"feed_forward_dropout_p": 0.2,
"feed_forward_expansion_factor": 4,
"half_step_residual": true,
"num_attention_heads": 2,
"num_decode_layers": 6,
"num_encode_layers": 4
},
"max_seq_len": 1000,
"mode": "train",
"num_emotion": 5,
"reference_encoder": {
"dropout": 0.2,
"encoder_dim": 128
},
"variance_embedding": {
"energy_quantization": "linear",
"n_bins": 256,
"pitch_quantization": "linear"
},
"variance_predictor": {
"dropout": 0.5,
"filter_size": 256,
"kernel_size": 3
},
"vocoder": {
"model": "HiFi-GAN",
"speaker": "tth"
}
},
"model_type": "emofs2",
"preprocess_config": {
"dataset": "vlsp2023emo",
"emotion2id": {
"angry": 3,
"happy": 1,
"neutral": 0,
"sad": 2,
"surprise": 4
},
"id2emotion": {
"0": "neutral",
"1": "happy",
"2": "sad",
"3": "angry",
"4": "surprise"
},
"path": {
"corpus_path": "./data/pretrained_tts_dataset/tuyendv.dict",
"lexicon_path": "../datasets/ess-vlsp2023-lexicon/lexicon.dict",
"preprocessed_path": "../datasets/ess-vlsp2023-emo-processed-phoneme-level",
"raw_path": "./data/pretrained_tts_dataset_raw"
},
"preprocessing": {
"audio": {
"max_wav_value": 32768.0,
"sampling_rate": 22050
},
"energy": {
"feature": "phoneme_level",
"normalization": true
},
"mel": {
"mel_fmax": 8000,
"mel_fmin": 0,
"n_mel_channels": 80
},
"pitch": {
"feature": "phoneme_level",
"normalization": true
},
"stft": {
"filter_length": 1024,
"hop_length": 256,
"win_length": 1024
},
"text": {
"language": "en",
"text_cleaners": []
},
"val_size": 512
},
"smoothing_label": 0.1
},
"torch_dtype": "float32",
"transformers_version": "4.35.2"
}