{ "_name_or_path": "hahunavth/emofs2-base", "architectures": [ "ESSModelForPretraining" ], "freeze": [], "model_config": { "conformer": { "attention_dropout_p": 0.2, "conv_dropout_p": 0.2, "conv_expansion_factor": 2, "conv_kernel_size": 7, "decoder_dim": 256, "encoder_dim": 256, "feed_forward_dropout_p": 0.2, "feed_forward_expansion_factor": 4, "half_step_residual": true, "num_attention_heads": 2, "num_decode_layers": 6, "num_encode_layers": 4 }, "max_seq_len": 1000, "mode": "train", "num_emotion": 5, "reference_encoder": { "dropout": 0.2, "encoder_dim": 128 }, "variance_embedding": { "energy_quantization": "linear", "n_bins": 256, "pitch_quantization": "linear" }, "variance_predictor": { "dropout": 0.5, "filter_size": 256, "kernel_size": 3 }, "vocoder": { "model": "HiFi-GAN", "speaker": "tth" } }, "model_type": "emofs2", "preprocess_config": { "dataset": "vlsp2023emo", "emotion2id": { "angry": 3, "happy": 1, "neutral": 0, "sad": 2, "surprise": 4 }, "id2emotion": { "0": "neutral", "1": "happy", "2": "sad", "3": "angry", "4": "surprise" }, "path": { "corpus_path": "./data/pretrained_tts_dataset/tuyendv.dict", "lexicon_path": "../datasets/ess-vlsp2023-lexicon/lexicon.dict", "preprocessed_path": "../datasets/ess-vlsp2023-emo-processed-phoneme-level", "raw_path": "./data/pretrained_tts_dataset_raw" }, "preprocessing": { "audio": { "max_wav_value": 32768.0, "sampling_rate": 22050 }, "energy": { "feature": "phoneme_level", "normalization": true }, "mel": { "mel_fmax": 8000, "mel_fmin": 0, "n_mel_channels": 80 }, "pitch": { "feature": "phoneme_level", "normalization": true }, "stft": { "filter_length": 1024, "hop_length": 256, "win_length": 1024 }, "text": { "language": "en", "text_cleaners": [] }, "val_size": 512 }, "smoothing_label": 0.1 }, "torch_dtype": "float32", "transformers_version": "4.35.2" }