thegenerativegeneration committed on
Commit
13ece43
1 Parent(s): 09446f0

Upload 6 files

Browse files
Files changed (6) hide show
  1. README.md +68 -0
  2. config.yaml +113 -0
  3. configuration.json +13 -0
  4. emotion2vec_base.pt +3 -0
  5. example/test.wav +0 -0
  6. tokens.txt +9 -0
README.md CHANGED
@@ -1,3 +1,71 @@
1
  ---
 
 
2
  license: apache-2.0
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ frameworks:
3
+ - Pytorch
4
  license: apache-2.0
5
+ tasks:
6
+ - emotion-recognition
7
+
8
  ---
9
+
10
+
11
+ # 安装环境
12
+
13
+ - modelscope>=1.11.1
14
+ - funasr>=1.0.5
15
+
16
+ # 用法
17
+
18
+ ## 基于modelscope进行推理
19
+
20
+ ```python
21
+ from modelscope.pipelines import pipeline
22
+ from modelscope.utils.constant import Tasks
23
+
24
+ inference_pipeline = pipeline(
25
+ task=Tasks.emotion_recognition,
26
+ model="iic/emotion2vec_base", model_revision="v2.0.4")
27
+
28
+ rec_result = inference_pipeline('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav', granularity="utterance", extract_embedding=False)
29
+ print(rec_result)
30
+ ```
31
+
32
+
33
+ ## 基于FunASR进行推理
34
+
35
+ ```python
36
+ from funasr import AutoModel
37
+
38
+ model = AutoModel(model="iic/emotion2vec_base", model_revision="v2.0.4")
39
+
40
+ wav_file = f"{model.model_path}/example/test.wav"
41
+ res = model.generate(wav_file, output_dir="./outputs", granularity="utterance", extract_embedding=False)
42
+ print(res)
43
+ ```
44
+ 注:模型会自动下载
45
+
46
+ 支持输入文件列表,wav.scp(kaldi风格):
47
+ ```
+ cat wav.scp
48
+ wav_name1 wav_path1.wav
49
+ wav_name2 wav_path2.wav
50
+ ...
51
+ ```
52
+
53
+ 输出为情感表征向量,保存在`output_dir`中,格式为numpy格式(可以用np.load()加载)
54
+
55
+ # 说明
56
+
57
+ 本仓库为emotion2vec的modelscope版本,模型参数完全一致。
58
+
59
+ 原始仓库地址: [https://github.com/ddlBoJack/emotion2vec](https://github.com/ddlBoJack/emotion2vec)
60
+
61
+ modelscope版本仓库:[https://github.com/alibaba-damo-academy/FunASR](https://github.com/alibaba-damo-academy/FunASR/tree/funasr1.0/examples/industrial_data_pretraining/emotion2vec)
62
+
63
+ # 相关论文以及引用信息
64
+ ```BibTeX
65
+ @article{ma2023emotion2vec,
66
+ title={emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation},
67
+ author={Ma, Ziyang and Zheng, Zhisheng and Ye, Jiaxin and Li, Jinchao and Gao, Zhifu and Zhang, Shiliang and Chen, Xie},
68
+ journal={arXiv preprint arXiv:2312.15185},
69
+ year={2023}
70
+ }
71
+ ```
config.yaml ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # network architecture
3
+ model: Emotion2vec
4
+ model_conf:
5
+ loss_beta: 0.0
6
+ loss_scale: null
7
+ depth: 8
8
+ start_drop_path_rate: 0.0
9
+ end_drop_path_rate: 0.0
10
+ num_heads: 12
11
+ norm_eps: 1e-05
12
+ norm_affine: true
13
+ encoder_dropout: 0.1
14
+ post_mlp_drop: 0.1
15
+ attention_dropout: 0.1
16
+ activation_dropout: 0.0
17
+ dropout_input: 0.0
18
+ layerdrop: 0.05
19
+ embed_dim: 768
20
+ mlp_ratio: 4.0
21
+ layer_norm_first: false
22
+ average_top_k_layers: 8
23
+ end_of_block_targets: false
24
+ clone_batch: 8
25
+ layer_norm_target_layer: false
26
+ batch_norm_target_layer: false
27
+ instance_norm_target_layer: true
28
+ instance_norm_targets: false
29
+ layer_norm_targets: false
30
+ ema_decay: 0.999
31
+ ema_same_dtype: true
32
+ log_norms: true
33
+ ema_end_decay: 0.99999
34
+ ema_anneal_end_step: 20000
35
+ ema_encoder_only: false
36
+ max_update: 100000
37
+ extractor_mode: layer_norm
38
+ shared_decoder: null
39
+ min_target_var: 0.1
40
+ min_pred_var: 0.01
41
+ supported_modality: AUDIO
42
+ mae_init: false
43
+ seed: 1
44
+ skip_ema: false
45
+ cls_loss: 1.0
46
+ recon_loss: 0.0
47
+ d2v_loss: 1.0
48
+ decoder_group: false
49
+ adversarial_training: false
50
+ adversarial_hidden_dim: 128
51
+ adversarial_weight: 0.1
52
+ cls_type: chunk
53
+ normalize: true
54
+ project_dim:
55
+
56
+ modalities:
57
+ audio:
58
+ type: AUDIO
59
+ prenet_depth: 4
60
+ prenet_layerdrop: 0.05
61
+ prenet_dropout: 0.1
62
+ start_drop_path_rate: 0.0
63
+ end_drop_path_rate: 0.0
64
+ num_extra_tokens: 10
65
+ init_extra_token_zero: true
66
+ mask_noise_std: 0.01
67
+ mask_prob_min: null
68
+ mask_prob: 0.5
69
+ inverse_mask: false
70
+ mask_prob_adjust: 0.05
71
+ keep_masked_pct: 0.0
72
+ mask_length: 5
73
+ add_masks: false
74
+ remove_masks: false
75
+ mask_dropout: 0.0
76
+ encoder_zero_mask: true
77
+ mask_channel_prob: 0.0
78
+ mask_channel_length: 64
79
+ ema_local_encoder: false
80
+ local_grad_mult: 1.0
81
+ use_alibi_encoder: true
82
+ alibi_scale: 1.0
83
+ learned_alibi: false
84
+ alibi_max_pos: null
85
+ learned_alibi_scale: true
86
+ learned_alibi_scale_per_head: true
87
+ learned_alibi_scale_per_layer: false
88
+ num_alibi_heads: 12
89
+ model_depth: 8
90
+ decoder:
91
+ decoder_dim: 384
92
+ decoder_groups: 16
93
+ decoder_kernel: 7
94
+ decoder_layers: 4
95
+ input_dropout: 0.1
96
+ add_positions_masked: false
97
+ add_positions_all: false
98
+ decoder_residual: true
99
+ projection_layers: 1
100
+ projection_ratio: 2.0
101
+ extractor_mode: layer_norm
102
+ feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
103
+ conv_pos_width: 95
104
+ conv_pos_groups: 16
105
+ conv_pos_depth: 5
106
+ conv_pos_pre_ln: false
107
+
108
+ tokenizer: CharTokenizer
109
+ tokenizer_conf:
110
+ unk_symbol: <unk>
111
+ split_with_space: true
112
+
113
+
configuration.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "pytorch",
3
+ "task" : "emotion-recognition",
4
+ "pipeline": {"type":"funasr-pipeline"},
5
+ "model": {"type" : "funasr"},
6
+ "file_path_metas": {
7
+ "init_param":"emotion2vec_base.pt",
8
+ "tokenizer_conf": {"token_list": "tokens.txt"},
9
+ "config":"config.yaml"},
10
+ "model_name_in_hub": {
11
+ "ms":"iic/emotion2vec_base",
12
+ "hf":""}
13
+ }
emotion2vec_base.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a6169355e0611d690f165f67901ce90fa905ee86a236ee54f4a14baf7d1689e
3
+ size 1123130820
example/test.wav ADDED
Binary file (321 kB). View file
 
tokens.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ 生气/angry
2
+ 厌恶/disgusted
3
+ 恐惧/fearful
4
+ 开心/happy
5
+ 中立/neutral
6
+ 其他/other
7
+ 难过/sad
8
+ 吃惊/surprised
9
+ <unk>