xmj2002 committed on
Commit
6a0baf1
1 Parent(s): 326941c

Upload README.md

Browse files
Files changed (1) hide show
  1. README.md +143 -0
README.md ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # hubert-base-ch-speech-emotion-recognition
5
+ This model uses [TencentGameMate/chinese-hubert-base](https://huggingface.co/TencentGameMate/chinese-hubert-base) as the pre-trained model and is fine-tuned on the CASIA dataset.
6
+
7
+ The CASIA dataset provides 1200 recordings of actors performing 6 different emotions in Chinese (the official website provides a total of 9600 recordings, so the data set I used may not be complete). The emotions are:
8
+
9
+ ```python
10
+ emotions = ['anger', 'fear', 'happy', 'neutral', 'sad', 'surprise']
11
+ ```
12
+
13
+ # Usage
14
+ ```python
15
+ import os
16
+ import random
17
+
18
+ import librosa
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from transformers import AutoConfig, Wav2Vec2FeatureExtractor, HubertPreTrainedModel, HubertModel
23
+
24
+ model_name_or_path = "xmj2002/hubert-base-ch-speech-emotion-recognition"
25
+ duration = 6
26
+ sample_rate = 16000
27
+
28
+ config = AutoConfig.from_pretrained(
29
+ pretrained_model_name_or_path=model_name_or_path,
30
+ )
31
+
32
+
33
+ def id2class(id):
34
+ if id == 0:
35
+ return "angry"
36
+ elif id == 1:
37
+ return "fear"
38
+ elif id == 2:
39
+ return "happy"
40
+ elif id == 3:
41
+ return "neutral"
42
+ elif id == 4:
43
+ return "sad"
44
+ else:
45
+ return "surprise"
46
+
47
+
48
+ def predict(path, processor, model):
49
+ speech, sr = librosa.load(path=path, sr=sample_rate)
50
+ speech = processor(speech, padding="max_length", truncation=True, max_length=duration * sr,
51
+ return_tensors="pt", sampling_rate=sr).input_values
52
+ with torch.no_grad():
53
+ logit = model(speech)
54
+ score = F.softmax(logit, dim=1).detach().cpu().numpy()[0]
55
+ id = torch.argmax(logit).cpu().numpy()
56
+ print(f"file path: {path} \t predict: {id2class(id)} \t score:{score[id]} ")
57
+
58
+
59
+ class HubertClassificationHead(nn.Module):
60
+ def __init__(self, config):
61
+ super().__init__()
62
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
63
+ self.dropout = nn.Dropout(config.classifier_dropout)
64
+ self.out_proj = nn.Linear(config.hidden_size, config.num_class)
65
+
66
+ def forward(self, x):
67
+ x = self.dense(x)
68
+ x = torch.tanh(x)
69
+ x = self.dropout(x)
70
+ x = self.out_proj(x)
71
+ return x
72
+
73
+
74
+ class HubertForSpeechClassification(HubertPreTrainedModel):
75
+ def __init__(self, config):
76
+ super().__init__(config)
77
+ self.hubert = HubertModel(config)
78
+ self.classifier = HubertClassificationHead(config)
79
+ self.init_weights()
80
+
81
+ def forward(self, x):
82
+ outputs = self.hubert(x)
83
+ hidden_states = outputs[0]
84
+ x = torch.mean(hidden_states, dim=1)
85
+ x = self.classifier(x)
86
+ return x
87
+
88
+
89
+ processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
90
+ model = HubertForSpeechClassification.from_pretrained(
91
+ model_name_or_path,
92
+ config=config,
93
+ )
94
+ model.eval()
95
+
96
+ file_path = [f"test_data/{path}" for path in os.listdir("test_data")]
97
+ path = random.sample(file_path, 1)[0]
98
+ predict(path, processor, model)
99
+
100
+ ```
101
+ # Training setting
102
+
103
+ * Data set split ratio: training set : validation set : test set = 0.6 : 0.2 : 0.2
104
+
105
+ * seed: 34
106
+
107
+ * batch_size: 36
108
+
109
+ * lr: 2e-4
110
+
111
+ * optimizer: AdamW(betas=(0.93,0.98), weight_decay=0.2)
112
+
113
+ * scheduler: StepLR(step_size=10, gamma=0.3)
114
+
115
+ * classifier dropout: 0.1
116
+
117
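+ * optimizer parameter groups (parameters whose name contains "hubert" use a learning rate of 0.2 × lr; all other parameters use lr):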
118
+
119
+ ```python
120
+ for name, param in model.named_parameters():
121
+ if "hubert" in name:
122
+ parameter.append({'params': param, 'lr': 0.2 * lr})
123
+ else:
124
+ parameter.append({'params': param, "lr": lr})
125
+
126
+ ```
127
+
128
+
129
+
130
+ # Metric
131
+
132
+ **Loss(test set): 0.1165**
133
+
134
+ **Accuracy(test set): 0.972**
135
+
136
+ *Accuracy curve of training set and validation set*
137
+
138
+ <div> <img src="https://huggingface.co/xmj2002/hubert-base-ch-speech-emotion-recognition/resolve/main/accuracy.png" width = 80%/> </div>
139
+
140
+
141
+ *Loss curve of training set and validation set*
142
+
143
+ <div> <img src="https://huggingface.co/xmj2002/hubert-base-ch-speech-emotion-recognition/resolve/main/loss.png" width = 80%/> </div>