Porjaz committed on
Commit
9c031bd
1 Parent(s): 6256cb3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -214
app.py CHANGED
@@ -1,214 +0,0 @@
1
- # # from transformers import pipeline
2
- # import gradio as gr
3
-
4
- # # inference_model = pipeline(task="audio-classification", model="speechbrain/emotion-recognition-wav2vec2-IEMOCAP")
5
-
6
- # from speechbrain.inference.interfaces import foreign_class
7
-
8
-
9
- # def transcribe_audio(mic=None, file=None):
10
- # if mic is not None:
11
- # audio = mic
12
- # elif file is not None:
13
- # audio = file
14
- # else:
15
- # return "You must either provide a mic recording or a file"
16
- # classifier = foreign_class(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
17
- # out_prob, score, index, text_lab = classifier.classify_file("speechbrain/emotion-recognition-wav2vec2-IEMOCAP/anger.wav")
18
-
19
- # return text_lab
20
-
21
-
22
- # gr.Interface(
23
- # fn=transcribe_audio,
24
- # inputs=[
25
- # # gr.Audio(source="microphone", type="filepath", optional=True),
26
- # # gr.Audio(source="upload", type="filepath", optional=True),
27
- # ],
28
- # outputs="text",
29
- # )
30
-
31
- # if __name__ == "__main__":
32
- # gr.launch(share=True)
33
-
34
-
35
-
36
- import gradio as gr
37
- import torch
38
- from speechbrain.inference.interfaces import Pretrained, foreign_class
39
-
40
-
41
class CustomEncoderWav2vec2Classifier(Pretrained):
    """Utterance-level classifier on top of a self-supervised speech encoder.

    The class can be used either to run only the encoder (``encode_batch()``)
    to extract embeddings, or to run a full classification step
    (``classify_batch()`` / ``classify_file()``).

    The loaded hyperparameters are expected to provide the pieces this class
    calls:

    * ``mods.wav2vec2``  - the encoder applied to the raw waveforms,
    * ``mods.avg_pool``  - pooling called with the encoder output and the
      relative lengths,
    * ``mods.label_lin`` - the classification head,
    * ``hparams.softmax`` - the output activation,
    * ``hparams.label_encoder`` - converts predicted indexes into labels
      through ``decode_torch``.

    Example
    -------
    Loading through ``foreign_class`` and classifying one file::

        classifier = foreign_class(
            source="Porjaz/wavlm-base-emo-fi",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
        )
        out_prob, score, index, text_lab = classifier.classify_file("anger.wav")
    """

    # Human-readable names for the labels handled by ``classify_file``.
    _EMOTION_BY_LABEL = {
        "1": "neutral",
        "2": "sadness",
        "3": "joy",
        "4": "anger",
        "5": "affection",
    }

    def encode_batch(self, wavs, wav_lens=None, normalize=False):
        """Encode the input audio into one embedding vector per utterance.

        The waveforms should already be in the model's desired format.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model. A 1-D tensor is treated as a single
            waveform and promoted to a batch of one.
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Defaults to all ones (every waveform is full length).
        normalize : bool
            Accepted for signature compatibility only; this implementation
            does not read it.

        Returns
        -------
        torch.tensor
            The encoded batch, flattened to [batch, features].
        """
        # Manage single waveforms in input.
        if wavs.dim() == 1:
            wavs = wavs.unsqueeze(0)

        # Assign full length if wav_lens is not given.
        if wav_lens is None:
            wav_lens = torch.ones(wavs.shape[0], device=self.device)

        # Move everything to the model device; the encoder is fed floats.
        wavs = wavs.to(self.device).float()
        wav_lens = wav_lens.to(self.device)

        # Encoder features, pooled with the relative lengths, then flattened
        # so that each utterance is a single vector.
        feats = self.mods.wav2vec2(wavs)
        pooled = self.mods.avg_pool(feats, wav_lens)
        return pooled.view(pooled.shape[0], -1)

    def classify_batch(self, wavs, wav_lens=None):
        """Classify a batch of waveforms on top of the encoded features.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model.
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. Used for ignoring padding.

        Returns
        -------
        out_prob
            Output of ``hparams.softmax`` for each class ([batch, N_class]).
            NOTE(review): whether these are probabilities or log-probabilities
            depends on the activation configured in the hparams - confirm.
        score
            The value of ``out_prob`` for the best class ([batch,]).
        index
            The indexes of the best class ([batch,]).
        text_lab
            Labels produced by the label encoder for ``index``.
        """
        embeddings = self.encode_batch(wavs, wav_lens)
        logits = self.mods.label_lin(embeddings)
        out_prob = self.hparams.softmax(logits)
        score, index = torch.max(out_prob, dim=-1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return out_prob, score, index, text_lab

    def classify_file(self, path):
        """Classify a single audio file and return a readable emotion name.

        Arguments
        ---------
        path : str
            Path to audio file to classify.

        Returns
        -------
        out_prob
            Output of ``hparams.softmax`` for each class.
        score
            The value of ``out_prob`` for the best class.
        index
            The index of the best class.
        text_lab : str
            The emotion name for labels "1".."5" (neutral, sadness, joy,
            anger, affection). Any other decoded label is returned as its
            string form, so the result is always a string.
        """
        waveform = self.load_audio(path)

        # Fake a batch of one full-length utterance.
        batch = waveform.unsqueeze(0)
        rel_length = torch.tensor([1.0])

        embeddings = self.encode_batch(batch, rel_length)
        # squeeze(1) only has an effect when dimension 1 has size 1.
        logits = self.mods.label_lin(embeddings).squeeze(1)
        out_prob = self.hparams.softmax(logits)
        score, index = torch.max(out_prob, dim=-1)

        decoded = self.hparams.label_encoder.decode_torch(index)
        # Look the label up by its string form and fall back to that string,
        # instead of leaking the raw decoded list for unmapped labels.
        label_key = str(decoded[0])
        text_lab = self._EMOTION_BY_LABEL.get(label_key, label_key)

        return out_prob, score, index, text_lab

    def forward(self, wavs, wav_lens=None, normalize=False):
        """Run ``encode_batch`` with the same arguments and return its result."""
        return self.encode_batch(
            wavs=wavs, wav_lens=wav_lens, normalize=normalize
        )
193
-
194
-
195
def return_prediction(mic, file):
    """Return the predicted emotion label for the supplied audio clip.

    Parameters
    ----------
    mic : str or None
        File path of the microphone recording, or ``None`` if nothing was
        recorded.
    file : str or None
        File path of the uploaded audio, or ``None`` if nothing was uploaded.

    Returns
    -------
    str
        The label returned by the classifier, or an explanatory message when
        neither input was provided.
    """
    # Prefer the microphone recording; fall back to the uploaded file.
    audio_path = mic if mic is not None else file
    if audio_path is None:
        return "You must either provide a mic recording or a file"

    # Use the classifier loaded once at module level rather than reloading
    # the model on every request.
    _, _, _, text_lab = classifier.classify_file(audio_path)
    return text_lab
199
-
200
-
201
# Load the emotion classifier once, when the module is imported.
classifier = foreign_class(
    source="Porjaz/wavlm-base-emo-fi",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# Web UI: one microphone input and one upload input, both handed to the
# callback as file paths, with a plain-text output.
gradio_app = gr.Interface(
    fn=return_prediction,
    inputs=[
        gr.Audio(sources=audio_source, type="filepath")
        for audio_source in ("microphone", "upload")
    ],
    outputs="text",
    title="Finnish-Emotion-Recognition",
)

if __name__ == "__main__":
    gradio_app.launch(share=True)