Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,214 +0,0 @@
|
|
1 |
-
# # from transformers import pipeline
|
2 |
-
# import gradio as gr
|
3 |
-
|
4 |
-
# # inference_model = pipeline(task="audio-classification", model="speechbrain/emotion-recognition-wav2vec2-IEMOCAP")
|
5 |
-
|
6 |
-
# from speechbrain.inference.interfaces import foreign_class
|
7 |
-
|
8 |
-
|
9 |
-
# def transcribe_audio(mic=None, file=None):
|
10 |
-
# if mic is not None:
|
11 |
-
# audio = mic
|
12 |
-
# elif file is not None:
|
13 |
-
# audio = file
|
14 |
-
# else:
|
15 |
-
# return "You must either provide a mic recording or a file"
|
16 |
-
# classifier = foreign_class(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
|
17 |
-
# out_prob, score, index, text_lab = classifier.classify_file("speechbrain/emotion-recognition-wav2vec2-IEMOCAP/anger.wav")
|
18 |
-
|
19 |
-
# return text_lab
|
20 |
-
|
21 |
-
|
22 |
-
# gr.Interface(
|
23 |
-
# fn=transcribe_audio,
|
24 |
-
# inputs=[
|
25 |
-
# # gr.Audio(source="microphone", type="filepath", optional=True),
|
26 |
-
# # gr.Audio(source="upload", type="filepath", optional=True),
|
27 |
-
# ],
|
28 |
-
# outputs="text",
|
29 |
-
# )
|
30 |
-
|
31 |
-
# if __name__ == "__main__":
|
32 |
-
# gr.launch(share=True)
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
import gradio as gr
|
37 |
-
import torch
|
38 |
-
from speechbrain.inference.interfaces import Pretrained, foreign_class
|
39 |
-
|
40 |
-
|
41 |
-
class CustomEncoderWav2vec2Classifier(Pretrained):
    """Utterance-level classifier on top of a self-supervised speech encoder.

    The interface expects the hyperparameter YAML to provide the modules
    ``wav2vec2``, ``avg_pool`` and ``label_lin`` (reached through
    ``self.mods``) as well as ``softmax`` and ``label_encoder`` (reached
    through ``self.hparams``).

    Two usage modes are available:

    * ``encode_batch()`` runs only the encoder and pooling and returns
      one flattened embedding per utterance;
    * ``classify_batch()`` / ``classify_file()`` additionally run the
      classification layer and decode the winning index to a label.

    Example (mirrors how this file loads the model)::

        classifier = foreign_class(
            source="Porjaz/wavlm-base-emo-fi",
            pymodule_file="custom_interface.py",
            classname="CustomEncoderWav2vec2Classifier",
        )
        out_prob, score, index, text_lab = classifier.classify_file("anger.wav")
    """

    # Human-readable emotion names for the label codes produced by the
    # label encoder; used only by classify_file().
    _EMOTION_BY_CODE = {
        "1": "neutral",
        "2": "sadness",
        "3": "joy",
        "4": "anger",
        "5": "affection",
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def encode_batch(self, wavs, wav_lens=None, normalize=False):
        """Encode waveforms into one flattened embedding per utterance.

        The waveforms should already be in the format the model expects
        (the original documentation states a 16 kHz sample rate).

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms, ``[batch, time]`` or
            ``[batch, time, channels]`` depending on the model. A 1-D
            tensor is treated as a single waveform and gets a batch
            dimension prepended.
        wav_lens : torch.tensor
            Relative lengths of the waveforms, shape ``[batch]``, passed
            to the pooling module. Defaults to all ones (full length)
            when omitted.
        normalize : bool
            Accepted for interface compatibility; this implementation
            never reads it.

        Returns
        -------
        torch.tensor
            The pooled encoder output reshaped to ``[batch, -1]``.
        """
        # A lone waveform becomes a batch of one.
        if wavs.ndim == 1:
            wavs = wavs.unsqueeze(0)

        # Without explicit lengths every utterance counts as full length.
        if wav_lens is None:
            wav_lens = torch.ones(wavs.shape[0], device=self.device)

        # Move both tensors to the model device; the audio is cast to float.
        wavs = wavs.to(self.device).float()
        wav_lens = wav_lens.to(self.device)

        # Encoder features, pooled with the relative lengths, then
        # flattened to one row per utterance.
        features = self.mods.wav2vec2(wavs)
        pooled = self.mods.avg_pool(features, wav_lens)
        return pooled.view(pooled.shape[0], -1)

    def classify_batch(self, wavs, wav_lens=None):
        """Classify a batch of waveforms.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms, see ``encode_batch``.
        wav_lens : torch.tensor
            Relative lengths of the waveforms, see ``encode_batch``.

        Returns
        -------
        out_prob
            Output of ``hparams.softmax`` applied to the classifier
            scores (the original documentation describes these as log
            posteriors; the exact meaning depends on the YAML).
        score
            Maximum of ``out_prob`` over the last dimension.
        index
            Position of that maximum.
        text_lab
            Result of ``hparams.label_encoder.decode_torch(index)``.
        """
        logits = self.mods.label_lin(self.encode_batch(wavs, wav_lens))
        out_prob = self.hparams.softmax(logits)
        score, index = torch.max(out_prob, dim=-1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return out_prob, score, index, text_lab

    def classify_file(self, path):
        """Classify a single audio file.

        Arguments
        ---------
        path : str
            Path of the audio file, loaded with ``self.load_audio``.

        Returns
        -------
        out_prob
            Output of ``hparams.softmax`` applied to the classifier
            scores.
        score
            Maximum of ``out_prob`` over the last dimension.
        index
            Position of that maximum.
        text_lab
            The emotion name (``"neutral"``, ``"sadness"``, ``"joy"``,
            ``"anger"`` or ``"affection"``) when the first decoded label
            is ``"1"`` to ``"5"``; otherwise the decoded labels exactly
            as returned by the label encoder.
        """
        # Wrap the single waveform in a batch of one at full relative length.
        batch = self.load_audio(path).unsqueeze(0)
        rel_length = torch.tensor([1.0])

        embeddings = self.encode_batch(batch, rel_length)
        # squeeze(1) only removes dimension 1 when it has size one.
        logits = self.mods.label_lin(embeddings).squeeze(1)
        out_prob = self.hparams.softmax(logits)
        score, index = torch.max(out_prob, dim=-1)

        decoded = self.hparams.label_encoder.decode_torch(index)
        # Known codes map to their emotion name; anything else is passed
        # through untouched.
        text_lab = self._EMOTION_BY_CODE.get(decoded[0], decoded)

        return out_prob, score, index, text_lab

    def forward(self, wavs, wav_lens=None, normalize=False):
        """Delegate to ``encode_batch`` with the same arguments."""
        return self.encode_batch(
            wavs=wavs, wav_lens=wav_lens, normalize=normalize
        )
|
193 |
-
|
194 |
-
|
195 |
-
def return_prediction(mic=None, file=None):
    """Return the predicted emotion label for the supplied recording.

    Arguments
    ---------
    mic : str or None
        File path of the microphone recording, if one was made.
    file : str or None
        File path of the uploaded audio file, if one was provided.

    Returns
    -------
    str or list
        The ``text_lab`` element returned by ``classifier.classify_file``
        for the chosen recording, or an explanatory message when neither
        input was supplied. The microphone recording takes precedence
        when both are given.
    """
    audio_path = mic or file
    if not audio_path:
        return "You must either provide a mic recording or a file"

    # Use the classifier that is loaded once at module level instead of
    # reloading the model on every request, and classify the audio the
    # user actually supplied rather than a fixed sample file.
    _, _, _, text_lab = classifier.classify_file(audio_path)
    return text_lab
|
199 |
-
|
200 |
-
|
201 |
-
# Load the classifier once, when the module is imported.
classifier = foreign_class(
    source="Porjaz/wavlm-base-emo-fi",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# One audio input per source (microphone first, then upload); both hand
# the callback a file path.
_audio_inputs = [
    gr.Audio(sources=source, type="filepath")
    for source in ("microphone", "upload")
]

gradio_app = gr.Interface(
    fn=return_prediction,
    inputs=_audio_inputs,
    outputs="text",
    title="Finnish-Emotion-Recognition",
)

if __name__ == "__main__":
    gradio_app.launch(share=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|