import gradio as gr
import torch
from speechbrain.inference.interfaces import Pretrained, foreign_class


class CustomEncoderWav2vec2Classifier(Pretrained):
    """A ready-to-use class for utterance-level classification (e.g., speaker-id,
    language-id, emotion recognition, keyword spotting, etc.).

    The class assumes that a self-supervised encoder like wav2vec2/hubert and a
    classifier model are defined in the yaml file. If you want to convert the
    predicted index into a corresponding text label, please provide the path of
    the label_encoder in a variable called 'lab_encoder_file' within the yaml.

    The class can be used either to run only the encoder (encode_batch()) to
    extract embeddings or to run a classification step (classify_batch()).

    Example
    -------
    >>> import torchaudio
    >>> from speechbrain.pretrained import EncoderClassifier
    >>> # Model is downloaded from the speechbrain HuggingFace repo
    >>> tmpdir = getfixture("tmpdir")
    >>> classifier = EncoderClassifier.from_hparams(
    ...     source="speechbrain/spkrec-ecapa-voxceleb",
    ...     savedir=tmpdir,
    ... )
    >>> # Compute embeddings
    >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav")
    >>> embeddings = classifier.encode_batch(signal)
    >>> # Classification
    >>> prediction = classifier.classify_batch(signal)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def encode_batch(self, wavs, wav_lens=None, normalize=False):
        """Encodes the input audio into a single vector embedding.

        The waveforms should already be in the model's desired format. You can
        call: ``normalized = .normalizer(signal, sample_rate)`` to get a
        correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model. Make sure the sample rate is fs=16000 Hz.
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the batch,
            tensor of shape [batch]. The longest one should have relative
            length 1.0 and others len(waveform) / max_length. Used for
            ignoring padding.
        normalize : bool
            If True, it normalizes the embeddings with the statistics
            contained in mean_var_norm_emb.

        Returns
        -------
        torch.Tensor
            The encoded batch.
        """
        # Manage single waveforms in input
        if len(wavs.shape) == 1:
            wavs = wavs.unsqueeze(0)

        # Assign full length if wav_lens is not assigned
        if wav_lens is None:
            wav_lens = torch.ones(wavs.shape[0], device=self.device)

        # Store the waveforms on the specified device
        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
        wavs = wavs.float()

        # Compute frame-level features with the self-supervised encoder
        outputs = self.mods.wav2vec2(wavs)

        # The last dim will be used for the adaptive average pooling
        outputs = self.mods.avg_pool(outputs, wav_lens)
        outputs = outputs.view(outputs.shape[0], -1)
        return outputs

    def classify_batch(self, wavs, wav_lens=None):
        """Performs classification on top of the encoded features.

        It returns the posterior probabilities, the index of the best class,
        and, if a label encoder is specified, the corresponding text label.

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model. Make sure the sample rate is fs=16000 Hz.
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the batch,
            tensor of shape [batch]. The longest one should have relative
            length 1.0 and others len(waveform) / max_length. Used for
            ignoring padding.

        Returns
        -------
        out_prob : torch.Tensor
            The posterior probabilities of each class ([batch, N_class]).
        score : torch.Tensor
            The posterior value of the best class ([batch,]).
        index : torch.Tensor
            The indexes of the best class ([batch,]).
        text_lab : list
            The text labels corresponding to the indexes
            (a label encoder must be provided).
        """
        outputs = self.encode_batch(wavs, wav_lens)
        outputs = self.mods.label_lin(outputs)
        out_prob = self.hparams.softmax(outputs)
        score, index = torch.max(out_prob, dim=-1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        return out_prob, score, index, text_lab

    def classify_file(self, path):
        """Classifies the given audio file into the given set of labels.

        Arguments
        ---------
        path : str
            Path to the audio file to classify.

        Returns
        -------
        out_prob : torch.Tensor
            The posterior probabilities of each class ([batch, N_class]).
        score : torch.Tensor
            The posterior value of the best class ([batch,]).
        index : torch.Tensor
            The indexes of the best class ([batch,]).
        text_lab : str
            The text label corresponding to the best index
            (a label encoder must be provided).
        """
        waveform = self.load_audio(path)
        # Fake a batch:
        batch = waveform.unsqueeze(0)
        rel_length = torch.tensor([1.0])
        outputs = self.encode_batch(batch, rel_length)
        outputs = self.mods.label_lin(outputs).squeeze(1)
        out_prob = self.hparams.softmax(outputs)
        score, index = torch.max(out_prob, dim=-1)
        text_lab = self.hparams.label_encoder.decode_torch(index)
        # Map the numeric class labels to human-readable emotion names
        if text_lab[0] == "1":
            text_lab = "neutral"
        elif text_lab[0] == "2":
            text_lab = "sadness"
        elif text_lab[0] == "3":
            text_lab = "joy"
        elif text_lab[0] == "4":
            text_lab = "anger"
        elif text_lab[0] == "5":
            text_lab = "affection"
        return out_prob, score, index, text_lab

    def forward(self, wavs, wav_lens=None, normalize=False):
        return self.encode_batch(
            wavs=wavs, wav_lens=wav_lens, normalize=normalize
        )


def return_prediction(mic, file):
    if mic is not None:
        out_prob, score, index, text_lab = classifier.classify_file(mic)
    elif file is not None:
        out_prob, score, index, text_lab = classifier.classify_file(file)
    else:
        return "You must either provide a mic recording or a file"

    # Mapping of class labels to positions in out_prob:
    # '1' (neutral) => 0, '3' (joy) => 1, '5' (affection) => 2,
    # '4' (anger) => 3, '2' (sadness) => 4
    score = score.item()
    score = str(round(100 * score, 2)) + "%"
    neu = round(100 * out_prob[0, 0].item(), 2)
    joy = round(100 * out_prob[0, 1].item(), 2)
    aff = round(100 * out_prob[0, 2].item(), 2)
    ang = round(100 * out_prob[0, 3].item(), 2)
    sad = round(100 * out_prob[0, 4].item(), 2)

    result_dict = {
        "Neutral: ": neu,
        "Joy: ": joy,
        "Affection: ": aff,
        "Anger: ": ang,
        "Sadness: ": sad,
    }
    # Order the dict in descending order by probability
    result_dict = dict(
        sorted(result_dict.items(), key=lambda item: item[1], reverse=True)
    )
    result_string = "\n".join(
        f"{label}\t{value}%" for label, value in result_dict.items()
    )
    # return text_lab, score
    return result_string


classifier = foreign_class(
    source="Porjaz/wavlm-base-emo-fi",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

gradio_app = gr.Interface(
    return_prediction,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Audio(sources="upload", type="filepath"),
    ],
    outputs="text",
    title="Finnish-Emotion-Recognition",
)

if __name__ == "__main__":
    gradio_app.launch(share=True, ssl_verify=False)
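
# A minimal sketch of calling the classifier directly, bypassing the Gradio UI.
# The path "sample.wav" is a hypothetical local 16 kHz recording, not a file
# shipped with this repo:
#
#     out_prob, score, index, text_lab = classifier.classify_file("sample.wav")
#     print(text_lab, round(100 * score.item(), 2), "%")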