SeaBenSea committed (verified)
Commit 6fa69eb · 1 Parent(s): e844ce1

Create app.py

Files changed (1):
  app.py +50 -0
app.py ADDED
@@ -0,0 +1,50 @@
+ import sys
+ sys.path.insert(1, './HuBERT-SER/')
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ from transformers import AutoConfig, Wav2Vec2FeatureExtractor
+ from src.models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification
+ import gradio as gr
+
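+ # Pick a device, then pull the model config (which carries the id2label emotion
+ # mapping), the matching feature extractor, and the fine-tuned HuBERT classifier
+ # from the Hugging Face Hub.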
+ model_name_or_path = "SeaBenSea/hubert-large-turkish-speech-emotion-recognition"
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ config = AutoConfig.from_pretrained(model_name_or_path)
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
+ sampling_rate = feature_extractor.sampling_rate
+
+ model = HubertForSpeechClassification.from_pretrained(model_name_or_path).to(device)
+
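+ # Load an audio file with torchaudio and resample it to the rate the
+ # feature extractor expects.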
+ def speech_file_to_array_fn(path, sampling_rate):
+     speech_array, _sampling_rate = torchaudio.load(path)
+     resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+
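+ # Preprocess one utterance, run the classifier, and convert the logits to
+ # percentage scores for each emotion label.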
+ def predict(path, sampling_rate):
+     speech = speech_file_to_array_fn(path, sampling_rate)
+     inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+     inputs = {key: inputs[key].to(device) for key in inputs}
+
+     with torch.no_grad():
+         logits = model(**inputs).logits
+
+     scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+     outputs = [{"Emotion": config.id2label[i], "Score": f"{score * 100:.1f}%"}
+                for i, score in enumerate(scores)]
+     return outputs
+
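+ # Gradio callback: receives the uploaded file's path from the Audio component.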
+ def classify_audio(audio):
+     return predict(audio, sampling_rate)
+
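+ # Build the web UI: an upload-only audio input, emotion scores returned as JSON.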
+ iface = gr.Interface(
+     fn=classify_audio,
+     inputs=gr.Audio(sources="upload", type="filepath"),
+     outputs=gr.JSON(),
+     title="Speech Emotion Classification",
+     description="Upload an audio file to classify the emotion expressed in the speech."
+ )
+
+ iface.launch()