SeyedAli committed on
Commit 76fe6b5
1 Parent(s): 1be8004

Update app.py

Files changed (1)
app.py +5 -10
app.py CHANGED
@@ -6,17 +6,12 @@ import torchaudio
 import gradio as gr
 from transformers import Wav2Vec2FeatureExtractor,AutoConfig
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from transformers.models.wav2vec2.modeling_wav2vec2 import (
-    Wav2Vec2PreTrainedModel,
-    Wav2Vec2Model
-)
-from transformers.models.hubert.modeling_hubert import (
-    HubertPreTrainedModel,
-    HubertModel
-)
+from models import Wav2Vec2ForSpeechClassification, HubertForSpeechClassification
 
 config = AutoConfig.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
-model = Wav2Vec2FeatureExtractor.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
+model = HubertForSpeechClassification.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
+sampling_rate = feature_extractor.sampling_rate
 
 audio_input = gr.Audio(label="صوت گفتار فارسی",type="filepath")
 text_output = gr.TextArea(label="هیجان موجود در صوت گفتار",text_align="right",rtl=True,type="text")
@@ -30,7 +25,7 @@ def SER(audio):
     speech_array, _sampling_rate = torchaudio.load(temp_audio_file.name)
     resampler = torchaudio.transforms.Resample(_sampling_rate)
     speech = resampler(speech_array).squeeze().numpy()
-    inputs = model(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
     inputs = {key: inputs[key].to(device) for key in inputs}
 
     with torch.no_grad():
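The new `models` import replaces the raw encoder classes with task-specific classification wrappers, but `models.py` itself is not part of this commit. Below is a hypothetical sketch of what such a wrapper usually looks like; only the class name `HubertForSpeechClassification` comes from the import, while the mean-pooling head, the use of `config.final_dropout`, and the raw-logits return value are assumptions based on the common community pattern for this model family.

# Hypothetical sketch of the Space's local models.py -- not shown in
# this commit. Assumes a HuBERT encoder followed by a mean-pooled
# linear classification head.
import torch.nn as nn
from transformers.models.hubert.modeling_hubert import (
    HubertPreTrainedModel,
    HubertModel,
)

class HubertForSpeechClassification(HubertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.hubert = HubertModel(config)
        self.dropout = nn.Dropout(config.final_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_values, attention_mask=None):
        outputs = self.hubert(input_values, attention_mask=attention_mask)
        hidden_states = outputs[0]            # (batch, time, hidden_size)
        pooled = hidden_states.mean(dim=1)    # average over the time axis
        return self.classifier(self.dropout(pooled))  # raw logits

Subclassing `HubertPreTrainedModel` is what lets app.py load the wrapper with `from_pretrained`: the base class supplies the checkpoint-loading machinery, so the classifier weights are restored as long as the checkpoint stores them under matching parameter names.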
 
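The second hunk fixes the call site inside `SER`: before this change the feature extractor had been assigned to the name `model`, so `model(speech, ...)` was running preprocessing rather than classification, and this commit also adds the `sampling_rate = feature_extractor.sampling_rate` definition that call relies on. Pulling the pieces together, here is a hedged sketch of the end-to-end inference path after the commit; the device setup, softmax, and label lookup live outside the hunks and are assumptions, not code from this diff.

# Hedged sketch of the inference path after this commit; everything
# outside the diff hunks (device, softmax, label lookup) is assumed.
import torch
import torch.nn.functional as F
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, AutoConfig
from models import HubertForSpeechClassification  # local module sketched above

model_id = "SeyedAli/Persian-Speech-Emotion-HuBert-V1"
device = "cuda" if torch.cuda.is_available() else "cpu"

config = AutoConfig.from_pretrained(model_id)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
model = HubertForSpeechClassification.from_pretrained(model_id).to(device).eval()
sampling_rate = feature_extractor.sampling_rate  # typically 16000

def predict_emotion(path):
    speech_array, src_rate = torchaudio.load(path)
    # Resample from the file's native rate to the extractor's rate.
    speech = torchaudio.transforms.Resample(src_rate, sampling_rate)(speech_array)
    speech = speech.squeeze().numpy()
    inputs = feature_extractor(speech, sampling_rate=sampling_rate,
                               return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}
    with torch.no_grad():
        logits = model(**inputs)  # the sketch above returns raw logits
    probs = F.softmax(logits, dim=-1)[0]
    # config.id2label maps class indices to emotion names.
    return config.id2label[int(probs.argmax())]

The counts in the header line up with the hunks: eight import lines, the mis-assigned loader line, and the old call site account for the 10 removals, while one import, three loading lines, and the corrected call account for the 5 additions.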