Spaces:

jmparejaz
/

Audio_to_text_classification

Running

App Files Files Community

jmparejaz committed on Feb 8, 2023

Commit

b5a2ee4

•

1 Parent(s): ff0a951

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -1

app.py CHANGED Viewed

@@ -2,8 +2,16 @@ import os
 os.system("pip install git+https://github.com/openai/whisper.git")
 import gradio as gr
 import whisper
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
 #call tokenizer and NLP model for text classification
@@ -14,6 +22,17 @@ model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitt
 # call whisper model for audio/speech processing
 model = whisper.load_model("small")
 def inference_audio(audio):
     audio = whisper.load_audio(audio)
@@ -36,6 +55,49 @@ def inference_text(audio):
     return res['label'],res['score']
 audio = gr.Audio(
                     label="Input Audio",
                     show_label=False,
@@ -44,7 +106,7 @@ audio = gr.Audio(
                 )
-app=gr.Interface(title="Sentiment Audio Analysis",fn=inference_text,inputs=[audio], outputs=["text","text"])

 os.system("pip install git+https://github.com/openai/whisper.git")
 import gradio as gr
 import whisper
+from huggingface_hub import from_pretrained_keras
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
+import librosa
+import librosa.display
+from sklearn.preprocessing import StandardScaler
+import logging
+import numpy
+import pickle
 #call tokenizer and NLP model for text classification
 # call whisper model for audio/speech processing
 model = whisper.load_model("small")
+# call model for audio emotions
+reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')
+# load the pickled scaler and encoder
+with open("scaler.pkl", "rb") as f:
+    scaler = pickle.load(f)
+with open("encoder.pkl", "rb") as f:
+    encoder = pickle.load(f)
 def inference_audio(audio):
     audio = whisper.load_audio(audio)
     return res['label'],res['score']
+def extract_features(data):
+    # ZCR
+    result = np.array([])
+    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
+    result=np.hstack((result, zcr)) # stacking horizontally
+    # Chroma_stft
+    stft = np.abs(librosa.stft(data))
+    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, chroma_stft)) # stacking horizontally
+    # MFCC
+    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, mfcc)) # stacking horizontally
+    # Root Mean Square Value
+    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
+    result = np.hstack((result, rms)) # stacking horizontally
+    # MelSpectogram
+    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, mel)) # stacking horizontally
+    return result
+def audio_emotions(audio):
+    data = audio.flatten()
+    sr=22050
+    features_audio = extract_features(data)
+    features_audio = np.array(features_audio)
+    scaled_features=scaler.transform(features_audio)
+    scaled_features = np.expand_dims(scaled_features, axis=2)
+    prediction=reloaded_model.predict(scaled_features)
+    y_pred = encoder.inverse_transform(prediction)
+    return y_pred
+def main(audio):
+    r1,r2=inference_text(audio)
+    r3=audio_emotions(audio)
+    return r1,r2,r3
 audio = gr.Audio(
                     label="Input Audio",
                     show_label=False,
                 )
+app=gr.Interface(title="Sentiment Audio Analysis",fn=main,inputs=[audio], outputs=["text","text","text"])