jmparejaz committed
Commit b5a2ee4
1 Parent(s): ff0a951

Update app.py

Files changed (1)
  1. app.py +63 -1
app.py CHANGED
@@ -2,8 +2,16 @@ import os
 os.system("pip install git+https://github.com/openai/whisper.git")
 import gradio as gr
 import whisper
+from huggingface_hub import from_pretrained_keras
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
+import librosa
+import librosa.display
+from sklearn.preprocessing import StandardScaler
+import logging
+import numpy as np
+import pickle
+


 #call tokenizer and NLP model for text classification
@@ -14,6 +22,17 @@ model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitt
 # call whisper model for audio/speech processing
 model = whisper.load_model("small")

+# call model for audio emotions
+reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')
+
+# call scaler and decoder
+with open("scaler.pkl", "rb") as f:
+    scaler = pickle.load(f)
+
+with open("encoder.pkl", "rb") as f:
+    encoder = pickle.load(f)
+
+

 def inference_audio(audio):
     audio = whisper.load_audio(audio)
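The commit reads scaler.pkl and encoder.pkl from the repository root but does not show how they were produced. A minimal sketch of how such artifacts are typically created at training time, assuming a StandardScaler fitted on the training feature matrix and a OneHotEncoder fitted on the emotion labels (the training data and variable names below are placeholders, not taken from this repo):

```python
# Sketch only: producing scaler.pkl / encoder.pkl during training (assumed workflow).
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

X_train = np.random.rand(100, 162)  # placeholder feature matrix, one row per clip
y_train = np.array(["happy", "sad", "angry", "neutral"] * 25).reshape(-1, 1)  # placeholder labels

scaler = StandardScaler().fit(X_train)
encoder = OneHotEncoder(sparse_output=False).fit(y_train)  # older scikit-learn: sparse=False

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
with open("encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)
```

If encoder is indeed a OneHotEncoder, encoder.inverse_transform(prediction) in audio_emotions behaves like an argmax over the one-hot columns, which is why it can be applied directly to the model's output.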
 
@@ -36,6 +55,49 @@ def inference_text(audio):
 
     return res['label'],res['score']
 
+
+def extract_features(data, sample_rate):
+    # ZCR
+    result = np.array([])
+    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
+    result = np.hstack((result, zcr))  # stacking horizontally
+
+    # Chroma_stft
+    stft = np.abs(librosa.stft(data))
+    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, chroma_stft))  # stacking horizontally
+
+    # MFCC
+    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, mfcc))  # stacking horizontally
+
+    # Root Mean Square Value
+    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
+    result = np.hstack((result, rms))  # stacking horizontally
+
+    # MelSpectrogram
+    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
+    result = np.hstack((result, mel))  # stacking horizontally
+
+    return result
+
+def audio_emotions(audio):
+    data = audio.flatten()
+    sr = 22050
+    features_audio = extract_features(data, sr)
+    features_audio = np.array(features_audio).reshape(1, -1)
+    scaled_features = scaler.transform(features_audio)
+    scaled_features = np.expand_dims(scaled_features, axis=2)
+    prediction = reloaded_model.predict(scaled_features)
+    y_pred = encoder.inverse_transform(prediction)
+    return y_pred
+
+def main(audio):
+    r1, r2 = inference_text(audio)
+    r3 = audio_emotions(audio)
+    return r1, r2, r3
+
+
 audio = gr.Audio(
     label="Input Audio",
     show_label=False,
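With librosa defaults, extract_features concatenates 1 zero-crossing-rate value, 12 chroma bins, 20 MFCCs, 1 RMS value and 128 mel bands into a 162-dimensional vector per clip, which is presumably the width scaler and the Keras classifier were trained on. Note that inference_audio passes its argument to whisper.load_audio (a file path), while audio_emotions calls audio.flatten() (a NumPy waveform); which form gr.Audio delivers depends on its type setting, which is not visible in this diff. A minimal sketch of exercising the emotion branch on a local file, with sample.wav as an assumed input:

```python
# Sketch only: running the emotion branch on a local file outside Gradio.
# Relies on extract_features, scaler, reloaded_model and encoder defined in app.py;
# "sample.wav" is an assumed file, sr=22050 mirrors the hard-coded rate above.
import librosa
import numpy as np

data, sr = librosa.load("sample.wav", sr=22050)

features = extract_features(data, sr)    # ~(162,) with librosa defaults
features = features.reshape(1, -1)       # (1, 162): the scaler expects 2D input
scaled = scaler.transform(features)
scaled = np.expand_dims(scaled, axis=2)  # (1, 162, 1) for a Conv1D-style model
pred = reloaded_model.predict(scaled)
print(encoder.inverse_transform(pred))
```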
 
@@ -44,7 +106,7 @@ audio = gr.Audio(
 )
 
 
-app=gr.Interface(title="Sentiment Audio Analysis",fn=inference_text,inputs=[audio], outputs=["text","text"])
+app=gr.Interface(title="Sentiment Audio Analysis",fn=main,inputs=[audio], outputs=["text","text","text"])
 
 
 
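The interface now wraps main, so the three "text" outputs line up with the returned tuple: the sentiment label, its score, and the predicted emotion. The hunks shown here do not include a launch() call (one may already exist further down in app.py, unchanged); a minimal sketch of how the Interface is typically started on a Space:

```python
# Sketch only: starting the Gradio Interface defined above.
# Not part of this diff; a launch() call may already exist elsewhere in app.py.
if __name__ == "__main__":
    app.launch()
```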