'''
Outline:
- Create animation: animate charts (potentially using streamlit)
'''
import librosa
import librosa.display #Explicit import so waveshow/specshow also work on older librosa versions
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import keras
import tensorflow
import matplotlib.animation as animation
model_path = "model_simple.sav" #Defines the path to the model file
emotion_map = {
    'Disgust': 0,
    'Happiness': 1,
    'Sadness': 2,
    'Neutral': 3,
    'Fear': 4,
    'Anger': 5,
    'Surprise': 6
} #Maps emotions to integers: taken from data preprocessing
reversed_emotion_map = {value: key for key, value in emotion_map.items()}
#Reverses the emotion mapping so that predicted integer indices can be turned back into emotion labels
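#For example, reversed_emotion_map[1] is 'Happiness' and reversed_emotion_map[6] is 'Surprise';
#this lookup is what turns the model's argmax index back into a label in get_emotion_prediction below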
#Uses librosa to load the uploaded audio file as a waveform (amplitude samples) plus its sample rate
@st.cache_data
def process_audio(input_file):
    st.audio(input_file) #Creates an audio player within the streamlit app
    audio_signal, sample_rate = librosa.load(input_file)
    return audio_signal, sample_rate
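#Note: librosa.load resamples to mono at 22,050 Hz by default, which is what the
#66,150-sample (3 second) standardization further below assumes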
#Creates an animated line chart of the frequency spectrum, one STFT frame per animation frame
def display_spectrum_animation(audio_signal, sample_rate):
    S = np.abs(librosa.stft(audio_signal))
    frequencies = librosa.fft_frequencies(sr=sample_rate)
    fig, ax = plt.subplots()
    def update_spectrum(num, S, ax):
        ax.clear()
        ax.plot(frequencies, S[:, num])
        ax.set_xlabel("Frequency (Hz)")
        ax.set_ylabel("Amplitude")
    ani = animation.FuncAnimation(fig, update_spectrum, frames=S.shape[1], fargs=[S, ax], blit=False)
    ani.save("spectrum_animation.gif", writer="imagemagick")
    st.image("spectrum_animation.gif")
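#Each animation frame above plots one STFT column; with librosa's default n_fft of 2048 that is
#1,025 frequency bins per frame, and columns are spaced hop_length = 512 samples apart
#(roughly 23 ms at the default 22,050 Hz sample rate)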
#Displays a static waveform plot of the audio signal (not currently wired into create_selections)
@st.cache_data
def display_frequency(audio_signal, sample_rate):
    librosa.display.waveshow(audio_signal, sr=sample_rate)
    st.pyplot(plt.gcf())
#Creates and animates a dB-scaled spectrogram with a log frequency axis using librosa
@st.cache_data
def display_mel_spectrogram(audio_signal, sample_rate):
    fig, ax = plt.subplots()
    audio_time = audio_signal.shape[0] / sample_rate #Clip duration in seconds
    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_signal)), ref=np.max)
    amt_to_add = int(D.shape[-1] / audio_time) #Approximate number of STFT columns per second of audio
    librosa.display.specshow(D, sr=sample_rate, x_axis="time", y_axis="log", ax=ax) #Initial full frame
    def update_spectrogram(num, D, ax, plus):
        ax.clear()
        librosa.display.specshow(D[:, :num + plus], sr=sample_rate, x_axis="time", y_axis="log", ax=ax)
    ani = animation.FuncAnimation(fig, update_spectrogram, frames=np.arange(1, D.shape[1]), fargs=[D, ax, amt_to_add], blit=False)
    ani.save("spectrogram_animation.gif", writer="imagemagick")
    st.image("spectrogram_animation.gif")
#Creates the interface allowing users to select which plot they want displayed
def create_selections(audio_signal, sample_rate):
    chart_options = ["Spectrum", "Mel-Spectrogram"] #Graph titles go here
    functions = [display_spectrum_animation, display_mel_spectrogram] #Graphing functions go here
    chart_selector = st.radio(
        label="",
        options=chart_options,
        horizontal=True
    )
    selection_index = chart_options.index(chart_selector)
    functions[selection_index](audio_signal, sample_rate)
#Helper function that forces a given waveform array to a fixed length
#Currently, this length is hard-coded at 66,150 samples, though that may change in the future
@st.cache_data
def standardize_waveform_length(waveform):
    audio_length = 66150
    if len(waveform) > audio_length:
        waveform = waveform[:audio_length] #Truncate waveforms that are too long
    else:
        waveform = np.pad(waveform, (0, max(0, audio_length - len(waveform))), "constant") #Zero-pad waveforms that are too short
    return waveform
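#Illustrative sketch (not called by the app): 66,150 samples is 3 seconds at librosa's default
#22,050 Hz rate, so a short clip is zero-padded up to that length and a long clip is truncated, e.g.
#    standardize_waveform_length(np.zeros(10)).shape      -> (66150,)
#    standardize_waveform_length(np.zeros(100_000)).shape -> (66150,)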
#Takes in a given audio signal and returns its mel-frequency cepstral coefficients
@st.cache_data
def preprocess_audio_for_prediction(audio_signal, sample_rate):
    waveform = standardize_waveform_length(waveform=audio_signal)
    mfcc = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mels=128)
    mfcc = mfcc.reshape(-1)
    return mfcc
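#Shape check (assuming librosa's defaults of n_mfcc = 20 and hop_length = 512): a 66,150-sample
#waveform yields 1 + 66150 // 512 = 130 frames, so the flattened MFCC vector has length
#20 * 130 = 2,600, which must match the model's expected input size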
#Loads the model given in model_path and returns a Keras Sequential model
@st.cache_data
def load_model(model_path):
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)
    return model
#Uses the model to predict the speaker's emotion in the given audio clip
@st.cache_data
def get_emotion_prediction(mfcc):
    model = load_model(model_path)
    prediction = model.predict(mfcc[None])
    predicted_index = np.argmax(prediction)
    emotion = reversed_emotion_map[predicted_index]
    return emotion
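#mfcc[None] adds a leading batch dimension (shape (1, 2600) under the defaults noted above) so the
#model sees a batch of one sample; np.argmax then picks the highest-scoring class index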
#Combines all model functions and displays the model output as a subheader
@st.cache_data
def display_prediction(audio_signal, sample_rate):
    mfcc = preprocess_audio_for_prediction(audio_signal, sample_rate)
    prediction = get_emotion_prediction(mfcc)
    st.subheader("Predicted Emotion: " + prediction, divider=True)
#Defines the entire process of inputting audio, displaying the model's predictions, and displaying graphs
def run(input_file):
    audio_signal, sample_rate = process_audio(input_file)
    display_prediction(audio_signal, sample_rate)
    create_selections(audio_signal, sample_rate)
#Creates an input area to upload the file
def main():
    st.header("Upload your file here")
    file_uploader = st.file_uploader("", type="wav")
    if file_uploader is not None:
        run(file_uploader)

if __name__ == "__main__":
    main()
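#To run locally (assuming the imported packages are installed and model_simple.sav sits next to
#this script):
#    streamlit run app.py
#Saving the GIF animations uses matplotlib's "imagemagick" writer, so ImageMagick must be available
#on the host; the "pillow" writer is a possible drop-in alternative if it is not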