File size: 11,869 Bytes
d76d950 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
import joblib
import pandas as pd
import streamlit as st
import tensorflow
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
import audio_splitting
# Local Imports
import feature_extraction
def display(model_name, col2):
    """Run the selected model on the current sample and render the result in ``col2``.

    The function reads module-level state that the page script sets up before
    calling it: ``model`` (the loaded estimator), ``reshaped_features`` (the
    input passed to ``model.predict``), ``class_indices`` (index -> genre for
    the single-label XGB model) and ``multi_class_names`` (genre order used by
    the CRNN model).

    Args:
        model_name: Entry chosen in the model select box; it decides how the
            raw ``model.predict`` output is decoded.
        col2: Streamlit column, used as a context manager, in which the
            prediction is drawn.

    Returns:
        None. All output is written to the Streamlit page.
    """
    # The multi-label XGB model and the Keras models were trained with
    # different genre orderings, hence two separate name lists here (the CRNN
    # uses a third ordering, the module-level ``multi_class_names``).
    xgb_multi_class_names = ["Rock", "Rap & Hip-Hop", "Soul", "Classical", "Dance & Electronic", "Blues", "Jazz",
                             "Country", "Bebop", "Folk", "Reggae", "R&B", "Punk", "Metal", "Pop"]
    xmulti_class_names = ["Metal", "Blues", "Reggae", "Jazz", "Rock", "Folk", "Classical", "Dance & Electronic",
                          "Punk", "Bebop", "Pop", "R&B", "Country", "Rap & Hip-Hop", "Soul"]

    crnn_name = "Convolutional Recurrent Neural Network - (Multi Label)"
    # Models whose predict() output is a row of per-genre probabilities.
    probability_models = (
        crnn_name,
        "Neural Network - (Multi Label)",
        "Convolutional Neural Network - (Multi Label)",
        # Spelling used by the model select box and the loader.
        "Neural Network with Batch Normalization - (Multi Label)",
        # Older spelling, still accepted so existing callers keep working.
        "Batch Normalization - (Multi Label)",
    )

    if model_name == "XGB - (Multi Label)":
        # One 0/1 flag per genre; keep the genres whose flag is set.
        flags = model.predict(reshaped_features)[0]
        predicted_labels = [name for name, flag in zip(xgb_multi_class_names, flags) if flag == 1.0]
        with col2:
            if predicted_labels:
                st.metric("Predicted Genres: ", ", ".join(predicted_labels), label_visibility="collapsed")
            else:
                st.caption("No genres predicted for this input.")

    elif model_name == "XGB Classifier - (Single Label)":
        # predict() yields class indices; translate them through class_indices.
        predicted_indices = model.predict(reshaped_features)
        predicted_labels = [class_indices[i] for i in predicted_indices]
        with col2:
            st.metric("Predicted Genre:", str(predicted_labels[0]), label_visibility="collapsed")

    elif model_name in probability_models:
        scores = model.predict(reshaped_features)[0]
        threshold = 0.3
        genre_names = multi_class_names if model_name == crnn_name else xmulti_class_names
        predicted_labels = [name for name, score in zip(genre_names, scores) if score >= threshold]
        # Percentages for every genre, including those below the threshold.
        probabilities = [(name, score * 100) for name, score in zip(genre_names, scores)]
        with col2:
            if predicted_labels:
                st.metric("Predicted Genres:", ", ".join(predicted_labels))
                # The caption quotes the threshold actually applied above so
                # the two cannot drift apart.
                st.caption("Below is a list of the probability of the sample being classified into each genre. Any "
                           f"probability value below the threshold value (={threshold}) is interpreted as the "
                           "sample not being of that genre ")
                df = pd.DataFrame(probabilities, columns=["Genre", "Probabilities"])
                # Render the per-genre table that the caption refers to.
                st.dataframe(df)
            else:
                st.write("No genre predicted above the threshold.")

    else:
        # Remaining single-label models: predict() returns the label itself.
        predicted_label = model.predict(reshaped_features)[0]
        with col2:
            st.metric("Predicted Genres:", str(predicted_label).capitalize(), label_visibility="collapsed")
# Vars
# Row labels for the "name" column of the extracted-features table that is
# built after a file has been uploaded.
fields_df = ['Chromagram Short-Time Fourier Transform (Chroma-STFT)',
'Root Mean Square Energy (RMS)',
'Spectral Centroid',
'Spectral Bandwidth',
'Spectral Rolloff',
'Zero Crossing Rate',
'Mel-Frequency Cepstral Coefficients (MFCC-1)',
'MFCC-20', ]
# Link targets interpolated into the page text: url_single_label and url_github
# go into the introduction, url_docs into the librosa reference. All three are
# empty strings here, so the rendered links have no destination.
url_single_label = ""
url_github = ""
url_docs = ""
# Page title and introduction. The two "%s" placeholders receive the link
# targets for the earlier single-label project and the GitHub repository.
st.title("Multi-Label Music Genre Classifier")
st.write("A multi-label music genre classifier based on the extension of my previous [project](%s). "
         "The source files have been provided both on HuggingFace and on [Github](%s). "
         "Dataset had to be created specifically, as none was available with the features and multi-labels tags for "
         # Trailing space keeps this sentence from running into the next one.
         "each audio. "
         "All the models have been trained on the created dataset." % (url_single_label, url_github))
st.subheader('On Dataset Creation')
with st.expander("See explanation"):
    # Markdown text (intro line, bullet list, closing paragraph) describing
    # how the dataset was assembled.
    s = (
        'The work done for creating the dataset were\n'
        '- Downloading the appropriate songs taken randomly from the MuMu dataset in sampled manner from ~80 genres ('
        'tags)\n'
        '- Data Cleaning which included to clean and replace the download songs as many of them were things such as album '
        'intros, interludes or skits\n'
        '- There were also issues where the song required was not available on any platform and so had to appropriately '
        'replaced for another proper track or I had to manually search and download\n'
        '- Each file had to properly checked to prevent any distortion or disturbances\n'
        '- Applying feature extraction on each downloaded song using the librosa library\n'
        '- Reducing the labels from ~80 to around ~15\n'
        '\nIn the end I decided to have feature extraction work on 3 second samples and thus have around ~24000 samples. '
        'I have linked the actual dataset created from all the steps if anyone wishes to work upon it further\n'
    )
    # Render the text inside the expander; without this call it stays empty.
    st.markdown(s)
st.subheader("Prediction of following genres")
# Genre order used when decoding the CRNN model's output.
multi_class_names = ["Bebop", "Blues", "Classical", "Country", "Dance & Electronic", "Folk", "Jazz", "Metal",
                     "Pop", "Punk", "R&B", "Rap & Hip-Hop", "Reggae", "Rock", "Soul"]
# Genres of the single-label models, and the index -> genre lookup used to
# decode integer class predictions.
class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
               "Jazz", "Metal", "Pop", "Reggae", "Rock"]
class_indices = dict(enumerate(class_names))

# Show the multi-label genres as two bullet lists side by side. Both columns
# are cut at the same index so that no genre falls between the two slices.
col1, col2 = st.columns(2)
split_at = (len(multi_class_names) + 1) // 2
with col1:
    st.markdown("\n".join("- " + genre for genre in multi_class_names[:split_at]))
with col2:
    st.markdown("\n".join("- " + genre for genre in multi_class_names[split_at:]))
# Upload music file
st.subheader("Upload a music file")
uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav", "ogg"], label_visibility="collapsed")
# The rest of the page is only built once a file has been provided.
if uploaded_file is not None:
# User selects a model
# Labels offered in the model select box; the same strings are compared
# against further down to decide which model file to load.
all_models = ["K-Nearest Neighbors - (Single Label)",
"Logistic Regression - (Single Label)",
"Support Vector Machines - (Single Label)",
"Neural Network - (Single Label)",
"XGB Classifier - (Single Label)",
"Convolutional Recurrent Neural Network - (Multi Label)",
"Convolutional Neural Network - (Multi Label)",
"XGB - (Multi Label)",
"Neural Network - (Multi Label)",
"Neural Network with Batch Normalization - (Multi Label)"]
# Feature extraction is delegated to the project-local helpers.
# NOTE(review): their return types are not visible here; `features` must
# support .reshape (used at the end of this block) — confirm.
features_list, val_list = audio_splitting.split_audio(uploaded_file)
features = feature_extraction.scale(features_list)
# NOTE(review): this binds a second name to the same object rather than copying
# it, so the insert() below also changes features_list.
feature_copy = features_list
# Inserts a "-" placeholder at position 19; presumably this stands in for the
# tempo variance that the caption below says is not provided — confirm.
feature_copy.insert(19, "-")
st.header("Feature Extraction")
st.write("The given audio sample is processed using the librosa library to get the features extracted used by the "
"models for genre prediction. Following is the dataframe with each of the feature extracted and "
"corresponding mean and variance of the feature. The docs of [*librosa*](%s) library can be referred for a more "
"indepth explanation and implementation specifics." % url_docs)
col3, col4 = st.columns([0.55, 0.45])
# Features Dataframe
# Means are taken from every second entry starting at index 2, variances from
# every second entry starting at index 3.
# NOTE(review): as shown here the dict literal is never closed and the keys
# "name"/"Mean"/"Variance" appear twice; it looks like the end of the DataFrame
# call and a display call carrying the column labels are missing between the
# two groups — confirm against the complete file.
df = pd.DataFrame({
"name": fields_df,
"Mean": feature_copy[2::2],
"Variance": feature_copy[3::2]
"name": "Features",
"Mean": "Mean of Feature",
"Variance": "Variance of Feature"
st.caption("Note: Harmonic and Percussion values generally have mean in the power of -1e5 or -1e6 and thus "
"are represented as 0.\nAlso, for the feature 'tempo' variance has not been added to keep up with the "
"consistency as presented in the original GTZAN dataset")
# Second row of the page: model picker in the first column, prediction in the
# second. This rebinds col1/col2 from the genre list earlier on the page.
col1, col2 = st.columns([0.45, 0.55])
col1.subheader("Select a model")
with col1:
model_name = st.selectbox("Select a model", all_models, label_visibility="collapsed")
# Load the file that belongs to the chosen label. The .pkl files are read with
# joblib and the .h5 files with Keras (compile=False).
# NOTE(review): joblib.load unpickles its input, so these files must come from
# a trusted source. `model` stays unbound if no branch matches.
if model_name == "K-Nearest Neighbors - (Single Label)":
model = joblib.load("./models/knn.pkl")
elif model_name == "Logistic Regression - (Single Label)":
model = joblib.load("./models/logistic.pkl")
elif model_name == "Support Vector Machines - (Single Label)":
model = joblib.load("./models/svm.pkl")
elif model_name == "Neural Network - (Single Label)":
model = joblib.load("./models/nn.pkl")
elif model_name == "XGB Classifier - (Single Label)":
model = joblib.load("./models/xgb.pkl")
elif model_name == "XGB - (Multi Label)":
model = joblib.load("./models/xgb_mlb.pkl")
elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
model = tensorflow.keras.models.load_model("./models/model_crnn1.h5", compile=False)
elif model_name == "Neural Network - (Multi Label)":
model = tensorflow.keras.models.load_model("./models/model_nn.h5", compile=False)
elif model_name == "Neural Network with Batch Normalization - (Multi Label)":
model = tensorflow.keras.models.load_model("./models/model_bn.h5", compile=False)
elif model_name == "Convolutional Neural Network - (Multi Label)":
model = tensorflow.keras.models.load_model("./models/model_cnn.h5",compile=False)
col2.subheader("Predicted genre")
# Reshape the features to match the expected shape for prediction
# (a single row; the second dimension is inferred by reshape).
reshaped_features = features.reshape(1, -1)