Spaces:

Hetan07
/

multi_label_music_genre_classifier

Sleeping

App Files Files Community

multi_label_music_genre_classifier / app.py

Hetan07

Upload 4 files

d76d950 verified 5 months ago

raw history blame

No virus

11.9 kB

	import joblib
	import pandas as pd
	import streamlit as st
	import tensorflow
	from keras.losses import binary_crossentropy
	from keras.optimizers import Adam

	import audio_splitting
	# Local Imports
	import feature_extraction

	st.set_page_config(layout="wide")


	def display(model_name,col2):
	    """Run the selected model on the uploaded sample and render the result in ``col2``.

	    The function reads the following module-level names, which the script body
	    assigns before calling it: ``model`` (the loaded estimator),
	    ``reshaped_features`` (the feature row passed to ``model.predict``),
	    ``multi_class_names`` and ``class_indices``.

	    Args:
	        model_name: Entry chosen in the model select box; decides how the
	            output of ``model.predict`` is interpreted.
	        col2: Streamlit column/container in which the prediction is rendered.
	    """
	    xgb_multi_class_names = ["Rock", "Rap & Hip-Hop", "Soul", "Classical", "Dance & Electronic", "Blues", "Jazz",
	                             "Country", "Bebop", "Folk", "Reggae", "R&B", "Punk", "Metal", "Pop"]
	    # Two differently ordered lists exist because the order of genres with which
	    # XGB was trained differs from the order used for the other models.
	    xmulti_class_names = ["Metal", "Blues", "Reggae", "Jazz", "Rock", "Folk", "Classical", "Dance & Electronic",
	                          "Punk", "Bebop", "Pop", "R&B", "Country", "Rap & Hip-Hop", "Soul"]

	    # A genre is reported when its predicted probability reaches this value.
	    threshold = 0.3

	    crnn_name = "Convolutional Recurrent Neural Network - (Multi Label)"
	    # Models whose predict() output is thresholded per genre. The select box
	    # offers the batch-norm model under its long name; the short name is kept
	    # so callers using the older spelling still work.
	    probability_models = (
	        crnn_name,
	        "Neural Network - (Multi Label)",
	        "Convolutional Neural Network - (Multi Label)",
	        "Neural Network with Batch Normalization - (Multi Label)",
	        "Batch Normalization - (Multi Label)",
	    )

	    if model_name == "XGB - (Multi Label)":
	        # One 0/1 indicator per genre, in the XGB training order.
	        indicators = model.predict(reshaped_features)[0]
	        predicted_labels = [genre for genre, flag in zip(xgb_multi_class_names, indicators) if flag == 1.0]
	        with col2:
	            if predicted_labels:
	                st.metric("Predicted Genres: ", ', '.join(predicted_labels), label_visibility='collapsed')
	            else:
	                st.caption("No genres predicted for this input.")

	    elif model_name == "XGB Classifier - (Single Label)":
	        # The classifier returns class indices; map them to genre names.
	        predicted_indices = model.predict(reshaped_features)
	        predicted_labels = [class_indices[int(i)] for i in predicted_indices]
	        with col2:
	            st.metric("Predicted Genre:", str(predicted_labels[0]), label_visibility="collapsed")

	    elif model_name in probability_models:
	        scores = model.predict(reshaped_features)[0]
	        # Only the CRNN uses the alphabetical genre order.
	        genre_order = multi_class_names if model_name == crnn_name else xmulti_class_names
	        predicted_labels = [genre for genre, score in zip(genre_order, scores) if score >= threshold]
	        probabilities = [(genre, score * 100) for genre, score in zip(genre_order, scores)]

	        with col2:
	            if predicted_labels:
	                st.metric("Predicted Genres:", ', '.join(predicted_labels))
	                st.caption("Below is a list of the probability of the sample being classified into each genre. Any "
	                           f"probability value below the threshold value (={threshold}) is interpreted as the "
	                           "sample not being of that genre ")
	                df = pd.DataFrame(probabilities, columns=["Genre", "Probabilities"])
	                st.dataframe(df, hide_index=True, use_container_width=True)
	            else:
	                st.write("No genre predicted above the threshold.")

	    else:
	        # Remaining single-label models: the first prediction is shown as-is
	        # (presumably the genre name itself - confirm against the pickled models).
	        predicted_label = model.predict(reshaped_features)[0]
	        with col2:
	            st.metric("Predicted Genres:", str(predicted_label).capitalize(), label_visibility="collapsed")


	# Vars
	# Display names for the rows of the feature table, in the order the values are listed.
	fields_df = [
	    'Chromagram Short-Time Fourier Transform (Chroma-STFT)',
	    'Root Mean Square Energy (RMS)',
	    'Spectral Centroid',
	    'Spectral Bandwidth',
	    'Spectral Rolloff',
	    'Zero Crossing Rate',
	    'Harmony',
	    'Percussion',
	    'Tempo',
	    'Mel-Frequency Cepstral Coefficients (MFCC-1)',
	] + ['MFCC-%d' % coefficient for coefficient in range(2, 21)]  # MFCC-2 .. MFCC-20

	# Links referenced from the page text.
	url_single_label = "https://huggingface.co/spaces/Hetan07/Single_Label_Music_Genre_Classifier"
	url_github = "https://github.com/Hetan07/Multi-Label-Music-Genre-Classifier"
	url_docs = "https://librosa.org/doc/latest/index.html"

	st.title("Multi-Label Music Genre Classifier")
	# Adjacent literals are concatenated verbatim, so every fragment that is followed
	# by another sentence must end with a space.
	st.write(f"A multi-label music genre classifier based on the extension of my previous [project]({url_single_label}). "
	         f"The source files have been provided both on HuggingFace and on [Github]({url_github}). "
	         "Dataset had to be created specifically, as none was available with the features and multi-labels tags for "
	         "each audio. "
	         "All the models have been trained on the created dataset.")

	st.divider()
	st.subheader('On Dataset Creation')

	with st.expander("See explanation"):
	    # One tuple entry per rendered markdown line; the empty entry yields the
	    # blank line between the bullet list and the closing paragraph.
	    s = '\n'.join((
	        'The work done for creating the dataset were',
	        '- Downloading the appropriate songs taken randomly from the MuMu dataset in sampled manner from ~80 genres '
	        '(tags)',
	        '- Data Cleaning which included to clean and replace the download songs as many of them were things such as '
	        'album intros, interludes or skits',
	        '- There were also issues where the song required was not available on any platform and so had to '
	        'appropriately replaced for another proper track or I had to manually search and download',
	        '- Each file had to properly checked to prevent any distortion or disturbances',
	        '- Applying feature extraction on each downloaded song using the librosa library',
	        '- Reducing the labels from ~80 to around ~15',
	        '',
	        'In the end I decided to have feature extraction work on 3 second samples and thus have around ~24000 '
	        'samples. I have linked the actual dataset created from all the steps if anyone wishes to work upon it '
	        'further',
	    )) + '\n'

	    st.markdown(s)
	st.divider()

	st.subheader("Prediction of following genres")

	# Genres of the multi-label models, in alphabetical order (also read by display()).
	multi_class_names = ["Bebop", "Blues", "Classical", "Country", "Dance & Electronic", "Folk", "Jazz", "Metal",
	                     "Pop", "Punk", "R&B", "Rap & Hip-Hop", "Reggae", "Rock", "Soul"]

	# Genres of the single-label models.
	class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
	               "Jazz", "Metal", "Pop", "Reggae", "Rock"]

	# Maps a predicted class index to its single-label genre name (read by display()).
	class_indices = dict(enumerate(class_names))

	col1, col2 = st.columns(2)
	# Both slices share one split index so that no genre can fall between the two
	# columns (the previous [:7] / [8:] pair silently dropped "Metal").
	split_at = (len(multi_class_names) + 1) // 2
	with col1:
	    st.markdown("".join("- " + genre + "\n" for genre in multi_class_names[:split_at]))

	with col2:
	    st.markdown("".join("- " + genre + "\n" for genre in multi_class_names[split_at:]))

	st.divider()
	# Upload music file
	st.subheader("Upload a music file")
	uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav", "ogg"], label_visibility="collapsed")

	st.divider()
	if uploaded_file is not None:
	    # Models offered to the user, in the order shown in the select box.
	    all_models = ["K-Nearest Neighbors - (Single Label)",
	                  "Logistic Regression - (Single Label)",
	                  "Support Vector Machines - (Single Label)",
	                  "Neural Network - (Single Label)",
	                  "XGB Classifier - (Single Label)",
	                  "Convolutional Recurrent Neural Network - (Multi Label)",
	                  "Convolutional Neural Network - (Multi Label)",
	                  "XGB - (Multi Label)",
	                  "Neural Network - (Multi Label)",
	                  "Neural Network with Batch Normalization - (Multi Label)"]

	    # Model name -> file, split by how the file has to be loaded.
	    joblib_model_paths = {
	        "K-Nearest Neighbors - (Single Label)": "./models/knn.pkl",
	        "Logistic Regression - (Single Label)": "./models/logistic.pkl",
	        "Support Vector Machines - (Single Label)": "./models/svm.pkl",
	        "Neural Network - (Single Label)": "./models/nn.pkl",
	        "XGB Classifier - (Single Label)": "./models/xgb.pkl",
	        "XGB - (Multi Label)": "./models/xgb_mlb.pkl",
	    }
	    keras_model_paths = {
	        "Convolutional Recurrent Neural Network - (Multi Label)": "./models/model_crnn1.h5",
	        "Neural Network - (Multi Label)": "./models/model_nn.h5",
	        "Neural Network with Batch Normalization - (Multi Label)": "./models/model_bn.h5",
	        "Convolutional Neural Network - (Multi Label)": "./models/model_cnn.h5",
	    }

	    # val_list is returned by the splitter but not used on this page.
	    features_list, val_list = audio_splitting.split_audio(uploaded_file)
	    features = feature_extraction.scale(features_list)

	    # Work on a copy so the placeholder inserted for the table does not leak
	    # into the extracted feature list itself.
	    feature_copy = list(features_list)
	    # Placeholder at index 19 keeps the mean/variance pairing of the slices
	    # below aligned (the caption further down notes tempo has no variance).
	    feature_copy.insert(19, "-")
	    st.header("Feature Extraction")

	    st.write("The given audio sample is processed using the librosa library to get the features extracted used by the "
	             "models for genre prediction. Following is the dataframe with each of the feature extracted and "
	             "corresponding mean and variance of the feature. The docs of [librosa](%s) library can be referred for a more "
	             "indepth explanation and implementation specifics." % url_docs)

	    col3, col4 = st.columns([0.55, 0.45])
	    # Features Dataframe: from index 2 onwards the values alternate mean, variance.
	    df = pd.DataFrame({
	        "name": fields_df,
	        "Mean": feature_copy[2::2],
	        "Variance": feature_copy[3::2]
	    })

	    st.dataframe(
	        df,
	        column_config={
	            "name": "Features",
	            "Mean": "Mean of Feature",
	            "Variance": "Variance of Feature"
	        },
	        use_container_width=True
	    )
	    st.caption("Note: Harmonic and Percussion values generally have mean in the power of -1e5 or -1e6 and thus "
	               "are represented as 0.\nAlso, for the feature 'tempo' variance has not been added to keep up with the "
	               "consistency as presented in the original GTZAN dataset")

	    st.divider()

	    col1, col2 = st.columns([0.45, 0.55])

	    col1.subheader("Select a model")
	    with col1:
	        model_name = st.selectbox("Select a model", all_models, label_visibility="collapsed")

	    # `model` is a module-level name on purpose: display() reads it.
	    if model_name in joblib_model_paths:
	        model = joblib.load(joblib_model_paths[model_name])
	    elif model_name in keras_model_paths:
	        # Loaded uncompiled, then compiled explicitly with the settings below.
	        model = tensorflow.keras.models.load_model(keras_model_paths[model_name], compile=False)
	        model.compile(loss=binary_crossentropy,
	                      optimizer=Adam(),
	                      metrics=['accuracy'])
	    col2.subheader("Predicted genre")

	    # Reshape the features to match the expected shape for prediction
	    # (a single row); display() reads `reshaped_features` as a module-level name.
	    reshaped_features = features.reshape(1, -1)
	    display(model_name,col2)