Hetan07 committed on
Commit
d76d950
1 Parent(s): b27f628

Upload 4 files

Files changed (4)
  1. app.py +261 -0
  2. audio_splitting.py +26 -0
  3. feature_extraction.py +115 -0
  4. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,261 @@
+ import joblib
+ import pandas as pd
+ import streamlit as st
+ import tensorflow
+ from keras.losses import binary_crossentropy
+ from keras.optimizers import Adam
+
+ # Local Imports
+ import audio_splitting
+ import feature_extraction
+
+ st.set_page_config(layout="wide")
+
+
+ def display(model_name, col2):
+     xgb_multi_class_names = ["Rock", "Rap & Hip-Hop", "Soul", "Classical", "Dance & Electronic", "Blues", "Jazz",
+                              "Country", "Bebop", "Folk", "Reggae", "R&B", "Punk", "Metal", "Pop"]
+     # Two different lists are needed because the XGB multi-label model was trained
+     # with the genres in a different order than the other models.
+     xmulti_class_names = ["Metal", "Blues", "Reggae", "Jazz", "Rock", "Folk", "Classical", "Dance & Electronic",
+                           "Punk", "Bebop", "Pop", "R&B", "Country", "Rap & Hip-Hop", "Soul"]
+
+     if model_name == "XGB - (Multi Label)":
+         predicted_indices = model.predict(reshaped_features)
+         predicted_labels = []
+         for i in range(len(predicted_indices[0])):
+             if predicted_indices[0][i] == 1.0:
+                 predicted_labels.append(xgb_multi_class_names[i])
+         if predicted_labels:
+             labels = ', '.join(predicted_labels)
+             with col2:
+                 st.metric("Predicted Genres:", labels, label_visibility='collapsed')
+         else:
+             with col2:
+                 st.caption("No genres predicted for this input.")
+
+     elif model_name == "XGB Classifier - (Single Label)":
+         predicted_indices = model.predict(reshaped_features)
+         predicted_labels = [class_indices[i] for i in predicted_indices]
+         with col2:
+             st.metric("Predicted Genre:", str(predicted_labels[0]), label_visibility="collapsed")
+
+     elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)" \
+             or model_name == "Neural Network - (Multi Label)" \
+             or model_name == "Convolutional Neural Network - (Multi Label)" \
+             or model_name == "Neural Network with Batch Normalization - (Multi Label)":
+         predicted_probabilities = model.predict(reshaped_features)
+         threshold = 0.3
+         if model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
+             predicted_labels = [class_name for i, class_name in enumerate(multi_class_names) if
+                                 predicted_probabilities[0][i] >= threshold]
+             probabilities = [(class_name, predicted_probabilities[0][i] * 100) for i, class_name in
+                              enumerate(multi_class_names)]
+         else:
+             predicted_labels = [class_name for i, class_name in enumerate(xmulti_class_names) if
+                                 predicted_probabilities[0][i] >= threshold]
+             probabilities = [(class_name, predicted_probabilities[0][i] * 100) for i, class_name in
+                              enumerate(xmulti_class_names)]
+
+         if predicted_labels:
+             with col2:
+                 st.metric("Predicted Genres:", ', '.join(predicted_labels))
+                 st.caption("Below is the probability of the sample belonging to each genre. Any probability "
+                            "below the threshold (0.3) is interpreted as the sample not being of that genre.")
+                 df = pd.DataFrame(probabilities, columns=["Genre", "Probability"])
+                 st.dataframe(df, hide_index=True, use_container_width=True)
+         else:
+             st.write("No genre predicted above the threshold.")
+
+     else:
+         predicted_label = model.predict(reshaped_features)[0]
+         with col2:
+             st.metric("Predicted Genre:", str(predicted_label).capitalize(), label_visibility="collapsed")
+
+
+ # Vars
+ fields_df = ['Chromagram Short-Time Fourier Transform (Chroma-STFT)',
+              'Root Mean Square Energy (RMS)',
+              'Spectral Centroid',
+              'Spectral Bandwidth',
+              'Spectral Rolloff',
+              'Zero Crossing Rate',
+              'Harmony',
+              'Percussion',
+              'Tempo',
+              'Mel-Frequency Cepstral Coefficients (MFCC-1)',
+              'MFCC-2',
+              'MFCC-3',
+              'MFCC-4',
+              'MFCC-5',
+              'MFCC-6',
+              'MFCC-7',
+              'MFCC-8',
+              'MFCC-9',
+              'MFCC-10',
+              'MFCC-11',
+              'MFCC-12',
+              'MFCC-13',
+              'MFCC-14',
+              'MFCC-15',
+              'MFCC-16',
+              'MFCC-17',
+              'MFCC-18',
+              'MFCC-19',
+              'MFCC-20']
+
+ url_single_label = "https://huggingface.co/spaces/Hetan07/Single_Label_Music_Genre_Classifier"
+ url_github = "https://github.com/Hetan07/Multi-Label-Music-Genre-Classifier"
+ url_docs = "https://librosa.org/doc/latest/index.html"
+
+ st.title("Multi-Label Music Genre Classifier")
+ st.write("A multi-label music genre classifier that extends my previous [project](%s). "
+          "The source files are available both on HuggingFace and on [Github](%s). "
+          "The dataset had to be created specifically for this task, as none was available with the features "
+          "and multi-label tags for each audio sample. "
+          "All the models have been trained on the created dataset." % (url_single_label, url_github))
+
+ st.divider()
+ st.subheader('On Dataset Creation')
+
+ with st.expander("See explanation"):
+     s = 'The work done to create the dataset included:\n' \
+         '- Downloading songs sampled randomly from the MuMu dataset across ~80 genres (tags)\n' \
+         '- Cleaning the data, which meant replacing many of the downloaded songs, since a number of them ' \
+         'were album intros, interludes or skits\n' \
+         '- Replacing songs that were not available on any platform with a suitable alternative track, or ' \
+         'manually searching for and downloading them\n' \
+         '- Checking each file carefully to rule out distortion or other disturbances\n' \
+         '- Applying feature extraction to each downloaded song using the librosa library\n' \
+         '- Reducing the labels from ~80 to around ~15\n' \
+         '\nIn the end I decided to run feature extraction on 3-second samples, which gives around ~24,000 ' \
+         'samples. I have linked the dataset created through all these steps, in case anyone wishes to build ' \
+         'on it further.\n'
+
+     st.markdown(s)
+ st.divider()
+
+ st.subheader("Prediction of the following genres")
+
+ multi_class_names = ["Bebop", "Blues", "Classical", "Country", "Dance & Electronic", "Folk", "Jazz", "Metal",
+                      "Pop", "Punk", "R&B", "Rap & Hip-Hop", "Reggae", "Rock", "Soul"]
+
+ class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
+                "Jazz", "Metal", "Pop", "Reggae", "Rock"]
+
+ class_indices = {i: class_name for i, class_name in enumerate(class_names)}
+
+ col1, col2 = st.columns(2)
+ s = ''
+ with col1:
+     for i in multi_class_names[:8]:
+         s += "- " + i + "\n"
+     st.markdown(s)
+
+ s = ''
+ with col2:
+     for i in multi_class_names[8:]:
+         s += "- " + i + "\n"
+     st.markdown(s)
+
+ st.divider()
+ # Upload music file
+ st.subheader("Upload a music file")
+ uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav", "ogg"], label_visibility="collapsed")
+
+ st.divider()
+ if uploaded_file is not None:
+     # User selects a model
+     all_models = ["K-Nearest Neighbors - (Single Label)",
+                   "Logistic Regression - (Single Label)",
+                   "Support Vector Machines - (Single Label)",
+                   "Neural Network - (Single Label)",
+                   "XGB Classifier - (Single Label)",
+                   "Convolutional Recurrent Neural Network - (Multi Label)",
+                   "Convolutional Neural Network - (Multi Label)",
+                   "XGB - (Multi Label)",
+                   "Neural Network - (Multi Label)",
+                   "Neural Network with Batch Normalization - (Multi Label)"]
+
+     features_list, val_list = audio_splitting.split_audio(uploaded_file)
+     features = feature_extraction.scale(features_list)
+
+     feature_copy = features_list.copy()  # copy so the original feature list is not mutated
+     feature_copy.insert(19, "-")  # placeholder variance for 'tempo', which has none
+     st.header("Feature Extraction")
+
+     st.write("The uploaded audio sample is processed with the librosa library to extract the features the "
+              "models use for genre prediction. The dataframe below lists each extracted feature along with its "
+              "mean and variance. The [*librosa*](%s) docs can be consulted for a more in-depth explanation and "
+              "implementation specifics." % url_docs)
+
+     col3, col4 = st.columns([0.55, 0.45])
+     # Features Dataframe
+     df = pd.DataFrame({
+         "name": fields_df,
+         "Mean": feature_copy[2::2],
+         "Variance": feature_copy[3::2]
+     })
+
+     st.dataframe(
+         df,
+         column_config={
+             "name": "Features",
+             "Mean": "Mean of Feature",
+             "Variance": "Variance of Feature"
+         },
+         use_container_width=True
+     )
+     st.caption("Note: the harmonic and percussive features generally have means on the order of 1e-5 or 1e-6 "
+                "and are therefore displayed as 0. Also, no variance is listed for the feature 'tempo', for "
+                "consistency with the original GTZAN dataset.")
+
+     st.divider()
+
+     col1, col2 = st.columns([0.45, 0.55])
+
+     col1.subheader("Select a model")
+     with col1:
+         model_name = st.selectbox("Select a model", all_models, label_visibility="collapsed")
+
+     if model_name == "K-Nearest Neighbors - (Single Label)":
+         model = joblib.load("./models/knn.pkl")
+     elif model_name == "Logistic Regression - (Single Label)":
+         model = joblib.load("./models/logistic.pkl")
+     elif model_name == "Support Vector Machines - (Single Label)":
+         model = joblib.load("./models/svm.pkl")
+     elif model_name == "Neural Network - (Single Label)":
+         model = joblib.load("./models/nn.pkl")
+     elif model_name == "XGB Classifier - (Single Label)":
+         model = joblib.load("./models/xgb.pkl")
+     elif model_name == "XGB - (Multi Label)":
+         model = joblib.load("./models/xgb_mlb.pkl")
+     elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_crnn1.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     elif model_name == "Neural Network - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_nn.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     elif model_name == "Neural Network with Batch Normalization - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_bn.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     elif model_name == "Convolutional Neural Network - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_cnn.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     col2.subheader("Predicted genre")
+
+     # Reshape the features to match the expected shape for prediction
+     reshaped_features = features.reshape(1, -1)
+     display(model_name, col2)
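Note: app.py loads serialized models from a ./models/ directory, but this commit uploads only the four files listed above, so those model files are assumed to already exist in the Space. A minimal guard one might add near the top of app.py (hypothetical, not part of this upload):

    import os

    import streamlit as st

    # Fail early with a readable message if the serialized models are missing.
    if not os.path.isdir("./models"):
        st.error("The serialized models are expected in ./models alongside app.py.")
        st.stop()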
audio_splitting.py ADDED
@@ -0,0 +1,26 @@
+ import io
+
+ from pydub import AudioSegment
+
+ import feature_extraction
+
+
+ def split_audio(uploaded_file):
+     audio = AudioSegment.from_file(uploaded_file)
+
+     segment_duration = 3 * 1000  # 3 seconds in milliseconds
+     audio_duration = len(audio)
+
+     # Check if the audio is shorter than 1 minute and 3 seconds
+     if audio_duration < 63 * 1000:
+         # If it's shorter, take audio from 0 to 3 seconds
+         segment = audio[:segment_duration]
+     else:
+         # If it's longer, take audio from 1 minute to 1 minute 3 seconds
+         start_time = 60 * 1000
+         end_time = start_time + segment_duration
+         segment = audio[start_time:end_time]
+
+     output_stream = io.BytesIO()
+     segment.export(output_stream, format="wav")
+     output_stream.seek(0)
+
+     # Process and extract features from the segment
+     features = feature_extraction.all_feature_extraction(output_stream)
+     return features
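For context, the dataset creation described in app.py sliced whole tracks into consecutive 3-second windows, whereas split_audio above extracts a single window at inference time. A sketch of what that offline slicing step might have looked like (the helper name split_into_windows is hypothetical; it reuses all_feature_extraction from feature_extraction.py below):

    import io

    from pydub import AudioSegment

    import feature_extraction

    def split_into_windows(path):
        # Slice a full track into consecutive 3-second windows and
        # extract one feature row per window.
        audio = AudioSegment.from_file(path)
        step = 3 * 1000  # 3 seconds in milliseconds
        rows = []
        for start in range(0, len(audio) - step + 1, step):
            buf = io.BytesIO()
            audio[start:start + step].export(buf, format="wav")
            buf.seek(0)
            data_list, _ = feature_extraction.all_feature_extraction(buf)
            rows.append(data_list)
        return rows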
feature_extraction.py ADDED
@@ -0,0 +1,115 @@
+ import joblib
+ import librosa
+ import numpy as np
+ import soundfile as sf
+
+ scaler = joblib.load("./models/std_scaler(1).pkl")
+
+
+ def load_audio_from_uploaded_file(uploaded_file):
+     # Use the soundfile library to read the audio data and sample rate
+     audio_data, sample_rate = sf.read(uploaded_file)
+
+     return audio_data, sample_rate
+
+ Fields = ['name', 'length', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean', 'rms_var',
+           'spectral_centroid_mean', 'spectral_centroid_var', 'spectral_bandwidth_mean', 'spectral_bandwidth_var',
+           'rolloff_mean', 'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
+           'harmony_mean', 'harmony_var', 'percussive_mean', 'percussive_var', 'tempo',
+           'mfcc1_mean', 'mfcc1_var', 'mfcc2_mean', 'mfcc2_var', 'mfcc3_mean', 'mfcc3_var', 'mfcc4_mean', 'mfcc4_var',
+           'mfcc5_mean', 'mfcc5_var', 'mfcc6_mean', 'mfcc6_var', 'mfcc7_mean', 'mfcc7_var', 'mfcc8_mean', 'mfcc8_var',
+           'mfcc9_mean', 'mfcc9_var', 'mfcc10_mean', 'mfcc10_var', 'mfcc11_mean', 'mfcc11_var', 'mfcc12_mean', 'mfcc12_var',
+           'mfcc13_mean', 'mfcc13_var', 'mfcc14_mean', 'mfcc14_var', 'mfcc15_mean', 'mfcc15_var', 'mfcc16_mean', 'mfcc16_var',
+           'mfcc17_mean', 'mfcc17_var', 'mfcc18_mean', 'mfcc18_var', 'mfcc19_mean', 'mfcc19_var', 'mfcc20_mean', 'mfcc20_var']
+
+ short_field = Fields[2:]
+
+
+ def all_feature_extraction(audio_path, sample_rate=22050):
+     data_list = []
+     val_field = []
+     audio_df, sr = librosa.load(audio_path, sr=sample_rate)
+     data_list.append(audio_path)
+     data_list.append(len(audio_df))
+
+     # 1. Chroma STFT
+     chroma_stft = librosa.feature.chroma_stft(y=audio_df, hop_length=512)
+     chroma_stft_mean = np.mean(chroma_stft)
+     chroma_stft_var = np.var(chroma_stft)
+
+     val_field.append(chroma_stft)
+     data_list.append(chroma_stft_mean)
+     data_list.append(chroma_stft_var)
+
+     # 2. RMS
+     rms = librosa.feature.rms(y=audio_df)
+     rms_mean = np.mean(rms)
+     rms_var = np.var(rms)
+     data_list.append(rms_mean)
+     data_list.append(rms_var)
+
+     # 3. Spectral centroid
+     spectral_centroid = librosa.feature.spectral_centroid(y=audio_df)
+     spectral_centroid_mean = np.mean(spectral_centroid)
+     spectral_centroid_var = np.var(spectral_centroid)
+     data_list.append(spectral_centroid_mean)
+     data_list.append(spectral_centroid_var)
+
+     # 4. Spectral bandwidth
+     spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_df)
+     spectral_bandwidth_mean = np.mean(spectral_bandwidth)
+     spectral_bandwidth_var = np.var(spectral_bandwidth)
+     data_list.append(spectral_bandwidth_mean)
+     data_list.append(spectral_bandwidth_var)
+
+     # 5. Spectral rolloff
+     spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_df)
+     spectral_rolloff_mean = np.mean(spectral_rolloff)
+     spectral_rolloff_var = np.var(spectral_rolloff)
+     data_list.append(spectral_rolloff_mean)
+     data_list.append(spectral_rolloff_var)
+
+     # 6. Zero crossing rate
+     zcr = librosa.feature.zero_crossing_rate(y=audio_df)
+     zcr_mean = np.mean(zcr)
+     zcr_var = np.var(zcr)
+     data_list.append(zcr_mean)
+     data_list.append(zcr_var)
+
+     # 7. Harmonic / percussive separation
+     harmonic, percussive = librosa.effects.hpss(y=audio_df)
+     harmonic_mean = np.mean(harmonic)
+     harmonic_var = np.var(harmonic)
+     percussive_mean = np.mean(percussive)
+     percussive_var = np.var(percussive)
+     data_list.append(harmonic_mean)
+     data_list.append(harmonic_var)
+     data_list.append(percussive_mean)
+     data_list.append(percussive_var)
+
+     # 8. Tempo (mean only, to match the GTZAN layout)
+     tempo = librosa.feature.tempo(y=audio_df)
+     tempo = np.mean(tempo)
+     data_list.append(tempo)
+
+     # 9. MFCCs (mean and variance of each of the 20 coefficients)
+     mfccs = librosa.feature.mfcc(y=audio_df, sr=sr)
+     row_means = np.mean(mfccs, axis=1)
+     row_vars = np.var(mfccs, axis=1)
+     mfcc_means = {}
+     mfcc_vars = {}
+     for i in range(1, 21):
+         variable_name = f'mfcc{i}'
+         mfcc_means[variable_name] = row_means[i - 1]
+         mfcc_vars[variable_name] = row_vars[i - 1]
+     # Interleave the means and variances in mfcc1..mfcc20 order
+     mfcc_list = list(zip(mfcc_means.values(), mfcc_vars.values()))
+
+     for mean, var in mfcc_list:
+         data_list.append(mean)
+         data_list.append(var)
+
+     return [data_list, val_field]
+
+
+ def scale(initial_features):
+     # Drop the non-numeric 'name' and 'length' entries before scaling
+     final_features = initial_features[2:]
+     final_features = np.array(final_features)
+     # Apply the loaded scaler to the single data point
+     scaled_data_point = scaler.transform([final_features])
+     return scaled_data_point
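As a quick sanity check of the two helpers above, the scaled output should be a single row of 57 features, since scale drops the 'name' and 'length' entries. A minimal sketch (the file name sample.wav is hypothetical):

    import feature_extraction

    # Extract the raw feature row, then apply the stored StandardScaler.
    features_list, _ = feature_extraction.all_feature_extraction("sample.wav")
    scaled = feature_extraction.scale(features_list)
    print(scaled.shape)  # (1, 57): 17 spectral/rhythm values plus 20 MFCC mean/variance pairs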
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ joblib==1.3.2
+ keras==2.12.0
+ librosa==0.10.1
+ matplotlib==3.7.2
+ numpy==1.23.5
+ pandas==2.0.3
+ scikit-learn==1.2.2
+ seaborn==0.12.2
+ tensorflow==2.12.0
+ xgboost==1.7.6
+ # Unpinned dependencies also imported by the app
+ streamlit
+ pydub
+ soundfile