Hetan07 committed on
Commit
d76d950
1 Parent(s): b27f628

Upload 4 files

Files changed (4)
  1. app.py +261 -0
  2. audio_splitting.py +26 -0
  3. feature_extraction.py +115 -0
  4. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,261 @@
+ import joblib
+ import pandas as pd
+ import streamlit as st
+ import tensorflow
+ from keras.losses import binary_crossentropy
+ from keras.optimizers import Adam
+
+ # Local Imports
+ import audio_splitting
+ import feature_extraction
+
+ st.set_page_config(layout="wide")
+
+
+ def display(model_name, col2):
+     xgb_multi_class_names = ["Rock", "Rap & Hip-Hop", "Soul", "Classical", "Dance & Electronic", "Blues", "Jazz",
+                              "Country", "Bebop", "Folk", "Reggae", "R&B", "Punk", "Metal", "Pop"]
+     # Two different lists are needed because the XGB multi-label model was trained
+     # with the genres in a different order than the other models.
+     xmulti_class_names = ["Metal", "Blues", "Reggae", "Jazz", "Rock", "Folk", "Classical", "Dance & Electronic",
+                           "Punk", "Bebop", "Pop", "R&B", "Country", "Rap & Hip-Hop", "Soul"]
+
+     if model_name == "XGB - (Multi Label)":
+         predicted_indices = model.predict(reshaped_features)
+         predicted_labels = []
+         for i in range(len(predicted_indices[0])):
+             if predicted_indices[0][i] == 1.0:
+                 predicted_labels.append(xgb_multi_class_names[i])
+         if predicted_labels:
+             labels = ', '.join(predicted_labels)
+             with col2:
+                 st.metric("Predicted Genres:", labels, label_visibility='collapsed')
+         else:
+             with col2:
+                 st.caption("No genres predicted for this input.")
+
+     elif model_name == "XGB Classifier - (Single Label)":
+         predicted_indices = model.predict(reshaped_features)
+         predicted_labels = [class_indices[i] for i in predicted_indices]
+         with col2:
+             st.metric("Predicted Genre:", str(predicted_labels[0]), label_visibility="collapsed")
+
+     elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)" \
+             or model_name == "Neural Network - (Multi Label)" \
+             or model_name == "Convolutional Neural Network - (Multi Label)" \
+             or model_name == "Neural Network with Batch Normalization - (Multi Label)":
+         predicted_probabilities = model.predict(reshaped_features)
+         threshold = 0.3
+         if model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
+             predicted_labels = [class_name for i, class_name in enumerate(multi_class_names) if
+                                 predicted_probabilities[0][i] >= threshold]
+             probabilities = [(class_name, predicted_probabilities[0][i] * 100) for i, class_name in
+                              enumerate(multi_class_names)]
+         else:
+             predicted_labels = [class_name for i, class_name in enumerate(xmulti_class_names) if
+                                 predicted_probabilities[0][i] >= threshold]
+             probabilities = [(class_name, predicted_probabilities[0][i] * 100) for i, class_name in
+                              enumerate(xmulti_class_names)]
+
+         if predicted_labels:
+             with col2:
+                 st.metric("Predicted Genres:", ', '.join(predicted_labels))
+                 st.caption("Below is the probability of the sample belonging to each genre. Any probability "
+                            "below the threshold (0.3) is interpreted as the sample not being of that genre.")
+                 df = pd.DataFrame(probabilities, columns=["Genre", "Probability"])
+                 st.dataframe(df, hide_index=True, use_container_width=True)
+         else:
+             st.write("No genre predicted above the threshold.")
+
+     else:
+         predicted_label = model.predict(reshaped_features)[0]
+         with col2:
+             st.metric("Predicted Genre:", str(predicted_label).capitalize(), label_visibility="collapsed")
+
+
+ # Vars
+ fields_df = ['Chromagram Short-Time Fourier Transform (Chroma-STFT)',
+              'Root Mean Square Energy (RMS)',
+              'Spectral Centroid',
+              'Spectral Bandwidth',
+              'Spectral Rolloff',
+              'Zero Crossing Rate',
+              'Harmony',
+              'Percussion',
+              'Tempo',
+              'Mel-Frequency Cepstral Coefficients (MFCC-1)',
+              'MFCC-2',
+              'MFCC-3',
+              'MFCC-4',
+              'MFCC-5',
+              'MFCC-6',
+              'MFCC-7',
+              'MFCC-8',
+              'MFCC-9',
+              'MFCC-10',
+              'MFCC-11',
+              'MFCC-12',
+              'MFCC-13',
+              'MFCC-14',
+              'MFCC-15',
+              'MFCC-16',
+              'MFCC-17',
+              'MFCC-18',
+              'MFCC-19',
+              'MFCC-20']
+
+ url_single_label = "https://huggingface.co/spaces/Hetan07/Single_Label_Music_Genre_Classifier"
+ url_github = "https://github.com/Hetan07/Multi-Label-Music-Genre-Classifier"
+ url_docs = "https://librosa.org/doc/latest/index.html"
+
+ st.title("Multi-Label Music Genre Classifier")
+ st.write("A multi-label music genre classifier that extends my previous [project](%s). "
+          "The source files are available both on HuggingFace and on [Github](%s). "
+          "The dataset had to be created specifically for this task, as none was available with the features "
+          "and multi-label tags for each audio sample. "
+          "All the models have been trained on the created dataset." % (url_single_label, url_github))
+
+ st.divider()
+ st.subheader('On Dataset Creation')
+
+ with st.expander("See explanation"):
+     s = 'The work done to create the dataset included:\n' \
+         '- Downloading songs sampled randomly from the MuMu dataset across ~80 genres (tags)\n' \
+         '- Cleaning the data, which meant replacing many of the downloaded songs, since a number of them ' \
+         'were album intros, interludes or skits\n' \
+         '- Replacing songs that were not available on any platform with a suitable alternative track, or ' \
+         'manually searching for and downloading them\n' \
+         '- Checking each file carefully to rule out distortion or other disturbances\n' \
+         '- Applying feature extraction to each downloaded song using the librosa library\n' \
+         '- Reducing the labels from ~80 to around ~15\n' \
+         '\nIn the end I decided to run feature extraction on 3-second samples, which gives around ~24,000 ' \
+         'samples. I have linked the dataset created through all these steps, in case anyone wishes to build ' \
+         'on it further.\n'
+
+     st.markdown(s)
+ st.divider()
+
+ st.subheader("Prediction of the following genres")
+
+ multi_class_names = ["Bebop", "Blues", "Classical", "Country", "Dance & Electronic", "Folk", "Jazz", "Metal",
+                      "Pop", "Punk", "R&B", "Rap & Hip-Hop", "Reggae", "Rock", "Soul"]
+
+ class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
+                "Jazz", "Metal", "Pop", "Reggae", "Rock"]
+
+ class_indices = {i: class_name for i, class_name in enumerate(class_names)}
+
+ col1, col2 = st.columns(2)
+ s = ''
+ with col1:
+     for i in multi_class_names[:8]:
+         s += "- " + i + "\n"
+     st.markdown(s)
+
+ s = ''
+ with col2:
+     for i in multi_class_names[8:]:
+         s += "- " + i + "\n"
+     st.markdown(s)
+
+ st.divider()
+ # Upload music file
+ st.subheader("Upload a music file")
+ uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav", "ogg"], label_visibility="collapsed")
+
+ st.divider()
+ if uploaded_file is not None:
+     # User selects a model
+     all_models = ["K-Nearest Neighbors - (Single Label)",
+                   "Logistic Regression - (Single Label)",
+                   "Support Vector Machines - (Single Label)",
+                   "Neural Network - (Single Label)",
+                   "XGB Classifier - (Single Label)",
+                   "Convolutional Recurrent Neural Network - (Multi Label)",
+                   "Convolutional Neural Network - (Multi Label)",
+                   "XGB - (Multi Label)",
+                   "Neural Network - (Multi Label)",
+                   "Neural Network with Batch Normalization - (Multi Label)"]
+
+     features_list, val_list = audio_splitting.split_audio(uploaded_file)
+     features = feature_extraction.scale(features_list)
+
+     feature_copy = features_list.copy()  # copy so the original feature list is not mutated
+     feature_copy.insert(19, "-")  # placeholder variance for 'tempo', which has none
+     st.header("Feature Extraction")
+
+     st.write("The uploaded audio sample is processed with the librosa library to extract the features the "
+              "models use for genre prediction. The dataframe below lists each extracted feature along with its "
+              "mean and variance. The [*librosa*](%s) docs can be consulted for a more in-depth explanation and "
+              "implementation specifics." % url_docs)
+
+     col3, col4 = st.columns([0.55, 0.45])
+     # Features Dataframe
+     df = pd.DataFrame({
+         "name": fields_df,
+         "Mean": feature_copy[2::2],
+         "Variance": feature_copy[3::2]
+     })
+
+     st.dataframe(
+         df,
+         column_config={
+             "name": "Features",
+             "Mean": "Mean of Feature",
+             "Variance": "Variance of Feature"
+         },
+         use_container_width=True
+     )
+     st.caption("Note: the harmonic and percussive features generally have means on the order of 1e-5 or 1e-6 "
+                "and are therefore displayed as 0. Also, no variance is listed for the feature 'tempo', for "
+                "consistency with the original GTZAN dataset.")
+
+     st.divider()
+
+     col1, col2 = st.columns([0.45, 0.55])
+
+     col1.subheader("Select a model")
+     with col1:
+         model_name = st.selectbox("Select a model", all_models, label_visibility="collapsed")
+
+     if model_name == "K-Nearest Neighbors - (Single Label)":
+         model = joblib.load("./models/knn.pkl")
+     elif model_name == "Logistic Regression - (Single Label)":
+         model = joblib.load("./models/logistic.pkl")
+     elif model_name == "Support Vector Machines - (Single Label)":
+         model = joblib.load("./models/svm.pkl")
+     elif model_name == "Neural Network - (Single Label)":
+         model = joblib.load("./models/nn.pkl")
+     elif model_name == "XGB Classifier - (Single Label)":
+         model = joblib.load("./models/xgb.pkl")
+     elif model_name == "XGB - (Multi Label)":
+         model = joblib.load("./models/xgb_mlb.pkl")
+     elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_crnn1.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     elif model_name == "Neural Network - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_nn.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     elif model_name == "Neural Network with Batch Normalization - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_bn.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     elif model_name == "Convolutional Neural Network - (Multi Label)":
+         model = tensorflow.keras.models.load_model("./models/model_cnn.h5", compile=False)
+         model.compile(loss=binary_crossentropy,
+                       optimizer=Adam(),
+                       metrics=['accuracy'])
+     col2.subheader("Predicted genre")
+
+     # Reshape the features to match the expected shape for prediction
+     reshaped_features = features.reshape(1, -1)
+     display(model_name, col2)
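Note: app.py loads serialized models from a ./models/ directory, but this commit uploads only the four files listed above, so those model files are assumed to already exist in the Space. A minimal guard one might add near the top of app.py (hypothetical, not part of this upload):

    import os

    import streamlit as st

    # Fail early with a readable message if the serialized models are missing.
    if not os.path.isdir("./models"):
        st.error("The serialized models are expected in ./models alongside app.py.")
        st.stop()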
audio_splitting.py ADDED
@@ -0,0 +1,26 @@
+ import io
+
+ from pydub import AudioSegment
+
+ import feature_extraction
+
+
+ def split_audio(uploaded_file):
+     audio = AudioSegment.from_file(uploaded_file)
+
+     segment_duration = 3 * 1000  # 3 seconds in milliseconds
+     audio_duration = len(audio)
+
+     # Check if the audio is shorter than 1 minute and 3 seconds
+     if audio_duration < 63 * 1000:
+         # If it's shorter, take audio from 0 to 3 seconds
+         segment = audio[:segment_duration]
+     else:
+         # If it's longer, take audio from 1 minute to 1 minute 3 seconds
+         start_time = 60 * 1000
+         end_time = start_time + segment_duration
+         segment = audio[start_time:end_time]
+
+     output_stream = io.BytesIO()
+     segment.export(output_stream, format="wav")
+     output_stream.seek(0)
+
+     # Process and extract features from the segment
+     features = feature_extraction.all_feature_extraction(output_stream)
+     return features
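For context, the dataset creation described in app.py sliced whole tracks into consecutive 3-second windows, whereas split_audio above extracts a single window at inference time. A sketch of what that offline slicing step might have looked like (the helper name split_into_windows is hypothetical; it reuses all_feature_extraction from feature_extraction.py below):

    import io

    from pydub import AudioSegment

    import feature_extraction

    def split_into_windows(path):
        # Slice a full track into consecutive 3-second windows and
        # extract one feature row per window.
        audio = AudioSegment.from_file(path)
        step = 3 * 1000  # 3 seconds in milliseconds
        rows = []
        for start in range(0, len(audio) - step + 1, step):
            buf = io.BytesIO()
            audio[start:start + step].export(buf, format="wav")
            buf.seek(0)
            data_list, _ = feature_extraction.all_feature_extraction(buf)
            rows.append(data_list)
        return rows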
feature_extraction.py ADDED
@@ -0,0 +1,115 @@
+ import joblib
+ import librosa
+ import numpy as np
+ import soundfile as sf
+
+ scaler = joblib.load("./models/std_scaler(1).pkl")
+
+
+ def load_audio_from_uploaded_file(uploaded_file):
+     # Use the soundfile library to read the audio data and sample rate
+     audio_data, sample_rate = sf.read(uploaded_file)
+
+     return audio_data, sample_rate
+
+ Fields = ['name', 'length', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean', 'rms_var',
+           'spectral_centroid_mean', 'spectral_centroid_var', 'spectral_bandwidth_mean', 'spectral_bandwidth_var',
+           'rolloff_mean', 'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
+           'harmony_mean', 'harmony_var', 'percussive_mean', 'percussive_var', 'tempo',
+           'mfcc1_mean', 'mfcc1_var', 'mfcc2_mean', 'mfcc2_var', 'mfcc3_mean', 'mfcc3_var', 'mfcc4_mean', 'mfcc4_var',
+           'mfcc5_mean', 'mfcc5_var', 'mfcc6_mean', 'mfcc6_var', 'mfcc7_mean', 'mfcc7_var', 'mfcc8_mean', 'mfcc8_var',
+           'mfcc9_mean', 'mfcc9_var', 'mfcc10_mean', 'mfcc10_var', 'mfcc11_mean', 'mfcc11_var', 'mfcc12_mean', 'mfcc12_var',
+           'mfcc13_mean', 'mfcc13_var', 'mfcc14_mean', 'mfcc14_var', 'mfcc15_mean', 'mfcc15_var', 'mfcc16_mean', 'mfcc16_var',
+           'mfcc17_mean', 'mfcc17_var', 'mfcc18_mean', 'mfcc18_var', 'mfcc19_mean', 'mfcc19_var', 'mfcc20_mean', 'mfcc20_var']
+
+ short_field = Fields[2:]
+
+
+ def all_feature_extraction(audio_path, sample_rate=22050):
+     data_list = []
+     val_field = []
+     audio_df, sr = librosa.load(audio_path, sr=sample_rate)
+     data_list.append(audio_path)
+     data_list.append(len(audio_df))
+
+     # 1. Chroma STFT
+     chroma_stft = librosa.feature.chroma_stft(y=audio_df, hop_length=512)
+     chroma_stft_mean = np.mean(chroma_stft)
+     chroma_stft_var = np.var(chroma_stft)
+
+     val_field.append(chroma_stft)
+     data_list.append(chroma_stft_mean)
+     data_list.append(chroma_stft_var)
+
+     # 2. RMS
+     rms = librosa.feature.rms(y=audio_df)
+     rms_mean = np.mean(rms)
+     rms_var = np.var(rms)
+     data_list.append(rms_mean)
+     data_list.append(rms_var)
+
+     # 3. Spectral centroid
+     spectral_centroid = librosa.feature.spectral_centroid(y=audio_df)
+     spectral_centroid_mean = np.mean(spectral_centroid)
+     spectral_centroid_var = np.var(spectral_centroid)
+     data_list.append(spectral_centroid_mean)
+     data_list.append(spectral_centroid_var)
+
+     # 4. Spectral bandwidth
+     spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_df)
+     spectral_bandwidth_mean = np.mean(spectral_bandwidth)
+     spectral_bandwidth_var = np.var(spectral_bandwidth)
+     data_list.append(spectral_bandwidth_mean)
+     data_list.append(spectral_bandwidth_var)
+
+     # 5. Spectral rolloff
+     spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_df)
+     spectral_rolloff_mean = np.mean(spectral_rolloff)
+     spectral_rolloff_var = np.var(spectral_rolloff)
+     data_list.append(spectral_rolloff_mean)
+     data_list.append(spectral_rolloff_var)
+
+     # 6. Zero crossing rate
+     zcr = librosa.feature.zero_crossing_rate(y=audio_df)
+     zcr_mean = np.mean(zcr)
+     zcr_var = np.var(zcr)
+     data_list.append(zcr_mean)
+     data_list.append(zcr_var)
+
+     # 7. Harmonic / percussive separation
+     harmonic, percussive = librosa.effects.hpss(y=audio_df)
+     harmonic_mean = np.mean(harmonic)
+     harmonic_var = np.var(harmonic)
+     percussive_mean = np.mean(percussive)
+     percussive_var = np.var(percussive)
+     data_list.append(harmonic_mean)
+     data_list.append(harmonic_var)
+     data_list.append(percussive_mean)
+     data_list.append(percussive_var)
+
+     # 8. Tempo (mean only, to match the GTZAN layout)
+     tempo = librosa.feature.tempo(y=audio_df)
+     tempo = np.mean(tempo)
+     data_list.append(tempo)
+
+     # 9. MFCCs (mean and variance of each of the 20 coefficients)
+     mfccs = librosa.feature.mfcc(y=audio_df, sr=sr)
+     row_means = np.mean(mfccs, axis=1)
+     row_vars = np.var(mfccs, axis=1)
+     mfcc_means = {}
+     mfcc_vars = {}
+     for i in range(1, 21):
+         variable_name = f'mfcc{i}'
+         mfcc_means[variable_name] = row_means[i - 1]
+         mfcc_vars[variable_name] = row_vars[i - 1]
+     # Interleave the means and variances in mfcc1..mfcc20 order
+     mfcc_list = list(zip(mfcc_means.values(), mfcc_vars.values()))
+
+     for mean, var in mfcc_list:
+         data_list.append(mean)
+         data_list.append(var)
+
+     return [data_list, val_field]
+
+
+ def scale(initial_features):
+     # Drop the non-numeric 'name' and 'length' entries before scaling
+     final_features = initial_features[2:]
+     final_features = np.array(final_features)
+     # Apply the loaded scaler to the single data point
+     scaled_data_point = scaler.transform([final_features])
+     return scaled_data_point
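As a quick sanity check of the two helpers above, the scaled output should be a single row of 57 features, since scale drops the 'name' and 'length' entries. A minimal sketch (the file name sample.wav is hypothetical):

    import feature_extraction

    # Extract the raw feature row, then apply the stored StandardScaler.
    features_list, _ = feature_extraction.all_feature_extraction("sample.wav")
    scaled = feature_extraction.scale(features_list)
    print(scaled.shape)  # (1, 57): 17 spectral/rhythm values plus 20 MFCC mean/variance pairs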
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ joblib==1.3.2
+ keras==2.12.0
+ librosa==0.10.1
+ matplotlib==3.7.2
+ numpy==1.23.5
+ pandas==2.0.3
+ scikit-learn==1.2.2
+ seaborn==0.12.2
+ tensorflow==2.12.0
+ xgboost==1.7.6
+ # Unpinned dependencies also imported by the app
+ streamlit
+ pydub
+ soundfile