Upload 4 files
Browse files- app.py +261 -0
- audio_splitting.py +26 -0
- feature_extraction.py +115 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import joblib
|
2 |
+
import pandas as pd
|
3 |
+
import streamlit as st
|
4 |
+
import tensorflow
|
5 |
+
from keras.losses import binary_crossentropy
|
6 |
+
from keras.optimizers import Adam
|
7 |
+
|
8 |
+
import audio_splitting
|
9 |
+
# Local Imports
|
10 |
+
import feature_extraction
|
11 |
+
|
12 |
+
# Configure the Streamlit page to use the wide layout.
st.set_page_config(layout="wide")
|
13 |
+
|
14 |
+
|
15 |
+
def display(model_name, col2):
    """Run the selected model on the uploaded sample and render the result.

    Reads the module-level globals ``model``, ``reshaped_features``,
    ``class_indices`` and ``multi_class_names``, which the top-level script
    assigns before calling this function.

    Args:
        model_name: Entry chosen in the model select box; it decides how the
            output of ``model.predict`` is decoded.
        col2: Streamlit container in which the prediction is rendered.
    """
    # Genre order matching the output columns of the multi-label XGB model.
    xgb_multi_class_names = ["Rock", "Rap & Hip-Hop", "Soul", "Classical", "Dance & Electronic", "Blues", "Jazz",
                             "Country", "Bebop", "Folk", "Reggae", "R&B", "Punk", "Metal", "Pop"]
    # Two lists are needed because XGB and the other multi-label models were
    # trained with the genres in a different order.
    xmulti_class_names = ["Metal", "Blues", "Reggae", "Jazz", "Rock", "Folk", "Classical", "Dance & Electronic",
                          "Punk", "Bebop", "Pop", "R&B", "Country", "Rap & Hip-Hop", "Soul"]

    crnn_name = "Convolutional Recurrent Neural Network - (Multi Label)"
    # Models whose predict() output is decoded as one probability per genre.
    probability_models = (
        crnn_name,
        "Neural Network - (Multi Label)",
        "Convolutional Neural Network - (Multi Label)",
        # Name offered by the select box; without it this model fell through
        # to the single-label branch and showed a raw probability array.
        "Neural Network with Batch Normalization - (Multi Label)",
        # Short name kept so existing callers keep working.
        "Batch Normalization - (Multi Label)",
    )

    if model_name == "XGB - (Multi Label)":
        predicted_indices = model.predict(reshaped_features)
        # The first row holds one 0/1 flag per genre.
        predicted_labels = [genre for genre, flag in zip(xgb_multi_class_names, predicted_indices[0])
                            if flag == 1.0]
        with col2:
            if predicted_labels:
                st.metric("Predicted Genres: ", ', '.join(predicted_labels), label_visibility='collapsed')
            else:
                st.caption("No genres predicted for this input.")

    elif model_name == "XGB Classifier - (Single Label)":
        predicted_indices = model.predict(reshaped_features)
        # Map predicted class indices to genre names.
        predicted_labels = [class_indices[i] for i in predicted_indices]
        with col2:
            st.metric("Predicted Genre:", str(predicted_labels[0]), label_visibility="collapsed")

    elif model_name in probability_models:
        predicted_probabilities = model.predict(reshaped_features)
        threshold = 0.3
        # The CRNN uses the module-level genre order; the others use the local one.
        genre_order = multi_class_names if model_name == crnn_name else xmulti_class_names
        scores = predicted_probabilities[0]

        predicted_labels = [genre for genre, score in zip(genre_order, scores) if score >= threshold]
        # The table shows percentages, so scale each probability by 100.
        probabilities = [(genre, score * 100) for genre, score in zip(genre_order, scores)]

        with col2:
            if predicted_labels:
                st.metric("Predicted Genres:", ', '.join(predicted_labels))
                # The threshold is formatted from the variable (as a percentage,
                # like the table) so the caption cannot drift from the code.
                st.caption("Below is a list of the probability of the sample being classified into each genre. Any "
                           f"probability value below the threshold value (={threshold:.0%}) is interpreted as the "
                           "sample not being of that genre ")
                df = pd.DataFrame(probabilities, columns=["Genre", "Probabilities"])
                st.dataframe(df, hide_index=True, use_container_width=True)
            else:
                # NOTE(review): rendered in col2 to match the other branches;
                # the original indentation of this line was not recoverable.
                st.write("No genre predicted above the threshold.")

    else:
        # Remaining single-label models: the first prediction is shown directly.
        predicted_label = model.predict(reshaped_features)[0]
        with col2:
            st.metric("Predicted Genres:", str(predicted_label).capitalize(), label_visibility="collapsed")
|
78 |
+
|
79 |
+
|
80 |
+
# Row labels of the feature table shown to the user, one per extracted feature.
fields_df = [
    'Chromagram Short-Time Fourier Transform (Chroma-STFT)',
    'Root Mean Square Energy (RMS)',
    'Spectral Centroid',
    'Spectral Bandwidth',
    'Spectral Rolloff',
    'Zero Crossing Rate',
    'Harmony',
    'Percussion',
    'Tempo',
    # Only the first MFCC row spells out the abbreviation.
    'Mel-Frequency Cepstral Coefficients (MFCC-1)',
] + [f'MFCC-{k}' for k in range(2, 21)]

# Links interpolated into the page text.
url_single_label = "https://huggingface.co/spaces/Hetan07/Single_Label_Music_Genre_Classifier"
url_github = "https://github.com/Hetan07/Multi-Label-Music-Genre-Classifier"
url_docs = "https://librosa.org/doc/latest/index.html"
|
114 |
+
|
115 |
+
# Page header and project introduction.
st.title("Multi-Label Music Genre Classifier")
st.write(
    f"A multi-label music genre classifier based on the extension of my previous [project]({url_single_label}). "
    f"The source files have been provided both on HuggingFace and on [Github]({url_github}). "
    "Dataset had to be created specifically, as none was available with the features and multi-labels tags for "
    # Trailing space added: adjacent literals are joined with no separator,
    # which rendered "each audio.All the models".
    "each audio. "
    "All the models have been trained on the created dataset."
)

st.divider()
st.subheader('On Dataset Creation')

# Collapsible markdown description of how the training dataset was built.
with st.expander("See explanation"):
    s = (
        'The work done for creating the dataset were\n'
        '- Downloading the appropriate songs taken randomly from the MuMu dataset in sampled manner from ~80 genres ('
        'tags)\n'
        '- Data Cleaning which included to clean and replace the download songs as many of them were things such as album '
        'intros, interludes or skits\n'
        '- There were also issues where the song required was not available on any platform and so had to appropriately '
        'replaced for another proper track or I had to manually search and download\n'
        '- Each file had to properly checked to prevent any distortion or disturbances\n'
        '- Applying feature extraction on each downloaded song using the librosa library\n'
        '- Reducing the labels from ~80 to around ~15\n'
        '\nIn the end I decided to have feature extraction work on 3 second samples and thus have around ~24000 samples. '
        'I have linked the actual dataset created from all the steps if anyone wishes to work upon it further\n'
    )

    st.markdown(s)
st.divider()
|
141 |
+
|
142 |
+
st.subheader("Prediction of following genres")

# Genres of the multi-label models; display() also reads this list.
multi_class_names = ["Bebop", "Blues", "Classical", "Country", "Dance & Electronic", "Folk", "Jazz", "Metal",
                     "Pop", "Punk", "R&B", "Rap & Hip-Hop", "Reggae", "Rock", "Soul"]

# Genres of the single-label models.
class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
               "Jazz", "Metal", "Pop", "Reggae", "Rock"]

# Lookup from predicted class index to genre name.
class_indices = dict(enumerate(class_names))

# Show the genres as two bullet lists side by side. Both slices share one
# split index so no entry is skipped; the former [:7] / [8:] pair dropped
# "Metal" (index 7).
col1, col2 = st.columns(2)
split_at = 7
with col1:
    st.markdown("".join(f"- {genre}\n" for genre in multi_class_names[:split_at]))

with col2:
    st.markdown("".join(f"- {genre}\n" for genre in multi_class_names[split_at:]))

st.divider()
|
166 |
+
# Upload music file
st.subheader("Upload a music file")
uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav", "ogg"], label_visibility="collapsed")

st.divider()

# Select-box entry -> joblib file holding the fitted estimator.
_JOBLIB_MODEL_PATHS = {
    "K-Nearest Neighbors - (Single Label)": "./models/knn.pkl",
    "Logistic Regression - (Single Label)": "./models/logistic.pkl",
    "Support Vector Machines - (Single Label)": "./models/svm.pkl",
    "Neural Network - (Single Label)": "./models/nn.pkl",
    "XGB Classifier - (Single Label)": "./models/xgb.pkl",
    "XGB - (Multi Label)": "./models/xgb_mlb.pkl",
}

# Select-box entry -> Keras .h5 file of the multi-label network.
_KERAS_MODEL_PATHS = {
    "Convolutional Recurrent Neural Network - (Multi Label)": "./models/model_crnn1.h5",
    "Neural Network - (Multi Label)": "./models/model_nn.h5",
    "Neural Network with Batch Normalization - (Multi Label)": "./models/model_bn.h5",
    "Convolutional Neural Network - (Multi Label)": "./models/model_cnn.h5",
}


def _load_model(name):
    """Load the model behind select-box entry *name*.

    Raises:
        KeyError: If *name* is in neither lookup table.
    """
    if name in _JOBLIB_MODEL_PATHS:
        return joblib.load(_JOBLIB_MODEL_PATHS[name])
    # Keras models are loaded uncompiled and then compiled with the same
    # loss/optimizer/metrics for every network.
    network = tensorflow.keras.models.load_model(_KERAS_MODEL_PATHS[name], compile=False)
    network.compile(loss=binary_crossentropy,
                    optimizer=Adam(),
                    metrics=['accuracy'])
    return network


if uploaded_file is not None:
    # Models offered to the user, in select-box order.
    all_models = ["K-Nearest Neighbors - (Single Label)",
                  "Logistic Regression - (Single Label)",
                  "Support Vector Machines - (Single Label)",
                  "Neural Network - (Single Label)",
                  "XGB Classifier - (Single Label)",
                  "Convolutional Recurrent Neural Network - (Multi Label)",
                  "Convolutional Neural Network - (Multi Label)",
                  "XGB - (Multi Label)",
                  "Neural Network - (Multi Label)",
                  "Neural Network with Batch Normalization - (Multi Label)"]

    features_list, val_list = audio_splitting.split_audio(uploaded_file)
    features = feature_extraction.scale(features_list)

    # Work on a real copy so the placeholder below does not end up in
    # features_list (the former plain assignment only aliased it).
    feature_copy = list(features_list)
    # Tempo has no variance entry; the placeholder at index 19 keeps the
    # alternating mean/variance slices below aligned.
    feature_copy.insert(19, "-")
    st.header("Feature Extraction")

    st.write("The given audio sample is processed using the librosa library to get the features extracted used by the "
             "models for genre prediction. Following is the dataframe with each of the feature extracted and "
             "corresponding mean and variance of the feature. The docs of [*librosa*](%s) library can be referred for a more "
             "indepth explanation and implementation specifics." % url_docs)

    col3, col4 = st.columns([0.55, 0.45])
    # Features Dataframe: the first two entries (name, length) are skipped,
    # then means and variances alternate.
    df = pd.DataFrame({
        "name": fields_df,
        "Mean": feature_copy[2::2],
        "Variance": feature_copy[3::2]
    })

    st.dataframe(
        df,
        column_config={
            "name": "Features",
            "Mean": "Mean of Feature",
            "Variance": "Variance of Feature"
        },
        use_container_width=True
    )
    st.caption("Note: Harmonic and Percussion values generally have mean in the power of -1e5 or -1e6 and thus "
               "are represented as 0.\nAlso, for the feature 'tempo' variance has not been added to keep up with the "
               "consistency as presented in the original GTZAN dataset")

    st.divider()

    col1, col2 = st.columns([0.45, 0.55])

    col1.subheader("Select a model")
    with col1:
        model_name = st.selectbox("Select a model", all_models, label_visibility="collapsed")

    # display() reads `model` and `reshaped_features` as module-level globals.
    model = _load_model(model_name)
    col2.subheader("Predicted genre")

    # Reshape the features to match the expected shape for prediction
    reshaped_features = features.reshape(1, -1)
    display(model_name, col2)
|
audio_splitting.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydub import AudioSegment
|
2 |
+
import feature_extraction
|
3 |
+
import io
|
4 |
+
def split_audio(uploaded_file):
    """Cut a 3-second excerpt from the uploaded audio and extract its features.

    Audio shorter than 63 seconds is sampled from its start; anything longer
    is sampled from the one-minute mark.

    Args:
        uploaded_file: File or file-like object readable by
            ``AudioSegment.from_file``.

    Returns:
        The result of ``feature_extraction.all_feature_extraction`` for the
        excerpt.
    """
    clip_ms = 3 * 1000  # excerpt length in milliseconds
    track = AudioSegment.from_file(uploaded_file)

    # Start at 0 ms for short tracks, at 60 s otherwise.
    offset_ms = 0 if len(track) < 63 * 1000 else 60 * 1000
    excerpt = track[offset_ms:offset_ms + clip_ms]

    # Export the excerpt as WAV into memory and rewind it for reading.
    wav_buffer = io.BytesIO()
    excerpt.export(wav_buffer, format="wav")
    wav_buffer.seek(0)

    return feature_extraction.all_feature_extraction(wav_buffer)
|
feature_extraction.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import numpy as np
|
3 |
+
import joblib
|
4 |
+
import soundfile as sf
|
5 |
+
# Scaler loaded once at import time; scale() applies it to extracted features.
# NOTE(review): joblib.load unpickles the file, so it must come from a trusted source.
scaler = joblib.load("./models/std_scaler(1).pkl")
|
6 |
+
|
7 |
+
|
8 |
+
def load_audio_from_uploaded_file(uploaded_file):
    """Read audio samples and the sample rate from *uploaded_file*.

    Args:
        uploaded_file: Object accepted by ``soundfile.read``.

    Returns:
        Tuple ``(audio_data, sample_rate)`` as produced by ``sf.read``.
    """
    samples, rate = sf.read(uploaded_file)
    return samples, rate
|
13 |
+
|
14 |
+
|
15 |
+
# sample_audio,sr = librosa.load(r"classical.00000.wav",sr = 44100)
|
16 |
+
# Names of the entries produced by all_feature_extraction, in output order:
# two bookkeeping entries, mean/variance pairs, tempo, then the MFCC pairs.
Fields = (
    ['name', 'length']
    + [f'{feature}_{stat}'
       for feature in ('chroma_stft', 'rms', 'spectral_centroid', 'spectral_bandwidth',
                       'rolloff', 'zero_crossing_rate', 'harmony', 'percussive')
       for stat in ('mean', 'var')]
    + ['tempo']
    + [f'mfcc{k}_{stat}' for k in range(1, 21) for stat in ('mean', 'var')]
)

# Feature names only, without the leading 'name' and 'length' entries.
short_field = Fields[2:]
|
30 |
+
|
31 |
+
|
32 |
+
def all_feature_extraction(audio_path, sample_rate=22050):
    """Extract the summary features of one audio clip with librosa.

    Args:
        audio_path: Path or file-like object accepted by ``librosa.load``.
        sample_rate: Rate the audio is resampled to on load. It used to be
            ignored in favour of a hard-coded 22050; the default keeps that
            behaviour.

    Returns:
        ``[data_list, val_field]``. ``data_list`` holds ``audio_path``, the
        number of samples, a mean/variance pair for each of chroma STFT, RMS,
        spectral centroid, spectral bandwidth, spectral rolloff, zero
        crossing rate, harmonic and percussive components, then the tempo,
        then a mean/variance pair per MFCC. ``val_field`` holds the raw
        chroma STFT matrix.
    """
    audio_df, sr = librosa.load(audio_path, sr=sample_rate)
    data_list = [audio_path, len(audio_df)]

    # 1. Chroma STFT; the raw matrix is also returned to the caller.
    chroma_stft = librosa.feature.chroma_stft(y=audio_df, sr=sr, hop_length=512)
    val_field = [chroma_stft]

    # 2. Remaining features summarised by mean and variance. The loaded rate
    # is passed to every call that accepts one, so they agree with the audio.
    harmonic, percussive = librosa.effects.hpss(y=audio_df)
    summarised = (
        chroma_stft,
        librosa.feature.rms(y=audio_df),
        librosa.feature.spectral_centroid(y=audio_df, sr=sr),
        librosa.feature.spectral_bandwidth(y=audio_df, sr=sr),
        librosa.feature.spectral_rolloff(y=audio_df, sr=sr),
        librosa.feature.zero_crossing_rate(y=audio_df),
        harmonic,
        percussive,
    )
    for values in summarised:
        data_list.append(np.mean(values))
        data_list.append(np.var(values))

    # 3. Tempo contributes a single value (no variance entry).
    tempo = librosa.feature.tempo(y=audio_df, sr=sr)
    data_list.append(np.mean(tempo))

    # 4. One mean/variance pair per MFCC; n_mfcc is pinned to the 20
    # coefficients the output layout expects.
    mfccs = librosa.feature.mfcc(y=audio_df, sr=sr, n_mfcc=20)
    for mean, var in zip(np.mean(mfccs, axis=1), np.var(mfccs, axis=1)):
        data_list.append(mean)
        data_list.append(var)

    return [data_list, val_field]
|
109 |
+
|
110 |
+
def scale(initial_features):
    """Apply the module-level scaler to one extracted feature row.

    Args:
        initial_features: Feature sequence whose first two entries are
            skipped before scaling.

    Returns:
        The output of ``scaler.transform`` for the single data point.
    """
    # Skip the two leading entries, then wrap the row in a list so the scaler
    # receives a single sample.
    numeric_part = np.array(initial_features[2:])
    return scaler.transform([numeric_part])
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
joblib==1.3.2
keras==2.12.0
librosa==0.10.1
matplotlib==3.7.2
numpy==1.23.5
pandas==2.0.3
scikit-learn==1.2.2
seaborn==0.12.2
tensorflow==2.12.0
xgboost==1.7.6
# Imported directly by app.py / audio_splitting.py / feature_extraction.py but
# previously undeclared. Left unpinned: the versions used are not known - TODO pin.
pydub
soundfile
streamlit
|