File size: 4,676 Bytes
be4d0c2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import os
import sys
import librosa
import numpy as np
from scipy.io import wavfile
from sklearn.preprocessing import normalize
class SoundPreprocessing:
    """Turn WAV clips into fixed-size, three-channel feature images.

    Every image has shape ``(height, width, 3)``. Along the last axis it
    holds:

    0. descriptor rows: one row of normalised spectral bandwidth and one of
       normalised spectral centroid, followed by repeated groups of
       bandwidth (1 row), centroid (1 row) and chroma (12 rows);
    1. the STFT magnitude;
    2. the MFCCs.

    Because channel 0 is made of 2 leading rows plus groups of 14 rows, the
    image height must satisfy ``(height - 2) % 14 == 0``. A requested height
    that does not is adjusted (see ``generate_features``).

    Parameters
    ----------
    sr (int): sampling rate (stored only; ``get_features`` uses the rate
        read from each WAV file)
    max_size (sequence): requested ``(height, width)`` of each channel
    n_fft (int): FFT window size used for the STFT and the MFCCs
    n_mfcc (int): number of MFCC
    hop_length (int): number of samples between successive frames

    Attributes
    ----------
    shape_changed (bool): whether the most recent ``generate_features`` call
        had to use a height different from the requested one
    adjusted_max_size (tuple or None): ``(height, width)`` actually used by
        the most recent ``generate_features`` call
    """

    # Channel 0 layout: 2 leading rows, then groups of 1 + 1 + 12 rows.
    _BASE_ROWS = 2
    _CHROMA_ROWS = 12
    _GROUP_ROWS = 14
    # Exclusive upper bound used when snapping a requested height to the
    # nearest valid one (kept from the original implementation).
    _HEIGHT_SEARCH_LIMIT = 1000

    def __init__(self, *, sr, max_size, n_fft, n_mfcc=60, hop_length=512):
        self.sr = sr
        self.n_fft = n_fft
        self.n_mfcc = n_mfcc
        self.max_size = max_size
        self.hop_length = hop_length
        self.shape_changed = False
        self.adjusted_max_size = None

    @classmethod
    def _nearest_valid_height(cls, height):
        """Return the valid height in [2, 1000) closest to ``height``.

        Ties are resolved towards the smaller candidate.
        """
        candidates = np.arange(cls._BASE_ROWS, cls._HEIGHT_SEARCH_LIMIT,
                               cls._GROUP_ROWS)
        # argmin returns the first minimum, i.e. the smaller candidate on ties.
        return int(candidates[np.argmin(np.abs(candidates - height))])

    @classmethod
    def _smallest_valid_height_at_least(cls, rows):
        """Return the smallest valid height that is >= ``rows``."""
        # Ceiling division without floats; clamp so tiny inputs give 2.
        groups = max(0, -(-(rows - cls._BASE_ROWS) // cls._GROUP_ROWS))
        return cls._BASE_ROWS + cls._GROUP_ROWS * groups

    def padding(self, array, xx, yy):
        """Zero-pad a 2-D array, roughly centred, up to ``(xx, yy)``.

        Parameters
        ----------
        array: 2-D numpy array
        xx: desired height
        yy: desired width

        Returns: padded array. An axis that is already at least as long as
        requested is left untouched; nothing is ever cropped.
        """
        rows, cols = array.shape[0], array.shape[1]
        top = max((xx - rows) // 2, 0)
        bottom = max(0, xx - top - rows)
        left = max(0, (yy - cols) // 2)
        right = max(yy - left - cols, 0)
        return np.pad(array, pad_width=((top, bottom), (left, right)),
                      mode="constant")

    def generate_features(self, y_cut, sr, max_size, n_fft, n_mfcc, hop_length):
        """Build the ``(height, width, 3)`` feature image for one signal.

        Parameters
        ----------
        y_cut: audio samples as a floating point numpy array
        sr: sampling rate of ``y_cut``
        max_size: requested ``(height, width)``
        n_fft: FFT window size for the STFT and the MFCCs
        n_mfcc: number of MFCC
        hop_length: hop between frames for the STFT and the MFCCs

        Returns: numpy array of shape ``(height, width, 3)`` where ``width``
        is ``max_size[1]`` and ``height`` is ``max_size[0]`` adjusted so that
        ``height - 2`` is divisible by 14 and the STFT and MFCC rows fit.

        Raises: ValueError if any feature has more frames than ``max_size[1]``
        (features are padded, never cropped).

        The size actually used is recorded in ``self.adjusted_max_size`` and
        ``self.shape_changed`` tells whether the height differs from the
        requested one.
        """
        width = max_size[1]
        # The height must be a number n such that n - 2 is divisible by 14.
        height = self._nearest_valid_height(max_size[0])
        shape_changed = height != max_size[0]

        # Compute the raw features once; padding happens after the final
        # height is known.
        magnitude = np.abs(librosa.stft(y=y_cut, n_fft=n_fft,
                                        hop_length=hop_length))
        mfccs = librosa.feature.mfcc(y=y_cut, n_fft=n_fft, sr=sr,
                                     hop_length=hop_length, n_mfcc=n_mfcc)
        spec_centroid = librosa.feature.spectral_centroid(y=y_cut, sr=sr)
        chroma_stft = librosa.feature.chroma_stft(y=y_cut, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y_cut, sr=sr)

        # Grow the height when the STFT or the MFCCs would not fit, since
        # padding cannot shrink them.
        needed_rows = max(magnitude.shape[0], mfccs.shape[0])
        if height < needed_rows:
            height = self._smallest_valid_height_at_least(needed_rows)
            shape_changed = True

        # Fail early with a clear message instead of a shape mismatch later.
        widest = max(feature.shape[1] for feature in
                     (magnitude, mfccs, spec_bw, spec_centroid, chroma_stft))
        if widest > width:
            raise ValueError(
                f"Features span {widest} frames but max_size[1] is {width}; "
                "increase max_size[1] or shorten the audio."
            )

        # Normalise and pad each descriptor once, then repeat the group until
        # channel 0 is as tall as the STFT and MFCC channels.
        bw_row = self.padding(normalize(spec_bw), 1, width)
        centroid_row = self.padding(normalize(spec_centroid), 1, width)
        chroma_rows = self.padding(normalize(chroma_stft),
                                   self._CHROMA_ROWS, width)
        repeats = (height - self._BASE_ROWS) // self._GROUP_ROWS
        descriptors = np.concatenate(
            [bw_row, centroid_row]
            + [bw_row, centroid_row, chroma_rows] * repeats,
            axis=0,
        )

        image = np.dstack((
            descriptors,
            self.padding(magnitude, height, width),
            self.padding(mfccs, height, width),
        ))

        self.shape_changed = bool(shape_changed)
        self.adjusted_max_size = (height, width)
        return image

    def get_features(self, df, filepath):
        """Load every WAV listed in ``df`` and build the CNN inputs.

        Parameters
        ----------
        df: DataFrame with a "filename" and a "target" column
        filepath: directory containing the WAV files

        Returns: ``(X, y)`` where ``X`` stacks one feature image per row of
        ``df`` (in row order) and ``y`` has shape ``(len(df), 1)`` holding the
        matching targets.

        Prints the adjusted size once if the requested ``max_size`` could not
        be used as given.
        """
        # Kept as attributes because earlier versions exposed them.
        self.df = df
        self.filepath = filepath

        X = []
        y = np.zeros(shape=(len(df), 1))
        adjusted_size = None
        # Iterate by position so any DataFrame index (filtered, shuffled,
        # non-integer) fills the matching row of y.
        rows = zip(df["filename"], df["target"])
        for position, (filename, target) in enumerate(rows):
            sr_i, aud = wavfile.read(os.path.join(filepath, str(filename)))
            # librosa needs floating point samples.
            # NOTE(review): float16 is kept from the original; if the files
            # are 16-bit PCM this loses precision - consider float32.
            aud = aud.astype(np.float16)
            X.append(self.generate_features(y_cut=aud, sr=sr_i,
                                            n_fft=self.n_fft,
                                            n_mfcc=self.n_mfcc,
                                            max_size=self.max_size,
                                            hop_length=self.hop_length))
            y[position] = target
            if self.shape_changed:
                adjusted_size = self.adjusted_max_size
        if adjusted_size is not None:
            print(f"New max_size is {adjusted_size}")
        X = np.array(X)
        return X, y
|