# File size: 4,676 Bytes
# be4d0c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import sys
import librosa
import numpy as np
from scipy.io import wavfile
from sklearn.preprocessing import normalize

class SoundPreprocessing:
    """Turn WAV recordings into fixed-size, three-channel feature "images".

    Every recording becomes an array of shape ``(height, width, 3)`` whose
    channels are, in order:

    0. a "summary" channel: spectral bandwidth and spectral centroid rows,
       followed by repeated (bandwidth, centroid, 12 chroma rows) tiles;
    1. the STFT magnitude;
    2. the MFCCs.

    Parameters
    ----------

    sr (int): sampling rate. It is only stored on the instance;
        ``get_features`` uses the rate read from each WAV file.
    max_size (iterable): requested ``(height, width)`` of each channel.
        The height actually used may differ, see ``generate_features``.
    n_fft (int): FFT window size used for the STFT and the MFCCs
    n_mfcc (int): number of MFCC
    hop_length (int): number of samples between successive frames

    Attributes
    ----------

    shape_changed (bool): whether the last ``generate_features`` call had
        to use a height different from the requested one.
    effective_max_size (tuple or None): ``(height, width)`` used by the last
        ``generate_features`` call, ``None`` before the first call.
    """

    # Layout of the summary channel: two leading rows (bandwidth, centroid)
    # followed by tiles of 14 rows (bandwidth, centroid, 12 chroma rows).
    _HEADER_ROWS = 2
    _CHROMA_ROWS = 12
    _TILE_ROWS = 14
    # Requested heights are snapped to valid heights below this bound
    # (historical behaviour, kept so existing configurations yield the
    # same shapes as before).
    _SNAP_LIMIT = 1000

    def __init__(self, *, sr, max_size, n_fft, n_mfcc = 60, hop_length = 512):
        self.sr = sr
        self.n_fft = n_fft
        self.n_mfcc = n_mfcc
        self.max_size = max_size
        self.hop_length = hop_length
        self.shape_changed = False
        self.effective_max_size = None

    def padding(self, array, xx, yy):
        """
        Zero-pad a 2-D array, keeping its content centred.

        Parameters
        ----------
            array: 2-D numpy array
            xx: desired height
            yy: desired width

        Returns: padded array. The array is never cropped: an axis that is
        already longer than requested is returned at its original length.
        """
        rows, cols = array.shape[0], array.shape[1]

        # When the missing amount is odd, the extra row/column goes to the
        # bottom/right side.
        top = max((xx - rows) // 2, 0)
        bottom = max(xx - top - rows, 0)
        left = max((yy - cols) // 2, 0)
        right = max(yy - left - cols, 0)

        return np.pad(array, pad_width = ((top, bottom), (left, right)),
                      mode = "constant")

    @classmethod
    def _snap_height(cls, requested):
        """Return the valid height closest to `requested` (ties pick the smaller)."""
        # Valid heights are the numbers that, minus 2, are divisible by 14.
        valid = np.arange(cls._HEADER_ROWS, cls._SNAP_LIMIT, cls._TILE_ROWS)
        return int(valid[np.argmin(np.abs(valid - requested))])

    @classmethod
    def _grow_height(cls, rows):
        """Return the smallest valid height that can hold `rows` rows."""
        tiles = -(-(rows - cls._HEADER_ROWS) // cls._TILE_ROWS)  # ceiling division
        return cls._HEADER_ROWS + cls._TILE_ROWS * max(tiles, 0)

    def generate_features(self, y_cut, sr, max_size, n_fft, n_mfcc, hop_length):
        """
        Build the three-channel feature image of one audio signal.

        Parameters
        ----------
            y_cut: audio samples, passed to librosa as ``y``
            sr: sampling rate of ``y_cut``
            max_size: requested ``(height, width)`` of each channel
            n_fft: FFT window size for the STFT and the MFCCs
            n_mfcc: number of MFCC
            hop_length: number of samples between successive frames, used
                for every feature so that all channels share one time axis

        Returns: array of shape ``(height, width, 3)``. ``height`` is
        ``max_size[0]`` snapped to the closest value of the form
        ``2 + 14 * k`` and, if the STFT has more rows than that, enlarged to
        the smallest such value that fits the STFT. The size actually used
        is recorded in ``self.effective_max_size``.

        Raises: ValueError if a channel cannot be padded to
        ``(height, width)``, e.g. the signal has more frames than
        ``max_size[1]`` or ``n_mfcc`` exceeds the height.
        """
        # Module-level flag written by earlier versions of this class; still
        # updated for backward compatibility. The class itself relies on
        # ``self.shape_changed``.
        global shape_changed

        width = max_size[1]
        height = self._snap_height(max_size[0])
        resized = height != max_size[0]

        stft_mag = np.abs(librosa.stft(y = y_cut, n_fft = n_fft,
                                       hop_length = hop_length))
        if height < stft_mag.shape[0]:
            height = self._grow_height(stft_mag.shape[0])
            resized = True

        shape_changed = resized
        self.shape_changed = resized
        self.effective_max_size = (height, width)

        mfccs = librosa.feature.mfcc(y = y_cut, sr = sr, n_fft = n_fft,
                                     hop_length = hop_length, n_mfcc = n_mfcc)

        # NOTE(review): these three keep librosa's default ``n_fft``, as in
        # the previous implementation - confirm whether they should use
        # ``n_fft`` as well.
        spec_bw = librosa.feature.spectral_bandwidth(y = y_cut, sr = sr,
                                                     hop_length = hop_length)
        spec_centroid = librosa.feature.spectral_centroid(y = y_cut, sr = sr,
                                                          hop_length = hop_length)
        chroma_stft = librosa.feature.chroma_stft(y = y_cut, sr = sr,
                                                  hop_length = hop_length)

        # Normalise and pad every building block of the summary channel once.
        bw_row = self.padding(normalize(spec_bw), 1, width)
        centroid_row = self.padding(normalize(spec_centroid), 1, width)
        chroma_rows = self.padding(normalize(chroma_stft), self._CHROMA_ROWS, width)

        # Two leading rows, then repeat the 14-row tile until the summary
        # channel is as tall as the STFT and MFCC channels.
        tile = np.concatenate((bw_row, centroid_row, chroma_rows), axis = 0)
        n_tiles = (height - self._HEADER_ROWS) // self._TILE_ROWS
        summary = np.concatenate([bw_row, centroid_row] + [tile] * n_tiles, axis = 0)

        channels = (
            ("summary", summary),
            ("STFT", self.padding(stft_mag, height, width)),
            ("MFCC", self.padding(mfccs, height, width)),
        )

        # ``padding`` never crops, so an oversized feature would otherwise
        # only show up as an opaque stacking error.
        for name, channel in channels:
            if channel.shape != (height, width):
                raise ValueError(
                    f"{name} channel has shape {channel.shape}, which does not fit "
                    f"max_size {(height, width)}; increase max_size, shorten the "
                    f"audio or lower n_mfcc"
                )

        return np.dstack([channel for _, channel in channels])

    def get_features(self, df, filepath):
        """
        Extract the feature images and targets of every file listed in ``df``.

        Parameters
        ----------
            df: DataFrame with a "filename" and a "target" column
            filepath: directory that contains the WAV files

        Returns: ``(X, y)`` where ``X`` stacks one feature image per row of
        ``df`` (in row order) and ``y`` has shape ``(len(df), 1)`` and holds
        the matching targets.
        """
        self.shape_changed = False

        # Get data for CNN
        X = []
        y = np.zeros(shape = (len(df), 1))

        # Rows are walked by position, so ``df`` may carry any index.
        for row, (filename, target) in enumerate(zip(df["filename"], df["target"])):
            sr_i, aud = wavfile.read(os.path.join(filepath, filename))
            # NOTE(review): float16 keeps few significant digits and
            # overflows for 32-bit samples - confirm this cast is intended.
            aud = aud.astype(np.float16)

            X.append(self.generate_features(y_cut = aud, sr = sr_i,
                                            n_fft = self.n_fft,
                                            n_mfcc = self.n_mfcc,
                                            max_size = self.max_size,
                                            hop_length = self.hop_length))
            y[row] = target

        if self.shape_changed:
            print(f"New max_size is {self.effective_max_size}")

        return np.array(X), y