BrendaTellez committed on
Commit
be4d0c2
1 Parent(s): 0791f60

Upload 5 files

CNN_Architecture.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
CNN_support.py ADDED
@@ -0,0 +1,138 @@
+ import os
+ import librosa
+ import numpy as np
+ from scipy.io import wavfile
+ from sklearn.preprocessing import normalize
+
+ class SoundPreprocessing:
+     """
+     Parameters
+     ----------
+
+     sr (int): sampling rate
+     max_size (iterable): resulting shape of the tensor
+     n_fft (int): FFT window size
+     n_mfcc (int): number of MFCCs
+     hop_length (int): hop length between frames
+     """
+
+     def __init__(self, *, sr, max_size, n_fft, n_mfcc = 60, hop_length = 512):
+         self.sr = sr
+         self.n_fft = n_fft
+         self.n_mfcc = n_mfcc
+         self.max_size = max_size
+         self.hop_length = hop_length
+         self.shape_changed = False
+
+     def padding(self, array, xx, yy):
+         """
+         Parameters
+         ----------
+         array: numpy array
+         xx: desired height
+         yy: desired width
+
+         Returns: array zero-padded to shape (xx, yy)
+         """
+         h = array.shape[0]
+         w = array.shape[1]
+         a = max((xx - h) // 2, 0)
+         aa = max(0, xx - a - h)
+         b = max(0, (yy - w) // 2)
+         bb = max(yy - b - w, 0)
+
+         return np.pad(array, pad_width = ((a, aa), (b, bb)),
+                       mode = "constant")
+
+     def generate_features(self, y_cut, sr, max_size, n_fft, n_mfcc, hop_length):
+         """Stack spectral bandwidth/centroid/chroma, |STFT| and MFCCs into a 3-channel tensor."""
+         # Numbers n such that (n - 2) is divisible by 14
+         condition = np.arange(2, 1000)[np.where((np.arange(2, 1000) - 2) % 14 == 0)]
+
+         self.shape_changed = False
+
+         if max_size[0] not in condition:
+             # Get the closest number to 'max_size[0]' that respects 'condition'
+             new_max0 = sorted(condition, key = lambda v: abs(v - max_size[0]))[0]
+             self.shape_changed = True
+             max_size = (new_max0, max_size[1])
+
+         stft = self.padding(np.abs(librosa.stft(y = y_cut, n_fft = n_fft,
+                                                 hop_length = hop_length)),
+                             max_size[0], max_size[1])
+
+         if max_size[0] < stft.shape[0]:
+             new_max0 = sorted(condition[condition >= stft.shape[0]],
+                               key = lambda v: abs(v - stft.shape[0]))[0]
+             max_size = (new_max0, max_size[1])
+             self.shape_changed = True
+
+             stft = self.padding(np.abs(librosa.stft(y = y_cut, n_fft = n_fft,
+                                                     hop_length = hop_length)),
+                                 max_size[0], max_size[1])
+
+         MFCCs = self.padding(librosa.feature.mfcc(y = y_cut, n_fft = n_fft, sr = sr,
+                                                   hop_length = hop_length, n_mfcc = n_mfcc),
+                              max_size[0], max_size[1])
+
+         spec_centroid = librosa.feature.spectral_centroid(y = y_cut, sr = sr)
+         chroma_stft = librosa.feature.chroma_stft(y = y_cut, sr = sr)
+         spec_bw = librosa.feature.spectral_bandwidth(y = y_cut, sr = sr)
+
+         # Build the first channel, starting from the two 1-row features
+         image = np.array([self.padding(normalize(spec_bw), 1, max_size[1])]).reshape(1, max_size[1])
+         image = np.append(image, self.padding(normalize(spec_centroid), 1, max_size[1]), axis = 0)
+
+         # Repeat the padded spec_bw, spec_centroid and chroma_stft until they are STFT- and MFCC-sized
+         for i in range(int((max_size[0] - 2) / 14)):
+             image = np.append(image, self.padding(normalize(spec_bw), 1, max_size[1]), axis = 0)
+             image = np.append(image, self.padding(normalize(spec_centroid), 1, max_size[1]), axis = 0)
+             image = np.append(image, self.padding(normalize(chroma_stft), 12, max_size[1]), axis = 0)
+
+         image = np.dstack((image, np.abs(stft)))
+         image = np.dstack((image, MFCCs))
+
+         # Keep the (possibly adjusted) target shape for later inspection
+         self.max_size = max_size
+
+         return image
+
+     def get_features(self, df, filepath):
+         # Get data for the CNN
+         X = []
+         y = np.zeros(shape = (len(df), 1))
+
+         for i in df.index:
+
+             sr_i, aud = wavfile.read(os.path.join(filepath, df.loc[i, "filename"]))
+             aud = aud.astype(np.float16)
+
+             X += [self.generate_features(y_cut = aud, sr = sr_i,
+                                          n_fft = self.n_fft,
+                                          n_mfcc = self.n_mfcc,
+                                          max_size = self.max_size,
+                                          hop_length = self.hop_length)]
+
+             y[i] = df.loc[i, "target"]
+
+         if self.shape_changed:
+             print(f"New max_size is {self.max_size}")
+
+         X = np.array(X)
+
+         return X, y
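For reference, a minimal usage sketch of the class above (illustrative, not part of the commit): the metadata CSV and `./data/audio` folder follow the ESC-50 layout used in `Data_preparation.ipynb`, and `max_size` and `n_fft` are assumed values chosen so that the |STFT| height already satisfies the divisibility condition in `generate_features`.

```python
# Hypothetical usage of CNN_support.SoundPreprocessing; paths and parameters are assumptions
import pandas as pd
import CNN_support as cnns

meta = pd.read_csv("./data/meta/esc50.csv")

# n_fft = 254 gives an |STFT| with 254/2 + 1 = 128 rows, matching max_size[0] = 128
prep = cnns.SoundPreprocessing(sr = 44100, max_size = (128, 431),
                               n_fft = 254, n_mfcc = 60, hop_length = 512)

# X: (n_clips, height, width, 3) tensor of stacked spectral features, |STFT| and MFCCs
X, y = prep.get_features(meta, "./data/audio")
print(X.shape, y.shape)
```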
Data_preparation.ipynb ADDED
@@ -0,0 +1,652 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VNUnhmXWe9qz"
+ },
+ "source": [
+ "# Notebook for data preparation\n",
+ "\n",
+ "Academic year 2022-2023 - HUMAN DATA ANALYTICS\n",
+ "\n",
+ "Authors:\n",
+ "* Mattia Brocco\n",
+ "* Brenda Eloisa Tellez Juarez\n",
+ "\n",
+ "This notebook presents the pipeline for data import, preprocessing and storage (in `.parquet` format)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-12T22:43:39.436355Z",
+ "start_time": "2023-02-12T22:43:39.418449Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 915
+ },
+ "id": "pz7MotpCfCUR",
+ "outputId": "fc916ed3-03d2-41ee-87db-237d79979cf0"
+ },
+ "outputs": [],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount(\"/content/drive\")\n",
+ "\n",
+ "#%cd /content/drive/MyDrive/Environmental-sounds-UNIPD-2022"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "6YEmW9n_fOB8"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import torch\n",
+ "import librosa\n",
+ "import matplotlib\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "import tensorflow as tf\n",
+ "from librosa import display\n",
+ "from scipy.io import wavfile\n",
+ "from tensorflow import keras\n",
+ "import IPython.display as ipd\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from sklearn.metrics import classification_report\n",
+ "\n",
+ "import evaluation\n",
+ "import CNN_support as cnns\n",
+ "from gng import GrowingNeuralGas\n",
+ "\n",
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "execution": {
+ "iopub.execute_input": "2023-01-14T19:51:27.903698Z",
+ "iopub.status.busy": "2023-01-14T19:51:27.903426Z",
+ "iopub.status.idle": "2023-01-14T19:51:27.930731Z",
+ "shell.execute_reply": "2023-01-14T19:51:27.929790Z",
+ "shell.execute_reply.started": "2023-01-14T19:51:27.903668Z"
+ },
+ "id": "ZjdASAl2emSc",
+ "outputId": "a209c1ff-299b-4e8d-c79a-911fc9fab8ca"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "             filename  fold  target        category  esc10  src_file take\n",
+ "0    1-100032-A-0.wav     1       0             dog   True    100032    A\n",
+ "1   1-100038-A-14.wav     1      14  chirping_birds  False    100038    A\n",
+ "2   1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A\n",
+ "3   1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B\n",
+ "4   1-101296-A-19.wav     1      19    thunderstorm  False    101296    A"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read the ESC-50 metadata csv file\n",
+ "data = pd.read_csv('./data/meta/esc50.csv')\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "EsFcOZlvqf-K"
+ },
+ "source": [
+ "### 2. Data import & preprocessing\n",
+ "For reproducibility, the whole pipeline is seeded with `np.random.seed()`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Q0aXZASmzZtM",
+ "outputId": "7556538e-41f2-4a0a-cb42-d1cca4d1d575"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.8/dist-packages/librosa/core/pitch.py:153: UserWarning: Trying to estimate tuning from empty frequency set.\n",
+ "  warnings.warn(\"Trying to estimate tuning from empty frequency set.\")\n",
+ "/usr/local/lib/python3.8/dist-packages/librosa/core/pitch.py:153: UserWarning: Trying to estimate tuning from empty frequency set.\n",
+ "  warnings.warn(\"Trying to estimate tuning from empty frequency set.\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "# DATA AUGMENTATION\n",
+ "\n",
+ "#np.random.seed(42)\n",
+ "#indexed_samples = np.random.choice(X.shape[0], size = 10000, replace = True)\n",
+ "np.random.seed(101)\n",
+ "randn_seeds = np.random.choice(len(data), size = len(data), replace = False)\n",
+ "\n",
+ "aug_iterations = 7\n",
+ "\n",
+ "new_X = []\n",
+ "#new_X2 = []\n",
+ "new_y = np.zeros(shape = (aug_iterations*len(randn_seeds), 1))\n",
+ "\n",
+ "input_length = 220500\n",
+ "row_count = 0\n",
+ "for i in data.index:\n",
+ "\n",
+ "    sample, sr_sample = librosa.load(\"./data/audio/{}\".format(data.loc[i, \"filename\"]),\n",
+ "                                     sr = 44100)\n",
+ "    # Min-max scaling to [0, 1]\n",
+ "    sample = (sample - sample.min()) / (sample.max() - sample.min())\n",
+ "\n",
+ "    if len(sample) > input_length:\n",
+ "        sample = sample[:input_length]\n",
+ "    else:\n",
+ "        sample = np.pad(sample, (0, max(0, input_length - len(sample))), \"constant\")\n",
+ "\n",
+ "    for n in range(aug_iterations):\n",
+ "\n",
+ "        if n == 0:\n",
+ "            # NOISE INJECTION\n",
+ "            np.random.seed(randn_seeds[i])\n",
+ "            noise = np.random.randn(len(sample))\n",
+ "            augmented_data = (sample + 0.005 * noise)\n",
+ "\n",
+ "        elif n == 1:\n",
+ "            # TIME SHIFT: right shift\n",
+ "            augmented_data = np.roll(sample, 22050)\n",
+ "\n",
+ "        elif n == 2:\n",
+ "            # PITCH SHIFT: shift up by 3 semitones\n",
+ "            augmented_data = librosa.effects.pitch_shift(y = sample, sr = sr_sample,\n",
+ "                                                         n_steps = 3)\n",
+ "        elif n == 3:\n",
+ "            # PITCH SHIFT: shift down by 3 semitones\n",
+ "            augmented_data = librosa.effects.pitch_shift(y = sample, sr = sr_sample,\n",
+ "                                                         n_steps = -3)\n",
+ "        elif n == 4:\n",
+ "            # SPEED SHIFT: faster (returns a shorter array, so pad with zeros)\n",
+ "            augmented_data = librosa.effects.time_stretch(y = sample, rate = 1.25)\n",
+ "            augmented_data = np.append(augmented_data,\n",
+ "                                       np.zeros(shape = len(sample) - len(augmented_data)))\n",
+ "        elif n == 5:\n",
+ "            # SPEED SHIFT: slower (returns a longer array, so truncate)\n",
+ "            augmented_data = librosa.effects.time_stretch(y = sample, rate = 0.8)\n",
+ "            augmented_data = augmented_data[:len(sample)]\n",
+ "\n",
+ "        else:\n",
+ "            # KEEP THE ORIGINAL SAMPLE\n",
+ "            augmented_data = sample\n",
+ "\n",
+ "        new_instance = librosa.feature.mfcc(y = augmented_data, sr = sr_sample,\n",
+ "                                            hop_length = 512, n_mfcc = 60)\n",
+ "\n",
+ "        \"\"\"\n",
+ "        For the CNN, the input is composed of three channels\n",
+ "        stacked together as follows (commented lines).\n",
+ "        \"\"\"\n",
+ "        #new_MFCC = librosa.feature.mfcc(y = augmented_data, sr = sr_sample,\n",
+ "        #                                hop_length = 512, n_mfcc = 60)\n",
+ "        #new_chromagram = librosa.feature.chroma_stft(y = augmented_data, sr = sr_sample,\n",
+ "        #                                             hop_length = 512, win_length = 1024,\n",
+ "        #                                             n_chroma = 60)\n",
+ "        #new_delta = librosa.feature.delta(new_MFCC)\n",
+ "\n",
+ "        #new_instance = np.dstack((new_MFCC, new_chromagram, new_delta))\n",
+ "\n",
+ "        new_X += [new_instance]\n",
+ "        #new_X2 += [new_instance2]\n",
+ "        new_y[row_count] = data.loc[i, \"target\"]\n",
+ "\n",
+ "        row_count += 1\n",
+ "\n",
+ "\n",
+ "new_X = np.array(new_X)\n",
+ "#new_X2 = np.array(new_X2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "kXgmb61EKq2_",
+ "outputId": "c4af2309-c793-4c6b-a083-864b01c71a16"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((14000, 60, 431, 3), (14000, 1))"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "new_X.shape, new_y.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "paUvHcNHmVfH"
+ },
+ "outputs": [],
+ "source": [
+ "# Reduce float precision in order to decrease the size of the files\n",
+ "new_X = new_X.astype(np.float32)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2023-02-12T22:44:44.989798Z",
+ "start_time": "2023-02-12T22:44:44.984746Z"
+ },
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 811
+ },
+ "id": "CWxW80DewwYQ",
+ "outputId": "bebc360f-9f33-48f3-8106-62d0cc0c91ee"
+ },
+ "outputs": [],
+ "source": [
+ "def data_to_parquet(arr, name):\n",
+ "    \"\"\"\n",
+ "    Whether the array is meant for the CNN or the RNN,\n",
+ "    this function flattens all of its dimensions\n",
+ "    except the first one (the number of samples).\n",
+ "\n",
+ "    When required, the files are then re-imported\n",
+ "    via the 'pandas' library and properly reshaped.\n",
+ "    \"\"\"\n",
+ "    if len(arr.shape) > 2:\n",
+ "        arr2 = arr.reshape(arr.shape[0], -1)\n",
+ "        arr2 = pd.DataFrame(arr2)\n",
+ "    else:\n",
+ "        arr2 = pd.DataFrame(arr)\n",
+ "\n",
+ "    arr2.columns = [str(c) for c in arr2.columns]\n",
+ "    arr2.to_parquet(os.getcwd() + f\"/data/{name}.parquet\")\n",
+ "\n",
+ "\n",
+ "data_to_parquet(new_X, \"X_CNN_60x431x3_7times\")\n",
+ "data_to_parquet(new_y, \"y_CNN_7times\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lb16Lux3deLi"
+ },
+ "source": [
+ "```python\n",
+ "# Get data for RNN\n",
+ "X = []\n",
+ "y = np.zeros(shape = (len(data), 1))\n",
+ "\n",
+ "for i in data.index:\n",
+ "\n",
+ "    sample, sr_sample = librosa.load(\"./data/audio/{}\".format(data.loc[i, \"filename\"]),\n",
+ "                                     sr = 44100)\n",
+ "\n",
+ "    MFCC = librosa.feature.mfcc(y = sample, sr = sr_sample,\n",
+ "                                hop_length = 512, n_mfcc = 60)\n",
+ "\n",
+ "    #instance = MFCC.mean(axis = 0)\n",
+ "\n",
+ "    X += [MFCC]\n",
+ "\n",
+ "    y[i] = data.loc[i, \"target\"]\n",
+ "\n",
+ "X = np.array(X)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kNf0QXsLg8Yz"
+ },
+ "source": [
+ "### Adversarial attacks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-01-14T17:37:45.772463Z",
+ "iopub.status.busy": "2023-01-14T17:37:45.771814Z",
+ "iopub.status.idle": "2023-01-14T17:37:45.787426Z",
+ "shell.execute_reply": "2023-01-14T17:37:45.786380Z",
+ "shell.execute_reply.started": "2023-01-14T17:37:45.772366Z"
+ },
+ "id": "u8gNRa0xemS-"
+ },
+ "outputs": [],
+ "source": [
+ "# Create an adversarial example with the fast gradient sign method (FGSM)\n",
+ "def create_adversarial_example(x2, y_new, model_bidirectional):\n",
+ "    # Convert the label to a one-hot encoded vector\n",
+ "    y = tf.keras.utils.to_categorical(y_new, num_classes=50)\n",
+ "    # Compute the gradient of the loss with respect to the input\n",
+ "    with tf.GradientTape() as tape:\n",
+ "        tape.watch(x2)\n",
+ "        logits = model_bidirectional(x2)\n",
+ "        loss_value = tf.losses.categorical_crossentropy(y, logits)\n",
+ "    grads = tape.gradient(loss_value, x2)\n",
+ "    # Create the adversarial example by adding the sign of the gradient to the input\n",
+ "    epsilon = 0.01\n",
+ "    x_adv = x2 + epsilon * tf.sign(grads)\n",
+ "    x_adv = tf.clip_by_value(x_adv, 0, 1)\n",
+ "    return x_adv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-01-14T17:34:00.160306Z",
+ "iopub.status.busy": "2023-01-14T17:34:00.159720Z",
+ "iopub.status.idle": "2023-01-14T17:34:00.166335Z",
+ "shell.execute_reply": "2023-01-14T17:34:00.165267Z",
+ "shell.execute_reply.started": "2023-01-14T17:34:00.160266Z"
+ },
+ "id": "fXKsE1PzemS_"
+ },
+ "outputs": [],
+ "source": [
+ "# Older TF1-style draft of the same function, kept for reference (fully commented out)\n",
+ "#def create_adversarial_example(x2, y_new, model_bidirectional):\n",
+ "#    # convert the label to a one-hot encoded vector\n",
+ "#    y = tf.keras.utils.to_categorical(y_new, num_classes=20)\n",
+ "#    # compute the gradient of the loss with respect to the input\n",
+ "#    logits = model_bidirectional(x2)\n",
+ "#    loss = tf.losses.categorical_crossentropy(y_new, logits)\n",
+ "#    grads, = tf.gradients(loss, x2)\n",
+ "#    # create an adversarial example by adding the sign of the gradient to the input\n",
+ "#    epsilon = 0.01\n",
+ "#    x_adv = x2 + epsilon * tf.sign(grads)\n",
+ "#    x_adv = tf.clip_by_value(x_adv, 0, 1)\n",
+ "#    return x_adv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-01-14T20:08:50.574052Z",
+ "iopub.status.busy": "2023-01-14T20:08:50.573757Z",
+ "iopub.status.idle": "2023-01-14T20:09:00.767976Z",
+ "shell.execute_reply": "2023-01-14T20:09:00.766358Z",
+ "shell.execute_reply.started": "2023-01-14T20:08:50.574022Z"
+ },
+ "id": "Tl_qP5S6emS_"
+ },
+ "outputs": [],
+ "source": [
+ "# Create an adversarial example and test it with the model\n",
+ "x_adv = create_adversarial_example(x2, y_new, model_bidirectional)\n",
+ "y_pred_adv = model_bidirectional(x_adv).numpy().argmax()  # get the predicted label\n",
+ "acc = (y_pred_adv == y_new).mean()  # calculate the accuracy\n",
+ "print(f'Model accuracy on adversarial example: {acc:.2f}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-01-14T19:56:28.078039Z",
+ "iopub.status.busy": "2023-01-14T19:56:28.074638Z",
+ "iopub.status.idle": "2023-01-14T19:56:39.922249Z",
+ "shell.execute_reply": "2023-01-14T19:56:39.920914Z",
+ "shell.execute_reply.started": "2023-01-14T19:56:28.077987Z"
+ },
+ "id": "bFH3lL8UemS_"
+ },
+ "outputs": [],
+ "source": [
+ "# Test the adversarial example on a whole batch\n",
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "x_adv = create_adversarial_example(x2, y_new, model_bidirectional)\n",
+ "logits_adv = model_bidirectional(x_adv)\n",
+ "y_pred_adv = np.argmax(logits_adv, axis=1)\n",
+ "accuracy = accuracy_score(y_new, y_pred_adv)\n",
+ "print('Accuracy on adversarial example:', accuracy)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "machine_shape": "hm",
+ "provenance": []
+ },
+ "gpuClass": "standard",
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+ }
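For completeness, a small sketch (illustrative, not in the commit) of the reload step mentioned in the `data_to_parquet` docstring: the file names match the ones written above, and the (60, 431, 3) target shape matches the recorded `new_X.shape` output.

```python
# Hypothetical reload of the flattened parquet files written by Data_preparation.ipynb
import numpy as np
import pandas as pd

X_flat = pd.read_parquet("./data/X_CNN_60x431x3_7times.parquet")
y_flat = pd.read_parquet("./data/y_CNN_7times.parquet")

# Undo the flattening done by data_to_parquet: (n, 60*431*3) -> (n, 60, 431, 3)
X = X_flat.to_numpy().reshape(-1, 60, 431, 3).astype(np.float32)
y = y_flat.to_numpy().ravel()

print(X.shape, y.shape)  # e.g. (14000, 60, 431, 3) (14000,)
```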
RNN_Architecture.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
evaluation.py ADDED
@@ -0,0 +1,99 @@
+ import seaborn as sns
+ import tensorflow as tf
+ import matplotlib.pyplot as plt
+
+ def plot_loss(history, axis = None):
+     """
+     Parameters
+     ----------
+
+     history : 'tf.keras.callbacks.History' object
+     axis : 'matplotlib.pyplot.axis' object
+     """
+     if axis is not None:
+         axis.plot(history.epoch, history.history["loss"],
+                   label = "Train loss", color = "#191970")
+         axis.plot(history.epoch, history.history["val_loss"],
+                   label = "Val loss", color = "#00CC33")
+         axis.set_title("Loss")
+         axis.legend()
+     else:
+         plt.plot(history.epoch, history.history["loss"],
+                  label = "Train loss", color = "#191970")
+         plt.plot(history.epoch, history.history["val_loss"],
+                  label = "Val loss", color = "#00CC33")
+         plt.title("Loss")
+         plt.legend()
+
+
+ def plot_accuracy(history, axis = None):
+     """
+     Parameters
+     ----------
+
+     history : 'tf.keras.callbacks.History' object
+     axis : 'matplotlib.pyplot.axis' object
+     """
+     if axis is not None:
+         axis.plot(history.epoch, history.history["accuracy"],
+                   label = "Train accuracy", color = "#191970")
+         axis.plot(history.epoch, history.history["val_accuracy"],
+                   label = "Val accuracy", color = "#00CC33")
+         axis.set_ylim(0, 1.1)
+         axis.set_title("Accuracy")
+         axis.legend()
+     else:
+         plt.plot(history.epoch, history.history["accuracy"],
+                  label = "Train accuracy", color = "#191970")
+         plt.plot(history.epoch, history.history["val_accuracy"],
+                  label = "Val accuracy", color = "#00CC33")
+         plt.title("Accuracy")
+         plt.ylim(0, 1.1)
+         plt.legend()
+
+
+ def keras_model_memory_usage_in_bytes(model, *, batch_size: int):
+     """
+     Return the estimated memory usage of a given Keras model in bytes.
+     This includes the model weights and layers, but excludes the dataset.
+
+     The model shapes are multiplied by the batch size, but the weights are not.
+
+     Parameters
+     ----------
+     model: A Keras model.
+     batch_size: The batch size you intend to run the model with. If you
+         have already specified the batch size in the model itself, then
+         pass `1` as the argument here.
+
+     Returns
+     -------
+     An estimate of the Keras model's memory usage in bytes.
+     """
+     default_dtype = tf.keras.backend.floatx()
+     shapes_mem_count = 0
+     internal_model_mem_count = 0
+     for layer in model.layers:
+         if isinstance(layer, tf.keras.Model):
+             internal_model_mem_count += keras_model_memory_usage_in_bytes(layer,
+                                                                           batch_size = batch_size)
+         single_layer_mem = tf.as_dtype(layer.dtype or default_dtype).size
+         out_shape = layer.output_shape
+         if isinstance(out_shape, list):
+             out_shape = out_shape[0]
+         for s in out_shape:
+             if s is None:
+                 continue
+             single_layer_mem *= s
+         shapes_mem_count += single_layer_mem
+
+     trainable_count = sum([tf.keras.backend.count_params(p)
+                            for p in model.trainable_weights])
+     non_trainable_count = sum([tf.keras.backend.count_params(p)
+                                for p in model.non_trainable_weights])
+
+     total_memory = (batch_size * shapes_mem_count + internal_model_mem_count
+                     + trainable_count + non_trainable_count)
+
+     return total_memory
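A minimal, hypothetical usage sketch of these helpers: the tiny model and the random data below are stand-ins (not part of the project) whose only purpose is to produce a `History` object with train/validation loss and accuracy.

```python
# Hypothetical usage of evaluation.py; the model and data are illustrative stand-ins
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import evaluation

# Tiny stand-in classifier over the (60, 431, 3) input shape used in the notebooks
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape = (60, 431, 3)),
    tf.keras.layers.Dense(50, activation = "softmax")
])
model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy",
              metrics = ["accuracy"])

X_demo = np.random.rand(32, 60, 431, 3).astype(np.float32)
y_demo = np.random.randint(0, 50, size = (32, 1))
history = model.fit(X_demo, y_demo, validation_split = 0.25, epochs = 3, verbose = 0)

# Side-by-side loss and accuracy curves on user-supplied axes
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 4))
evaluation.plot_loss(history, axis = ax1)
evaluation.plot_accuracy(history, axis = ax2)
plt.show()

print(evaluation.keras_model_memory_usage_in_bytes(model, batch_size = 32))
```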