mirodil committed on
Commit 1020825
1 Parent(s): a1e206c

Upload hands_on.ipynb

Files changed (1)
  1. hands_on.ipynb +1002 -0
hands_on.ipynb ADDED
@@ -0,0 +1,1002 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from huggingface_hub import notebook_login\n",
10
+ "notebook_login()"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "from datasets import load_dataset, DatasetDict"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "data": {
29
+ "text/plain": [
30
+ "Dataset({\n",
31
+ " features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],\n",
32
+ " num_rows: 563\n",
33
+ "})"
34
+ ]
35
+ },
36
+ "execution_count": 2,
37
+ "metadata": {},
38
+ "output_type": "execute_result"
39
+ }
40
+ ],
41
+ "source": [
42
+ "minds14_train = load_dataset(\n",
43
+ " \"PolyAI/minds14\", \n",
44
+ " \"en-US\",\n",
45
+ " split=\"train\"\n",
46
+ ")\n",
47
+ "minds14_train"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "data": {
57
+ "text/plain": [
58
+ "DatasetDict({\n",
59
+ " train: Dataset({\n",
60
+ " features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],\n",
61
+ " num_rows: 450\n",
62
+ " })\n",
63
+ " test: Dataset({\n",
64
+ " features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],\n",
65
+ " num_rows: 113\n",
66
+ " })\n",
67
+ "})"
68
+ ]
69
+ },
70
+ "execution_count": 3,
71
+ "metadata": {},
72
+ "output_type": "execute_result"
73
+ }
74
+ ],
75
+ "source": [
76
+ "minds14 = DatasetDict()\n",
77
+ "\n",
78
+ "minds14[\"train\"] = minds14_train.select(range(450))\n",
79
+ "minds14[\"test\"] = minds14_train.select(range(450, 563))\n",
80
+ "\n",
81
+ "minds14"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 4,
87
+ "metadata": {},
88
+ "outputs": [
89
+ {
90
+ "data": {
91
+ "text/plain": [
92
+ "DatasetDict({\n",
93
+ " train: Dataset({\n",
94
+ " features: ['audio', 'transcription'],\n",
95
+ " num_rows: 450\n",
96
+ " })\n",
97
+ " test: Dataset({\n",
98
+ " features: ['audio', 'transcription'],\n",
99
+ " num_rows: 113\n",
100
+ " })\n",
101
+ "})"
102
+ ]
103
+ },
104
+ "execution_count": 4,
105
+ "metadata": {},
106
+ "output_type": "execute_result"
107
+ }
108
+ ],
109
+ "source": [
110
+ "minds14 = minds14.select_columns(['audio', 'transcription'])\n",
111
+ "minds14"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 5,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "from transformers import WhisperProcessor\n",
121
+ "\n",
122
+ "processor = WhisperProcessor.from_pretrained(\n",
123
+ " \"openai/whisper-tiny\", language=\"english\", task=\"transcribe\"\n",
124
+ ")"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 6,
130
+ "metadata": {},
131
+ "outputs": [
132
+ {
133
+ "data": {
134
+ "text/plain": [
135
+ "{'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),\n",
136
+ " 'transcription': Value(dtype='string', id=None)}"
137
+ ]
138
+ },
139
+ "execution_count": 6,
140
+ "metadata": {},
141
+ "output_type": "execute_result"
142
+ }
143
+ ],
144
+ "source": [
145
+ "minds14[\"train\"].features"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 7,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "from datasets import Audio\n",
155
+ "\n",
156
+ "sampling_rate = processor.feature_extractor.sampling_rate\n",
157
+ "minds14 = minds14.cast_column(\"audio\", Audio(sampling_rate=sampling_rate))"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 8,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "def prepare_dataset(example):\n",
167
+ " audio = example[\"audio\"]\n",
168
+ "\n",
169
+ " example = processor(\n",
170
+ " audio=audio[\"array\"],\n",
171
+ " sampling_rate=audio[\"sampling_rate\"],\n",
172
+ " text=example[\"transcription\"],\n",
173
+ " )\n",
174
+ "\n",
175
+ " # compute input length of audio sample in seconds\n",
176
+ " example[\"input_length\"] = len(audio[\"array\"]) / audio[\"sampling_rate\"]\n",
177
+ "\n",
178
+ " return example"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": 9,
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "data": {
188
+ "application/vnd.jupyter.widget-view+json": {
189
+ "model_id": "356d0ccec48f41b9ad10504ae0ca4813",
190
+ "version_major": 2,
191
+ "version_minor": 0
192
+ },
193
+ "text/plain": [
194
+ "Map: 0%| | 0/450 [00:00<?, ? examples/s]"
195
+ ]
196
+ },
197
+ "metadata": {},
198
+ "output_type": "display_data"
199
+ },
200
+ {
201
+ "data": {
202
+ "application/vnd.jupyter.widget-view+json": {
203
+ "model_id": "ef753a60316c4115924c49052eeb411d",
204
+ "version_major": 2,
205
+ "version_minor": 0
206
+ },
207
+ "text/plain": [
208
+ "Map: 0%| | 0/113 [00:00<?, ? examples/s]"
209
+ ]
210
+ },
211
+ "metadata": {},
212
+ "output_type": "display_data"
213
+ }
214
+ ],
215
+ "source": [
216
+ "minds14 = minds14.map(\n",
217
+ " prepare_dataset, remove_columns=minds14.column_names[\"train\"], num_proc=1\n",
218
+ ")"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 10,
224
+ "metadata": {},
225
+ "outputs": [],
226
+ "source": [
227
+ "max_input_length = 30.0\n",
228
+ "def is_audio_in_length_range(length):\n",
229
+ " return length < max_input_length"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 11,
235
+ "metadata": {},
236
+ "outputs": [
237
+ {
238
+ "data": {
239
+ "application/vnd.jupyter.widget-view+json": {
240
+ "model_id": "2292d10d955d4d958e07849f0abb57c8",
241
+ "version_major": 2,
242
+ "version_minor": 0
243
+ },
244
+ "text/plain": [
245
+ "Filter: 0%| | 0/450 [00:00<?, ? examples/s]"
246
+ ]
247
+ },
248
+ "metadata": {},
249
+ "output_type": "display_data"
250
+ }
251
+ ],
252
+ "source": [
253
+ "minds14[\"train\"] = minds14[\"train\"].filter(\n",
254
+ " is_audio_in_length_range,\n",
255
+ " input_columns=[\"input_length\"],\n",
256
+ ")"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 12,
262
+ "metadata": {},
263
+ "outputs": [
264
+ {
265
+ "data": {
266
+ "text/plain": [
267
+ "Dataset({\n",
268
+ " features: ['input_features', 'labels', 'input_length'],\n",
269
+ " num_rows: 445\n",
270
+ "})"
271
+ ]
272
+ },
273
+ "execution_count": 12,
274
+ "metadata": {},
275
+ "output_type": "execute_result"
276
+ }
277
+ ],
278
+ "source": [
279
+ "minds14['train']"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "markdown",
284
+ "metadata": {},
285
+ "source": [
286
+ "### Training and Evaluation"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": 13,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "import torch\n",
296
+ "\n",
297
+ "from dataclasses import dataclass\n",
298
+ "from typing import Any, Dict, List, Union\n",
299
+ "\n",
300
+ "\n",
301
+ "@dataclass\n",
302
+ "class DataCollatorSpeechSeq2SeqWithPadding:\n",
303
+ " processor: Any\n",
304
+ "\n",
305
+ " def __call__(\n",
306
+ " self, features: List[Dict[str, Union[List[int], torch.Tensor]]]\n",
307
+ " ) -> Dict[str, torch.Tensor]:\n",
308
+ " # split inputs and labels since they have to be of different lengths and need different padding methods\n",
309
+ " # first treat the audio inputs by simply returning torch tensors\n",
310
+ " input_features = [\n",
311
+ " {\"input_features\": feature[\"input_features\"][0]} for feature in features\n",
312
+ " ]\n",
313
+ " batch = self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n",
314
+ "\n",
315
+ " # get the tokenized label sequences\n",
316
+ " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n",
317
+ " # pad the labels to max length\n",
318
+ " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n",
319
+ "\n",
320
+ " # replace padding with -100 to ignore loss correctly\n",
321
+ " labels = labels_batch[\"input_ids\"].masked_fill(\n",
322
+ " labels_batch.attention_mask.ne(1), -100\n",
323
+ " )\n",
324
+ "\n",
325
+ " # if bos token is appended in previous tokenization step,\n",
326
+ " # cut bos token here as it's appended later anyway\n",
327
+ " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n",
328
+ " labels = labels[:, 1:]\n",
329
+ "\n",
330
+ " batch[\"labels\"] = labels\n",
331
+ "\n",
332
+ " return batch"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": 14,
338
+ "metadata": {},
339
+ "outputs": [],
340
+ "source": [
341
+ "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 15,
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "import evaluate\n",
351
+ "from transformers.models.whisper.english_normalizer import BasicTextNormalizer\n",
352
+ "\n",
353
+ "metric = evaluate.load(\"wer\")\n",
354
+ "normalizer = BasicTextNormalizer()\n",
355
+ "\n",
356
+ "def compute_metrics(pred):\n",
357
+ " pred_ids = pred.predictions\n",
358
+ " label_ids = pred.label_ids\n",
359
+ "\n",
360
+ " # replace -100 with the pad_token_id\n",
361
+ " label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n",
362
+ "\n",
363
+ " # we do not want to group tokens when computing the metrics\n",
364
+ " pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)\n",
365
+ " label_str = processor.batch_decode(label_ids, skip_special_tokens=True)\n",
366
+ "\n",
367
+ " # compute orthographic wer\n",
368
+ " wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)\n",
369
+ "\n",
370
+ " # compute normalised WER\n",
371
+ " pred_str_norm = [normalizer(pred) for pred in pred_str]\n",
372
+ " label_str_norm = [normalizer(label) for label in label_str]\n",
373
+ " # filtering step to only evaluate the samples that correspond to non-zero references:\n",
374
+ " pred_str_norm = [\n",
375
+ " pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0\n",
376
+ " ]\n",
377
+ " label_str_norm = [\n",
378
+ " label_str_norm[i]\n",
379
+ " for i in range(len(label_str_norm))\n",
380
+ " if len(label_str_norm[i]) > 0\n",
381
+ " ]\n",
382
+ "\n",
383
+ " wer = 100 * metric.compute(predictions=pred_str_norm, references=label_str_norm)\n",
384
+ "\n",
385
+ " return {\"wer_ortho\": wer_ortho, \"wer\": wer}"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 16,
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "from transformers import WhisperForConditionalGeneration\n",
395
+ "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": 17,
401
+ "metadata": {},
402
+ "outputs": [],
403
+ "source": [
404
+ "from functools import partial\n",
405
+ "\n",
406
+ "# disable cache during training since it's incompatible with gradient checkpointing\n",
407
+ "model.config.use_cache = False\n",
408
+ "\n",
409
+ "# set language and task for generation and re-enable cache\n",
410
+ "model.generate = partial(\n",
411
+ " model.generate, language=\"english\", task=\"transcribe\", use_cache=True\n",
412
+ ")"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": 18,
418
+ "metadata": {},
419
+ "outputs": [],
420
+ "source": [
421
+ "from transformers import Seq2SeqTrainingArguments\n",
422
+ "\n",
423
+ "training_args = Seq2SeqTrainingArguments(\n",
424
+ " output_dir=\"./whisper-tiny-en-us-minds14\", # name on the HF Hub\n",
425
+ " per_device_train_batch_size=16,\n",
426
+ " gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size\n",
427
+ " learning_rate=1e-5,\n",
428
+ " lr_scheduler_type=\"constant_with_warmup\",\n",
429
+ " warmup_steps=50,\n",
430
+ " max_steps=4000, # reduce if you don't have your own GPU or a Colab paid plan\n",
431
+ " gradient_checkpointing=True,\n",
432
+ " # fp16=True,\n",
433
+ " # fp16_full_eval=True,\n",
434
+ " evaluation_strategy=\"steps\",\n",
435
+ " per_device_eval_batch_size=16,\n",
436
+ " predict_with_generate=True,\n",
437
+ " generation_max_length=225,\n",
438
+ " save_steps=500,\n",
439
+ " eval_steps=500,\n",
440
+ " logging_steps=25,\n",
441
+ " report_to=[\"tensorboard\"],\n",
442
+ " load_best_model_at_end=True,\n",
443
+ " metric_for_best_model=\"wer\",\n",
444
+ " greater_is_better=False,\n",
445
+ " # push_to_hub=False,\n",
446
+ ")"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": 19,
452
+ "metadata": {},
453
+ "outputs": [],
454
+ "source": [
455
+ "from transformers import Seq2SeqTrainer\n",
456
+ "\n",
457
+ "trainer = Seq2SeqTrainer(\n",
458
+ " args=training_args,\n",
459
+ " model=model,\n",
460
+ " train_dataset=minds14[\"train\"],\n",
461
+ " eval_dataset=minds14[\"test\"],\n",
462
+ " data_collator=data_collator,\n",
463
+ " compute_metrics=compute_metrics,\n",
464
+ " tokenizer=processor,\n",
465
+ ")"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 20,
471
+ "metadata": {},
472
+ "outputs": [
473
+ {
474
+ "data": {
475
+ "application/vnd.jupyter.widget-view+json": {
476
+ "model_id": "9dcf642e434e48468854ec1cbaa6120c",
477
+ "version_major": 2,
478
+ "version_minor": 0
479
+ },
480
+ "text/plain": [
481
+ " 0%| | 0/4000 [00:00<?, ?it/s]"
482
+ ]
483
+ },
484
+ "metadata": {},
485
+ "output_type": "display_data"
486
+ },
487
+ {
488
+ "name": "stderr",
489
+ "output_type": "stream",
490
+ "text": [
491
+ "/Users/mkhojira/Projects/mml/audio-course/venv/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
492
+ " warnings.warn(\n"
493
+ ]
494
+ },
495
+ {
496
+ "name": "stdout",
497
+ "output_type": "stream",
498
+ "text": [
499
+ "{'loss': 1.584, 'learning_rate': 5e-06, 'epoch': 0.89}\n",
500
+ "{'loss': 0.6567, 'learning_rate': 1e-05, 'epoch': 1.79}\n",
501
+ "{'loss': 0.1857, 'learning_rate': 1e-05, 'epoch': 2.68}\n",
502
+ "{'loss': 0.1218, 'learning_rate': 1e-05, 'epoch': 3.57}\n",
503
+ "{'loss': 0.0876, 'learning_rate': 1e-05, 'epoch': 4.46}\n",
504
+ "{'loss': 0.0512, 'learning_rate': 1e-05, 'epoch': 5.36}\n",
505
+ "{'loss': 0.0299, 'learning_rate': 1e-05, 'epoch': 6.25}\n",
506
+ "{'loss': 0.016, 'learning_rate': 1e-05, 'epoch': 7.14}\n",
507
+ "{'loss': 0.0085, 'learning_rate': 1e-05, 'epoch': 8.04}\n",
508
+ "{'loss': 0.0038, 'learning_rate': 1e-05, 'epoch': 8.93}\n",
509
+ "{'loss': 0.0028, 'learning_rate': 1e-05, 'epoch': 9.82}\n",
510
+ "{'loss': 0.0023, 'learning_rate': 1e-05, 'epoch': 10.71}\n",
511
+ "{'loss': 0.0015, 'learning_rate': 1e-05, 'epoch': 11.61}\n",
512
+ "{'loss': 0.0012, 'learning_rate': 1e-05, 'epoch': 12.5}\n",
513
+ "{'loss': 0.0011, 'learning_rate': 1e-05, 'epoch': 13.39}\n",
514
+ "{'loss': 0.0009, 'learning_rate': 1e-05, 'epoch': 14.29}\n",
515
+ "{'loss': 0.0008, 'learning_rate': 1e-05, 'epoch': 15.18}\n",
516
+ "{'loss': 0.0007, 'learning_rate': 1e-05, 'epoch': 16.07}\n",
517
+ "{'loss': 0.0007, 'learning_rate': 1e-05, 'epoch': 16.96}\n",
518
+ "{'loss': 0.0006, 'learning_rate': 1e-05, 'epoch': 17.86}\n"
519
+ ]
520
+ },
521
+ {
522
+ "data": {
523
+ "application/vnd.jupyter.widget-view+json": {
524
+ "model_id": "6448ea85978f4e14ad837324e482d808",
525
+ "version_major": 2,
526
+ "version_minor": 0
527
+ },
528
+ "text/plain": [
529
+ " 0%| | 0/8 [00:00<?, ?it/s]"
530
+ ]
531
+ },
532
+ "metadata": {},
533
+ "output_type": "display_data"
534
+ },
535
+ {
536
+ "name": "stdout",
537
+ "output_type": "stream",
538
+ "text": [
539
+ "{'eval_loss': 0.25609758496284485, 'eval_wer_ortho': 35.90376310919186, 'eval_wer': 35.30106257378985, 'eval_runtime': 27.7439, 'eval_samples_per_second': 4.073, 'eval_steps_per_second': 0.288, 'epoch': 17.86}\n"
540
+ ]
541
+ },
542
+ {
543
+ "name": "stderr",
544
+ "output_type": "stream",
545
+ "text": [
546
+ "/Users/mkhojira/Projects/mml/audio-course/venv/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
547
+ " warnings.warn(\n"
548
+ ]
549
+ },
550
+ {
551
+ "name": "stdout",
552
+ "output_type": "stream",
553
+ "text": [
554
+ "{'loss': 0.0006, 'learning_rate': 1e-05, 'epoch': 18.75}\n",
555
+ "{'loss': 0.0005, 'learning_rate': 1e-05, 'epoch': 19.64}\n",
556
+ "{'loss': 0.0005, 'learning_rate': 1e-05, 'epoch': 20.54}\n",
557
+ "{'loss': 0.0005, 'learning_rate': 1e-05, 'epoch': 21.43}\n",
558
+ "{'loss': 0.0004, 'learning_rate': 1e-05, 'epoch': 22.32}\n",
559
+ "{'loss': 0.0004, 'learning_rate': 1e-05, 'epoch': 23.21}\n",
560
+ "{'loss': 0.0004, 'learning_rate': 1e-05, 'epoch': 24.11}\n",
561
+ "{'loss': 0.0003, 'learning_rate': 1e-05, 'epoch': 25.0}\n",
562
+ "{'loss': 0.0003, 'learning_rate': 1e-05, 'epoch': 25.89}\n",
563
+ "{'loss': 0.0003, 'learning_rate': 1e-05, 'epoch': 26.79}\n",
564
+ "{'loss': 0.0003, 'learning_rate': 1e-05, 'epoch': 27.68}\n",
565
+ "{'loss': 0.0003, 'learning_rate': 1e-05, 'epoch': 28.57}\n",
566
+ "{'loss': 0.0003, 'learning_rate': 1e-05, 'epoch': 29.46}\n",
567
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 30.36}\n",
568
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 31.25}\n",
569
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 32.14}\n",
570
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 33.04}\n",
571
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 33.93}\n",
572
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 34.82}\n",
573
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 35.71}\n"
574
+ ]
575
+ },
576
+ {
577
+ "data": {
578
+ "application/vnd.jupyter.widget-view+json": {
579
+ "model_id": "bb97f0dd1de841f4a6904e6240ffa58a",
580
+ "version_major": 2,
581
+ "version_minor": 0
582
+ },
583
+ "text/plain": [
584
+ " 0%| | 0/8 [00:00<?, ?it/s]"
585
+ ]
586
+ },
587
+ "metadata": {},
588
+ "output_type": "display_data"
589
+ },
590
+ {
591
+ "name": "stdout",
592
+ "output_type": "stream",
593
+ "text": [
594
+ "{'eval_loss': 0.2792435586452484, 'eval_wer_ortho': 36.4589759407773, 'eval_wer': 35.9504132231405, 'eval_runtime': 20.8669, 'eval_samples_per_second': 5.415, 'eval_steps_per_second': 0.383, 'epoch': 35.71}\n"
595
+ ]
596
+ },
597
+ {
598
+ "name": "stderr",
599
+ "output_type": "stream",
600
+ "text": [
601
+ "/Users/mkhojira/Projects/mml/audio-course/venv/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
602
+ " warnings.warn(\n"
603
+ ]
604
+ },
605
+ {
606
+ "name": "stdout",
607
+ "output_type": "stream",
608
+ "text": [
609
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 36.61}\n",
610
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 37.5}\n",
611
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 38.39}\n",
612
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 39.29}\n",
613
+ "{'loss': 0.0002, 'learning_rate': 1e-05, 'epoch': 40.18}\n",
614
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 41.07}\n",
615
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 41.96}\n",
616
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 42.86}\n",
617
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 43.75}\n",
618
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 44.64}\n",
619
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 45.54}\n",
620
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 46.43}\n",
621
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 47.32}\n",
622
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 48.21}\n",
623
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 49.11}\n",
624
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 50.0}\n",
625
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 50.89}\n",
626
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 51.79}\n",
627
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 52.68}\n",
628
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 53.57}\n"
629
+ ]
630
+ },
631
+ {
632
+ "data": {
633
+ "application/vnd.jupyter.widget-view+json": {
634
+ "model_id": "4f8a8ea4cd774a72a6b89f714f17a78e",
635
+ "version_major": 2,
636
+ "version_minor": 0
637
+ },
638
+ "text/plain": [
639
+ " 0%| | 0/8 [00:00<?, ?it/s]"
640
+ ]
641
+ },
642
+ "metadata": {},
643
+ "output_type": "display_data"
644
+ },
645
+ {
646
+ "name": "stdout",
647
+ "output_type": "stream",
648
+ "text": [
649
+ "{'eval_loss': 0.29441583156585693, 'eval_wer_ortho': 36.705737199259715, 'eval_wer': 36.36363636363637, 'eval_runtime': 20.6363, 'eval_samples_per_second': 5.476, 'eval_steps_per_second': 0.388, 'epoch': 53.57}\n"
650
+ ]
651
+ },
652
+ {
653
+ "name": "stderr",
654
+ "output_type": "stream",
655
+ "text": [
656
+ "/Users/mkhojira/Projects/mml/audio-course/venv/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
657
+ " warnings.warn(\n"
658
+ ]
659
+ },
660
+ {
661
+ "name": "stdout",
662
+ "output_type": "stream",
663
+ "text": [
664
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 54.46}\n",
665
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 55.36}\n",
666
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 56.25}\n",
667
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 57.14}\n",
668
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 58.04}\n",
669
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 58.93}\n",
670
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 59.82}\n",
671
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 60.71}\n",
672
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 61.61}\n",
673
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 62.5}\n",
674
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 63.39}\n",
675
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 64.29}\n",
676
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 65.18}\n",
677
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 66.07}\n",
678
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 66.96}\n",
679
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 67.86}\n",
680
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 68.75}\n",
681
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 69.64}\n",
682
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 70.54}\n",
683
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 71.43}\n"
684
+ ]
685
+ },
686
+ {
687
+ "data": {
688
+ "application/vnd.jupyter.widget-view+json": {
689
+ "model_id": "c3e15e770b014f84beff76935f5e1069",
690
+ "version_major": 2,
691
+ "version_minor": 0
692
+ },
693
+ "text/plain": [
694
+ " 0%| | 0/8 [00:00<?, ?it/s]"
695
+ ]
696
+ },
697
+ "metadata": {},
698
+ "output_type": "display_data"
699
+ },
700
+ {
701
+ "name": "stdout",
702
+ "output_type": "stream",
703
+ "text": [
704
+ "{'eval_loss': 0.30616462230682373, 'eval_wer_ortho': 36.76742751388032, 'eval_wer': 36.481700118063756, 'eval_runtime': 20.6248, 'eval_samples_per_second': 5.479, 'eval_steps_per_second': 0.388, 'epoch': 71.43}\n"
705
+ ]
706
+ },
707
+ {
708
+ "name": "stderr",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "/Users/mkhojira/Projects/mml/audio-course/venv/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
712
+ " warnings.warn(\n"
713
+ ]
714
+ },
715
+ {
716
+ "name": "stdout",
717
+ "output_type": "stream",
718
+ "text": [
719
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 72.32}\n",
720
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 73.21}\n",
721
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 74.11}\n",
722
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 75.0}\n",
723
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 75.89}\n",
724
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 76.79}\n",
725
+ "{'loss': 0.0001, 'learning_rate': 1e-05, 'epoch': 77.68}\n",
726
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 78.57}\n",
727
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 79.46}\n",
728
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 80.36}\n",
729
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 81.25}\n",
730
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 82.14}\n",
731
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 83.04}\n",
732
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 83.93}\n",
733
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 84.82}\n",
734
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 85.71}\n",
735
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 86.61}\n",
736
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 87.5}\n",
737
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 88.39}\n",
738
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 89.29}\n"
739
+ ]
740
+ },
741
+ {
742
+ "data": {
743
+ "application/vnd.jupyter.widget-view+json": {
744
+ "model_id": "287e32b32c004d56bfaabf8398bc0b57",
745
+ "version_major": 2,
746
+ "version_minor": 0
747
+ },
748
+ "text/plain": [
749
+ " 0%| | 0/8 [00:00<?, ?it/s]"
750
+ ]
751
+ },
752
+ "metadata": {},
753
+ "output_type": "display_data"
754
+ },
755
+ {
756
+ "name": "stdout",
757
+ "output_type": "stream",
758
+ "text": [
759
+ "{'eval_loss': 0.31588611006736755, 'eval_wer_ortho': 36.82911782850093, 'eval_wer': 36.77685950413223, 'eval_runtime': 20.6213, 'eval_samples_per_second': 5.48, 'eval_steps_per_second': 0.388, 'epoch': 89.29}\n"
760
+ ]
761
+ },
762
+ {
763
+ "name": "stderr",
764
+ "output_type": "stream",
765
+ "text": [
766
+ "/Users/mkhojira/Projects/mml/audio-course/venv/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
767
+ " warnings.warn(\n"
768
+ ]
769
+ },
770
+ {
771
+ "name": "stdout",
772
+ "output_type": "stream",
773
+ "text": [
774
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 90.18}\n",
775
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 91.07}\n",
776
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 91.96}\n",
777
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 92.86}\n",
778
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 93.75}\n",
779
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 94.64}\n",
780
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 95.54}\n",
781
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 96.43}\n",
782
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 97.32}\n",
783
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 98.21}\n",
784
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 99.11}\n",
785
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 100.0}\n",
786
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 100.89}\n",
787
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 101.79}\n",
788
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 102.68}\n",
789
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 103.57}\n",
790
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 104.46}\n",
791
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 105.36}\n",
792
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 106.25}\n",
793
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 107.14}\n"
794
+ ]
795
+ },
796
+ {
797
+ "data": {
798
+ "application/vnd.jupyter.widget-view+json": {
799
+ "model_id": "b4c5b0d4040949408a37a90be1cc106b",
800
+ "version_major": 2,
801
+ "version_minor": 0
802
+ },
803
+ "text/plain": [
804
+ " 0%| | 0/8 [00:00<?, ?it/s]"
805
+ ]
806
+ },
807
+ "metadata": {},
808
+ "output_type": "display_data"
809
+ },
810
+ {
811
+ "name": "stdout",
812
+ "output_type": "stream",
813
+ "text": [
814
+ "{'eval_loss': 0.3247106671333313, 'eval_wer_ortho': 36.705737199259715, 'eval_wer': 36.658795749704844, 'eval_runtime': 20.5021, 'eval_samples_per_second': 5.512, 'eval_steps_per_second': 0.39, 'epoch': 107.14}\n"
815
+ ]
816
+ },
817
+ {
818
+ "name": "stderr",
819
+ "output_type": "stream",
820
+ "text": [
821
+ "/Users/mkhojira/Projects/mml/audio-course/venv/lib/python3.8/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
822
+ " warnings.warn(\n"
823
+ ]
824
+ },
825
+ {
826
+ "name": "stdout",
827
+ "output_type": "stream",
828
+ "text": [
829
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 108.04}\n",
830
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 108.93}\n",
831
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 109.82}\n",
832
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 110.71}\n",
833
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 111.61}\n",
834
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 112.5}\n",
835
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 113.39}\n",
836
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 114.29}\n",
837
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 115.18}\n",
838
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 116.07}\n",
839
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 116.96}\n",
840
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 117.86}\n",
841
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 118.75}\n",
842
+ "{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 119.64}\n"
843
+ ]
844
+ },
845
+ {
846
+ "ename": "KeyboardInterrupt",
847
+ "evalue": "",
848
+ "output_type": "error",
849
+ "traceback": [
850
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
851
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
852
+ "\u001b[1;32m/Users/mkhojira/Projects/mml/audio-course/unit5/hands_on.ipynb Cell 22\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/mkhojira/Projects/mml/audio-course/unit5/hands_on.ipynb#X34sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m trainer\u001b[39m.\u001b[39;49mtrain()\n",
853
+ "File \u001b[0;32m~/Projects/mml/audio-course/venv/lib/python3.8/site-packages/transformers/trainer.py:1555\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1553\u001b[0m hf_hub_utils\u001b[39m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1554\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 1555\u001b[0m \u001b[39mreturn\u001b[39;00m inner_training_loop(\n\u001b[1;32m 1556\u001b[0m args\u001b[39m=\u001b[39;49margs,\n\u001b[1;32m 1557\u001b[0m resume_from_checkpoint\u001b[39m=\u001b[39;49mresume_from_checkpoint,\n\u001b[1;32m 1558\u001b[0m trial\u001b[39m=\u001b[39;49mtrial,\n\u001b[1;32m 1559\u001b[0m ignore_keys_for_eval\u001b[39m=\u001b[39;49mignore_keys_for_eval,\n\u001b[1;32m 1560\u001b[0m )\n",
854
+ "File \u001b[0;32m~/Projects/mml/audio-course/venv/lib/python3.8/site-packages/transformers/trainer.py:1862\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 1859\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39maccelerator\u001b[39m.\u001b[39maccumulate(model):\n\u001b[1;32m 1860\u001b[0m tr_loss_step \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtraining_step(model, inputs)\n\u001b[0;32m-> 1862\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 1863\u001b[0m args\u001b[39m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 1864\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m 1865\u001b[0m \u001b[39mand\u001b[39;00m (torch\u001b[39m.\u001b[39misnan(tr_loss_step) \u001b[39mor\u001b[39;00m torch\u001b[39m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 1866\u001b[0m ):\n\u001b[1;32m 1867\u001b[0m \u001b[39m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 1868\u001b[0m tr_loss \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m tr_loss \u001b[39m/\u001b[39m (\u001b[39m1\u001b[39m \u001b[39m+\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstate\u001b[39m.\u001b[39mglobal_step \u001b[39m-\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_globalstep_last_logged)\n\u001b[1;32m 1869\u001b[0m \u001b[39melse\u001b[39;00m:\n",
855
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
856
+ ]
857
+ }
858
+ ],
859
+ "source": [
860
+ "trainer.train()"
861
+ ]
862
+ },
863
+ {
864
+ "cell_type": "code",
865
+ "execution_count": null,
866
+ "metadata": {},
867
+ "outputs": [],
868
+ "source": [
869
+ "# from transformers import GenerationConfig\n",
870
+ "# generation_config = GenerationConfig.from_pretrained(\"openai/whisper-tiny.en\")\n",
871
+ "# generation_config.push_to_hub('mirodil/whisper-tiny-en-us-minds14')"
872
+ ]
873
+ },
874
+ {
875
+ "cell_type": "code",
876
+ "execution_count": 21,
877
+ "metadata": {},
878
+ "outputs": [
879
+ {
880
+ "data": {
881
+ "application/vnd.jupyter.widget-view+json": {
882
+ "model_id": "5cb7500ba08c4c98b821669c3207517d",
883
+ "version_major": 2,
884
+ "version_minor": 0
885
+ },
886
+ "text/plain": [
887
+ "events.out.tfevents.1700719599.L67DDV9G7R.91939.0: 0%| | 0.00/29.3k [00:00<?, ?B/s]"
888
+ ]
889
+ },
890
+ "metadata": {},
891
+ "output_type": "display_data"
892
+ },
893
+ {
894
+ "data": {
895
+ "application/vnd.jupyter.widget-view+json": {
896
+ "model_id": "26bce367c9974964a5e06097af5959e8",
897
+ "version_major": 2,
898
+ "version_minor": 0
899
+ },
900
+ "text/plain": [
901
+ "model.safetensors: 0%| | 0.00/151M [00:00<?, ?B/s]"
902
+ ]
903
+ },
904
+ "metadata": {},
905
+ "output_type": "display_data"
906
+ },
907
+ {
908
+ "data": {
909
+ "application/vnd.jupyter.widget-view+json": {
910
+ "model_id": "d947a721dfcc44cab504adee4a2cab9f",
911
+ "version_major": 2,
912
+ "version_minor": 0
913
+ },
914
+ "text/plain": [
915
+ "training_args.bin: 0%| | 0.00/4.73k [00:00<?, ?B/s]"
916
+ ]
917
+ },
918
+ "metadata": {},
919
+ "output_type": "display_data"
920
+ },
921
+ {
922
+ "data": {
923
+ "application/vnd.jupyter.widget-view+json": {
924
+ "model_id": "c4487700a97e42188f5bf27cf538c82d",
925
+ "version_major": 2,
926
+ "version_minor": 0
927
+ },
928
+ "text/plain": [
929
+ "Upload 3 LFS files: 0%| | 0/3 [00:00<?, ?it/s]"
930
+ ]
931
+ },
932
+ "metadata": {},
933
+ "output_type": "display_data"
934
+ },
935
+ {
936
+ "data": {
937
+ "text/plain": [
938
+ "'https://huggingface.co/mirodil/whisper-tiny-en-us-minds14/tree/main/'"
939
+ ]
940
+ },
941
+ "execution_count": 21,
942
+ "metadata": {},
943
+ "output_type": "execute_result"
944
+ }
945
+ ],
946
+ "source": [
947
+ "kwargs = {\n",
948
+ " \"dataset_tags\": \"PolyAI/minds14\",\n",
949
+ " \"finetuned_from\": \"openai/whisper-tiny\",\n",
950
+ " \"tasks\": \"automatic-speech-recognition\",\n",
951
+ "}\n",
952
+ "trainer.push_to_hub(**kwargs)"
953
+ ]
954
+ },
955
+ {
956
+ "cell_type": "code",
957
+ "execution_count": null,
958
+ "metadata": {},
959
+ "outputs": [],
960
+ "source": [
961
+ "model.generation_config"
962
+ ]
963
+ },
964
+ {
965
+ "cell_type": "code",
966
+ "execution_count": null,
967
+ "metadata": {},
968
+ "outputs": [],
969
+ "source": [
970
+ "hasattr(model.generation_config, \"lang_to_id\")"
971
+ ]
972
+ },
973
+ {
974
+ "cell_type": "code",
975
+ "execution_count": null,
976
+ "metadata": {},
977
+ "outputs": [],
978
+ "source": []
979
+ }
980
+ ],
981
+ "metadata": {
982
+ "kernelspec": {
983
+ "display_name": "venv",
984
+ "language": "python",
985
+ "name": "python3"
986
+ },
987
+ "language_info": {
988
+ "codemirror_mode": {
989
+ "name": "ipython",
990
+ "version": 3
991
+ },
992
+ "file_extension": ".py",
993
+ "mimetype": "text/x-python",
994
+ "name": "python",
995
+ "nbconvert_exporter": "python",
996
+ "pygments_lexer": "ipython3",
997
+ "version": "3.8.17"
998
+ }
999
+ },
1000
+ "nbformat": 4,
1001
+ "nbformat_minor": 2
1002
+ }
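
The uploaded notebook fine-tunes openai/whisper-tiny on the en-US subset of PolyAI/minds14 and pushes the checkpoint to mirodil/whisper-tiny-en-us-minds14. A minimal inference sketch using the transformers ASR pipeline, assuming the pushed repo is public and the names above are unchanged:

from datasets import Audio, load_dataset
from transformers import pipeline

# Load the fine-tuned checkpoint pushed by trainer.push_to_hub() above.
# Assumes the Hub repo "mirodil/whisper-tiny-en-us-minds14" is accessible.
asr = pipeline(
    "automatic-speech-recognition",
    model="mirodil/whisper-tiny-en-us-minds14",
)

# Take one en-US clip and resample it to the 16 kHz rate Whisper expects.
minds = load_dataset("PolyAI/minds14", "en-US", split="train")
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
sample = minds[0]["audio"]

# The pipeline accepts a raw waveform together with its sampling rate.
print(asr({"array": sample["array"], "sampling_rate": sample["sampling_rate"]}))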