kimbochen committed
Commit 1925aa2 • 1 Parent(s): 22e3a01

Added fugashi sample code.
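fugashi is a Python wrapper around the MeCab morphological analyzer; the new cells use it to split Japanese transcriptions into words, which Japanese orthography does not separate with spaces. A minimal sketch of the usage the cells demonstrate, assuming fugashi plus a dictionary package such as unidic-lite is installed (the diff does not pin a dictionary):

```python
# Sketch of the fugashi usage added in this commit.
# Assumed setup: pip install fugashi unidic-lite  (dictionary choice is an assumption)
from fugashi import Tagger

tagger = Tagger('-Owakati')  # wakati mode: whitespace-separated word segmentation
text = "麩菓子は、麩を主材料とした日本の菓子。"

print(tagger.parse(text))  # '麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。'
print([word.surface for word in tagger(text)])  # word nodes expose surface forms
```

Word-level segmentation like this is what makes a word error rate well defined for Japanese text.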

Files changed (1):
  1. fine-tune-whisper-streaming.ipynb +303 -10
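Besides the sample cells, the functional change is in the dataset preparation function: batch['labels'] now stores the normalized transcription string, with the processor.tokenizer(transcription).input_ids call commented out, so the raw text stays available for inspection and segmentation. A hedged sketch of encoding the stored text later; processor and batch refer to the notebook's objects and are not defined in this snippet:

```python
# Hypothetical later step (e.g. in a data collator): labels were kept as raw
# text in prepare_dataset, so token ids are produced on demand.
label_ids = processor.tokenizer(batch["labels"]).input_ids
text_back = processor.tokenizer.decode(label_ids, skip_special_tokens=True)
print(text_back)  # should round-trip to the stored transcription
```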
fine-tune-whisper-streaming.ipynb CHANGED
@@ -306,7 +306,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 44,
 "id": "c085911c-a10a-41ef-8874-306e0503e9bb",
 "metadata": {},
 "outputs": [],
@@ -328,7 +328,8 @@
 " transcription = normalizer(transcription).strip()\n",
 " \n",
 " # encode target text to label ids\n",
- " batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
+ "# batch[\"labels\"] = processor.tokenizer(transcription).input_ids\n",
+ " batch['labels'] = transcription\n",
 " return batch"
 ]
 },
@@ -342,7 +343,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 45,
 "id": "a37a7cdb-9013-427f-8de9-6a8d0e9dc684",
 "metadata": {},
 "outputs": [],
@@ -360,7 +361,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 13,
+ "execution_count": 46,
 "id": "1b145699-acfc-4b1d-93a2-a2ad3d62674c",
 "metadata": {},
 "outputs": [],
@@ -381,7 +382,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 47,
 "id": "01cb25ef-4bb0-4325-9461-f59198acadf6",
 "metadata": {},
 "outputs": [],
@@ -402,7 +403,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 15,
+ "execution_count": 48,
 "id": "333f7f6e-6053-4d3b-8924-c733c79b82ac",
 "metadata": {},
 "outputs": [],
@@ -413,14 +414,252 @@
 ")"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "bede1184",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Reading metadata...: 6505it [00:00, 35406.66it/s]\n",
+ "Reading metadata...: 4485it [00:00, 19930.24it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'多から一へというのは、世界を因果的に決定論的に考えることである、過去から考えることである、機械的に考えることである。'"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "xb = next(iter(vectorized_datasets['train']))\n",
+ "xb['labels']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "ac1e8d5b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<|startoftranscript|>\n",
+ "<|ja|>\n",
+ "<|transcribe|>\n",
+ "<|notimestamps|>\n",
+ "多\n",
+ "から\n",
+ "一\n",
+ "へ\n",
+ "という\n",
+ "のは\n",
+ "、\n",
+ "世界\n",
+ "を\n",
+ "因\n",
+ "果\n",
+ "的\n",
+ "に\n",
+ "決\n",
+ "定\n",
+ "論\n",
+ "的\n",
+ "に\n",
+ "考\n",
+ "える\n",
+ "こと\n",
+ "で\n",
+ "ある\n",
+ "、\n",
+ "過去\n",
+ "から\n",
+ "考\n",
+ "える\n",
+ "こと\n",
+ "で\n",
+ "ある\n",
+ "、\n",
+ "機\n",
+ "�\n",
+ "�\n",
+ "的\n",
+ "に\n",
+ "考\n",
+ "える\n",
+ "こと\n",
+ "で\n",
+ "ある\n",
+ "。\n",
+ "<|endoftext|>\n"
+ ]
+ }
+ ],
+ "source": [
+ "idxs = processor.tokenizer(xb['labels']).input_ids\n",
+ "for idx in idxs:\n",
+ "    print(processor.tokenizer.decode(idx))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "d33cefc4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[多から,\n",
+ " 一,\n",
+ " へ,\n",
+ " と,\n",
+ " いう,\n",
+ " の,\n",
+ " は,\n",
+ " 、,\n",
+ " 世界,\n",
+ " を,\n",
+ " 因果,\n",
+ " 的,\n",
+ " に,\n",
+ " 決定,\n",
+ " 論,\n",
+ " 的,\n",
+ " に,\n",
+ " 考える,\n",
+ " こと,\n",
+ " で,\n",
+ " ある,\n",
+ " 、,\n",
+ " 過去,\n",
+ " から,\n",
+ " 考える,\n",
+ " こと,\n",
+ " で,\n",
+ " ある,\n",
+ " 、,\n",
+ " 機械,\n",
+ " 的,\n",
+ " に,\n",
+ " 考える,\n",
+ " こと,\n",
+ " で,\n",
+ " ある,\n",
+ " 。]"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tagger(xb['labels'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "2cbb82ef",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Help on method decode in module transformers.tokenization_utils_base:\n",
+ "\n",
+ "decode(token_ids: Union[int, List[int], ForwardRef('np.ndarray'), ForwardRef('torch.Tensor'), ForwardRef('tf.Tensor')], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs) -> str method of transformers.models.whisper.tokenization_whisper.WhisperTokenizer instance\n",
+ "    Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special\n",
+ "    tokens and clean up tokenization spaces.\n",
+ "    \n",
+ "    Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.\n",
+ "    \n",
+ "    Args:\n",
+ "        token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):\n",
+ "            List of tokenized input ids. Can be obtained using the `__call__` method.\n",
+ "        skip_special_tokens (`bool`, *optional*, defaults to `False`):\n",
+ "            Whether or not to remove special tokens in the decoding.\n",
+ "        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):\n",
+ "            Whether or not to clean up the tokenization spaces.\n",
+ "        kwargs (additional keyword arguments, *optional*):\n",
+ "            Will be passed to the underlying model specific decode method.\n",
+ "    \n",
+ "    Returns:\n",
+ "        `str`: The decoded sentence.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "help(processor.tokenizer.decode)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "b4b9bbfc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'麩 菓子 は 、 麩 を 主材 料 と し た 日本 の 菓子 。'"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from fugashi import Tagger\n",
+ "\n",
+ "tagger = Tagger('-Owakati')\n",
+ "text = \"麩菓子は、麩を主材料とした日本の菓子。\"\n",
+ "tagger.parse(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "833ca62d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[麩, 菓子, は, 、, 麩, を, 主材, 料, と, し, た, 日本, の, 菓子, 。]"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tagger(text)"
+ ]
+ },
 {
 "cell_type": "code",
 "execution_count": null,
- "id": "2d56f5bf",
+ "id": "7b7854d6",
 "metadata": {},
 "outputs": [],
 "source": [
- "vectorized_datasets['train'][0]"
+ "raw_datasets['']"
 ]
 },
 {
@@ -895,7 +1134,7 @@
 "execution_count": 26,
 "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de",
 "metadata": {
- "scrolled": false
+ "scrolled": true
 },
 "outputs": [
 {
@@ -1139,7 +1378,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
 "id": "95737cda-c5dd-4887-a4d0-dfcb0d61d977",
 "metadata": {},
 "outputs": [
@@ -1155,6 +1394,60 @@
 "Special tokens file saved in ./special_tokens_map.json\n",
 "added tokens file saved in ./added_tokens.json\n"
 ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a47d7e61b9144723a4208cc4cc492eee",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Upload file pytorch_model.bin: 0%| | 32.0k/922M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a7eb0d82c2fd4f978981915aa2314463",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Upload file runs/Dec12_04-37-47_150-136-44-233/events.out.tfevents.1670819878.150-136-44-233.69039.0: 100%|###…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "remote: Scanning LFS files for validity, may be slow... \n",
+ "remote: LFS file scan complete. \n",
+ "To https://huggingface.co/kimbochen/whisper-small-jp\n",
+ "   d83a98f..0ff52f0  main -> main\n",
+ "\n",
+ "Dropping the following result as it does not have all the necessary fields:\n",
+ "{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'dataset': {'name': 'Common Voice 11.0', 'type': 'mozilla-foundation/common_voice_11_0', 'config': 'ja', 'split': 'test', 'args': 'ja'}}\n",
+ "To https://huggingface.co/kimbochen/whisper-small-jp\n",
+ "   0ff52f0..22e3a01  main -> main\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'https://huggingface.co/kimbochen/whisper-small-jp/commit/0ff52f0f1d63daf816427096a83f7bbf8f3892eb'"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
 }
 ],
 "source": [