rristo committed on
Commit
cda0233
1 Parent(s): 3996908

update example

Browse files
Files changed (1) hide show
  1. example_usage.ipynb +262 -36
example_usage.ipynb CHANGED
@@ -2,31 +2,24 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "id": "5920c653-448e-43b3-93eb-12d7073ad352",
7
  "metadata": {
8
  "tags": []
9
  },
10
- "outputs": [
11
- {
12
- "name": "stderr",
13
- "output_type": "stream",
14
- "text": [
15
- "/opt/espnet/tools/anaconda/envs/espnet/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
16
- " from .autonotebook import tqdm as notebook_tqdm\n"
17
- ]
18
- }
19
- ],
20
  "source": [
21
- "from espnet2.bin.asr_inference import Speech2Text\n",
22
- "from espnet2.bin.asr_align import CTCSegmentation\n",
23
  "import soundfile\n",
24
- "import pandas as pd"
 
 
 
25
  ]
26
  },
27
  {
28
  "cell_type": "code",
29
- "execution_count": 2,
30
  "id": "83058587-1a8a-4b01-92ff-e9125fbe55a3",
31
  "metadata": {
32
  "tags": []
@@ -47,14 +40,15 @@
47
  },
48
  {
49
  "cell_type": "code",
50
- "execution_count": 3,
51
  "id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65",
52
  "metadata": {
53
  "tags": []
54
  },
55
  "outputs": [],
56
  "source": [
57
- "speech2text = Speech2Text(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\", quantize_asr_model=True, quantize_lm=True)"
 
58
  ]
59
  },
60
  {
@@ -69,7 +63,7 @@
69
  },
70
  {
71
  "cell_type": "code",
72
- "execution_count": 4,
73
  "id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11",
74
  "metadata": {
75
  "tags": []
@@ -82,7 +76,7 @@
82
  },
83
  {
84
  "cell_type": "code",
85
- "execution_count": 5,
86
  "id": "eec8d4b2-c27a-4780-aeed-8aa7538f70e5",
87
  "metadata": {
88
  "tags": []
@@ -92,8 +86,8 @@
92
  "name": "stdout",
93
  "output_type": "stream",
94
  "text": [
95
- "CPU times: user 2.64 s, sys: 6.23 ms, total: 2.65 s\n",
96
- "Wall time: 2.66 s\n"
97
  ]
98
  }
99
  ],
@@ -103,7 +97,7 @@
103
  },
104
  {
105
  "cell_type": "code",
106
- "execution_count": 6,
107
  "id": "39f41a8b-94c3-42d6-a989-6c7183a6f94d",
108
  "metadata": {
109
  "tags": []
@@ -123,7 +117,7 @@
123
  },
124
  {
125
  "cell_type": "code",
126
- "execution_count": 7,
127
  "id": "812060a6-90de-4134-8d1f-9f3d98853bc2",
128
  "metadata": {
129
  "tags": []
@@ -224,7 +218,7 @@
224
  },
225
  {
226
  "cell_type": "code",
227
- "execution_count": 10,
228
  "id": "ae9f7e3f-b75d-4bcb-98d1-ae2f037fb4af",
229
  "metadata": {
230
  "tags": []
@@ -244,7 +238,7 @@
244
  },
245
  {
246
  "cell_type": "code",
247
- "execution_count": 11,
248
  "id": "0215d312-1896-43f1-9782-c92aced787b7",
249
  "metadata": {
250
  "tags": []
@@ -254,8 +248,8 @@
254
  "name": "stdout",
255
  "output_type": "stream",
256
  "text": [
257
- "CPU times: user 2.96 s, sys: 19 ms, total: 2.98 s\n",
258
- "Wall time: 2.98 s\n"
259
  ]
260
  }
261
  ],
@@ -268,7 +262,7 @@
268
  },
269
  {
270
  "cell_type": "code",
271
- "execution_count": 12,
272
  "id": "d31d6840-3a80-411a-969c-05f4a5e3e9a1",
273
  "metadata": {
274
  "tags": []
@@ -506,20 +500,252 @@
506
  ]
507
  },
508
  {
509
- "cell_type": "code",
510
- "execution_count": null,
511
- "id": "7a4be2b1-5e0f-4558-8097-b37be0b83785",
512
  "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
513
  "outputs": [],
514
- "source": []
 
 
515
  },
516
  {
517
  "cell_type": "code",
518
- "execution_count": null,
519
- "id": "1e9d45ad-c8fc-4bab-9285-b82ff3903702",
520
- "metadata": {},
 
 
521
  "outputs": [],
522
- "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  }
524
  ],
525
  "metadata": {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 16,
6
  "id": "5920c653-448e-43b3-93eb-12d7073ad352",
7
  "metadata": {
8
  "tags": []
9
  },
10
+ "outputs": [],
 
 
 
 
 
 
 
 
 
11
  "source": [
12
+ "import time\n",
 
13
  "import soundfile\n",
14
+ "import pandas as pd\n",
15
+ "import matplotlib.pyplot as plt\n",
16
+ "from espnet2.bin.asr_inference import Speech2Text\n",
17
+ "from espnet2.bin.asr_align import CTCSegmentation"
18
  ]
19
  },
20
  {
21
  "cell_type": "code",
22
+ "execution_count": 5,
23
  "id": "83058587-1a8a-4b01-92ff-e9125fbe55a3",
24
  "metadata": {
25
  "tags": []
 
40
  },
41
  {
42
  "cell_type": "code",
43
+ "execution_count": 44,
44
  "id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65",
45
  "metadata": {
46
  "tags": []
47
  },
48
  "outputs": [],
49
  "source": [
50
+ "# A larger beam size takes more time but is more accurate; the default is 20\n",
51
+ "speech2text = Speech2Text(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\", quantize_asr_model=True, quantize_lm=True, beam_size=10)"
52
  ]
53
  },
54
  {
 
63
  },
64
  {
65
  "cell_type": "code",
66
+ "execution_count": 45,
67
  "id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11",
68
  "metadata": {
69
  "tags": []
 
76
  },
77
  {
78
  "cell_type": "code",
79
+ "execution_count": 46,
80
  "id": "eec8d4b2-c27a-4780-aeed-8aa7538f70e5",
81
  "metadata": {
82
  "tags": []
 
86
  "name": "stdout",
87
  "output_type": "stream",
88
  "text": [
89
+ "CPU times: user 1.71 s, sys: 9.89 ms, total: 1.72 s\n",
90
+ "Wall time: 1.75 s\n"
91
  ]
92
  }
93
  ],
 
97
  },
98
  {
99
  "cell_type": "code",
100
+ "execution_count": 47,
101
  "id": "39f41a8b-94c3-42d6-a989-6c7183a6f94d",
102
  "metadata": {
103
  "tags": []
 
117
  },
118
  {
119
  "cell_type": "code",
120
+ "execution_count": 32,
121
  "id": "812060a6-90de-4134-8d1f-9f3d98853bc2",
122
  "metadata": {
123
  "tags": []
 
218
  },
219
  {
220
  "cell_type": "code",
221
+ "execution_count": 25,
222
  "id": "ae9f7e3f-b75d-4bcb-98d1-ae2f037fb4af",
223
  "metadata": {
224
  "tags": []
 
238
  },
239
  {
240
  "cell_type": "code",
241
+ "execution_count": 26,
242
  "id": "0215d312-1896-43f1-9782-c92aced787b7",
243
  "metadata": {
244
  "tags": []
 
248
  "name": "stdout",
249
  "output_type": "stream",
250
  "text": [
251
+ "CPU times: user 1.68 s, sys: 0 ns, total: 1.68 s\n",
252
+ "Wall time: 1.68 s\n"
253
  ]
254
  }
255
  ],
 
262
  },
263
  {
264
  "cell_type": "code",
265
+ "execution_count": 27,
266
  "id": "d31d6840-3a80-411a-969c-05f4a5e3e9a1",
267
  "metadata": {
268
  "tags": []
 
500
  ]
501
  },
502
  {
503
+ "cell_type": "markdown",
504
+ "id": "6288dbee-b84b-4465-829e-978352a9f0e7",
 
505
  "metadata": {},
506
+ "source": [
507
+ "## Chunk audio to see how longer audio significantly increases transcription time"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": 1,
513
+ "id": "6e7af387-d4bf-486e-a12a-9689242793fe",
514
+ "metadata": {
515
+ "tags": []
516
+ },
517
  "outputs": [],
518
+ "source": [
519
+ "from subprocess import Popen, PIPE"
520
+ ]
521
  },
522
  {
523
  "cell_type": "code",
524
+ "execution_count": 7,
525
+ "id": "0d51f384-4e1d-435f-993e-351af6bc42ff",
526
+ "metadata": {
527
+ "tags": []
528
+ },
529
  "outputs": [],
530
+ "source": [
531
+ "def chunk_audio(src_file, to_file, start, end):\n",
532
+ " proc = Popen(['sox', src_file, to_file, 'trim', str(start), f'={end}'], stdout=PIPE, stderr=PIPE)\n",
533
+ " stdout, stderr = proc.communicate()\n",
534
+ " return stdout, stderr\n",
535
+ "\n",
536
+ "from_file='example_audio/oden_kypsis16k.wav'\n",
537
+ "to_files=[]\n",
538
+ "for i in range(5, 31):\n",
539
+ " to_file=f'example_audio/chunks/oden_kypsis16k_chunk_{i}.wav'\n",
540
+ " chunk_audio(from_file, to_file, 0, i)\n",
541
+ " to_files.append(to_file)"
542
+ ]
543
+ },
544
+ {
545
+ "cell_type": "code",
546
+ "execution_count": 38,
547
+ "id": "9aad1658-bdbc-479c-b1f9-89e52c6c2487",
548
+ "metadata": {
549
+ "tags": []
550
+ },
551
+ "outputs": [],
552
+ "source": [
553
+ "chunk_times=[]\n",
554
+ "for file in to_files:\n",
555
+ " speech, rate = soundfile.read(file)\n",
556
+ " assert rate == 16000\n",
557
+ " start=time.time()\n",
558
+ " text, *_ = speech2text(speech)\n",
559
+ " end=time.time()\n",
560
+ " duration=end-start\n",
561
+ " chunk_times.append([file, text[0], duration, len(speech)/16000])\n",
562
+ "df_chunk_times=pd.DataFrame(chunk_times)"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "code",
567
+ "execution_count": 39,
568
+ "id": "9d3cd39b-9199-493c-a4d9-4084c92d844a",
569
+ "metadata": {
570
+ "tags": []
571
+ },
572
+ "outputs": [
573
+ {
574
+ "data": {
575
+ "text/html": [
576
+ "<div>\n",
577
+ "<style scoped>\n",
578
+ " .dataframe tbody tr th:only-of-type {\n",
579
+ " vertical-align: middle;\n",
580
+ " }\n",
581
+ "\n",
582
+ " .dataframe tbody tr th {\n",
583
+ " vertical-align: top;\n",
584
+ " }\n",
585
+ "\n",
586
+ " .dataframe thead th {\n",
587
+ " text-align: right;\n",
588
+ " }\n",
589
+ "</style>\n",
590
+ "<table border=\"1\" class=\"dataframe\">\n",
591
+ " <thead>\n",
592
+ " <tr style=\"text-align: right;\">\n",
593
+ " <th></th>\n",
594
+ " <th>file</th>\n",
595
+ " <th>hyp</th>\n",
596
+ " <th>elapsed_time</th>\n",
597
+ " <th>audio_dur_sec</th>\n",
598
+ " <th>trans_time_audio_dur_share</th>\n",
599
+ " </tr>\n",
600
+ " </thead>\n",
601
+ " <tbody>\n",
602
+ " <tr>\n",
603
+ " <th>0</th>\n",
604
+ " <td>example_audio/chunks/oden_kypsis16k_chunk_5.wav</td>\n",
605
+ " <td>enamus ajast nagu klik</td>\n",
606
+ " <td>0.418611</td>\n",
607
+ " <td>5.0</td>\n",
608
+ " <td>0.083722</td>\n",
609
+ " </tr>\n",
610
+ " <tr>\n",
611
+ " <th>1</th>\n",
612
+ " <td>example_audio/chunks/oden_kypsis16k_chunk_6.wav</td>\n",
613
+ " <td>enamus ajast nagu klikid neid all</td>\n",
614
+ " <td>0.481883</td>\n",
615
+ " <td>6.0</td>\n",
616
+ " <td>0.080314</td>\n",
617
+ " </tr>\n",
618
+ " <tr>\n",
619
+ " <th>2</th>\n",
620
+ " <td>example_audio/chunks/oden_kypsis16k_chunk_7.wav</td>\n",
621
+ " <td>enamus ajast nagu klikid neid allserva tekivad</td>\n",
622
+ " <td>0.700862</td>\n",
623
+ " <td>7.0</td>\n",
624
+ " <td>0.100123</td>\n",
625
+ " </tr>\n",
626
+ " <tr>\n",
627
+ " <th>3</th>\n",
628
+ " <td>example_audio/chunks/oden_kypsis16k_chunk_8.wav</td>\n",
629
+ " <td>enamus ajast nagu klikid neid allserva tekivad...</td>\n",
630
+ " <td>0.839978</td>\n",
631
+ " <td>8.0</td>\n",
632
+ " <td>0.104997</td>\n",
633
+ " </tr>\n",
634
+ " <tr>\n",
635
+ " <th>4</th>\n",
636
+ " <td>example_audio/chunks/oden_kypsis16k_chunk_9.wav</td>\n",
637
+ " <td>enamus ajast nagu klikid neid allserva tekivad...</td>\n",
638
+ " <td>1.016149</td>\n",
639
+ " <td>9.0</td>\n",
640
+ " <td>0.112905</td>\n",
641
+ " </tr>\n",
642
+ " </tbody>\n",
643
+ "</table>\n",
644
+ "</div>"
645
+ ],
646
+ "text/plain": [
647
+ " file \\\n",
648
+ "0 example_audio/chunks/oden_kypsis16k_chunk_5.wav \n",
649
+ "1 example_audio/chunks/oden_kypsis16k_chunk_6.wav \n",
650
+ "2 example_audio/chunks/oden_kypsis16k_chunk_7.wav \n",
651
+ "3 example_audio/chunks/oden_kypsis16k_chunk_8.wav \n",
652
+ "4 example_audio/chunks/oden_kypsis16k_chunk_9.wav \n",
653
+ "\n",
654
+ " hyp elapsed_time \\\n",
655
+ "0 enamus ajast nagu klik 0.418611 \n",
656
+ "1 enamus ajast nagu klikid neid all 0.481883 \n",
657
+ "2 enamus ajast nagu klikid neid allserva tekivad 0.700862 \n",
658
+ "3 enamus ajast nagu klikid neid allserva tekivad... 0.839978 \n",
659
+ "4 enamus ajast nagu klikid neid allserva tekivad... 1.016149 \n",
660
+ "\n",
661
+ " audio_dur_sec trans_time_audio_dur_share \n",
662
+ "0 5.0 0.083722 \n",
663
+ "1 6.0 0.080314 \n",
664
+ "2 7.0 0.100123 \n",
665
+ "3 8.0 0.104997 \n",
666
+ "4 9.0 0.112905 "
667
+ ]
668
+ },
669
+ "execution_count": 39,
670
+ "metadata": {},
671
+ "output_type": "execute_result"
672
+ }
673
+ ],
674
+ "source": [
675
+ "df_chunk_times.columns=['file', 'hyp','elapsed_time', 'audio_dur_sec']\n",
676
+ "df_chunk_times['trans_time_audio_dur_share']=df_chunk_times.elapsed_time/df_chunk_times.audio_dur_sec\n",
677
+ "df_chunk_times=df_chunk_times.sort_values('audio_dur_sec')\n",
678
+ "df_chunk_times=df_chunk_times.reset_index(drop=True)\n",
679
+ "df_chunk_times.head()"
680
+ ]
681
+ },
682
+ {
683
+ "cell_type": "code",
684
+ "execution_count": 40,
685
+ "id": "1d8d9520-1bbd-43f5-ae7a-08643def9285",
686
+ "metadata": {
687
+ "tags": []
688
+ },
689
+ "outputs": [
690
+ {
691
+ "data": {
692
+ "text/plain": [
693
+ "<Axes: xlabel='elapsed_time', ylabel='audio_dur_sec'>"
694
+ ]
695
+ },
696
+ "execution_count": 40,
697
+ "metadata": {},
698
+ "output_type": "execute_result"
699
+ },
700
+ {
701
+ "data": {
702
+ "image/png": "",
703
+ "text/plain": [
704
+ "<Figure size 640x480 with 1 Axes>"
705
+ ]
706
+ },
707
+ "metadata": {},
708
+ "output_type": "display_data"
709
+ }
710
+ ],
711
+ "source": [
712
+ "df_chunk_times.plot.scatter('elapsed_time', 'audio_dur_sec')"
713
+ ]
714
+ },
715
+ {
716
+ "cell_type": "code",
717
+ "execution_count": 41,
718
+ "id": "fcd06626-4e6e-4461-bf6b-7495bcc825b5",
719
+ "metadata": {
720
+ "tags": []
721
+ },
722
+ "outputs": [
723
+ {
724
+ "data": {
725
+ "text/plain": [
726
+ "Text(0.5, 0, 'audio duration')"
727
+ ]
728
+ },
729
+ "execution_count": 41,
730
+ "metadata": {},
731
+ "output_type": "execute_result"
732
+ },
733
+ {
734
+ "data": {
735
+ "image/png": "",
736
+ "text/plain": [
737
+ "<Figure size 640x480 with 1 Axes>"
738
+ ]
739
+ },
740
+ "metadata": {},
741
+ "output_type": "display_data"
742
+ }
743
+ ],
744
+ "source": [
745
+ "df_chunk_times['trans_time_audio_dur_share'].plot()\n",
746
+ "plt.ylabel('transc time/audio duration ratio')\n",
747
+ "plt.xlabel('audio duration')"
748
+ ]
749
  }
750
  ],
751
  "metadata": {