update example
Browse files- example_usage.ipynb +459 -421
example_usage.ipynb
CHANGED
@@ -2,28 +2,30 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"id": "5920c653-448e-43b3-93eb-12d7073ad352",
|
7 |
"metadata": {
|
8 |
"tags": []
|
9 |
},
|
10 |
-
"outputs": [
|
11 |
-
{
|
12 |
-
"name": "stderr",
|
13 |
-
"output_type": "stream",
|
14 |
-
"text": [
|
15 |
-
"/opt/espnet/tools/anaconda/envs/espnet/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
16 |
-
" from .autonotebook import tqdm as notebook_tqdm\n",
|
17 |
-
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
|
18 |
-
"[nltk_data] /root/nltk_data...\n",
|
19 |
-
"[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n",
|
20 |
-
"[nltk_data] Downloading package cmudict to /root/nltk_data...\n",
|
21 |
-
"[nltk_data] Unzipping corpora/cmudict.zip.\n"
|
22 |
-
]
|
23 |
-
}
|
24 |
-
],
|
25 |
"source": [
|
26 |
-
"from espnet2.bin.asr_inference import Speech2Text"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
]
|
28 |
},
|
29 |
{
|
@@ -36,7 +38,7 @@
|
|
36 |
},
|
37 |
{
|
38 |
"cell_type": "code",
|
39 |
-
"execution_count":
|
40 |
"id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65",
|
41 |
"metadata": {
|
42 |
"tags": []
|
@@ -58,11 +60,45 @@
|
|
58 |
},
|
59 |
{
|
60 |
"cell_type": "code",
|
61 |
-
"execution_count":
|
62 |
"id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11",
|
63 |
"metadata": {
|
64 |
"tags": []
|
65 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
"outputs": [
|
67 |
{
|
68 |
"name": "stdout",
|
@@ -73,432 +109,434 @@
|
|
73 |
}
|
74 |
],
|
75 |
"source": [
|
76 |
-
"import soundfile\n",
|
77 |
-
"speech, rate = soundfile.read(\"example_audio/emt16k.wav\")\n",
|
78 |
-
"assert rate == 16000\n",
|
79 |
-
"text, *_ = speech2text(speech)\n",
|
80 |
"print(text[0])"
|
81 |
]
|
82 |
},
|
83 |
{
|
84 |
"cell_type": "code",
|
85 |
-
"execution_count":
|
86 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
"metadata": {
|
88 |
"tags": []
|
89 |
},
|
90 |
"outputs": [
|
91 |
{
|
92 |
"data": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
"text/plain": [
|
94 |
-
"
|
95 |
-
"
|
96 |
-
"
|
97 |
-
"
|
98 |
-
"
|
99 |
-
"
|
100 |
-
"
|
101 |
-
"
|
102 |
-
"
|
103 |
-
"
|
104 |
-
"
|
105 |
-
"
|
106 |
-
"
|
107 |
-
"
|
108 |
-
"
|
109 |
-
"
|
110 |
-
"
|
111 |
-
"
|
112 |
-
"
|
113 |
-
"
|
114 |
-
"
|
115 |
-
" 2003,\n",
|
116 |
-
" 1403,\n",
|
117 |
-
" 101,\n",
|
118 |
-
" 4,\n",
|
119 |
-
" 1403,\n",
|
120 |
-
" 13,\n",
|
121 |
-
" 303,\n",
|
122 |
-
" 3,\n",
|
123 |
-
" 13,\n",
|
124 |
-
" 2652,\n",
|
125 |
-
" 60,\n",
|
126 |
-
" 2940,\n",
|
127 |
-
" 410,\n",
|
128 |
-
" 143,\n",
|
129 |
-
" 376,\n",
|
130 |
-
" 15,\n",
|
131 |
-
" 88,\n",
|
132 |
-
" 467],\n",
|
133 |
-
" Hypothesis(yseq=tensor([4999, 160, 2003, 1403, 101, 4, 1403, 13, 303, 3, 13, 2652,\n",
|
134 |
-
" 60, 2940, 410, 143, 376, 15, 88, 467, 4999]), score=tensor(-11.9828), scores={'decoder': tensor(-14.2027), 'ctc': tensor(-9.7630)}, states={'decoder': [tensor([[ -5.1263, 3.1550, -2.0594, ..., 24.0211, 0.3968, -0.0377],\n",
|
135 |
-
" [ -6.2288, 3.8375, 1.5658, ..., 1.4794, -19.9161, -9.2051],\n",
|
136 |
-
" [ -0.8955, -12.1962, 11.2881, ..., 14.5357, 2.5597, 5.0430],\n",
|
137 |
-
" ...,\n",
|
138 |
-
" [ -1.5868, 0.9150, 7.6028, ..., 6.9893, -9.0237, -6.0684],\n",
|
139 |
-
" [ -0.2270, -5.4793, -4.3761, ..., 8.3707, 3.3640, -10.0500],\n",
|
140 |
-
" [ -3.5901, 5.2413, -6.3700, ..., 1.8956, -1.3764, 2.8161]]), tensor([[ -3.6827, 2.9746, -2.4586, ..., 21.4053, 0.7093, 0.0590],\n",
|
141 |
-
" [ -3.6746, 2.3007, 6.9940, ..., 0.4042, -26.4674, -7.6587],\n",
|
142 |
-
" [ -4.1463, -12.4967, 23.4652, ..., 20.2923, 5.0455, 2.7187],\n",
|
143 |
-
" ...,\n",
|
144 |
-
" [ -3.0836, 8.8793, 15.3461, ..., 6.7483, -10.2614, -12.3279],\n",
|
145 |
-
" [ -3.0388, -4.8860, 1.2961, ..., 7.1166, 7.1762, -11.6877],\n",
|
146 |
-
" [ -6.3154, 10.7874, -8.3653, ..., -2.7268, 6.2414, 2.0028]]), tensor([[ -4.2097, 3.3128, -4.3004, ..., 19.7619, 0.0523, 0.5325],\n",
|
147 |
-
" [ -2.0206, 5.0642, 3.8430, ..., 4.3120, -31.3507, -4.5523],\n",
|
148 |
-
" [ -4.9023, -11.4819, 30.5385, ..., 23.1088, 7.1218, 6.8997],\n",
|
149 |
-
" ...,\n",
|
150 |
-
" [ -7.4921, 14.9905, 18.8936, ..., 14.4608, -14.1381, -20.3670],\n",
|
151 |
-
" [ -7.2482, -3.5175, -1.7658, ..., 13.9724, 3.8404, -19.6644],\n",
|
152 |
-
" [ -4.5970, 14.7940, 1.5388, ..., -3.3481, 2.3485, 1.7171]]), tensor([[ -2.6693, 5.9532, -6.0862, ..., 18.1037, 1.8700, 1.8734],\n",
|
153 |
-
" [ -3.4684, 1.5924, -2.2865, ..., 17.7742, -34.5136, -5.1747],\n",
|
154 |
-
" [-14.9132, -16.1804, 39.0452, ..., 19.7372, 15.8912, 11.7538],\n",
|
155 |
-
" ...,\n",
|
156 |
-
" [-13.3347, 13.1394, 22.9955, ..., 15.1552, -26.7795, -35.4029],\n",
|
157 |
-
" [-20.9897, -12.5458, 2.4344, ..., 25.7768, 4.4565, -20.4715],\n",
|
158 |
-
" [-11.9333, 21.3926, 0.9612, ..., -7.3184, 1.1050, -8.7355]]), tensor([[ -8.3364, -8.5930, 12.5300, ..., 14.0010, 9.2563, 15.4071],\n",
|
159 |
-
" [-11.7589, -8.5604, -14.7583, ..., 27.1039, -22.6838, 8.2407],\n",
|
160 |
-
" [-26.8291, -28.9856, 66.4735, ..., 10.2667, 51.7921, 13.9935],\n",
|
161 |
-
" ...,\n",
|
162 |
-
" [-24.8075, 8.7724, 35.3504, ..., -15.1498, -51.6167, -44.5583],\n",
|
163 |
-
" [-25.6442, -8.3698, 14.0967, ..., 21.2507, 16.2526, -28.0444],\n",
|
164 |
-
" [ -9.2648, 33.3486, 10.6099, ..., -8.6877, 0.9520, -13.3015]]), tensor([[-30.5616, -7.4708, 56.1848, ..., 1.5934, 20.7831, 44.0383],\n",
|
165 |
-
" [-51.2013, 29.8913, -73.0093, ..., -2.1801, -11.0278, 151.5184],\n",
|
166 |
-
" [-74.1030, -51.1745, 158.5696, ..., 40.8046, 92.3456, 42.8557],\n",
|
167 |
-
" ...,\n",
|
168 |
-
" [-56.5551, 4.2527, 102.6477, ..., 4.8044, -99.0702, -65.3296],\n",
|
169 |
-
" [-22.1416, 8.2998, 48.0735, ..., -36.2092, 40.0850, 6.7160],\n",
|
170 |
-
" [-21.1855, 35.3772, -11.6307, ..., 17.0477, 6.7013, -32.2759]])], 'ctc': (tensor([[-1.0000e+10, -1.0000e+10],\n",
|
171 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
172 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
173 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
174 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
175 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
176 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
177 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
178 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
179 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
180 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
181 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
182 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
183 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
184 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
185 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
186 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
187 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
188 |
-
" [-1.0000e+10, -1.0000e+10],\n",
|
189 |
-
" [-3.9543e+02, -1.0000e+10],\n",
|
190 |
-
" [-3.9364e+02, -3.9543e+02],\n",
|
191 |
-
" [-3.9464e+02, -3.9349e+02],\n",
|
192 |
-
" [-3.9263e+02, -3.9321e+02],\n",
|
193 |
-
" [-3.8944e+02, -3.9219e+02],\n",
|
194 |
-
" [-3.8704e+02, -3.8938e+02],\n",
|
195 |
-
" [-3.8350e+02, -3.8695e+02],\n",
|
196 |
-
" [-3.8215e+02, -3.8347e+02],\n",
|
197 |
-
" [-3.8234e+02, -3.8191e+02],\n",
|
198 |
-
" [-3.8383e+02, -3.8141e+02],\n",
|
199 |
-
" [-3.8014e+02, -3.9431e+02],\n",
|
200 |
-
" [-3.8677e+02, -3.8014e+02],\n",
|
201 |
-
" [-3.8578e+02, -3.8013e+02],\n",
|
202 |
-
" [-3.8176e+02, -3.8013e+02],\n",
|
203 |
-
" [-3.8125e+02, -3.7995e+02],\n",
|
204 |
-
" [-3.8104e+02, -3.7971e+02],\n",
|
205 |
-
" [-3.7952e+02, -3.7948e+02],\n",
|
206 |
-
" [-3.7984e+02, -3.7881e+02],\n",
|
207 |
-
" [-3.8364e+02, -3.7850e+02],\n",
|
208 |
-
" [-3.8166e+02, -3.7851e+02],\n",
|
209 |
-
" [-3.8023e+02, -3.8624e+02],\n",
|
210 |
-
" [-3.8322e+02, -3.8023e+02],\n",
|
211 |
-
" [-3.8385e+02, -3.8019e+02],\n",
|
212 |
-
" [-3.8177e+02, -3.8016e+02],\n",
|
213 |
-
" [-3.7862e+02, -3.7998e+02],\n",
|
214 |
-
" [-3.7735e+02, -3.7839e+02],\n",
|
215 |
-
" [-3.7684e+02, -3.7705e+02],\n",
|
216 |
-
" [-3.7739e+02, -3.7624e+02],\n",
|
217 |
-
" [-3.7643e+02, -3.7597e+02],\n",
|
218 |
-
" [-3.7122e+02, -3.7548e+02],\n",
|
219 |
-
" [-3.6641e+02, -3.7120e+02],\n",
|
220 |
-
" [-3.6524e+02, -3.6640e+02],\n",
|
221 |
-
" [-3.6296e+02, -3.6497e+02],\n",
|
222 |
-
" [-3.6198e+02, -3.6283e+02],\n",
|
223 |
-
" [-3.5907e+02, -3.6162e+02],\n",
|
224 |
-
" [-3.4759e+02, -3.6746e+02],\n",
|
225 |
-
" [-3.4178e+02, -3.4759e+02],\n",
|
226 |
-
" [-3.4269e+02, -3.4178e+02],\n",
|
227 |
-
" [-3.4194e+02, -3.4144e+02],\n",
|
228 |
-
" [-3.3753e+02, -3.4097e+02],\n",
|
229 |
-
" [-3.3310e+02, -3.3750e+02],\n",
|
230 |
-
" [-3.2918e+02, -3.3309e+02],\n",
|
231 |
-
" [-3.2564e+02, -3.2916e+02],\n",
|
232 |
-
" [-3.2297e+02, -3.2561e+02],\n",
|
233 |
-
" [-3.2136e+02, -3.2290e+02],\n",
|
234 |
-
" [-3.1502e+02, -3.2117e+02],\n",
|
235 |
-
" [-3.1264e+02, -3.1502e+02],\n",
|
236 |
-
" [-3.1198e+02, -3.1255e+02],\n",
|
237 |
-
" [-3.1081e+02, -3.1153e+02],\n",
|
238 |
-
" [-3.1095e+02, -3.1041e+02],\n",
|
239 |
-
" [-3.0918e+02, -3.0995e+02],\n",
|
240 |
-
" [-3.0741e+02, -3.0880e+02],\n",
|
241 |
-
" [-3.0711e+02, -3.0719e+02],\n",
|
242 |
-
" [-3.0739e+02, -3.0645e+02],\n",
|
243 |
-
" [-3.0782e+02, -3.0612e+02],\n",
|
244 |
-
" [-3.0775e+02, -3.0595e+02],\n",
|
245 |
-
" [-3.0780e+02, -3.0580e+02],\n",
|
246 |
-
" [-3.0755e+02, -3.0567e+02],\n",
|
247 |
-
" [-3.0734e+02, -3.0553e+02],\n",
|
248 |
-
" [-3.0679e+02, -3.0538e+02],\n",
|
249 |
-
" [-3.0643e+02, -3.0516e+02],\n",
|
250 |
-
" [-3.0644e+02, -3.0491e+02],\n",
|
251 |
-
" [-3.0617e+02, -3.0472e+02],\n",
|
252 |
-
" [-3.0556e+02, -3.0451e+02],\n",
|
253 |
-
" [-3.0395e+02, -3.0421e+02],\n",
|
254 |
-
" [-3.0080e+02, -3.0338e+02],\n",
|
255 |
-
" [-2.9807e+02, -3.0073e+02],\n",
|
256 |
-
" [-2.9489e+02, -2.9800e+02],\n",
|
257 |
-
" [-2.9230e+02, -2.9485e+02],\n",
|
258 |
-
" [-2.9033e+02, -2.9223e+02],\n",
|
259 |
-
" [-2.8882e+02, -2.9019e+02],\n",
|
260 |
-
" [-2.9074e+02, -2.8859e+02],\n",
|
261 |
-
" [-2.9428e+02, -2.8848e+02],\n",
|
262 |
-
" [-2.8970e+02, -2.8848e+02],\n",
|
263 |
-
" [-2.8296e+02, -2.9165e+02],\n",
|
264 |
-
" [-2.9051e+02, -2.8296e+02],\n",
|
265 |
-
" [-2.8984e+02, -2.8296e+02],\n",
|
266 |
-
" [-2.8508e+02, -2.8296e+02],\n",
|
267 |
-
" [-2.8530e+02, -2.8285e+02],\n",
|
268 |
-
" [-2.8455e+02, -2.8277e+02],\n",
|
269 |
-
" [-2.8364e+02, -2.8261e+02],\n",
|
270 |
-
" [-2.8490e+02, -2.8231e+02],\n",
|
271 |
-
" [-2.8613e+02, -2.8223e+02],\n",
|
272 |
-
" [-2.8638e+02, -2.8221e+02],\n",
|
273 |
-
" [-2.8644e+02, -2.8220e+02],\n",
|
274 |
-
" [-2.8776e+02, -2.8218e+02],\n",
|
275 |
-
" [-2.9038e+02, -2.8218e+02],\n",
|
276 |
-
" [-2.8499e+02, -2.9242e+02],\n",
|
277 |
-
" [-2.9040e+02, -2.8500e+02],\n",
|
278 |
-
" [-2.9231e+02, -2.8500e+02],\n",
|
279 |
-
" [-2.9122e+02, -2.8500e+02],\n",
|
280 |
-
" [-2.8800e+02, -2.8500e+02],\n",
|
281 |
-
" [-2.8509e+02, -2.8495e+02],\n",
|
282 |
-
" [-2.8517e+02, -2.8432e+02],\n",
|
283 |
-
" [-2.8559e+02, -2.8397e+02],\n",
|
284 |
-
" [-2.8567e+02, -2.8379e+02],\n",
|
285 |
-
" [-2.8590e+02, -2.8365e+02],\n",
|
286 |
-
" [-2.8608e+02, -2.8355e+02],\n",
|
287 |
-
" [-2.8644e+02, -2.8347e+02],\n",
|
288 |
-
" [-2.8695e+02, -2.8342e+02],\n",
|
289 |
-
" [-2.8715e+02, -2.8339e+02],\n",
|
290 |
-
" [-2.8738e+02, -2.8337e+02],\n",
|
291 |
-
" [-2.8759e+02, -2.8335e+02],\n",
|
292 |
-
" [-2.8722e+02, -2.8334e+02],\n",
|
293 |
-
" [-2.8677e+02, -2.8331e+02],\n",
|
294 |
-
" [-2.8624e+02, -2.8328e+02],\n",
|
295 |
-
" [-2.8576e+02, -2.8323e+02],\n",
|
296 |
-
" [-2.8529e+02, -2.8316e+02],\n",
|
297 |
-
" [-2.8409e+02, -2.8304e+02],\n",
|
298 |
-
" [-2.8393e+02, -2.8274e+02],\n",
|
299 |
-
" [-2.8336e+02, -2.8248e+02],\n",
|
300 |
-
" [-2.8150e+02, -2.8213e+02],\n",
|
301 |
-
" [-2.7884e+02, -2.8107e+02],\n",
|
302 |
-
" [-2.7735e+02, -2.7874e+02],\n",
|
303 |
-
" [-2.7646e+02, -2.7713e+02],\n",
|
304 |
-
" [-2.7592e+02, -2.7605e+02],\n",
|
305 |
-
" [-2.7555e+02, -2.7529e+02],\n",
|
306 |
-
" [-2.7501e+02, -2.7472e+02],\n",
|
307 |
-
" [-2.7359e+02, -2.7416e+02],\n",
|
308 |
-
" [-2.7206e+02, -2.7314e+02],\n",
|
309 |
-
" [-2.7034e+02, -2.7176e+02],\n",
|
310 |
-
" [-2.6631e+02, -2.7013e+02],\n",
|
311 |
-
" [-2.6184e+02, -2.6629e+02],\n",
|
312 |
-
" [-2.5695e+02, -2.6183e+02],\n",
|
313 |
-
" [-2.5373e+02, -2.5694e+02],\n",
|
314 |
-
" [-2.5259e+02, -2.5369e+02],\n",
|
315 |
-
" [-2.5437e+02, -2.5230e+02],\n",
|
316 |
-
" [-2.5365e+02, -2.5218e+02],\n",
|
317 |
-
" [-2.5339e+02, -2.5198e+02],\n",
|
318 |
-
" [-2.5424e+02, -2.5176e+02],\n",
|
319 |
-
" [-2.5564e+02, -2.5168e+02],\n",
|
320 |
-
" [-2.5749e+02, -2.5166e+02],\n",
|
321 |
-
" [-2.5639e+02, -2.5166e+02],\n",
|
322 |
-
" [-2.4779e+02, -2.5773e+02],\n",
|
323 |
-
" [-2.5443e+02, -2.4779e+02],\n",
|
324 |
-
" [-2.5357e+02, -2.4779e+02],\n",
|
325 |
-
" [-2.4874e+02, -2.4779e+02],\n",
|
326 |
-
" [-2.4432e+02, -2.4747e+02],\n",
|
327 |
-
" [-2.3843e+02, -2.4428e+02],\n",
|
328 |
-
" [-2.3581e+02, -2.3843e+02],\n",
|
329 |
-
" [-2.3313e+02, -2.3574e+02],\n",
|
330 |
-
" [-2.3312e+02, -2.3306e+02],\n",
|
331 |
-
" [-2.3310e+02, -2.3239e+02],\n",
|
332 |
-
" [-2.2699e+02, -2.3199e+02],\n",
|
333 |
-
" [-2.2357e+02, -2.2699e+02],\n",
|
334 |
-
" [-2.1992e+02, -2.2354e+02],\n",
|
335 |
-
" [-2.1943e+02, -2.1989e+02],\n",
|
336 |
-
" [-2.1140e+02, -2.2310e+02],\n",
|
337 |
-
" [-2.1110e+02, -2.1140e+02],\n",
|
338 |
-
" [-2.1043e+02, -2.1055e+02],\n",
|
339 |
-
" [-2.0790e+02, -2.0981e+02],\n",
|
340 |
-
" [-2.0805e+02, -2.0777e+02],\n",
|
341 |
-
" [-2.0844e+02, -2.0721e+02],\n",
|
342 |
-
" [-2.0603e+02, -2.0695e+02],\n",
|
343 |
-
" [-2.0277e+02, -2.0569e+02],\n",
|
344 |
-
" [-1.9900e+02, -2.0271e+02],\n",
|
345 |
-
" [-1.9703e+02, -1.9898e+02],\n",
|
346 |
-
" [-1.9753e+02, -1.9690e+02],\n",
|
347 |
-
" [-1.9758e+02, -1.9647e+02],\n",
|
348 |
-
" [-1.9675e+02, -1.9619e+02],\n",
|
349 |
-
" [-1.9541e+02, -1.9574e+02],\n",
|
350 |
-
" [-1.9552e+02, -1.9487e+02],\n",
|
351 |
-
" [-1.9666e+02, -1.9446e+02],\n",
|
352 |
-
" [-1.9090e+02, -1.9553e+02],\n",
|
353 |
-
" [-1.8694e+02, -1.9095e+02],\n",
|
354 |
-
" [-1.9004e+02, -1.8693e+02],\n",
|
355 |
-
" [-1.8831e+02, -1.8688e+02],\n",
|
356 |
-
" [-1.8653e+02, -1.8667e+02],\n",
|
357 |
-
" [-1.8658e+02, -1.8591e+02],\n",
|
358 |
-
" [-1.8689e+02, -1.8549e+02],\n",
|
359 |
-
" [-1.8719e+02, -1.8527e+02],\n",
|
360 |
-
" [-1.8760e+02, -1.8514e+02],\n",
|
361 |
-
" [-1.8826e+02, -1.8506e+02],\n",
|
362 |
-
" [-1.8857e+02, -1.8502e+02],\n",
|
363 |
-
" [-1.8863e+02, -1.8499e+02],\n",
|
364 |
-
" [-1.8759e+02, -1.8496e+02],\n",
|
365 |
-
" [-1.8709e+02, -1.8489e+02],\n",
|
366 |
-
" [-1.8714e+02, -1.8479e+02],\n",
|
367 |
-
" [-1.8638e+02, -1.8469e+02],\n",
|
368 |
-
" [-1.8282e+02, -1.8452e+02],\n",
|
369 |
-
" [-1.8101e+02, -1.8265e+02],\n",
|
370 |
-
" [-1.8089e+02, -1.8084e+02],\n",
|
371 |
-
" [-1.8437e+02, -1.8017e+02],\n",
|
372 |
-
" [-1.8485e+02, -1.8016e+02],\n",
|
373 |
-
" [-1.8380e+02, -1.8015e+02],\n",
|
374 |
-
" [-1.8273e+02, -1.8012e+02],\n",
|
375 |
-
" [-1.8204e+02, -1.8005e+02],\n",
|
376 |
-
" [-1.8135e+02, -1.7992e+02],\n",
|
377 |
-
" [-1.8096e+02, -1.7971e+02],\n",
|
378 |
-
" [-1.8007e+02, -1.7947e+02],\n",
|
379 |
-
" [-1.8093e+02, -1.7904e+02],\n",
|
380 |
-
" [-1.8179e+02, -1.7891e+02],\n",
|
381 |
-
" [-1.7581e+02, -1.8068e+02],\n",
|
382 |
-
" [-1.7191e+02, -1.7584e+02],\n",
|
383 |
-
" [-1.7005e+02, -1.7195e+02],\n",
|
384 |
-
" [-1.6664e+02, -1.6997e+02],\n",
|
385 |
-
" [-1.5918e+02, -1.6662e+02],\n",
|
386 |
-
" [-1.5845e+02, -1.5919e+02],\n",
|
387 |
-
" [-1.5920e+02, -1.5808e+02],\n",
|
388 |
-
" [-1.5987e+02, -1.5787e+02],\n",
|
389 |
-
" [-1.6164e+02, -1.5777e+02],\n",
|
390 |
-
" [-1.6259e+02, -1.5776e+02],\n",
|
391 |
-
" [-1.5753e+02, -1.6083e+02],\n",
|
392 |
-
" [-1.5305e+02, -1.5760e+02],\n",
|
393 |
-
" [-1.5195e+02, -1.5419e+02],\n",
|
394 |
-
" [-1.5250e+02, -1.5235e+02],\n",
|
395 |
-
" [-1.4619e+02, -1.5538e+02],\n",
|
396 |
-
" [-1.4701e+02, -1.4619e+02],\n",
|
397 |
-
" [-1.4396e+02, -1.4594e+02],\n",
|
398 |
-
" [-1.4333e+02, -1.4386e+02],\n",
|
399 |
-
" [-1.4368e+02, -1.4288e+02],\n",
|
400 |
-
" [-1.3974e+02, -1.4251e+02],\n",
|
401 |
-
" [-1.3701e+02, -1.3969e+02],\n",
|
402 |
-
" [-1.3037e+02, -1.4083e+02],\n",
|
403 |
-
" [-1.2646e+02, -1.3052e+02],\n",
|
404 |
-
" [-1.2280e+02, -1.2654e+02],\n",
|
405 |
-
" [-1.2127e+02, -1.2277e+02],\n",
|
406 |
-
" [-1.1802e+02, -1.2107e+02],\n",
|
407 |
-
" [-1.0934e+02, -1.2088e+02],\n",
|
408 |
-
" [-1.0717e+02, -1.0934e+02],\n",
|
409 |
-
" [-1.0696e+02, -1.0706e+02],\n",
|
410 |
-
" [-1.0192e+02, -1.0632e+02],\n",
|
411 |
-
" [-9.6160e+01, -1.0192e+02],\n",
|
412 |
-
" [-9.1622e+01, -9.6166e+01],\n",
|
413 |
-
" [-9.0682e+01, -9.1624e+01],\n",
|
414 |
-
" [-9.0639e+01, -9.0360e+01],\n",
|
415 |
-
" [-9.3666e+01, -8.9798e+01],\n",
|
416 |
-
" [-8.5321e+01, -9.6196e+01],\n",
|
417 |
-
" [-8.7987e+01, -8.5552e+01],\n",
|
418 |
-
" [-8.8252e+01, -8.6939e+01],\n",
|
419 |
-
" [-8.5141e+01, -8.7025e+01],\n",
|
420 |
-
" [-6.2668e+01, -8.9043e+01],\n",
|
421 |
-
" [-5.8723e+01, -6.2668e+01],\n",
|
422 |
-
" [-5.4701e+01, -5.8703e+01],\n",
|
423 |
-
" [-5.2488e+01, -5.4683e+01],\n",
|
424 |
-
" [-4.8607e+01, -5.2382e+01],\n",
|
425 |
-
" [-4.4409e+01, -4.8588e+01],\n",
|
426 |
-
" [-3.6444e+01, -5.1356e+01],\n",
|
427 |
-
" [-3.5016e+01, -3.6445e+01],\n",
|
428 |
-
" [-3.5325e+01, -3.4801e+01],\n",
|
429 |
-
" [-3.4360e+01, -3.4336e+01],\n",
|
430 |
-
" [-3.2938e+01, -3.3655e+01],\n",
|
431 |
-
" [-3.0829e+01, -3.2540e+01],\n",
|
432 |
-
" [-2.9628e+01, -3.0663e+01],\n",
|
433 |
-
" [-2.9922e+01, -2.9324e+01],\n",
|
434 |
-
" [-3.0902e+01, -2.8886e+01],\n",
|
435 |
-
" [-3.1163e+01, -2.8761e+01],\n",
|
436 |
-
" [-2.7488e+01, -2.8674e+01],\n",
|
437 |
-
" [-2.6982e+01, -2.7221e+01],\n",
|
438 |
-
" [-3.0372e+01, -2.6402e+01],\n",
|
439 |
-
" [-2.9038e+01, -2.6383e+01],\n",
|
440 |
-
" [-2.8164e+01, -2.6316e+01],\n",
|
441 |
-
" [-2.7254e+01, -2.6178e+01],\n",
|
442 |
-
" [-2.4680e+01, -2.6031e+01],\n",
|
443 |
-
" [-2.5801e+01, -2.4487e+01],\n",
|
444 |
-
" [-2.6773e+01, -2.4260e+01],\n",
|
445 |
-
" [-2.5797e+01, -2.4190e+01],\n",
|
446 |
-
" [-2.6708e+01, -2.4018e+01],\n",
|
447 |
-
" [-2.7858e+01, -2.3958e+01],\n",
|
448 |
-
" [-2.2139e+01, -2.5121e+01],\n",
|
449 |
-
" [-2.8328e+01, -2.2100e+01],\n",
|
450 |
-
" [-2.7177e+01, -2.2113e+01],\n",
|
451 |
-
" [-2.5953e+01, -2.2147e+01],\n",
|
452 |
-
" [-2.5987e+01, -2.2175e+01],\n",
|
453 |
-
" [-2.6544e+01, -2.2229e+01],\n",
|
454 |
-
" [-2.7662e+01, -2.2247e+01],\n",
|
455 |
-
" [-2.9118e+01, -2.2250e+01],\n",
|
456 |
-
" [-2.5194e+01, -2.2554e+01],\n",
|
457 |
-
" [-2.5209e+01, -2.2896e+01],\n",
|
458 |
-
" [-3.0700e+01, -2.2829e+01],\n",
|
459 |
-
" [-2.9220e+01, -2.2893e+01],\n",
|
460 |
-
" [-2.9130e+01, -2.2960e+01],\n",
|
461 |
-
" [-2.9412e+01, -2.2982e+01],\n",
|
462 |
-
" [-2.9311e+01, -2.2987e+01],\n",
|
463 |
-
" [-2.9602e+01, -2.2988e+01],\n",
|
464 |
-
" [-3.0431e+01, -2.2988e+01],\n",
|
465 |
-
" [-3.1378e+01, -2.2987e+01],\n",
|
466 |
-
" [-3.1774e+01, -2.2987e+01],\n",
|
467 |
-
" [-3.2103e+01, -2.2987e+01],\n",
|
468 |
-
" [-3.2398e+01, -2.2987e+01],\n",
|
469 |
-
" [-3.2698e+01, -2.2987e+01],\n",
|
470 |
-
" [-3.2968e+01, -2.2987e+01],\n",
|
471 |
-
" [-3.3121e+01, -2.2987e+01],\n",
|
472 |
-
" [-3.3213e+01, -2.2987e+01],\n",
|
473 |
-
" [-3.3279e+01, -2.2987e+01],\n",
|
474 |
-
" [-3.3436e+01, -2.2987e+01],\n",
|
475 |
-
" [-3.3520e+01, -2.2987e+01],\n",
|
476 |
-
" [-3.3524e+01, -2.2987e+01],\n",
|
477 |
-
" [-3.3570e+01, -2.2987e+01],\n",
|
478 |
-
" [-3.3440e+01, -2.2987e+01],\n",
|
479 |
-
" [-3.3221e+01, -2.2987e+01],\n",
|
480 |
-
" [-3.0416e+01, -2.2987e+01],\n",
|
481 |
-
" [-2.9359e+01, -2.2986e+01],\n",
|
482 |
-
" [-2.9178e+01, -2.2985e+01],\n",
|
483 |
-
" [-2.9116e+01, -2.2983e+01],\n",
|
484 |
-
" [-2.9091e+01, -2.2981e+01],\n",
|
485 |
-
" [-2.9035e+01, -2.2979e+01],\n",
|
486 |
-
" [-2.8889e+01, -2.2977e+01]]), tensor([-9.7630, -9.7630, -9.7630, ..., -9.7630, -9.7630, -9.7630]), 0, 0)}))"
|
487 |
]
|
488 |
},
|
489 |
-
"execution_count":
|
490 |
"metadata": {},
|
491 |
"output_type": "execute_result"
|
492 |
}
|
493 |
],
|
494 |
"source": [
|
495 |
-
"
|
496 |
]
|
497 |
},
|
498 |
{
|
499 |
"cell_type": "code",
|
500 |
"execution_count": null,
|
501 |
-
"id": "
|
502 |
"metadata": {},
|
503 |
"outputs": [],
|
504 |
"source": []
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 80,
|
6 |
"id": "5920c653-448e-43b3-93eb-12d7073ad352",
|
7 |
"metadata": {
|
8 |
"tags": []
|
9 |
},
|
10 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
"source": [
|
12 |
+
"from espnet2.bin.asr_inference import Speech2Text\n",
|
13 |
+
"from espnet2.bin.asr_align import CTCSegmentation\n",
|
14 |
+
"import soundfile\n",
|
15 |
+
"import pandas as pd"
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"cell_type": "code",
|
20 |
+
"execution_count": 81,
|
21 |
+
"id": "83058587-1a8a-4b01-92ff-e9125fbe55a3",
|
22 |
+
"metadata": {
|
23 |
+
"tags": []
|
24 |
+
},
|
25 |
+
"outputs": [],
|
26 |
+
"source": [
|
27 |
+
"import torch\n",
|
28 |
+
"torch.set_num_threads(1)"
|
29 |
]
|
30 |
},
|
31 |
{
|
|
|
38 |
},
|
39 |
{
|
40 |
"cell_type": "code",
|
41 |
+
"execution_count": 82,
|
42 |
"id": "5e4670d6-0949-48cf-b6b1-d9cc4cf3ad65",
|
43 |
"metadata": {
|
44 |
"tags": []
|
|
|
60 |
},
|
61 |
{
|
62 |
"cell_type": "code",
|
63 |
+
"execution_count": 92,
|
64 |
"id": "e8120e8e-3718-4a1a-ab7a-46ef98a6bc11",
|
65 |
"metadata": {
|
66 |
"tags": []
|
67 |
},
|
68 |
+
"outputs": [],
|
69 |
+
"source": [
|
70 |
+
"speech, rate = soundfile.read(\"example_audio/emt16k.wav\")\n",
|
71 |
+
"assert rate == 16000"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"cell_type": "code",
|
76 |
+
"execution_count": 93,
|
77 |
+
"id": "eec8d4b2-c27a-4780-aeed-8aa7538f70e5",
|
78 |
+
"metadata": {
|
79 |
+
"tags": []
|
80 |
+
},
|
81 |
+
"outputs": [
|
82 |
+
{
|
83 |
+
"name": "stdout",
|
84 |
+
"output_type": "stream",
|
85 |
+
"text": [
|
86 |
+
"CPU times: user 3.67 s, sys: 191 ms, total: 3.87 s\n",
|
87 |
+
"Wall time: 3.86 s\n"
|
88 |
+
]
|
89 |
+
}
|
90 |
+
],
|
91 |
+
"source": [
|
92 |
+
"%time text, *_ = speech2text(speech)"
|
93 |
+
]
|
94 |
+
},
|
95 |
+
{
|
96 |
+
"cell_type": "code",
|
97 |
+
"execution_count": 94,
|
98 |
+
"id": "39f41a8b-94c3-42d6-a989-6c7183a6f94d",
|
99 |
+
"metadata": {
|
100 |
+
"tags": []
|
101 |
+
},
|
102 |
"outputs": [
|
103 |
{
|
104 |
"name": "stdout",
|
|
|
109 |
}
|
110 |
],
|
111 |
"source": [
|
|
|
|
|
|
|
|
|
112 |
"print(text[0])"
|
113 |
]
|
114 |
},
|
115 |
{
|
116 |
"cell_type": "code",
|
117 |
+
"execution_count": 116,
|
118 |
+
"id": "812060a6-90de-4134-8d1f-9f3d98853bc2",
|
119 |
+
"metadata": {
|
120 |
+
"tags": []
|
121 |
+
},
|
122 |
+
"outputs": [
|
123 |
+
{
|
124 |
+
"name": "stdout",
|
125 |
+
"output_type": "stream",
|
126 |
+
"text": [
|
127 |
+
"\n",
|
128 |
+
"Input File : 'example_audio/emt16k.wav'\n",
|
129 |
+
"Channels : 1\n",
|
130 |
+
"Sample Rate : 16000\n",
|
131 |
+
"Precision : 16-bit\n",
|
132 |
+
"Duration : 00:00:12.74 = 203815 samples ~ 955.383 CDDA sectors\n",
|
133 |
+
"File Size : 408k\n",
|
134 |
+
"Bit Rate : 256k\n",
|
135 |
+
"Sample Encoding: 16-bit Signed Integer PCM\n",
|
136 |
+
"\n"
|
137 |
+
]
|
138 |
+
}
|
139 |
+
],
|
140 |
+
"source": [
|
141 |
+
"!soxi example_audio/emt16k.wav"
|
142 |
+
]
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"cell_type": "markdown",
|
146 |
+
"id": "7d07e8a4-1dbf-4a79-bdf0-aeaeb160ba19",
|
147 |
+
"metadata": {},
|
148 |
+
"source": [
|
149 |
+
"## Example token level alignment"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 95,
|
155 |
+
"id": "e6b7331c-52f1-4162-b564-2e6a08b325b0",
|
156 |
+
"metadata": {
|
157 |
+
"tags": []
|
158 |
+
},
|
159 |
+
"outputs": [
|
160 |
+
{
|
161 |
+
"name": "stderr",
|
162 |
+
"output_type": "stream",
|
163 |
+
"text": [
|
164 |
+
"WARNING:root:No RNN model detected; memory consumption may be high.\n"
|
165 |
+
]
|
166 |
+
}
|
167 |
+
],
|
168 |
+
"source": [
|
169 |
+
"aligner = CTCSegmentation(\"exp/config.yaml\", \"exp/valid.acc.ave_10best.pth\" , kaldi_style_text=False, blank_transition_cost_zero=True)\n",
|
170 |
+
"segments = aligner(speech, text[0].split())"
|
171 |
+
]
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"cell_type": "code",
|
175 |
+
"execution_count": 96,
|
176 |
+
"id": "e6d18b5f-3d2a-4fcf-bf4e-00480e58094a",
|
177 |
+
"metadata": {
|
178 |
+
"tags": []
|
179 |
+
},
|
180 |
+
"outputs": [
|
181 |
+
{
|
182 |
+
"name": "stdout",
|
183 |
+
"output_type": "stream",
|
184 |
+
"text": [
|
185 |
+
"utt_0000 utt 0.36 0.78 -0.0001 mina\n",
|
186 |
+
"utt_0001 utt 0.78 1.19 -0.0003 tahaksin\n",
|
187 |
+
"utt_0002 utt 1.19 1.59 -0.0017 homme\n",
|
188 |
+
"utt_0003 utt 1.67 2.19 -0.0001 täna\n",
|
189 |
+
"utt_0004 utt 3.24 3.76 -0.0037 ja\n",
|
190 |
+
"utt_0005 utt 3.76 4.28 -0.0000 homme\n",
|
191 |
+
"utt_0006 utt 5.61 6.13 -0.0001 kui\n",
|
192 |
+
"utt_0007 utt 6.17 6.69 -0.0009 saanud\n",
|
193 |
+
"utt_0008 utt 6.81 7.33 -0.0018 on\n",
|
194 |
+
"utt_0009 utt 7.98 8.50 -0.0862 kui\n",
|
195 |
+
"utt_0010 utt 8.50 9.34 -0.1062 krampsumas\n",
|
196 |
+
"utt_0011 utt 9.34 9.54 -0.1183 ise\n",
|
197 |
+
"utt_0012 utt 9.54 10.07 -0.2033 veiki\n",
|
198 |
+
"utt_0013 utt 10.07 10.31 -0.1041 panna\n",
|
199 |
+
"\n"
|
200 |
+
]
|
201 |
+
}
|
202 |
+
],
|
203 |
+
"source": [
|
204 |
+
"print(segments)"
|
205 |
+
]
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"cell_type": "markdown",
|
209 |
+
"id": "77f82a7d-08dc-40cb-88e5-48ef8c36af7d",
|
210 |
+
"metadata": {
|
211 |
+
"tags": []
|
212 |
+
},
|
213 |
+
"source": [
|
214 |
+
"## Get timestamps with some correction"
|
215 |
+
]
|
216 |
+
},
|
217 |
+
{
|
218 |
+
"cell_type": "code",
|
219 |
+
"execution_count": 97,
|
220 |
+
"id": "ae9f7e3f-b75d-4bcb-98d1-ae2f037fb4af",
|
221 |
+
"metadata": {
|
222 |
+
"tags": []
|
223 |
+
},
|
224 |
+
"outputs": [],
|
225 |
+
"source": [
|
226 |
+
"def get_timestamps(aligner, speech, text, time_correction=0.2):\n",
|
227 |
+
" tokens=text.split()\n",
|
228 |
+
" segments = aligner(speech, tokens)\n",
|
229 |
+
" df=pd.DataFrame(segments.segments)\n",
|
230 |
+
" df.columns=['start', 'end', 'confidence']\n",
|
231 |
+
" df['start']=df.start+time_correction\n",
|
232 |
+
" df['end']=df.end+time_correction\n",
|
233 |
+
" df['words']=tokens\n",
|
234 |
+
" return df"
|
235 |
+
]
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"cell_type": "code",
|
239 |
+
"execution_count": 128,
|
240 |
+
"id": "93aa6281-3b73-47b7-93ca-e90fedd8d398",
|
241 |
+
"metadata": {
|
242 |
+
"tags": []
|
243 |
+
},
|
244 |
+
"outputs": [],
|
245 |
+
"source": [
|
246 |
+
"torch.set_num_threads(5)"
|
247 |
+
]
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"cell_type": "code",
|
251 |
+
"execution_count": 131,
|
252 |
+
"id": "0215d312-1896-43f1-9782-c92aced787b7",
|
253 |
+
"metadata": {
|
254 |
+
"tags": []
|
255 |
+
},
|
256 |
+
"outputs": [
|
257 |
+
{
|
258 |
+
"name": "stdout",
|
259 |
+
"output_type": "stream",
|
260 |
+
"text": [
|
261 |
+
"CPU times: user 14.4 s, sys: 89.6 ms, total: 14.5 s\n",
|
262 |
+
"Wall time: 2.9 s\n"
|
263 |
+
]
|
264 |
+
}
|
265 |
+
],
|
266 |
+
"source": [
|
267 |
+
"speech, rate = soundfile.read(\"example_audio/oden_kypsis16k_subset2.wav\")\n",
|
268 |
+
"assert rate == 16000\n",
|
269 |
+
"\n",
|
270 |
+
"%time text, *_ = speech2text(speech)"
|
271 |
+
]
|
272 |
+
},
|
273 |
+
{
|
274 |
+
"cell_type": "code",
|
275 |
+
"execution_count": 132,
|
276 |
+
"id": "d31d6840-3a80-411a-969c-05f4a5e3e9a1",
|
277 |
+
"metadata": {
|
278 |
+
"tags": []
|
279 |
+
},
|
280 |
+
"outputs": [
|
281 |
+
{
|
282 |
+
"name": "stdout",
|
283 |
+
"output_type": "stream",
|
284 |
+
"text": [
|
285 |
+
"\n",
|
286 |
+
"Input File : 'example_audio/oden_kypsis16k_subset2.wav'\n",
|
287 |
+
"Channels : 1\n",
|
288 |
+
"Sample Rate : 16000\n",
|
289 |
+
"Precision : 16-bit\n",
|
290 |
+
"Duration : 00:00:09.19 = 146983 samples ~ 688.983 CDDA sectors\n",
|
291 |
+
"File Size : 294k\n",
|
292 |
+
"Bit Rate : 256k\n",
|
293 |
+
"Sample Encoding: 16-bit Signed Integer PCM\n",
|
294 |
+
"\n"
|
295 |
+
]
|
296 |
+
}
|
297 |
+
],
|
298 |
+
"source": [
|
299 |
+
"!soxi example_audio/oden_kypsis16k_subset2.wav"
|
300 |
+
]
|
301 |
+
},
|
302 |
+
{
|
303 |
+
"cell_type": "code",
|
304 |
+
"execution_count": 108,
|
305 |
+
"id": "53f3b63f-9b40-432b-b58c-f5b7223252ed",
|
306 |
+
"metadata": {
|
307 |
+
"tags": []
|
308 |
+
},
|
309 |
+
"outputs": [
|
310 |
+
{
|
311 |
+
"name": "stdout",
|
312 |
+
"output_type": "stream",
|
313 |
+
"text": [
|
314 |
+
"CPU times: user 474 ms, sys: 30.2 ms, total: 504 ms\n",
|
315 |
+
"Wall time: 501 ms\n"
|
316 |
+
]
|
317 |
+
}
|
318 |
+
],
|
319 |
+
"source": [
|
320 |
+
"%time df_times=get_timestamps(aligner, speech, text[0])"
|
321 |
+
]
|
322 |
+
},
|
323 |
+
{
|
324 |
+
"cell_type": "code",
|
325 |
+
"execution_count": 109,
|
326 |
+
"id": "1b4dd747-4be2-4ace-a301-6207f7dd9a71",
|
327 |
"metadata": {
|
328 |
"tags": []
|
329 |
},
|
330 |
"outputs": [
|
331 |
{
|
332 |
"data": {
|
333 |
+
"text/html": [
|
334 |
+
"<div>\n",
|
335 |
+
"<style scoped>\n",
|
336 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
337 |
+
" vertical-align: middle;\n",
|
338 |
+
" }\n",
|
339 |
+
"\n",
|
340 |
+
" .dataframe tbody tr th {\n",
|
341 |
+
" vertical-align: top;\n",
|
342 |
+
" }\n",
|
343 |
+
"\n",
|
344 |
+
" .dataframe thead th {\n",
|
345 |
+
" text-align: right;\n",
|
346 |
+
" }\n",
|
347 |
+
"</style>\n",
|
348 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
349 |
+
" <thead>\n",
|
350 |
+
" <tr style=\"text-align: right;\">\n",
|
351 |
+
" <th></th>\n",
|
352 |
+
" <th>start</th>\n",
|
353 |
+
" <th>end</th>\n",
|
354 |
+
" <th>confidence</th>\n",
|
355 |
+
" <th>words</th>\n",
|
356 |
+
" </tr>\n",
|
357 |
+
" </thead>\n",
|
358 |
+
" <tbody>\n",
|
359 |
+
" <tr>\n",
|
360 |
+
" <th>0</th>\n",
|
361 |
+
" <td>0.260154</td>\n",
|
362 |
+
" <td>0.661184</td>\n",
|
363 |
+
" <td>-0.107317</td>\n",
|
364 |
+
" <td>klikid</td>\n",
|
365 |
+
" </tr>\n",
|
366 |
+
" <tr>\n",
|
367 |
+
" <th>1</th>\n",
|
368 |
+
" <td>0.661184</td>\n",
|
369 |
+
" <td>0.821596</td>\n",
|
370 |
+
" <td>-0.001331</td>\n",
|
371 |
+
" <td>neid</td>\n",
|
372 |
+
" </tr>\n",
|
373 |
+
" <tr>\n",
|
374 |
+
" <th>2</th>\n",
|
375 |
+
" <td>0.822883</td>\n",
|
376 |
+
" <td>1.784067</td>\n",
|
377 |
+
" <td>-0.002136</td>\n",
|
378 |
+
" <td>allserva</td>\n",
|
379 |
+
" </tr>\n",
|
380 |
+
" <tr>\n",
|
381 |
+
" <th>3</th>\n",
|
382 |
+
" <td>1.784067</td>\n",
|
383 |
+
" <td>1.984582</td>\n",
|
384 |
+
" <td>-0.041078</td>\n",
|
385 |
+
" <td>tekivad</td>\n",
|
386 |
+
" </tr>\n",
|
387 |
+
" <tr>\n",
|
388 |
+
" <th>4</th>\n",
|
389 |
+
" <td>2.547310</td>\n",
|
390 |
+
" <td>3.067362</td>\n",
|
391 |
+
" <td>-0.008251</td>\n",
|
392 |
+
" <td>need</td>\n",
|
393 |
+
" </tr>\n",
|
394 |
+
" <tr>\n",
|
395 |
+
" <th>5</th>\n",
|
396 |
+
" <td>3.067362</td>\n",
|
397 |
+
" <td>4.029833</td>\n",
|
398 |
+
" <td>-0.007814</td>\n",
|
399 |
+
" <td>lubaküpsiseid</td>\n",
|
400 |
+
" </tr>\n",
|
401 |
+
" <tr>\n",
|
402 |
+
" <th>6</th>\n",
|
403 |
+
" <td>4.752973</td>\n",
|
404 |
+
" <td>5.273025</td>\n",
|
405 |
+
" <td>-0.000333</td>\n",
|
406 |
+
" <td>mis</td>\n",
|
407 |
+
" </tr>\n",
|
408 |
+
" <tr>\n",
|
409 |
+
" <th>7</th>\n",
|
410 |
+
" <td>5.273025</td>\n",
|
411 |
+
" <td>5.413385</td>\n",
|
412 |
+
" <td>-0.063720</td>\n",
|
413 |
+
" <td>on</td>\n",
|
414 |
+
" </tr>\n",
|
415 |
+
" <tr>\n",
|
416 |
+
" <th>8</th>\n",
|
417 |
+
" <td>5.413385</td>\n",
|
418 |
+
" <td>5.553745</td>\n",
|
419 |
+
" <td>-0.000231</td>\n",
|
420 |
+
" <td>nagu</td>\n",
|
421 |
+
" </tr>\n",
|
422 |
+
" <tr>\n",
|
423 |
+
" <th>9</th>\n",
|
424 |
+
" <td>5.553745</td>\n",
|
425 |
+
" <td>5.834466</td>\n",
|
426 |
+
" <td>-0.000573</td>\n",
|
427 |
+
" <td>ilusti</td>\n",
|
428 |
+
" </tr>\n",
|
429 |
+
" <tr>\n",
|
430 |
+
" <th>10</th>\n",
|
431 |
+
" <td>5.834466</td>\n",
|
432 |
+
" <td>6.115187</td>\n",
|
433 |
+
" <td>-0.001930</td>\n",
|
434 |
+
" <td>kohati</td>\n",
|
435 |
+
" </tr>\n",
|
436 |
+
" <tr>\n",
|
437 |
+
" <th>11</th>\n",
|
438 |
+
" <td>6.236783</td>\n",
|
439 |
+
" <td>7.037555</td>\n",
|
440 |
+
" <td>-0.004937</td>\n",
|
441 |
+
" <td>tõlgitud</td>\n",
|
442 |
+
" </tr>\n",
|
443 |
+
" <tr>\n",
|
444 |
+
" <th>12</th>\n",
|
445 |
+
" <td>7.037555</td>\n",
|
446 |
+
" <td>7.238070</td>\n",
|
447 |
+
" <td>-0.001092</td>\n",
|
448 |
+
" <td>eesti</td>\n",
|
449 |
+
" </tr>\n",
|
450 |
+
" <tr>\n",
|
451 |
+
" <th>13</th>\n",
|
452 |
+
" <td>7.238070</td>\n",
|
453 |
+
" <td>7.679202</td>\n",
|
454 |
+
" <td>-0.003088</td>\n",
|
455 |
+
" <td>keelde</td>\n",
|
456 |
+
" </tr>\n",
|
457 |
+
" <tr>\n",
|
458 |
+
" <th>14</th>\n",
|
459 |
+
" <td>7.800798</td>\n",
|
460 |
+
" <td>8.320850</td>\n",
|
461 |
+
" <td>-0.001126</td>\n",
|
462 |
+
" <td>see</td>\n",
|
463 |
+
" </tr>\n",
|
464 |
+
" <tr>\n",
|
465 |
+
" <th>15</th>\n",
|
466 |
+
" <td>8.320850</td>\n",
|
467 |
+
" <td>8.601571</td>\n",
|
468 |
+
" <td>-0.033408</td>\n",
|
469 |
+
" <td>idee</td>\n",
|
470 |
+
" </tr>\n",
|
471 |
+
" <tr>\n",
|
472 |
+
" <th>16</th>\n",
|
473 |
+
" <td>8.601571</td>\n",
|
474 |
+
" <td>9.363527</td>\n",
|
475 |
+
" <td>-0.032846</td>\n",
|
476 |
+
" <td>arusaadavamaks</td>\n",
|
477 |
+
" </tr>\n",
|
478 |
+
" <tr>\n",
|
479 |
+
" <th>17</th>\n",
|
480 |
+
" <td>9.363527</td>\n",
|
481 |
+
" <td>9.584093</td>\n",
|
482 |
+
" <td>-0.390966</td>\n",
|
483 |
+
" <td>ma</td>\n",
|
484 |
+
" </tr>\n",
|
485 |
+
" <tr>\n",
|
486 |
+
" <th>18</th>\n",
|
487 |
+
" <td>9.584093</td>\n",
|
488 |
+
" <td>9.764557</td>\n",
|
489 |
+
" <td>-0.053868</td>\n",
|
490 |
+
" <td>tean</td>\n",
|
491 |
+
" </tr>\n",
|
492 |
+
" <tr>\n",
|
493 |
+
" <th>19</th>\n",
|
494 |
+
" <td>9.764557</td>\n",
|
495 |
+
" <td>9.924968</td>\n",
|
496 |
+
" <td>-0.000163</td>\n",
|
497 |
+
" <td>et</td>\n",
|
498 |
+
" </tr>\n",
|
499 |
+
" </tbody>\n",
|
500 |
+
"</table>\n",
|
501 |
+
"</div>"
|
502 |
+
],
|
503 |
"text/plain": [
|
504 |
+
" start end confidence words\n",
|
505 |
+
"0 0.260154 0.661184 -0.107317 klikid\n",
|
506 |
+
"1 0.661184 0.821596 -0.001331 neid\n",
|
507 |
+
"2 0.822883 1.784067 -0.002136 allserva\n",
|
508 |
+
"3 1.784067 1.984582 -0.041078 tekivad\n",
|
509 |
+
"4 2.547310 3.067362 -0.008251 need\n",
|
510 |
+
"5 3.067362 4.029833 -0.007814 lubaküpsiseid\n",
|
511 |
+
"6 4.752973 5.273025 -0.000333 mis\n",
|
512 |
+
"7 5.273025 5.413385 -0.063720 on\n",
|
513 |
+
"8 5.413385 5.553745 -0.000231 nagu\n",
|
514 |
+
"9 5.553745 5.834466 -0.000573 ilusti\n",
|
515 |
+
"10 5.834466 6.115187 -0.001930 kohati\n",
|
516 |
+
"11 6.236783 7.037555 -0.004937 tõlgitud\n",
|
517 |
+
"12 7.037555 7.238070 -0.001092 eesti\n",
|
518 |
+
"13 7.238070 7.679202 -0.003088 keelde\n",
|
519 |
+
"14 7.800798 8.320850 -0.001126 see\n",
|
520 |
+
"15 8.320850 8.601571 -0.033408 idee\n",
|
521 |
+
"16 8.601571 9.363527 -0.032846 arusaadavamaks\n",
|
522 |
+
"17 9.363527 9.584093 -0.390966 ma\n",
|
523 |
+
"18 9.584093 9.764557 -0.053868 tean\n",
|
524 |
+
"19 9.764557 9.924968 -0.000163 et"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
]
|
526 |
},
|
527 |
+
"execution_count": 109,
|
528 |
"metadata": {},
|
529 |
"output_type": "execute_result"
|
530 |
}
|
531 |
],
|
532 |
"source": [
|
533 |
+
"df_times.head(20)"
|
534 |
]
|
535 |
},
|
536 |
{
|
537 |
"cell_type": "code",
|
538 |
"execution_count": null,
|
539 |
+
"id": "7a4be2b1-5e0f-4558-8097-b37be0b83785",
|
540 |
"metadata": {},
|
541 |
"outputs": [],
|
542 |
"source": []
|