update readme with training
Browse files
- .gitignore +1 -0
- README.md +64 -2
- config.json +6 -6
- inference.ipynb +147 -275
- preprocessor_config.json +0 -9
- pytorch_model.bin +0 -3
- train_tr.ipynb → train-Copy1.ipynb +0 -0
- train.ipynb +0 -0
- train_kh.ipynb +0 -0
- training_args.bin +0 -3
- vocab.json +1 -1
.gitignore
CHANGED
@@ -1,3 +1,4 @@
 checkpoint-*
 km_kh*
 .ipynb_checkpoints
+vitouphy
README.md
CHANGED
@@ -4,11 +4,73 @@ language:
 license: apache-2.0
 tags:
 - automatic-speech-recognition
+- openslr
 - robust-speech-event
 - km
-
-- open_slr
+- generated_from_trainer
 model-index:
 - name: ''
   results: []
 ---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+#
+
+This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the openslr dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.4638
+- Wer: 0.4944
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 8
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 32
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 1000
+- num_epochs: 50
+- mixed_precision_training: Native AMP
+
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss | Wer    |
+|:-------------:|:-----:|:----:|:---------------:|:------:|
+| 5.2049        | 4.93  | 400  | 4.5570          | 1.0    |
+| 3.569         | 9.87  | 800  | 3.5415          | 1.0    |
+| 3.483         | 14.81 | 1200 | 3.3956          | 1.0    |
+| 2.1906        | 19.75 | 1600 | 1.1732          | 0.7897 |
+| 1.7968        | 24.69 | 2000 | 0.7634          | 0.6678 |
+| 1.615         | 29.62 | 2400 | 0.6182          | 0.5922 |
+| 1.52          | 34.56 | 2800 | 0.5473          | 0.5479 |
+| 1.4696        | 39.5  | 3200 | 0.5002          | 0.5130 |
+| 1.4175        | 44.44 | 3600 | 0.4752          | 0.5021 |
+| 1.3943        | 49.38 | 4000 | 0.4638          | 0.4944 |
+
+### Framework versions
+
+- Transformers 4.17.0.dev0
+- Pytorch 1.10.2+cu102
+- Datasets 1.18.2.dev0
+- Tokenizers 0.11.0
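For reference, a minimal sketch of how the hyperparameters above map onto transformers' `TrainingArguments` (Transformers 4.17-era API). The output directory and the 400-step eval/save/logging interval are assumptions (the interval is read off the results table); every other value comes from the list above, and a CUDA device is assumed for Native AMP:

```python
from transformers import TrainingArguments

# Sketch only; mirrors the hyperparameters listed in the model card above.
training_args = TrainingArguments(
    output_dir="./xls-r-300m-km",    # placeholder path, not from the commit
    per_device_train_batch_size=8,   # train_batch_size: 8
    per_device_eval_batch_size=8,    # eval_batch_size: 8
    gradient_accumulation_steps=4,   # 8 * 4 = total_train_batch_size 32
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    warmup_steps=1000,
    num_train_epochs=50,
    seed=42,
    fp16=True,                       # mixed_precision_training: Native AMP
    evaluation_strategy="steps",     # the results table reports eval every 400 steps
    eval_steps=400,
    save_steps=400,
    logging_steps=400,
)
```

Passed to a `Trainer` together with the CTC model, data collator, and a WER metric, arguments like these would reproduce the schedule shown in the results table.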
config.json
CHANGED
@@ -8,7 +8,7 @@
   "architectures": [
     "Wav2Vec2ForCTC"
   ],
-  "attention_dropout": 0.
+  "attention_dropout": 0.1,
   "bos_token_id": 1,
   "classifier_proj_size": 256,
   "codevector_dim": 768,
@@ -54,18 +54,18 @@
   "final_dropout": 0.0,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
-  "hidden_dropout": 0.
+  "hidden_dropout": 0.1,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-05,
   "layerdrop": 0.0,
-  "mask_feature_length":
+  "mask_feature_length": 64,
   "mask_feature_min_masks": 0,
-  "mask_feature_prob": 0.
+  "mask_feature_prob": 0.25,
   "mask_time_length": 10,
   "mask_time_min_masks": 2,
-  "mask_time_prob": 0.
+  "mask_time_prob": 0.75,
   "model_type": "wav2vec2",
   "num_adapter_layers": 3,
   "num_attention_heads": 16,
@@ -77,7 +77,7 @@
   "num_hidden_layers": 24,
   "num_negatives": 100,
   "output_hidden_size": 1024,
-  "pad_token_id":
+  "pad_token_id": 72,
   "proj_codevector_dim": 768,
   "tdnn_dilation": [
     1,
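The changed values are the SpecAugment masking, dropout, and padding settings used for this fine-tuning run. They are the kind of overrides typically passed when loading the base checkpoint; a hedged sketch under that assumption (`ctc_loss_reduction` and the explicit `vocab_size` are illustrative and not part of this diff):

```python
from transformers import Wav2Vec2ForCTC

# Sketch: load the base XLS-R checkpoint with the values this diff sets.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    mask_time_prob=0.75,        # SpecAugment masking over time steps
    mask_feature_prob=0.25,     # SpecAugment masking over feature channels
    mask_feature_length=64,
    pad_token_id=72,            # the [PAD] id in the new vocab.json
    ctc_loss_reduction="mean",  # assumption, not shown in this diff
    vocab_size=73,              # size of the Khmer vocab below (assumption)
)
```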
inference.ipynb
CHANGED
The notebook was re-run against the published Khmer checkpoint instead of the earlier local Turkish Common Voice experiment. Stale outputs were removed (Turkish common_voice_tr examples, a ValueError traceback raised while mapping `prepare_dataset` over the dataset, and a KeyError traceback for 'input_values'), and execution counts and cell ids changed throughout. The substantive changes to the cell sources are:

-processor = Wav2Vec2Processor.from_pretrained(".")
+# processor = Wav2Vec2Processor.from_pretrained(".")
+import numpy as np
+model = AutoModelForCTC.from_pretrained("vitouphy/xls-r-300m-km").to('cuda')
+processor = Wav2Vec2Processor.from_pretrained("vitouphy/xls-r-300m-km")
+common_voice_test = load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')
+common_voice_test = (common_voice_test
+                     .remove_columns(["Unnamed: 0", "drop"])
+                     .rename_column('text', 'sentence'))
+common_voice_test = common_voice_test.cast_column("path", Audio(sampling_rate=16_000)).rename_column('path', 'audio')
+i = 21
+pred_ids = pred_ids[pred_ids != processor.tokenizer.pad_token_id]

The new outputs show the checkpoint files downloading from the Hub, the first test example (km_kh_male/wavs/khm_1443_3799144408.wav resampled to 16 kHz, with its Khmer transcript), a warning about passing `sampling_rate` to the processor, and a final Khmer prediction/reference pair that differs in a single word.
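Taken together, the updated notebook boils down to roughly the following standalone sketch. The repo id, CSV path, and column names are the ones in the cells above; a CUDA device and the km_kh_male data on disk are assumed:

```python
import numpy as np
import torch
from datasets import Audio, load_dataset
from transformers import AutoModelForCTC, Wav2Vec2Processor

# Load the published Khmer checkpoint and its processor from the Hub.
model = AutoModelForCTC.from_pretrained("vitouphy/xls-r-300m-km").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("vitouphy/xls-r-300m-km")

# OpenSLR Khmer (male) test split, with the same column cleanup as the notebook.
test = load_dataset("csv", data_files="km_kh_male/line_index_test.csv", split="train")
test = test.remove_columns(["Unnamed: 0", "drop"]).rename_column("text", "sentence")
test = test.cast_column("path", Audio(sampling_rate=16_000)).rename_column("path", "audio")

def transcribe(example):
    audio = example["audio"]
    inputs = processor(np.array(audio["array"]), sampling_rate=audio["sampling_rate"],
                       return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]
    # Drop [PAD] ids before decoding, as the notebook's last cell does.
    pred_ids = pred_ids[pred_ids != processor.tokenizer.pad_token_id]
    return processor.decode(pred_ids)

i = 21
print("Prediction:", transcribe(test[i]))
print("Reference:", test[i]["sentence"])
```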
preprocessor_config.json
DELETED
@@ -1,9 +0,0 @@
-{
-  "do_normalize": true,
-  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
-  "feature_size": 1,
-  "padding_side": "right",
-  "padding_value": 0.0,
-  "return_attention_mask": true,
-  "sampling_rate": 16000
-}
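The deleted file only held the feature-extractor settings. If they are needed again, an equivalent extractor can be rebuilt from the values above; a small sketch, not part of the commit:

```python
from transformers import Wav2Vec2FeatureExtractor

# Mirrors the deleted preprocessor_config.json.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
feature_extractor.save_pretrained(".")  # writes a fresh preprocessor_config.json
```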
pytorch_model.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:de338e5c1afe8724c486f34b78be6997d34259229acda73ba51b26c8c304a4d7
-size 1262231153
train_tr.ipynb → train-Copy1.ipynb
RENAMED
File without changes

train.ipynb
ADDED
The diff for this file is too large to render. See raw diff

train_kh.ipynb
ADDED
The diff for this file is too large to render. See raw diff

training_args.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9b35c85cfceecf16b9f8a4306c6c419ed33469f21c8e3b016a95fdbbcd87ddbb
-size 2991
vocab.json
CHANGED
@@ -1 +1 @@
-{"
+{"\u1780": 1, "\u1781": 2, "\u1782": 3, "\u1783": 4, "\u1784": 5, "\u1785": 6, "\u1786": 7, "\u1787": 8, "\u1788": 9, "\u1789": 10, "\u178a": 11, "\u178b": 12, "\u178c": 13, "\u178d": 14, "\u178e": 15, "\u178f": 16, "\u1790": 17, "\u1791": 18, "\u1792": 19, "\u1793": 20, "\u1794": 21, "\u1795": 22, "\u1796": 23, "\u1797": 24, "\u1798": 25, "\u1799": 26, "\u179a": 27, "\u179b": 28, "\u179c": 29, "\u179f": 30, "\u17a0": 31, "\u17a1": 32, "\u17a2": 33, "\u17a5": 34, "\u17a7": 35, "\u17aa": 36, "\u17ab": 37, "\u17ac": 38, "\u17ad": 39, "\u17ae": 40, "\u17af": 41, "\u17b1": 42, "\u17b6": 43, "\u17b7": 44, "\u17b8": 45, "\u17b9": 46, "\u17ba": 47, "\u17bb": 48, "\u17bc": 49, "\u17bd": 50, "\u17be": 51, "\u17bf": 52, "\u17c0": 53, "\u17c1": 54, "\u17c2": 55, "\u17c3": 56, "\u17c4": 57, "\u17c5": 58, "\u17c6": 59, "\u17c7": 60, "\u17c8": 61, "\u17c9": 62, "\u17ca": 63, "\u17cb": 64, "\u17cc": 65, "\u17cd": 66, "\u17ce": 67, "\u17cf": 68, "\u17d0": 69, "\u17d2": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
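The new vocabulary has 73 entries: 70 Khmer characters (ids 1-70), "|" as the word delimiter (id 0), "[UNK]" (71), and "[PAD]" (72), the same id 72 that config.json now sets as pad_token_id. A sketch of loading it as a CTC tokenizer (the file name vocab.json in the working directory is assumed):

```python
from transformers import Wav2Vec2CTCTokenizer

# Sketch: load the Khmer CTC vocabulary shown above.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
print(len(tokenizer))          # 73
print(tokenizer.pad_token_id)  # 72, matching pad_token_id in config.json
```

Combined with a Wav2Vec2FeatureExtractor (such as the one sketched after the deleted preprocessor_config.json), this yields the Wav2Vec2Processor that the inference notebook loads from the Hub.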