vitouphy committed
Commit e10d376 • 1 parent: 0530d4b

update readme with training

.gitignore CHANGED
@@ -1,3 +1,4 @@
  checkpoint-*
  km_kh*
  .ipynb_checkpoints
+ vitouphy
README.md CHANGED
@@ -4,11 +4,73 @@ language:
  license: apache-2.0
  tags:
  - automatic-speech-recognition
+ - openslr
  - robust-speech-event
  - km
+ - generated_from_trainer
- datasets:
- - open_slr
  model-index:
  - name: ''
    results: []
  ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ #
+
+ This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the openslr dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4638
+ - Wer: 0.4944
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 8
+ - eval_batch_size: 8
+ - seed: 42
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 32
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 1000
+ - num_epochs: 50
+ - mixed_precision_training: Native AMP
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Wer    |
+ |:-------------:|:-----:|:----:|:---------------:|:------:|
+ | 5.2049        | 4.93  | 400  | 4.5570          | 1.0    |
+ | 3.569         | 9.87  | 800  | 3.5415          | 1.0    |
+ | 3.483         | 14.81 | 1200 | 3.3956          | 1.0    |
+ | 2.1906        | 19.75 | 1600 | 1.1732          | 0.7897 |
+ | 1.7968        | 24.69 | 2000 | 0.7634          | 0.6678 |
+ | 1.615         | 29.62 | 2400 | 0.6182          | 0.5922 |
+ | 1.52          | 34.56 | 2800 | 0.5473          | 0.5479 |
+ | 1.4696        | 39.5  | 3200 | 0.5002          | 0.5130 |
+ | 1.4175        | 44.44 | 3600 | 0.4752          | 0.5021 |
+ | 1.3943        | 49.38 | 4000 | 0.4638          | 0.4944 |
+
+ ### Framework versions
+
+ - Transformers 4.17.0.dev0
+ - Pytorch 1.10.2+cu102
+ - Datasets 1.18.2.dev0
+ - Tokenizers 0.11.0
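For readers who want to reproduce the run, the hyperparameter list above maps onto `transformers.TrainingArguments` roughly as follows. This is a minimal sketch, not the author's training script (that lives in `train_kh.ipynb`, whose diff is too large to render below); `output_dir` is an assumption, and the Adam betas/epsilon in the card are the Trainer defaults.

```python
from transformers import TrainingArguments

# Sketch reconstructing the hyperparameters listed in the model card above.
training_args = TrainingArguments(
    output_dir=".",                   # assumption, not stated in the card
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,    # 8 x 4 = total train batch size 32
    seed=42,
    lr_scheduler_type="linear",
    warmup_steps=1000,
    num_train_epochs=50,
    fp16=True,                        # "Native AMP" mixed precision
)
```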
config.json CHANGED
@@ -8,7 +8,7 @@
  "architectures": [
  "Wav2Vec2ForCTC"
  ],
- "attention_dropout": 0.0,
+ "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
@@ -54,18 +54,18 @@
  "final_dropout": 0.0,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
- "hidden_dropout": 0.0,
+ "hidden_dropout": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.0,
- "mask_feature_length": 10,
+ "mask_feature_length": 64,
  "mask_feature_min_masks": 0,
- "mask_feature_prob": 0.0,
+ "mask_feature_prob": 0.25,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
- "mask_time_prob": 0.05,
+ "mask_time_prob": 0.75,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 16,
@@ -77,7 +77,7 @@
  "num_hidden_layers": 24,
  "num_negatives": 100,
  "output_hidden_size": 1024,
- "pad_token_id": 73,
+ "pad_token_id": 72,
  "proj_codevector_dim": 768,
  "tdnn_dilation": [
  1,
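These config changes are the substantive part of this commit: dropout is turned on, SpecAugment masking is made much more aggressive, and `pad_token_id` moves to 72 to match the new Khmer `vocab.json` below. A hedged sketch of applying the same values when loading the base checkpoint, following the standard wav2vec2 fine-tuning pattern (`vocab_size=73` is an assumption, derived by counting the entries in the new vocab):

```python
from transformers import Wav2Vec2ForCTC

# Mirrors the config diff above when starting from the base checkpoint.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.1,    # was 0.0
    hidden_dropout=0.1,       # was 0.0
    mask_time_prob=0.75,      # SpecAugment over time steps, was 0.05
    mask_feature_length=64,   # was 10
    mask_feature_prob=0.25,   # SpecAugment over feature bins, was 0.0
    pad_token_id=72,          # [PAD] id in the new vocab.json, was 73
    vocab_size=73,            # assumption: number of entries in vocab.json
)
```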
inference.ipynb CHANGED
@@ -2,20 +2,21 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 1,
- "id": "3eace62e",
  "metadata": {},
  "outputs": [],
  "source": [
  "from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
  "from datasets import load_dataset, load_metric, Audio\n",
  "import torch"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 3,
- "id": "47d5c062",
  "metadata": {},
  "outputs": [
  {
@@ -28,238 +29,189 @@
  ],
  "source": [
  "# model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
- "processor = Wav2Vec2Processor.from_pretrained(\".\")"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 47,
- "id": "1ffed05d",
  "metadata": {},
  "outputs": [
  {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Using custom data configuration default-f6158d05a859ae5c\n",
- "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-f6158d05a859ae5c/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
- ]
- }
- ],
- "source": [
- "common_voice_test = load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "id": "bb365941",
- "metadata": {},
- "outputs": [],
- "source": [
- "common_voice_test = (common_voice_test\n",
- " .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
- " .rename_column('text', 'sentence'))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "34979efb",
- "metadata": {},
- "outputs": [],
- "source": [
- "common_voice_test = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "66ac6b14",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "e135b397",
- "metadata": {},
- "outputs": [
  {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Reusing dataset common_voice (/workspace/.cache/huggingface/datasets/common_voice/tr/6.1.0/5693bfc0feeade582a78c2fb250bc88f52bd86f0a7f1bb22bfee67e715de30fd)\n"
- ]
- }
- ],
- "source": [
- "common_voice_test = load_dataset(\"common_voice\", \"tr\", split=\"test\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "9dd4cfd4",
- "metadata": {},
- "outputs": [
  {
  "data": {
  "text/plain": [
- "{'client_id': 'b8fffa3c4745500cd2c5f40a82b65bf1fb2d4c4f8638209a33fe1886fbfffdbd2f93aa43e0bd2026c4643e08aada408165138f75787cee501c4d735aa555a61c',\n",
- " 'path': 'common_voice_tr_17343551.mp3',\n",
- " 'audio': {'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17343551.mp3',\n",
- " 'array': array([0. , 0. , 0. , ..., 0.00157976, 0.00167614,\n",
- " 0.00091976], dtype=float32),\n",
- " 'sampling_rate': 48000},\n",
- " 'sentence': 'Aşırı derecede kapalı bir ortamımız var.',\n",
- " 'up_votes': 2,\n",
- " 'down_votes': 0,\n",
- " 'age': 'thirties',\n",
- " 'gender': 'male',\n",
- " 'accent': 'other',\n",
- " 'locale': 'tr',\n",
- " 'segment': \"''\"}"
  ]
  },
- "execution_count": 18,
  "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "common_voice_test[3]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "f36c3bcd",
- "metadata": {},
- "outputs": [],
- "source": [
- "# remove unnecceesary attributes\n",
- "common_voice_test = common_voice_test.remove_columns([\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"segment\", \"up_votes\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "142cffaa",
- "metadata": {},
- "outputs": [],
- "source": [
- "common_voice_test = common_voice_test.cast_column(\"audio\", Audio(sampling_rate=16_000))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "b1103455",
- "metadata": {},
- "outputs": [
  {
  "data": {
  "text/plain": [
- "Dataset({\n",
- " features: ['path', 'audio', 'sentence'],\n",
- " num_rows: 1647\n",
- "})"
  ]
  },
- "execution_count": 29,
  "metadata": {},
- "output_type": "execute_result"
  }
  ],
  "source": [
- "common_voice_test"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 30,
- "id": "e2f9be66",
  "metadata": {},
  "outputs": [
  {
- "data": {
- "text/plain": [
- "'Pek çoğu da Roman toplumundan geliyor.'"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
  }
  ],
  "source": [
- "common_voice_test[0]['sentence']"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 26,
- "id": "94a0e9c5",
  "metadata": {},
  "outputs": [],
  "source": [
- "import numpy as np"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 33,
- "id": "c2bcce8a",
  "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17341269.mp3',\n",
- " 'array': array([ 0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 8.288735e-06,\n",
- " -1.994405e-03, -7.770515e-03], dtype=float32),\n",
- " 'sampling_rate': 16000}"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
  "source": [
- "common_voice_test[0]['audio']"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 34,
- "id": "47d9dd9c",
  "metadata": {},
  "outputs": [
  {
- "ename": "ValueError",
- "evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
- "output_type": "error",
- "traceback": [ …ANSI-colored ValueError traceback through Wav2Vec2Processor.__call__ and PreTrainedTokenizerBase.__call__ omitted… ]
  }
  ],
  "source": [
- "processor(np.array(common_voice_test[0]['audio'][\"array\"]), sampling_rate=16000)"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 27,
- "id": "5f0e5342",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -277,14 +229,14 @@
  },
  {
  "cell_type": "code",
- "execution_count": 28,
- "id": "6786ed7e",
  "metadata": {},
  "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
- "model_id": "b74fe324f3bd4d98b6366c614fec7991",
  "version_major": 2,
  "version_minor": 0
  },
@@ -294,27 +246,6 @@
  },
  "metadata": {},
  "output_type": "display_data"
- },
- {
- "ename": "ValueError",
- "evalue": "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).",
- "output_type": "error",
- "traceback": [ …ANSI-colored ValueError traceback: raised in prepare_dataset via Dataset.map, Wav2Vec2Processor.__call__ and PreTrainedTokenizerBase.__call__; omitted… ]
  }
  ],
  "source": [
@@ -323,85 +254,25 @@
  },
  {
  "cell_type": "code",
- "execution_count": 24,
- "id": "81506c80",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'path': 'common_voice_tr_17341269.mp3',\n",
- " 'audio': {'path': 'cv-corpus-6.1-2020-12-11/tr/clips/common_voice_tr_17341269.mp3',\n",
- " 'array': array([ 0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 8.288735e-06,\n",
- " -1.994405e-03, -7.770515e-03], dtype=float32),\n",
- " 'sampling_rate': 16000},\n",
- " 'sentence': 'Pek çoğu da Roman toplumundan geliyor.'}"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "common_voice_test[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "603ecd46",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "760f0031",
  "metadata": {},
  "outputs": [],
  "source": [
- "i = 20"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 14,
- "id": "e0355fac",
  "metadata": {},
  "outputs": [
  {
- "ename": "KeyError",
- "evalue": "'input_values'",
- "output_type": "error",
- "traceback": [ …ANSI-colored KeyError traceback omitted… ]
- }
- ],
- "source": [
- "input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "c0b3603c",
- "metadata": {},
- "outputs": [
- {
- "ename": "KeyError",
- "evalue": "'input_values'",
- "output_type": "error",
- "traceback": [ …ANSI-colored KeyError traceback omitted…
  ]
  }
  ],
@@ -413,8 +284,8 @@
  },
  {
  "cell_type": "code",
- "execution_count": 15,
- "id": "23db2fe7",
  "metadata": {},
  "outputs": [
  {
@@ -422,15 +293,16 @@
  "output_type": "stream",
  "text": [
  "Prediction:\n",
- "ş\n",
  "\n",
  "Reference:\n",
- "Yine de her iki grup farklı sorunlar çıkarıyor.\n"
  ]
  }
  ],
  "source": [
  "print(\"Prediction:\")\n",
  "print(processor.decode(pred_ids))\n",
  "\n",
  "print(\"\\nReference:\")\n",
@@ -441,7 +313,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "4da2cb6c",
  "metadata": {},
  "outputs": [],
  "source": []
@@ -449,7 +321,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "0f5325dd",
  "metadata": {},
  "outputs": [],
  "source": []

  "cells": [
  {
  "cell_type": "code",
+ "execution_count": 7,
+ "id": "9abf3270",
  "metadata": {},
  "outputs": [],
  "source": [
  "from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
  "from datasets import load_dataset, load_metric, Audio\n",
+ "import numpy as np\n",
  "import torch"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 27,
+ "id": "6e0830a2",
  "metadata": {},
  "outputs": [
  {
  ],
  "source": [
  "# model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
+ "# processor = Wav2Vec2Processor.from_pretrained(\".\")"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 39,
+ "id": "126e39e0",
  "metadata": {},
  "outputs": [
  {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a2fdec3c288946a19a5b36618af4c26c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/2.02k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
  {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ae6b614b82aa4627b55b8c742ca8898e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
  {
  "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8c39ff01100c40aab1191514bb52399d",
+ "version_major": 2,
+ "version_minor": 0
+ },
  "text/plain": [
+ "Downloading: 0%| | 0.00/214 [00:00<?, ?B/s]"
  ]
  },
  "metadata": {},
+ "output_type": "display_data"
+ },
  {
  "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5078aede36ff4dc1a17e24a967cb46dd",
+ "version_major": 2,
+ "version_minor": 0
+ },
  "text/plain": [
+ "Downloading: 0%| | 0.00/260 [00:00<?, ?B/s]"
  ]
  },
  "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a4c7e0e6e26f4c5e8f40e79b67b61c17",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/795 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b55bcb16d8d042059a6487391e1a51de",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/23.0 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0546a666b47a4d418e62ccc8fec58bd4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading: 0%| | 0.00/309 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
  }
  ],
  "source": [
+ "model = AutoModelForCTC.from_pretrained(\"vitouphy/xls-r-300m-km\").to('cuda')\n",
+ "processor = Wav2Vec2Processor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 3,
+ "id": "cf72163d",
  "metadata": {},
  "outputs": [
  {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using custom data configuration default-fbad308ab5a03eb2\n",
+ "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-fbad308ab5a03eb2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
+ ]
  }
  ],
  "source": [
+ "common_voice_test = load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 4,
+ "id": "6b30d6ea",
  "metadata": {},
  "outputs": [],
  "source": [
+ "common_voice_test = (common_voice_test\n",
+ " .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
+ " .rename_column('text', 'sentence'))"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 5,
+ "id": "bf9734cc",
  "metadata": {},
+ "outputs": [],
  "source": [
+ "common_voice_test = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 8,
+ "id": "5e74effa",
  "metadata": {},
  "outputs": [
  {
+ "data": {
+ "text/plain": [
+ "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_1443_3799144408.wav',\n",
+ " 'array': array([-1.0600963e-06, 1.2359066e-06, -1.4001107e-06, ...,\n",
+ " -3.1423504e-05, 4.4914182e-06, 0.0000000e+00], dtype=float32),\n",
+ " 'sampling_rate': 16000},\n",
+ " 'sentence': 'ស៊ី ដាច់ ម៉ូតូ នៅ ពេល ដែល ប្រើ ឱ្យ ឌុប សម្ភារៈ គ្រឿង សង្ហារឹម យក ទៅ ឱ្យ ម៉ូយ នៅ ម្ដុំ វត្ដ សំរោងអណ្ដែត'}"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
  }
  ],
  "source": [
+ "common_voice_test[0]"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 11,
+ "id": "c94de1a7",
  "metadata": {},
  "outputs": [],
  "source": [

  },
  {
  "cell_type": "code",
+ "execution_count": 12,
+ "id": "c018376a",
  "metadata": {},
  "outputs": [
  {
  "data": {
  "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e9ec232c2caf4bdcb62b24696217a723",
  "version_major": 2,
  "version_minor": 0
  },

  },
  "metadata": {},
  "output_type": "display_data"
  }
  ],
  "source": [

  },
  {
  "cell_type": "code",
+ "execution_count": 45,
+ "id": "c0a02606",
  "metadata": {},
  "outputs": [],
  "source": [
+ "i = 21"
  ]
  },
  {
  "cell_type": "code",
+ "execution_count": 46,
+ "id": "19b8e75f",
  "metadata": {},
  "outputs": [
  {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
  ]
  }
  ],

  },
  {
  "cell_type": "code",
+ "execution_count": 47,
+ "id": "d15fb5a5",
  "metadata": {},
  "outputs": [
  {

  "output_type": "stream",
  "text": [
  "Prediction:\n",
+ "កាបូហ្វូនរ៉ាន និង កូហ្វាកទេរ ជា មិត្ត និង គ្នា\n",
  "\n",
  "Reference:\n",
+ "កាបូហ្វូនរ៉ាន និង កូហ្វាក់ទេរ ជា មិត្ត និង គ្នា\n"
  ]
  }
  ],
  "source": [
  "print(\"Prediction:\")\n",
+ "pred_ids = pred_ids[pred_ids != processor.tokenizer.pad_token_id]\n",
  "print(processor.decode(pred_ids))\n",
  "\n",
  "print(\"\\nReference:\")\n",

  {
  "cell_type": "code",
  "execution_count": null,
+ "id": "07bc1b8e",
  "metadata": {},
  "outputs": [],
  "source": []

  {
  "cell_type": "code",
  "execution_count": null,
+ "id": "b228faa1",
  "metadata": {},
  "outputs": [],
  "source": []
preprocessor_config.json DELETED
@@ -1,9 +0,0 @@
- {
- "do_normalize": true,
- "feature_extractor_type": "Wav2Vec2FeatureExtractor",
- "feature_size": 1,
- "padding_side": "right",
- "padding_value": 0.0,
- "return_attention_mask": true,
- "sampling_rate": 16000
- }
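The deleted file held the stock feature-extractor settings for this model family. If needed again, an equivalent extractor can be recreated; this sketch merely reconstructs the deleted JSON and is not part of the commit:

```python
from transformers import Wav2Vec2FeatureExtractor

# Recreates the values from the deleted preprocessor_config.json.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
```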
pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:de338e5c1afe8724c486f34b78be6997d34259229acda73ba51b26c8c304a4d7
- size 1262231153
train_tr.ipynb → train-Copy1.ipynb RENAMED
File without changes
train.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
train_kh.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9b35c85cfceecf16b9f8a4306c6c419ed33469f21c8e3b016a95fdbbcd87ddbb
- size 2991
vocab.json CHANGED
@@ -1 +1 @@
- {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "I": 9, "J": 10, "K": 11, "L": 12, "M": 13, "N": 14, "O": 15, "P": 16, "Q": 17, "R": 18, "S": 19, "T": 20, "U": 21, "V": 22, "W": 23, "X": 24, "Y": 25, "Z": 26, "a": 27, "b": 28, "c": 29, "d": 30, "e": 31, "f": 32, "g": 33, "h": 34, "i": 35, "j": 36, "k": 37, "l": 38, "m": 39, "n": 40, "o": 41, "p": 42, "r": 43, "s": 44, "t": 45, "u": 46, "v": 47, "w": 48, "x": 49, "y": 50, "z": 51, "\u00c7": 52, "\u00d6": 53, "\u00dc": 54, "\u00e2": 55, "\u00e7": 56, "\u00eb": 57, "\u00ee": 58, "\u00f6": 59, "\u00fc": 60, "\u011f": 61, "\u0130": 62, "\u0131": 63, "\u015e": 64, "\u015f": 65, "|": 0, "[UNK]": 67, "[PAD]": 68}
+ {"\u1780": 1, "\u1781": 2, "\u1782": 3, "\u1783": 4, "\u1784": 5, "\u1785": 6, "\u1786": 7, "\u1787": 8, "\u1788": 9, "\u1789": 10, "\u178a": 11, "\u178b": 12, "\u178c": 13, "\u178d": 14, "\u178e": 15, "\u178f": 16, "\u1790": 17, "\u1791": 18, "\u1792": 19, "\u1793": 20, "\u1794": 21, "\u1795": 22, "\u1796": 23, "\u1797": 24, "\u1798": 25, "\u1799": 26, "\u179a": 27, "\u179b": 28, "\u179c": 29, "\u179f": 30, "\u17a0": 31, "\u17a1": 32, "\u17a2": 33, "\u17a5": 34, "\u17a7": 35, "\u17aa": 36, "\u17ab": 37, "\u17ac": 38, "\u17ad": 39, "\u17ae": 40, "\u17af": 41, "\u17b1": 42, "\u17b6": 43, "\u17b7": 44, "\u17b8": 45, "\u17b9": 46, "\u17ba": 47, "\u17bb": 48, "\u17bc": 49, "\u17bd": 50, "\u17be": 51, "\u17bf": 52, "\u17c0": 53, "\u17c1": 54, "\u17c2": 55, "\u17c3": 56, "\u17c4": 57, "\u17c5": 58, "\u17c6": 59, "\u17c7": 60, "\u17c8": 61, "\u17c9": 62, "\u17ca": 63, "\u17cb": 64, "\u17cc": 65, "\u17cd": 66, "\u17ce": 67, "\u17cf": 68, "\u17d0": 69, "\u17d2": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}