vitouphy commited on
Commit
e256a57
โ€ข
1 Parent(s): 98a6466

fix vocab json file

Browse files
Files changed (5) hide show
  1. inference.ipynb +58 -139
  2. preprocessor_config.json +9 -0
  3. train_kh.ipynb +97 -64
  4. training_args.bin +3 -0
  5. vocab.json +1 -5
inference.ipynb CHANGED
@@ -2,8 +2,8 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 7,
6
- "id": "9abf3270",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -15,126 +15,45 @@
15
  },
16
  {
17
  "cell_type": "code",
18
- "execution_count": 27,
19
- "id": "6e0830a2",
20
  "metadata": {},
21
- "outputs": [
22
- {
23
- "name": "stderr",
24
- "output_type": "stream",
25
- "text": [
26
- "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
27
- ]
28
- }
29
- ],
30
  "source": [
31
- "# model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
32
- "# processor = Wav2Vec2Processor.from_pretrained(\".\")"
33
  ]
34
  },
35
  {
36
  "cell_type": "code",
37
- "execution_count": 39,
38
- "id": "126e39e0",
39
- "metadata": {},
 
 
 
 
 
40
  "outputs": [
41
  {
42
- "data": {
43
- "application/vnd.jupyter.widget-view+json": {
44
- "model_id": "a2fdec3c288946a19a5b36618af4c26c",
45
- "version_major": 2,
46
- "version_minor": 0
47
- },
48
- "text/plain": [
49
- "Downloading: 0%| | 0.00/2.02k [00:00<?, ?B/s]"
50
- ]
51
- },
52
- "metadata": {},
53
- "output_type": "display_data"
54
- },
55
- {
56
- "data": {
57
- "application/vnd.jupyter.widget-view+json": {
58
- "model_id": "ae6b614b82aa4627b55b8c742ca8898e",
59
- "version_major": 2,
60
- "version_minor": 0
61
- },
62
- "text/plain": [
63
- "Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
64
- ]
65
- },
66
- "metadata": {},
67
- "output_type": "display_data"
68
- },
69
- {
70
- "data": {
71
- "application/vnd.jupyter.widget-view+json": {
72
- "model_id": "8c39ff01100c40aab1191514bb52399d",
73
- "version_major": 2,
74
- "version_minor": 0
75
- },
76
- "text/plain": [
77
- "Downloading: 0%| | 0.00/214 [00:00<?, ?B/s]"
78
- ]
79
- },
80
- "metadata": {},
81
- "output_type": "display_data"
82
- },
83
- {
84
- "data": {
85
- "application/vnd.jupyter.widget-view+json": {
86
- "model_id": "5078aede36ff4dc1a17e24a967cb46dd",
87
- "version_major": 2,
88
- "version_minor": 0
89
- },
90
- "text/plain": [
91
- "Downloading: 0%| | 0.00/260 [00:00<?, ?B/s]"
92
- ]
93
- },
94
- "metadata": {},
95
- "output_type": "display_data"
96
- },
97
- {
98
- "data": {
99
- "application/vnd.jupyter.widget-view+json": {
100
- "model_id": "a4c7e0e6e26f4c5e8f40e79b67b61c17",
101
- "version_major": 2,
102
- "version_minor": 0
103
- },
104
- "text/plain": [
105
- "Downloading: 0%| | 0.00/795 [00:00<?, ?B/s]"
106
- ]
107
- },
108
- "metadata": {},
109
- "output_type": "display_data"
110
- },
111
- {
112
- "data": {
113
- "application/vnd.jupyter.widget-view+json": {
114
- "model_id": "b55bcb16d8d042059a6487391e1a51de",
115
- "version_major": 2,
116
- "version_minor": 0
117
- },
118
- "text/plain": [
119
- "Downloading: 0%| | 0.00/23.0 [00:00<?, ?B/s]"
120
- ]
121
- },
122
- "metadata": {},
123
- "output_type": "display_data"
124
- },
125
- {
126
- "data": {
127
- "application/vnd.jupyter.widget-view+json": {
128
- "model_id": "0546a666b47a4d418e62ccc8fec58bd4",
129
- "version_major": 2,
130
- "version_minor": 0
131
- },
132
- "text/plain": [
133
- "Downloading: 0%| | 0.00/309 [00:00<?, ?B/s]"
134
- ]
135
- },
136
- "metadata": {},
137
- "output_type": "display_data"
138
  }
139
  ],
140
  "source": [
@@ -144,8 +63,8 @@
144
  },
145
  {
146
  "cell_type": "code",
147
- "execution_count": 3,
148
- "id": "cf72163d",
149
  "metadata": {},
150
  "outputs": [
151
  {
@@ -163,8 +82,8 @@
163
  },
164
  {
165
  "cell_type": "code",
166
- "execution_count": 4,
167
- "id": "6b30d6ea",
168
  "metadata": {},
169
  "outputs": [],
170
  "source": [
@@ -175,8 +94,8 @@
175
  },
176
  {
177
  "cell_type": "code",
178
- "execution_count": 5,
179
- "id": "bf9734cc",
180
  "metadata": {},
181
  "outputs": [],
182
  "source": [
@@ -185,8 +104,8 @@
185
  },
186
  {
187
  "cell_type": "code",
188
- "execution_count": 8,
189
- "id": "5e74effa",
190
  "metadata": {},
191
  "outputs": [
192
  {
@@ -199,7 +118,7 @@
199
  " 'sentence': 'แžŸแŸŠแžธ แžŠแžถแž…แŸ‹ แž˜แŸ‰แžผแžแžผ แž“แŸ… แž–แŸแž› แžŠแŸ‚แž› แž”แŸ’แžšแžพ แžฑแŸ’แž™ แžŒแžปแž” แžŸแž˜แŸ’แž—แžถแžšแŸˆ แž‚แŸ’แžšแžฟแž„ แžŸแž„แŸ’แž แžถแžšแžนแž˜ แž™แž€ แž‘แŸ… แžฑแŸ’แž™ แž˜แŸ‰แžผแž™ แž“แŸ… แž˜แŸ’แžŠแžปแŸ† แžœแžแŸ’แžŠ แžŸแŸ†แžšแŸ„แž„แžขแžŽแŸ’แžŠแŸ‚แž'}"
200
  ]
201
  },
202
- "execution_count": 8,
203
  "metadata": {},
204
  "output_type": "execute_result"
205
  }
@@ -210,8 +129,8 @@
210
  },
211
  {
212
  "cell_type": "code",
213
- "execution_count": 11,
214
- "id": "c94de1a7",
215
  "metadata": {},
216
  "outputs": [],
217
  "source": [
@@ -229,14 +148,14 @@
229
  },
230
  {
231
  "cell_type": "code",
232
- "execution_count": 12,
233
- "id": "c018376a",
234
  "metadata": {},
235
  "outputs": [
236
  {
237
  "data": {
238
  "application/vnd.jupyter.widget-view+json": {
239
- "model_id": "e9ec232c2caf4bdcb62b24696217a723",
240
  "version_major": 2,
241
  "version_minor": 0
242
  },
@@ -254,18 +173,18 @@
254
  },
255
  {
256
  "cell_type": "code",
257
- "execution_count": 45,
258
- "id": "c0a02606",
259
  "metadata": {},
260
  "outputs": [],
261
  "source": [
262
- "i = 21"
263
  ]
264
  },
265
  {
266
  "cell_type": "code",
267
- "execution_count": 46,
268
- "id": "19b8e75f",
269
  "metadata": {},
270
  "outputs": [
271
  {
@@ -284,8 +203,8 @@
284
  },
285
  {
286
  "cell_type": "code",
287
- "execution_count": 47,
288
- "id": "d15fb5a5",
289
  "metadata": {},
290
  "outputs": [
291
  {
@@ -293,10 +212,10 @@
293
  "output_type": "stream",
294
  "text": [
295
  "Prediction:\n",
296
- "แž€แžถแž”แžผแž แŸ’แžœแžผแž“แžšแŸ‰แžถแž“ แž“แžทแž„ แž€แžผแž แŸ’แžœแžถแž€แž‘แŸแžš แž‡แžถ แž˜แžทแžแŸ’แž แž“แžทแž„ แž‚แŸ’แž“แžถ\n",
297
  "\n",
298
  "Reference:\n",
299
- "แž€แžถแž”แžผแž แŸ’แžœแžผแž“แžšแŸ‰แžถแž“ แž“แžทแž„ แž€แžผแž แŸ’แžœแžถแž€แŸ‹แž‘แŸแžš แž‡แžถ แž˜แžทแžแŸ’แž แž“แžทแž„ แž‚แŸ’แž“แžถ\n"
300
  ]
301
  }
302
  ],
@@ -313,7 +232,7 @@
313
  {
314
  "cell_type": "code",
315
  "execution_count": null,
316
- "id": "07bc1b8e",
317
  "metadata": {},
318
  "outputs": [],
319
  "source": []
@@ -321,7 +240,7 @@
321
  {
322
  "cell_type": "code",
323
  "execution_count": null,
324
- "id": "b228faa1",
325
  "metadata": {},
326
  "outputs": [],
327
  "source": []
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "438927ca",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
15
  },
16
  {
17
  "cell_type": "code",
18
+ "execution_count": 5,
19
+ "id": "27a57965",
20
  "metadata": {},
21
+ "outputs": [],
 
 
 
 
 
 
 
 
22
  "source": [
23
+ "model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
24
+ "processor = Wav2Vec2Processor.from_pretrained(\".\")"
25
  ]
26
  },
27
  {
28
  "cell_type": "code",
29
+ "execution_count": 3,
30
+ "id": "1d4324df",
31
+ "metadata": {
32
+ "collapsed": true,
33
+ "jupyter": {
34
+ "outputs_hidden": true
35
+ }
36
+ },
37
  "outputs": [
38
  {
39
+ "ename": "JSONDecodeError",
40
+ "evalue": "Expecting value: line 1 column 1 (char 0)",
41
+ "output_type": "error",
42
+ "traceback": [
43
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
44
+ "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
45
+ "Input \u001b[0;32mIn [3]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m AutoModelForCTC\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvitouphy/xls-r-300m-km\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m processor \u001b[38;5;241m=\u001b[39m \u001b[43mWav2Vec2Processor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvitouphy/xls-r-300m-km\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
46
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:117\u001b[0m, in \u001b[0;36mWav2Vec2Processor.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;66;03m# load generic `AutoTokenizer`\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;66;03m# need fallback here for backward compatibility in case processor is\u001b[39;00m\n\u001b[1;32m 114\u001b[0m \u001b[38;5;66;03m# loaded from just a tokenizer file that does not have a `tokenizer_class` attribute\u001b[39;00m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# behavior should be deprecated in major future release\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 119\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 120\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLoading a tokenizer inside \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m from a config that does not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m include a `tokenizer_class` attribute is deprecated and will be \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[1;32m 126\u001b[0m )\n",
47
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py:514\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tokenizer_class \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 511\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 512\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTokenizer class \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtokenizer_class_candidate\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not exist or is not currently imported.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 513\u001b[0m )\n\u001b[0;32m--> 514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtokenizer_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 516\u001b[0m \u001b[38;5;66;03m# Otherwise we have to be creative.\u001b[39;00m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;66;03m# if model is an encoder decoder, the encoder tokenizer class is used by default\u001b[39;00m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(config, EncoderDecoderConfig):\n",
48
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1773\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1771\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloading file \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m from cache at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresolved_vocab_files[file_id]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 1773\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_from_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1774\u001b[0m \u001b[43m \u001b[49m\u001b[43mresolved_vocab_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1775\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1776\u001b[0m \u001b[43m \u001b[49m\u001b[43minit_configuration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1777\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1778\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1779\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1780\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1781\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
49
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1908\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase._from_pretrained\u001b[0;34m(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1906\u001b[0m \u001b[38;5;66;03m# Instantiate tokenizer.\u001b[39;00m\n\u001b[1;32m 1907\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1908\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1909\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 1910\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[1;32m 1911\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to load vocabulary from file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1912\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease check that the provided vocabulary is accessible and not corrupted.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1913\u001b[0m )\n",
50
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/tokenization_wav2vec2.py:142\u001b[0m, in \u001b[0;36mWav2Vec2CTCTokenizer.__init__\u001b[0;34m(self, vocab_file, bos_token, eos_token, unk_token, pad_token, word_delimiter_token, do_lower_case, **kwargs)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdo_lower_case \u001b[38;5;241m=\u001b[39m do_lower_case\n\u001b[1;32m 141\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(vocab_file, encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m vocab_handle:\n\u001b[0;32m--> 142\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvocab_handle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoder \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[1;32m 145\u001b[0m \u001b[38;5;66;03m# make sure that tokens made of several\u001b[39;00m\n\u001b[1;32m 146\u001b[0m \u001b[38;5;66;03m# characters are not split at tokenization\u001b[39;00m\n",
51
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:293\u001b[0m, in \u001b[0;36mload\u001b[0;34m(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(fp, \u001b[38;5;241m*\u001b[39m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 275\u001b[0m parse_int\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_constant\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_pairs_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw):\n\u001b[1;32m 276\u001b[0m \u001b[38;5;124;03m\"\"\"Deserialize ``fp`` (a ``.read()``-supporting file-like object containing\u001b[39;00m\n\u001b[1;32m 277\u001b[0m \u001b[38;5;124;03m a JSON document) to a Python object.\u001b[39;00m\n\u001b[1;32m 278\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;124;03m kwarg; otherwise ``JSONDecoder`` is used.\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_hook\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_float\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_float\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparse_int\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_int\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_constant\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_constant\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
52
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:357\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m kw[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 355\u001b[0m parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 356\u001b[0m parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[0;32m--> 357\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;241m=\u001b[39m JSONDecoder\n",
53
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, s, _w\u001b[38;5;241m=\u001b[39mWHITESPACE\u001b[38;5;241m.\u001b[39mmatch):\n\u001b[1;32m 333\u001b[0m \u001b[38;5;124;03m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[38;5;124;03m containing a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraw_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_w\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m end \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(s):\n",
54
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscan_once(s, idx)\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpecting value\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, err\u001b[38;5;241m.\u001b[39mvalue) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj, end\n",
55
+ "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
56
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
  ],
59
  "source": [
 
63
  },
64
  {
65
  "cell_type": "code",
66
+ "execution_count": 8,
67
+ "id": "3d61ff3b",
68
  "metadata": {},
69
  "outputs": [
70
  {
 
82
  },
83
  {
84
  "cell_type": "code",
85
+ "execution_count": 9,
86
+ "id": "a03f3af4",
87
  "metadata": {},
88
  "outputs": [],
89
  "source": [
 
94
  },
95
  {
96
  "cell_type": "code",
97
+ "execution_count": 10,
98
+ "id": "9c88048b",
99
  "metadata": {},
100
  "outputs": [],
101
  "source": [
 
104
  },
105
  {
106
  "cell_type": "code",
107
+ "execution_count": 11,
108
+ "id": "f3bfc930",
109
  "metadata": {},
110
  "outputs": [
111
  {
 
118
  " 'sentence': 'แžŸแŸŠแžธ แžŠแžถแž…แŸ‹ แž˜แŸ‰แžผแžแžผ แž“แŸ… แž–แŸแž› แžŠแŸ‚แž› แž”แŸ’แžšแžพ แžฑแŸ’แž™ แžŒแžปแž” แžŸแž˜แŸ’แž—แžถแžšแŸˆ แž‚แŸ’แžšแžฟแž„ แžŸแž„แŸ’แž แžถแžšแžนแž˜ แž™แž€ แž‘แŸ… แžฑแŸ’แž™ แž˜แŸ‰แžผแž™ แž“แŸ… แž˜แŸ’แžŠแžปแŸ† แžœแžแŸ’แžŠ แžŸแŸ†แžšแŸ„แž„แžขแžŽแŸ’แžŠแŸ‚แž'}"
119
  ]
120
  },
121
+ "execution_count": 11,
122
  "metadata": {},
123
  "output_type": "execute_result"
124
  }
 
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": 12,
133
+ "id": "122a898b",
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
 
148
  },
149
  {
150
  "cell_type": "code",
151
+ "execution_count": 13,
152
+ "id": "153e7f45",
153
  "metadata": {},
154
  "outputs": [
155
  {
156
  "data": {
157
  "application/vnd.jupyter.widget-view+json": {
158
+ "model_id": "a0dd47d98a4e448c9f786ce464348946",
159
  "version_major": 2,
160
  "version_minor": 0
161
  },
 
173
  },
174
  {
175
  "cell_type": "code",
176
+ "execution_count": 17,
177
+ "id": "8947d307",
178
  "metadata": {},
179
  "outputs": [],
180
  "source": [
181
+ "i = 25"
182
  ]
183
  },
184
  {
185
  "cell_type": "code",
186
+ "execution_count": 18,
187
+ "id": "3d6b46ca",
188
  "metadata": {},
189
  "outputs": [
190
  {
 
203
  },
204
  {
205
  "cell_type": "code",
206
+ "execution_count": 19,
207
+ "id": "d1550ddc",
208
  "metadata": {},
209
  "outputs": [
210
  {
 
212
  "output_type": "stream",
213
  "text": [
214
  "Prediction:\n",
215
+ "แž€แŸ’แžšแžปแž„ แž”แŸ‰แž™แž”แŸ‰แŸ‚แž แž“แžทแž„ แžแŸ’แžœแžถแŸ‡แž‘แžนแž€ แžŸแŸ’แžขแžถแž แž”แŸ’แžšแžพ แž…แžถแž”แŸ‹ แž–แžทแžŸแž–แŸ’ แž”แŸ’แžแžถแž› แž€แŸ’แžšแŸ„แž™ แž‘แŸ…\n",
216
  "\n",
217
  "Reference:\n",
218
+ "แž€แŸ’แžšแžปแž„ แž”แŸ‰แŸ„แž™แž”แŸ‰แŸ‚แž แž“แžนแž„ แžแŸ’แžœแŸ‡ แž‘แžนแž€ แžŸแŸ’แžขแžถแž แž”แŸ’แžšแžพ แž…แžถแž”แŸ‹ แž–แžธ แžŸแž”แŸ’แžแžถแž แŸ แž€แŸ’แžšแŸ„แž™ แž‘แŸ…\n"
219
  ]
220
  }
221
  ],
 
232
  {
233
  "cell_type": "code",
234
  "execution_count": null,
235
+ "id": "5bbf1c82",
236
  "metadata": {},
237
  "outputs": [],
238
  "source": []
 
240
  {
241
  "cell_type": "code",
242
  "execution_count": null,
243
+ "id": "71b6f502",
244
  "metadata": {},
245
  "outputs": [],
246
  "source": []
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
train_kh.ipynb CHANGED
@@ -3,7 +3,7 @@
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
- "id": "1ffb676e",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -16,7 +16,7 @@
16
  {
17
  "cell_type": "code",
18
  "execution_count": null,
19
- "id": "4aaaf656",
20
  "metadata": {
21
  "collapsed": true,
22
  "jupyter": {
@@ -19167,7 +19167,7 @@
19167
  },
19168
  {
19169
  "cell_type": "markdown",
19170
- "id": "ae085c17",
19171
  "metadata": {},
19172
  "source": [
19173
  "### Load KH Data"
@@ -19176,7 +19176,7 @@
19176
  {
19177
  "cell_type": "code",
19178
  "execution_count": 4,
19179
- "id": "4bc84318",
19180
  "metadata": {},
19181
  "outputs": [],
19182
  "source": [
@@ -19197,7 +19197,7 @@
19197
  {
19198
  "cell_type": "code",
19199
  "execution_count": 5,
19200
- "id": "e5bc62ba",
19201
  "metadata": {},
19202
  "outputs": [
19203
  {
@@ -19307,7 +19307,7 @@
19307
  {
19308
  "cell_type": "code",
19309
  "execution_count": 6,
19310
- "id": "36718ca2",
19311
  "metadata": {},
19312
  "outputs": [],
19313
  "source": [
@@ -19321,7 +19321,7 @@
19321
  },
19322
  {
19323
  "cell_type": "markdown",
19324
- "id": "ad45ec58",
19325
  "metadata": {},
19326
  "source": [
19327
  "### Clean Up the Text"
@@ -19330,7 +19330,7 @@
19330
  {
19331
  "cell_type": "code",
19332
  "execution_count": 6,
19333
- "id": "cceeec4b",
19334
  "metadata": {},
19335
  "outputs": [],
19336
  "source": [
@@ -19346,7 +19346,7 @@
19346
  {
19347
  "cell_type": "code",
19348
  "execution_count": 7,
19349
- "id": "79539c6a",
19350
  "metadata": {
19351
  "collapsed": true,
19352
  "jupyter": {
@@ -19402,7 +19402,7 @@
19402
  {
19403
  "cell_type": "code",
19404
  "execution_count": 7,
19405
- "id": "d06a790a",
19406
  "metadata": {},
19407
  "outputs": [
19408
  {
@@ -19423,7 +19423,7 @@
19423
  },
19424
  {
19425
  "cell_type": "markdown",
19426
- "id": "ca7019af",
19427
  "metadata": {},
19428
  "source": [
19429
  "### Build Character"
@@ -19432,7 +19432,7 @@
19432
  {
19433
  "cell_type": "code",
19434
  "execution_count": 8,
19435
- "id": "9e6c5aeb",
19436
  "metadata": {},
19437
  "outputs": [
19438
  {
@@ -19480,7 +19480,7 @@
19480
  {
19481
  "cell_type": "code",
19482
  "execution_count": 9,
19483
- "id": "04999981",
19484
  "metadata": {},
19485
  "outputs": [],
19486
  "source": [
@@ -19491,7 +19491,7 @@
19491
  {
19492
  "cell_type": "code",
19493
  "execution_count": 10,
19494
- "id": "0a62cbcf",
19495
  "metadata": {},
19496
  "outputs": [
19497
  {
@@ -19509,7 +19509,7 @@
19509
  {
19510
  "cell_type": "code",
19511
  "execution_count": 11,
19512
- "id": "c388cffc",
19513
  "metadata": {},
19514
  "outputs": [
19515
  {
@@ -19536,7 +19536,7 @@
19536
  {
19537
  "cell_type": "code",
19538
  "execution_count": 12,
19539
- "id": "6539221d",
19540
  "metadata": {},
19541
  "outputs": [
19542
  {
@@ -19554,7 +19554,7 @@
19554
  {
19555
  "cell_type": "code",
19556
  "execution_count": 13,
19557
- "id": "3e3bc229",
19558
  "metadata": {},
19559
  "outputs": [],
19560
  "source": [
@@ -19565,7 +19565,7 @@
19565
  },
19566
  {
19567
  "cell_type": "markdown",
19568
- "id": "90ef7d2e",
19569
  "metadata": {},
19570
  "source": [
19571
  "# Tokenizer"
@@ -19574,7 +19574,7 @@
19574
  {
19575
  "cell_type": "code",
19576
  "execution_count": 14,
19577
- "id": "5dca790c",
19578
  "metadata": {},
19579
  "outputs": [],
19580
  "source": [
@@ -19585,15 +19585,38 @@
19585
  },
19586
  {
19587
  "cell_type": "code",
19588
- "execution_count": 15,
19589
- "id": "2a18f061",
19590
  "metadata": {},
19591
  "outputs": [
19592
  {
19593
  "name": "stderr",
19594
  "output_type": "stream",
19595
  "text": [
19596
- "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19597
  ]
19598
  }
19599
  ],
@@ -19606,7 +19629,7 @@
19606
  {
19607
  "cell_type": "code",
19608
  "execution_count": 26,
19609
- "id": "4f1ff378",
19610
  "metadata": {},
19611
  "outputs": [],
19612
  "source": [
@@ -19623,7 +19646,7 @@
19623
  {
19624
  "cell_type": "code",
19625
  "execution_count": 27,
19626
- "id": "1c8d8b0e",
19627
  "metadata": {},
19628
  "outputs": [
19629
  {
@@ -19663,7 +19686,7 @@
19663
  {
19664
  "cell_type": "code",
19665
  "execution_count": 17,
19666
- "id": "4f5cbac0",
19667
  "metadata": {},
19668
  "outputs": [],
19669
  "source": [
@@ -19674,7 +19697,7 @@
19674
  {
19675
  "cell_type": "code",
19676
  "execution_count": 18,
19677
- "id": "46a14b30",
19678
  "metadata": {},
19679
  "outputs": [
19680
  {
@@ -19699,7 +19722,7 @@
19699
  {
19700
  "cell_type": "code",
19701
  "execution_count": 19,
19702
- "id": "7b54ffa4",
19703
  "metadata": {},
19704
  "outputs": [
19705
  {
@@ -19746,7 +19769,7 @@
19746
  {
19747
  "cell_type": "code",
19748
  "execution_count": 20,
19749
- "id": "0a51643b",
19750
  "metadata": {},
19751
  "outputs": [],
19752
  "source": [
@@ -19768,7 +19791,7 @@
19768
  {
19769
  "cell_type": "code",
19770
  "execution_count": 22,
19771
- "id": "e6a3137b",
19772
  "metadata": {},
19773
  "outputs": [],
19774
  "source": [
@@ -19779,7 +19802,7 @@
19779
  {
19780
  "cell_type": "code",
19781
  "execution_count": 41,
19782
- "id": "ffe2e1e6",
19783
  "metadata": {},
19784
  "outputs": [],
19785
  "source": [
@@ -19791,7 +19814,7 @@
19791
  {
19792
  "cell_type": "code",
19793
  "execution_count": 25,
19794
- "id": "a1aec6d5",
19795
  "metadata": {},
19796
  "outputs": [],
19797
  "source": [
@@ -19851,7 +19874,7 @@
19851
  {
19852
  "cell_type": "code",
19853
  "execution_count": 26,
19854
- "id": "1f73c038",
19855
  "metadata": {},
19856
  "outputs": [],
19857
  "source": [
@@ -19861,7 +19884,7 @@
19861
  {
19862
  "cell_type": "code",
19863
  "execution_count": 27,
19864
- "id": "322220b4",
19865
  "metadata": {},
19866
  "outputs": [],
19867
  "source": [
@@ -19872,7 +19895,7 @@
19872
  {
19873
  "cell_type": "code",
19874
  "execution_count": 44,
19875
- "id": "19d15ad2",
19876
  "metadata": {},
19877
  "outputs": [],
19878
  "source": [
@@ -19897,7 +19920,7 @@
19897
  {
19898
  "cell_type": "code",
19899
  "execution_count": 45,
19900
- "id": "fa0dd3e4",
19901
  "metadata": {
19902
  "collapsed": true,
19903
  "jupyter": {
@@ -20048,7 +20071,7 @@
20048
  {
20049
  "cell_type": "code",
20050
  "execution_count": 46,
20051
- "id": "6b01e8e0",
20052
  "metadata": {},
20053
  "outputs": [],
20054
  "source": [
@@ -20058,7 +20081,7 @@
20058
  {
20059
  "cell_type": "code",
20060
  "execution_count": 47,
20061
- "id": "3fe63254",
20062
  "metadata": {},
20063
  "outputs": [
20064
  {
@@ -20095,7 +20118,7 @@
20095
  {
20096
  "cell_type": "code",
20097
  "execution_count": 48,
20098
- "id": "745522a2",
20099
  "metadata": {},
20100
  "outputs": [
20101
  {
@@ -20123,7 +20146,7 @@
20123
  {
20124
  "cell_type": "code",
20125
  "execution_count": 49,
20126
- "id": "a2a214a5",
20127
  "metadata": {
20128
  "collapsed": true,
20129
  "jupyter": {
@@ -20764,7 +20787,7 @@
20764
  {
20765
  "cell_type": "code",
20766
  "execution_count": 57,
20767
- "id": "6cc9f33d",
20768
  "metadata": {},
20769
  "outputs": [
20770
  {
@@ -20784,7 +20807,7 @@
20784
  {
20785
  "cell_type": "code",
20786
  "execution_count": 53,
20787
- "id": "72f2e951",
20788
  "metadata": {},
20789
  "outputs": [],
20790
  "source": [
@@ -20801,7 +20824,7 @@
20801
  {
20802
  "cell_type": "code",
20803
  "execution_count": 54,
20804
- "id": "78d7353f",
20805
  "metadata": {},
20806
  "outputs": [
20807
  {
@@ -20819,52 +20842,54 @@
20819
  },
20820
  {
20821
  "cell_type": "code",
20822
- "execution_count": 58,
20823
- "id": "d9bb5fa1",
20824
  "metadata": {},
20825
  "outputs": [
20826
  {
20827
  "name": "stderr",
20828
  "output_type": "stream",
20829
  "text": [
20830
- "Configuration saved in vitouphy/xls-r-300m-km/config.json\n",
20831
- "Model weights saved in vitouphy/xls-r-300m-km/pytorch_model.bin\n",
20832
- "Several commits (2) will be pushed upstream.\n",
20833
- "The progress bars may be unreliable.\n"
20834
  ]
20835
  },
20836
  {
20837
  "data": {
20838
  "application/vnd.jupyter.widget-view+json": {
20839
- "model_id": "65189fc5b517439b87208f0898179afd",
20840
  "version_major": 2,
20841
  "version_minor": 0
20842
  },
20843
  "text/plain": [
20844
- "Upload file pytorch_model.bin: 0%| | 3.39k/1.18G [00:00<?, ?B/s]"
20845
  ]
20846
  },
20847
  "metadata": {},
20848
  "output_type": "display_data"
20849
  },
20850
- {
20851
- "name": "stderr",
20852
- "output_type": "stream",
20853
- "text": [
20854
- "To https://huggingface.co/vitouphy/xls-r-300m-km\n",
20855
- " e25c362..dff1f30 main -> main\n",
20856
- "\n"
20857
- ]
20858
- },
20859
  {
20860
  "data": {
 
 
 
 
 
20861
  "text/plain": [
20862
- "'https://huggingface.co/vitouphy/xls-r-300m-km/commit/dff1f3008b5c2afbbbcab722e17fded4bf8f782b'"
20863
  ]
20864
  },
20865
- "execution_count": 58,
20866
  "metadata": {},
20867
- "output_type": "execute_result"
 
 
 
 
 
 
 
 
20868
  }
20869
  ],
20870
  "source": [
@@ -20873,8 +20898,8 @@
20873
  },
20874
  {
20875
  "cell_type": "code",
20876
- "execution_count": 59,
20877
- "id": "e56874c6",
20878
  "metadata": {},
20879
  "outputs": [
20880
  {
@@ -20891,6 +20916,14 @@
20891
  "source": [
20892
  "trainer.save_model()"
20893
  ]
 
 
 
 
 
 
 
 
20894
  }
20895
  ],
20896
  "metadata": {
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
+ "id": "1bf32ef8",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
16
  {
17
  "cell_type": "code",
18
  "execution_count": null,
19
+ "id": "d2deec6c",
20
  "metadata": {
21
  "collapsed": true,
22
  "jupyter": {
 
19167
  },
19168
  {
19169
  "cell_type": "markdown",
19170
+ "id": "6fe38e7a",
19171
  "metadata": {},
19172
  "source": [
19173
  "### Load KH Data"
 
19176
  {
19177
  "cell_type": "code",
19178
  "execution_count": 4,
19179
+ "id": "b75f1fec",
19180
  "metadata": {},
19181
  "outputs": [],
19182
  "source": [
 
19197
  {
19198
  "cell_type": "code",
19199
  "execution_count": 5,
19200
+ "id": "433fe749",
19201
  "metadata": {},
19202
  "outputs": [
19203
  {
 
19307
  {
19308
  "cell_type": "code",
19309
  "execution_count": 6,
19310
+ "id": "c6d633ad",
19311
  "metadata": {},
19312
  "outputs": [],
19313
  "source": [
 
19321
  },
19322
  {
19323
  "cell_type": "markdown",
19324
+ "id": "acb914d0",
19325
  "metadata": {},
19326
  "source": [
19327
  "### Clean Up the Text"
 
19330
  {
19331
  "cell_type": "code",
19332
  "execution_count": 6,
19333
+ "id": "bc3a017b",
19334
  "metadata": {},
19335
  "outputs": [],
19336
  "source": [
 
19346
  {
19347
  "cell_type": "code",
19348
  "execution_count": 7,
19349
+ "id": "4a7b6a10",
19350
  "metadata": {
19351
  "collapsed": true,
19352
  "jupyter": {
 
19402
  {
19403
  "cell_type": "code",
19404
  "execution_count": 7,
19405
+ "id": "7f511e3f",
19406
  "metadata": {},
19407
  "outputs": [
19408
  {
 
19423
  },
19424
  {
19425
  "cell_type": "markdown",
19426
+ "id": "205a6e23",
19427
  "metadata": {},
19428
  "source": [
19429
  "### Build Character"
 
19432
  {
19433
  "cell_type": "code",
19434
  "execution_count": 8,
19435
+ "id": "48a97fac",
19436
  "metadata": {},
19437
  "outputs": [
19438
  {
 
19480
  {
19481
  "cell_type": "code",
19482
  "execution_count": 9,
19483
+ "id": "9b4ac5f7",
19484
  "metadata": {},
19485
  "outputs": [],
19486
  "source": [
 
19491
  {
19492
  "cell_type": "code",
19493
  "execution_count": 10,
19494
+ "id": "a9a07875",
19495
  "metadata": {},
19496
  "outputs": [
19497
  {
 
19509
  {
19510
  "cell_type": "code",
19511
  "execution_count": 11,
19512
+ "id": "8a3d39d8",
19513
  "metadata": {},
19514
  "outputs": [
19515
  {
 
19536
  {
19537
  "cell_type": "code",
19538
  "execution_count": 12,
19539
+ "id": "934a4070",
19540
  "metadata": {},
19541
  "outputs": [
19542
  {
 
19554
  {
19555
  "cell_type": "code",
19556
  "execution_count": 13,
19557
+ "id": "7f42a2b4",
19558
  "metadata": {},
19559
  "outputs": [],
19560
  "source": [
 
19565
  },
19566
  {
19567
  "cell_type": "markdown",
19568
+ "id": "9a504bc4",
19569
  "metadata": {},
19570
  "source": [
19571
  "# Tokenizer"
 
19574
  {
19575
  "cell_type": "code",
19576
  "execution_count": 14,
19577
+ "id": "0cec90b4",
19578
  "metadata": {},
19579
  "outputs": [],
19580
  "source": [
 
19585
  },
19586
  {
19587
  "cell_type": "code",
19588
+ "execution_count": 62,
19589
+ "id": "dc9e79da",
19590
  "metadata": {},
19591
  "outputs": [
19592
  {
19593
  "name": "stderr",
19594
  "output_type": "stream",
19595
  "text": [
19596
+ "Didn't find file ./tokenizer.json. We won't load it.\n",
19597
+ "loading file ./vocab.json\n",
19598
+ "loading file ./tokenizer_config.json\n",
19599
+ "loading file ./added_tokens.json\n",
19600
+ "loading file ./special_tokens_map.json\n",
19601
+ "loading file None\n"
19602
+ ]
19603
+ },
19604
+ {
19605
+ "ename": "JSONDecodeError",
19606
+ "evalue": "Expecting value: line 1 column 1 (char 0)",
19607
+ "output_type": "error",
19608
+ "traceback": [
19609
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
19610
+ "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
19611
+ "Input \u001b[0;32mIn [62]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mWav2Vec2CTCTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43munk_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m[UNK]\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpad_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m[PAD]\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword_delimiter_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m|\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# './' load vocab.json in the current directory\u001b[39;00m\n\u001b[1;32m 2\u001b[0m feature_extractor \u001b[38;5;241m=\u001b[39m Wav2Vec2FeatureExtractor(feature_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, sampling_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16000\u001b[39m, padding_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.0\u001b[39m, do_normalize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_attention_mask\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m) \n\u001b[1;32m 3\u001b[0m processor \u001b[38;5;241m=\u001b[39m Wav2Vec2Processor(feature_extractor\u001b[38;5;241m=\u001b[39mfeature_extractor, tokenizer\u001b[38;5;241m=\u001b[39mtokenizer)\n",
19612
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1773\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1771\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloading file \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m from cache at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresolved_vocab_files[file_id]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 1773\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_from_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1774\u001b[0m \u001b[43m \u001b[49m\u001b[43mresolved_vocab_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1775\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1776\u001b[0m \u001b[43m \u001b[49m\u001b[43minit_configuration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1777\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1778\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1779\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1780\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1781\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
19613
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1908\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase._from_pretrained\u001b[0;34m(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1906\u001b[0m \u001b[38;5;66;03m# Instantiate tokenizer.\u001b[39;00m\n\u001b[1;32m 1907\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1908\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1909\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 1910\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[1;32m 1911\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to load vocabulary from file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1912\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease check that the provided vocabulary is accessible and not corrupted.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1913\u001b[0m )\n",
19614
+ "File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/tokenization_wav2vec2.py:142\u001b[0m, in \u001b[0;36mWav2Vec2CTCTokenizer.__init__\u001b[0;34m(self, vocab_file, bos_token, eos_token, unk_token, pad_token, word_delimiter_token, do_lower_case, **kwargs)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdo_lower_case \u001b[38;5;241m=\u001b[39m do_lower_case\n\u001b[1;32m 141\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(vocab_file, encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m vocab_handle:\n\u001b[0;32m--> 142\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvocab_handle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoder \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[1;32m 145\u001b[0m \u001b[38;5;66;03m# make sure that tokens made of several\u001b[39;00m\n\u001b[1;32m 146\u001b[0m \u001b[38;5;66;03m# characters are not split at tokenization\u001b[39;00m\n",
19615
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:293\u001b[0m, in \u001b[0;36mload\u001b[0;34m(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(fp, \u001b[38;5;241m*\u001b[39m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 275\u001b[0m parse_int\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_constant\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_pairs_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw):\n\u001b[1;32m 276\u001b[0m \u001b[38;5;124;03m\"\"\"Deserialize ``fp`` (a ``.read()``-supporting file-like object containing\u001b[39;00m\n\u001b[1;32m 277\u001b[0m \u001b[38;5;124;03m a JSON document) to a Python object.\u001b[39;00m\n\u001b[1;32m 278\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;124;03m kwarg; otherwise ``JSONDecoder`` is used.\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_hook\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_float\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_float\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparse_int\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_int\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_constant\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_constant\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
19616
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:357\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m kw[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 355\u001b[0m parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 356\u001b[0m parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[0;32m--> 357\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;241m=\u001b[39m JSONDecoder\n",
19617
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, s, _w\u001b[38;5;241m=\u001b[39mWHITESPACE\u001b[38;5;241m.\u001b[39mmatch):\n\u001b[1;32m 333\u001b[0m \u001b[38;5;124;03m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[38;5;124;03m containing a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraw_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_w\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m end \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(s):\n",
19618
+ "File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscan_once(s, idx)\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpecting value\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, err\u001b[38;5;241m.\u001b[39mvalue) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj, end\n",
19619
+ "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
19620
  ]
19621
  }
19622
  ],
 
19629
  {
19630
  "cell_type": "code",
19631
  "execution_count": 26,
19632
+ "id": "61738038",
19633
  "metadata": {},
19634
  "outputs": [],
19635
  "source": [
 
19646
  {
19647
  "cell_type": "code",
19648
  "execution_count": 27,
19649
+ "id": "4b72b9b8",
19650
  "metadata": {},
19651
  "outputs": [
19652
  {
 
19686
  {
19687
  "cell_type": "code",
19688
  "execution_count": 17,
19689
+ "id": "ccb4a36c",
19690
  "metadata": {},
19691
  "outputs": [],
19692
  "source": [
 
19697
  {
19698
  "cell_type": "code",
19699
  "execution_count": 18,
19700
+ "id": "cf9d1391",
19701
  "metadata": {},
19702
  "outputs": [
19703
  {
 
19722
  {
19723
  "cell_type": "code",
19724
  "execution_count": 19,
19725
+ "id": "57ea4c6f",
19726
  "metadata": {},
19727
  "outputs": [
19728
  {
 
19769
  {
19770
  "cell_type": "code",
19771
  "execution_count": 20,
19772
+ "id": "7cb9fd2a",
19773
  "metadata": {},
19774
  "outputs": [],
19775
  "source": [
 
19791
  {
19792
  "cell_type": "code",
19793
  "execution_count": 22,
19794
+ "id": "42f2952f",
19795
  "metadata": {},
19796
  "outputs": [],
19797
  "source": [
 
19802
  {
19803
  "cell_type": "code",
19804
  "execution_count": 41,
19805
+ "id": "fe093630",
19806
  "metadata": {},
19807
  "outputs": [],
19808
  "source": [
 
19814
  {
19815
  "cell_type": "code",
19816
  "execution_count": 25,
19817
+ "id": "a6efe782",
19818
  "metadata": {},
19819
  "outputs": [],
19820
  "source": [
 
19874
  {
19875
  "cell_type": "code",
19876
  "execution_count": 26,
19877
+ "id": "e82a3663",
19878
  "metadata": {},
19879
  "outputs": [],
19880
  "source": [
 
19884
  {
19885
  "cell_type": "code",
19886
  "execution_count": 27,
19887
+ "id": "1df03ab8",
19888
  "metadata": {},
19889
  "outputs": [],
19890
  "source": [
 
19895
  {
19896
  "cell_type": "code",
19897
  "execution_count": 44,
19898
+ "id": "8304f047",
19899
  "metadata": {},
19900
  "outputs": [],
19901
  "source": [
 
19920
  {
19921
  "cell_type": "code",
19922
  "execution_count": 45,
19923
+ "id": "f92c9b4d",
19924
  "metadata": {
19925
  "collapsed": true,
19926
  "jupyter": {
 
20071
  {
20072
  "cell_type": "code",
20073
  "execution_count": 46,
20074
+ "id": "7f2dd147",
20075
  "metadata": {},
20076
  "outputs": [],
20077
  "source": [
 
20081
  {
20082
  "cell_type": "code",
20083
  "execution_count": 47,
20084
+ "id": "3d27466c",
20085
  "metadata": {},
20086
  "outputs": [
20087
  {
 
20118
  {
20119
  "cell_type": "code",
20120
  "execution_count": 48,
20121
+ "id": "014ac4c9",
20122
  "metadata": {},
20123
  "outputs": [
20124
  {
 
20146
  {
20147
  "cell_type": "code",
20148
  "execution_count": 49,
20149
+ "id": "e6cb809a",
20150
  "metadata": {
20151
  "collapsed": true,
20152
  "jupyter": {
 
20787
  {
20788
  "cell_type": "code",
20789
  "execution_count": 57,
20790
+ "id": "57c2527b",
20791
  "metadata": {},
20792
  "outputs": [
20793
  {
 
20807
  {
20808
  "cell_type": "code",
20809
  "execution_count": 53,
20810
+ "id": "0211e267",
20811
  "metadata": {},
20812
  "outputs": [],
20813
  "source": [
 
20824
  {
20825
  "cell_type": "code",
20826
  "execution_count": 54,
20827
+ "id": "62f6fd3e",
20828
  "metadata": {},
20829
  "outputs": [
20830
  {
 
20842
  },
20843
  {
20844
  "cell_type": "code",
20845
+ "execution_count": 60,
20846
+ "id": "b050fb9f",
20847
  "metadata": {},
20848
  "outputs": [
20849
  {
20850
  "name": "stderr",
20851
  "output_type": "stream",
20852
  "text": [
20853
+ "/opt/conda/lib/python3.8/site-packages/huggingface_hub/hf_api.py:1001: FutureWarning: `create_repo` now takes `token` as an optional positional argument. Be sure to adapt your code!\n",
20854
+ " warnings.warn(\n",
20855
+ "Cloning https://huggingface.co/vitouphy/xls-r-300m-km into local empty directory.\n"
 
20856
  ]
20857
  },
20858
  {
20859
  "data": {
20860
  "application/vnd.jupyter.widget-view+json": {
20861
+ "model_id": "331db7acce774ee3b699aa82a0451092",
20862
  "version_major": 2,
20863
  "version_minor": 0
20864
  },
20865
  "text/plain": [
20866
+ "Download file pytorch_model.bin: 0%| | 3.47k/1.18G [00:00<?, ?B/s]"
20867
  ]
20868
  },
20869
  "metadata": {},
20870
  "output_type": "display_data"
20871
  },
 
 
 
 
 
 
 
 
 
20872
  {
20873
  "data": {
20874
+ "application/vnd.jupyter.widget-view+json": {
20875
+ "model_id": "db90465291c64e9f82988698d2473234",
20876
+ "version_major": 2,
20877
+ "version_minor": 0
20878
+ },
20879
  "text/plain": [
20880
+ "Clean file pytorch_model.bin: 0%| | 1.00k/1.18G [00:00<?, ?B/s]"
20881
  ]
20882
  },
 
20883
  "metadata": {},
20884
+ "output_type": "display_data"
20885
+ },
20886
+ {
20887
+ "name": "stderr",
20888
+ "output_type": "stream",
20889
+ "text": [
20890
+ "Configuration saved in vitouphy/xls-r-300m-km/config.json\n",
20891
+ "Model weights saved in vitouphy/xls-r-300m-km/pytorch_model.bin\n"
20892
+ ]
20893
  }
20894
  ],
20895
  "source": [
 
20898
  },
20899
  {
20900
  "cell_type": "code",
20901
+ "execution_count": 61,
20902
+ "id": "9d7cb173",
20903
  "metadata": {},
20904
  "outputs": [
20905
  {
 
20916
  "source": [
20917
  "trainer.save_model()"
20918
  ]
20919
+ },
20920
+ {
20921
+ "cell_type": "code",
20922
+ "execution_count": null,
20923
+ "id": "8dc01ad4",
20924
+ "metadata": {},
20925
+ "outputs": [],
20926
+ "source": []
20927
  }
20928
  ],
20929
  "metadata": {
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c93624168d989aabdf33bd32b0dbfa8b32857515b2a2f04190df98bd15ae4e61
3
+ size 2991
vocab.json CHANGED
@@ -1,5 +1 @@
1
- <<<<<<< HEAD
2
- {"\u1780": 1, "\u1781": 2, "\u1782": 3, "\u1783": 4, "\u1784": 5, "\u1785": 6, "\u1786": 7, "\u1787": 8, "\u1788": 9, "\u1789": 10, "\u178a": 11, "\u178b": 12, "\u178c": 13, "\u178d": 14, "\u178e": 15, "\u178f": 16, "\u1790": 17, "\u1791": 18, "\u1792": 19, "\u1793": 20, "\u1794": 21, "\u1795": 22, "\u1796": 23, "\u1797": 24, "\u1798": 25, "\u1799": 26, "\u179a": 27, "\u179b": 28, "\u179c": 29, "\u179f": 30, "\u17a0": 31, "\u17a1": 32, "\u17a2": 33, "\u17a5": 34, "\u17a7": 35, "\u17aa": 36, "\u17ab": 37, "\u17ac": 38, "\u17ad": 39, "\u17ae": 40, "\u17af": 41, "\u17b1": 42, "\u17b6": 43, "\u17b7": 44, "\u17b8": 45, "\u17b9": 46, "\u17ba": 47, "\u17bb": 48, "\u17bc": 49, "\u17bd": 50, "\u17be": 51, "\u17bf": 52, "\u17c0": 53, "\u17c1": 54, "\u17c2": 55, "\u17c3": 56, "\u17c4": 57, "\u17c5": 58, "\u17c6": 59, "\u17c7": 60, "\u17c8": 61, "\u17c9": 62, "\u17ca": 63, "\u17cb": 64, "\u17cc": 65, "\u17cd": 66, "\u17ce": 67, "\u17cf": 68, "\u17d0": 69, "\u17d2": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
3
- =======
4
- {"แž€": 1, "แž": 2, "แž‚": 3, "แžƒ": 4, "แž„": 5, "แž…": 6, "แž†": 7, "แž‡": 8, "แžˆ": 9, "แž‰": 10, "แžŠ": 11, "แž‹": 12, "แžŒ": 13, "แž": 14, "แžŽ": 15, "แž": 16, "แž": 17, "แž‘": 18, "แž’": 19, "แž“": 20, "แž”": 21, "แž•": 22, "แž–": 23, "แž—": 24, "แž˜": 25, "แž™": 26, "แžš": 27, "แž›": 28, "แžœ": 29, "แžŸ": 30, "แž ": 31, "แžก": 32, "แžข": 33, "แžฅ": 34, "แžง": 35, "แžช": 36, "แžซ": 37, "แžฌ": 38, "แžญ": 39, "แžฎ": 40, "แžฏ": 41, "แžฑ": 42, "แžถ": 43, "แžท": 44, "แžธ": 45, "แžน": 46, "แžบ": 47, "แžป": 48, "แžผ": 49, "แžฝ": 50, "แžพ": 51, "แžฟ": 52, "แŸ€": 53, "แŸ": 54, "แŸ‚": 55, "แŸƒ": 56, "แŸ„": 57, "แŸ…": 58, "แŸ†": 59, "แŸ‡": 60, "แŸˆ": 61, "แŸ‰": 62, "แŸŠ": 63, "แŸ‹": 64, "แŸŒ": 65, "แŸ": 66, "แŸŽ": 67, "แŸ": 68, "แŸ": 69, "แŸ’": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
5
- >>>>>>> dff1f3008b5c2afbbbcab722e17fded4bf8f782b
 
1
+ {"แž€": 1, "แž": 2, "แž‚": 3, "แžƒ": 4, "แž„": 5, "แž…": 6, "แž†": 7, "แž‡": 8, "แžˆ": 9, "แž‰": 10, "แžŠ": 11, "แž‹": 12, "แžŒ": 13, "แž": 14, "แžŽ": 15, "แž": 16, "แž": 17, "แž‘": 18, "แž’": 19, "แž“": 20, "แž”": 21, "แž•": 22, "แž–": 23, "แž—": 24, "แž˜": 25, "แž™": 26, "แžš": 27, "แž›": 28, "แžœ": 29, "แžŸ": 30, "แž ": 31, "แžก": 32, "แžข": 33, "แžฅ": 34, "แžง": 35, "แžช": 36, "แžซ": 37, "แžฌ": 38, "แžญ": 39, "แžฎ": 40, "แžฏ": 41, "แžฑ": 42, "แžถ": 43, "แžท": 44, "แžธ": 45, "แžน": 46, "แžบ": 47, "แžป": 48, "แžผ": 49, "แžฝ": 50, "แžพ": 51, "แžฟ": 52, "แŸ€": 53, "แŸ": 54, "แŸ‚": 55, "แŸƒ": 56, "แŸ„": 57, "แŸ…": 58, "แŸ†": 59, "แŸ‡": 60, "แŸˆ": 61, "แŸ‰": 62, "แŸŠ": 63, "แŸ‹": 64, "แŸŒ": 65, "แŸ": 66, "แŸŽ": 67, "แŸ": 68, "แŸ": 69, "แŸ’": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}