fix vocab json file
- inference.ipynb +58 -139
- preprocessor_config.json +9 -0
- train_kh.ipynb +97 -64
- training_args.bin +3 -0
- vocab.json +1 -5
inference.ipynb
CHANGED
@@ -2,8 +2,8 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
@@ -15,126 +15,45 @@
|
|
15 |
},
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
-
"execution_count":
|
19 |
-
"id": "
|
20 |
"metadata": {},
|
21 |
-
"outputs": [
|
22 |
-
{
|
23 |
-
"name": "stderr",
|
24 |
-
"output_type": "stream",
|
25 |
-
"text": [
|
26 |
-
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
27 |
-
]
|
28 |
-
}
|
29 |
-
],
|
30 |
"source": [
|
31 |
-
"
|
32 |
-
"
|
33 |
]
|
34 |
},
|
35 |
{
|
36 |
"cell_type": "code",
|
37 |
-
"execution_count":
|
38 |
-
"id": "
|
39 |
-
"metadata": {
|
|
|
40 |
"outputs": [
|
41 |
{
|
42-59 |
-
(removed notebook widget-output lines 42-59; content truncated in this view)
60 |
-
"version_minor": 0
|
61 |
-
},
|
62 |
-
"text/plain": [
|
63 |
-
"Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
|
64 |
-
]
|
65 |
-
},
|
66 |
-
"metadata": {},
|
67 |
-
"output_type": "display_data"
|
68 |
-
},
|
69 |
-
{
|
70 |
-
"data": {
|
71 |
-
"application/vnd.jupyter.widget-view+json": {
|
72 |
-
"model_id": "8c39ff01100c40aab1191514bb52399d",
|
73 |
-
"version_major": 2,
|
74 |
-
"version_minor": 0
|
75 |
-
},
|
76 |
-
"text/plain": [
|
77 |
-
"Downloading: 0%| | 0.00/214 [00:00<?, ?B/s]"
|
78 |
-
]
|
79 |
-
},
|
80 |
-
"metadata": {},
|
81 |
-
"output_type": "display_data"
|
82 |
-
},
|
83 |
-
{
|
84 |
-
"data": {
|
85 |
-
"application/vnd.jupyter.widget-view+json": {
|
86 |
-
"model_id": "5078aede36ff4dc1a17e24a967cb46dd",
|
87 |
-
"version_major": 2,
|
88 |
-
"version_minor": 0
|
89 |
-
},
|
90 |
-
"text/plain": [
|
91 |
-
"Downloading: 0%| | 0.00/260 [00:00<?, ?B/s]"
|
92 |
-
]
|
93 |
-
},
|
94 |
-
"metadata": {},
|
95 |
-
"output_type": "display_data"
|
96 |
-
},
|
97 |
-
{
|
98 |
-
"data": {
|
99 |
-
"application/vnd.jupyter.widget-view+json": {
|
100 |
-
"model_id": "a4c7e0e6e26f4c5e8f40e79b67b61c17",
|
101 |
-
"version_major": 2,
|
102 |
-
"version_minor": 0
|
103 |
-
},
|
104 |
-
"text/plain": [
|
105 |
-
"Downloading: 0%| | 0.00/795 [00:00<?, ?B/s]"
|
106 |
-
]
|
107 |
-
},
|
108 |
-
"metadata": {},
|
109 |
-
"output_type": "display_data"
|
110 |
-
},
|
111 |
-
{
|
112 |
-
"data": {
|
113 |
-
"application/vnd.jupyter.widget-view+json": {
|
114 |
-
"model_id": "b55bcb16d8d042059a6487391e1a51de",
|
115 |
-
"version_major": 2,
|
116 |
-
"version_minor": 0
|
117 |
-
},
|
118 |
-
"text/plain": [
|
119 |
-
"Downloading: 0%| | 0.00/23.0 [00:00<?, ?B/s]"
|
120 |
-
]
|
121 |
-
},
|
122 |
-
"metadata": {},
|
123 |
-
"output_type": "display_data"
|
124 |
-
},
|
125 |
-
{
|
126 |
-
"data": {
|
127 |
-
"application/vnd.jupyter.widget-view+json": {
|
128 |
-
"model_id": "0546a666b47a4d418e62ccc8fec58bd4",
|
129 |
-
"version_major": 2,
|
130 |
-
"version_minor": 0
|
131 |
-
},
|
132 |
-
"text/plain": [
|
133 |
-
"Downloading: 0%| | 0.00/309 [00:00<?, ?B/s]"
|
134 |
-
]
|
135 |
-
},
|
136 |
-
"metadata": {},
|
137 |
-
"output_type": "display_data"
|
138 |
}
|
139 |
],
|
140 |
"source": [
|
@@ -144,8 +63,8 @@
|
|
144 |
},
|
145 |
{
|
146 |
"cell_type": "code",
|
147 |
-
"execution_count":
|
148 |
-
"id": "
|
149 |
"metadata": {},
|
150 |
"outputs": [
|
151 |
{
|
@@ -163,8 +82,8 @@
|
|
163 |
},
|
164 |
{
|
165 |
"cell_type": "code",
|
166 |
-
"execution_count":
|
167 |
-
"id": "
|
168 |
"metadata": {},
|
169 |
"outputs": [],
|
170 |
"source": [
|
@@ -175,8 +94,8 @@
|
|
175 |
},
|
176 |
{
|
177 |
"cell_type": "code",
|
178 |
-
"execution_count":
|
179 |
-
"id": "
|
180 |
"metadata": {},
|
181 |
"outputs": [],
|
182 |
"source": [
|
@@ -185,8 +104,8 @@
|
|
185 |
},
|
186 |
{
|
187 |
"cell_type": "code",
|
188 |
-
"execution_count":
|
189 |
-
"id": "
|
190 |
"metadata": {},
|
191 |
"outputs": [
|
192 |
{
|
@@ -199,7 +118,7 @@
|
|
199 |
" 'sentence': 'แแแธ แแถแ
แ แแแผแแผ แแ
แแแ แแแ แแแแพ แฑแแ แแปแ แแแแแถแแ แแแแฟแ แแแแ แถแแนแ แแ แแ
แฑแแ แแแผแ แแ
แแแแปแ แแแแ แแแแแแขแแแแแ'}"
|
200 |
]
|
201 |
},
|
202 |
-
"execution_count":
|
203 |
"metadata": {},
|
204 |
"output_type": "execute_result"
|
205 |
}
|
@@ -210,8 +129,8 @@
|
|
210 |
},
|
211 |
{
|
212 |
"cell_type": "code",
|
213 |
-
"execution_count":
|
214 |
-
"id": "
|
215 |
"metadata": {},
|
216 |
"outputs": [],
|
217 |
"source": [
|
@@ -229,14 +148,14 @@
|
|
229 |
},
|
230 |
{
|
231 |
"cell_type": "code",
|
232 |
-
"execution_count":
|
233 |
-
"id": "
|
234 |
"metadata": {},
|
235 |
"outputs": [
|
236 |
{
|
237 |
"data": {
|
238 |
"application/vnd.jupyter.widget-view+json": {
|
239 |
-
"model_id": "
|
240 |
"version_major": 2,
|
241 |
"version_minor": 0
|
242 |
},
|
@@ -254,18 +173,18 @@
|
|
254 |
},
|
255 |
{
|
256 |
"cell_type": "code",
|
257 |
-
"execution_count":
|
258 |
-
"id": "
|
259 |
"metadata": {},
|
260 |
"outputs": [],
|
261 |
"source": [
|
262 |
-
"i =
|
263 |
]
|
264 |
},
|
265 |
{
|
266 |
"cell_type": "code",
|
267 |
-
"execution_count":
|
268 |
-
"id": "
|
269 |
"metadata": {},
|
270 |
"outputs": [
|
271 |
{
|
@@ -284,8 +203,8 @@
|
|
284 |
},
|
285 |
{
|
286 |
"cell_type": "code",
|
287 |
-
"execution_count":
|
288 |
-
"id": "
|
289 |
"metadata": {},
|
290 |
"outputs": [
|
291 |
{
|
@@ -293,10 +212,10 @@
|
|
293 |
"output_type": "stream",
|
294 |
"text": [
|
295 |
"Prediction:\n",
|
296 |
-
"
|
297 |
"\n",
|
298 |
"Reference:\n",
|
299 |
-
"
|
300 |
]
|
301 |
}
|
302 |
],
|
@@ -313,7 +232,7 @@
|
|
313 |
{
|
314 |
"cell_type": "code",
|
315 |
"execution_count": null,
|
316 |
-
"id": "
|
317 |
"metadata": {},
|
318 |
"outputs": [],
|
319 |
"source": []
|
@@ -321,7 +240,7 @@
|
|
321 |
{
|
322 |
"cell_type": "code",
|
323 |
"execution_count": null,
|
324 |
-
"id": "
|
325 |
"metadata": {},
|
326 |
"outputs": [],
|
327 |
"source": []
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "438927ca",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
|
|
15 |
},
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
+
"execution_count": 5,
|
19 |
+
"id": "27a57965",
|
20 |
"metadata": {},
|
21 |
+
"outputs": [],
|
|
|
22 |
"source": [
|
23 |
+
"model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
|
24 |
+
"processor = Wav2Vec2Processor.from_pretrained(\".\")"
|
25 |
]
|
26 |
},
|
27 |
{
|
28 |
"cell_type": "code",
|
29 |
+
"execution_count": 3,
|
30 |
+
"id": "1d4324df",
|
31 |
+
"metadata": {
|
32 |
+
"collapsed": true,
|
33 |
+
"jupyter": {
|
34 |
+
"outputs_hidden": true
|
35 |
+
}
|
36 |
+
},
|
37 |
"outputs": [
|
38 |
{
|
39 |
+
"ename": "JSONDecodeError",
|
40 |
+
"evalue": "Expecting value: line 1 column 1 (char 0)",
|
41 |
+
"output_type": "error",
|
42 |
+
"traceback": [
|
43 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
44 |
+
"\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
|
45 |
+
"Input \u001b[0;32mIn [3]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m AutoModelForCTC\u001b[38;5;241m.\u001b[39mfrom_pretrained(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvitouphy/xls-r-300m-km\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m processor \u001b[38;5;241m=\u001b[39m \u001b[43mWav2Vec2Processor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvitouphy/xls-r-300m-km\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
46 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/processing_wav2vec2.py:117\u001b[0m, in \u001b[0;36mWav2Vec2Processor.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;66;03m# load generic `AutoTokenizer`\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;66;03m# need fallback here for backward compatibility in case processor is\u001b[39;00m\n\u001b[1;32m 114\u001b[0m \u001b[38;5;66;03m# loaded from just a tokenizer file that does not have a `tokenizer_class` attribute\u001b[39;00m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;66;03m# behavior should be deprecated in major future release\u001b[39;00m\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 117\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 119\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 120\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLoading a tokenizer inside \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m from a config that does not\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m include a `tokenizer_class` attribute is deprecated and will be \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[1;32m 126\u001b[0m )\n",
|
47 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py:514\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tokenizer_class \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 511\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 512\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTokenizer class \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtokenizer_class_candidate\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not exist or is not currently imported.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 513\u001b[0m )\n\u001b[0;32m--> 514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtokenizer_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 516\u001b[0m \u001b[38;5;66;03m# Otherwise we have to be creative.\u001b[39;00m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;66;03m# if model is an encoder decoder, the encoder tokenizer class is used by default\u001b[39;00m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(config, EncoderDecoderConfig):\n",
|
48 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1773\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1771\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloading file \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m from cache at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresolved_vocab_files[file_id]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 1773\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_from_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1774\u001b[0m \u001b[43m \u001b[49m\u001b[43mresolved_vocab_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1775\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1776\u001b[0m \u001b[43m \u001b[49m\u001b[43minit_configuration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1777\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1778\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1779\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1780\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1781\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
49 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1908\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase._from_pretrained\u001b[0;34m(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1906\u001b[0m \u001b[38;5;66;03m# Instantiate tokenizer.\u001b[39;00m\n\u001b[1;32m 1907\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1908\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1909\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 1910\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[1;32m 1911\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to load vocabulary from file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1912\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease check that the provided vocabulary is accessible and not corrupted.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1913\u001b[0m )\n",
|
50 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/tokenization_wav2vec2.py:142\u001b[0m, in \u001b[0;36mWav2Vec2CTCTokenizer.__init__\u001b[0;34m(self, vocab_file, bos_token, eos_token, unk_token, pad_token, word_delimiter_token, do_lower_case, **kwargs)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdo_lower_case \u001b[38;5;241m=\u001b[39m do_lower_case\n\u001b[1;32m 141\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(vocab_file, encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m vocab_handle:\n\u001b[0;32m--> 142\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvocab_handle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoder \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[1;32m 145\u001b[0m \u001b[38;5;66;03m# make sure that tokens made of several\u001b[39;00m\n\u001b[1;32m 146\u001b[0m \u001b[38;5;66;03m# characters are not split at tokenization\u001b[39;00m\n",
|
51 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:293\u001b[0m, in \u001b[0;36mload\u001b[0;34m(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(fp, \u001b[38;5;241m*\u001b[39m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 275\u001b[0m parse_int\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_constant\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_pairs_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw):\n\u001b[1;32m 276\u001b[0m \u001b[38;5;124;03m\"\"\"Deserialize ``fp`` (a ``.read()``-supporting file-like object containing\u001b[39;00m\n\u001b[1;32m 277\u001b[0m \u001b[38;5;124;03m a JSON document) to a Python object.\u001b[39;00m\n\u001b[1;32m 278\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;124;03m kwarg; otherwise ``JSONDecoder`` is used.\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_hook\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_float\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_float\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparse_int\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_int\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_constant\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_constant\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
|
52 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:357\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m kw[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 355\u001b[0m parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 356\u001b[0m parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[0;32m--> 357\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;241m=\u001b[39m JSONDecoder\n",
|
53 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, s, _w\u001b[38;5;241m=\u001b[39mWHITESPACE\u001b[38;5;241m.\u001b[39mmatch):\n\u001b[1;32m 333\u001b[0m \u001b[38;5;124;03m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[38;5;124;03m containing a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraw_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_w\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m end \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(s):\n",
|
54 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscan_once(s, idx)\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpecting value\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, err\u001b[38;5;241m.\u001b[39mvalue) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj, end\n",
|
55 |
+
"\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
|
56 |
+
]
|
|
|
57 |
}
|
58 |
],
|
59 |
"source": [
|
|
|
63 |
},
|
64 |
{
|
65 |
"cell_type": "code",
|
66 |
+
"execution_count": 8,
|
67 |
+
"id": "3d61ff3b",
|
68 |
"metadata": {},
|
69 |
"outputs": [
|
70 |
{
|
|
|
82 |
},
|
83 |
{
|
84 |
"cell_type": "code",
|
85 |
+
"execution_count": 9,
|
86 |
+
"id": "a03f3af4",
|
87 |
"metadata": {},
|
88 |
"outputs": [],
|
89 |
"source": [
|
|
|
94 |
},
|
95 |
{
|
96 |
"cell_type": "code",
|
97 |
+
"execution_count": 10,
|
98 |
+
"id": "9c88048b",
|
99 |
"metadata": {},
|
100 |
"outputs": [],
|
101 |
"source": [
|
|
|
104 |
},
|
105 |
{
|
106 |
"cell_type": "code",
|
107 |
+
"execution_count": 11,
|
108 |
+
"id": "f3bfc930",
|
109 |
"metadata": {},
|
110 |
"outputs": [
|
111 |
{
|
|
|
118 |
" 'sentence': 'แแแธ แแถแ
แ แแแผแแผ แแ
แแแ แแแ แแแแพ แฑแแ แแปแ แแแแแถแแ แแแแฟแ แแแแ แถแแนแ แแ แแ
แฑแแ แแแผแ แแ
แแแแปแ แแแแ แแแแแแขแแแแแ'}"
|
119 |
]
|
120 |
},
|
121 |
+
"execution_count": 11,
|
122 |
"metadata": {},
|
123 |
"output_type": "execute_result"
|
124 |
}
|
|
|
129 |
},
|
130 |
{
|
131 |
"cell_type": "code",
|
132 |
+
"execution_count": 12,
|
133 |
+
"id": "122a898b",
|
134 |
"metadata": {},
|
135 |
"outputs": [],
|
136 |
"source": [
|
|
|
148 |
},
|
149 |
{
|
150 |
"cell_type": "code",
|
151 |
+
"execution_count": 13,
|
152 |
+
"id": "153e7f45",
|
153 |
"metadata": {},
|
154 |
"outputs": [
|
155 |
{
|
156 |
"data": {
|
157 |
"application/vnd.jupyter.widget-view+json": {
|
158 |
+
"model_id": "a0dd47d98a4e448c9f786ce464348946",
|
159 |
"version_major": 2,
|
160 |
"version_minor": 0
|
161 |
},
|
|
|
173 |
},
|
174 |
{
|
175 |
"cell_type": "code",
|
176 |
+
"execution_count": 17,
|
177 |
+
"id": "8947d307",
|
178 |
"metadata": {},
|
179 |
"outputs": [],
|
180 |
"source": [
|
181 |
+
"i = 25"
|
182 |
]
|
183 |
},
|
184 |
{
|
185 |
"cell_type": "code",
|
186 |
+
"execution_count": 18,
|
187 |
+
"id": "3d6b46ca",
|
188 |
"metadata": {},
|
189 |
"outputs": [
|
190 |
{
|
|
|
203 |
},
|
204 |
{
|
205 |
"cell_type": "code",
|
206 |
+
"execution_count": 19,
|
207 |
+
"id": "d1550ddc",
|
208 |
"metadata": {},
|
209 |
"outputs": [
|
210 |
{
|
|
|
212 |
"output_type": "stream",
|
213 |
"text": [
|
214 |
"Prediction:\n",
|
215 |
+
"แแแแปแ แแแแแแแ แแทแ แแแแถแแแนแ แแแขแถแ แแแแพ แ
แถแแ แแทแแแ แแแแถแ แแแแแ แแ
\n",
|
216 |
"\n",
|
217 |
"Reference:\n",
|
218 |
+
"แแแแปแ แแแแแแแแ แแนแ แแแแ แแนแ แแแขแถแ แแแแพ แ
แถแแ แแธ แแแแแถแ แ แแแแแ แแ
\n"
|
219 |
]
|
220 |
}
|
221 |
],
|
|
|
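The Prediction/Reference output above comes from cells whose source is not included in this view. A minimal sketch of the usual greedy CTC decoding step that produces such output, assuming `model`, `processor`, and a dataset `ds` with 16 kHz audio are already loaded (all names here are illustrative):

import torch

def predict(sample, model, processor):
    # Turn the raw waveform into padded, normalized model inputs.
    inputs = processor(sample["audio"]["array"], sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values.to(model.device)).logits
    # Greedy CTC decoding: most likely token per frame, then collapse repeats/blanks.
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]

# print("Prediction:\n" + predict(ds[i], model, processor))
# print("Reference:\n" + ds[i]["sentence"])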
232 |
{
|
233 |
"cell_type": "code",
|
234 |
"execution_count": null,
|
235 |
+
"id": "5bbf1c82",
|
236 |
"metadata": {},
|
237 |
"outputs": [],
|
238 |
"source": []
|
|
|
240 |
{
|
241 |
"cell_type": "code",
|
242 |
"execution_count": null,
|
243 |
+
"id": "71b6f502",
|
244 |
"metadata": {},
|
245 |
"outputs": [],
|
246 |
"source": []
|
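The traceback above shows Wav2Vec2Processor.from_pretrained failing with a JSONDecodeError because the vocab.json published on the hub was not valid JSON (the unresolved merge-conflict lines removed at the bottom of this commit). A small sanity check before loading, followed by the local-directory loading used in the updated cells; the json check is illustrative, not part of the notebook:

import json

from transformers import AutoModelForCTC, Wav2Vec2Processor

# Fail early with a clear message if vocab.json is still not parseable.
with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)  # raises json.JSONDecodeError if merge markers remain
print(f"vocab.json OK: {len(vocab)} tokens")

# The updated notebook loads the checkpoint from the local working directory.
model = AutoModelForCTC.from_pretrained(".").to("cuda")
processor = Wav2Vec2Processor.from_pretrained(".")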
preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
1 |
+
{
|
2 |
+
"do_normalize": true,
|
3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
4 |
+
"feature_size": 1,
|
5 |
+
"padding_side": "right",
|
6 |
+
"padding_value": 0.0,
|
7 |
+
"return_attention_mask": true,
|
8 |
+
"sampling_rate": 16000
|
9 |
+
}
|
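The added preprocessor_config.json describes the Wav2Vec2 feature extractor used at inference time. For reference, a sketch of two equivalent ways to obtain that object, assuming the file sits in the current directory:

from transformers import Wav2Vec2FeatureExtractor

# Option 1: read preprocessor_config.json from the current directory.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(".")

# Option 2: construct it explicitly with the same values as the file above.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)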
train_kh.ipynb
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": 1,
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
@@ -16,7 +16,7 @@
|
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
"execution_count": null,
|
19 |
-
"id": "
|
20 |
"metadata": {
|
21 |
"collapsed": true,
|
22 |
"jupyter": {
|
@@ -19167,7 +19167,7 @@
|
|
19167 |
},
|
19168 |
{
|
19169 |
"cell_type": "markdown",
|
19170 |
-
"id": "
|
19171 |
"metadata": {},
|
19172 |
"source": [
|
19173 |
"### Load KH Data"
|
@@ -19176,7 +19176,7 @@
|
|
19176 |
{
|
19177 |
"cell_type": "code",
|
19178 |
"execution_count": 4,
|
19179 |
-
"id": "
|
19180 |
"metadata": {},
|
19181 |
"outputs": [],
|
19182 |
"source": [
|
@@ -19197,7 +19197,7 @@
|
|
19197 |
{
|
19198 |
"cell_type": "code",
|
19199 |
"execution_count": 5,
|
19200 |
-
"id": "
|
19201 |
"metadata": {},
|
19202 |
"outputs": [
|
19203 |
{
|
@@ -19307,7 +19307,7 @@
|
|
19307 |
{
|
19308 |
"cell_type": "code",
|
19309 |
"execution_count": 6,
|
19310 |
-
"id": "
|
19311 |
"metadata": {},
|
19312 |
"outputs": [],
|
19313 |
"source": [
|
@@ -19321,7 +19321,7 @@
|
|
19321 |
},
|
19322 |
{
|
19323 |
"cell_type": "markdown",
|
19324 |
-
"id": "
|
19325 |
"metadata": {},
|
19326 |
"source": [
|
19327 |
"### Clean Up the Text"
|
@@ -19330,7 +19330,7 @@
|
|
19330 |
{
|
19331 |
"cell_type": "code",
|
19332 |
"execution_count": 6,
|
19333 |
-
"id": "
|
19334 |
"metadata": {},
|
19335 |
"outputs": [],
|
19336 |
"source": [
|
@@ -19346,7 +19346,7 @@
|
|
19346 |
{
|
19347 |
"cell_type": "code",
|
19348 |
"execution_count": 7,
|
19349 |
-
"id": "
|
19350 |
"metadata": {
|
19351 |
"collapsed": true,
|
19352 |
"jupyter": {
|
@@ -19402,7 +19402,7 @@
|
|
19402 |
{
|
19403 |
"cell_type": "code",
|
19404 |
"execution_count": 7,
|
19405 |
-
"id": "
|
19406 |
"metadata": {},
|
19407 |
"outputs": [
|
19408 |
{
|
@@ -19423,7 +19423,7 @@
|
|
19423 |
},
|
19424 |
{
|
19425 |
"cell_type": "markdown",
|
19426 |
-
"id": "
|
19427 |
"metadata": {},
|
19428 |
"source": [
|
19429 |
"### Build Character"
|
@@ -19432,7 +19432,7 @@
|
|
19432 |
{
|
19433 |
"cell_type": "code",
|
19434 |
"execution_count": 8,
|
19435 |
-
"id": "
|
19436 |
"metadata": {},
|
19437 |
"outputs": [
|
19438 |
{
|
@@ -19480,7 +19480,7 @@
|
|
19480 |
{
|
19481 |
"cell_type": "code",
|
19482 |
"execution_count": 9,
|
19483 |
-
"id": "
|
19484 |
"metadata": {},
|
19485 |
"outputs": [],
|
19486 |
"source": [
|
@@ -19491,7 +19491,7 @@
|
|
19491 |
{
|
19492 |
"cell_type": "code",
|
19493 |
"execution_count": 10,
|
19494 |
-
"id": "
|
19495 |
"metadata": {},
|
19496 |
"outputs": [
|
19497 |
{
|
@@ -19509,7 +19509,7 @@
|
|
19509 |
{
|
19510 |
"cell_type": "code",
|
19511 |
"execution_count": 11,
|
19512 |
-
"id": "
|
19513 |
"metadata": {},
|
19514 |
"outputs": [
|
19515 |
{
|
@@ -19536,7 +19536,7 @@
|
|
19536 |
{
|
19537 |
"cell_type": "code",
|
19538 |
"execution_count": 12,
|
19539 |
-
"id": "
|
19540 |
"metadata": {},
|
19541 |
"outputs": [
|
19542 |
{
|
@@ -19554,7 +19554,7 @@
|
|
19554 |
{
|
19555 |
"cell_type": "code",
|
19556 |
"execution_count": 13,
|
19557 |
-
"id": "
|
19558 |
"metadata": {},
|
19559 |
"outputs": [],
|
19560 |
"source": [
|
@@ -19565,7 +19565,7 @@
|
|
19565 |
},
|
19566 |
{
|
19567 |
"cell_type": "markdown",
|
19568 |
-
"id": "
|
19569 |
"metadata": {},
|
19570 |
"source": [
|
19571 |
"# Tokenizer"
|
@@ -19574,7 +19574,7 @@
|
|
19574 |
{
|
19575 |
"cell_type": "code",
|
19576 |
"execution_count": 14,
|
19577 |
-
"id": "
|
19578 |
"metadata": {},
|
19579 |
"outputs": [],
|
19580 |
"source": [
|
@@ -19585,15 +19585,38 @@
|
|
19585 |
},
|
19586 |
{
|
19587 |
"cell_type": "code",
|
19588 |
-
"execution_count":
|
19589 |
-
"id": "
|
19590 |
"metadata": {},
|
19591 |
"outputs": [
|
19592 |
{
|
19593 |
"name": "stderr",
|
19594 |
"output_type": "stream",
|
19595 |
"text": [
|
19596 |
-
"
|
|
|
|
19597 |
]
|
19598 |
}
|
19599 |
],
|
@@ -19606,7 +19629,7 @@
|
|
19606 |
{
|
19607 |
"cell_type": "code",
|
19608 |
"execution_count": 26,
|
19609 |
-
"id": "
|
19610 |
"metadata": {},
|
19611 |
"outputs": [],
|
19612 |
"source": [
|
@@ -19623,7 +19646,7 @@
|
|
19623 |
{
|
19624 |
"cell_type": "code",
|
19625 |
"execution_count": 27,
|
19626 |
-
"id": "
|
19627 |
"metadata": {},
|
19628 |
"outputs": [
|
19629 |
{
|
@@ -19663,7 +19686,7 @@
|
|
19663 |
{
|
19664 |
"cell_type": "code",
|
19665 |
"execution_count": 17,
|
19666 |
-
"id": "
|
19667 |
"metadata": {},
|
19668 |
"outputs": [],
|
19669 |
"source": [
|
@@ -19674,7 +19697,7 @@
|
|
19674 |
{
|
19675 |
"cell_type": "code",
|
19676 |
"execution_count": 18,
|
19677 |
-
"id": "
|
19678 |
"metadata": {},
|
19679 |
"outputs": [
|
19680 |
{
|
@@ -19699,7 +19722,7 @@
|
|
19699 |
{
|
19700 |
"cell_type": "code",
|
19701 |
"execution_count": 19,
|
19702 |
-
"id": "
|
19703 |
"metadata": {},
|
19704 |
"outputs": [
|
19705 |
{
|
@@ -19746,7 +19769,7 @@
|
|
19746 |
{
|
19747 |
"cell_type": "code",
|
19748 |
"execution_count": 20,
|
19749 |
-
"id": "
|
19750 |
"metadata": {},
|
19751 |
"outputs": [],
|
19752 |
"source": [
|
@@ -19768,7 +19791,7 @@
|
|
19768 |
{
|
19769 |
"cell_type": "code",
|
19770 |
"execution_count": 22,
|
19771 |
-
"id": "
|
19772 |
"metadata": {},
|
19773 |
"outputs": [],
|
19774 |
"source": [
|
@@ -19779,7 +19802,7 @@
|
|
19779 |
{
|
19780 |
"cell_type": "code",
|
19781 |
"execution_count": 41,
|
19782 |
-
"id": "
|
19783 |
"metadata": {},
|
19784 |
"outputs": [],
|
19785 |
"source": [
|
@@ -19791,7 +19814,7 @@
|
|
19791 |
{
|
19792 |
"cell_type": "code",
|
19793 |
"execution_count": 25,
|
19794 |
-
"id": "
|
19795 |
"metadata": {},
|
19796 |
"outputs": [],
|
19797 |
"source": [
|
@@ -19851,7 +19874,7 @@
|
|
19851 |
{
|
19852 |
"cell_type": "code",
|
19853 |
"execution_count": 26,
|
19854 |
-
"id": "
|
19855 |
"metadata": {},
|
19856 |
"outputs": [],
|
19857 |
"source": [
|
@@ -19861,7 +19884,7 @@
|
|
19861 |
{
|
19862 |
"cell_type": "code",
|
19863 |
"execution_count": 27,
|
19864 |
-
"id": "
|
19865 |
"metadata": {},
|
19866 |
"outputs": [],
|
19867 |
"source": [
|
@@ -19872,7 +19895,7 @@
|
|
19872 |
{
|
19873 |
"cell_type": "code",
|
19874 |
"execution_count": 44,
|
19875 |
-
"id": "
|
19876 |
"metadata": {},
|
19877 |
"outputs": [],
|
19878 |
"source": [
|
@@ -19897,7 +19920,7 @@
|
|
19897 |
{
|
19898 |
"cell_type": "code",
|
19899 |
"execution_count": 45,
|
19900 |
-
"id": "
|
19901 |
"metadata": {
|
19902 |
"collapsed": true,
|
19903 |
"jupyter": {
|
@@ -20048,7 +20071,7 @@
|
|
20048 |
{
|
20049 |
"cell_type": "code",
|
20050 |
"execution_count": 46,
|
20051 |
-
"id": "
|
20052 |
"metadata": {},
|
20053 |
"outputs": [],
|
20054 |
"source": [
|
@@ -20058,7 +20081,7 @@
|
|
20058 |
{
|
20059 |
"cell_type": "code",
|
20060 |
"execution_count": 47,
|
20061 |
-
"id": "
|
20062 |
"metadata": {},
|
20063 |
"outputs": [
|
20064 |
{
|
@@ -20095,7 +20118,7 @@
|
|
20095 |
{
|
20096 |
"cell_type": "code",
|
20097 |
"execution_count": 48,
|
20098 |
-
"id": "
|
20099 |
"metadata": {},
|
20100 |
"outputs": [
|
20101 |
{
|
@@ -20123,7 +20146,7 @@
|
|
20123 |
{
|
20124 |
"cell_type": "code",
|
20125 |
"execution_count": 49,
|
20126 |
-
"id": "
|
20127 |
"metadata": {
|
20128 |
"collapsed": true,
|
20129 |
"jupyter": {
|
@@ -20764,7 +20787,7 @@
|
|
20764 |
{
|
20765 |
"cell_type": "code",
|
20766 |
"execution_count": 57,
|
20767 |
-
"id": "
|
20768 |
"metadata": {},
|
20769 |
"outputs": [
|
20770 |
{
|
@@ -20784,7 +20807,7 @@
|
|
20784 |
{
|
20785 |
"cell_type": "code",
|
20786 |
"execution_count": 53,
|
20787 |
-
"id": "
|
20788 |
"metadata": {},
|
20789 |
"outputs": [],
|
20790 |
"source": [
|
@@ -20801,7 +20824,7 @@
|
|
20801 |
{
|
20802 |
"cell_type": "code",
|
20803 |
"execution_count": 54,
|
20804 |
-
"id": "
|
20805 |
"metadata": {},
|
20806 |
"outputs": [
|
20807 |
{
|
@@ -20819,52 +20842,54 @@
|
|
20819 |
},
|
20820 |
{
|
20821 |
"cell_type": "code",
|
20822 |
-
"execution_count":
|
20823 |
-
"id": "
|
20824 |
"metadata": {},
|
20825 |
"outputs": [
|
20826 |
{
|
20827 |
"name": "stderr",
|
20828 |
"output_type": "stream",
|
20829 |
"text": [
|
20830 |
-
"
|
20831 |
-
"
|
20832 |
-
"
|
20833 |
-
"The progress bars may be unreliable.\n"
|
20834 |
]
|
20835 |
},
|
20836 |
{
|
20837 |
"data": {
|
20838 |
"application/vnd.jupyter.widget-view+json": {
|
20839 |
-
"model_id": "
|
20840 |
"version_major": 2,
|
20841 |
"version_minor": 0
|
20842 |
},
|
20843 |
"text/plain": [
|
20844 |
-
"
|
20845 |
]
|
20846 |
},
|
20847 |
"metadata": {},
|
20848 |
"output_type": "display_data"
|
20849 |
},
|
20850 |
-
{
|
20851 |
-
"name": "stderr",
|
20852 |
-
"output_type": "stream",
|
20853 |
-
"text": [
|
20854 |
-
"To https://huggingface.co/vitouphy/xls-r-300m-km\n",
|
20855 |
-
" e25c362..dff1f30 main -> main\n",
|
20856 |
-
"\n"
|
20857 |
-
]
|
20858 |
-
},
|
20859 |
{
|
20860 |
"data": {
|
|
|
20861 |
"text/plain": [
|
20862 |
-
"
|
20863 |
]
|
20864 |
},
|
20865 |
-
"execution_count": 58,
|
20866 |
"metadata": {},
|
20867 |
-
"output_type": "
|
|
|
|
|
|
20868 |
}
|
20869 |
],
|
20870 |
"source": [
|
@@ -20873,8 +20898,8 @@
|
|
20873 |
},
|
20874 |
{
|
20875 |
"cell_type": "code",
|
20876 |
-
"execution_count":
|
20877 |
-
"id": "
|
20878 |
"metadata": {},
|
20879 |
"outputs": [
|
20880 |
{
|
@@ -20891,6 +20916,14 @@
|
|
20891 |
"source": [
|
20892 |
"trainer.save_model()"
|
20893 |
]
|
|
|
20894 |
}
|
20895 |
],
|
20896 |
"metadata": {
|
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": 1,
|
6 |
+
"id": "1bf32ef8",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
|
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
"execution_count": null,
|
19 |
+
"id": "d2deec6c",
|
20 |
"metadata": {
|
21 |
"collapsed": true,
|
22 |
"jupyter": {
|
|
|
19167 |
},
|
19168 |
{
|
19169 |
"cell_type": "markdown",
|
19170 |
+
"id": "6fe38e7a",
|
19171 |
"metadata": {},
|
19172 |
"source": [
|
19173 |
"### Load KH Data"
|
|
|
19176 |
{
|
19177 |
"cell_type": "code",
|
19178 |
"execution_count": 4,
|
19179 |
+
"id": "b75f1fec",
|
19180 |
"metadata": {},
|
19181 |
"outputs": [],
|
19182 |
"source": [
|
|
|
19197 |
{
|
19198 |
"cell_type": "code",
|
19199 |
"execution_count": 5,
|
19200 |
+
"id": "433fe749",
|
19201 |
"metadata": {},
|
19202 |
"outputs": [
|
19203 |
{
|
|
|
19307 |
{
|
19308 |
"cell_type": "code",
|
19309 |
"execution_count": 6,
|
19310 |
+
"id": "c6d633ad",
|
19311 |
"metadata": {},
|
19312 |
"outputs": [],
|
19313 |
"source": [
|
|
|
19321 |
},
|
19322 |
{
|
19323 |
"cell_type": "markdown",
|
19324 |
+
"id": "acb914d0",
|
19325 |
"metadata": {},
|
19326 |
"source": [
|
19327 |
"### Clean Up the Text"
|
|
|
19330 |
{
|
19331 |
"cell_type": "code",
|
19332 |
"execution_count": 6,
|
19333 |
+
"id": "bc3a017b",
|
19334 |
"metadata": {},
|
19335 |
"outputs": [],
|
19336 |
"source": [
|
|
|
19346 |
{
|
19347 |
"cell_type": "code",
|
19348 |
"execution_count": 7,
|
19349 |
+
"id": "4a7b6a10",
|
19350 |
"metadata": {
|
19351 |
"collapsed": true,
|
19352 |
"jupyter": {
|
|
|
19402 |
{
|
19403 |
"cell_type": "code",
|
19404 |
"execution_count": 7,
|
19405 |
+
"id": "7f511e3f",
|
19406 |
"metadata": {},
|
19407 |
"outputs": [
|
19408 |
{
|
|
|
19423 |
},
|
19424 |
{
|
19425 |
"cell_type": "markdown",
|
19426 |
+
"id": "205a6e23",
|
19427 |
"metadata": {},
|
19428 |
"source": [
|
19429 |
"### Build Character"
|
|
|
19432 |
{
|
19433 |
"cell_type": "code",
|
19434 |
"execution_count": 8,
|
19435 |
+
"id": "48a97fac",
|
19436 |
"metadata": {},
|
19437 |
"outputs": [
|
19438 |
{
|
|
|
19480 |
{
|
19481 |
"cell_type": "code",
|
19482 |
"execution_count": 9,
|
19483 |
+
"id": "9b4ac5f7",
|
19484 |
"metadata": {},
|
19485 |
"outputs": [],
|
19486 |
"source": [
|
|
|
19491 |
{
|
19492 |
"cell_type": "code",
|
19493 |
"execution_count": 10,
|
19494 |
+
"id": "a9a07875",
|
19495 |
"metadata": {},
|
19496 |
"outputs": [
|
19497 |
{
|
|
|
19509 |
{
|
19510 |
"cell_type": "code",
|
19511 |
"execution_count": 11,
|
19512 |
+
"id": "8a3d39d8",
|
19513 |
"metadata": {},
|
19514 |
"outputs": [
|
19515 |
{
|
|
|
19536 |
{
|
19537 |
"cell_type": "code",
|
19538 |
"execution_count": 12,
|
19539 |
+
"id": "934a4070",
|
19540 |
"metadata": {},
|
19541 |
"outputs": [
|
19542 |
{
|
|
|
19554 |
{
|
19555 |
"cell_type": "code",
|
19556 |
"execution_count": 13,
|
19557 |
+
"id": "7f42a2b4",
|
19558 |
"metadata": {},
|
19559 |
"outputs": [],
|
19560 |
"source": [
|
|
|
19565 |
},
|
19566 |
{
|
19567 |
"cell_type": "markdown",
|
19568 |
+
"id": "9a504bc4",
|
19569 |
"metadata": {},
|
19570 |
"source": [
|
19571 |
"# Tokenizer"
|
|
|
19574 |
{
|
19575 |
"cell_type": "code",
|
19576 |
"execution_count": 14,
|
19577 |
+
"id": "0cec90b4",
|
19578 |
"metadata": {},
|
19579 |
"outputs": [],
|
19580 |
"source": [
|
|
|
19585 |
},
|
19586 |
{
|
19587 |
"cell_type": "code",
|
19588 |
+
"execution_count": 62,
|
19589 |
+
"id": "dc9e79da",
|
19590 |
"metadata": {},
|
19591 |
"outputs": [
|
19592 |
{
|
19593 |
"name": "stderr",
|
19594 |
"output_type": "stream",
|
19595 |
"text": [
|
19596 |
+
"Didn't find file ./tokenizer.json. We won't load it.\n",
|
19597 |
+
"loading file ./vocab.json\n",
|
19598 |
+
"loading file ./tokenizer_config.json\n",
|
19599 |
+
"loading file ./added_tokens.json\n",
|
19600 |
+
"loading file ./special_tokens_map.json\n",
|
19601 |
+
"loading file None\n"
|
19602 |
+
]
|
19603 |
+
},
|
19604 |
+
{
|
19605 |
+
"ename": "JSONDecodeError",
|
19606 |
+
"evalue": "Expecting value: line 1 column 1 (char 0)",
|
19607 |
+
"output_type": "error",
|
19608 |
+
"traceback": [
|
19609 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
19610 |
+
"\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
|
19611 |
+
"Input \u001b[0;32mIn [62]\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mWav2Vec2CTCTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43munk_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m[UNK]\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpad_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m[PAD]\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword_delimiter_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m|\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# './' load vocab.json in the current directory\u001b[39;00m\n\u001b[1;32m 2\u001b[0m feature_extractor \u001b[38;5;241m=\u001b[39m Wav2Vec2FeatureExtractor(feature_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, sampling_rate\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16000\u001b[39m, padding_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.0\u001b[39m, do_normalize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, return_attention_mask\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m) \n\u001b[1;32m 3\u001b[0m processor \u001b[38;5;241m=\u001b[39m Wav2Vec2Processor(feature_extractor\u001b[38;5;241m=\u001b[39mfeature_extractor, tokenizer\u001b[38;5;241m=\u001b[39mtokenizer)\n",
|
19612 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1773\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1771\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloading file \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m from cache at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresolved_vocab_files[file_id]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 1773\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_from_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1774\u001b[0m \u001b[43m \u001b[49m\u001b[43mresolved_vocab_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1775\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1776\u001b[0m \u001b[43m \u001b[49m\u001b[43minit_configuration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1777\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1778\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1779\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1780\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1781\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
19613 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1908\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase._from_pretrained\u001b[0;34m(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1906\u001b[0m \u001b[38;5;66;03m# Instantiate tokenizer.\u001b[39;00m\n\u001b[1;32m 1907\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1908\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1909\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 1910\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[1;32m 1911\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to load vocabulary from file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1912\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease check that the provided vocabulary is accessible and not corrupted.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1913\u001b[0m )\n",
|
19614 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/site-packages/transformers/models/wav2vec2/tokenization_wav2vec2.py:142\u001b[0m, in \u001b[0;36mWav2Vec2CTCTokenizer.__init__\u001b[0;34m(self, vocab_file, bos_token, eos_token, unk_token, pad_token, word_delimiter_token, do_lower_case, **kwargs)\u001b[0m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdo_lower_case \u001b[38;5;241m=\u001b[39m do_lower_case\n\u001b[1;32m 141\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(vocab_file, encoding\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m vocab_handle:\n\u001b[0;32m--> 142\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder \u001b[38;5;241m=\u001b[39m \u001b[43mjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvocab_handle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdecoder \u001b[38;5;241m=\u001b[39m {v: k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mencoder\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[1;32m 145\u001b[0m \u001b[38;5;66;03m# make sure that tokens made of several\u001b[39;00m\n\u001b[1;32m 146\u001b[0m \u001b[38;5;66;03m# characters are not split at tokenization\u001b[39;00m\n",
|
19615 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:293\u001b[0m, in \u001b[0;36mload\u001b[0;34m(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(fp, \u001b[38;5;241m*\u001b[39m, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_float\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 275\u001b[0m parse_int\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, parse_constant\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, object_pairs_hook\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw):\n\u001b[1;32m 276\u001b[0m \u001b[38;5;124;03m\"\"\"Deserialize ``fp`` (a ``.read()``-supporting file-like object containing\u001b[39;00m\n\u001b[1;32m 277\u001b[0m \u001b[38;5;124;03m a JSON document) to a Python object.\u001b[39;00m\n\u001b[1;32m 278\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;124;03m kwarg; otherwise ``JSONDecoder`` is used.\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_hook\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_float\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_float\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparse_int\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_int\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43mparse_constant\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparse_constant\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
|
19616 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/__init__.py:357\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 352\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m kw[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mencoding\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 355\u001b[0m parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 356\u001b[0m parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[0;32m--> 357\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;241m=\u001b[39m JSONDecoder\n",
|
19617 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, s, _w\u001b[38;5;241m=\u001b[39mWHITESPACE\u001b[38;5;241m.\u001b[39mmatch):\n\u001b[1;32m 333\u001b[0m \u001b[38;5;124;03m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[38;5;124;03m containing a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraw_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_w\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n\u001b[1;32m 339\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m end \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(s):\n",
|
19618 |
+
"File \u001b[0;32m/opt/conda/lib/python3.8/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mscan_once(s, idx)\n\u001b[1;32m 354\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpecting value\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, err\u001b[38;5;241m.\u001b[39mvalue) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 356\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj, end\n",
|
19619 |
+
"\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
|
19620 |
]
|
19621 |
}
|
19622 |
],
|
|
|
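The failing cell above is only visible through the ANSI-escaped traceback; restated as plain code, it builds the processor from the local files and is the call that trips over the corrupted vocab.json (it runs cleanly once the file is fixed):

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# './' makes the tokenizer read vocab.json from the current directory.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, padding_value=0.0,
    do_normalize=True, return_attention_mask=True
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)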
19629 |
{
|
19630 |
"cell_type": "code",
|
19631 |
"execution_count": 26,
|
19632 |
+
"id": "61738038",
|
19633 |
"metadata": {},
|
19634 |
"outputs": [],
|
19635 |
"source": [
|
|
|
19646 |
{
|
19647 |
"cell_type": "code",
|
19648 |
"execution_count": 27,
|
19649 |
+
"id": "4b72b9b8",
|
19650 |
"metadata": {},
|
19651 |
"outputs": [
|
19652 |
{
|
|
|
19686 |
{
|
19687 |
"cell_type": "code",
|
19688 |
"execution_count": 17,
|
19689 |
+
"id": "ccb4a36c",
|
19690 |
"metadata": {},
|
19691 |
"outputs": [],
|
19692 |
"source": [
|
|
|
19697 |
{
|
19698 |
"cell_type": "code",
|
19699 |
"execution_count": 18,
|
19700 |
+
"id": "cf9d1391",
|
19701 |
"metadata": {},
|
19702 |
"outputs": [
|
19703 |
{
|
|
|
19722 |
{
|
19723 |
"cell_type": "code",
|
19724 |
"execution_count": 19,
|
19725 |
+
"id": "57ea4c6f",
|
19726 |
"metadata": {},
|
19727 |
"outputs": [
|
19728 |
{
|
|
|
19769 |
{
|
19770 |
"cell_type": "code",
|
19771 |
"execution_count": 20,
|
19772 |
+
"id": "7cb9fd2a",
|
19773 |
"metadata": {},
|
19774 |
"outputs": [],
|
19775 |
"source": [
|
|
|
19791 |
{
|
19792 |
"cell_type": "code",
|
19793 |
"execution_count": 22,
|
19794 |
+
"id": "42f2952f",
|
19795 |
"metadata": {},
|
19796 |
"outputs": [],
|
19797 |
"source": [
|
|
|
19802 |
{
|
19803 |
"cell_type": "code",
|
19804 |
"execution_count": 41,
|
19805 |
+
"id": "fe093630",
|
19806 |
"metadata": {},
|
19807 |
"outputs": [],
|
19808 |
"source": [
|
|
|
19814 |
{
|
19815 |
"cell_type": "code",
|
19816 |
"execution_count": 25,
|
19817 |
+
"id": "a6efe782",
|
19818 |
"metadata": {},
|
19819 |
"outputs": [],
|
19820 |
"source": [
|
|
|
19874 |
{
|
19875 |
"cell_type": "code",
|
19876 |
"execution_count": 26,
|
19877 |
+
"id": "e82a3663",
|
19878 |
"metadata": {},
|
19879 |
"outputs": [],
|
19880 |
"source": [
|
|
|
19884 |
{
|
19885 |
"cell_type": "code",
|
19886 |
"execution_count": 27,
|
19887 |
+
"id": "1df03ab8",
|
19888 |
"metadata": {},
|
19889 |
"outputs": [],
|
19890 |
"source": [
|
|
|
19895 |
{
|
19896 |
"cell_type": "code",
|
19897 |
"execution_count": 44,
|
19898 |
+
"id": "8304f047",
|
19899 |
"metadata": {},
|
19900 |
"outputs": [],
|
19901 |
"source": [
|
|
|
19920 |
{
|
19921 |
"cell_type": "code",
|
19922 |
"execution_count": 45,
|
19923 |
+
"id": "f92c9b4d",
|
19924 |
"metadata": {
|
19925 |
"collapsed": true,
|
19926 |
"jupyter": {
|
|
|
20071 |
{
|
20072 |
"cell_type": "code",
|
20073 |
"execution_count": 46,
|
20074 |
+
"id": "7f2dd147",
|
20075 |
"metadata": {},
|
20076 |
"outputs": [],
|
20077 |
"source": [
|
|
|
20081 |
{
|
20082 |
"cell_type": "code",
|
20083 |
"execution_count": 47,
|
20084 |
+
"id": "3d27466c",
|
20085 |
"metadata": {},
|
20086 |
"outputs": [
|
20087 |
{
|
|
|
20118 |
{
|
20119 |
"cell_type": "code",
|
20120 |
"execution_count": 48,
|
20121 |
+
"id": "014ac4c9",
|
20122 |
"metadata": {},
|
20123 |
"outputs": [
|
20124 |
{
|
|
|
20146 |
{
|
20147 |
"cell_type": "code",
|
20148 |
"execution_count": 49,
|
20149 |
+
"id": "e6cb809a",
|
20150 |
"metadata": {
|
20151 |
"collapsed": true,
|
20152 |
"jupyter": {
|
|
|
20787 |
{
|
20788 |
"cell_type": "code",
|
20789 |
"execution_count": 57,
|
20790 |
+
"id": "57c2527b",
|
20791 |
"metadata": {},
|
20792 |
"outputs": [
|
20793 |
{
|
|
|
20807 |
{
|
20808 |
"cell_type": "code",
|
20809 |
"execution_count": 53,
|
20810 |
+
"id": "0211e267",
|
20811 |
"metadata": {},
|
20812 |
"outputs": [],
|
20813 |
"source": [
|
|
|
20824 |
{
|
20825 |
"cell_type": "code",
|
20826 |
"execution_count": 54,
|
20827 |
+
"id": "62f6fd3e",
|
20828 |
"metadata": {},
|
20829 |
"outputs": [
|
20830 |
{
|
|
|
20842 |
},
|
20843 |
{
|
20844 |
"cell_type": "code",
|
20845 |
+
"execution_count": 60,
|
20846 |
+
"id": "b050fb9f",
|
20847 |
"metadata": {},
|
20848 |
"outputs": [
|
20849 |
{
|
20850 |
"name": "stderr",
|
20851 |
"output_type": "stream",
|
20852 |
"text": [
|
20853 |
+
"/opt/conda/lib/python3.8/site-packages/huggingface_hub/hf_api.py:1001: FutureWarning: `create_repo` now takes `token` as an optional positional argument. Be sure to adapt your code!\n",
|
20854 |
+
" warnings.warn(\n",
|
20855 |
+
"Cloning https://huggingface.co/vitouphy/xls-r-300m-km into local empty directory.\n"
|
|
|
20856 |
]
|
20857 |
},
|
20858 |
{
|
20859 |
"data": {
|
20860 |
"application/vnd.jupyter.widget-view+json": {
|
20861 |
+
"model_id": "331db7acce774ee3b699aa82a0451092",
|
20862 |
"version_major": 2,
|
20863 |
"version_minor": 0
|
20864 |
},
|
20865 |
"text/plain": [
|
20866 |
+
"Download file pytorch_model.bin: 0%| | 3.47k/1.18G [00:00<?, ?B/s]"
|
20867 |
]
|
20868 |
},
|
20869 |
"metadata": {},
|
20870 |
"output_type": "display_data"
|
20871 |
},
|
|
|
20872 |
{
|
20873 |
"data": {
|
20874 |
+
"application/vnd.jupyter.widget-view+json": {
|
20875 |
+
"model_id": "db90465291c64e9f82988698d2473234",
|
20876 |
+
"version_major": 2,
|
20877 |
+
"version_minor": 0
|
20878 |
+
},
|
20879 |
"text/plain": [
|
20880 |
+
"Clean file pytorch_model.bin: 0%| | 1.00k/1.18G [00:00<?, ?B/s]"
|
20881 |
]
|
20882 |
},
|
|
|
20883 |
"metadata": {},
|
20884 |
+
"output_type": "display_data"
|
20885 |
+
},
|
20886 |
+
{
|
20887 |
+
"name": "stderr",
|
20888 |
+
"output_type": "stream",
|
20889 |
+
"text": [
|
20890 |
+
"Configuration saved in vitouphy/xls-r-300m-km/config.json\n",
|
20891 |
+
"Model weights saved in vitouphy/xls-r-300m-km/pytorch_model.bin\n"
|
20892 |
+
]
|
20893 |
}
|
20894 |
],
|
20895 |
"source": [
|
|
|
20898 |
},
|
20899 |
{
|
20900 |
"cell_type": "code",
|
20901 |
+
"execution_count": 61,
|
20902 |
+
"id": "9d7cb173",
|
20903 |
"metadata": {},
|
20904 |
"outputs": [
|
20905 |
{
|
|
|
20916 |
"source": [
|
20917 |
"trainer.save_model()"
|
20918 |
]
|
20919 |
+
},
|
20920 |
+
{
|
20921 |
+
"cell_type": "code",
|
20922 |
+
"execution_count": null,
|
20923 |
+
"id": "8dc01ad4",
|
20924 |
+
"metadata": {},
|
20925 |
+
"outputs": [],
|
20926 |
+
"source": []
|
20927 |
}
|
20928 |
],
|
20929 |
"metadata": {
|
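The last hunks of this notebook show a checkpoint being cloned to and uploaded at https://huggingface.co/vitouphy/xls-r-300m-km, followed by trainer.save_model(). The exact source cells are not shown in this view; a rough sketch of a push that produces this kind of output with transformers 4.x, assuming `model` and `trainer` from the training run and a logged-in hub token:

# Clones the repo into a local folder named after it, copies config.json and
# pytorch_model.bin there, and pushes; this matches the clone/upload messages above.
model.push_to_hub("vitouphy/xls-r-300m-km")

# Also writes config.json and pytorch_model.bin into the Trainer's output_dir.
trainer.save_model()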
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c93624168d989aabdf33bd32b0dbfa8b32857515b2a2f04190df98bd15ae4e61
|
3 |
+
size 2991
|
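training_args.bin is the TrainingArguments object that the Trainer pickles with torch.save next to a checkpoint (stored here through Git LFS, hence the pointer file). Assuming a compatible transformers version is installed, it can be inspected like this:

import torch

# torch.load unpickles the TrainingArguments saved by the Trainer;
# the installed transformers version should roughly match the training one.
training_args = torch.load("training_args.bin")
print(training_args.output_dir, training_args.learning_rate, training_args.num_train_epochs)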
vocab.json
CHANGED
@@ -1,5 +1 @@
|
|
1 |
-
|
2 |
-
{"\u1780": 1, "\u1781": 2, "\u1782": 3, "\u1783": 4, "\u1784": 5, "\u1785": 6, "\u1786": 7, "\u1787": 8, "\u1788": 9, "\u1789": 10, "\u178a": 11, "\u178b": 12, "\u178c": 13, "\u178d": 14, "\u178e": 15, "\u178f": 16, "\u1790": 17, "\u1791": 18, "\u1792": 19, "\u1793": 20, "\u1794": 21, "\u1795": 22, "\u1796": 23, "\u1797": 24, "\u1798": 25, "\u1799": 26, "\u179a": 27, "\u179b": 28, "\u179c": 29, "\u179f": 30, "\u17a0": 31, "\u17a1": 32, "\u17a2": 33, "\u17a5": 34, "\u17a7": 35, "\u17aa": 36, "\u17ab": 37, "\u17ac": 38, "\u17ad": 39, "\u17ae": 40, "\u17af": 41, "\u17b1": 42, "\u17b6": 43, "\u17b7": 44, "\u17b8": 45, "\u17b9": 46, "\u17ba": 47, "\u17bb": 48, "\u17bc": 49, "\u17bd": 50, "\u17be": 51, "\u17bf": 52, "\u17c0": 53, "\u17c1": 54, "\u17c2": 55, "\u17c3": 56, "\u17c4": 57, "\u17c5": 58, "\u17c6": 59, "\u17c7": 60, "\u17c8": 61, "\u17c9": 62, "\u17ca": 63, "\u17cb": 64, "\u17cc": 65, "\u17cd": 66, "\u17ce": 67, "\u17cf": 68, "\u17d0": 69, "\u17d2": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
|
3 |
-
=======
|
4 |
-
{"แ": 1, "แ": 2, "แ": 3, "แ": 4, "แ": 5, "แ
": 6, "แ": 7, "แ": 8, "แ": 9, "แ": 10, "แ": 11, "แ": 12, "แ": 13, "แ": 14, "แ": 15, "แ": 16, "แ": 17, "แ": 18, "แ": 19, "แ": 20, "แ": 21, "แ": 22, "แ": 23, "แ": 24, "แ": 25, "แ": 26, "แ": 27, "แ": 28, "แ": 29, "แ": 30, "แ ": 31, "แก": 32, "แข": 33, "แฅ": 34, "แง": 35, "แช": 36, "แซ": 37, "แฌ": 38, "แญ": 39, "แฎ": 40, "แฏ": 41, "แฑ": 42, "แถ": 43, "แท": 44, "แธ": 45, "แน": 46, "แบ": 47, "แป": 48, "แผ": 49, "แฝ": 50, "แพ": 51, "แฟ": 52, "แ": 53, "แ": 54, "แ": 55, "แ": 56, "แ": 57, "แ
": 58, "แ": 59, "แ": 60, "แ": 61, "แ": 62, "แ": 63, "แ": 64, "แ": 65, "แ": 66, "แ": 67, "แ": 68, "แ": 69, "แ": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
|
5 |
-
>>>>>>> dff1f3008b5c2afbbbcab722e17fded4bf8f782b
|
|
|
1 |
+
{"แ": 1, "แ": 2, "แ": 3, "แ": 4, "แ": 5, "แ
": 6, "แ": 7, "แ": 8, "แ": 9, "แ": 10, "แ": 11, "แ": 12, "แ": 13, "แ": 14, "แ": 15, "แ": 16, "แ": 17, "แ": 18, "แ": 19, "แ": 20, "แ": 21, "แ": 22, "แ": 23, "แ": 24, "แ": 25, "แ": 26, "แ": 27, "แ": 28, "แ": 29, "แ": 30, "แ ": 31, "แก": 32, "แข": 33, "แฅ": 34, "แง": 35, "แช": 36, "แซ": 37, "แฌ": 38, "แญ": 39, "แฎ": 40, "แฏ": 41, "แฑ": 42, "แถ": 43, "แท": 44, "แธ": 45, "แน": 46, "แบ": 47, "แป": 48, "แผ": 49, "แฝ": 50, "แพ": 51, "แฟ": 52, "แ": 53, "แ": 54, "แ": 55, "แ": 56, "แ": 57, "แ
": 58, "แ": 59, "แ": 60, "แ": 61, "แ": 62, "แ": 63, "แ": 64, "แ": 65, "แ": 66, "แ": 67, "แ": 68, "แ": 69, "แ": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
|
|
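The old vocab.json above still carried both sides of a Git merge conflict (an ASCII-escaped dump and a raw-Khmer dump of the same mapping) together with the ======= / >>>>>>> markers, which is exactly what json.load chokes on. A small, hedged sketch of how to verify and re-serialize the file in the form committed here:

import json

# Refuse to continue if conflict markers are still present.
raw = open("vocab.json", encoding="utf-8").read()
markers = ("<<<<<<<", "=======", ">>>>>>>")
assert not any(m in raw for m in markers), "vocab.json still contains merge-conflict markers"

vocab = json.loads(raw)

# ensure_ascii=False keeps the Khmer characters literal, matching the
# single-line vocab.json committed in this change.
with open("vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False)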