add language model
Browse files- .gitignore +1 -0
- alphabet.json +1 -0
- build_lm_processor.ipynb +200 -0
- eval.sh +1 -1
- inference.ipynb +118 -56
- language_model/attrs.json +1 -0
- language_model/km_wiki_ngram.arpa +3 -0
- language_model/unigrams.txt +0 -0
- preprocessor_config.json +1 -0
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
- train_kh.ipynb +45 -45
- train_kh_lm.ipynb +0 -0
- vocab.json +1 -1
.gitignore
CHANGED
@@ -3,3 +3,4 @@ km_kh*
|
|
3 |
.ipynb_checkpoints
|
4 |
vitouphy
|
5 |
*checkpoint*
|
|
|
|
3 |
.ipynb_checkpoints
|
4 |
vitouphy
|
5 |
*checkpoint*
|
6 |
+
data
|
alphabet.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"labels": [" ", "\u1780", "\u1781", "\u1782", "\u1783", "\u1784", "\u1785", "\u1786", "\u1787", "\u1788", "\u1789", "\u178a", "\u178b", "\u178c", "\u178d", "\u178e", "\u178f", "\u1790", "\u1791", "\u1792", "\u1793", "\u1794", "\u1795", "\u1796", "\u1797", "\u1798", "\u1799", "\u179a", "\u179b", "\u179c", "\u179f", "\u17a0", "\u17a1", "\u17a2", "\u17a5", "\u17a7", "\u17aa", "\u17ab", "\u17ac", "\u17ad", "\u17ae", "\u17af", "\u17b1", "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb", "\u17bc", "\u17bd", "\u17be", "\u17bf", "\u17c0", "\u17c1", "\u17c2", "\u17c3", "\u17c4", "\u17c5", "\u17c6", "\u17c7", "\u17c8", "\u17c9", "\u17ca", "\u17cb", "\u17cc", "\u17cd", "\u17ce", "\u17cf", "\u17d0", "\u17d2", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
|
build_lm_processor.ipynb
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "5393aa33",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor, AutoProcessor, Wav2Vec2ProcessorWithLM\n",
|
11 |
+
"from datasets import load_dataset, load_metric, Audio\n",
|
12 |
+
"from pyctcdecode import build_ctcdecoder\n",
|
13 |
+
"from pydub import AudioSegment\n",
|
14 |
+
"from pydub.playback import play\n",
|
15 |
+
"\n",
|
16 |
+
"import numpy as np\n",
|
17 |
+
"import torch\n",
|
18 |
+
"import kenlm\n",
|
19 |
+
"import pandas as pd\n",
|
20 |
+
"import random\n",
|
21 |
+
"import soundfile as sf\n",
|
22 |
+
"from tqdm.auto import tqdm"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "code",
|
27 |
+
"execution_count": 2,
|
28 |
+
"id": "2d34d3b8",
|
29 |
+
"metadata": {},
|
30 |
+
"outputs": [],
|
31 |
+
"source": [
|
32 |
+
"# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
|
33 |
+
"KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'"
|
34 |
+
]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"cell_type": "code",
|
38 |
+
"execution_count": 3,
|
39 |
+
"id": "f0354cb2",
|
40 |
+
"metadata": {},
|
41 |
+
"outputs": [
|
42 |
+
{
|
43 |
+
"name": "stderr",
|
44 |
+
"output_type": "stream",
|
45 |
+
"text": [
|
46 |
+
"Loading the LM will be faster if you build a binary file.\n",
|
47 |
+
"Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
|
48 |
+
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
49 |
+
"Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
|
50 |
+
"****************************************************************************************************\n"
|
51 |
+
]
|
52 |
+
}
|
53 |
+
],
|
54 |
+
"source": [
|
55 |
+
"processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"execution_count": 4,
|
61 |
+
"id": "109f28e9",
|
62 |
+
"metadata": {},
|
63 |
+
"outputs": [
|
64 |
+
{
|
65 |
+
"name": "stdout",
|
66 |
+
"output_type": "stream",
|
67 |
+
"text": [
|
68 |
+
"{'|': 0, 'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70, '[unk]': 71, '[pad]': 72, '<s>': 73, '</s>': 74}\n"
|
69 |
+
]
|
70 |
+
}
|
71 |
+
],
|
72 |
+
"source": [
|
73 |
+
"vocab_dict = processor.tokenizer.get_vocab()\n",
|
74 |
+
"sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}\n",
|
75 |
+
"print(sorted_vocab_dict)"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 5,
|
81 |
+
"id": "300cec39",
|
82 |
+
"metadata": {},
|
83 |
+
"outputs": [
|
84 |
+
{
|
85 |
+
"name": "stderr",
|
86 |
+
"output_type": "stream",
|
87 |
+
"text": [
|
88 |
+
"Loading the LM will be faster if you build a binary file.\n",
|
89 |
+
"Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
|
90 |
+
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
91 |
+
"Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
|
92 |
+
"****************************************************************************************************\n"
|
93 |
+
]
|
94 |
+
}
|
95 |
+
],
|
96 |
+
"source": [
|
97 |
+
"decoder = build_ctcdecoder(\n",
|
98 |
+
" labels=list(sorted_vocab_dict.keys()),\n",
|
99 |
+
" kenlm_model_path=KENLM_MODEL_LOC,\n",
|
100 |
+
")"
|
101 |
+
]
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"cell_type": "code",
|
105 |
+
"execution_count": 8,
|
106 |
+
"id": "27dd8427",
|
107 |
+
"metadata": {},
|
108 |
+
"outputs": [],
|
109 |
+
"source": [
|
110 |
+
"processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
|
111 |
+
" feature_extractor=processor.feature_extractor,\n",
|
112 |
+
" tokenizer=processor.tokenizer,\n",
|
113 |
+
" decoder=decoder\n",
|
114 |
+
")"
|
115 |
+
]
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"cell_type": "code",
|
119 |
+
"execution_count": 9,
|
120 |
+
"id": "94eb248e",
|
121 |
+
"metadata": {},
|
122 |
+
"outputs": [],
|
123 |
+
"source": [
|
124 |
+
"processor_with_lm.save_pretrained(\".\")"
|
125 |
+
]
|
126 |
+
},
|
127 |
+
{
|
128 |
+
"cell_type": "markdown",
|
129 |
+
"id": "8f9b3dcc",
|
130 |
+
"metadata": {},
|
131 |
+
"source": [
|
132 |
+
"## Save Model"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": 9,
|
138 |
+
"id": "8b584690",
|
139 |
+
"metadata": {},
|
140 |
+
"outputs": [
|
141 |
+
{
|
142 |
+
"data": {
|
143 |
+
"application/vnd.jupyter.widget-view+json": {
|
144 |
+
"model_id": "bc5bf68946064e97b869d44b02e7af19",
|
145 |
+
"version_major": 2,
|
146 |
+
"version_minor": 0
|
147 |
+
},
|
148 |
+
"text/plain": [
|
149 |
+
"Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
"metadata": {},
|
153 |
+
"output_type": "display_data"
|
154 |
+
}
|
155 |
+
],
|
156 |
+
"source": [
|
157 |
+
"model = AutoModelForCTC.from_pretrained(\"vitouphy/xls-r-300m-km\")"
|
158 |
+
]
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"cell_type": "code",
|
162 |
+
"execution_count": 12,
|
163 |
+
"id": "3712c030",
|
164 |
+
"metadata": {},
|
165 |
+
"outputs": [],
|
166 |
+
"source": [
|
167 |
+
"model.save_pretrained('.')"
|
168 |
+
]
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"cell_type": "code",
|
172 |
+
"execution_count": null,
|
173 |
+
"id": "b5d8de20",
|
174 |
+
"metadata": {},
|
175 |
+
"outputs": [],
|
176 |
+
"source": []
|
177 |
+
}
|
178 |
+
],
|
179 |
+
"metadata": {
|
180 |
+
"kernelspec": {
|
181 |
+
"display_name": "Python 3 (ipykernel)",
|
182 |
+
"language": "python",
|
183 |
+
"name": "python3"
|
184 |
+
},
|
185 |
+
"language_info": {
|
186 |
+
"codemirror_mode": {
|
187 |
+
"name": "ipython",
|
188 |
+
"version": 3
|
189 |
+
},
|
190 |
+
"file_extension": ".py",
|
191 |
+
"mimetype": "text/x-python",
|
192 |
+
"name": "python",
|
193 |
+
"nbconvert_exporter": "python",
|
194 |
+
"pygments_lexer": "ipython3",
|
195 |
+
"version": "3.8.8"
|
196 |
+
}
|
197 |
+
},
|
198 |
+
"nbformat": 4,
|
199 |
+
"nbformat_minor": 5
|
200 |
+
}
|
eval.sh
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
./eval.py \
|
2 |
-
--model_id
|
3 |
--dataset openslr \
|
4 |
--config km \
|
5 |
--split test \
|
|
|
1 |
./eval.py \
|
2 |
+
--model_id vitouphy/xls-r-300m-km \
|
3 |
--dataset openslr \
|
4 |
--config km \
|
5 |
--split test \
|
inference.ipynb
CHANGED
@@ -2,32 +2,40 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
10 |
-
"from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
|
11 |
"from datasets import load_dataset, load_metric, Audio\n",
|
|
|
|
|
|
|
|
|
12 |
"import numpy as np\n",
|
13 |
-
"import torch"
|
|
|
|
|
|
|
|
|
14 |
]
|
15 |
},
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
-
"execution_count":
|
19 |
-
"id": "
|
20 |
"metadata": {},
|
21 |
"outputs": [],
|
22 |
"source": [
|
23 |
-
"model = AutoModelForCTC.from_pretrained(\".\")
|
24 |
"processor = Wav2Vec2Processor.from_pretrained(\".\")"
|
25 |
]
|
26 |
},
|
27 |
{
|
28 |
"cell_type": "code",
|
29 |
"execution_count": 28,
|
30 |
-
"id": "
|
31 |
"metadata": {},
|
32 |
"outputs": [],
|
33 |
"source": [
|
@@ -37,62 +45,43 @@
|
|
37 |
},
|
38 |
{
|
39 |
"cell_type": "code",
|
40 |
-
"execution_count":
|
41 |
-
"id": "
|
42 |
"metadata": {},
|
43 |
"outputs": [
|
44 |
{
|
45 |
"name": "stderr",
|
46 |
"output_type": "stream",
|
47 |
"text": [
|
48 |
-
"Using custom data configuration default-
|
49 |
-
"Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-
|
50 |
]
|
51 |
}
|
52 |
],
|
53 |
"source": [
|
54 |
-
"common_voice_test
|
55 |
-
]
|
56 |
-
},
|
57 |
-
{
|
58 |
-
"cell_type": "code",
|
59 |
-
"execution_count": 30,
|
60 |
-
"id": "f14c1cfa",
|
61 |
-
"metadata": {},
|
62 |
-
"outputs": [],
|
63 |
-
"source": [
|
64 |
-
"common_voice_test = (common_voice_test\n",
|
65 |
" .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
|
66 |
-
" .rename_column('text', 'sentence')
|
|
|
67 |
]
|
68 |
},
|
69 |
{
|
70 |
"cell_type": "code",
|
71 |
-
"execution_count":
|
72 |
-
"id": "
|
73 |
-
"metadata": {},
|
74 |
-
"outputs": [],
|
75 |
-
"source": [
|
76 |
-
"common_voice_test = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')"
|
77 |
-
]
|
78 |
-
},
|
79 |
-
{
|
80 |
-
"cell_type": "code",
|
81 |
-
"execution_count": 32,
|
82 |
-
"id": "64758ba8",
|
83 |
"metadata": {},
|
84 |
"outputs": [
|
85 |
{
|
86 |
"data": {
|
87 |
"text/plain": [
|
88 |
-
"{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/
|
89 |
-
" 'array': array([
|
90 |
-
" -
|
91 |
" 'sampling_rate': 16000},\n",
|
92 |
-
" 'sentence': '
|
93 |
]
|
94 |
},
|
95 |
-
"execution_count":
|
96 |
"metadata": {},
|
97 |
"output_type": "execute_result"
|
98 |
}
|
@@ -103,8 +92,8 @@
|
|
103 |
},
|
104 |
{
|
105 |
"cell_type": "code",
|
106 |
-
"execution_count":
|
107 |
-
"id": "
|
108 |
"metadata": {},
|
109 |
"outputs": [],
|
110 |
"source": [
|
@@ -122,15 +111,15 @@
|
|
122 |
},
|
123 |
{
|
124 |
"cell_type": "code",
|
125 |
-
"execution_count":
|
126 |
-
"id": "
|
127 |
"metadata": {},
|
128 |
"outputs": [
|
129 |
{
|
130 |
"name": "stderr",
|
131 |
"output_type": "stream",
|
132 |
"text": [
|
133 |
-
"Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-
|
134 |
]
|
135 |
}
|
136 |
],
|
@@ -140,8 +129,8 @@
|
|
140 |
},
|
141 |
{
|
142 |
"cell_type": "code",
|
143 |
-
"execution_count":
|
144 |
-
"id": "
|
145 |
"metadata": {},
|
146 |
"outputs": [],
|
147 |
"source": [
|
@@ -150,8 +139,48 @@
|
|
150 |
},
|
151 |
{
|
152 |
"cell_type": "code",
|
153 |
-
"execution_count":
|
154 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
"metadata": {},
|
156 |
"outputs": [
|
157 |
{
|
@@ -170,8 +199,41 @@
|
|
170 |
},
|
171 |
{
|
172 |
"cell_type": "code",
|
173 |
-
"execution_count":
|
174 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
"metadata": {},
|
176 |
"outputs": [
|
177 |
{
|
@@ -179,10 +241,10 @@
|
|
179 |
"output_type": "stream",
|
180 |
"text": [
|
181 |
"Prediction:\n",
|
182 |
-
"
|
183 |
"\n",
|
184 |
"Reference:\n",
|
185 |
-
"
|
186 |
]
|
187 |
}
|
188 |
],
|
@@ -199,7 +261,7 @@
|
|
199 |
{
|
200 |
"cell_type": "code",
|
201 |
"execution_count": null,
|
202 |
-
"id": "
|
203 |
"metadata": {},
|
204 |
"outputs": [],
|
205 |
"source": []
|
@@ -207,7 +269,7 @@
|
|
207 |
{
|
208 |
"cell_type": "code",
|
209 |
"execution_count": null,
|
210 |
-
"id": "
|
211 |
"metadata": {},
|
212 |
"outputs": [],
|
213 |
"source": []
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "33e4a305",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
10 |
+
"from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor\n",
|
11 |
"from datasets import load_dataset, load_metric, Audio\n",
|
12 |
+
"from pyctcdecode import build_ctcdecoder\n",
|
13 |
+
"from pydub import AudioSegment\n",
|
14 |
+
"from pydub.playback import play\n",
|
15 |
+
"\n",
|
16 |
"import numpy as np\n",
|
17 |
+
"import torch\n",
|
18 |
+
"import kenlm\n",
|
19 |
+
"import pandas as pd\n",
|
20 |
+
"import random\n",
|
21 |
+
"import soundfile as sf"
|
22 |
]
|
23 |
},
|
24 |
{
|
25 |
"cell_type": "code",
|
26 |
+
"execution_count": 2,
|
27 |
+
"id": "328d0662",
|
28 |
"metadata": {},
|
29 |
"outputs": [],
|
30 |
"source": [
|
31 |
+
"model = AutoModelForCTC.from_pretrained(\".\")\n",
|
32 |
"processor = Wav2Vec2Processor.from_pretrained(\".\")"
|
33 |
]
|
34 |
},
|
35 |
{
|
36 |
"cell_type": "code",
|
37 |
"execution_count": 28,
|
38 |
+
"id": "0fea2518",
|
39 |
"metadata": {},
|
40 |
"outputs": [],
|
41 |
"source": [
|
|
|
45 |
},
|
46 |
{
|
47 |
"cell_type": "code",
|
48 |
+
"execution_count": 3,
|
49 |
+
"id": "9cfef23c",
|
50 |
"metadata": {},
|
51 |
"outputs": [
|
52 |
{
|
53 |
"name": "stderr",
|
54 |
"output_type": "stream",
|
55 |
"text": [
|
56 |
+
"Using custom data configuration default-36119ec2a15afb82\n",
|
57 |
+
"Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
|
58 |
]
|
59 |
}
|
60 |
],
|
61 |
"source": [
|
62 |
+
"common_voice_test = (load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
" .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
|
64 |
+
" .rename_column('text', 'sentence')\n",
|
65 |
+
" .cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio'))"
|
66 |
]
|
67 |
},
|
68 |
{
|
69 |
"cell_type": "code",
|
70 |
+
"execution_count": 4,
|
71 |
+
"id": "29e6bb1a",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
"metadata": {},
|
73 |
"outputs": [
|
74 |
{
|
75 |
"data": {
|
76 |
"text/plain": [
|
77 |
+
"{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_3154_2555595821.wav',\n",
|
78 |
+
" 'array': array([ 0.00014737, 0.00016698, 0.00013704, ..., -0.00011244,\n",
|
79 |
+
" -0.0001059 , -0.00011476], dtype=float32),\n",
|
80 |
" 'sampling_rate': 16000},\n",
|
81 |
+
" 'sentence': 'แแถแ แแแแพ แขแถแแธแแแแแ แแแ แแแแผแ แแ
แแแแแปแแถ'}"
|
82 |
]
|
83 |
},
|
84 |
+
"execution_count": 4,
|
85 |
"metadata": {},
|
86 |
"output_type": "execute_result"
|
87 |
}
|
|
|
92 |
},
|
93 |
{
|
94 |
"cell_type": "code",
|
95 |
+
"execution_count": 5,
|
96 |
+
"id": "0554b8d8",
|
97 |
"metadata": {},
|
98 |
"outputs": [],
|
99 |
"source": [
|
|
|
111 |
},
|
112 |
{
|
113 |
"cell_type": "code",
|
114 |
+
"execution_count": 6,
|
115 |
+
"id": "d26a6659",
|
116 |
"metadata": {},
|
117 |
"outputs": [
|
118 |
{
|
119 |
"name": "stderr",
|
120 |
"output_type": "stream",
|
121 |
"text": [
|
122 |
+
"Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-081703c0621182da.arrow\n"
|
123 |
]
|
124 |
}
|
125 |
],
|
|
|
129 |
},
|
130 |
{
|
131 |
"cell_type": "code",
|
132 |
+
"execution_count": 9,
|
133 |
+
"id": "04a94f74",
|
134 |
"metadata": {},
|
135 |
"outputs": [],
|
136 |
"source": [
|
|
|
139 |
},
|
140 |
{
|
141 |
"cell_type": "code",
|
142 |
+
"execution_count": 10,
|
143 |
+
"id": "3993d2c4",
|
144 |
+
"metadata": {},
|
145 |
+
"outputs": [
|
146 |
+
{
|
147 |
+
"name": "stderr",
|
148 |
+
"output_type": "stream",
|
149 |
+
"text": [
|
150 |
+
"It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
|
151 |
+
]
|
152 |
+
}
|
153 |
+
],
|
154 |
+
"source": [
|
155 |
+
"input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"execution_count": 11,
|
161 |
+
"id": "7e3026dc",
|
162 |
+
"metadata": {},
|
163 |
+
"outputs": [
|
164 |
+
{
|
165 |
+
"data": {
|
166 |
+
"text/plain": [
|
167 |
+
"{'input_values': tensor([[ 2.8537e-04, 2.5043e-04, 2.7738e-04, ..., -4.8949e-05,\n",
|
168 |
+
" -1.1382e-04, 2.7166e-04]]), 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1]], dtype=torch.int32)}"
|
169 |
+
]
|
170 |
+
},
|
171 |
+
"execution_count": 11,
|
172 |
+
"metadata": {},
|
173 |
+
"output_type": "execute_result"
|
174 |
+
}
|
175 |
+
],
|
176 |
+
"source": [
|
177 |
+
"input_dict"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"cell_type": "code",
|
182 |
+
"execution_count": 12,
|
183 |
+
"id": "adf215c0",
|
184 |
"metadata": {},
|
185 |
"outputs": [
|
186 |
{
|
|
|
199 |
},
|
200 |
{
|
201 |
"cell_type": "code",
|
202 |
+
"execution_count": 14,
|
203 |
+
"id": "e8310629",
|
204 |
+
"metadata": {},
|
205 |
+
"outputs": [
|
206 |
+
{
|
207 |
+
"data": {
|
208 |
+
"text/plain": [
|
209 |
+
"tensor([ 1, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
|
210 |
+
" 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 10, 70, 70, 70, 10, 72,\n",
|
211 |
+
" 43, 72, 72, 72, 72, 72, 72, 0, 0, 72, 72, 18, 72, 54, 72, 72, 72, 72,\n",
|
212 |
+
" 72, 0, 72, 21, 72, 49, 72, 72, 72, 72, 72, 72, 23, 70, 70, 27, 72, 46,\n",
|
213 |
+
" 72, 72, 72, 1, 72, 0, 0, 30, 72, 72, 72, 72, 25, 70, 70, 72, 72, 11,\n",
|
214 |
+
" 55, 72, 72, 72, 72, 5, 72, 0, 20, 58, 72, 72, 72, 0, 0, 16, 72, 72,\n",
|
215 |
+
" 72, 20, 70, 70, 72, 72, 16, 70, 27, 72, 72, 72, 72, 72, 45, 0, 0, 30,\n",
|
216 |
+
" 30, 70, 70, 27, 72, 43, 72, 72, 72, 72, 72, 72, 21, 72, 53, 72, 72, 72,\n",
|
217 |
+
" 27, 72, 0, 1, 72, 72, 72, 72, 25, 70, 23, 23, 48, 72, 72, 72, 72, 72,\n",
|
218 |
+
" 72, 8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
|
219 |
+
" 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
|
220 |
+
" 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
|
221 |
+
" 72, 72, 72, 72, 72, 72, 72, 72, 43], device='cuda:0')"
|
222 |
+
]
|
223 |
+
},
|
224 |
+
"execution_count": 14,
|
225 |
+
"metadata": {},
|
226 |
+
"output_type": "execute_result"
|
227 |
+
}
|
228 |
+
],
|
229 |
+
"source": [
|
230 |
+
"pred_ids"
|
231 |
+
]
|
232 |
+
},
|
233 |
+
{
|
234 |
+
"cell_type": "code",
|
235 |
+
"execution_count": 15,
|
236 |
+
"id": "5dd986a0",
|
237 |
"metadata": {},
|
238 |
"outputs": [
|
239 |
{
|
|
|
241 |
"output_type": "stream",
|
242 |
"text": [
|
243 |
"Prediction:\n",
|
244 |
+
"แแแแแถ แแ แแผแแแแนแ แแแแแแ แแ
แแแแแแแธ แแแแถแแแ แแแแแปแแถ\n",
|
245 |
"\n",
|
246 |
"Reference:\n",
|
247 |
+
"แแแแแถ แแแ แแผแแแแนแแแ แแแแแแ แแ
แแแแแแแธ แแแแถแแแ แแแแแปแแถ\n"
|
248 |
]
|
249 |
}
|
250 |
],
|
|
|
261 |
{
|
262 |
"cell_type": "code",
|
263 |
"execution_count": null,
|
264 |
+
"id": "8e39b112",
|
265 |
"metadata": {},
|
266 |
"outputs": [],
|
267 |
"source": []
|
|
|
269 |
{
|
270 |
"cell_type": "code",
|
271 |
"execution_count": null,
|
272 |
+
"id": "562af933",
|
273 |
"metadata": {},
|
274 |
"outputs": [],
|
275 |
"source": []
|
language_model/attrs.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
|
language_model/km_wiki_ngram.arpa
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4eae7d94d04e95668df7306edf35e21f4bbab2a73c736b921e531cd25cde6d0
|
3 |
+
size 109085039
|
language_model/unigrams.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
preprocessor_config.json
CHANGED
@@ -4,6 +4,7 @@
|
|
4 |
"feature_size": 1,
|
5 |
"padding_side": "right",
|
6 |
"padding_value": 0.0,
|
|
|
7 |
"return_attention_mask": true,
|
8 |
"sampling_rate": 16000
|
9 |
}
|
|
|
4 |
"feature_size": 1,
|
5 |
"padding_side": "right",
|
6 |
"padding_value": 0.0,
|
7 |
+
"processor_class": "Wav2Vec2ProcessorWithLM",
|
8 |
"return_attention_mask": true,
|
9 |
"sampling_rate": 16000
|
10 |
}
|
special_tokens_map.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
tokenizer_config.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "
|
|
|
1 |
+
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "vitouphy/xls-r-300m-km", "processor_class": "Wav2Vec2ProcessorWithLM", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
train_kh.ipynb
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": 1,
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
@@ -16,7 +16,7 @@
|
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
"execution_count": null,
|
19 |
-
"id": "
|
20 |
"metadata": {
|
21 |
"collapsed": true,
|
22 |
"jupyter": {
|
@@ -19167,7 +19167,7 @@
|
|
19167 |
},
|
19168 |
{
|
19169 |
"cell_type": "markdown",
|
19170 |
-
"id": "
|
19171 |
"metadata": {},
|
19172 |
"source": [
|
19173 |
"### Load KH Data"
|
@@ -19176,7 +19176,7 @@
|
|
19176 |
{
|
19177 |
"cell_type": "code",
|
19178 |
"execution_count": 6,
|
19179 |
-
"id": "
|
19180 |
"metadata": {},
|
19181 |
"outputs": [],
|
19182 |
"source": [
|
@@ -19199,7 +19199,7 @@
|
|
19199 |
{
|
19200 |
"cell_type": "code",
|
19201 |
"execution_count": 2,
|
19202 |
-
"id": "
|
19203 |
"metadata": {},
|
19204 |
"outputs": [
|
19205 |
{
|
@@ -19221,7 +19221,7 @@
|
|
19221 |
{
|
19222 |
"cell_type": "code",
|
19223 |
"execution_count": 3,
|
19224 |
-
"id": "
|
19225 |
"metadata": {},
|
19226 |
"outputs": [],
|
19227 |
"source": [
|
@@ -19235,7 +19235,7 @@
|
|
19235 |
},
|
19236 |
{
|
19237 |
"cell_type": "markdown",
|
19238 |
-
"id": "
|
19239 |
"metadata": {},
|
19240 |
"source": [
|
19241 |
"### Clean Up the Text"
|
@@ -19244,7 +19244,7 @@
|
|
19244 |
{
|
19245 |
"cell_type": "code",
|
19246 |
"execution_count": 4,
|
19247 |
-
"id": "
|
19248 |
"metadata": {},
|
19249 |
"outputs": [],
|
19250 |
"source": [
|
@@ -19260,7 +19260,7 @@
|
|
19260 |
{
|
19261 |
"cell_type": "code",
|
19262 |
"execution_count": 5,
|
19263 |
-
"id": "
|
19264 |
"metadata": {},
|
19265 |
"outputs": [],
|
19266 |
"source": [
|
@@ -19272,7 +19272,7 @@
|
|
19272 |
{
|
19273 |
"cell_type": "code",
|
19274 |
"execution_count": 6,
|
19275 |
-
"id": "
|
19276 |
"metadata": {},
|
19277 |
"outputs": [
|
19278 |
{
|
@@ -19293,7 +19293,7 @@
|
|
19293 |
},
|
19294 |
{
|
19295 |
"cell_type": "markdown",
|
19296 |
-
"id": "
|
19297 |
"metadata": {},
|
19298 |
"source": [
|
19299 |
"### Build Character"
|
@@ -19302,7 +19302,7 @@
|
|
19302 |
{
|
19303 |
"cell_type": "code",
|
19304 |
"execution_count": 7,
|
19305 |
-
"id": "
|
19306 |
"metadata": {},
|
19307 |
"outputs": [
|
19308 |
{
|
@@ -19350,7 +19350,7 @@
|
|
19350 |
{
|
19351 |
"cell_type": "code",
|
19352 |
"execution_count": 8,
|
19353 |
-
"id": "
|
19354 |
"metadata": {},
|
19355 |
"outputs": [],
|
19356 |
"source": [
|
@@ -19361,7 +19361,7 @@
|
|
19361 |
{
|
19362 |
"cell_type": "code",
|
19363 |
"execution_count": 9,
|
19364 |
-
"id": "
|
19365 |
"metadata": {},
|
19366 |
"outputs": [
|
19367 |
{
|
@@ -19379,7 +19379,7 @@
|
|
19379 |
{
|
19380 |
"cell_type": "code",
|
19381 |
"execution_count": 10,
|
19382 |
-
"id": "
|
19383 |
"metadata": {},
|
19384 |
"outputs": [
|
19385 |
{
|
@@ -19406,7 +19406,7 @@
|
|
19406 |
{
|
19407 |
"cell_type": "code",
|
19408 |
"execution_count": 11,
|
19409 |
-
"id": "
|
19410 |
"metadata": {},
|
19411 |
"outputs": [
|
19412 |
{
|
@@ -19424,7 +19424,7 @@
|
|
19424 |
{
|
19425 |
"cell_type": "code",
|
19426 |
"execution_count": 12,
|
19427 |
-
"id": "
|
19428 |
"metadata": {},
|
19429 |
"outputs": [],
|
19430 |
"source": [
|
@@ -19435,7 +19435,7 @@
|
|
19435 |
},
|
19436 |
{
|
19437 |
"cell_type": "markdown",
|
19438 |
-
"id": "
|
19439 |
"metadata": {},
|
19440 |
"source": [
|
19441 |
"# Tokenizer"
|
@@ -19444,7 +19444,7 @@
|
|
19444 |
{
|
19445 |
"cell_type": "code",
|
19446 |
"execution_count": 13,
|
19447 |
-
"id": "
|
19448 |
"metadata": {},
|
19449 |
"outputs": [],
|
19450 |
"source": [
|
@@ -19456,7 +19456,7 @@
|
|
19456 |
{
|
19457 |
"cell_type": "code",
|
19458 |
"execution_count": 14,
|
19459 |
-
"id": "
|
19460 |
"metadata": {},
|
19461 |
"outputs": [],
|
19462 |
"source": [
|
@@ -19468,7 +19468,7 @@
|
|
19468 |
{
|
19469 |
"cell_type": "code",
|
19470 |
"execution_count": 26,
|
19471 |
-
"id": "
|
19472 |
"metadata": {},
|
19473 |
"outputs": [],
|
19474 |
"source": [
|
@@ -19485,7 +19485,7 @@
|
|
19485 |
{
|
19486 |
"cell_type": "code",
|
19487 |
"execution_count": 27,
|
19488 |
-
"id": "
|
19489 |
"metadata": {},
|
19490 |
"outputs": [
|
19491 |
{
|
@@ -19525,7 +19525,7 @@
|
|
19525 |
{
|
19526 |
"cell_type": "code",
|
19527 |
"execution_count": 15,
|
19528 |
-
"id": "
|
19529 |
"metadata": {},
|
19530 |
"outputs": [],
|
19531 |
"source": [
|
@@ -19536,7 +19536,7 @@
|
|
19536 |
{
|
19537 |
"cell_type": "code",
|
19538 |
"execution_count": 16,
|
19539 |
-
"id": "
|
19540 |
"metadata": {},
|
19541 |
"outputs": [
|
19542 |
{
|
@@ -19561,7 +19561,7 @@
|
|
19561 |
{
|
19562 |
"cell_type": "code",
|
19563 |
"execution_count": 17,
|
19564 |
-
"id": "
|
19565 |
"metadata": {},
|
19566 |
"outputs": [
|
19567 |
{
|
@@ -19608,7 +19608,7 @@
|
|
19608 |
{
|
19609 |
"cell_type": "code",
|
19610 |
"execution_count": 18,
|
19611 |
-
"id": "
|
19612 |
"metadata": {},
|
19613 |
"outputs": [],
|
19614 |
"source": [
|
@@ -19630,7 +19630,7 @@
|
|
19630 |
{
|
19631 |
"cell_type": "code",
|
19632 |
"execution_count": 19,
|
19633 |
-
"id": "
|
19634 |
"metadata": {
|
19635 |
"collapsed": true,
|
19636 |
"jupyter": {
|
@@ -19669,7 +19669,7 @@
|
|
19669 |
{
|
19670 |
"cell_type": "code",
|
19671 |
"execution_count": 20,
|
19672 |
-
"id": "
|
19673 |
"metadata": {},
|
19674 |
"outputs": [],
|
19675 |
"source": [
|
@@ -19681,7 +19681,7 @@
|
|
19681 |
{
|
19682 |
"cell_type": "code",
|
19683 |
"execution_count": 21,
|
19684 |
-
"id": "
|
19685 |
"metadata": {},
|
19686 |
"outputs": [],
|
19687 |
"source": [
|
@@ -19741,7 +19741,7 @@
|
|
19741 |
{
|
19742 |
"cell_type": "code",
|
19743 |
"execution_count": 22,
|
19744 |
-
"id": "
|
19745 |
"metadata": {},
|
19746 |
"outputs": [],
|
19747 |
"source": [
|
@@ -19751,7 +19751,7 @@
|
|
19751 |
{
|
19752 |
"cell_type": "code",
|
19753 |
"execution_count": 23,
|
19754 |
-
"id": "
|
19755 |
"metadata": {},
|
19756 |
"outputs": [],
|
19757 |
"source": [
|
@@ -19762,7 +19762,7 @@
|
|
19762 |
{
|
19763 |
"cell_type": "code",
|
19764 |
"execution_count": 24,
|
19765 |
-
"id": "
|
19766 |
"metadata": {},
|
19767 |
"outputs": [],
|
19768 |
"source": [
|
@@ -19783,7 +19783,7 @@
|
|
19783 |
{
|
19784 |
"cell_type": "code",
|
19785 |
"execution_count": 25,
|
19786 |
-
"id": "
|
19787 |
"metadata": {},
|
19788 |
"outputs": [
|
19789 |
{
|
@@ -19819,7 +19819,7 @@
|
|
19819 |
{
|
19820 |
"cell_type": "code",
|
19821 |
"execution_count": 26,
|
19822 |
-
"id": "
|
19823 |
"metadata": {},
|
19824 |
"outputs": [],
|
19825 |
"source": [
|
@@ -19829,7 +19829,7 @@
|
|
19829 |
{
|
19830 |
"cell_type": "code",
|
19831 |
"execution_count": 27,
|
19832 |
-
"id": "
|
19833 |
"metadata": {},
|
19834 |
"outputs": [],
|
19835 |
"source": [
|
@@ -19857,7 +19857,7 @@
|
|
19857 |
{
|
19858 |
"cell_type": "code",
|
19859 |
"execution_count": 29,
|
19860 |
-
"id": "
|
19861 |
"metadata": {},
|
19862 |
"outputs": [
|
19863 |
{
|
@@ -19885,7 +19885,7 @@
|
|
19885 |
{
|
19886 |
"cell_type": "code",
|
19887 |
"execution_count": 30,
|
19888 |
-
"id": "
|
19889 |
"metadata": {},
|
19890 |
"outputs": [
|
19891 |
{
|
@@ -20232,7 +20232,7 @@
|
|
20232 |
{
|
20233 |
"cell_type": "code",
|
20234 |
"execution_count": 31,
|
20235 |
-
"id": "
|
20236 |
"metadata": {},
|
20237 |
"outputs": [
|
20238 |
{
|
@@ -20253,7 +20253,7 @@
|
|
20253 |
{
|
20254 |
"cell_type": "code",
|
20255 |
"execution_count": 32,
|
20256 |
-
"id": "
|
20257 |
"metadata": {},
|
20258 |
"outputs": [
|
20259 |
{
|
@@ -20286,7 +20286,7 @@
|
|
20286 |
{
|
20287 |
"cell_type": "code",
|
20288 |
"execution_count": 34,
|
20289 |
-
"id": "
|
20290 |
"metadata": {},
|
20291 |
"outputs": [],
|
20292 |
"source": [
|
@@ -20303,7 +20303,7 @@
|
|
20303 |
{
|
20304 |
"cell_type": "code",
|
20305 |
"execution_count": 35,
|
20306 |
-
"id": "
|
20307 |
"metadata": {},
|
20308 |
"outputs": [
|
20309 |
{
|
@@ -20322,7 +20322,7 @@
|
|
20322 |
{
|
20323 |
"cell_type": "code",
|
20324 |
"execution_count": 36,
|
20325 |
-
"id": "
|
20326 |
"metadata": {},
|
20327 |
"outputs": [
|
20328 |
{
|
@@ -20373,8 +20373,8 @@
|
|
20373 |
},
|
20374 |
{
|
20375 |
"cell_type": "code",
|
20376 |
-
"execution_count":
|
20377 |
-
"id": "
|
20378 |
"metadata": {},
|
20379 |
"outputs": [
|
20380 |
{
|
@@ -20395,7 +20395,7 @@
|
|
20395 |
{
|
20396 |
"cell_type": "code",
|
20397 |
"execution_count": null,
|
20398 |
-
"id": "
|
20399 |
"metadata": {},
|
20400 |
"outputs": [],
|
20401 |
"source": []
|
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": 1,
|
6 |
+
"id": "0ee7433e",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": [
|
|
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
"execution_count": null,
|
19 |
+
"id": "90323ec7",
|
20 |
"metadata": {
|
21 |
"collapsed": true,
|
22 |
"jupyter": {
|
|
|
19167 |
},
|
19168 |
{
|
19169 |
"cell_type": "markdown",
|
19170 |
+
"id": "eda834f4",
|
19171 |
"metadata": {},
|
19172 |
"source": [
|
19173 |
"### Load KH Data"
|
|
|
19176 |
{
|
19177 |
"cell_type": "code",
|
19178 |
"execution_count": 6,
|
19179 |
+
"id": "e8b86dab",
|
19180 |
"metadata": {},
|
19181 |
"outputs": [],
|
19182 |
"source": [
|
|
|
19199 |
{
|
19200 |
"cell_type": "code",
|
19201 |
"execution_count": 2,
|
19202 |
+
"id": "0b17a0e1",
|
19203 |
"metadata": {},
|
19204 |
"outputs": [
|
19205 |
{
|
|
|
19221 |
{
|
19222 |
"cell_type": "code",
|
19223 |
"execution_count": 3,
|
19224 |
+
"id": "21239531",
|
19225 |
"metadata": {},
|
19226 |
"outputs": [],
|
19227 |
"source": [
|
|
|
19235 |
},
|
19236 |
{
|
19237 |
"cell_type": "markdown",
|
19238 |
+
"id": "68736f61",
|
19239 |
"metadata": {},
|
19240 |
"source": [
|
19241 |
"### Clean Up the Text"
|
|
|
19244 |
{
|
19245 |
"cell_type": "code",
|
19246 |
"execution_count": 4,
|
19247 |
+
"id": "fcba882e",
|
19248 |
"metadata": {},
|
19249 |
"outputs": [],
|
19250 |
"source": [
|
|
|
19260 |
{
|
19261 |
"cell_type": "code",
|
19262 |
"execution_count": 5,
|
19263 |
+
"id": "9ef37613",
|
19264 |
"metadata": {},
|
19265 |
"outputs": [],
|
19266 |
"source": [
|
|
|
19272 |
{
|
19273 |
"cell_type": "code",
|
19274 |
"execution_count": 6,
|
19275 |
+
"id": "8e4fdc71",
|
19276 |
"metadata": {},
|
19277 |
"outputs": [
|
19278 |
{
|
|
|
19293 |
},
|
19294 |
{
|
19295 |
"cell_type": "markdown",
|
19296 |
+
"id": "1fcdf7d8",
|
19297 |
"metadata": {},
|
19298 |
"source": [
|
19299 |
"### Build Character"
|
|
|
19302 |
{
|
19303 |
"cell_type": "code",
|
19304 |
"execution_count": 7,
|
19305 |
+
"id": "7b7da87a",
|
19306 |
"metadata": {},
|
19307 |
"outputs": [
|
19308 |
{
|
|
|
19350 |
{
|
19351 |
"cell_type": "code",
|
19352 |
"execution_count": 8,
|
19353 |
+
"id": "eb6f0804",
|
19354 |
"metadata": {},
|
19355 |
"outputs": [],
|
19356 |
"source": [
|
|
|
19361 |
{
|
19362 |
"cell_type": "code",
|
19363 |
"execution_count": 9,
|
19364 |
+
"id": "9189ac57",
|
19365 |
"metadata": {},
|
19366 |
"outputs": [
|
19367 |
{
|
|
|
19379 |
{
|
19380 |
"cell_type": "code",
|
19381 |
"execution_count": 10,
|
19382 |
+
"id": "c5fb8a71",
|
19383 |
"metadata": {},
|
19384 |
"outputs": [
|
19385 |
{
|
|
|
19406 |
{
|
19407 |
"cell_type": "code",
|
19408 |
"execution_count": 11,
|
19409 |
+
"id": "10043978",
|
19410 |
"metadata": {},
|
19411 |
"outputs": [
|
19412 |
{
|
|
|
19424 |
{
|
19425 |
"cell_type": "code",
|
19426 |
"execution_count": 12,
|
19427 |
+
"id": "42f02a78",
|
19428 |
"metadata": {},
|
19429 |
"outputs": [],
|
19430 |
"source": [
|
|
|
19435 |
},
|
19436 |
{
|
19437 |
"cell_type": "markdown",
|
19438 |
+
"id": "95b09010",
|
19439 |
"metadata": {},
|
19440 |
"source": [
|
19441 |
"# Tokenizer"
|
|
|
19444 |
{
|
19445 |
"cell_type": "code",
|
19446 |
"execution_count": 13,
|
19447 |
+
"id": "c4d0f5a6",
|
19448 |
"metadata": {},
|
19449 |
"outputs": [],
|
19450 |
"source": [
|
|
|
19456 |
{
|
19457 |
"cell_type": "code",
|
19458 |
"execution_count": 14,
|
19459 |
+
"id": "825623c4",
|
19460 |
"metadata": {},
|
19461 |
"outputs": [],
|
19462 |
"source": [
|
|
|
19468 |
{
|
19469 |
"cell_type": "code",
|
19470 |
"execution_count": 26,
|
19471 |
+
"id": "cfb44de0",
|
19472 |
"metadata": {},
|
19473 |
"outputs": [],
|
19474 |
"source": [
|
|
|
19485 |
{
|
19486 |
"cell_type": "code",
|
19487 |
"execution_count": 27,
|
19488 |
+
"id": "05ab24c0",
|
19489 |
"metadata": {},
|
19490 |
"outputs": [
|
19491 |
{
|
|
|
19525 |
{
|
19526 |
"cell_type": "code",
|
19527 |
"execution_count": 15,
|
19528 |
+
"id": "0cfd158b",
|
19529 |
"metadata": {},
|
19530 |
"outputs": [],
|
19531 |
"source": [
|
|
|
19536 |
{
|
19537 |
"cell_type": "code",
|
19538 |
"execution_count": 16,
|
19539 |
+
"id": "10d224fa",
|
19540 |
"metadata": {},
|
19541 |
"outputs": [
|
19542 |
{
|
|
|
19561 |
{
|
19562 |
"cell_type": "code",
|
19563 |
"execution_count": 17,
|
19564 |
+
"id": "132efaa8",
|
19565 |
"metadata": {},
|
19566 |
"outputs": [
|
19567 |
{
|
|
|
19608 |
{
|
19609 |
"cell_type": "code",
|
19610 |
"execution_count": 18,
|
19611 |
+
"id": "c39872d6",
|
19612 |
"metadata": {},
|
19613 |
"outputs": [],
|
19614 |
"source": [
|
|
|
19630 |
{
|
19631 |
"cell_type": "code",
|
19632 |
"execution_count": 19,
|
19633 |
+
"id": "fef54a48",
|
19634 |
"metadata": {
|
19635 |
"collapsed": true,
|
19636 |
"jupyter": {
|
|
|
19669 |
{
|
19670 |
"cell_type": "code",
|
19671 |
"execution_count": 20,
|
19672 |
+
"id": "2f280b0d",
|
19673 |
"metadata": {},
|
19674 |
"outputs": [],
|
19675 |
"source": [
|
|
|
19681 |
{
|
19682 |
"cell_type": "code",
|
19683 |
"execution_count": 21,
|
19684 |
+
"id": "c9dec52e",
|
19685 |
"metadata": {},
|
19686 |
"outputs": [],
|
19687 |
"source": [
|
|
|
19741 |
{
|
19742 |
"cell_type": "code",
|
19743 |
"execution_count": 22,
|
19744 |
+
"id": "639dcc23",
|
19745 |
"metadata": {},
|
19746 |
"outputs": [],
|
19747 |
"source": [
|
|
|
19751 |
{
|
19752 |
"cell_type": "code",
|
19753 |
"execution_count": 23,
|
19754 |
+
"id": "3bb04288",
|
19755 |
"metadata": {},
|
19756 |
"outputs": [],
|
19757 |
"source": [
|
|
|
19762 |
{
|
19763 |
"cell_type": "code",
|
19764 |
"execution_count": 24,
|
19765 |
+
"id": "9ba8858b",
|
19766 |
"metadata": {},
|
19767 |
"outputs": [],
|
19768 |
"source": [
|
|
|
19783 |
{
|
19784 |
"cell_type": "code",
|
19785 |
"execution_count": 25,
|
19786 |
+
"id": "434869f9",
|
19787 |
"metadata": {},
|
19788 |
"outputs": [
|
19789 |
{
|
|
|
19819 |
{
|
19820 |
"cell_type": "code",
|
19821 |
"execution_count": 26,
|
19822 |
+
"id": "9ffb97fd",
|
19823 |
"metadata": {},
|
19824 |
"outputs": [],
|
19825 |
"source": [
|
|
|
19829 |
{
|
19830 |
"cell_type": "code",
|
19831 |
"execution_count": 27,
|
19832 |
+
"id": "c83b8d4e",
|
19833 |
"metadata": {},
|
19834 |
"outputs": [],
|
19835 |
"source": [
|
|
|
19857 |
{
|
19858 |
"cell_type": "code",
|
19859 |
"execution_count": 29,
|
19860 |
+
"id": "7352a29a",
|
19861 |
"metadata": {},
|
19862 |
"outputs": [
|
19863 |
{
|
|
|
19885 |
{
|
19886 |
"cell_type": "code",
|
19887 |
"execution_count": 30,
|
19888 |
+
"id": "5a73ff08",
|
19889 |
"metadata": {},
|
19890 |
"outputs": [
|
19891 |
{
|
|
|
20232 |
{
|
20233 |
"cell_type": "code",
|
20234 |
"execution_count": 31,
|
20235 |
+
"id": "967962d1",
|
20236 |
"metadata": {},
|
20237 |
"outputs": [
|
20238 |
{
|
|
|
20253 |
{
|
20254 |
"cell_type": "code",
|
20255 |
"execution_count": 32,
|
20256 |
+
"id": "da40a75c",
|
20257 |
"metadata": {},
|
20258 |
"outputs": [
|
20259 |
{
|
|
|
20286 |
{
|
20287 |
"cell_type": "code",
|
20288 |
"execution_count": 34,
|
20289 |
+
"id": "24166e72",
|
20290 |
"metadata": {},
|
20291 |
"outputs": [],
|
20292 |
"source": [
|
|
|
20303 |
{
|
20304 |
"cell_type": "code",
|
20305 |
"execution_count": 35,
|
20306 |
+
"id": "95d69b2e",
|
20307 |
"metadata": {},
|
20308 |
"outputs": [
|
20309 |
{
|
|
|
20322 |
{
|
20323 |
"cell_type": "code",
|
20324 |
"execution_count": 36,
|
20325 |
+
"id": "d60a731d",
|
20326 |
"metadata": {},
|
20327 |
"outputs": [
|
20328 |
{
|
|
|
20373 |
},
|
20374 |
{
|
20375 |
"cell_type": "code",
|
20376 |
+
"execution_count": 38,
|
20377 |
+
"id": "beca9a8c",
|
20378 |
"metadata": {},
|
20379 |
"outputs": [
|
20380 |
{
|
|
|
20395 |
{
|
20396 |
"cell_type": "code",
|
20397 |
"execution_count": null,
|
20398 |
+
"id": "20063dbc",
|
20399 |
"metadata": {},
|
20400 |
"outputs": [],
|
20401 |
"source": []
|
train_kh_lm.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
vocab.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"
|
|
|
1 |
+
{"แ": 1, "แ": 2, "แ": 3, "แ": 4, "แ": 5, "แ
": 6, "แ": 7, "แ": 8, "แ": 9, "แ": 10, "แ": 11, "แ": 12, "แ": 13, "แ": 14, "แ": 15, "แ": 16, "แ": 17, "แ": 18, "แ": 19, "แ": 20, "แ": 21, "แ": 22, "แ": 23, "แ": 24, "แ": 25, "แ": 26, "แ": 27, "แ": 28, "แ": 29, "แ": 30, "แ ": 31, "แก": 32, "แข": 33, "แฅ": 34, "แง": 35, "แช": 36, "แซ": 37, "แฌ": 38, "แญ": 39, "แฎ": 40, "แฏ": 41, "แฑ": 42, "แถ": 43, "แท": 44, "แธ": 45, "แน": 46, "แบ": 47, "แป": 48, "แผ": 49, "แฝ": 50, "แพ": 51, "แฟ": 52, "แ": 53, "แ": 54, "แ": 55, "แ": 56, "แ": 57, "แ
": 58, "แ": 59, "แ": 60, "แ": 61, "แ": 62, "แ": 63, "แ": 64, "แ": 65, "แ": 66, "แ": 67, "แ": 68, "แ": 69, "แ": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
|