vitouphy committed on
Commit
1a48339
•
1 Parent(s): 6bd1306

add language model

Browse files
.gitignore CHANGED
@@ -3,3 +3,4 @@ km_kh*
3
  .ipynb_checkpoints
4
  vitouphy
5
  *checkpoint*
 
 
3
  .ipynb_checkpoints
4
  vitouphy
5
  *checkpoint*
6
+ data
alphabet.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": [" ", "\u1780", "\u1781", "\u1782", "\u1783", "\u1784", "\u1785", "\u1786", "\u1787", "\u1788", "\u1789", "\u178a", "\u178b", "\u178c", "\u178d", "\u178e", "\u178f", "\u1790", "\u1791", "\u1792", "\u1793", "\u1794", "\u1795", "\u1796", "\u1797", "\u1798", "\u1799", "\u179a", "\u179b", "\u179c", "\u179f", "\u17a0", "\u17a1", "\u17a2", "\u17a5", "\u17a7", "\u17aa", "\u17ab", "\u17ac", "\u17ad", "\u17ae", "\u17af", "\u17b1", "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb", "\u17bc", "\u17bd", "\u17be", "\u17bf", "\u17c0", "\u17c1", "\u17c2", "\u17c3", "\u17c4", "\u17c5", "\u17c6", "\u17c7", "\u17c8", "\u17c9", "\u17ca", "\u17cb", "\u17cc", "\u17cd", "\u17ce", "\u17cf", "\u17d0", "\u17d2", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
build_lm_processor.ipynb ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "5393aa33",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor, AutoProcessor, Wav2Vec2ProcessorWithLM\n",
11
+ "from datasets import load_dataset, load_metric, Audio\n",
12
+ "from pyctcdecode import build_ctcdecoder\n",
13
+ "from pydub import AudioSegment\n",
14
+ "from pydub.playback import play\n",
15
+ "\n",
16
+ "import numpy as np\n",
17
+ "import torch\n",
18
+ "import kenlm\n",
19
+ "import pandas as pd\n",
20
+ "import random\n",
21
+ "import soundfile as sf\n",
22
+ "from tqdm.auto import tqdm"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "id": "2d34d3b8",
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "# KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_text_word_unigram.arpa'\n",
33
+ "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "id": "f0354cb2",
40
+ "metadata": {},
41
+ "outputs": [
42
+ {
43
+ "name": "stderr",
44
+ "output_type": "stream",
45
+ "text": [
46
+ "Loading the LM will be faster if you build a binary file.\n",
47
+ "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
48
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
49
+ "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
50
+ "****************************************************************************************************\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "processor = AutoProcessor.from_pretrained(\"vitouphy/xls-r-300m-km\")"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 4,
61
+ "id": "109f28e9",
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ "{'|': 0, 'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70, '[unk]': 71, '[pad]': 72, '<s>': 73, '</s>': 74}\n"
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "vocab_dict = processor.tokenizer.get_vocab()\n",
74
+ "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}\n",
75
+ "print(sorted_vocab_dict)"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 5,
81
+ "id": "300cec39",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "name": "stderr",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "Loading the LM will be faster if you build a binary file.\n",
89
+ "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
90
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
91
+ "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
92
+ "****************************************************************************************************\n"
93
+ ]
94
+ }
95
+ ],
96
+ "source": [
97
+ "decoder = build_ctcdecoder(\n",
98
+ " labels=list(sorted_vocab_dict.keys()),\n",
99
+ " kenlm_model_path=KENLM_MODEL_LOC,\n",
100
+ ")"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 8,
106
+ "id": "27dd8427",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
111
+ " feature_extractor=processor.feature_extractor,\n",
112
+ " tokenizer=processor.tokenizer,\n",
113
+ " decoder=decoder\n",
114
+ ")"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 9,
120
+ "id": "94eb248e",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "processor_with_lm.save_pretrained(\".\")"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "markdown",
129
+ "id": "8f9b3dcc",
130
+ "metadata": {},
131
+ "source": [
132
+ "## Save Model"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 9,
138
+ "id": "8b584690",
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "application/vnd.jupyter.widget-view+json": {
144
+ "model_id": "bc5bf68946064e97b869d44b02e7af19",
145
+ "version_major": 2,
146
+ "version_minor": 0
147
+ },
148
+ "text/plain": [
149
+ "Downloading: 0%| | 0.00/1.18G [00:00<?, ?B/s]"
150
+ ]
151
+ },
152
+ "metadata": {},
153
+ "output_type": "display_data"
154
+ }
155
+ ],
156
+ "source": [
157
+ "model = AutoModelForCTC.from_pretrained(\"vitouphy/xls-r-300m-km\")"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 12,
163
+ "id": "3712c030",
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "model.save_pretrained('.')"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "b5d8de20",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": []
177
+ }
178
+ ],
179
+ "metadata": {
180
+ "kernelspec": {
181
+ "display_name": "Python 3 (ipykernel)",
182
+ "language": "python",
183
+ "name": "python3"
184
+ },
185
+ "language_info": {
186
+ "codemirror_mode": {
187
+ "name": "ipython",
188
+ "version": 3
189
+ },
190
+ "file_extension": ".py",
191
+ "mimetype": "text/x-python",
192
+ "name": "python",
193
+ "nbconvert_exporter": "python",
194
+ "pygments_lexer": "ipython3",
195
+ "version": "3.8.8"
196
+ }
197
+ },
198
+ "nbformat": 4,
199
+ "nbformat_minor": 5
200
+ }
eval.sh CHANGED
@@ -1,5 +1,5 @@
1
  ./eval.py \
2
- --model_id ./ \
3
  --dataset openslr \
4
  --config km \
5
  --split test \
 
1
  ./eval.py \
2
+ --model_id vitouphy/xls-r-300m-km \
3
  --dataset openslr \
4
  --config km \
5
  --split test \
inference.ipynb CHANGED
@@ -2,32 +2,40 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 24,
6
- "id": "2bdeda95",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
10
- "from transformers import AutoModelForCTC, Wav2Vec2Processor\n",
11
  "from datasets import load_dataset, load_metric, Audio\n",
 
 
 
 
12
  "import numpy as np\n",
13
- "import torch"
 
 
 
 
14
  ]
15
  },
16
  {
17
  "cell_type": "code",
18
- "execution_count": 27,
19
- "id": "8f840be9",
20
  "metadata": {},
21
  "outputs": [],
22
  "source": [
23
- "model = AutoModelForCTC.from_pretrained(\".\").to('cuda')\n",
24
  "processor = Wav2Vec2Processor.from_pretrained(\".\")"
25
  ]
26
  },
27
  {
28
  "cell_type": "code",
29
  "execution_count": 28,
30
- "id": "46339a6d",
31
  "metadata": {},
32
  "outputs": [],
33
  "source": [
@@ -37,62 +45,43 @@
37
  },
38
  {
39
  "cell_type": "code",
40
- "execution_count": 29,
41
- "id": "2c28d4f3",
42
  "metadata": {},
43
  "outputs": [
44
  {
45
  "name": "stderr",
46
  "output_type": "stream",
47
  "text": [
48
- "Using custom data configuration default-fbad308ab5a03eb2\n",
49
- "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-fbad308ab5a03eb2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
50
  ]
51
  }
52
  ],
53
  "source": [
54
- "common_voice_test = load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')"
55
- ]
56
- },
57
- {
58
- "cell_type": "code",
59
- "execution_count": 30,
60
- "id": "f14c1cfa",
61
- "metadata": {},
62
- "outputs": [],
63
- "source": [
64
- "common_voice_test = (common_voice_test\n",
65
  " .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
66
- " .rename_column('text', 'sentence'))"
 
67
  ]
68
  },
69
  {
70
  "cell_type": "code",
71
- "execution_count": 31,
72
- "id": "b60360b2",
73
- "metadata": {},
74
- "outputs": [],
75
- "source": [
76
- "common_voice_test = common_voice_test.cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio')"
77
- ]
78
- },
79
- {
80
- "cell_type": "code",
81
- "execution_count": 32,
82
- "id": "64758ba8",
83
  "metadata": {},
84
  "outputs": [
85
  {
86
  "data": {
87
  "text/plain": [
88
- "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_1443_3799144408.wav',\n",
89
- " 'array': array([-1.0600963e-06, 1.2359066e-06, -1.4001107e-06, ...,\n",
90
- " -3.1423504e-05, 4.4914182e-06, 0.0000000e+00], dtype=float32),\n",
91
  " 'sampling_rate': 16000},\n",
92
- " 'sentence': 'แžŸแŸŠแžธ แžŠแžถแž…แŸ‹ แž˜แŸ‰แžผแžแžผ แž“แŸ… แž–แŸแž› แžŠแŸ‚แž› แž”แŸ’แžšแžพ แžฑแŸ’แž™ แžŒแžปแž” แžŸแž˜แŸ’แž—แžถแžšแŸˆ แž‚แŸ’แžšแžฟแž„ แžŸแž„แŸ’แž แžถแžšแžนแž˜ แž™แž€ แž‘แŸ… แžฑแŸ’แž™ แž˜แŸ‰แžผแž™ แž“แŸ… แž˜แŸ’แžŠแžปแŸ† แžœแžแŸ’แžŠ แžŸแŸ†แžšแŸ„แž„แžขแžŽแŸ’แžŠแŸ‚แž'}"
93
  ]
94
  },
95
- "execution_count": 32,
96
  "metadata": {},
97
  "output_type": "execute_result"
98
  }
@@ -103,8 +92,8 @@
103
  },
104
  {
105
  "cell_type": "code",
106
- "execution_count": 33,
107
- "id": "93cd7415",
108
  "metadata": {},
109
  "outputs": [],
110
  "source": [
@@ -122,15 +111,15 @@
122
  },
123
  {
124
  "cell_type": "code",
125
- "execution_count": 34,
126
- "id": "04751885",
127
  "metadata": {},
128
  "outputs": [
129
  {
130
  "name": "stderr",
131
  "output_type": "stream",
132
  "text": [
133
- "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-fbad308ab5a03eb2/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-abf3b661c395248b.arrow\n"
134
  ]
135
  }
136
  ],
@@ -140,8 +129,8 @@
140
  },
141
  {
142
  "cell_type": "code",
143
- "execution_count": 35,
144
- "id": "e55d9cc9",
145
  "metadata": {},
146
  "outputs": [],
147
  "source": [
@@ -150,8 +139,48 @@
150
  },
151
  {
152
  "cell_type": "code",
153
- "execution_count": 36,
154
- "id": "4f637d1a",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  "metadata": {},
156
  "outputs": [
157
  {
@@ -170,8 +199,41 @@
170
  },
171
  {
172
  "cell_type": "code",
173
- "execution_count": 37,
174
- "id": "85334ad6",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  "metadata": {},
176
  "outputs": [
177
  {
@@ -179,10 +241,10 @@
179
  "output_type": "stream",
180
  "text": [
181
  "Prediction:\n",
182
- "แž€แŸ’แžšแžปแž„ แž”แŸ‰แŸ„แž™แž”แŸ‰แŸ‚แž แž“แžนแž„ แž€แŸ’แžœแžถแŸ‡ แž‘แžนแž€ แžŸแŸ’แžขแžถแž แž”แŸ’แžšแžพ แž…แžถแž”แŸ‹ แž–แžธ แžŸแž”แŸ’แžแžถแž  แž€แŸ’แžšแŸ„แž™ แž‘แŸ…\n",
183
  "\n",
184
  "Reference:\n",
185
- "แž€แŸ’แžšแžปแž„ แž”แŸ‰แŸ„แž™แž”แŸ‰แŸ‚แž แž“แžนแž„ แžแŸ’แžœแŸ‡ แž‘แžนแž€ แžŸแŸ’แžขแžถแž แž”แŸ’แžšแžพ แž…แžถแž”แŸ‹ แž–แžธ แžŸแž”แŸ’แžแžถแž แŸ แž€แŸ’แžšแŸ„แž™ แž‘แŸ…\n"
186
  ]
187
  }
188
  ],
@@ -199,7 +261,7 @@
199
  {
200
  "cell_type": "code",
201
  "execution_count": null,
202
- "id": "be1c8d79",
203
  "metadata": {},
204
  "outputs": [],
205
  "source": []
@@ -207,7 +269,7 @@
207
  {
208
  "cell_type": "code",
209
  "execution_count": null,
210
- "id": "1f7eaba0",
211
  "metadata": {},
212
  "outputs": [],
213
  "source": []
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "33e4a305",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
10
+ "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor\n",
11
  "from datasets import load_dataset, load_metric, Audio\n",
12
+ "from pyctcdecode import build_ctcdecoder\n",
13
+ "from pydub import AudioSegment\n",
14
+ "from pydub.playback import play\n",
15
+ "\n",
16
  "import numpy as np\n",
17
+ "import torch\n",
18
+ "import kenlm\n",
19
+ "import pandas as pd\n",
20
+ "import random\n",
21
+ "import soundfile as sf"
22
  ]
23
  },
24
  {
25
  "cell_type": "code",
26
+ "execution_count": 2,
27
+ "id": "328d0662",
28
  "metadata": {},
29
  "outputs": [],
30
  "source": [
31
+ "model = AutoModelForCTC.from_pretrained(\".\")\n",
32
  "processor = Wav2Vec2Processor.from_pretrained(\".\")"
33
  ]
34
  },
35
  {
36
  "cell_type": "code",
37
  "execution_count": 28,
38
+ "id": "0fea2518",
39
  "metadata": {},
40
  "outputs": [],
41
  "source": [
 
45
  },
46
  {
47
  "cell_type": "code",
48
+ "execution_count": 3,
49
+ "id": "9cfef23c",
50
  "metadata": {},
51
  "outputs": [
52
  {
53
  "name": "stderr",
54
  "output_type": "stream",
55
  "text": [
56
+ "Using custom data configuration default-36119ec2a15afb82\n",
57
+ "Reusing dataset csv (/workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)\n"
58
  ]
59
  }
60
  ],
61
  "source": [
62
+ "common_voice_test = (load_dataset('csv', data_files='km_kh_male/line_index_test.csv', split = 'train')\n",
 
 
 
 
 
 
 
 
 
 
63
  " .remove_columns([\"Unnamed: 0\", \"drop\"])\n",
64
+ " .rename_column('text', 'sentence')\n",
65
+ " .cast_column(\"path\", Audio(sampling_rate=16_000)).rename_column('path', 'audio'))"
66
  ]
67
  },
68
  {
69
  "cell_type": "code",
70
+ "execution_count": 4,
71
+ "id": "29e6bb1a",
 
 
 
 
 
 
 
 
 
 
72
  "metadata": {},
73
  "outputs": [
74
  {
75
  "data": {
76
  "text/plain": [
77
+ "{'audio': {'path': '/workspace/xls-r-300m-km/km_kh_male/wavs/khm_3154_2555595821.wav',\n",
78
+ " 'array': array([ 0.00014737, 0.00016698, 0.00013704, ..., -0.00011244,\n",
79
+ " -0.0001059 , -0.00011476], dtype=float32),\n",
80
  " 'sampling_rate': 16000},\n",
81
+ " 'sentence': 'ការ ធ្វើ អាជីវកម្ម រ៉ែ ដំបូង នៅ កម្ពុជា'}"
82
  ]
83
  },
84
+ "execution_count": 4,
85
  "metadata": {},
86
  "output_type": "execute_result"
87
  }
 
92
  },
93
  {
94
  "cell_type": "code",
95
+ "execution_count": 5,
96
+ "id": "0554b8d8",
97
  "metadata": {},
98
  "outputs": [],
99
  "source": [
 
111
  },
112
  {
113
  "cell_type": "code",
114
+ "execution_count": 6,
115
+ "id": "d26a6659",
116
  "metadata": {},
117
  "outputs": [
118
  {
119
  "name": "stderr",
120
  "output_type": "stream",
121
  "text": [
122
+ "Loading cached processed dataset at /workspace/.cache/huggingface/datasets/csv/default-36119ec2a15afb82/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-081703c0621182da.arrow\n"
123
  ]
124
  }
125
  ],
 
129
  },
130
  {
131
  "cell_type": "code",
132
+ "execution_count": 9,
133
+ "id": "04a94f74",
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
 
139
  },
140
  {
141
  "cell_type": "code",
142
+ "execution_count": 10,
143
+ "id": "3993d2c4",
144
+ "metadata": {},
145
+ "outputs": [
146
+ {
147
+ "name": "stderr",
148
+ "output_type": "stream",
149
+ "text": [
150
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "input_dict = processor(common_voice_test[i][\"input_values\"], return_tensors=\"pt\", padding=True)"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 11,
161
+ "id": "7e3026dc",
162
+ "metadata": {},
163
+ "outputs": [
164
+ {
165
+ "data": {
166
+ "text/plain": [
167
+ "{'input_values': tensor([[ 2.8537e-04, 2.5043e-04, 2.7738e-04, ..., -4.8949e-05,\n",
168
+ " -1.1382e-04, 2.7166e-04]]), 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1]], dtype=torch.int32)}"
169
+ ]
170
+ },
171
+ "execution_count": 11,
172
+ "metadata": {},
173
+ "output_type": "execute_result"
174
+ }
175
+ ],
176
+ "source": [
177
+ "input_dict"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 12,
183
+ "id": "adf215c0",
184
  "metadata": {},
185
  "outputs": [
186
  {
 
199
  },
200
  {
201
  "cell_type": "code",
202
+ "execution_count": 14,
203
+ "id": "e8310629",
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "data": {
208
+ "text/plain": [
209
+ "tensor([ 1, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
210
+ " 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 10, 70, 70, 70, 10, 72,\n",
211
+ " 43, 72, 72, 72, 72, 72, 72, 0, 0, 72, 72, 18, 72, 54, 72, 72, 72, 72,\n",
212
+ " 72, 0, 72, 21, 72, 49, 72, 72, 72, 72, 72, 72, 23, 70, 70, 27, 72, 46,\n",
213
+ " 72, 72, 72, 1, 72, 0, 0, 30, 72, 72, 72, 72, 25, 70, 70, 72, 72, 11,\n",
214
+ " 55, 72, 72, 72, 72, 5, 72, 0, 20, 58, 72, 72, 72, 0, 0, 16, 72, 72,\n",
215
+ " 72, 20, 70, 70, 72, 72, 16, 70, 27, 72, 72, 72, 72, 72, 45, 0, 0, 30,\n",
216
+ " 30, 70, 70, 27, 72, 43, 72, 72, 72, 72, 72, 72, 21, 72, 53, 72, 72, 72,\n",
217
+ " 27, 72, 0, 1, 72, 72, 72, 72, 25, 70, 23, 23, 48, 72, 72, 72, 72, 72,\n",
218
+ " 72, 8, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
219
+ " 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
220
+ " 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72,\n",
221
+ " 72, 72, 72, 72, 72, 72, 72, 72, 43], device='cuda:0')"
222
+ ]
223
+ },
224
+ "execution_count": 14,
225
+ "metadata": {},
226
+ "output_type": "execute_result"
227
+ }
228
+ ],
229
+ "source": [
230
+ "pred_ids"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 15,
236
+ "id": "5dd986a0",
237
  "metadata": {},
238
  "outputs": [
239
  {
 
241
  "output_type": "stream",
242
  "text": [
243
  "Prediction:\n",
244
+ "កញ្ញា ទេ បូព្រឹក សម្ដែង នៅ តន្ត្រី ស្រាបៀរ កម្ពុជា\n",
245
  "\n",
246
  "Reference:\n",
247
+ "កញ្ញា ទេព បូព្រឹក្ស សម្ដែង នៅ តន្ត្រី ស្រាបៀរ កម្ពុជា\n"
248
  ]
249
  }
250
  ],
 
261
  {
262
  "cell_type": "code",
263
  "execution_count": null,
264
+ "id": "8e39b112",
265
  "metadata": {},
266
  "outputs": [],
267
  "source": []
 
269
  {
270
  "cell_type": "code",
271
  "execution_count": null,
272
+ "id": "562af933",
273
  "metadata": {},
274
  "outputs": [],
275
  "source": []
language_model/attrs.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/km_wiki_ngram.arpa ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4eae7d94d04e95668df7306edf35e21f4bbab2a73c736b921e531cd25cde6d0
3
+ size 109085039
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -4,6 +4,7 @@
4
  "feature_size": 1,
5
  "padding_side": "right",
6
  "padding_value": 0.0,
 
7
  "return_attention_mask": true,
8
  "sampling_rate": 16000
9
  }
 
4
  "feature_size": 1,
5
  "padding_side": "right",
6
  "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2ProcessorWithLM",
8
  "return_attention_mask": true,
9
  "sampling_rate": 16000
10
  }
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "vitouphy/xls-r-300m-km", "processor_class": "Wav2Vec2ProcessorWithLM", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
train_kh.ipynb CHANGED
@@ -3,7 +3,7 @@
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
- "id": "a88514f8",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
@@ -16,7 +16,7 @@
16
  {
17
  "cell_type": "code",
18
  "execution_count": null,
19
- "id": "2d955148",
20
  "metadata": {
21
  "collapsed": true,
22
  "jupyter": {
@@ -19167,7 +19167,7 @@
19167
  },
19168
  {
19169
  "cell_type": "markdown",
19170
- "id": "54b0e493",
19171
  "metadata": {},
19172
  "source": [
19173
  "### Load KH Data"
@@ -19176,7 +19176,7 @@
19176
  {
19177
  "cell_type": "code",
19178
  "execution_count": 6,
19179
- "id": "1f31e61b",
19180
  "metadata": {},
19181
  "outputs": [],
19182
  "source": [
@@ -19199,7 +19199,7 @@
19199
  {
19200
  "cell_type": "code",
19201
  "execution_count": 2,
19202
- "id": "63b2d9b0",
19203
  "metadata": {},
19204
  "outputs": [
19205
  {
@@ -19221,7 +19221,7 @@
19221
  {
19222
  "cell_type": "code",
19223
  "execution_count": 3,
19224
- "id": "dbb54220",
19225
  "metadata": {},
19226
  "outputs": [],
19227
  "source": [
@@ -19235,7 +19235,7 @@
19235
  },
19236
  {
19237
  "cell_type": "markdown",
19238
- "id": "3bb5808a",
19239
  "metadata": {},
19240
  "source": [
19241
  "### Clean Up the Text"
@@ -19244,7 +19244,7 @@
19244
  {
19245
  "cell_type": "code",
19246
  "execution_count": 4,
19247
- "id": "8d407f91",
19248
  "metadata": {},
19249
  "outputs": [],
19250
  "source": [
@@ -19260,7 +19260,7 @@
19260
  {
19261
  "cell_type": "code",
19262
  "execution_count": 5,
19263
- "id": "9fb25eaf",
19264
  "metadata": {},
19265
  "outputs": [],
19266
  "source": [
@@ -19272,7 +19272,7 @@
19272
  {
19273
  "cell_type": "code",
19274
  "execution_count": 6,
19275
- "id": "84c7300e",
19276
  "metadata": {},
19277
  "outputs": [
19278
  {
@@ -19293,7 +19293,7 @@
19293
  },
19294
  {
19295
  "cell_type": "markdown",
19296
- "id": "66dfb9ff",
19297
  "metadata": {},
19298
  "source": [
19299
  "### Build Character"
@@ -19302,7 +19302,7 @@
19302
  {
19303
  "cell_type": "code",
19304
  "execution_count": 7,
19305
- "id": "64329ebd",
19306
  "metadata": {},
19307
  "outputs": [
19308
  {
@@ -19350,7 +19350,7 @@
19350
  {
19351
  "cell_type": "code",
19352
  "execution_count": 8,
19353
- "id": "78297789",
19354
  "metadata": {},
19355
  "outputs": [],
19356
  "source": [
@@ -19361,7 +19361,7 @@
19361
  {
19362
  "cell_type": "code",
19363
  "execution_count": 9,
19364
- "id": "d66aebea",
19365
  "metadata": {},
19366
  "outputs": [
19367
  {
@@ -19379,7 +19379,7 @@
19379
  {
19380
  "cell_type": "code",
19381
  "execution_count": 10,
19382
- "id": "7c085935",
19383
  "metadata": {},
19384
  "outputs": [
19385
  {
@@ -19406,7 +19406,7 @@
19406
  {
19407
  "cell_type": "code",
19408
  "execution_count": 11,
19409
- "id": "fba33316",
19410
  "metadata": {},
19411
  "outputs": [
19412
  {
@@ -19424,7 +19424,7 @@
19424
  {
19425
  "cell_type": "code",
19426
  "execution_count": 12,
19427
- "id": "5376a5b4",
19428
  "metadata": {},
19429
  "outputs": [],
19430
  "source": [
@@ -19435,7 +19435,7 @@
19435
  },
19436
  {
19437
  "cell_type": "markdown",
19438
- "id": "aec637e0",
19439
  "metadata": {},
19440
  "source": [
19441
  "# Tokenizer"
@@ -19444,7 +19444,7 @@
19444
  {
19445
  "cell_type": "code",
19446
  "execution_count": 13,
19447
- "id": "781094bc",
19448
  "metadata": {},
19449
  "outputs": [],
19450
  "source": [
@@ -19456,7 +19456,7 @@
19456
  {
19457
  "cell_type": "code",
19458
  "execution_count": 14,
19459
- "id": "3a3eb52f",
19460
  "metadata": {},
19461
  "outputs": [],
19462
  "source": [
@@ -19468,7 +19468,7 @@
19468
  {
19469
  "cell_type": "code",
19470
  "execution_count": 26,
19471
- "id": "2711ed79",
19472
  "metadata": {},
19473
  "outputs": [],
19474
  "source": [
@@ -19485,7 +19485,7 @@
19485
  {
19486
  "cell_type": "code",
19487
  "execution_count": 27,
19488
- "id": "2772b591",
19489
  "metadata": {},
19490
  "outputs": [
19491
  {
@@ -19525,7 +19525,7 @@
19525
  {
19526
  "cell_type": "code",
19527
  "execution_count": 15,
19528
- "id": "db2af48f",
19529
  "metadata": {},
19530
  "outputs": [],
19531
  "source": [
@@ -19536,7 +19536,7 @@
19536
  {
19537
  "cell_type": "code",
19538
  "execution_count": 16,
19539
- "id": "b7f42c6a",
19540
  "metadata": {},
19541
  "outputs": [
19542
  {
@@ -19561,7 +19561,7 @@
19561
  {
19562
  "cell_type": "code",
19563
  "execution_count": 17,
19564
- "id": "42b525d0",
19565
  "metadata": {},
19566
  "outputs": [
19567
  {
@@ -19608,7 +19608,7 @@
19608
  {
19609
  "cell_type": "code",
19610
  "execution_count": 18,
19611
- "id": "1db1a77c",
19612
  "metadata": {},
19613
  "outputs": [],
19614
  "source": [
@@ -19630,7 +19630,7 @@
19630
  {
19631
  "cell_type": "code",
19632
  "execution_count": 19,
19633
- "id": "b0a33568",
19634
  "metadata": {
19635
  "collapsed": true,
19636
  "jupyter": {
@@ -19669,7 +19669,7 @@
19669
  {
19670
  "cell_type": "code",
19671
  "execution_count": 20,
19672
- "id": "ca8be265",
19673
  "metadata": {},
19674
  "outputs": [],
19675
  "source": [
@@ -19681,7 +19681,7 @@
19681
  {
19682
  "cell_type": "code",
19683
  "execution_count": 21,
19684
- "id": "53a815bf",
19685
  "metadata": {},
19686
  "outputs": [],
19687
  "source": [
@@ -19741,7 +19741,7 @@
19741
  {
19742
  "cell_type": "code",
19743
  "execution_count": 22,
19744
- "id": "1d0cbdf6",
19745
  "metadata": {},
19746
  "outputs": [],
19747
  "source": [
@@ -19751,7 +19751,7 @@
19751
  {
19752
  "cell_type": "code",
19753
  "execution_count": 23,
19754
- "id": "e26e68a2",
19755
  "metadata": {},
19756
  "outputs": [],
19757
  "source": [
@@ -19762,7 +19762,7 @@
19762
  {
19763
  "cell_type": "code",
19764
  "execution_count": 24,
19765
- "id": "f347bb3e",
19766
  "metadata": {},
19767
  "outputs": [],
19768
  "source": [
@@ -19783,7 +19783,7 @@
19783
  {
19784
  "cell_type": "code",
19785
  "execution_count": 25,
19786
- "id": "aff51ef4",
19787
  "metadata": {},
19788
  "outputs": [
19789
  {
@@ -19819,7 +19819,7 @@
19819
  {
19820
  "cell_type": "code",
19821
  "execution_count": 26,
19822
- "id": "6e363fc8",
19823
  "metadata": {},
19824
  "outputs": [],
19825
  "source": [
@@ -19829,7 +19829,7 @@
19829
  {
19830
  "cell_type": "code",
19831
  "execution_count": 27,
19832
- "id": "447dfc3e",
19833
  "metadata": {},
19834
  "outputs": [],
19835
  "source": [
@@ -19857,7 +19857,7 @@
19857
  {
19858
  "cell_type": "code",
19859
  "execution_count": 29,
19860
- "id": "eeda7f6e",
19861
  "metadata": {},
19862
  "outputs": [
19863
  {
@@ -19885,7 +19885,7 @@
19885
  {
19886
  "cell_type": "code",
19887
  "execution_count": 30,
19888
- "id": "af09f9f9",
19889
  "metadata": {},
19890
  "outputs": [
19891
  {
@@ -20232,7 +20232,7 @@
20232
  {
20233
  "cell_type": "code",
20234
  "execution_count": 31,
20235
- "id": "e9563734",
20236
  "metadata": {},
20237
  "outputs": [
20238
  {
@@ -20253,7 +20253,7 @@
20253
  {
20254
  "cell_type": "code",
20255
  "execution_count": 32,
20256
- "id": "4c8fe67e",
20257
  "metadata": {},
20258
  "outputs": [
20259
  {
@@ -20286,7 +20286,7 @@
20286
  {
20287
  "cell_type": "code",
20288
  "execution_count": 34,
20289
- "id": "dc64c376",
20290
  "metadata": {},
20291
  "outputs": [],
20292
  "source": [
@@ -20303,7 +20303,7 @@
20303
  {
20304
  "cell_type": "code",
20305
  "execution_count": 35,
20306
- "id": "9f9d87c3",
20307
  "metadata": {},
20308
  "outputs": [
20309
  {
@@ -20322,7 +20322,7 @@
20322
  {
20323
  "cell_type": "code",
20324
  "execution_count": 36,
20325
- "id": "4b50cbfe",
20326
  "metadata": {},
20327
  "outputs": [
20328
  {
@@ -20373,8 +20373,8 @@
20373
  },
20374
  {
20375
  "cell_type": "code",
20376
- "execution_count": 37,
20377
- "id": "33a99751",
20378
  "metadata": {},
20379
  "outputs": [
20380
  {
@@ -20395,7 +20395,7 @@
20395
  {
20396
  "cell_type": "code",
20397
  "execution_count": null,
20398
- "id": "b9482eed",
20399
  "metadata": {},
20400
  "outputs": [],
20401
  "source": []
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
+ "id": "0ee7433e",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
 
16
  {
17
  "cell_type": "code",
18
  "execution_count": null,
19
+ "id": "90323ec7",
20
  "metadata": {
21
  "collapsed": true,
22
  "jupyter": {
 
19167
  },
19168
  {
19169
  "cell_type": "markdown",
19170
+ "id": "eda834f4",
19171
  "metadata": {},
19172
  "source": [
19173
  "### Load KH Data"
 
19176
  {
19177
  "cell_type": "code",
19178
  "execution_count": 6,
19179
+ "id": "e8b86dab",
19180
  "metadata": {},
19181
  "outputs": [],
19182
  "source": [
 
19199
  {
19200
  "cell_type": "code",
19201
  "execution_count": 2,
19202
+ "id": "0b17a0e1",
19203
  "metadata": {},
19204
  "outputs": [
19205
  {
 
19221
  {
19222
  "cell_type": "code",
19223
  "execution_count": 3,
19224
+ "id": "21239531",
19225
  "metadata": {},
19226
  "outputs": [],
19227
  "source": [
 
19235
  },
19236
  {
19237
  "cell_type": "markdown",
19238
+ "id": "68736f61",
19239
  "metadata": {},
19240
  "source": [
19241
  "### Clean Up the Text"
 
19244
  {
19245
  "cell_type": "code",
19246
  "execution_count": 4,
19247
+ "id": "fcba882e",
19248
  "metadata": {},
19249
  "outputs": [],
19250
  "source": [
 
19260
  {
19261
  "cell_type": "code",
19262
  "execution_count": 5,
19263
+ "id": "9ef37613",
19264
  "metadata": {},
19265
  "outputs": [],
19266
  "source": [
 
19272
  {
19273
  "cell_type": "code",
19274
  "execution_count": 6,
19275
+ "id": "8e4fdc71",
19276
  "metadata": {},
19277
  "outputs": [
19278
  {
 
19293
  },
19294
  {
19295
  "cell_type": "markdown",
19296
+ "id": "1fcdf7d8",
19297
  "metadata": {},
19298
  "source": [
19299
  "### Build Character"
 
19302
  {
19303
  "cell_type": "code",
19304
  "execution_count": 7,
19305
+ "id": "7b7da87a",
19306
  "metadata": {},
19307
  "outputs": [
19308
  {
 
19350
  {
19351
  "cell_type": "code",
19352
  "execution_count": 8,
19353
+ "id": "eb6f0804",
19354
  "metadata": {},
19355
  "outputs": [],
19356
  "source": [
 
19361
  {
19362
  "cell_type": "code",
19363
  "execution_count": 9,
19364
+ "id": "9189ac57",
19365
  "metadata": {},
19366
  "outputs": [
19367
  {
 
19379
  {
19380
  "cell_type": "code",
19381
  "execution_count": 10,
19382
+ "id": "c5fb8a71",
19383
  "metadata": {},
19384
  "outputs": [
19385
  {
 
19406
  {
19407
  "cell_type": "code",
19408
  "execution_count": 11,
19409
+ "id": "10043978",
19410
  "metadata": {},
19411
  "outputs": [
19412
  {
 
19424
  {
19425
  "cell_type": "code",
19426
  "execution_count": 12,
19427
+ "id": "42f02a78",
19428
  "metadata": {},
19429
  "outputs": [],
19430
  "source": [
 
19435
  },
19436
  {
19437
  "cell_type": "markdown",
19438
+ "id": "95b09010",
19439
  "metadata": {},
19440
  "source": [
19441
  "# Tokenizer"
 
19444
  {
19445
  "cell_type": "code",
19446
  "execution_count": 13,
19447
+ "id": "c4d0f5a6",
19448
  "metadata": {},
19449
  "outputs": [],
19450
  "source": [
 
19456
  {
19457
  "cell_type": "code",
19458
  "execution_count": 14,
19459
+ "id": "825623c4",
19460
  "metadata": {},
19461
  "outputs": [],
19462
  "source": [
 
19468
  {
19469
  "cell_type": "code",
19470
  "execution_count": 26,
19471
+ "id": "cfb44de0",
19472
  "metadata": {},
19473
  "outputs": [],
19474
  "source": [
 
19485
  {
19486
  "cell_type": "code",
19487
  "execution_count": 27,
19488
+ "id": "05ab24c0",
19489
  "metadata": {},
19490
  "outputs": [
19491
  {
 
19525
  {
19526
  "cell_type": "code",
19527
  "execution_count": 15,
19528
+ "id": "0cfd158b",
19529
  "metadata": {},
19530
  "outputs": [],
19531
  "source": [
 
19536
  {
19537
  "cell_type": "code",
19538
  "execution_count": 16,
19539
+ "id": "10d224fa",
19540
  "metadata": {},
19541
  "outputs": [
19542
  {
 
19561
  {
19562
  "cell_type": "code",
19563
  "execution_count": 17,
19564
+ "id": "132efaa8",
19565
  "metadata": {},
19566
  "outputs": [
19567
  {
 
19608
  {
19609
  "cell_type": "code",
19610
  "execution_count": 18,
19611
+ "id": "c39872d6",
19612
  "metadata": {},
19613
  "outputs": [],
19614
  "source": [
 
19630
  {
19631
  "cell_type": "code",
19632
  "execution_count": 19,
19633
+ "id": "fef54a48",
19634
  "metadata": {
19635
  "collapsed": true,
19636
  "jupyter": {
 
19669
  {
19670
  "cell_type": "code",
19671
  "execution_count": 20,
19672
+ "id": "2f280b0d",
19673
  "metadata": {},
19674
  "outputs": [],
19675
  "source": [
 
19681
  {
19682
  "cell_type": "code",
19683
  "execution_count": 21,
19684
+ "id": "c9dec52e",
19685
  "metadata": {},
19686
  "outputs": [],
19687
  "source": [
 
19741
  {
19742
  "cell_type": "code",
19743
  "execution_count": 22,
19744
+ "id": "639dcc23",
19745
  "metadata": {},
19746
  "outputs": [],
19747
  "source": [
 
19751
  {
19752
  "cell_type": "code",
19753
  "execution_count": 23,
19754
+ "id": "3bb04288",
19755
  "metadata": {},
19756
  "outputs": [],
19757
  "source": [
 
19762
  {
19763
  "cell_type": "code",
19764
  "execution_count": 24,
19765
+ "id": "9ba8858b",
19766
  "metadata": {},
19767
  "outputs": [],
19768
  "source": [
 
19783
  {
19784
  "cell_type": "code",
19785
  "execution_count": 25,
19786
+ "id": "434869f9",
19787
  "metadata": {},
19788
  "outputs": [
19789
  {
 
19819
  {
19820
  "cell_type": "code",
19821
  "execution_count": 26,
19822
+ "id": "9ffb97fd",
19823
  "metadata": {},
19824
  "outputs": [],
19825
  "source": [
 
19829
  {
19830
  "cell_type": "code",
19831
  "execution_count": 27,
19832
+ "id": "c83b8d4e",
19833
  "metadata": {},
19834
  "outputs": [],
19835
  "source": [
 
19857
  {
19858
  "cell_type": "code",
19859
  "execution_count": 29,
19860
+ "id": "7352a29a",
19861
  "metadata": {},
19862
  "outputs": [
19863
  {
 
19885
  {
19886
  "cell_type": "code",
19887
  "execution_count": 30,
19888
+ "id": "5a73ff08",
19889
  "metadata": {},
19890
  "outputs": [
19891
  {
 
20232
  {
20233
  "cell_type": "code",
20234
  "execution_count": 31,
20235
+ "id": "967962d1",
20236
  "metadata": {},
20237
  "outputs": [
20238
  {
 
20253
  {
20254
  "cell_type": "code",
20255
  "execution_count": 32,
20256
+ "id": "da40a75c",
20257
  "metadata": {},
20258
  "outputs": [
20259
  {
 
20286
  {
20287
  "cell_type": "code",
20288
  "execution_count": 34,
20289
+ "id": "24166e72",
20290
  "metadata": {},
20291
  "outputs": [],
20292
  "source": [
 
20303
  {
20304
  "cell_type": "code",
20305
  "execution_count": 35,
20306
+ "id": "95d69b2e",
20307
  "metadata": {},
20308
  "outputs": [
20309
  {
 
20322
  {
20323
  "cell_type": "code",
20324
  "execution_count": 36,
20325
+ "id": "d60a731d",
20326
  "metadata": {},
20327
  "outputs": [
20328
  {
 
20373
  },
20374
  {
20375
  "cell_type": "code",
20376
+ "execution_count": 38,
20377
+ "id": "beca9a8c",
20378
  "metadata": {},
20379
  "outputs": [
20380
  {
 
20395
  {
20396
  "cell_type": "code",
20397
  "execution_count": null,
20398
+ "id": "20063dbc",
20399
  "metadata": {},
20400
  "outputs": [],
20401
  "source": []
train_kh_lm.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json CHANGED
@@ -1 +1 @@
1
- {"\u1780": 1, "\u1781": 2, "\u1782": 3, "\u1783": 4, "\u1784": 5, "\u1785": 6, "\u1786": 7, "\u1787": 8, "\u1788": 9, "\u1789": 10, "\u178a": 11, "\u178b": 12, "\u178c": 13, "\u178d": 14, "\u178e": 15, "\u178f": 16, "\u1790": 17, "\u1791": 18, "\u1792": 19, "\u1793": 20, "\u1794": 21, "\u1795": 22, "\u1796": 23, "\u1797": 24, "\u1798": 25, "\u1799": 26, "\u179a": 27, "\u179b": 28, "\u179c": 29, "\u179f": 30, "\u17a0": 31, "\u17a1": 32, "\u17a2": 33, "\u17a5": 34, "\u17a7": 35, "\u17aa": 36, "\u17ab": 37, "\u17ac": 38, "\u17ad": 39, "\u17ae": 40, "\u17af": 41, "\u17b1": 42, "\u17b6": 43, "\u17b7": 44, "\u17b8": 45, "\u17b9": 46, "\u17ba": 47, "\u17bb": 48, "\u17bc": 49, "\u17bd": 50, "\u17be": 51, "\u17bf": 52, "\u17c0": 53, "\u17c1": 54, "\u17c2": 55, "\u17c3": 56, "\u17c4": 57, "\u17c5": 58, "\u17c6": 59, "\u17c7": 60, "\u17c8": 61, "\u17c9": 62, "\u17ca": 63, "\u17cb": 64, "\u17cc": 65, "\u17cd": 66, "\u17ce": 67, "\u17cf": 68, "\u17d0": 69, "\u17d2": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}
 
1
+ {"ក": 1, "ខ": 2, "គ": 3, "ឃ": 4, "ង": 5, "ច": 6, "ឆ": 7, "ជ": 8, "ឈ": 9, "ញ": 10, "ដ": 11, "ឋ": 12, "ឌ": 13, "ឍ": 14, "ណ": 15, "ត": 16, "ថ": 17, "ទ": 18, "ធ": 19, "ន": 20, "ប": 21, "ផ": 22, "ព": 23, "ភ": 24, "ម": 25, "យ": 26, "រ": 27, "ល": 28, "វ": 29, "ស": 30, "ហ": 31, "ឡ": 32, "អ": 33, "ឥ": 34, "ឧ": 35, "ឪ": 36, "ឫ": 37, "ឬ": 38, "ឭ": 39, "ឮ": 40, "ឯ": 41, "ឱ": 42, "ា": 43, "ិ": 44, "ី": 45, "ឹ": 46, "ឺ": 47, "ុ": 48, "ូ": 49, "ួ": 50, "ើ": 51, "ឿ": 52, "ៀ": 53, "េ": 54, "ែ": 55, "ៃ": 56, "ោ": 57, "ៅ": 58, "ំ": 59, "ះ": 60, "ៈ": 61, "៉": 62, "៊": 63, "់": 64, "៌": 65, "៍": 66, "៎": 67, "៏": 68, "័": 69, "្": 70, "|": 0, "[UNK]": 71, "[PAD]": 72}