Upload lm-boosted decoder
Browse files- .gitattributes +1 -0
- .gitignore +2 -1
- Add_LM_to_model.ipynb +302 -0
- alphabet.json +1 -0
- language_model/3gram.bin +3 -0
- language_model/attrs.json +1 -0
- language_model/unigrams.txt +3 -0
- preprocessor_config.json +1 -0
- special_tokens_map.json +1 -1
- tokenizer_config.json +1 -1
.gitattributes
CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
checkpoint-*/
|
|
|
|
1 |
+
checkpoint-*/
|
2 |
+
.ipynb_checkpoints*/
|
Add_LM_to_model.ipynb
ADDED
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "d23f1f27-fbf4-4fe5-a7b4-17815b23f283",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"from transformers import AutoProcessor"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 2,
|
16 |
+
"id": "cdefcb5e-0824-49ef-be73-8788cbb4e2a9",
|
17 |
+
"metadata": {},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"processor = AutoProcessor.from_pretrained(\"chmanoj/xls-r-300m-te\")"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 3,
|
26 |
+
"id": "ef78538d-ca83-4cd3-824d-1b7928f5bc4e",
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [],
|
29 |
+
"source": [
|
30 |
+
"vocab_dict = processor.tokenizer.get_vocab()\n",
|
31 |
+
"sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}"
|
32 |
+
]
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"cell_type": "code",
|
36 |
+
"execution_count": 4,
|
37 |
+
"id": "cd355539-6dfb-4978-82a3-905c0236c6c3",
|
38 |
+
"metadata": {},
|
39 |
+
"outputs": [],
|
40 |
+
"source": [
|
41 |
+
"from pyctcdecode import build_ctcdecoder"
|
42 |
+
]
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"cell_type": "code",
|
46 |
+
"execution_count": 9,
|
47 |
+
"id": "34429a23-a3e5-40ca-be4e-186bf12e1ff4",
|
48 |
+
"metadata": {},
|
49 |
+
"outputs": [],
|
50 |
+
"source": [
|
51 |
+
"# !which python\n",
|
52 |
+
"\n",
|
53 |
+
"# !pip install https://github.com/kpu/kenlm/archive/master.zip"
|
54 |
+
]
|
55 |
+
},
|
56 |
+
{
|
57 |
+
"cell_type": "code",
|
58 |
+
"execution_count": 5,
|
59 |
+
"id": "21f4fb99-1c19-4a0a-9ac0-90dd38645585",
|
60 |
+
"metadata": {},
|
61 |
+
"outputs": [
|
62 |
+
{
|
63 |
+
"name": "stderr",
|
64 |
+
"output_type": "stream",
|
65 |
+
"text": [
|
66 |
+
"Loading the LM will be faster if you build a binary file.\n",
|
67 |
+
"Reading /mnt/c/Projects/Speech/xls-R-finetuning/xls-r-300m-te/3gram_correct.arpa\n",
|
68 |
+
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
69 |
+
"****************************************************************************************************\n",
|
70 |
+
"Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
|
71 |
+
"Unigrams and labels don't seem to agree.\n"
|
72 |
+
]
|
73 |
+
}
|
74 |
+
],
|
75 |
+
"source": [
|
76 |
+
"decoder = build_ctcdecoder(\n",
|
77 |
+
" labels=list(sorted_vocab_dict.keys()),\n",
|
78 |
+
" kenlm_model_path=\"3gram_correct.arpa\",\n",
|
79 |
+
")"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": 6,
|
85 |
+
"id": "f892aada-710c-4bc2-a11f-c9a35c00870a",
|
86 |
+
"metadata": {},
|
87 |
+
"outputs": [],
|
88 |
+
"source": [
|
89 |
+
"from transformers import Wav2Vec2ProcessorWithLM\n",
|
90 |
+
"\n",
|
91 |
+
"processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
|
92 |
+
" feature_extractor=processor.feature_extractor,\n",
|
93 |
+
" tokenizer=processor.tokenizer,\n",
|
94 |
+
" decoder=decoder\n",
|
95 |
+
")"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": 7,
|
101 |
+
"id": "5e29f7f7-e116-4c65-9c14-ae7e871390bb",
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [
|
104 |
+
{
|
105 |
+
"data": {
|
106 |
+
"text/plain": [
|
107 |
+
"'/mnt/c/Projects/Speech/xls-R-finetuning/xls-r-300m-te'"
|
108 |
+
]
|
109 |
+
},
|
110 |
+
"execution_count": 7,
|
111 |
+
"metadata": {},
|
112 |
+
"output_type": "execute_result"
|
113 |
+
}
|
114 |
+
],
|
115 |
+
"source": [
|
116 |
+
"import os\n",
|
117 |
+
"os.getcwd()"
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "code",
|
122 |
+
"execution_count": 8,
|
123 |
+
"id": "6f5775eb-aece-41fc-a1eb-8bf6f9b8f429",
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [],
|
126 |
+
"source": [
|
127 |
+
"processor_with_lm.save_pretrained(os.getcwd())"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "code",
|
132 |
+
"execution_count": null,
|
133 |
+
"id": "0e7e4d6f-01d0-4a24-9980-a6583fb6d048",
|
134 |
+
"metadata": {},
|
135 |
+
"outputs": [],
|
136 |
+
"source": []
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 10,
|
141 |
+
"id": "c5ea011b-9412-484a-b798-15fb6e338a99",
|
142 |
+
"metadata": {},
|
143 |
+
"outputs": [
|
144 |
+
{
|
145 |
+
"name": "stdout",
|
146 |
+
"output_type": "stream",
|
147 |
+
"text": [
|
148 |
+
"Reading language_model/3gram_correct.arpa\n",
|
149 |
+
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
150 |
+
"****************************************************************************************************\n",
|
151 |
+
"SUCCESS\n"
|
152 |
+
]
|
153 |
+
}
|
154 |
+
],
|
155 |
+
"source": [
|
156 |
+
"!../kenlm/build/bin/build_binary language_model/3gram_correct.arpa language_model/3gram.bin"
|
157 |
+
]
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"cell_type": "code",
|
161 |
+
"execution_count": null,
|
162 |
+
"id": "70c2709b-0b5c-440f-ae9f-11f8045e8fed",
|
163 |
+
"metadata": {},
|
164 |
+
"outputs": [],
|
165 |
+
"source": []
|
166 |
+
},
|
167 |
+
{
|
168 |
+
"cell_type": "code",
|
169 |
+
"execution_count": 11,
|
170 |
+
"id": "c5db962f-15f1-4b65-87e3-81e1af14e32e",
|
171 |
+
"metadata": {},
|
172 |
+
"outputs": [],
|
173 |
+
"source": [
|
174 |
+
"from huggingface_hub import Repository"
|
175 |
+
]
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"cell_type": "code",
|
179 |
+
"execution_count": 12,
|
180 |
+
"id": "d3801f28-cdb5-40cd-b1b9-5a00f8f24720",
|
181 |
+
"metadata": {},
|
182 |
+
"outputs": [],
|
183 |
+
"source": [
|
184 |
+
"repo = Repository(local_dir=\".\")"
|
185 |
+
]
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"cell_type": "code",
|
189 |
+
"execution_count": null,
|
190 |
+
"id": "c6421313-5d36-45ce-8300-3988985e7239",
|
191 |
+
"metadata": {},
|
192 |
+
"outputs": [],
|
193 |
+
"source": []
|
194 |
+
},
|
195 |
+
{
|
196 |
+
"cell_type": "code",
|
197 |
+
"execution_count": 14,
|
198 |
+
"id": "7dcfe5d2-063f-4b34-9fdd-5f025ef9f699",
|
199 |
+
"metadata": {},
|
200 |
+
"outputs": [
|
201 |
+
{
|
202 |
+
"name": "stderr",
|
203 |
+
"output_type": "stream",
|
204 |
+
"text": [
|
205 |
+
"Several commits (2) will be pushed upstream.\n",
|
206 |
+
"The progress bars may be unreliable.\n"
|
207 |
+
]
|
208 |
+
},
|
209 |
+
{
|
210 |
+
"data": {
|
211 |
+
"application/vnd.jupyter.widget-view+json": {
|
212 |
+
"model_id": "d17d7664ff97403f9f428264855729c2",
|
213 |
+
"version_major": 2,
|
214 |
+
"version_minor": 0
|
215 |
+
},
|
216 |
+
"text/plain": [
|
217 |
+
"Upload file language_model/3gram_correct.arpa: 0%| | 32.0k/2.59G [00:00<?, ?B/s]"
|
218 |
+
]
|
219 |
+
},
|
220 |
+
"metadata": {},
|
221 |
+
"output_type": "display_data"
|
222 |
+
},
|
223 |
+
{
|
224 |
+
"data": {
|
225 |
+
"application/vnd.jupyter.widget-view+json": {
|
226 |
+
"model_id": "479389ca92884367bdda025024eaa38d",
|
227 |
+
"version_major": 2,
|
228 |
+
"version_minor": 0
|
229 |
+
},
|
230 |
+
"text/plain": [
|
231 |
+
"Upload file language_model/3gram.bin: 0%| | 32.0k/771M [00:00<?, ?B/s]"
|
232 |
+
]
|
233 |
+
},
|
234 |
+
"metadata": {},
|
235 |
+
"output_type": "display_data"
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"data": {
|
239 |
+
"application/vnd.jupyter.widget-view+json": {
|
240 |
+
"model_id": "2c975b43bb9040c2b29653eede9add4c",
|
241 |
+
"version_major": 2,
|
242 |
+
"version_minor": 0
|
243 |
+
},
|
244 |
+
"text/plain": [
|
245 |
+
"Upload file language_model/unigrams.txt: 0%| | 32.0k/39.0M [00:00<?, ?B/s]"
|
246 |
+
]
|
247 |
+
},
|
248 |
+
"metadata": {},
|
249 |
+
"output_type": "display_data"
|
250 |
+
},
|
251 |
+
{
|
252 |
+
"ename": "KeyboardInterrupt",
|
253 |
+
"evalue": "",
|
254 |
+
"output_type": "error",
|
255 |
+
"traceback": [
|
256 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
257 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
258 |
+
"\u001b[0;32m/tmp/ipykernel_7629/1986395493.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrepo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpush_to_hub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommit_message\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Upload lm-boosted decoder\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
259 |
+
"\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mpush_to_hub\u001b[0;34m(self, commit_message, blocking, clean_ok, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1233\u001b[0m \u001b[0mupstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mf\"origin {self.current_branch}\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1234\u001b[0m \u001b[0mblocking\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mblocking\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1235\u001b[0;31m \u001b[0mauto_lfs_prune\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mauto_lfs_prune\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1236\u001b[0m )\n\u001b[1;32m 1237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
260 |
+
"\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mgit_push\u001b[0;34m(self, upstream, blocking, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 989\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 990\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mblocking\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 991\u001b[0;31m \u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 992\u001b[0m \u001b[0mreturn_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpoll\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 993\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkill\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
261 |
+
"\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36mcommunicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 962\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 963\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 964\u001b[0;31m \u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_communicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendtime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 965\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;31m# https://bugs.python.org/issue25942\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
262 |
+
"\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m_communicate\u001b[0;34m(self, input, endtime, orig_timeout)\u001b[0m\n\u001b[1;32m 1713\u001b[0m 'failed to raise TimeoutExpired.')\n\u001b[1;32m 1714\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1715\u001b[0;31m \u001b[0mready\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mselector\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1716\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_timeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mendtime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morig_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1717\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
263 |
+
"\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/selectors.py\u001b[0m in \u001b[0;36mselect\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0mready\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 415\u001b[0;31m \u001b[0mfd_event_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_selector\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpoll\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 416\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mInterruptedError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 417\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mready\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
264 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
265 |
+
]
|
266 |
+
}
|
267 |
+
],
|
268 |
+
"source": [
|
269 |
+
"repo.push_to_hub(commit_message=\"Upload lm-boosted decoder\")"
|
270 |
+
]
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"cell_type": "code",
|
274 |
+
"execution_count": null,
|
275 |
+
"id": "a505c088-5f40-4d9a-8d75-263a07cc93a5",
|
276 |
+
"metadata": {},
|
277 |
+
"outputs": [],
|
278 |
+
"source": []
|
279 |
+
}
|
280 |
+
],
|
281 |
+
"metadata": {
|
282 |
+
"kernelspec": {
|
283 |
+
"display_name": "Python 3 (ipykernel)",
|
284 |
+
"language": "python",
|
285 |
+
"name": "python3"
|
286 |
+
},
|
287 |
+
"language_info": {
|
288 |
+
"codemirror_mode": {
|
289 |
+
"name": "ipython",
|
290 |
+
"version": 3
|
291 |
+
},
|
292 |
+
"file_extension": ".py",
|
293 |
+
"mimetype": "text/x-python",
|
294 |
+
"name": "python",
|
295 |
+
"nbconvert_exporter": "python",
|
296 |
+
"pygments_lexer": "ipython3",
|
297 |
+
"version": "3.7.10"
|
298 |
+
}
|
299 |
+
},
|
300 |
+
"nbformat": 4,
|
301 |
+
"nbformat_minor": 5
|
302 |
+
}
|
alphabet.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"labels": [" ", "\\", "_", "e", "g", "l", "n", "p", "r", "s", "t", "\u0c01", "\u0c02", "\u0c03", "\u0c05", "\u0c06", "\u0c07", "\u0c08", "\u0c09", "\u0c0a", "\u0c0b", "\u0c0e", "\u0c0f", "\u0c10", "\u0c12", "\u0c13", "\u0c14", "\u0c15", "\u0c16", "\u0c17", "\u0c18", "\u0c19", "\u0c1a", "\u0c1b", "\u0c1c", "\u0c1e", "\u0c1f", "\u0c20", "\u0c21", "\u0c22", "\u0c23", "\u0c24", "\u0c25", "\u0c26", "\u0c27", "\u0c28", "\u0c2a", "\u0c2b", "\u0c2c", "\u0c2d", "\u0c2e", "\u0c2f", "\u0c30", "\u0c31", "\u0c32", "\u0c33", "\u0c35", "\u0c36", "\u0c37", "\u0c38", "\u0c39", "\u0c3e", "\u0c3f", "\u0c40", "\u0c41", "\u0c42", "\u0c43", "\u0c46", "\u0c47", "\u0c48", "\u0c4a", "\u0c4b", "\u0c4c", "\u0c4d", "\u200c", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
|
language_model/3gram.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e25016b514710dfc9e6dbcb35012f1b08a21302642e6efe56c14b107f2e412d4
|
3 |
+
size 808445046
|
language_model/attrs.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
|
language_model/unigrams.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11a16ee59701a2b8769eb92a856b8fc786fda333dd16923f0d51c6f887093735
|
3 |
+
size 40926732
|
preprocessor_config.json
CHANGED
@@ -4,6 +4,7 @@
|
|
4 |
"feature_size": 1,
|
5 |
"padding_side": "right",
|
6 |
"padding_value": 0,
|
|
|
7 |
"return_attention_mask": true,
|
8 |
"sampling_rate": 16000
|
9 |
}
|
|
|
4 |
"feature_size": 1,
|
5 |
"padding_side": "right",
|
6 |
"padding_value": 0,
|
7 |
+
"processor_class": "Wav2Vec2ProcessorWithLM",
|
8 |
"return_attention_mask": true,
|
9 |
"sampling_rate": 16000
|
10 |
}
|
special_tokens_map.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
|
tokenizer_config.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "
|
|
|
1 |
+
{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "chmanoj/xls-r-300m-te", "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorWithLM"}
|