chmanoj commited on
Commit
dbca3b5
1 Parent(s): aa77a85

Upload lm-boosted decoder

Browse files
.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
  *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ language_model/unigrams.txt filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1 +1,2 @@
1
- checkpoint-*/
 
 
1
+ checkpoint-*/
2
+ .ipynb_checkpoints*/
Add_LM_to_model.ipynb ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "d23f1f27-fbf4-4fe5-a7b4-17815b23f283",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from transformers import AutoProcessor"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "cdefcb5e-0824-49ef-be73-8788cbb4e2a9",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "processor = AutoProcessor.from_pretrained(\"chmanoj/xls-r-300m-te\")"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 3,
26
+ "id": "ef78538d-ca83-4cd3-824d-1b7928f5bc4e",
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "vocab_dict = processor.tokenizer.get_vocab()\n",
31
+ "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 4,
37
+ "id": "cd355539-6dfb-4978-82a3-905c0236c6c3",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "from pyctcdecode import build_ctcdecoder"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 9,
47
+ "id": "34429a23-a3e5-40ca-be4e-186bf12e1ff4",
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "# !which python\n",
52
+ "\n",
53
+ "# !pip install https://github.com/kpu/kenlm/archive/master.zip"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 5,
59
+ "id": "21f4fb99-1c19-4a0a-9ac0-90dd38645585",
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stderr",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "Loading the LM will be faster if you build a binary file.\n",
67
+ "Reading /mnt/c/Projects/Speech/xls-R-finetuning/xls-r-300m-te/3gram_correct.arpa\n",
68
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
69
+ "****************************************************************************************************\n",
70
+ "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
71
+ "Unigrams and labels don't seem to agree.\n"
72
+ ]
73
+ }
74
+ ],
75
+ "source": [
76
+ "decoder = build_ctcdecoder(\n",
77
+ " labels=list(sorted_vocab_dict.keys()),\n",
78
+ " kenlm_model_path=\"3gram_correct.arpa\",\n",
79
+ ")"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 6,
85
+ "id": "f892aada-710c-4bc2-a11f-c9a35c00870a",
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "from transformers import Wav2Vec2ProcessorWithLM\n",
90
+ "\n",
91
+ "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
92
+ " feature_extractor=processor.feature_extractor,\n",
93
+ " tokenizer=processor.tokenizer,\n",
94
+ " decoder=decoder\n",
95
+ ")"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 7,
101
+ "id": "5e29f7f7-e116-4c65-9c14-ae7e871390bb",
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "data": {
106
+ "text/plain": [
107
+ "'/mnt/c/Projects/Speech/xls-R-finetuning/xls-r-300m-te'"
108
+ ]
109
+ },
110
+ "execution_count": 7,
111
+ "metadata": {},
112
+ "output_type": "execute_result"
113
+ }
114
+ ],
115
+ "source": [
116
+ "import os\n",
117
+ "os.getcwd()"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 8,
123
+ "id": "6f5775eb-aece-41fc-a1eb-8bf6f9b8f429",
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "processor_with_lm.save_pretrained(os.getcwd())"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "0e7e4d6f-01d0-4a24-9980-a6583fb6d048",
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": []
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 10,
141
+ "id": "c5ea011b-9412-484a-b798-15fb6e338a99",
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "name": "stdout",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "Reading language_model/3gram_correct.arpa\n",
149
+ "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
150
+ "****************************************************************************************************\n",
151
+ "SUCCESS\n"
152
+ ]
153
+ }
154
+ ],
155
+ "source": [
156
+ "!../kenlm/build/bin/build_binary language_model/3gram_correct.arpa language_model/3gram.bin"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "70c2709b-0b5c-440f-ae9f-11f8045e8fed",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": []
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 11,
170
+ "id": "c5db962f-15f1-4b65-87e3-81e1af14e32e",
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": [
174
+ "from huggingface_hub import Repository"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": 12,
180
+ "id": "d3801f28-cdb5-40cd-b1b9-5a00f8f24720",
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "repo = Repository(local_dir=\".\")"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": null,
190
+ "id": "c6421313-5d36-45ce-8300-3988985e7239",
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": []
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 14,
198
+ "id": "7dcfe5d2-063f-4b34-9fdd-5f025ef9f699",
199
+ "metadata": {},
200
+ "outputs": [
201
+ {
202
+ "name": "stderr",
203
+ "output_type": "stream",
204
+ "text": [
205
+ "Several commits (2) will be pushed upstream.\n",
206
+ "The progress bars may be unreliable.\n"
207
+ ]
208
+ },
209
+ {
210
+ "data": {
211
+ "application/vnd.jupyter.widget-view+json": {
212
+ "model_id": "d17d7664ff97403f9f428264855729c2",
213
+ "version_major": 2,
214
+ "version_minor": 0
215
+ },
216
+ "text/plain": [
217
+ "Upload file language_model/3gram_correct.arpa: 0%| | 32.0k/2.59G [00:00<?, ?B/s]"
218
+ ]
219
+ },
220
+ "metadata": {},
221
+ "output_type": "display_data"
222
+ },
223
+ {
224
+ "data": {
225
+ "application/vnd.jupyter.widget-view+json": {
226
+ "model_id": "479389ca92884367bdda025024eaa38d",
227
+ "version_major": 2,
228
+ "version_minor": 0
229
+ },
230
+ "text/plain": [
231
+ "Upload file language_model/3gram.bin: 0%| | 32.0k/771M [00:00<?, ?B/s]"
232
+ ]
233
+ },
234
+ "metadata": {},
235
+ "output_type": "display_data"
236
+ },
237
+ {
238
+ "data": {
239
+ "application/vnd.jupyter.widget-view+json": {
240
+ "model_id": "2c975b43bb9040c2b29653eede9add4c",
241
+ "version_major": 2,
242
+ "version_minor": 0
243
+ },
244
+ "text/plain": [
245
+ "Upload file language_model/unigrams.txt: 0%| | 32.0k/39.0M [00:00<?, ?B/s]"
246
+ ]
247
+ },
248
+ "metadata": {},
249
+ "output_type": "display_data"
250
+ },
251
+ {
252
+ "ename": "KeyboardInterrupt",
253
+ "evalue": "",
254
+ "output_type": "error",
255
+ "traceback": [
256
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
257
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
258
+ "\u001b[0;32m/tmp/ipykernel_7629/1986395493.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrepo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpush_to_hub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommit_message\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Upload lm-boosted decoder\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
259
+ "\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mpush_to_hub\u001b[0;34m(self, commit_message, blocking, clean_ok, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1233\u001b[0m \u001b[0mupstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mf\"origin {self.current_branch}\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1234\u001b[0m \u001b[0mblocking\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mblocking\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1235\u001b[0;31m \u001b[0mauto_lfs_prune\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mauto_lfs_prune\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1236\u001b[0m )\n\u001b[1;32m 1237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
260
+ "\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/site-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mgit_push\u001b[0;34m(self, upstream, blocking, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 989\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 990\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mblocking\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 991\u001b[0;31m \u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommunicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 992\u001b[0m \u001b[0mreturn_code\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpoll\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 993\u001b[0m \u001b[0mprocess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkill\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
261
+ "\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36mcommunicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 962\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 963\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 964\u001b[0;31m \u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_communicate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendtime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 965\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;31m# https://bugs.python.org/issue25942\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
262
+ "\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/subprocess.py\u001b[0m in \u001b[0;36m_communicate\u001b[0;34m(self, input, endtime, orig_timeout)\u001b[0m\n\u001b[1;32m 1713\u001b[0m 'failed to raise TimeoutExpired.')\n\u001b[1;32m 1714\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1715\u001b[0;31m \u001b[0mready\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mselector\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1716\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_timeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mendtime\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morig_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstdout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstderr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1717\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
263
+ "\u001b[0;32m~/miniconda3/envs/xlsr_ft/lib/python3.7/selectors.py\u001b[0m in \u001b[0;36mselect\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0mready\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 415\u001b[0;31m \u001b[0mfd_event_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_selector\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpoll\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 416\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mInterruptedError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 417\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mready\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
264
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
265
+ ]
266
+ }
267
+ ],
268
+ "source": [
269
+ "repo.push_to_hub(commit_message=\"Upload lm-boosted decoder\")"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": null,
275
+ "id": "a505c088-5f40-4d9a-8d75-263a07cc93a5",
276
+ "metadata": {},
277
+ "outputs": [],
278
+ "source": []
279
+ }
280
+ ],
281
+ "metadata": {
282
+ "kernelspec": {
283
+ "display_name": "Python 3 (ipykernel)",
284
+ "language": "python",
285
+ "name": "python3"
286
+ },
287
+ "language_info": {
288
+ "codemirror_mode": {
289
+ "name": "ipython",
290
+ "version": 3
291
+ },
292
+ "file_extension": ".py",
293
+ "mimetype": "text/x-python",
294
+ "name": "python",
295
+ "nbconvert_exporter": "python",
296
+ "pygments_lexer": "ipython3",
297
+ "version": "3.7.10"
298
+ }
299
+ },
300
+ "nbformat": 4,
301
+ "nbformat_minor": 5
302
+ }
alphabet.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"labels": [" ", "\\", "_", "e", "g", "l", "n", "p", "r", "s", "t", "\u0c01", "\u0c02", "\u0c03", "\u0c05", "\u0c06", "\u0c07", "\u0c08", "\u0c09", "\u0c0a", "\u0c0b", "\u0c0e", "\u0c0f", "\u0c10", "\u0c12", "\u0c13", "\u0c14", "\u0c15", "\u0c16", "\u0c17", "\u0c18", "\u0c19", "\u0c1a", "\u0c1b", "\u0c1c", "\u0c1e", "\u0c1f", "\u0c20", "\u0c21", "\u0c22", "\u0c23", "\u0c24", "\u0c25", "\u0c26", "\u0c27", "\u0c28", "\u0c2a", "\u0c2b", "\u0c2c", "\u0c2d", "\u0c2e", "\u0c2f", "\u0c30", "\u0c31", "\u0c32", "\u0c33", "\u0c35", "\u0c36", "\u0c37", "\u0c38", "\u0c39", "\u0c3e", "\u0c3f", "\u0c40", "\u0c41", "\u0c42", "\u0c43", "\u0c46", "\u0c47", "\u0c48", "\u0c4a", "\u0c4b", "\u0c4c", "\u0c4d", "\u200c", "\u2047", "", "<s>", "</s>"], "is_bpe": false}
language_model/3gram.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e25016b514710dfc9e6dbcb35012f1b08a21302642e6efe56c14b107f2e412d4
3
+ size 808445046
language_model/attrs.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
language_model/unigrams.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11a16ee59701a2b8769eb92a856b8fc786fda333dd16923f0d51c6f887093735
3
+ size 40926732
preprocessor_config.json CHANGED
@@ -4,6 +4,7 @@
4
  "feature_size": 1,
5
  "padding_side": "right",
6
  "padding_value": 0,
 
7
  "return_attention_mask": true,
8
  "sampling_rate": 16000
9
  }
 
4
  "feature_size": 1,
5
  "padding_side": "right",
6
  "padding_value": 0,
7
+ "processor_class": "Wav2Vec2ProcessorWithLM",
8
  "return_attention_mask": true,
9
  "sampling_rate": 16000
10
  }
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
 
1
+ {"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "chmanoj/xls-r-300m-te", "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorWithLM"}