diallomama committed on
Commit
2e00fbe
1 Parent(s): 42ebed4

tokenizer and vocab (from up)

Browse files
Files changed (2) hide show
  1. tokenizer_config.json +12 -0
  2. vocab.json +317 -0
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "do_lower_case": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "[PAD]",
8
+ "replace_word_delimiter_char": " ",
9
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
10
+ "unk_token": "[UNK]",
11
+ "word_delimiter_token": "|"
12
+ }
vocab.json ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "!": 1,
3
+ "\"": 2,
4
+ "$": 3,
5
+ "%": 4,
6
+ "&": 5,
7
+ "'": 6,
8
+ "(": 7,
9
+ ")": 8,
10
+ "*": 9,
11
+ ",": 10,
12
+ "-": 11,
13
+ ".": 12,
14
+ "/": 13,
15
+ ":": 14,
16
+ ";": 15,
17
+ "=": 16,
18
+ "?": 17,
19
+ "[PAD]": 314,
20
+ "[UNK]": 313,
21
+ "^": 18,
22
+ "_": 19,
23
+ "`": 20,
24
+ "a": 21,
25
+ "b": 22,
26
+ "c": 23,
27
+ "d": 24,
28
+ "e": 25,
29
+ "f": 26,
30
+ "g": 27,
31
+ "h": 28,
32
+ "i": 29,
33
+ "j": 30,
34
+ "k": 31,
35
+ "l": 32,
36
+ "m": 33,
37
+ "n": 34,
38
+ "o": 35,
39
+ "p": 36,
40
+ "q": 37,
41
+ "r": 38,
42
+ "s": 39,
43
+ "t": 40,
44
+ "u": 41,
45
+ "v": 42,
46
+ "w": 43,
47
+ "x": 44,
48
+ "y": 45,
49
+ "z": 46,
50
+ "{": 47,
51
+ "|": 0,
52
+ "}": 49,
53
+ "~": 50,
54
+ "£": 51,
55
+ "§": 52,
56
+ "«": 53,
57
+ "®": 54,
58
+ "°": 55,
59
+ "±": 56,
60
+ "´": 57,
61
+ "µ": 58,
62
+ "·": 59,
63
+ "º": 60,
64
+ "»": 61,
65
+ "½": 62,
66
+ "×": 63,
67
+ "ß": 64,
68
+ "à": 65,
69
+ "á": 66,
70
+ "â": 67,
71
+ "ã": 68,
72
+ "ä": 69,
73
+ "å": 70,
74
+ "æ": 71,
75
+ "ç": 72,
76
+ "è": 73,
77
+ "é": 74,
78
+ "ê": 75,
79
+ "ë": 76,
80
+ "ì": 77,
81
+ "í": 78,
82
+ "î": 79,
83
+ "ï": 80,
84
+ "ð": 81,
85
+ "ñ": 82,
86
+ "ò": 83,
87
+ "ó": 84,
88
+ "ô": 85,
89
+ "õ": 86,
90
+ "ö": 87,
91
+ "ø": 88,
92
+ "ù": 89,
93
+ "ú": 90,
94
+ "û": 91,
95
+ "ü": 92,
96
+ "ý": 93,
97
+ "þ": 94,
98
+ "ÿ": 95,
99
+ "ā": 96,
100
+ "ă": 97,
101
+ "ą": 98,
102
+ "ć": 99,
103
+ "ċ": 100,
104
+ "č": 101,
105
+ "ď": 102,
106
+ "đ": 103,
107
+ "ē": 104,
108
+ "ė": 105,
109
+ "ę": 106,
110
+ "ě": 107,
111
+ "ğ": 108,
112
+ "ġ": 109,
113
+ "ħ": 110,
114
+ "ĩ": 111,
115
+ "ī": 112,
116
+ "ı": 113,
117
+ "ķ": 114,
118
+ "ĺ": 115,
119
+ "ļ": 116,
120
+ "ľ": 117,
121
+ "ł": 118,
122
+ "ń": 119,
123
+ "ņ": 120,
124
+ "ň": 121,
125
+ "ō": 122,
126
+ "ŏ": 123,
127
+ "ő": 124,
128
+ "œ": 125,
129
+ "ř": 126,
130
+ "ś": 127,
131
+ "ş": 128,
132
+ "š": 129,
133
+ "ţ": 130,
134
+ "ť": 131,
135
+ "ũ": 132,
136
+ "ū": 133,
137
+ "ů": 134,
138
+ "ų": 135,
139
+ "ź": 136,
140
+ "ż": 137,
141
+ "ž": 138,
142
+ "ơ": 139,
143
+ "ư": 140,
144
+ "ǀ": 141,
145
+ "ǃ": 142,
146
+ "ǎ": 143,
147
+ "ǔ": 144,
148
+ "ǫ": 145,
149
+ "ǹ": 146,
150
+ "ș": 147,
151
+ "ț": 148,
152
+ "ə": 149,
153
+ "ɨ": 150,
154
+ "ʉ": 151,
155
+ "ʔ": 152,
156
+ "ʻ": 153,
157
+ "ʼ": 154,
158
+ "ʽ": 155,
159
+ "ʾ": 156,
160
+ "ʿ": 157,
161
+ "ː": 158,
162
+ "ˢ": 159,
163
+ "̀": 160,
164
+ "́": 161,
165
+ "̂": 162,
166
+ "̃": 163,
167
+ "̇": 164,
168
+ "̈": 165,
169
+ "̐": 166,
170
+ "̠": 167,
171
+ "̧": 168,
172
+ "̱": 169,
173
+ "̲": 170,
174
+ "α": 171,
175
+ "β": 172,
176
+ "γ": 173,
177
+ "δ": 174,
178
+ "ε": 175,
179
+ "ζ": 176,
180
+ "η": 177,
181
+ "θ": 178,
182
+ "ι": 179,
183
+ "κ": 180,
184
+ "μ": 181,
185
+ "ν": 182,
186
+ "ο": 183,
187
+ "π": 184,
188
+ "ρ": 185,
189
+ "ς": 186,
190
+ "σ": 187,
191
+ "τ": 188,
192
+ "υ": 189,
193
+ "ψ": 190,
194
+ "ω": 191,
195
+ "ό": 192,
196
+ "а": 193,
197
+ "г": 194,
198
+ "е": 195,
199
+ "з": 196,
200
+ "и": 197,
201
+ "к": 198,
202
+ "м": 199,
203
+ "н": 200,
204
+ "п": 201,
205
+ "р": 202,
206
+ "э": 203,
207
+ "я": 204,
208
+ "і": 205,
209
+ "ј": 206,
210
+ "ҫ": 207,
211
+ "գ": 208,
212
+ "զ": 209,
213
+ "ا": 210,
214
+ "ب": 211,
215
+ "ة": 212,
216
+ "د": 213,
217
+ "ر": 214,
218
+ "ل": 215,
219
+ "م": 216,
220
+ "ن": 217,
221
+ "و": 218,
222
+ "ي": 219,
223
+ "ቀ": 220,
224
+ "ወ": 221,
225
+ "ደ": 222,
226
+ "ጠ": 223,
227
+ "ḍ": 224,
228
+ "ṅ": 225,
229
+ "ṇ": 226,
230
+ "ṣ": 227,
231
+ "ṭ": 228,
232
+ "ṯ": 229,
233
+ "ạ": 230,
234
+ "ả": 231,
235
+ "ầ": 232,
236
+ "ậ": 233,
237
+ "ắ": 234,
238
+ "ẵ": 235,
239
+ "ề": 236,
240
+ "ễ": 237,
241
+ "ệ": 238,
242
+ "ị": 239,
243
+ "ồ": 240,
244
+ "ổ": 241,
245
+ "ộ": 242,
246
+ "ờ": 243,
247
+ "ợ": 244,
248
+ "ủ": 245,
249
+ "ử": 246,
250
+ "ỳ": 247,
251
+ "‐": 248,
252
+ "–": 249,
253
+ "—": 250,
254
+ "―": 251,
255
+ "‘": 252,
256
+ "’": 253,
257
+ "“": 254,
258
+ "”": 255,
259
+ "„": 256,
260
+ "†": 257,
261
+ "…": 258,
262
+ "′": 259,
263
+ "‹": 260,
264
+ "›": 261,
265
+ "€": 262,
266
+ "₽": 263,
267
+ "ℂ": 264,
268
+ "ℕ": 265,
269
+ "ℝ": 266,
270
+ "ℤ": 267,
271
+ "ℰ": 268,
272
+ "ℵ": 269,
273
+ "→": 270,
274
+ "∅": 271,
275
+ "∆": 272,
276
+ "∈": 273,
277
+ "−": 274,
278
+ "∞": 275,
279
+ "∨": 276,
280
+ "∼": 277,
281
+ "≥": 278,
282
+ "─": 279,
283
+ "☉": 280,
284
+ "い": 281,
285
+ "う": 282,
286
+ "た": 283,
287
+ "つ": 284,
288
+ "ぬ": 285,
289
+ "の": 286,
290
+ "ひ": 287,
291
+ "へ": 288,
292
+ "ま": 289,
293
+ "め": 290,
294
+ "や": 291,
295
+ "ゔ": 292,
296
+ "乃": 293,
297
+ "京": 294,
298
+ "北": 295,
299
+ "扬": 296,
300
+ "文": 297,
301
+ "星": 298,
302
+ "术": 299,
303
+ "杜": 300,
304
+ "甌": 301,
305
+ "美": 302,
306
+ "西": 303,
307
+ "貴": 304,
308
+ "青": 305,
309
+ "馆": 306,
310
+ "ꝑ": 307,
311
+ "고": 308,
312
+ "기": 309,
313
+ "먹": 310,
314
+ "삼": 311,
315
+ "생": 312,
316
+ "집": 313
317
+ }