Antony Kalloniatis committed on
Commit
49633a2
1 Parent(s): 3a17b6b

Upload tokenizer

Browse files
Files changed (4) hide show
  1. merges.txt +0 -0
  2. special_tokens_map.json +20 -0
  3. tokenizer_config.json +342 -0
  4. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<special0>",
4
+ "<special1>",
5
+ "<special2>",
6
+ "<special3>",
7
+ "<special4>",
8
+ "<special5>",
9
+ "<special6>",
10
+ "<special7>",
11
+ "<special8>",
12
+ "<special9>"
13
+ ],
14
+ "bos_token": "<s>",
15
+ "cls_token": "</s>",
16
+ "mask_token": "<special1>",
17
+ "pad_token": "<pad>",
18
+ "sep_token": "</s>",
19
+ "unk_token": "<unk>"
20
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "</s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<special0>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<special1>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "<special2>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "<special3>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "<special4>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "<special5>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "<special6>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "<special7>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "<special8>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "<special9>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ }
115
+ },
116
+ "additional_special_tokens": [
117
+ "<special0>",
118
+ "<special1>",
119
+ "<special2>",
120
+ "<special3>",
121
+ "<special4>",
122
+ "<special5>",
123
+ "<special6>",
124
+ "<special7>",
125
+ "<special8>",
126
+ "<special9>"
127
+ ],
128
+ "bos_token": "<s>",
129
+ "clean_up_tokenization_spaces": true,
130
+ "cls_token": "</s>",
131
+ "do_lowercase_and_remove_accent": false,
132
+ "id2lang": {
133
+ "0": "af",
134
+ "1": "als",
135
+ "10": "be",
136
+ "11": "bg",
137
+ "12": "bn",
138
+ "13": "br",
139
+ "14": "bs",
140
+ "15": "ca",
141
+ "16": "ceb",
142
+ "17": "ckb",
143
+ "18": "cs",
144
+ "19": "cy",
145
+ "2": "am",
146
+ "20": "da",
147
+ "21": "de",
148
+ "22": "el",
149
+ "23": "en",
150
+ "24": "eo",
151
+ "25": "es",
152
+ "26": "et",
153
+ "27": "eu",
154
+ "28": "fa",
155
+ "29": "fi",
156
+ "3": "an",
157
+ "30": "fr",
158
+ "31": "fy",
159
+ "32": "ga",
160
+ "33": "gan",
161
+ "34": "gl",
162
+ "35": "gu",
163
+ "36": "he",
164
+ "37": "hi",
165
+ "38": "hr",
166
+ "39": "hu",
167
+ "4": "ang",
168
+ "40": "hy",
169
+ "41": "ia",
170
+ "42": "id",
171
+ "43": "is",
172
+ "44": "it",
173
+ "45": "ja",
174
+ "46": "jv",
175
+ "47": "ka",
176
+ "48": "kk",
177
+ "49": "kn",
178
+ "5": "ar",
179
+ "50": "ko",
180
+ "51": "ku",
181
+ "52": "la",
182
+ "53": "lb",
183
+ "54": "lt",
184
+ "55": "lv",
185
+ "56": "mk",
186
+ "57": "ml",
187
+ "58": "mn",
188
+ "59": "mr",
189
+ "6": "arz",
190
+ "60": "ms",
191
+ "61": "my",
192
+ "62": "nds",
193
+ "63": "ne",
194
+ "64": "nl",
195
+ "65": "nn",
196
+ "66": "no",
197
+ "67": "oc",
198
+ "68": "pl",
199
+ "69": "pt",
200
+ "7": "ast",
201
+ "70": "ro",
202
+ "71": "ru",
203
+ "72": "scn",
204
+ "73": "sco",
205
+ "74": "sh",
206
+ "75": "si",
207
+ "76": "simple",
208
+ "77": "sk",
209
+ "78": "sl",
210
+ "79": "sq",
211
+ "8": "az",
212
+ "80": "sr",
213
+ "81": "sv",
214
+ "82": "sw",
215
+ "83": "ta",
216
+ "84": "te",
217
+ "85": "th",
218
+ "86": "tl",
219
+ "87": "tr",
220
+ "88": "tt",
221
+ "89": "uk",
222
+ "9": "bar",
223
+ "90": "ur",
224
+ "91": "uz",
225
+ "92": "vi",
226
+ "93": "war",
227
+ "94": "wuu",
228
+ "95": "yi",
229
+ "96": "zh",
230
+ "97": "zh_classical",
231
+ "98": "zh_min_nan",
232
+ "99": "zh_yue"
233
+ },
234
+ "lang2id": {
235
+ "af": 0,
236
+ "als": 1,
237
+ "am": 2,
238
+ "an": 3,
239
+ "ang": 4,
240
+ "ar": 5,
241
+ "arz": 6,
242
+ "ast": 7,
243
+ "az": 8,
244
+ "bar": 9,
245
+ "be": 10,
246
+ "bg": 11,
247
+ "bn": 12,
248
+ "br": 13,
249
+ "bs": 14,
250
+ "ca": 15,
251
+ "ceb": 16,
252
+ "ckb": 17,
253
+ "cs": 18,
254
+ "cy": 19,
255
+ "da": 20,
256
+ "de": 21,
257
+ "el": 22,
258
+ "en": 23,
259
+ "eo": 24,
260
+ "es": 25,
261
+ "et": 26,
262
+ "eu": 27,
263
+ "fa": 28,
264
+ "fi": 29,
265
+ "fr": 30,
266
+ "fy": 31,
267
+ "ga": 32,
268
+ "gan": 33,
269
+ "gl": 34,
270
+ "gu": 35,
271
+ "he": 36,
272
+ "hi": 37,
273
+ "hr": 38,
274
+ "hu": 39,
275
+ "hy": 40,
276
+ "ia": 41,
277
+ "id": 42,
278
+ "is": 43,
279
+ "it": 44,
280
+ "ja": 45,
281
+ "jv": 46,
282
+ "ka": 47,
283
+ "kk": 48,
284
+ "kn": 49,
285
+ "ko": 50,
286
+ "ku": 51,
287
+ "la": 52,
288
+ "lb": 53,
289
+ "lt": 54,
290
+ "lv": 55,
291
+ "mk": 56,
292
+ "ml": 57,
293
+ "mn": 58,
294
+ "mr": 59,
295
+ "ms": 60,
296
+ "my": 61,
297
+ "nds": 62,
298
+ "ne": 63,
299
+ "nl": 64,
300
+ "nn": 65,
301
+ "no": 66,
302
+ "oc": 67,
303
+ "pl": 68,
304
+ "pt": 69,
305
+ "ro": 70,
306
+ "ru": 71,
307
+ "scn": 72,
308
+ "sco": 73,
309
+ "sh": 74,
310
+ "si": 75,
311
+ "simple": 76,
312
+ "sk": 77,
313
+ "sl": 78,
314
+ "sq": 79,
315
+ "sr": 80,
316
+ "sv": 81,
317
+ "sw": 82,
318
+ "ta": 83,
319
+ "te": 84,
320
+ "th": 85,
321
+ "tl": 86,
322
+ "tr": 87,
323
+ "tt": 88,
324
+ "uk": 89,
325
+ "ur": 90,
326
+ "uz": 91,
327
+ "vi": 92,
328
+ "war": 93,
329
+ "wuu": 94,
330
+ "yi": 95,
331
+ "zh": 96,
332
+ "zh_classical": 97,
333
+ "zh_min_nan": 98,
334
+ "zh_yue": 99
335
+ },
336
+ "mask_token": "<special1>",
337
+ "model_max_length": 512,
338
+ "pad_token": "<pad>",
339
+ "sep_token": "</s>",
340
+ "tokenizer_class": "XLMTokenizer",
341
+ "unk_token": "<unk>"
342
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff