GinnM committed on
Commit
84ed39b
1 Parent(s): 72eaaca

Upload tokenizer

Files changed (3)
  1. special_tokens_map.json +9 -0
  2. tokenizer.json +340 -0
  3. tokenizer_config.json +5 -0
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "<cls>",
+   "cls_token": "<cls>",
+   "eos_token": "<sep>",
+   "mask_token": "<mask>",
+   "pad_token": "<pad>",
+   "sep_token": "<sep>",
+   "unk_token": "<unk>"
+ }
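
A minimal usage sketch (not part of this commit): when the repository is loaded with transformers, this mapping assigns the special-token roles. The repository id below is a placeholder, not the actual model path.

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual namespace/name of this repository.
tok = AutoTokenizer.from_pretrained("user/protein-tokenizer")

# special_tokens_map.json routes the generic roles to the concrete strings:
# bos/cls -> "<cls>", eos/sep -> "<sep>", plus "<pad>", "<mask>", "<unk>".
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.mask_token, tok.unk_token)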
tokenizer.json ADDED
@@ -0,0 +1,340 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "<pad>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 1,
+       "content": "<cls>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 2,
+       "content": "<sep>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 3,
+       "content": "<unk>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 4,
+       "content": "<mask>",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": null,
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "<cls>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "<sep>",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "SpecialToken": {
+           "id": "<cls>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "<sep>",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "<sep>",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "<cls>": {
+         "id": "<cls>",
+         "ids": [
+           1
+         ],
+         "tokens": [
+           "<cls>"
+         ]
+       },
+       "<sep>": {
+         "id": "<sep>",
+         "ids": [
+           2
+         ],
+         "tokens": [
+           "<sep>"
+         ]
+       }
+     }
+   },
+   "decoder": {
+     "type": "Metaspace",
+     "replacement": "▁",
+     "add_prefix_space": true
+   },
+   "model": {
+     "type": "Unigram",
+     "unk_id": 3,
+     "vocab": [
+       [
+         "<pad>",
+         0.0
+       ],
+       [
+         "<cls>",
+         0.0
+       ],
+       [
+         "<sep>",
+         0.0
+       ],
+       [
+         "<unk>",
+         0.0
+       ],
+       [
+         "<mask>",
+         0.0
+       ],
+       [
+         "L",
+         -2.7509312510529185
+       ],
+       [
+         "R",
+         -2.755084699217022
+       ],
+       [
+         "V",
+         -2.8114993435449485
+       ],
+       [
+         "T",
+         -2.818179108697013
+       ],
+       [
+         "D",
+         -2.8358745908237744
+       ],
+       [
+         "I",
+         -2.8386883982403575
+       ],
+       [
+         "S",
+         -2.8508125229928503
+       ],
+       [
+         "G",
+         -2.8876033679551227
+       ],
+       [
+         "E",
+         -2.9024328890049524
+       ],
+       [
+         "P",
+         -2.911018940638389
+       ],
+       [
+         "K",
+         -2.9183231107580596
+       ],
+       [
+         "A",
+         -2.919747614014888
+       ],
+       [
+         "N",
+         -3.123051382634724
+       ],
+       [
+         "F",
+         -3.17920066040503
+       ],
+       [
+         "Q",
+         -3.190869184456883
+       ],
+       [
+         "Y",
+         -3.4628954984309086
+       ],
+       [
+         "H",
+         -3.741564333998106
+       ],
+       [
+         "M",
+         -3.771740452244479
+       ],
+       [
+         "C",
+         -4.177244477126914
+       ],
+       [
+         "W",
+         -4.26244359647038
+       ],
+       [
+         "AA",
+         -5.151553502459924
+       ],
+       [
+         "LL",
+         -5.297524048938797
+       ],
+       [
+         "LA",
+         -5.433632759818389
+       ],
+       [
+         "AL",
+         -5.5585947635634465
+       ],
+       [
+         "SS",
+         -5.653368570054353
+       ],
+       [
+         "LS",
+         -5.6542024366077595
+       ],
+       [
+         "SL",
+         -5.774350740282891
+       ],
+       [
+         "AG",
+         -5.927206325143018
+       ],
+       [
+         "VL",
+         -5.971670833218097
+       ],
+       [
+         "GG",
+         -5.998372467775059
+       ],
+       [
+         "VA",
+         -6.014387210129083
+       ],
+       [
+         "SA",
+         -6.0238415835311105
+       ],
+       [
+         "LV",
+         -6.027713726026638
+       ],
+       [
+         "EL",
+         -6.053145724740345
+       ],
+       [
+         "AV",
+         -6.058233627458359
+       ],
+       [
+         "GA",
+         -6.083252372183383
+       ],
+       [
+         "LG",
+         -6.1040977735585
+       ],
+       [
+         "AS",
+         -6.118553954419422
+       ],
+       [
+         "SG",
+         -6.132691839839374
+       ],
+       [
+         "EA",
+         -6.152907367995331
+       ],
+       [
+         "X",
+         -7.727541617091807
+       ],
+       [
+         "B",
+         -14.572756361550985
+       ],
+       [
+         "Z",
+         -15.799721233660796
+       ],
+       [
+         "U",
+         -16.47917235225718
+       ],
+       [
+         "O",
+         -19.974280430470184
+       ]
+     ]
+   }
+ }
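
A minimal sketch of how this file behaves, assuming a local copy of tokenizer.json; the input sequence is an arbitrary example, not data from this repository.

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")  # local path assumed

# The Unigram model segments the input into single amino acids plus the few
# two-residue pieces in the vocab (e.g. "AA", "LL"); symbols outside the vocab
# fall back to <unk> (unk_id 3). The TemplateProcessing post-processor then
# wraps the result as <cls> ... <sep>.
enc = tok.encode("MKTAYIAKQRLV")
print(enc.tokens)  # ['<cls>', ..., '<sep>']; exact pieces depend on the Unigram scores
print(enc.ids)

Since "normalizer" and "pre_tokenizer" are both null, the string is segmented as-is, so the tokenizer expects plain upper-case amino-acid sequences.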
tokenizer_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "tokenizer_class": "PreTrainedTokenizerFast"
+ }
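
A minimal sketch of how this config is consumed, assuming local copies of the files from this commit: "tokenizer_class" tells transformers to wrap tokenizer.json in PreTrainedTokenizerFast, and "model_max_length" is the no-limit sentinel (roughly 1e30), so no truncation is applied by default.

from transformers import PreTrainedTokenizerFast

# Local file path assumed; the special tokens mirror special_tokens_map.json.
tok = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    cls_token="<cls>",
    sep_token="<sep>",
    pad_token="<pad>",
    mask_token="<mask>",
    unk_token="<unk>",
)

batch = tok(["MKTAYIAK", "GA"], padding=True)  # pads the shorter sequence with <pad> (id 0)
print(batch["input_ids"])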