zpn committed on
Commit
25688bc
1 Parent(s): 546cf4f

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +314 -0
  3. tokenizer_config.json +9 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[CLS]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[PAD]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": null,
53
+ "pre_tokenizer": {
54
+ "type": "WhitespaceSplit"
55
+ },
56
+ "post_processor": {
57
+ "type": "TemplateProcessing",
58
+ "single": [
59
+ {
60
+ "SpecialToken": {
61
+ "id": "[CLS]",
62
+ "type_id": 0
63
+ }
64
+ },
65
+ {
66
+ "Sequence": {
67
+ "id": "A",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "SpecialToken": {
73
+ "id": "[SEP]",
74
+ "type_id": 0
75
+ }
76
+ }
77
+ ],
78
+ "pair": [
79
+ {
80
+ "Sequence": {
81
+ "id": "A",
82
+ "type_id": 0
83
+ }
84
+ },
85
+ {
86
+ "Sequence": {
87
+ "id": "B",
88
+ "type_id": 1
89
+ }
90
+ }
91
+ ],
92
+ "special_tokens": {
93
+ "[CLS]": {
94
+ "id": "[CLS]",
95
+ "ids": [
96
+ 1
97
+ ],
98
+ "tokens": [
99
+ "[CLS]"
100
+ ]
101
+ },
102
+ "[SEP]": {
103
+ "id": "[SEP]",
104
+ "ids": [
105
+ 2
106
+ ],
107
+ "tokens": [
108
+ "[SEP]"
109
+ ]
110
+ }
111
+ }
112
+ },
113
+ "decoder": null,
114
+ "model": {
115
+ "type": "WordLevel",
116
+ "vocab": {
117
+ "[UNK]": 0,
118
+ "[CLS]": 1,
119
+ "[SEP]": 2,
120
+ "[PAD]": 3,
121
+ "[MASK]": 4,
122
+ "[=Branch1]": 5,
123
+ "[Branch1]": 6,
124
+ "[=C]": 7,
125
+ "[Ring1]": 8,
126
+ "[N]": 9,
127
+ "[=O]": 10,
128
+ "[O]": 11,
129
+ "[Ring2]": 12,
130
+ "[=N]": 13,
131
+ "[C@H1]": 14,
132
+ "[C@@H1]": 15,
133
+ "[Branch2]": 16,
134
+ "[F]": 17,
135
+ "[S]": 18,
136
+ "[=Branch2]": 19,
137
+ "[#Branch1]": 20,
138
+ "[NH1+1]": 21,
139
+ "[=Ring1]": 22,
140
+ "[Cl]": 23,
141
+ "[#Branch2]": 24,
142
+ "[NH1]": 25,
143
+ "[NH2+1]": 26,
144
+ "[#C]": 27,
145
+ "[Br]": 28,
146
+ "[#N]": 29,
147
+ "[/C]": 30,
148
+ "[C]": 31,
149
+ "[P]": 32,
150
+ "[C@@]": 33,
151
+ "[C@]": 34,
152
+ "[O-1]": 35,
153
+ "[NH3+1]": 36,
154
+ "[N-1]": 37,
155
+ "[\\C]": 38,
156
+ "[=NH1+1]": 39,
157
+ "[=S]": 40,
158
+ "[I]": 41,
159
+ "[/N]": 42,
160
+ "[\\-Ring1]": 43,
161
+ "[/S]": 44,
162
+ "[/-Ring1]": 45,
163
+ "[/Cl]": 46,
164
+ "[\\N]": 47,
165
+ "[\\Cl]": 48,
166
+ "[=NH2+1]": 49,
167
+ "[/NH1+1]": 50,
168
+ "[/O]": 51,
169
+ "[H]": 52,
170
+ "[Si]": 53,
171
+ "[\\O]": 54,
172
+ "[=Ring2]": 55,
173
+ "[\\-Ring2]": 56,
174
+ "[N+1]": 57,
175
+ "[\\S]": 58,
176
+ "[S-1]": 59,
177
+ "[/-Ring2]": 60,
178
+ "[/C@@H1]": 61,
179
+ "[/C@H1]": 62,
180
+ "[\\C@@H1]": 63,
181
+ "[S@]": 64,
182
+ "[=N+1]": 65,
183
+ "[S@@]": 66,
184
+ "[\\C@H1]": 67,
185
+ "[/NH1]": 68,
186
+ "[B]": 69,
187
+ "[/F]": 70,
188
+ "[CH1]": 71,
189
+ "[CH0]": 72,
190
+ "[\\O-1]": 73,
191
+ "[/O-1]": 74,
192
+ "[\\F]": 75,
193
+ "[/Br]": 76,
194
+ "[/C@]": 77,
195
+ "[\\NH1]": 78,
196
+ "[\\C@]": 79,
197
+ "[\\NH1+1]": 80,
198
+ "[P@@]": 81,
199
+ "[/C@@]": 82,
200
+ "[P@]": 83,
201
+ "[\\Br]": 84,
202
+ "[\\C@@]": 85,
203
+ "[/I]": 86,
204
+ "[S+1]": 87,
205
+ "[N@+1]": 88,
206
+ "[N@@+1]": 89,
207
+ "[/N+1]": 90,
208
+ "[CH2]": 91,
209
+ "[Sn]": 92,
210
+ "[OH0]": 93,
211
+ "[\\I]": 94,
212
+ "[/NH2+1]": 95,
213
+ "[\\N+1]": 96,
214
+ "[=S+1]": 97,
215
+ "[\\NH2+1]": 98,
216
+ "[/OH0]": 99,
217
+ "[=S@]": 100,
218
+ "[=S@@]": 101,
219
+ "[=P]": 102,
220
+ "[P+1]": 103,
221
+ "[/H]": 104,
222
+ "[/P]": 105,
223
+ "[/NH3+1]": 106,
224
+ "[\\H]": 107,
225
+ "[B-1]": 108,
226
+ "[S@@+1]": 109,
227
+ "[\\P]": 110,
228
+ "[C+1]": 111,
229
+ "[S@+1]": 112,
230
+ "[=O+1]": 113,
231
+ "[/Si]": 114,
232
+ "[\\NH3+1]": 115,
233
+ "[N@]": 116,
234
+ "[NH0]": 117,
235
+ "[\\OH0]": 118,
236
+ "[C-1]": 119,
237
+ "[/S@]": 120,
238
+ "[/S@@]": 121,
239
+ "[Si@@]": 122,
240
+ "[P@@H1]": 123,
241
+ "[\\Si]": 124,
242
+ "[/Sn]": 125,
243
+ "[CH1-1]": 126,
244
+ "[Si@]": 127,
245
+ "[/N-1]": 128,
246
+ "[N@@]": 129,
247
+ "[=NH0]": 130,
248
+ "[BH3-1]": 131,
249
+ "[IH2]": 132,
250
+ "[\\B]": 133,
251
+ "[/B]": 134,
252
+ "[Sn@]": 135,
253
+ "[P@+1]": 136,
254
+ "[P@@+1]": 137,
255
+ "[/S+1]": 138,
256
+ "[Sn@@]": 139,
257
+ "[=B]": 140,
258
+ "[=IH2]": 141,
259
+ "[BH1-1]": 142,
260
+ "[P@H1]": 143,
261
+ "[#N+1]": 144,
262
+ "[=P@@]": 145,
263
+ "[=P@]": 146,
264
+ "[O+1]": 147,
265
+ "[SnH2+1]": 148,
266
+ "[SnH4+2]": 149,
267
+ "[/CH0]": 150,
268
+ "[=17O]": 151,
269
+ "[=CH0]": 152,
270
+ "[SnH1]": 153,
271
+ "[\\N-1]": 154,
272
+ "[\\S@]": 155,
273
+ "[P@@H1+1]": 156,
274
+ "[B@-1]": 157,
275
+ "[B@@-1]": 158,
276
+ "[I+1]": 159,
277
+ "[Sn+1]": 160,
278
+ "[Sn+2]": 161,
279
+ "[Br+1]": 162,
280
+ "[\\P@@]": 163,
281
+ "[\\P@]": 164,
282
+ "[\\Sn]": 165,
283
+ "[#S]": 166,
284
+ "[/CH1]": 167,
285
+ "[/NH0]": 168,
286
+ "[17O]": 169,
287
+ "[18OH1]": 170,
288
+ "[=Si]": 171,
289
+ "[BH2-1]": 172,
290
+ "[S@@-1]": 173,
291
+ "[S@@H1]": 174,
292
+ "[Sn+3]": 175,
293
+ "[SnH2]": 176,
294
+ "[SnH6+3]": 177,
295
+ "[\\C-1]": 178,
296
+ "[\\NH0]": 179,
297
+ "[\\S+1]": 180,
298
+ "[#P]": 181,
299
+ "[/CH2]": 182,
300
+ "[/O+1]": 183,
301
+ "[/P@@]": 184,
302
+ "[/P@]": 185,
303
+ "[/Si@@]": 186,
304
+ "[/Si@]": 187,
305
+ "[S@H1]": 188,
306
+ "[SH3]": 189,
307
+ "[SiH2]": 190,
308
+ "[SiH3]": 191,
309
+ "[\\C+1]": 192,
310
+ "[\\S@@]": 193
311
+ },
312
+ "unk_token": "[UNK]"
313
+ }
314
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "model_max_length": 512,
5
+ "pad_token": "[PAD]",
6
+ "sep_token": "[SEP]",
7
+ "tokenizer_class": "PreTrainedTokenizerFast",
8
+ "unk_token": "[UNK]"
9
+ }