Parsa committed on
Commit
1f25e11
1 Parent(s): 6bd97f8

add tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 591, "</s>": 592}
merges.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ #version: 0.2 - Trained by `huggingface/tokenizers`
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
@@ -0,0 +1,696 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "special": true,
9
+ "content": "[PAD]",
10
+ "single_word": false,
11
+ "lstrip": false,
12
+ "rstrip": false,
13
+ "normalized": false
14
+ },
15
+ {
16
+ "id": 11,
17
+ "special": true,
18
+ "content": "[UNK]",
19
+ "single_word": false,
20
+ "lstrip": false,
21
+ "rstrip": false,
22
+ "normalized": false
23
+ },
24
+ {
25
+ "id": 12,
26
+ "special": true,
27
+ "content": "[CLS]",
28
+ "single_word": false,
29
+ "lstrip": false,
30
+ "rstrip": false,
31
+ "normalized": false
32
+ },
33
+ {
34
+ "id": 13,
35
+ "special": true,
36
+ "content": "[SEP]",
37
+ "single_word": false,
38
+ "lstrip": false,
39
+ "rstrip": false,
40
+ "normalized": false
41
+ },
42
+ {
43
+ "id": 14,
44
+ "special": true,
45
+ "content": "[MASK]",
46
+ "single_word": false,
47
+ "lstrip": true,
48
+ "rstrip": false,
49
+ "normalized": false
50
+ },
51
+ {
52
+ "id": 591,
53
+ "special": true,
54
+ "content": "<s>",
55
+ "single_word": false,
56
+ "lstrip": false,
57
+ "rstrip": false,
58
+ "normalized": true
59
+ },
60
+ {
61
+ "id": 592,
62
+ "special": true,
63
+ "content": "</s>",
64
+ "single_word": false,
65
+ "lstrip": false,
66
+ "rstrip": false,
67
+ "normalized": true
68
+ }
69
+ ],
70
+ "normalizer": null,
71
+ "pre_tokenizer": {
72
+ "type": "ByteLevel",
73
+ "add_prefix_space": false,
74
+ "trim_offsets": true
75
+ },
76
+ "post_processor": {
77
+ "type": "RobertaProcessing",
78
+ "sep": [
79
+ "[SEP]",
80
+ 13
81
+ ],
82
+ "cls": [
83
+ "[CLS]",
84
+ 12
85
+ ],
86
+ "trim_offsets": true,
87
+ "add_prefix_space": false
88
+ },
89
+ "decoder": {
90
+ "type": "ByteLevel",
91
+ "add_prefix_space": true,
92
+ "trim_offsets": true
93
+ },
94
+ "model": {
95
+ "type": "BPE",
96
+ "dropout": null,
97
+ "unk_token": null,
98
+ "continuing_subword_prefix": "",
99
+ "end_of_word_suffix": "",
100
+ "fuse_unk": false,
101
+ "vocab": {
102
+ "[PAD]": 0,
103
+ "[unused1]": 1,
104
+ "[unused2]": 2,
105
+ "[unused3]": 3,
106
+ "[unused4]": 4,
107
+ "[unused5]": 5,
108
+ "[unused6]": 6,
109
+ "[unused7]": 7,
110
+ "[unused8]": 8,
111
+ "[unused9]": 9,
112
+ "[unused10]": 10,
113
+ "[UNK]": 11,
114
+ "[CLS]": 12,
115
+ "[SEP]": 13,
116
+ "[MASK]": 14,
117
+ "c": 15,
118
+ "C": 16,
119
+ "(": 17,
120
+ ")": 18,
121
+ "O": 19,
122
+ "1": 20,
123
+ "2": 21,
124
+ "=": 22,
125
+ "N": 23,
126
+ ".": 24,
127
+ "n": 25,
128
+ "3": 26,
129
+ "F": 27,
130
+ "Cl": 28,
131
+ ">>": 29,
132
+ "~": 30,
133
+ "-": 31,
134
+ "4": 32,
135
+ "[C@H]": 33,
136
+ "S": 34,
137
+ "[C@@H]": 35,
138
+ "[O-]": 36,
139
+ "Br": 37,
140
+ "#": 38,
141
+ "/": 39,
142
+ "[nH]": 40,
143
+ "[N+]": 41,
144
+ "s": 42,
145
+ "5": 43,
146
+ "o": 44,
147
+ "P": 45,
148
+ "[Na+]": 46,
149
+ "[Si]": 47,
150
+ "I": 48,
151
+ "[Na]": 49,
152
+ "[Pd]": 50,
153
+ "[K+]": 51,
154
+ "[K]": 52,
155
+ "[P]": 53,
156
+ "B": 54,
157
+ "[C@]": 55,
158
+ "[C@@]": 56,
159
+ "[Cl-]": 57,
160
+ "6": 58,
161
+ "[OH-]": 59,
162
+ "\\": 60,
163
+ "[N-]": 61,
164
+ "[Li]": 62,
165
+ "[H]": 63,
166
+ "[2H]": 64,
167
+ "[NH4+]": 65,
168
+ "[c-]": 66,
169
+ "[P-]": 67,
170
+ "[Cs+]": 68,
171
+ "[Li+]": 69,
172
+ "[Cs]": 70,
173
+ "[NaH]": 71,
174
+ "[H-]": 72,
175
+ "[O+]": 73,
176
+ "[BH4-]": 74,
177
+ "[Cu]": 75,
178
+ "7": 76,
179
+ "[Mg]": 77,
180
+ "[Fe+2]": 78,
181
+ "[n+]": 79,
182
+ "[Sn]": 80,
183
+ "[BH-]": 81,
184
+ "[Pd+2]": 82,
185
+ "[CH]": 83,
186
+ "[I-]": 84,
187
+ "[Br-]": 85,
188
+ "[C-]": 86,
189
+ "[Zn]": 87,
190
+ "[B-]": 88,
191
+ "[F-]": 89,
192
+ "[Al]": 90,
193
+ "[P+]": 91,
194
+ "[BH3-]": 92,
195
+ "[Fe]": 93,
196
+ "[C]": 94,
197
+ "[AlH4]": 95,
198
+ "[Ni]": 96,
199
+ "[SiH]": 97,
200
+ "8": 98,
201
+ "[Cu+2]": 99,
202
+ "[Mn]": 100,
203
+ "[AlH]": 101,
204
+ "[nH+]": 102,
205
+ "[AlH4-]": 103,
206
+ "[O-2]": 104,
207
+ "[Cr]": 105,
208
+ "[Mg+2]": 106,
209
+ "[NH3+]": 107,
210
+ "[S@]": 108,
211
+ "[Pt]": 109,
212
+ "[Al+3]": 110,
213
+ "[S@@]": 111,
214
+ "[S-]": 112,
215
+ "[Ti]": 113,
216
+ "[Zn+2]": 114,
217
+ "[PH]": 115,
218
+ "[NH2+]": 116,
219
+ "[Ru]": 117,
220
+ "[Ag+]": 118,
221
+ "[S+]": 119,
222
+ "[I+3]": 120,
223
+ "[NH+]": 121,
224
+ "[Ca+2]": 122,
225
+ "[Ag]": 123,
226
+ "9": 124,
227
+ "[Os]": 125,
228
+ "[Se]": 126,
229
+ "[SiH2]": 127,
230
+ "[Ca]": 128,
231
+ "[Ti+4]": 129,
232
+ "[Ac]": 130,
233
+ "[Cu+]": 131,
234
+ "[S]": 132,
235
+ "[Rh]": 133,
236
+ "[Cl+3]": 134,
237
+ "[cH-]": 135,
238
+ "[Zn+]": 136,
239
+ "[O]": 137,
240
+ "[Cl+]": 138,
241
+ "[SH]": 139,
242
+ "[H+]": 140,
243
+ "[Pd+]": 141,
244
+ "[se]": 142,
245
+ "[PH+]": 143,
246
+ "[I]": 144,
247
+ "[Pt+2]": 145,
248
+ "[C+]": 146,
249
+ "[Mg+]": 147,
250
+ "[Hg]": 148,
251
+ "[W]": 149,
252
+ "[SnH]": 150,
253
+ "[SiH3]": 151,
254
+ "[Fe+3]": 152,
255
+ "[NH]": 153,
256
+ "[Mo]": 154,
257
+ "[CH2+]": 155,
258
+ "%10": 156,
259
+ "[CH2-]": 157,
260
+ "[CH2]": 158,
261
+ "[n-]": 159,
262
+ "[Ce+4]": 160,
263
+ "[NH-]": 161,
264
+ "[Co]": 162,
265
+ "[I+]": 163,
266
+ "[PH2]": 164,
267
+ "[Pt+4]": 165,
268
+ "[Ce]": 166,
269
+ "[B]": 167,
270
+ "[Sn+2]": 168,
271
+ "[Ba+2]": 169,
272
+ "%11": 170,
273
+ "[Fe-3]": 171,
274
+ "[18F]": 172,
275
+ "[SH-]": 173,
276
+ "[Pb+2]": 174,
277
+ "[Os-2]": 175,
278
+ "[Zr+4]": 176,
279
+ "[N]": 177,
280
+ "[Ir]": 178,
281
+ "[Bi]": 179,
282
+ "[Ni+2]": 180,
283
+ "[P@]": 181,
284
+ "[Co+2]": 182,
285
+ "[s+]": 183,
286
+ "[As]": 184,
287
+ "[P+3]": 185,
288
+ "[Hg+2]": 186,
289
+ "[Yb+3]": 187,
290
+ "[CH-]": 188,
291
+ "[Zr+2]": 189,
292
+ "[Mn+2]": 190,
293
+ "[CH+]": 191,
294
+ "[In]": 192,
295
+ "[KH]": 193,
296
+ "[Ce+3]": 194,
297
+ "[Zr]": 195,
298
+ "[AlH2-]": 196,
299
+ "[OH2+]": 197,
300
+ "[Ti+3]": 198,
301
+ "[Rh+2]": 199,
302
+ "[Sb]": 200,
303
+ "[S-2]": 201,
304
+ "%12": 202,
305
+ "[P@@]": 203,
306
+ "[Si@H]": 204,
307
+ "[Mn+4]": 205,
308
+ "p": 206,
309
+ "[Ba]": 207,
310
+ "[NH2-]": 208,
311
+ "[Ge]": 209,
312
+ "[Pb+4]": 210,
313
+ "[Cr+3]": 211,
314
+ "[Au]": 212,
315
+ "[LiH]": 213,
316
+ "[Sc+3]": 214,
317
+ "[o+]": 215,
318
+ "[Rh-3]": 216,
319
+ "%13": 217,
320
+ "[Br]": 218,
321
+ "[Sb-]": 219,
322
+ "[S@+]": 220,
323
+ "[I+2]": 221,
324
+ "[Ar]": 222,
325
+ "[V]": 223,
326
+ "[Cu-]": 224,
327
+ "[Al-]": 225,
328
+ "[Te]": 226,
329
+ "[13c]": 227,
330
+ "[13C]": 228,
331
+ "[Cl]": 229,
332
+ "[PH4+]": 230,
333
+ "[SiH4]": 231,
334
+ "[te]": 232,
335
+ "[CH3-]": 233,
336
+ "[S@@+]": 234,
337
+ "[Rh+3]": 235,
338
+ "[SH+]": 236,
339
+ "[Bi+3]": 237,
340
+ "[Br+2]": 238,
341
+ "[La]": 239,
342
+ "[La+3]": 240,
343
+ "[Pt-2]": 241,
344
+ "[N@@]": 242,
345
+ "[PH3+]": 243,
346
+ "[N@]": 244,
347
+ "[Si+4]": 245,
348
+ "[Sr+2]": 246,
349
+ "[Al+]": 247,
350
+ "[Pb]": 248,
351
+ "[SeH]": 249,
352
+ "[Si-]": 250,
353
+ "[V+5]": 251,
354
+ "[Y+3]": 252,
355
+ "[Re]": 253,
356
+ "[Ru+]": 254,
357
+ "[Sm]": 255,
358
+ "*": 256,
359
+ "[3H]": 257,
360
+ "[NH2]": 258,
361
+ "[Ag-]": 259,
362
+ "[13CH3]": 260,
363
+ "[OH+]": 261,
364
+ "[Ru+3]": 262,
365
+ "[OH]": 263,
366
+ "[Gd+3]": 264,
367
+ "[13CH2]": 265,
368
+ "[In+3]": 266,
369
+ "[Si@@]": 267,
370
+ "[Si@]": 268,
371
+ "[Ti+2]": 269,
372
+ "[Sn+]": 270,
373
+ "[Cl+2]": 271,
374
+ "[AlH-]": 272,
375
+ "[Pd-2]": 273,
376
+ "[SnH3]": 274,
377
+ "[B+3]": 275,
378
+ "[Cu-2]": 276,
379
+ "[Nd+3]": 277,
380
+ "[Pb+3]": 278,
381
+ "[13cH]": 279,
382
+ "[Fe-4]": 280,
383
+ "[Ga]": 281,
384
+ "[Sn+4]": 282,
385
+ "[Hg+]": 283,
386
+ "[11CH3]": 284,
387
+ "[Hf]": 285,
388
+ "[Pr]": 286,
389
+ "[Y]": 287,
390
+ "[S+2]": 288,
391
+ "[Cd]": 289,
392
+ "[Cr+6]": 290,
393
+ "[Zr+3]": 291,
394
+ "[Rh+]": 292,
395
+ "[CH3]": 293,
396
+ "[N-3]": 294,
397
+ "[Hf+2]": 295,
398
+ "[Th]": 296,
399
+ "[Sb+3]": 297,
400
+ "%14": 298,
401
+ "[Cr+2]": 299,
402
+ "[Ru+2]": 300,
403
+ "[Hf+4]": 301,
404
+ "[14C]": 302,
405
+ "[Ta]": 303,
406
+ "[Tl+]": 304,
407
+ "[B+]": 305,
408
+ "[Os+4]": 306,
409
+ "[PdH2]": 307,
410
+ "[Pd-]": 308,
411
+ "[Cd+2]": 309,
412
+ "[Co+3]": 310,
413
+ "[S+4]": 311,
414
+ "[Nb+5]": 312,
415
+ "[123I]": 313,
416
+ "[c+]": 314,
417
+ "[Rb+]": 315,
418
+ "[V+2]": 316,
419
+ "[CH3+]": 317,
420
+ "[Ag+2]": 318,
421
+ "[cH+]": 319,
422
+ "[Mn+3]": 320,
423
+ "[Se-]": 321,
424
+ "[As-]": 322,
425
+ "[Eu+3]": 323,
426
+ "[SH2]": 324,
427
+ "[Sm+3]": 325,
428
+ "[IH+]": 326,
429
+ "%15": 327,
430
+ "[OH3+]": 328,
431
+ "[PH3]": 329,
432
+ "[IH2+]": 330,
433
+ "[SH2+]": 331,
434
+ "[Ir+3]": 332,
435
+ "[AlH3]": 333,
436
+ "[Sc]": 334,
437
+ "[Yb]": 335,
438
+ "[15NH2]": 336,
439
+ "[Lu]": 337,
440
+ "[sH+]": 338,
441
+ "[Gd]": 339,
442
+ "[18F-]": 340,
443
+ "[SH3+]": 341,
444
+ "[SnH4]": 342,
445
+ "[TeH]": 343,
446
+ "[Si@@H]": 344,
447
+ "[Ga+3]": 345,
448
+ "[CaH2]": 346,
449
+ "[Tl]": 347,
450
+ "[Ta+5]": 348,
451
+ "[GeH]": 349,
452
+ "[Br+]": 350,
453
+ "[Sr]": 351,
454
+ "[Tl+3]": 352,
455
+ "[Sm+2]": 353,
456
+ "[PH5]": 354,
457
+ "%16": 355,
458
+ "[N@@+]": 356,
459
+ "[Au+3]": 357,
460
+ "[C-4]": 358,
461
+ "[Nd]": 359,
462
+ "[Ti+]": 360,
463
+ "[IH]": 361,
464
+ "[N@+]": 362,
465
+ "[125I]": 363,
466
+ "[Eu]": 364,
467
+ "[Sn+3]": 365,
468
+ "[Nb]": 366,
469
+ "[Er+3]": 367,
470
+ "[123I-]": 368,
471
+ "[14c]": 369,
472
+ "%17": 370,
473
+ "[SnH2]": 371,
474
+ "[YH]": 372,
475
+ "[Sb+5]": 373,
476
+ "[Pr+3]": 374,
477
+ "[Ir+]": 375,
478
+ "[N+3]": 376,
479
+ "[AlH2]": 377,
480
+ "[19F]": 378,
481
+ "%18": 379,
482
+ "[Tb]": 380,
483
+ "[14CH]": 381,
484
+ "[Mo+4]": 382,
485
+ "[Si+]": 383,
486
+ "[BH]": 384,
487
+ "[Be]": 385,
488
+ "[Rb]": 386,
489
+ "[pH]": 387,
490
+ "%19": 388,
491
+ "%20": 389,
492
+ "[Xe]": 390,
493
+ "[Ir-]": 391,
494
+ "[Be+2]": 392,
495
+ "[C+4]": 393,
496
+ "[RuH2]": 394,
497
+ "[15NH]": 395,
498
+ "[U+2]": 396,
499
+ "[Au-]": 397,
500
+ "%21": 398,
501
+ "%22": 399,
502
+ "[Au+]": 400,
503
+ "[15n]": 401,
504
+ "[Al+2]": 402,
505
+ "[Tb+3]": 403,
506
+ "[15N]": 404,
507
+ "[V+3]": 405,
508
+ "[W+6]": 406,
509
+ "[14CH3]": 407,
510
+ "[Cr+4]": 408,
511
+ "[ClH+]": 409,
512
+ "b": 410,
513
+ "[Ti+6]": 411,
514
+ "[Nd+]": 412,
515
+ "[Zr+]": 413,
516
+ "[PH2+]": 414,
517
+ "[Fm]": 415,
518
+ "[N@H+]": 416,
519
+ "[RuH]": 417,
520
+ "[Dy+3]": 418,
521
+ "%23": 419,
522
+ "[Hf+3]": 420,
523
+ "[W+4]": 421,
524
+ "[11C]": 422,
525
+ "[13CH]": 423,
526
+ "[Er]": 424,
527
+ "[124I]": 425,
528
+ "[LaH]": 426,
529
+ "[F]": 427,
530
+ "[siH]": 428,
531
+ "[Ga+]": 429,
532
+ "[Cm]": 430,
533
+ "[GeH3]": 431,
534
+ "[IH-]": 432,
535
+ "[U+6]": 433,
536
+ "[SeH+]": 434,
537
+ "[32P]": 435,
538
+ "[SeH-]": 436,
539
+ "[Pt-]": 437,
540
+ "[Ir+2]": 438,
541
+ "[se+]": 439,
542
+ "[U]": 440,
543
+ "[F+]": 441,
544
+ "[BH2]": 442,
545
+ "[As+]": 443,
546
+ "[Cf]": 444,
547
+ "[ClH2+]": 445,
548
+ "[Ni+]": 446,
549
+ "[TeH3]": 447,
550
+ "[SbH2]": 448,
551
+ "[Ag+3]": 449,
552
+ "%24": 450,
553
+ "[18O]": 451,
554
+ "[PH4]": 452,
555
+ "[Os+2]": 453,
556
+ "[Na-]": 454,
557
+ "[Sb+2]": 455,
558
+ "[V+4]": 456,
559
+ "[Ho+3]": 457,
560
+ "[68Ga]": 458,
561
+ "[PH-]": 459,
562
+ "[Bi+2]": 460,
563
+ "[Ce+2]": 461,
564
+ "[Pd+3]": 462,
565
+ "[99Tc]": 463,
566
+ "[13C@@H]": 464,
567
+ "[Fe+6]": 465,
568
+ "[c]": 466,
569
+ "[GeH2]": 467,
570
+ "[10B]": 468,
571
+ "[Cu+3]": 469,
572
+ "[Mo+2]": 470,
573
+ "[Cr+]": 471,
574
+ "[Pd+4]": 472,
575
+ "[Dy]": 473,
576
+ "[AsH]": 474,
577
+ "[Ba+]": 475,
578
+ "[SeH2]": 476,
579
+ "[In+]": 477,
580
+ "[TeH2]": 478,
581
+ "[BrH+]": 479,
582
+ "[14cH]": 480,
583
+ "[W+]": 481,
584
+ "[13C@H]": 482,
585
+ "[AsH2]": 483,
586
+ "[In+2]": 484,
587
+ "[N+2]": 485,
588
+ "[N@@H+]": 486,
589
+ "[SbH]": 487,
590
+ "[60Co]": 488,
591
+ "[AsH4+]": 489,
592
+ "[AsH3]": 490,
593
+ "[18OH]": 491,
594
+ "[Ru-2]": 492,
595
+ "[Na-2]": 493,
596
+ "[CuH2]": 494,
597
+ "[31P]": 495,
598
+ "[Ti+5]": 496,
599
+ "[35S]": 497,
600
+ "[P@@H]": 498,
601
+ "[ArH]": 499,
602
+ "[Co+]": 500,
603
+ "[Zr-2]": 501,
604
+ "[BH2-]": 502,
605
+ "[131I]": 503,
606
+ "[SH5]": 504,
607
+ "[VH]": 505,
608
+ "[B+2]": 506,
609
+ "[Yb+2]": 507,
610
+ "[14C@H]": 508,
611
+ "[211At]": 509,
612
+ "[NH3+2]": 510,
613
+ "[IrH]": 511,
614
+ "[IrH2]": 512,
615
+ "[Rh-]": 513,
616
+ "[Cr-]": 514,
617
+ "[Sb+]": 515,
618
+ "[Ni+3]": 516,
619
+ "[TaH3]": 517,
620
+ "[Tl+2]": 518,
621
+ "[64Cu]": 519,
622
+ "[Tc]": 520,
623
+ "[Cd+]": 521,
624
+ "[1H]": 522,
625
+ "[15nH]": 523,
626
+ "[AlH2+]": 524,
627
+ "[FH+2]": 525,
628
+ "[BiH3]": 526,
629
+ "[Ru-]": 527,
630
+ "[Mo+6]": 528,
631
+ "[AsH+]": 529,
632
+ "[BaH2]": 530,
633
+ "[BaH]": 531,
634
+ "[Fe+4]": 532,
635
+ "[229Th]": 533,
636
+ "[Th+4]": 534,
637
+ "[As+3]": 535,
638
+ "[NH+3]": 536,
639
+ "[P@H]": 537,
640
+ "[Li-]": 538,
641
+ "[7NaH]": 539,
642
+ "[Bi+]": 540,
643
+ "[PtH+2]": 541,
644
+ "[p-]": 542,
645
+ "[Re+5]": 543,
646
+ "[NiH]": 544,
647
+ "[Ni-]": 545,
648
+ "[Xe+]": 546,
649
+ "[Ca+]": 547,
650
+ "[11c]": 548,
651
+ "[Rh+4]": 549,
652
+ "[AcH]": 550,
653
+ "[HeH]": 551,
654
+ "[Sc+2]": 552,
655
+ "[Mn+]": 553,
656
+ "[UH]": 554,
657
+ "[14CH2]": 555,
658
+ "[SiH4+]": 556,
659
+ "[18OH2]": 557,
660
+ "[Ac-]": 558,
661
+ "[Re+4]": 559,
662
+ "[118Sn]": 560,
663
+ "[153Sm]": 561,
664
+ "[P+2]": 562,
665
+ "[9CH]": 563,
666
+ "[9CH3]": 564,
667
+ "[Y-]": 565,
668
+ "[NiH2]": 566,
669
+ "[Si+2]": 567,
670
+ "[Mn+6]": 568,
671
+ "[ZrH2]": 569,
672
+ "[C-2]": 570,
673
+ "[Bi+5]": 571,
674
+ "[24NaH]": 572,
675
+ "[Fr]": 573,
676
+ "[15CH]": 574,
677
+ "[Se+]": 575,
678
+ "[At]": 576,
679
+ "[P-3]": 577,
680
+ "[124I-]": 578,
681
+ "[CuH2-]": 579,
682
+ "[Nb+4]": 580,
683
+ "[Nb+3]": 581,
684
+ "[MgH]": 582,
685
+ "[Ir+4]": 583,
686
+ "[67Ga+3]": 584,
687
+ "[67Ga]": 585,
688
+ "[13N]": 586,
689
+ "[15OH2]": 587,
690
+ "[2NH]": 588,
691
+ "[Ho]": 589,
692
+ "[Cn]": 590
693
+ },
694
+ "merges": []
695
+ }
696
+ }
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": {"content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "[PAD]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "trim_offsets": true, "max_len": 512, "special_tokens_map_file": "/content/drive/My Drive/Project De Novo/Molecule Transformer/SMILES_tokenized_PubChem_shard00_100k/checkpoint-60000/special_tokens_map.json", "full_tokenizer_file": null, "name_or_path": "/content/outputs/best_model", "do_lower_case": false, "tokenizer_class": "RobertaTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"[PAD]":0,"[unused1]":1,"[unused2]":2,"[unused3]":3,"[unused4]":4,"[unused5]":5,"[unused6]":6,"[unused7]":7,"[unused8]":8,"[unused9]":9,"[unused10]":10,"[UNK]":11,"[CLS]":12,"[SEP]":13,"[MASK]":14,"c":15,"C":16,"(":17,")":18,"O":19,"1":20,"2":21,"=":22,"N":23,".":24,"n":25,"3":26,"F":27,"Cl":28,">>":29,"~":30,"-":31,"4":32,"[C@H]":33,"S":34,"[C@@H]":35,"[O-]":36,"Br":37,"#":38,"/":39,"[nH]":40,"[N+]":41,"s":42,"5":43,"o":44,"P":45,"[Na+]":46,"[Si]":47,"I":48,"[Na]":49,"[Pd]":50,"[K+]":51,"[K]":52,"[P]":53,"B":54,"[C@]":55,"[C@@]":56,"[Cl-]":57,"6":58,"[OH-]":59,"\\":60,"[N-]":61,"[Li]":62,"[H]":63,"[2H]":64,"[NH4+]":65,"[c-]":66,"[P-]":67,"[Cs+]":68,"[Li+]":69,"[Cs]":70,"[NaH]":71,"[H-]":72,"[O+]":73,"[BH4-]":74,"[Cu]":75,"7":76,"[Mg]":77,"[Fe+2]":78,"[n+]":79,"[Sn]":80,"[BH-]":81,"[Pd+2]":82,"[CH]":83,"[I-]":84,"[Br-]":85,"[C-]":86,"[Zn]":87,"[B-]":88,"[F-]":89,"[Al]":90,"[P+]":91,"[BH3-]":92,"[Fe]":93,"[C]":94,"[AlH4]":95,"[Ni]":96,"[SiH]":97,"8":98,"[Cu+2]":99,"[Mn]":100,"[AlH]":101,"[nH+]":102,"[AlH4-]":103,"[O-2]":104,"[Cr]":105,"[Mg+2]":106,"[NH3+]":107,"[S@]":108,"[Pt]":109,"[Al+3]":110,"[S@@]":111,"[S-]":112,"[Ti]":113,"[Zn+2]":114,"[PH]":115,"[NH2+]":116,"[Ru]":117,"[Ag+]":118,"[S+]":119,"[I+3]":120,"[NH+]":121,"[Ca+2]":122,"[Ag]":123,"9":124,"[Os]":125,"[Se]":126,"[SiH2]":127,"[Ca]":128,"[Ti+4]":129,"[Ac]":130,"[Cu+]":131,"[S]":132,"[Rh]":133,"[Cl+3]":134,"[cH-]":135,"[Zn+]":136,"[O]":137,"[Cl+]":138,"[SH]":139,"[H+]":140,"[Pd+]":141,"[se]":142,"[PH+]":143,"[I]":144,"[Pt+2]":145,"[C+]":146,"[Mg+]":147,"[Hg]":148,"[W]":149,"[SnH]":150,"[SiH3]":151,"[Fe+3]":152,"[NH]":153,"[Mo]":154,"[CH2+]":155,"%10":156,"[CH2-]":157,"[CH2]":158,"[n-]":159,"[Ce+4]":160,"[NH-]":161,"[Co]":162,"[I+]":163,"[PH2]":164,"[Pt+4]":165,"[Ce]":166,"[B]":167,"[Sn+2]":168,"[Ba+2]":169,"%11":170,"[Fe-3]":171,"[18F]":172,"[SH-]":173,"[Pb+2]":174,"[Os-2]":175,"[Zr+4]":176,"[N]":177,"[Ir]":178,"[Bi]":179,"[Ni+2]":180,"[P@]":181,"[Co+2]":182,"[s+]":183,"[As]":184,"[P+3]":185,"[Hg+2]":186
,"[Yb+3]":187,"[CH-]":188,"[Zr+2]":189,"[Mn+2]":190,"[CH+]":191,"[In]":192,"[KH]":193,"[Ce+3]":194,"[Zr]":195,"[AlH2-]":196,"[OH2+]":197,"[Ti+3]":198,"[Rh+2]":199,"[Sb]":200,"[S-2]":201,"%12":202,"[P@@]":203,"[Si@H]":204,"[Mn+4]":205,"p":206,"[Ba]":207,"[NH2-]":208,"[Ge]":209,"[Pb+4]":210,"[Cr+3]":211,"[Au]":212,"[LiH]":213,"[Sc+3]":214,"[o+]":215,"[Rh-3]":216,"%13":217,"[Br]":218,"[Sb-]":219,"[S@+]":220,"[I+2]":221,"[Ar]":222,"[V]":223,"[Cu-]":224,"[Al-]":225,"[Te]":226,"[13c]":227,"[13C]":228,"[Cl]":229,"[PH4+]":230,"[SiH4]":231,"[te]":232,"[CH3-]":233,"[S@@+]":234,"[Rh+3]":235,"[SH+]":236,"[Bi+3]":237,"[Br+2]":238,"[La]":239,"[La+3]":240,"[Pt-2]":241,"[N@@]":242,"[PH3+]":243,"[N@]":244,"[Si+4]":245,"[Sr+2]":246,"[Al+]":247,"[Pb]":248,"[SeH]":249,"[Si-]":250,"[V+5]":251,"[Y+3]":252,"[Re]":253,"[Ru+]":254,"[Sm]":255,"*":256,"[3H]":257,"[NH2]":258,"[Ag-]":259,"[13CH3]":260,"[OH+]":261,"[Ru+3]":262,"[OH]":263,"[Gd+3]":264,"[13CH2]":265,"[In+3]":266,"[Si@@]":267,"[Si@]":268,"[Ti+2]":269,"[Sn+]":270,"[Cl+2]":271,"[AlH-]":272,"[Pd-2]":273,"[SnH3]":274,"[B+3]":275,"[Cu-2]":276,"[Nd+3]":277,"[Pb+3]":278,"[13cH]":279,"[Fe-4]":280,"[Ga]":281,"[Sn+4]":282,"[Hg+]":283,"[11CH3]":284,"[Hf]":285,"[Pr]":286,"[Y]":287,"[S+2]":288,"[Cd]":289,"[Cr+6]":290,"[Zr+3]":291,"[Rh+]":292,"[CH3]":293,"[N-3]":294,"[Hf+2]":295,"[Th]":296,"[Sb+3]":297,"%14":298,"[Cr+2]":299,"[Ru+2]":300,"[Hf+4]":301,"[14C]":302,"[Ta]":303,"[Tl+]":304,"[B+]":305,"[Os+4]":306,"[PdH2]":307,"[Pd-]":308,"[Cd+2]":309,"[Co+3]":310,"[S+4]":311,"[Nb+5]":312,"[123I]":313,"[c+]":314,"[Rb+]":315,"[V+2]":316,"[CH3+]":317,"[Ag+2]":318,"[cH+]":319,"[Mn+3]":320,"[Se-]":321,"[As-]":322,"[Eu+3]":323,"[SH2]":324,"[Sm+3]":325,"[IH+]":326,"%15":327,"[OH3+]":328,"[PH3]":329,"[IH2+]":330,"[SH2+]":331,"[Ir+3]":332,"[AlH3]":333,"[Sc]":334,"[Yb]":335,"[15NH2]":336,"[Lu]":337,"[sH+]":338,"[Gd]":339,"[18F-]":340,"[SH3+]":341,"[SnH4]":342,"[TeH]":343,"[Si@@H]":344,"[Ga+3]":345,"[CaH2]":346,"[Tl]":347,"[Ta+5]":348,"[GeH]":349,"[Br+]":350,
"[Sr]":351,"[Tl+3]":352,"[Sm+2]":353,"[PH5]":354,"%16":355,"[N@@+]":356,"[Au+3]":357,"[C-4]":358,"[Nd]":359,"[Ti+]":360,"[IH]":361,"[N@+]":362,"[125I]":363,"[Eu]":364,"[Sn+3]":365,"[Nb]":366,"[Er+3]":367,"[123I-]":368,"[14c]":369,"%17":370,"[SnH2]":371,"[YH]":372,"[Sb+5]":373,"[Pr+3]":374,"[Ir+]":375,"[N+3]":376,"[AlH2]":377,"[19F]":378,"%18":379,"[Tb]":380,"[14CH]":381,"[Mo+4]":382,"[Si+]":383,"[BH]":384,"[Be]":385,"[Rb]":386,"[pH]":387,"%19":388,"%20":389,"[Xe]":390,"[Ir-]":391,"[Be+2]":392,"[C+4]":393,"[RuH2]":394,"[15NH]":395,"[U+2]":396,"[Au-]":397,"%21":398,"%22":399,"[Au+]":400,"[15n]":401,"[Al+2]":402,"[Tb+3]":403,"[15N]":404,"[V+3]":405,"[W+6]":406,"[14CH3]":407,"[Cr+4]":408,"[ClH+]":409,"b":410,"[Ti+6]":411,"[Nd+]":412,"[Zr+]":413,"[PH2+]":414,"[Fm]":415,"[N@H+]":416,"[RuH]":417,"[Dy+3]":418,"%23":419,"[Hf+3]":420,"[W+4]":421,"[11C]":422,"[13CH]":423,"[Er]":424,"[124I]":425,"[LaH]":426,"[F]":427,"[siH]":428,"[Ga+]":429,"[Cm]":430,"[GeH3]":431,"[IH-]":432,"[U+6]":433,"[SeH+]":434,"[32P]":435,"[SeH-]":436,"[Pt-]":437,"[Ir+2]":438,"[se+]":439,"[U]":440,"[F+]":441,"[BH2]":442,"[As+]":443,"[Cf]":444,"[ClH2+]":445,"[Ni+]":446,"[TeH3]":447,"[SbH2]":448,"[Ag+3]":449,"%24":450,"[18O]":451,"[PH4]":452,"[Os+2]":453,"[Na-]":454,"[Sb+2]":455,"[V+4]":456,"[Ho+3]":457,"[68Ga]":458,"[PH-]":459,"[Bi+2]":460,"[Ce+2]":461,"[Pd+3]":462,"[99Tc]":463,"[13C@@H]":464,"[Fe+6]":465,"[c]":466,"[GeH2]":467,"[10B]":468,"[Cu+3]":469,"[Mo+2]":470,"[Cr+]":471,"[Pd+4]":472,"[Dy]":473,"[AsH]":474,"[Ba+]":475,"[SeH2]":476,"[In+]":477,"[TeH2]":478,"[BrH+]":479,"[14cH]":480,"[W+]":481,"[13C@H]":482,"[AsH2]":483,"[In+2]":484,"[N+2]":485,"[N@@H+]":486,"[SbH]":487,"[60Co]":488,"[AsH4+]":489,"[AsH3]":490,"[18OH]":491,"[Ru-2]":492,"[Na-2]":493,"[CuH2]":494,"[31P]":495,"[Ti+5]":496,"[35S]":497,"[P@@H]":498,"[ArH]":499,"[Co+]":500,"[Zr-2]":501,"[BH2-]":502,"[131I]":503,"[SH5]":504,"[VH]":505,"[B+2]":506,"[Yb+2]":507,"[14C@H]":508,"[211At]":509,"[NH3+2]":510,"[IrH]":511,"[IrH2]":512,"[Rh-]":513,"[Cr-]"
:514,"[Sb+]":515,"[Ni+3]":516,"[TaH3]":517,"[Tl+2]":518,"[64Cu]":519,"[Tc]":520,"[Cd+]":521,"[1H]":522,"[15nH]":523,"[AlH2+]":524,"[FH+2]":525,"[BiH3]":526,"[Ru-]":527,"[Mo+6]":528,"[AsH+]":529,"[BaH2]":530,"[BaH]":531,"[Fe+4]":532,"[229Th]":533,"[Th+4]":534,"[As+3]":535,"[NH+3]":536,"[P@H]":537,"[Li-]":538,"[7NaH]":539,"[Bi+]":540,"[PtH+2]":541,"[p-]":542,"[Re+5]":543,"[NiH]":544,"[Ni-]":545,"[Xe+]":546,"[Ca+]":547,"[11c]":548,"[Rh+4]":549,"[AcH]":550,"[HeH]":551,"[Sc+2]":552,"[Mn+]":553,"[UH]":554,"[14CH2]":555,"[SiH4+]":556,"[18OH2]":557,"[Ac-]":558,"[Re+4]":559,"[118Sn]":560,"[153Sm]":561,"[P+2]":562,"[9CH]":563,"[9CH3]":564,"[Y-]":565,"[NiH2]":566,"[Si+2]":567,"[Mn+6]":568,"[ZrH2]":569,"[C-2]":570,"[Bi+5]":571,"[24NaH]":572,"[Fr]":573,"[15CH]":574,"[Se+]":575,"[At]":576,"[P-3]":577,"[124I-]":578,"[CuH2-]":579,"[Nb+4]":580,"[Nb+3]":581,"[MgH]":582,"[Ir+4]":583,"[67Ga+3]":584,"[67Ga]":585,"[13N]":586,"[15OH2]":587,"[2NH]":588,"[Ho]":589,"[Cn]":590}