Transformers
MechSMILES_tokenizer / tokenizer.json
Volowan's picture
Upload tokenizer
e8318f3 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[cls]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 255,
"content": "[bos]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 256,
"content": "[pad]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 257,
"content": "[eos]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 258,
"content": "[sep]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 259,
"content": "[unk]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Sequence",
"pretokenizers": [
{
"type": "WhitespaceSplit"
},
{
"type": "Split",
"pattern": {
"Regex": "(\\[[a-z]+]|\\[[A-Z][a-z]?|Br?|Cl?|N|O|S|P|F|I|H[2-4]?|\\[|\\]|,|;|\\(|\\)|\\.|=|\\#|-|\\+|\\\\|/|:|~|@|\\?|>|\\*|\\$|%|(?<=%)[0-9]{2}|(?<=,)[0-9]{2}(?=\\))|(?<=\\()[0-9]{2}(?=,)|(?<=:)[0-9]{2}(?=\\])|[0-9]|\\|)"
},
"behavior": "Isolated",
"invert": false
}
]
},
"post_processor": null,
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"[cls]": 0,
"[H": 1,
"[He": 2,
"[Li": 3,
"[Be": 4,
"[B": 5,
"[C": 6,
"[N": 7,
"[O": 8,
"[F": 9,
"[Ne": 10,
"[Na": 11,
"[Mg": 12,
"[Al": 13,
"[Si": 14,
"[P": 15,
"[S": 16,
"[Cl": 17,
"[Ar": 18,
"[K": 19,
"[Ca": 20,
"[Sc": 21,
"[Ti": 22,
"[V": 23,
"[Cr": 24,
"[Mn": 25,
"[Fe": 26,
"[Co": 27,
"[Ni": 28,
"[Cu": 29,
"[Zn": 30,
"[Ga": 31,
"[Ge": 32,
"[As": 33,
"[Se": 34,
"[Br": 35,
"[Kr": 36,
"[Rb": 37,
"[Sr": 38,
"[Y": 39,
"[Zr": 40,
"[Nb": 41,
"[Mo": 42,
"[Tc": 43,
"[Ru": 44,
"[Rh": 45,
"[Pd": 46,
"[Ag": 47,
"[Cd": 48,
"[In": 49,
"[Sn": 50,
"[Sb": 51,
"[Te": 52,
"[I": 53,
"[Xe": 54,
"[Cs": 55,
"[Ba": 56,
"[La": 57,
"[Ce": 58,
"[Pr": 59,
"[Nd": 60,
"[Pm": 61,
"[Sm": 62,
"[Eu": 63,
"[Gd": 64,
"[Tb": 65,
"[Dy": 66,
"[Ho": 67,
"[Er": 68,
"[Tm": 69,
"[Yb": 70,
"[Lu": 71,
"[Hf": 72,
"[Ta": 73,
"[W": 74,
"[Re": 75,
"[Os": 76,
"[Ir": 77,
"[Pt": 78,
"[Au": 79,
"[Hg": 80,
"[Tl": 81,
"[Pb": 82,
"[Bi": 83,
"[Po": 84,
"[At": 85,
"[Rn": 86,
"[Fr": 87,
"[Ra": 88,
"[Ac": 89,
"[Th": 90,
"[Pa": 91,
"[U": 92,
"[Np": 93,
"[Pu": 94,
"[Am": 95,
"[Cm": 96,
"[Bk": 97,
"[Cf": 98,
"[Es": 99,
"[Fm": 100,
"[Md": 101,
"[No": 102,
"[Lr": 103,
"[Rf": 104,
"[Db": 105,
"[Sg": 106,
"[Bh": 107,
"[Hs": 108,
"[Mt": 109,
"[Ds": 110,
"[Rg": 111,
"[Cn": 112,
"[Nh": 113,
"[Fl": 114,
"[Mc": 115,
"[Lv": 116,
"[Ts": 117,
"[Og": 118,
"C": 119,
"Cl": 120,
"B": 121,
"Br": 122,
"N": 123,
"O": 124,
"S": 125,
"P": 126,
"F": 127,
"I": 128,
"H": 129,
"H2": 130,
"H3": 131,
"H4": 132,
"[": 133,
"]": 134,
":": 135,
"=": 136,
"#": 137,
"$": 138,
"\\": 139,
"/": 140,
"(": 141,
")": 142,
".": 143,
"[prod]": 144,
"[reac]": 145,
"[mech]": 146,
"+": 147,
"-": 148,
"@": 149,
"*": 150,
"0": 151,
"1": 152,
"2": 153,
"3": 154,
"4": 155,
"5": 156,
"6": 157,
"7": 158,
"8": 159,
"9": 160,
"10": 161,
"11": 162,
"12": 163,
"13": 164,
"14": 165,
"15": 166,
"16": 167,
"17": 168,
"18": 169,
"19": 170,
"20": 171,
"21": 172,
"22": 173,
"23": 174,
"24": 175,
"25": 176,
"26": 177,
"27": 178,
"28": 179,
"29": 180,
"30": 181,
"31": 182,
"32": 183,
"33": 184,
"34": 185,
"35": 186,
"36": 187,
"37": 188,
"38": 189,
"39": 190,
"40": 191,
"41": 192,
"42": 193,
"43": 194,
"44": 195,
"45": 196,
"46": 197,
"47": 198,
"48": 199,
"49": 200,
"50": 201,
"51": 202,
"52": 203,
"53": 204,
"54": 205,
"55": 206,
"56": 207,
"57": 208,
"58": 209,
"59": 210,
"60": 211,
"61": 212,
"62": 213,
"63": 214,
"64": 215,
"65": 216,
"66": 217,
"67": 218,
"68": 219,
"69": 220,
"70": 221,
"71": 222,
"72": 223,
"73": 224,
"74": 225,
"75": 226,
"76": 227,
"77": 228,
"78": 229,
"79": 230,
"80": 231,
"81": 232,
"82": 233,
"83": 234,
"84": 235,
"85": 236,
"86": 237,
"87": 238,
"88": 239,
"89": 240,
"90": 241,
"91": 242,
"92": 243,
"93": 244,
"94": 245,
"95": 246,
"96": 247,
"97": 248,
"98": 249,
"99": 250,
"%": 251,
",": 252,
";": 253,
"|": 254,
"[bos]": 255,
"[pad]": 256,
"[eos]": 257,
"[sep]": 258,
"[unk]": 259
},
"unk_token": "[unk]"
}
}