Transformers
modernbert-base-chocolm / tokenizer.json
n28div's picture
Upload tokenizer
599335f verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<|padding|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<|endoftext|>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 5,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "[MASK]",
"single_word": false,
"lstrip": true,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "NFC"
},
"pre_tokenizer": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
3
],
"tokens": [
"[CLS]"
]
},
"[MASK]": {
"id": "[MASK]",
"ids": [
6
],
"tokens": [
"[MASK]"
]
},
"[PAD]": {
"id": "[PAD]",
"ids": [
5
],
"tokens": [
"[PAD]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
4
],
"tokens": [
"[SEP]"
]
},
"[UNK]": {
"id": "[UNK]",
"ids": [
2
],
"tokens": [
"[UNK]"
]
}
}
},
"decoder": {
"type": "ByteLevel",
"add_prefix_space": false,
"trim_offsets": true,
"use_regex": true
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<|padding|>": 0,
"<|endoftext|>": 1,
"[UNK]": 2,
"[CLS]": 3,
"[SEP]": 4,
"[PAD]": 5,
"[MASK]": 6,
"!": 7,
"\"": 8,
"#": 9,
"$": 10,
"%": 11,
"&": 12,
"'": 13,
"(": 14,
")": 15,
"*": 16,
"+": 17,
",": 18,
"-": 19,
".": 20,
"/": 21,
"0": 22,
"1": 23,
"2": 24,
"3": 25,
"4": 26,
"5": 27,
"6": 28,
"7": 29,
"8": 30,
"9": 31,
":": 32,
";": 33,
"<": 34,
"=": 35,
">": 36,
"?": 37,
"@": 38,
"A": 39,
"B": 40,
"C": 41,
"D": 42,
"E": 43,
"F": 44,
"G": 45,
"H": 46,
"I": 47,
"J": 48,
"K": 49,
"L": 50,
"M": 51,
"N": 52,
"O": 53,
"P": 54,
"Q": 55,
"R": 56,
"S": 57,
"T": 58,
"U": 59,
"V": 60,
"W": 61,
"X": 62,
"Y": 63,
"Z": 64,
"[": 65,
"\\": 66,
"]": 67,
"^": 68,
"_": 69,
"`": 70,
"a": 71,
"b": 72,
"c": 73,
"d": 74,
"e": 75,
"f": 76,
"g": 77,
"h": 78,
"i": 79,
"j": 80,
"k": 81,
"l": 82,
"m": 83,
"n": 84,
"o": 85,
"p": 86,
"q": 87,
"r": 88,
"s": 89,
"t": 90,
"u": 91,
"v": 92,
"w": 93,
"x": 94,
"y": 95,
"z": 96,
"{": 97,
"|": 98,
"}": 99,
"~": 100,
"¡": 101,
"¢": 102,
"£": 103,
"¤": 104,
"¥": 105,
"¦": 106,
"§": 107,
"¨": 108,
"©": 109,
"ª": 110,
"«": 111,
"¬": 112,
"®": 113,
"¯": 114,
"°": 115,
"±": 116,
"²": 117,
"³": 118,
"´": 119,
"µ": 120,
"¶": 121,
"·": 122,
"¸": 123,
"¹": 124,
"º": 125,
"»": 126,
"¼": 127,
"½": 128,
"¾": 129,
"¿": 130,
"À": 131,
"Á": 132,
"Â": 133,
"Ã": 134,
"Ä": 135,
"Å": 136,
"Æ": 137,
"Ç": 138,
"È": 139,
"É": 140,
"Ê": 141,
"Ë": 142,
"Ì": 143,
"Í": 144,
"Î": 145,
"Ï": 146,
"Ð": 147,
"Ñ": 148,
"Ò": 149,
"Ó": 150,
"Ô": 151,
"Õ": 152,
"Ö": 153,
"×": 154,
"Ø": 155,
"Ù": 156,
"Ú": 157,
"Û": 158,
"Ü": 159,
"Ý": 160,
"Þ": 161,
"ß": 162,
"à": 163,
"á": 164,
"â": 165,
"ã": 166,
"ä": 167,
"å": 168,
"æ": 169,
"ç": 170,
"è": 171,
"é": 172,
"ê": 173,
"ë": 174,
"ì": 175,
"í": 176,
"î": 177,
"ï": 178,
"ð": 179,
"ñ": 180,
"ò": 181,
"ó": 182,
"ô": 183,
"õ": 184,
"ö": 185,
"÷": 186,
"ø": 187,
"ù": 188,
"ú": 189,
"û": 190,
"ü": 191,
"ý": 192,
"þ": 193,
"ÿ": 194,
"Ā": 195,
"ā": 196,
"Ă": 197,
"ă": 198,
"Ą": 199,
"ą": 200,
"Ć": 201,
"ć": 202,
"Ĉ": 203,
"ĉ": 204,
"Ċ": 205,
"ċ": 206,
"Č": 207,
"č": 208,
"Ď": 209,
"ď": 210,
"Đ": 211,
"đ": 212,
"Ē": 213,
"ē": 214,
"Ĕ": 215,
"ĕ": 216,
"Ė": 217,
"ė": 218,
"Ę": 219,
"ę": 220,
"Ě": 221,
"ě": 222,
"Ĝ": 223,
"ĝ": 224,
"Ğ": 225,
"ğ": 226,
"Ġ": 227,
"ġ": 228,
"Ģ": 229,
"ģ": 230,
"Ĥ": 231,
"ĥ": 232,
"Ħ": 233,
"ħ": 234,
"Ĩ": 235,
"ĩ": 236,
"Ī": 237,
"ī": 238,
"Ĭ": 239,
"ĭ": 240,
"Į": 241,
"į": 242,
"İ": 243,
"ı": 244,
"Ĳ": 245,
"ĳ": 246,
"Ĵ": 247,
"ĵ": 248,
"Ķ": 249,
"ķ": 250,
"ĸ": 251,
"Ĺ": 252,
"ĺ": 253,
"Ļ": 254,
"ļ": 255,
"Ľ": 256,
"ľ": 257,
"Ŀ": 258,
"ŀ": 259,
"Ł": 260,
"ł": 261,
"Ń": 262,
"aj": 263,
"maj": 264,
"in": 265,
"min": 266,
"ĠG": 267,
"ĠC": 268,
"ĠD": 269,
"ĠF": 270,
":(": 271,
"ĠA": 272,
"ĠE": 273,
"ĠB": 274,
"ĠBb": 275,
"ĠEb": 276,
"di": 277,
"dim": 278,
"#:": 279,
"ĠAb": 280,
")/": 281,
"11": 282,
"ĠDb": 283,
",*": 284,
",#": 285,
"13": 286,
":(*": 287,
"hdim": 288,
"ĠGb": 289,
"su": 290,
"sus": 291,
"(#": 292,
"au": 293,
"aug": 294,
"bb": 295,
"#:(": 296,
"Bb": 297,
"Eb": 298,
"ĠN": 299,
"##:": 300,
"ĠCb": 301,
"Ab": 302,
"#:(*": 303,
"/#": 304,
")/#": 305,
"ĠFb": 306,
"minmaj": 307,
"Db": 308,
"ĠBbb": 309,
"(*": 310,
"##:(": 311,
"Gb": 312,
"###:": 313,
"ĠEbb": 314,
"#/": 315,
":(#": 316,
"ĠAbb": 317,
"ĠDbb": 318,
"##:(*": 319,
"ĠGbb": 320,
"ĠFbb": 321,
"Cb": 322,
",##": 323,
"ĠCbb": 324,
"ĠEbbb": 325,
"ĠAbbb": 326
},
"merges": [
[
"a",
"j"
],
[
"m",
"aj"
],
[
"i",
"n"
],
[
"m",
"in"
],
[
"Ġ",
"G"
],
[
"Ġ",
"C"
],
[
"Ġ",
"D"
],
[
"Ġ",
"F"
],
[
":",
"("
],
[
"Ġ",
"A"
],
[
"Ġ",
"E"
],
[
"Ġ",
"B"
],
[
"ĠB",
"b"
],
[
"ĠE",
"b"
],
[
"d",
"i"
],
[
"di",
"m"
],
[
"#",
":"
],
[
"ĠA",
"b"
],
[
")",
"/"
],
[
"1",
"1"
],
[
"ĠD",
"b"
],
[
",",
"*"
],
[
",",
"#"
],
[
"1",
"3"
],
[
":(",
"*"
],
[
"h",
"dim"
],
[
"ĠG",
"b"
],
[
"s",
"u"
],
[
"su",
"s"
],
[
"(",
"#"
],
[
"a",
"u"
],
[
"au",
"g"
],
[
"b",
"b"
],
[
"#",
":("
],
[
"B",
"b"
],
[
"E",
"b"
],
[
"Ġ",
"N"
],
[
"#",
"#:"
],
[
"ĠC",
"b"
],
[
"A",
"b"
],
[
"#",
":(*"
],
[
"/",
"#"
],
[
")/",
"#"
],
[
"ĠF",
"b"
],
[
"min",
"maj"
],
[
"D",
"b"
],
[
"ĠBb",
"b"
],
[
"(",
"*"
],
[
"#",
"#:("
],
[
"G",
"b"
],
[
"#",
"##:"
],
[
"ĠEb",
"b"
],
[
"#",
"/"
],
[
":(",
"#"
],
[
"ĠAb",
"b"
],
[
"ĠDb",
"b"
],
[
"#",
"#:(*"
],
[
"ĠGb",
"b"
],
[
"ĠF",
"bb"
],
[
"C",
"b"
],
[
",#",
"#"
],
[
"ĠC",
"bb"
],
[
"ĠEb",
"bb"
],
[
"ĠAb",
"bb"
]
]
}
}