character-level-tokenizer / tokenizer.json
lhy's picture
Upload tokenizer
9e262c9
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 5,
"content": "\u0001",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 6,
"content": "\u0002",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 7,
"content": "\u0003",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 8,
"content": "\u0004",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 9,
"content": "\u0005",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": false,
"strip_accents": true,
"lowercase": false
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
1
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
2
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[UNK]": 0,
"[CLS]": 1,
"[SEP]": 2,
"[PAD]": 3,
"[MASK]": 4,
"\u0001": 5,
"\u0002": 6,
"\u0003": 7,
"\u0004": 8,
"\u0005": 9,
"!": 10,
"\"": 11,
"#": 12,
"$": 13,
"%": 14,
"&": 15,
"'": 16,
"(": 17,
")": 18,
"*": 19,
"+": 20,
",": 21,
"-": 22,
".": 23,
"/": 24,
"0": 25,
"1": 26,
"2": 27,
"3": 28,
"4": 29,
"5": 30,
"6": 31,
"7": 32,
"8": 33,
"9": 34,
":": 35,
";": 36,
"<": 37,
"=": 38,
">": 39,
"?": 40,
"@": 41,
"[": 42,
"\\": 43,
"]": 44,
"^": 45,
"_": 46,
"`": 47,
"a": 48,
"b": 49,
"c": 50,
"d": 51,
"e": 52,
"f": 53,
"g": 54,
"h": 55,
"i": 56,
"j": 57,
"k": 58,
"l": 59,
"m": 60,
"n": 61,
"o": 62,
"p": 63,
"q": 64,
"r": 65,
"s": 66,
"t": 67,
"u": 68,
"v": 69,
"w": 70,
"x": 71,
"y": 72,
"z": 73,
"{": 74,
"|": 75,
"}": 76,
"~": 77,
"¡": 78,
"¢": 79,
"£": 80,
"¥": 81,
"§": 82,
"¯": 83,
"µ": 84,
"º": 85,
"»": 86,
"¿": 87,
"À": 88,
"Â": 89,
"Ã": 90,
"Ä": 91,
"Å": 92,
"Ç": 93,
"Ë": 94,
"Í": 95,
"Î": 96,
"Ï": 97,
"Ñ": 98,
"Ó": 99,
"Ø": 100,
"Ù": 101,
"Ú": 102,
"Ü": 103,
"ß": 104,
"à": 105,
"á": 106,
"â": 107,
"ã": 108,
"ä": 109,
"å": 110,
"æ": 111,
"ç": 112,
"è": 113,
"é": 114,
"ê": 115,
"ë": 116,
"ì": 117,
"í": 118,
"î": 119,
"ï": 120,
"ñ": 121,
"ò": 122,
"ó": 123,
"ô": 124,
"õ": 125,
"ö": 126,
"ø": 127,
"ù": 128,
"ú": 129,
"û": 130,
"ü": 131,
"ý": 132,
"þ": 133,
"ā": 134,
"ă": 135,
"ą": 136,
"ć": 137,
"č": 138,
"ď": 139,
"đ": 140,
"ē": 141,
"ĕ": 142,
"ė": 143,
"Ę": 144,
"ę": 145,
"ě": 146,
"ġ": 147,
"ģ": 148,
"ĩ": 149,
"ī": 150,
"ĭ": 151,
"İ": 152,
"ı": 153,
"ĵ": 154,
"ķ": 155,
"ĸ": 156,
"ĺ": 157,
"ł": 158,
"ń": 159,
"ň": 160,
"ʼn": 161,
"ŋ": 162,
"ō": 163,
"ŏ": 164,
"ő": 165,
"œ": 166,
"ŕ": 167,
"ŗ": 168,
"Ř": 169,
"ř": 170,
"Ś": 171,
"ś": 172,
"Ş": 173,
"ş": 174,
"š": 175,
"ţ": 176,
"Ť": 177,
"ť": 178,
"ũ": 179,
"ū": 180,
"ŭ": 181,
"ű": 182,
"ų": 183,
"ŵ": 184,
"Ÿ": 185,
"ż": 186,
"ž": 187,
"ƀ": 188,
"Ɓ": 189,
"Ƅ": 190,
"ƅ": 191,
"Ƈ": 192,
"ƒ": 193,
"ƙ": 194,
"ƞ": 195,
"Ƭ": 196,
"Ư": 197,
"Ƴ": 198,
"Ǐ": 199,
"Ƿ": 200,
"ǹ": 201,
"ȋ": 202,
"ș": 203,
"ț": 204,
"ȧ": 205,
"ȯ": 206,
"Ʌ": 207,
"ɑ": 208,
"ɗ": 209,
"ɠ": 210,
"ɡ": 211,
"ɢ": 212,
"ɣ": 213,
"ɩ": 214,
"ɪ": 215,
"ɭ": 216,
"ɯ": 217,
"ɱ": 218,
"ɳ": 219,
"ɴ": 220,
"ɺ": 221,
"ɼ": 222,
"ɾ": 223,
"ʀ": 224,
"ʂ": 225,
"ʄ": 226,
"ʋ": 227,
"ʌ": 228,
"ʍ": 229,
"ʏ": 230,
"ʙ": 231,
"ʜ": 232,
"ʝ": 233,
"ʟ": 234,
"ʨ": 235,
"˄": 236,
"Α": 237,
"Β": 238,
"Ε": 239,
"Ζ": 240,
"Η": 241,
"Ι": 242,
"Κ": 243,
"Μ": 244,
"Ν": 245,
"Ο": 246,
"Ρ": 247,
"Τ": 248,
"Υ": 249,
"Χ": 250,
"ί": 251,
"α": 252,
"β": 253,
"γ": 254,
"η": 255,
"ι": 256,
"κ": 257,
"μ": 258,
"ν": 259,
"ο": 260,
"π": 261,
"ρ": 262,
"σ": 263,
"τ": 264,
"υ": 265,
"χ": 266,
"ω": 267,
"ϲ": 268,
"ϳ": 269,
"Ϲ": 270,
"Ϻ": 271,
"Ѕ": 272,
"Ј": 273,
"А": 274,
"В": 275,
"Е": 276,
"З": 277,
"К": 278,
"М": 279,
"Н": 280,
"О": 281,
"Р": 282,
"С": 283,
"Т": 284,
"У": 285,
"Х": 286,
"Ь": 287,
"а": 288,
"в": 289,
"г": 290,
"д": 291,
"е": 292,
"и": 293,
"к": 294,
"л": 295,
"н": 296,
"о": 297,
"п": 298,
"р": 299,
"с": 300,
"т": 301,
"у": 302,
"х": 303,
"ч": 304,
"ш": 305,
"щ": 306,
"ѐ": 307,
"ё": 308,
"ѕ": 309,
"і": 310,
"ј": 311,
"џ": 312,
"ѡ": 313,
"Ѵ": 314,
"ѵ": 315,
"ҏ": 316,
"қ": 317,
"ҡ": 318,
"ң": 319,
"ҥ": 320,
"Ү": 321,
"ү": 322,
"ҳ": 323,
"һ": 324,
"ҽ": 325,
"ӏ": 326,
"ԁ": 327,
"ԛ": 328,
"Ա": 329,
"Ի": 330,
"Ս": 331,
"Տ": 332,
"Օ": 333,
"ա": 334,
"գ": 335,
"զ": 336,
"ժ": 337,
"հ": 338,
"յ": 339,
"ս": 340,
"օ": 341,
"Ⴍ": 342,
"Ⴓ": 343,
"Ⴝ": 344,
"Ꭰ": 345,
"Ꭲ": 346,
"Ꭵ": 347,
"Ꭺ": 348,
"Ꭻ": 349,
"Ꮃ": 350,
"Ꮇ": 351,
"Ꮋ": 352,
"Ꮐ": 353,
"Ꮓ": 354,
"Ꮢ": 355,
"Ꮩ": 356,
"Ꮪ": 357,
"Ꮮ": 358,
"Ꮯ": 359,
"Ꮲ": 360,
"Ꮶ": 361,
"Ᏼ": 362,
"ᚱ": 363,
"ᛁ": 364,
"ᛒ": 365,
"ᛕ": 366,
"ᛖ": 367,
"ᴄ": 368,
"ᴇ": 369,
"ᴋ": 370,
"ᴍ": 371,
"ᴏ": 372,
"ᴑ": 373,
"ᴜ": 374,
"ᴠ": 375,
"ᴡ": 376,
"ᴦ": 377,
"ᴨ": 378,
"ᴺ": 379,
"ᴼ": 380,
"ᴾ": 381,
"ᴿ": 382,
"ḟ": 383,
"ḱ": 384,
"ḿ": 385,
"ṁ": 386,
"ṅ": 387,
"Ṛ": 388,
"ṡ": 389,
"ẁ": 390,
"ẃ": 391,
"ẇ": 392,
"ἀ": 393,
"ἁ": 394,
"ἇ": 395,
"ἰ": 396,
"ἱ": 397,
"ἳ": 398,
"ὀ": 399,
"ὁ": 400,
"ὶ": 401,
"ί": 402,
"ῤ": 403,
"ῥ": 404,
"―": 405,
"₩": 406,
"€": 407,
"₿": 408,
"ℹ": 409,
"⋃": 410,
"𝘼": 411,
"𝘾": 412,
"𝘿": 413,
"𝙀": 414,
"𝙍": 415,
"𝙏": 416,
"##\u0001": 417,
"##\u0002": 418,
"##\u0003": 419,
"##\u0004": 420,
"##\u0005": 421,
"##!": 422,
"##\"": 423,
"###": 424,
"##$": 425,
"##%": 426,
"##&": 427,
"##'": 428,
"##(": 429,
"##)": 430,
"##*": 431,
"##+": 432,
"##,": 433,
"##-": 434,
"##.": 435,
"##/": 436,
"##0": 437,
"##1": 438,
"##2": 439,
"##3": 440,
"##4": 441,
"##5": 442,
"##6": 443,
"##7": 444,
"##8": 445,
"##9": 446,
"##:": 447,
"##;": 448,
"##<": 449,
"##=": 450,
"##>": 451,
"##?": 452,
"##@": 453,
"##[": 454,
"##\\": 455,
"##]": 456,
"##^": 457,
"##_": 458,
"##`": 459,
"##a": 460,
"##b": 461,
"##c": 462,
"##d": 463,
"##e": 464,
"##f": 465,
"##g": 466,
"##h": 467,
"##i": 468,
"##j": 469,
"##k": 470,
"##l": 471,
"##m": 472,
"##n": 473,
"##o": 474,
"##p": 475,
"##q": 476,
"##r": 477,
"##s": 478,
"##t": 479,
"##u": 480,
"##v": 481,
"##w": 482,
"##x": 483,
"##y": 484,
"##z": 485,
"##{": 486,
"##|": 487,
"##}": 488,
"##~": 489,
"##¡": 490,
"##¢": 491,
"##£": 492,
"##¥": 493,
"##§": 494,
"##¯": 495,
"##µ": 496,
"##º": 497,
"##»": 498,
"##¿": 499,
"##À": 500,
"##Â": 501,
"##Ã": 502,
"##Ä": 503,
"##Å": 504,
"##Ç": 505,
"##Ë": 506,
"##Í": 507,
"##Î": 508,
"##Ï": 509,
"##Ñ": 510,
"##Ó": 511,
"##Ø": 512,
"##Ù": 513,
"##Ú": 514,
"##Ü": 515,
"##ß": 516,
"##à": 517,
"##á": 518,
"##â": 519,
"##ã": 520,
"##ä": 521,
"##å": 522,
"##æ": 523,
"##ç": 524,
"##è": 525,
"##é": 526,
"##ê": 527,
"##ë": 528,
"##ì": 529,
"##í": 530,
"##î": 531,
"##ï": 532,
"##ñ": 533,
"##ò": 534,
"##ó": 535,
"##ô": 536,
"##õ": 537,
"##ö": 538,
"##ø": 539,
"##ù": 540,
"##ú": 541,
"##û": 542,
"##ü": 543,
"##ý": 544,
"##þ": 545,
"##ā": 546,
"##ă": 547,
"##ą": 548,
"##ć": 549,
"##č": 550,
"##ď": 551,
"##đ": 552,
"##ē": 553,
"##ĕ": 554,
"##ė": 555,
"##Ę": 556,
"##ę": 557,
"##ě": 558,
"##ġ": 559,
"##ģ": 560,
"##ĩ": 561,
"##ī": 562,
"##ĭ": 563,
"##İ": 564,
"##ı": 565,
"##ĵ": 566,
"##ķ": 567,
"##ĸ": 568,
"##ĺ": 569,
"##ł": 570,
"##ń": 571,
"##ň": 572,
"##ʼn": 573,
"##ŋ": 574,
"##ō": 575,
"##ŏ": 576,
"##ő": 577,
"##œ": 578,
"##ŕ": 579,
"##ŗ": 580,
"##Ř": 581,
"##ř": 582,
"##Ś": 583,
"##ś": 584,
"##Ş": 585,
"##ş": 586,
"##š": 587,
"##ţ": 588,
"##Ť": 589,
"##ť": 590,
"##ũ": 591,
"##ū": 592,
"##ŭ": 593,
"##ű": 594,
"##ų": 595,
"##ŵ": 596,
"##Ÿ": 597,
"##ż": 598,
"##ž": 599,
"##ƀ": 600,
"##Ɓ": 601,
"##Ƅ": 602,
"##ƅ": 603,
"##Ƈ": 604,
"##ƒ": 605,
"##ƙ": 606,
"##ƞ": 607,
"##Ƭ": 608,
"##Ư": 609,
"##Ƴ": 610,
"##Ǐ": 611,
"##Ƿ": 612,
"##ǹ": 613,
"##ȋ": 614,
"##ș": 615,
"##ț": 616,
"##ȧ": 617,
"##ȯ": 618,
"##Ʌ": 619,
"##ɑ": 620,
"##ɗ": 621,
"##ɠ": 622,
"##ɡ": 623,
"##ɢ": 624,
"##ɣ": 625,
"##ɩ": 626,
"##ɪ": 627,
"##ɭ": 628,
"##ɯ": 629,
"##ɱ": 630,
"##ɳ": 631,
"##ɴ": 632,
"##ɺ": 633,
"##ɼ": 634,
"##ɾ": 635,
"##ʀ": 636,
"##ʂ": 637,
"##ʄ": 638,
"##ʋ": 639,
"##ʌ": 640,
"##ʍ": 641,
"##ʏ": 642,
"##ʙ": 643,
"##ʜ": 644,
"##ʝ": 645,
"##ʟ": 646,
"##ʨ": 647,
"##˄": 648,
"##Α": 649,
"##Β": 650,
"##Ε": 651,
"##Ζ": 652,
"##Η": 653,
"##Ι": 654,
"##Κ": 655,
"##Μ": 656,
"##Ν": 657,
"##Ο": 658,
"##Ρ": 659,
"##Τ": 660,
"##Υ": 661,
"##Χ": 662,
"##ί": 663,
"##α": 664,
"##β": 665,
"##γ": 666,
"##η": 667,
"##ι": 668,
"##κ": 669,
"##μ": 670,
"##ν": 671,
"##ο": 672,
"##π": 673,
"##ρ": 674,
"##σ": 675,
"##τ": 676,
"##υ": 677,
"##χ": 678,
"##ω": 679,
"##ϲ": 680,
"##ϳ": 681,
"##Ϲ": 682,
"##Ϻ": 683,
"##Ѕ": 684,
"##Ј": 685,
"##А": 686,
"##В": 687,
"##Е": 688,
"##З": 689,
"##К": 690,
"##М": 691,
"##Н": 692,
"##О": 693,
"##Р": 694,
"##С": 695,
"##Т": 696,
"##У": 697,
"##Х": 698,
"##Ь": 699,
"##а": 700,
"##в": 701,
"##г": 702,
"##д": 703,
"##е": 704,
"##и": 705,
"##к": 706,
"##л": 707,
"##н": 708,
"##о": 709,
"##п": 710,
"##р": 711,
"##с": 712,
"##т": 713,
"##у": 714,
"##х": 715,
"##ч": 716,
"##ш": 717,
"##щ": 718,
"##ѐ": 719,
"##ё": 720,
"##ѕ": 721,
"##і": 722,
"##ј": 723,
"##џ": 724,
"##ѡ": 725,
"##Ѵ": 726,
"##ѵ": 727,
"##ҏ": 728,
"##қ": 729,
"##ҡ": 730,
"##ң": 731,
"##ҥ": 732,
"##Ү": 733,
"##ү": 734,
"##ҳ": 735,
"##һ": 736,
"##ҽ": 737,
"##ӏ": 738,
"##ԁ": 739,
"##ԛ": 740,
"##Ա": 741,
"##Ի": 742,
"##Ս": 743,
"##Տ": 744,
"##Օ": 745,
"##ա": 746,
"##գ": 747,
"##զ": 748,
"##ժ": 749,
"##հ": 750,
"##յ": 751,
"##ս": 752,
"##օ": 753,
"##Ⴍ": 754,
"##Ⴓ": 755,
"##Ⴝ": 756,
"##Ꭰ": 757,
"##Ꭲ": 758,
"##Ꭵ": 759,
"##Ꭺ": 760,
"##Ꭻ": 761,
"##Ꮃ": 762,
"##Ꮇ": 763,
"##Ꮋ": 764,
"##Ꮐ": 765,
"##Ꮓ": 766,
"##Ꮢ": 767,
"##Ꮩ": 768,
"##Ꮪ": 769,
"##Ꮮ": 770,
"##Ꮯ": 771,
"##Ꮲ": 772,
"##Ꮶ": 773,
"##Ᏼ": 774,
"##ᚱ": 775,
"##ᛁ": 776,
"##ᛒ": 777,
"##ᛕ": 778,
"##ᛖ": 779,
"##ᴄ": 780,
"##ᴇ": 781,
"##ᴋ": 782,
"##ᴍ": 783,
"##ᴏ": 784,
"##ᴑ": 785,
"##ᴜ": 786,
"##ᴠ": 787,
"##ᴡ": 788,
"##ᴦ": 789,
"##ᴨ": 790,
"##ᴺ": 791,
"##ᴼ": 792,
"##ᴾ": 793,
"##ᴿ": 794,
"##ḟ": 795,
"##ḱ": 796,
"##ḿ": 797,
"##ṁ": 798,
"##ṅ": 799,
"##Ṛ": 800,
"##ṡ": 801,
"##ẁ": 802,
"##ẃ": 803,
"##ẇ": 804,
"##ἀ": 805,
"##ἁ": 806,
"##ἇ": 807,
"##ἰ": 808,
"##ἱ": 809,
"##ἳ": 810,
"##ὀ": 811,
"##ὁ": 812,
"##ὶ": 813,
"##ί": 814,
"##ῤ": 815,
"##ῥ": 816,
"##―": 817,
"##₩": 818,
"##€": 819,
"##₿": 820,
"##ℹ": 821,
"##⋃": 822,
"##𝘼": 823,
"##𝘾": 824,
"##𝘿": 825,
"##𝙀": 826,
"##𝙍": 827,
"##𝙏": 828
}
}
}