chatterxbox / tokenizer.json
tel4vn's picture
Upload tokenizer.json
18f57d9 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[STOP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SPACE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 255,
"content": "[START]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"vocab": {
"[STOP]": 0,
"[UNK]": 1,
"[SPACE]": 2,
" ": 3,
"!": 4,
"%": 5,
"&": 6,
"'": 7,
",": 8,
"-": 9,
".": 10,
"0": 11,
"1": 12,
"2": 13,
"3": 14,
"4": 15,
"5": 16,
"6": 17,
"7": 18,
"8": 19,
"9": 20,
"?": 21,
"a": 22,
"b": 23,
"c": 24,
"d": 25,
"e": 26,
"f": 27,
"g": 28,
"h": 29,
"i": 30,
"j": 31,
"k": 32,
"l": 33,
"m": 34,
"n": 35,
"o": 36,
"p": 37,
"q": 38,
"r": 39,
"s": 40,
"t": 41,
"u": 42,
"v": 43,
"w": 44,
"x": 45,
"y": 46,
"z": 47,
"à": 48,
"á": 49,
"ả": 50,
"ã": 51,
"ạ": 52,
"ă": 53,
"ắ": 54,
"ằ": 55,
"ẳ": 56,
"ẵ": 57,
"ặ": 58,
"â": 59,
"ấ": 60,
"ầ": 61,
"ẩ": 62,
"ẫ": 63,
"ậ": 64,
"è": 65,
"é": 66,
"ẻ": 67,
"ẽ": 68,
"ẹ": 69,
"ê": 70,
"ế": 71,
"ề": 72,
"ể": 73,
"ễ": 74,
"ệ": 75,
"ì": 76,
"í": 77,
"ỉ": 78,
"ĩ": 79,
"ị": 80,
"ò": 81,
"ó": 82,
"ỏ": 83,
"õ": 84,
"ọ": 85,
"ô": 86,
"ố": 87,
"ồ": 88,
"ổ": 89,
"ỗ": 90,
"ộ": 91,
"ơ": 92,
"ớ": 93,
"ờ": 94,
"ở": 95,
"ỡ": 96,
"ợ": 97,
"ù": 98,
"ú": 99,
"ủ": 100,
"ũ": 101,
"ụ": 102,
"ư": 103,
"ứ": 104,
"ừ": 105,
"ử": 106,
"ữ": 107,
"ự": 108,
"ỳ": 109,
"ý": 110,
"ỷ": 111,
"ỹ": 112,
"ỵ": 113,
"đ": 114,
"A": 115,
"B": 116,
"C": 117,
"D": 118,
"E": 119,
"F": 120,
"G": 121,
"H": 122,
"I": 123,
"J": 124,
"K": 125,
"L": 126,
"M": 127,
"N": 128,
"O": 129,
"P": 130,
"Q": 131,
"R": 132,
"S": 133,
"T": 134,
"U": 135,
"V": 136,
"W": 137,
"X": 138,
"Y": 139,
"Z": 140,
"À": 141,
"Á": 142,
"Ả": 143,
"Ã": 144,
"Ạ": 145,
"Ă": 146,
"Ắ": 147,
"Ằ": 148,
"Ẳ": 149,
"Ẵ": 150,
"Ặ": 151,
"Â": 152,
"Ấ": 153,
"Ầ": 154,
"Ẩ": 155,
"Ẫ": 156,
"Ậ": 157,
"È": 158,
"É": 159,
"Ẻ": 160,
"Ẽ": 161,
"Ẹ": 162,
"Ê": 163,
"Ế": 164,
"Ề": 165,
"Ể": 166,
"Ễ": 167,
"Ệ": 168,
"Ì": 169,
"Í": 170,
"Ỉ": 171,
"Ĩ": 172,
"Ị": 173,
"Ò": 174,
"Ó": 175,
"Ỏ": 176,
"Õ": 177,
"Ọ": 178,
"Ô": 179,
"Ố": 180,
"Ồ": 181,
"Ổ": 182,
"Ỗ": 183,
"Ộ": 184,
"Ơ": 185,
"Ớ": 186,
"Ờ": 187,
"Ở": 188,
"Ỡ": 189,
"Ợ": 190,
"Ù": 191,
"Ú": 192,
"Ủ": 193,
"Ũ": 194,
"Ụ": 195,
"Ư": 196,
"Ứ": 197,
"Ừ": 198,
"Ử": 199,
"Ữ": 200,
"Ự": 201,
"Ỳ": 202,
"Ý": 203,
"Ỷ": 204,
"Ỹ": 205,
"Ỵ": 206,
"Đ": 207,
":": 208,
";": 209,
"(": 210,
")": 211,
"[": 212,
"]": 213,
"{": 214,
"}": 215,
"/": 216,
"\\": 217,
"@": 218,
"#": 219,
"$": 220,
"*": 221,
"+": 222,
"=": 223,
"<": 224,
">": 225,
"~": 226,
"`": 227,
"^": 228,
"_": 229,
"|": 230,
"\"": 231,
", ": 232,
"…": 233,
"—": 234,
"–": 235,
",": 236,
"、": 237,
"。": 238,
"!": 239,
"?": 240,
"°": 241,
"±": 242,
"×": 243,
"÷": 244,
"€": 245,
"£": 246,
"¥": 247,
"ƀ": 248,
"Ɓ": 249,
"Ƃ": 250,
"ƃ": 251,
"Ƅ": 252,
"ƅ": 253,
"Ɔ": 254,
"[START]": 255,
"ng": 256,
"nh": 257,
"th": 258,
"ch": 259,
"tr": 260,
"kh": 261,
"ph": 262,
"gh": 263,
"gi": 264,
"qu": 265,
"có": 266,
"là": 267,
"và": 268,
"một": 269,
"của": 270,
"không": 271,
"thể": 272,
"người": 273,
"các": 274,
"trong": 275,
"những": 276,
"cho": 277,
"để": 278,
"được": 279,
"tôi": 280,
"bạn": 281,
"với": 282,
"đã": 283,
"sự": 284,
"ta": 285,
"việc": 286,
"sẽ": 287,
"chúng": 288,
"khi": 289,
"cũng": 290,
"như": 291,
"mà": 292,
"đến": 293,
"ra": 294,
"này": 295,
"từ": 296,
"về": 297,
"nên": 298,
"sau": 299,
"thì": 300,
"năm": 301,
"ngày": 302,
"họ": 303,
"mình": 304,
"rất": 305,
"đang": 306,
"còn": 307,
"vẫn": 308,
"đều": 309,
"cả": 310,
"nhiều": 311,
"nào": 312,
"hay": 313,
"đó": 314,
"nó": 315,
"ai": 316,
"gì": 317,
"đây": 318,
"đấy": 319,
"ấy": 320,
"kia": 321,
"nọ": 322,
"bao": 323,
"bất": 324,
"cứ": 325,
"mỗi": 326,
"mọi": 327,
"tất": 328,
"toàn": 329,
"cùng": 330,
"nhau": 331,
"nhất": 332,
"hơn": 333,
"lại": 334,
"nữa": 335,
"thêm": 336,
"luôn": 337,
"vừa": 338,
"mới": 339,
"sắp": 340,
"rồi": 341,
"xong": 342,
"hết": 343,
"bị": 344,
"phải": 345,
"muốn": 346,
"thích": 347,
"yêu": 348,
"ghét": 349,
"biết": 350,
"hiểu": 351,
"nghĩ": 352,
"tin": 353,
"làm": 354,
"nói": 355,
"hỏi": 356,
"trả": 357,
"lời": 358,
"kể": 359,
"bảo": 360,
"gọi": 361,
"đọc": 362,
"viết": 363,
"nghe": 364,
"nhìn": 365,
"thấy": 366,
"Ā": 367,
"ā": 368,
"Ą": 369,
"ą": 370,
"Ć": 371,
"ć": 372,
"Ĉ": 373,
"ĉ": 374,
"Ċ": 375,
"ċ": 376,
"Č": 377,
"č": 378,
"Ď": 379,
"ď": 380,
"Ē": 381,
"ē": 382,
"Ĕ": 383,
"ĕ": 384,
"Ė": 385,
"ė": 386,
"Ę": 387,
"ę": 388,
"Ě": 389,
"ě": 390,
"Ĝ": 391,
"ĝ": 392,
"Ğ": 393,
"ğ": 394,
"Ġ": 395,
"ġ": 396,
"Ģ": 397,
"ģ": 398,
"Ĥ": 399,
"ĥ": 400,
"Ħ": 401,
"ħ": 402,
"Ī": 403,
"ī": 404,
"Ĭ": 405,
"ĭ": 406,
"Į": 407,
"į": 408,
"İ": 409,
"ı": 410,
"IJ": 411,
"ij": 412,
"Ĵ": 413,
"ĵ": 414,
"Ķ": 415,
"ķ": 416,
"ĸ": 417,
"Ĺ": 418,
"ĺ": 419,
"Ļ": 420,
"ļ": 421,
"Ľ": 422,
"ľ": 423,
"Ŀ": 424,
"ŀ": 425,
"Ł": 426,
"ł": 427,
"Ń": 428,
"ń": 429,
"Ņ": 430,
"ņ": 431,
"Ň": 432,
"ň": 433,
"ʼn": 434,
"Ŋ": 435,
"ŋ": 436,
"Ō": 437,
"ō": 438,
"Ŏ": 439,
"ŏ": 440,
"Ő": 441,
"ő": 442,
"Œ": 443,
"œ": 444,
"Ŕ": 445,
"ŕ": 446,
"Ŗ": 447,
"ŗ": 448,
"Ř": 449,
"ř": 450,
"Ś": 451,
"ś": 452,
"Ŝ": 453,
"ŝ": 454,
"Ş": 455,
"ş": 456,
"Š": 457,
"š": 458,
"Ţ": 459,
"ţ": 460,
"Ť": 461,
"ť": 462,
"Ŧ": 463,
"ŧ": 464,
"Ū": 465,
"ū": 466,
"Ŭ": 467,
"ŭ": 468,
"Ů": 469,
"ů": 470,
"Ű": 471,
"ű": 472,
"Ų": 473,
"ų": 474,
"Ŵ": 475,
"ŵ": 476,
"Ŷ": 477,
"ŷ": 478,
"Ÿ": 479,
"Ź": 480,
"ź": 481,
"Ż": 482,
"ż": 483,
"Ž": 484,
"ž": 485,
"ſ": 486,
"Ƈ": 487,
"ƈ": 488,
"Ɖ": 489,
"Ɗ": 490,
"Ƌ": 491,
"ƌ": 492,
"ƍ": 493,
"Ǝ": 494,
"Ə": 495,
"Ɛ": 496,
"Ƒ": 497,
"ƒ": 498,
"Ɠ": 499,
"Ɣ": 500,
"ƕ": 501,
"Ɩ": 502,
"Ɨ": 503,
"Ƙ": 504,
"ƙ": 505,
"ƚ": 506,
"ƛ": 507,
"Ɯ": 508,
"Ɲ": 509,
"ƞ": 510,
"Ɵ": 511,
"Ƣ": 512,
"ƣ": 513,
"Ƥ": 514,
"ƥ": 515,
"Ʀ": 516,
"Ƨ": 517,
"ƨ": 518,
"Ʃ": 519,
"ƪ": 520,
"ƫ": 521,
"Ƭ": 522,
"ƭ": 523,
"Ʈ": 524,
"Ʊ": 525,
"Ʋ": 526,
"Ƴ": 527,
"ƴ": 528,
"Ƶ": 529,
"ƶ": 530,
"Ʒ": 531,
"Ƹ": 532,
"ƹ": 533,
"ƺ": 534,
"ƻ": 535,
"Ƽ": 536,
"ƽ": 537,
"ƾ": 538,
"ƿ": 539,
"ǀ": 540,
"ǁ": 541,
"ǂ": 542,
"ǃ": 543,
"DŽ": 544,
"Dž": 545,
"dž": 546,
"LJ": 547,
"Lj": 548,
"lj": 549,
"NJ": 550,
"Nj": 551,
"nj": 552,
"Ǎ": 553,
"ǎ": 554,
"Ǐ": 555,
"ǐ": 556,
"Ǒ": 557,
"ǒ": 558,
"Ǔ": 559,
"ǔ": 560,
"Ǖ": 561,
"ǖ": 562,
"Ǘ": 563,
"ǘ": 564,
"Ǚ": 565,
"ǚ": 566,
"Ǜ": 567,
"ǜ": 568,
"ǝ": 569,
"Ǟ": 570,
"ǟ": 571,
"Ǡ": 572,
"ǡ": 573,
"Ǣ": 574,
"ǣ": 575,
"Ǥ": 576,
"ǥ": 577,
"Ǧ": 578,
"ǧ": 579,
"Ǩ": 580,
"ǩ": 581,
"Ǫ": 582,
"ǫ": 583,
"Ǭ": 584,
"ǭ": 585,
"Ǯ": 586,
"ǯ": 587,
"ǰ": 588,
"DZ": 589,
"Dz": 590,
"dz": 591,
"Ǵ": 592,
"ǵ": 593,
"Ƕ": 594,
"Ƿ": 595,
"Ǹ": 596,
"ǹ": 597,
"Ǻ": 598,
"ǻ": 599,
"Ǽ": 600,
"ǽ": 601,
"Ǿ": 602,
"ǿ": 603,
"[UH]": 604,
"[UM]": 605,
"[giggle]": 606,
"[laughter]": 607,
"[guffaw]": 608,
"[inhale]": 609,
"[exhale]": 610,
"[sigh]": 611,
"[cry]": 612,
"[bark]": 613,
"[howl]": 614,
"[meow]": 615,
"[singing]": 616,
"[music]": 617,
"[whistle]": 618,
"[humming]": 619,
"[gasp]": 620,
"[groan]": 621,
"[whisper]": 622,
"[mumble]": 623,
"[sniff]": 624,
"[sneeze]": 625,
"[cough]": 626,
"[snore]": 627,
"[chew]": 628,
"[sip]": 629,
"[clear_throat]": 630,
"[kiss]": 631,
"[shhh]": 632,
"[gibberish]": 633,
"[fr]": 634,
"[es]": 635,
"[de]": 636,
"[it]": 637,
"[ipa]": 638,
"[end_of_label]": 639,
"θ": 640,
"ð": 641,
"ʃ": 642,
"ʒ": 643,
"tʃ": 644,
"dʒ": 645,
"ʔ": 646,
"ɑː": 647,
"æ": 648,
"ʌ": 649,
"ɒ": 650,
"ɔː": 651,
"ɜː": 652,
"ə": 653,
"ɪ": 654,
"iː": 655,
"ʊ": 656,
"uː": 657,
"eɪ": 658,
"aɪ": 659,
"ɔɪ": 660,
"aʊ": 661,
"əʊ": 662,
"ɯ": 663,
"ɤ": 664,
"ɨ": 665,
"ʉ": 666,
"ɘ": 667,
"ɵ": 668,
"ɜ": 669,
"ɞ": 670,
"ɐ": 671,
"ɶ": 672,
"ɑ": 673,
"ɔ": 674,
"˧": 675,
"˥": 676,
"˩˧": 677,
"˧˥": 678,
"˧˩˧": 679,
"˧˩": 680,
"ɓ": 681,
"ɗ": 682,
"ɠ": 683,
"ʄ": 684,
"ʛ": 685,
"ɲ": 686,
"ɳ": 687,
"ɱ": 688,
"ʈ": 689,
"ɖ": 690,
"ɟ": 691,
"ɡ": 692,
"ɢ": 693,
"ʡ": 694,
"[PLACEHOLDER55]": 695,
"[PLACEHOLDER56]": 696,
"[PLACEHOLDER57]": 697,
"[PLACEHOLDER58]": 698,
"[PLACEHOLDER59]": 699,
"[PLACEHOLDER60]": 700,
"[PLACEHOLDER61]": 701,
"[PLACEHOLDER62]": 702,
"[PLACEHOLDER63]": 703
},
"merges": [
"n g",
"n h",
"t h",
"c h",
"t r",
"k h",
"p h",
"g h",
"g i",
"q u"
],
"language": "vi"
}
}