NPC-Bert / tokenizer.json
lwong
add tokenizer
99da5ad
raw
history blame contribute delete
No virus
19.8 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": true
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
",": 5,
"-": 6,
".": 7,
"2": 8,
";": 9,
"a": 10,
"b": 11,
"c": 12,
"d": 13,
"e": 14,
"f": 15,
"g": 16,
"h": 17,
"i": 18,
"j": 19,
"k": 20,
"l": 21,
"m": 22,
"n": 23,
"o": 24,
"p": 25,
"q": 26,
"r": 27,
"s": 28,
"t": 29,
"u": 30,
"v": 31,
"w": 32,
"x": 33,
"y": 34,
"##u": 35,
"##e": 36,
"##s": 37,
"##t": 38,
"##i": 39,
"##o": 40,
"##n": 41,
"##a": 42,
"##b": 43,
"##l": 44,
"##k": 45,
"##y": 46,
"##f": 47,
"##r": 48,
"##m": 49,
"##h": 50,
"##c": 51,
"##g": 52,
"##q": 53,
"##v": 54,
"##p": 55,
"##x": 56,
"##d": 57,
"##w": 58,
"##2": 59,
"##j": 60,
"##al": 61,
"##as": 62,
"##ar": 63,
"##er": 64,
"##tas": 65,
"##in": 66,
"##ph": 67,
"##ary": 68,
"no": 69,
"##aryn": 70,
"##ic": 71,
"##pharyn": 72,
"##es": 73,
"##opharyn": 74,
"##ge": 75,
"nas": 76,
"nasopharyn": 77,
"##it": 78,
"##geal": 79,
"##om": 80,
"nod": 81,
"me": 82,
"metas": 83,
"##at": 84,
"##cin": 85,
"nasopharyngeal": 86,
"##oma": 87,
"car": 88,
"##cinoma": 89,
"carcinoma": 90,
"##ith": 91,
"with": 92,
"metastas": 93,
"##vic": 94,
"metastases": 95,
"nodal": 96,
"th": 97,
"cer": 98,
"##on": 99,
"##vical": 100,
"cervical": 101,
"in": 102,
"##il": 103,
"##nd": 104,
"the": 105,
"##us": 106,
"and": 107,
"##en": 108,
"##ed": 109,
"##eral": 110,
"##ateral": 111,
"##ilateral": 112,
"##ul": 113,
"##id": 114,
"##ef": 115,
"##ig": 116,
"bilateral": 117,
"##ion": 118,
"##ht": 119,
"##ight": 120,
"right": 121,
"to": 122,
"ex": 123,
"lef": 124,
"left": 125,
"##or": 126,
"##mal": 127,
"nodes": 128,
"ext": 129,
"##an": 130,
"##ac": 131,
"##ly": 132,
"##ve": 133,
"##ens": 134,
"sid": 135,
"##ur": 136,
"##os": 137,
"inv": 138,
"smal": 139,
"##tr": 140,
"##ter": 141,
"nasopharynx": 142,
"small": 143,
"con": 144,
"##ous": 145,
"extens": 146,
"##ined": 147,
"##ent": 148,
"##ular": 149,
"sided": 150,
"##le": 151,
"##fined": 152,
"##re": 153,
"confined": 154,
"of": 155,
"re": 156,
"sp": 157,
"pr": 158,
"##ut": 159,
"##tat": 160,
"##is": 161,
"##asion": 162,
"metastat": 163,
"invasion": 164,
"metastatic": 165,
"is": 166,
"su": 167,
"sus": 168,
"##ious": 169,
"##pp": 170,
"bas": 171,
"un": 172,
"##ive": 173,
"##oid": 174,
"##pic": 175,
"suspic": 176,
"base": 177,
"suspicious": 178,
"but": 179,
"wh": 180,
"##um": 181,
"##ec": 182,
"##ing": 183,
"def": 184,
"ly": 185,
"pos": 186,
"##ib": 187,
"##if": 188,
"##our": 189,
"##ad": 190,
"##mph": 191,
"##ich": 192,
"##opharyngeal": 193,
"##ite": 194,
"##tropharyngeal": 195,
"extensive": 196,
"retropharyngeal": 197,
"which": 198,
"lymph": 199,
"bul": 200,
"sk": 201,
"tum": 202,
"upp": 203,
"##li": 204,
"##ky": 205,
"##pr": 206,
"##inite": 207,
"non": 208,
"##ull": 209,
"extension": 210,
"unilateral": 211,
"definite": 212,
"##ific": 213,
"bulky": 214,
"skull": 215,
"tumour": 216,
"upper": 217,
"as": 218,
"are": 219,
"ear": 220,
"##ug": 221,
"##ep": 222,
"##ely": 223,
"##ol": 224,
"##op": 225,
"##nal": 226,
"##la": 227,
"##kely": 228,
"##cl": 229,
"spec": 230,
"early": 231,
"specific": 232,
"ab": 233,
"for": 234,
"jug": 235,
"li": 236,
"pal": 237,
"par": 238,
"##ow": 239,
"##vicular": 240,
"##acla": 241,
"invol": 242,
"supr": 243,
"##aden": 244,
"likely": 245,
"##aclavicular": 246,
"supraclavicular": 247,
"bon": 248,
"ca": 249,
"ch": 250,
"mos": 251,
"pter": 252,
"##ue": 253,
"##ud": 254,
"##ti": 255,
"##th": 256,
"##ted": 257,
"##im": 258,
"##ior": 259,
"##apharyn": 260,
"##yg": 261,
"##ran": 262,
"##ment": 263,
"##hy": 264,
"##inent": 265,
"##ominent": 266,
"##ation": 267,
"##athy": 268,
"metastasis": 269,
"inter": 270,
"##terior": 271,
"prim": 272,
"prominent": 273,
"posterior": 274,
"lymphaden": 275,
"##opathy": 276,
"##clud": 277,
"jugular": 278,
"parapharyn": 279,
"involve": 280,
"most": 281,
"pteryg": 282,
"internal": 283,
"primary": 284,
"lymphadenopathy": 285,
"parapharyngeal": 286,
"at": 287,
"an": 288,
"app": 289,
"cor": 290,
"due": 291,
"fr": 292,
"fos": 293,
"lar": 294,
"mul": 295,
"mid": 296,
"or": 297,
"sin": 298,
"ve": 299,
"ver": 300,
"##ear": 301,
"##sa": 302,
"##sib": 303,
"##to": 304,
"##ial": 305,
"##ob": 306,
"##oc": 307,
"##ab": 308,
"##ain": 309,
"##cep": 310,
"##ver": 311,
"##ine": 312,
"##ini": 313,
"##ess": 314,
"node": 315,
"this": 316,
"into": 317,
"spac": 318,
"possib": 319,
"bony": 320,
"##ranial": 321,
"involvement": 322,
"pterygoid": 323,
"any": 324,
"appear": 325,
"fossa": 326,
"large": 327,
"sinus": 328,
"veli": 329,
"very": 330,
"##ocal": 331,
"##able": 332,
"##cept": 333,
"al": 334,
"cli": 335,
"des": 336,
"dis": 337,
"dow": 338,
"hy": 339,
"ma": 340,
"mus": 341,
"on": 342,
"ro": 343,
"sig": 344,
"tr": 345,
"##ui": 346,
"##ual": 347,
"##und": 348,
"##tic": 349,
"##ian": 350,
"##od": 351,
"##ove": 352,
"##out": 353,
"##nor": 354,
"##fil": 355,
"##fus": 356,
"##rib": 357,
"##crib": 358,
"##gg": 359,
"##gle": 360,
"##qui": 361,
"##vus": 362,
"##ps": 363,
"##ple": 364,
"##ested": 365,
"without": 366,
"intr": 367,
"includ": 368,
"infil": 369,
"except": 370,
"##ant": 371,
"##anc": 372,
"##act": 373,
"##acranial": 374,
"side": 375,
"##tration": 376,
"##read": 377,
"spread": 378,
"prob": 379,
"sub": 380,
"sugg": 381,
"above": 382,
"abnor": 383,
"palat": 384,
"chain": 385,
"##tiple": 386,
"from": 387,
"multiple": 388,
"space": 389,
"clivus": 390,
"describ": 391,
"down": 392,
"hyp": 393,
"trian": 394,
"intracranial": 395,
"infiltration": 396,
"probable": 397,
"suggested": 398,
"abnormal": 399,
"chains": 400,
"described": 401,
"triangle": 402,
"ad": 403,
"bo": 404,
"com": 405,
"de": 406,
"dur": 407,
"en": 408,
"ef": 409,
"equi": 410,
"fur": 411,
"le": 412,
"mil": 413,
"mini": 414,
"ob": 415,
"pe": 416,
"pit": 417,
"sph": 418,
"tens": 419,
"##uit": 420,
"##eter": 421,
"##te": 422,
"##tal": 423,
"##tom": 424,
"##ia": 425,
"##ies": 426,
"##nous": 427,
"##ll": 428,
"##las": 429,
"##lation": 430,
"##yr": 431,
"##min": 432,
"##han": 433,
"##cur": 434,
"##vat": 435,
"##vocal": 436,
"##deter": 437,
"##jac": 438,
"not": 439,
"##ities": 440,
"nodul": 441,
"##atur": 442,
"##cing": 443,
"indeter": 444,
"##using": 445,
"##usual": 446,
"##enoid": 447,
"exclud": 448,
"extr": 449,
"extent": 450,
"##anti": 451,
"##relation": 452,
"react": 453,
"unusual": 454,
"lymphoid": 455,
"##press": 456,
"##cles": 457,
"palanti": 458,
"caver": 459,
"causing": 460,
"##ther": 461,
"cord": 462,
"correlation": 463,
"possibly": 464,
"possible": 465,
"appearanc": 466,
"sinuses": 467,
"may": 468,
"muscles": 469,
"signal": 470,
"##odular": 471,
"##fusion": 472,
"##psular": 473,
"includes": 474,
"palatini": 475,
"hyper": 476,
"abnormalities": 477,
"adjac": 478,
"both": 479,
"compress": 480,
"deep": 481,
"dura": 482,
"enhan": 483,
"effusion": 484,
"equivocal": 485,
"further": 486,
"levat": 487,
"mild": 488,
"minimal": 489,
"pituit": 490,
"sphenoid": 491,
"tensor": 492,
"##yroid": 493,
"indetermin": 494,
"reactive": 495,
"cavernous": 496,
"adjacent": 497,
"enhancing": 498,
"levator": 499,
"pituitary": 500,
"ai": 501,
"ar": 502,
"ac": 503,
"ap": 504,
"be": 505,
"br": 506,
"bor": 507,
"ct": 508,
"co": 509,
"cl": 510,
"cen": 511,
"cranial": 512,
"di": 513,
"do": 514,
"dif": 515,
"dep": 516,
"ev": 517,
"end": 518,
"fe": 519,
"fa": 520,
"fl": 521,
"fas": 522,
"gla": 523,
"hu": 524,
"he": 525,
"how": 526,
"ips": 527,
"ill": 528,
"lo": 529,
"ling": 530,
"low": 531,
"less": 532,
"local": 533,
"mal": 534,
"mas": 535,
"mar": 536,
"mor": 537,
"mac": 538,
"ne": 539,
"nar": 540,
"ner": 541,
"nec": 542,
"natur": 543,
"op": 544,
"otom": 545,
"ph": 546,
"per": 547,
"pre": 548,
"qu": 549,
"sl": 550,
"sit": 551,
"site": 552,
"tw": 553,
"t2": 554,
"ton": 555,
"tis": 556,
"ul": 557,
"und": 558,
"v2": 559,
"vol": 560,
"vocal": 561,
"we": 562,
"##ub": 563,
"##uous": 564,
"##ex": 565,
"##ever": 566,
"##so": 567,
"##sy": 568,
"##sil": 569,
"##sid": 570,
"##sue": 571,
"##sess": 572,
"##scur": 573,
"##tf": 574,
"##tin": 575,
"##tid": 576,
"##tef": 577,
"##tig": 578,
"##tion": 579,
"##tens": 580,
"##tle": 581,
"##toid": 582,
"##tund": 583,
"##tant": 584,
"##ias": 585,
"##ok": 586,
"##of": 587,
"##oor": 588,
"##oug": 589,
"##oaden": 590,
"##ound": 591,
"##otid": 592,
"##nific": 593,
"##nant": 594,
"##ay": 595,
"##am": 596,
"##and": 597,
"##apsular": 598,
"##be": 599,
"##br": 600,
"##bit": 601,
"##lary": 602,
"##line": 603,
"##ked": 604,
"##rw": 605,
"##res": 606,
"##rus": 607,
"##rmal": 608,
"##rent": 609,
"##row": 610,
"##roaden": 611,
"##mith": 612,
"##mand": 613,
"##ct": 614,
"##cle": 615,
"##cre": 616,
"##cia": 617,
"##gion": 618,
"##gob": 619,
"##vious": 620,
"##ving": 621,
"##vour": 622,
"##pec": 623,
"##plas": 624,
"##xil": 625,
"##der": 626,
"##dle": 627,
"##alat": 628,
"##asil": 629,
"##astr": 630,
"##astoid": 631,
"##asound": 632,
"##ered": 633,
"##erly": 634,
"##inic": 635,
"##intens": 636,
"noted": 637,
"normal": 638,
"##aryngob": 639,
"##estion": 640,
"##ity": 641,
"##ate": 642,
"that": 643,
"than": 644,
"thyroid": 645,
"##ong": 646,
"##ontal": 647,
"##ile": 648,
"##ility": 649,
"##uld": 650,
"##ulod": 651,
"##ident": 652,
"##ignant": 653,
"##igastr": 654,
"##ightly": 655,
"torus": 656,
"expec": 657,
"extran": 658,
"##ange": 659,
"##acapsular": 660,
"##ural": 661,
"##uroma": 662,
"##osit": 663,
"##trous": 664,
"##trasound": 665,
"##tered": 666,
"consid": 667,
"contig": 668,
"##oustic": 669,
"##rect": 670,
"requi": 671,
"recur": 672,
"region": 673,
"suscept": 674,
"unli": 675,
"while": 676,
"##ume": 677,
"##ectom": 678,
"defined": 679,
"##ibular": 680,
"##ibility": 681,
"assess": 682,
"area": 683,
"##oplas": 684,
"##opalat": 685,
"foram": 686,
"jugulod": 687,
"palsy": 688,
"parotid": 689,
"involving": 690,
"bone": 691,
"capsular": 692,
"chias": 693,
"change": 694,
"##thoug": 695,
"##mental": 696,
"pterygopalat": 697,
"frontal": 698,
"multin": 699,
"middle": 700,
"orbit": 701,
"##verte": 702,
"##ineural": 703,
"spaces": 704,
"appears": 705,
"appearing": 706,
"also": 707,
"along": 708,
"althoug": 709,
"distal": 710,
"distant": 711,
"discre": 712,
"maxil": 713,
"muscle": 714,
"round": 715,
"rotund": 716,
"roof": 717,
"signific": 718,
"##fuse": 719,
"including": 720,
"##ance": 721,
"subtle": 722,
"submand": 723,
"submental": 724,
"palatine": 725,
"hypoplas": 726,
"obscur": 727,
"obvious": 728,
"pet": 729,
"petrous": 730,
"nodule": 731,
"nodules": 732,
"##atures": 733,
"exclude": 734,
"excluded": 735,
"extra": 736,
"extracapsular": 737,
"appearance": 738,
"appearances": 739,
"maybe": 740,
"hyperplas": 741,
"hyperintens": 742,
"compression": 743,
"compressing": 744,
"deeper": 745,
"##yroidectom": 746,
"indeterminant": 747,
"indeterminate": 748,
"airw": 749,
"artef": 750,
"acoustic": 751,
"apex": 752,
"brain": 753,
"border": 754,
"could": 755,
"clinic": 756,
"centered": 757,
"direct": 758,
"doub": 759,
"diffuse": 760,
"deposit": 761,
"evident": 762,
"features": 763,
"favour": 764,
"floor": 765,
"fascia": 766,
"gland": 767,
"huge": 768,
"hemith": 769,
"however": 770,
"ipsilateral": 771,
"look": 772,
"lingual": 773,
"lower": 774,
"malignant": 775,
"mass": 776,
"marked": 777,
"more": 778,
"macroaden": 779,
"neuroma": 780,
"narrow": 781,
"nerve": 782,
"neck": 783,
"nature": 784,
"optic": 785,
"otomastoid": 786,
"pharyngob": 787,
"perineural": 788,
"preverte": 789,
"question": 790,
"slightly": 791,
"sites": 792,
"two": 793,
"tonsil": 794,
"tissue": 795,
"ultrasound": 796,
"underly": 797,
"volume": 798,
"well": 799,
"##tful": 800,
"##bral": 801,
"##asilar": 802,
"normally": 803,
"##igastric": 804,
"expected": 805,
"extranodular": 806,
"considered": 807,
"contiguous": 808,
"requires": 809,
"recurrent": 810,
"regions": 811,
"susceptibility": 812,
"unlikely": 813,
"foramen": 814,
"jugulodigastric": 815,
"chiasm": 816,
"pterygopalatine": 817,
"multinodular": 818,
"orbital": 819,
"although": 820,
"discrete": 821,
"maxillary": 822,
"rounded": 823,
"rotundum": 824,
"significance": 825,
"submandibular": 826,
"hypoplastic": 827,
"obscure": 828,
"obviously": 829,
"hyperplasia": 830,
"hyperintense": 831,
"##yroidectomy": 832,
"airway": 833,
"artefact": 834,
"borderline": 835,
"clinical": 836,
"doubtful": 837,
"hemithyroidectomy": 838,
"looking": 839,
"macroadenoma": 840,
"narrowing": 841,
"pharyngobasilar": 842,
"prevertebral": 843,
"questionable": 844,
"tonsils": 845,
"underlying": 846
}
}
}