markuplm-base-finetuned-websrc / tokenizer_config.json
nielsr's picture
nielsr HF staff
Upload tokenizer
f20d7d6
{
"add_prefix_space": false,
"bos_token": {
"__type": "AddedToken",
"content": "<s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"cls_token": {
"__type": "AddedToken",
"content": "<s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"eos_token": {
"__type": "AddedToken",
"content": "</s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"errors": "replace",
"mask_token": {
"__type": "AddedToken",
"content": "<mask>",
"lstrip": true,
"normalized": true,
"rstrip": false,
"single_word": false
},
"max_depth": 50,
"max_width": 1000,
"model_max_length": 512,
"name_or_path": "microsoft/markuplm-base",
"only_label_first_subword": true,
"pad_token": {
"__type": "AddedToken",
"content": "<pad>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"pad_token_label": -100,
"pad_width": 1001,
"sep_token": {
"__type": "AddedToken",
"content": "</s>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
},
"special_tokens_map_file": null,
"tags_dict": {
"a": 0,
"abbr": 1,
"acronym": 2,
"address": 3,
"altGlyph": 4,
"altGlyphDef": 5,
"altGlyphItem": 6,
"animate": 7,
"animateColor": 8,
"animateMotion": 9,
"animateTransform": 10,
"applet": 11,
"area": 12,
"article": 13,
"aside": 14,
"audio": 15,
"b": 16,
"base": 17,
"basefont": 18,
"bdi": 19,
"bdo": 20,
"bgsound": 21,
"big": 22,
"blink": 23,
"blockquote": 24,
"body": 25,
"br": 26,
"button": 27,
"canvas": 28,
"caption": 29,
"center": 30,
"circle": 31,
"cite": 32,
"clipPath": 33,
"code": 34,
"col": 35,
"colgroup": 36,
"color-profile": 37,
"content": 38,
"cursor": 39,
"data": 40,
"datalist": 41,
"dd": 42,
"defs": 43,
"del": 44,
"desc": 45,
"details": 46,
"dfn": 47,
"dialog": 48,
"dir": 49,
"div": 50,
"dl": 51,
"dt": 52,
"ellipse": 53,
"em": 54,
"embed": 55,
"feBlend": 56,
"feColorMatrix": 57,
"feComponentTransfer": 58,
"feComposite": 59,
"feConvolveMatrix": 60,
"feDiffuseLighting": 61,
"feDisplacementMap": 62,
"feDistantLight": 63,
"feFlood": 64,
"feFuncA": 65,
"feFuncB": 66,
"feFuncG": 67,
"feFuncR": 68,
"feGaussianBlur": 69,
"feImage": 70,
"feMerge": 71,
"feMergeNode": 72,
"feMorphology": 73,
"feOffset": 74,
"fePointLight": 75,
"feSpecularLighting": 76,
"feSpotLight": 77,
"feTile": 78,
"feTurbulence": 79,
"fieldset": 80,
"figcaption": 81,
"figure": 82,
"filter": 83,
"font": 89,
"font-face": 88,
"font-face-format": 84,
"font-face-name": 85,
"font-face-src": 86,
"font-face-uri": 87,
"footer": 90,
"foreignObject": 91,
"form": 92,
"frame": 93,
"frameset": 94,
"g": 95,
"glyph": 96,
"glyphRef": 97,
"h1": 98,
"h2": 99,
"h3": 100,
"h4": 101,
"h5": 102,
"h6": 103,
"head": 104,
"header": 105,
"hgroup": 106,
"hkern": 107,
"hr": 108,
"html": 109,
"i": 110,
"iframe": 111,
"image": 112,
"img": 113,
"input": 114,
"ins": 115,
"kbd": 116,
"keygen": 117,
"label": 118,
"legend": 119,
"li": 120,
"line": 121,
"linearGradient": 122,
"link": 123,
"main": 124,
"map": 125,
"mark": 126,
"marker": 127,
"marquee": 128,
"mask": 129,
"math": 130,
"menu": 131,
"menuitem": 132,
"meta": 133,
"metadata": 134,
"meter": 135,
"missing-glyph": 136,
"mpath": 137,
"nav": 138,
"nobr": 139,
"noembed": 140,
"noframes": 141,
"noscript": 142,
"object": 143,
"ol": 144,
"optgroup": 145,
"option": 146,
"output": 147,
"p": 148,
"param": 149,
"path": 150,
"pattern": 151,
"picture": 152,
"plaintext": 153,
"polygon": 154,
"polyline": 155,
"portal": 156,
"pre": 157,
"progress": 158,
"q": 159,
"radialGradient": 160,
"rb": 161,
"rect": 162,
"rp": 163,
"rt": 164,
"rtc": 165,
"ruby": 166,
"s": 167,
"samp": 168,
"script": 169,
"section": 170,
"select": 171,
"set": 172,
"shadow": 173,
"slot": 174,
"small": 175,
"source": 176,
"spacer": 177,
"span": 178,
"stop": 179,
"strike": 180,
"strong": 181,
"style": 182,
"sub": 183,
"summary": 184,
"sup": 185,
"svg": 186,
"switch": 187,
"symbol": 188,
"table": 189,
"tbody": 190,
"td": 191,
"template": 192,
"text": 193,
"textPath": 194,
"textarea": 195,
"tfoot": 196,
"th": 197,
"thead": 198,
"time": 199,
"title": 200,
"tr": 201,
"track": 202,
"tref": 203,
"tspan": 204,
"tt": 205,
"u": 206,
"ul": 207,
"use": 208,
"var": 209,
"video": 210,
"view": 211,
"vkern": 212,
"wbr": 213,
"xmp": 214
},
"tokenizer_class": "MarkupLMTokenizer",
"trim_offsets": false,
"unk_token": {
"__type": "AddedToken",
"content": "<unk>",
"lstrip": false,
"normalized": true,
"rstrip": false,
"single_word": false
}
}