Spaces:

yhshin
/

latex-ocr

Running

latex-ocr / tokenizer-wordlevel.json

Young Ho Shin

Add examples and article.md

36bccd1 about 2 years ago

No virus

6.1 kB

	{
	"version": "1.0",
	"truncation": {
	"direction": "Right",
	"max_length": 100,
	"strategy": "LongestFirst",
	"stride": 0
	},
	"padding": {
	"strategy": {
	"Fixed": 100
	},
	"direction": "Right",
	"pad_to_multiple_of": null,
	"pad_id": 0,
	"pad_type_id": 0,
	"pad_token": "[PAD]"
	},
	"added_tokens": [
	{
	"id": 0,
	"content": "[UNK]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 1,
	"content": "[CLS]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 2,
	"content": "[SEP]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 3,
	"content": "[PAD]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	},
	{
	"id": 4,
	"content": "[MASK]",
	"single_word": false,
	"lstrip": false,
	"rstrip": false,
	"normalized": false,
	"special": true
	}
	],
	"normalizer": null,
	"pre_tokenizer": {
	"type": "Whitespace"
	},
	"post_processor": {
	"type": "TemplateProcessing",
	"single": [
	{
	"SpecialToken": {
	"id": "[CLS]",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "[SEP]",
	"type_id": 0
	}
	}
	],
	"pair": [
	{
	"SpecialToken": {
	"id": "[CLS]",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "A",
	"type_id": 0
	}
	},
	{
	"SpecialToken": {
	"id": "[SEP]",
	"type_id": 0
	}
	},
	{
	"Sequence": {
	"id": "B",
	"type_id": 1
	}
	},
	{
	"SpecialToken": {
	"id": "[SEP]",
	"type_id": 1
	}
	}
	],
	"special_tokens": {
	"[CLS]": {
	"id": "[CLS]",
	"ids": [
	1
	],
	"tokens": [
	"[CLS]"
	]
	},
	"[SEP]": {
	"id": "[SEP]",
	"ids": [
	2
	],
	"tokens": [
	"[SEP]"
	]
	}
	}
	},
	"decoder": null,
	"model": {
	"type": "WordLevel",
	"vocab": {
	"[UNK]": 0,
	"[CLS]": 1,
	"[SEP]": 2,
	"[PAD]": 3,
	"[MASK]": 4,
	"}": 5,
	"{": 6,
	"\\": 7,
	"_": 8,
	"^": 9,
	"(": 10,
	")": 11,
	"2": 12,
	"1": 13,
	"-": 14,
	"=": 15,
	",": 16,
	"+": 17,
	"frac": 18,
	"i": 19,
	"0": 20,
	"x": 21,
	"n": 22,
	".": 23,
	"d": 24,
	"\\,": 25,
	"a": 26,
	"mu": 27,
	"left": 28,
	"right": 29,
	"e": 30,
	"k": 31,
	"c": 32,
	"m": 33,
	"r": 34,
	"p": 35,
	"3": 36,
	"alpha": 37,
	"t": 38,
	"partial": 39,
	"~": 40,
	"l": 41,
	"A": 42,
	"s": 43,
	"&": 44,
	"4": 45,
	"j": 46,
	"\\;": 47,
	"g": 48,
	"prime": 49,
	"]": 50,
	"[": 51,
	"nu": 52,
	"z": 53,
	"pi": 54,
	"\|": 55,
	"b": 56,
	"phi": 57,
	"\\\\": 58,
	"mathrm": 59,
	"q": 60,
	"operatorname": 61,
	"cal": 62,
	"N": 63,
	"delta": 64,
	"f": 65,
	"lambda": 66,
	"beta": 67,
	"bar": 68,
	"T": 69,
	"int": 70,
	"array": 71,
	"R": 72,
	"S": 73,
	"D": 74,
	"L": 75,
	"M": 76,
	"B": 77,
	"y": 78,
	"sigma": 79,
	"F": 80,
	"theta": 81,
	"/": 82,
	"gamma": 83,
	"h": 84,
	"hat": 85,
	"psi": 86,
	"sqrt": 87,
	"sum": 88,
	"u": 89,
	"H": 90,
	"o": 91,
	"rho": 92,
	"tilde": 93,
	"tau": 94,
	"C": 95,
	"P": 96,
	"G": 97,
	"V": 98,
	"I": 99,
	"X": 100,
	"omega": 101,
	"epsilon": 102,
	"E": 103,
	"J": 104,
	"bf": 105,
	"eta": 106,
	"v": 107,
	"xi": 108,
	"Q": 109,
	"Phi": 110,
	"quad": 111,
	"*": 112,
	"5": 113,
	"\\{": 114,
	"vec": 115,
	"begin": 116,
	"end": 117,
	"Gamma": 118,
	"K": 119,
	"infty": 120,
	"\\}": 121,
	"6": 122,
	"U": 123,
	"rangle": 124,
	"dot": 125,
	"W": 126,
	"pm": 127,
	"Lambda": 128,
	"Z": 129,
	"varphi": 130,
	"Delta": 131,
	"w": 132,
	"chi": 133,
	";": 134,
	"8": 135,
	"\\!": 136,
	"Omega": 137,
	"kappa": 138,
	"qquad": 139,
	"cdot": 140,
	"Psi": 141,
	"equiv": 142,
	"langle": 143,
	"overline": 144,
	">": 145,
	"<": 146,
	"dagger": 147,
	"zeta": 148,
	"varepsilon": 149,
	"cdots": 150,
	"rightarrow": 151,
	"O": 152,
	"nabla": 153,
	"Y": 154,
	"ldots": 155,
	":": 156,
	"Sigma": 157,
	"ell": 158,
	"7": 159,
	"mathcal": 160,
	"\\:": 161,
	"!": 162,
	"otimes": 163,
	"prod": 164,
	"wedge": 165,
	"9": 166,
	"hspace": 167,
	"Pi": 168,
	"hbar": 169,
	"sim": 170,
	"vert": 171,
	"in": 172,
	"Big": 173,
	"widetilde": 174,
	"displaystyle": 175,
	"times": 176,
	"Theta": 177,
	"underline": 178,
	"mid": 179,
	"to": 180,
	"dots": 181,
	"mathbf": 182,
	"ast": 183,
	"leq": 184,
	"approx": 185,
	"star": 186,
	"stackrel": 187,
	"perp": 188,
	"widehat": 189,
	"big": 190,
	"vartheta": 191,
	"'": 192,
	"Bigr": 193,
	"geq": 194,
	"mp": 195,
	"Bigl": 196,
	"dag": 197,
	"neq": 198,
	"simeq": 199
	},
	"unk_token": "[UNK]"
	}
	}