levanti_arabic2diacritics / tokenizer.json
guymorlan's picture
now with correct tavbert model
c33e789
raw
history blame contribute delete
No virus
8.24 kB
{
"version": "1.0",
"truncation": {
"direction": "Right",
"max_length": 512,
"strategy": "LongestFirst",
"stride": 0
},
"padding": {
"strategy": "BatchLongest",
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 1,
"pad_type_id": 0,
"pad_token": "[PAD]"
},
"added_tokens": [
{
"id": 0,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 301,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Split",
"pattern": {
"String": ""
},
"behavior": "Isolated",
"invert": false
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
0
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
2
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": null,
"model": {
"type": "WordLevel",
"vocab": {
"[CLS]": 0,
"[PAD]": 1,
"[SEP]": 2,
"[UNK]": 3,
"unused0": 4,
"unused1": 5,
"unused2": 6,
"unused3": 7,
"unused4": 8,
"unused5": 9,
"unused6": 10,
"unused7": 11,
"unused8": 12,
"unused9": 13,
"unused10": 14,
"unused11": 15,
"unused12": 16,
"unused13": 17,
"unused14": 18,
"unused15": 19,
"unused16": 20,
"unused17": 21,
"unused18": 22,
"unused19": 23,
"unused20": 24,
"unused21": 25,
"unused22": 26,
"unused23": 27,
"unused24": 28,
"unused25": 29,
"unused26": 30,
"unused27": 31,
"unused28": 32,
"unused29": 33,
"unused30": 34,
"unused31": 35,
"unused32": 36,
"unused33": 37,
"unused34": 38,
"unused35": 39,
"unused36": 40,
"unused37": 41,
"unused38": 42,
"unused39": 43,
"unused40": 44,
"unused41": 45,
"unused42": 46,
"unused43": 47,
"unused44": 48,
"unused45": 49,
"unused46": 50,
"unused47": 51,
"unused48": 52,
"unused49": 53,
"unused50": 54,
"unused51": 55,
"unused52": 56,
"unused53": 57,
"unused54": 58,
"unused55": 59,
"unused56": 60,
"unused57": 61,
"unused58": 62,
"unused59": 63,
"unused60": 64,
"unused61": 65,
"unused62": 66,
"unused63": 67,
"unused64": 68,
"unused65": 69,
"unused66": 70,
"unused67": 71,
"unused68": 72,
"unused69": 73,
"unused70": 74,
"unused71": 75,
"unused72": 76,
"unused73": 77,
"unused74": 78,
"unused75": 79,
"unused76": 80,
"unused77": 81,
"unused78": 82,
"unused79": 83,
"unused80": 84,
"unused81": 85,
"unused82": 86,
"unused83": 87,
"unused84": 88,
"unused85": 89,
"unused86": 90,
"unused87": 91,
"unused88": 92,
"unused89": 93,
"unused90": 94,
"unused91": 95,
"unused92": 96,
"unused93": 97,
"unused94": 98,
"unused95": 99,
"unused96": 100,
"unused97": 101,
"unused98": 102,
"unused99": 103,
" ": 104,
"!": 105,
"\"": 106,
"#": 107,
"$": 108,
"%": 109,
"&": 110,
"'": 111,
"(": 112,
")": 113,
"*": 114,
"+": 115,
",": 116,
"-": 117,
".": 118,
"/": 119,
"0": 120,
"1": 121,
"2": 122,
"3": 123,
"4": 124,
"5": 125,
"6": 126,
"7": 127,
"8": 128,
"9": 129,
":": 130,
";": 131,
"<": 132,
"=": 133,
">": 134,
"?": 135,
"@": 136,
"A": 137,
"B": 138,
"C": 139,
"D": 140,
"E": 141,
"F": 142,
"G": 143,
"H": 144,
"I": 145,
"J": 146,
"K": 147,
"L": 148,
"M": 149,
"N": 150,
"O": 151,
"P": 152,
"Q": 153,
"R": 154,
"S": 155,
"T": 156,
"U": 157,
"V": 158,
"W": 159,
"X": 160,
"Y": 161,
"Z": 162,
"[": 163,
"\\": 164,
"]": 165,
"^": 166,
"_": 167,
"a": 168,
"b": 169,
"c": 170,
"d": 171,
"e": 172,
"f": 173,
"g": 174,
"h": 175,
"i": 176,
"j": 177,
"k": 178,
"l": 179,
"m": 180,
"n": 181,
"o": 182,
"p": 183,
"q": 184,
"r": 185,
"s": 186,
"t": 187,
"u": 188,
"v": 189,
"w": 190,
"x": 191,
"y": 192,
"z": 193,
"{": 194,
"|": 195,
"}": 196,
"~": 197,
"«": 198,
"°": 199,
"·": 200,
"»": 201,
"é": 202,
"а": 203,
"в": 204,
"д": 205,
"е": 206,
"и": 207,
"к": 208,
"л": 209,
"м": 210,
"н": 211,
"о": 212,
"п": 213,
"р": 214,
"с": 215,
"т": 216,
"،": 217,
"؛": 218,
"؟": 219,
"ء": 220,
"آ": 221,
"أ": 222,
"ؤ": 223,
"إ": 224,
"ئ": 225,
"ا": 226,
"ب": 227,
"ة": 228,
"ت": 229,
"ث": 230,
"ج": 231,
"ح": 232,
"خ": 233,
"د": 234,
"ذ": 235,
"ر": 236,
"ز": 237,
"س": 238,
"ش": 239,
"ص": 240,
"ض": 241,
"ط": 242,
"ظ": 243,
"ع": 244,
"غ": 245,
"ـ": 246,
"ف": 247,
"ق": 248,
"ك": 249,
"ل": 250,
"م": 251,
"ن": 252,
"ه": 253,
"و": 254,
"ى": 255,
"ي": 256,
"ً": 257,
"ٌ": 258,
"ٍ": 259,
"َ": 260,
"ُ": 261,
"ِ": 262,
"ّ": 263,
"ْ": 264,
"٠": 265,
"١": 266,
"٢": 267,
"٣": 268,
"٤": 269,
"٥": 270,
"٦": 271,
"٧": 272,
"٨": 273,
"٩": 274,
"٪": 275,
"پ": 276,
"ک": 277,
"گ": 278,
"ھ": 279,
"ی": 280,
"​": 281,
"‌": 282,
"‎": 283,
"‏": 284,
"–": 285,
"—": 286,
"‘": 287,
"’": 288,
"“": 289,
"”": 290,
"•": 291,
"…": 292,
"‪": 293,
"‫": 294,
"‬": 295,
"‭": 296,
"‮": 297,
"﴾": 298,
"﴿": 299,
"�": 300,
"[MASK]": 301
},
"unk_token": "[UNK]"
}
}