MARBERT / tokenizer.json
lamaabdulaziz's picture
Upload tokenizer
c907789
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": true
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"ء": 5,
"ا": 6,
"ب": 7,
"ت": 8,
"ث": 9,
"ج": 10,
"ح": 11,
"خ": 12,
"د": 13,
"ذ": 14,
"ر": 15,
"ز": 16,
"س": 17,
"ش": 18,
"ص": 19,
"ض": 20,
"ط": 21,
"ظ": 22,
"ع": 23,
"غ": 24,
"ف": 25,
"ق": 26,
"ك": 27,
"ل": 28,
"م": 29,
"ن": 30,
"ه": 31,
"و": 32,
"ي": 33,
"پ": 34,
"ލ": 35,
"##ل": 36,
"##ب": 37,
"##ر": 38,
"##ق": 39,
"##ي": 40,
"##ه": 41,
"##ن": 42,
"##ج": 43,
"##غ": 44,
"##ع": 45,
"##ض": 46,
"##ح": 47,
"##ك": 48,
"##و": 49,
"##م": 50,
"##ت": 51,
"##ش": 52,
"##ا": 53,
"##خ": 54,
"##ف": 55,
"##ث": 56,
"##ز": 57,
"##د": 58,
"##ء": 59,
"##س": 60,
"##ظ": 61,
"##ط": 62,
"##ذ": 63,
"##ص": 64,
"ال": 65,
"##ال": 66,
"##يه": 67,
"الم": 68,
"الا": 69,
"##ات": 70,
"##ان": 71,
"##ري": 72,
"##لي": 73,
"##اء": 74,
"##ار": 75,
"##ام": 76,
"ان": 77,
"##ين": 78,
"##اد": 79,
"##ير": 80,
"##اب": 81,
"##ول": 82,
"علي": 83,
"##ون": 84,
"##ها": 85,
"الع": 86,
"##اع": 87,
"وال": 88,
"##ست": 89,
"الس": 90,
"الي": 91,
"لل": 92,
"الت": 93,
"##ور": 94,
"##اس": 95,
"##اف": 96,
"الج": 97,
"##مه": 98,
"##يد": 99,
"الح": 100,
"الق": 101,
"##رب": 102,
"##وا": 103,
"##يا": 104,
"الف": 105,
"##ره": 106,
"بال": 107,
"##له": 108,
"##ود": 109,
"##را": 110,
"##وم": 111,
"الد": 112,
"##لا": 113,
"##هم": 114,
"الش": 115,
"وا": 116,
"##حد": 117,
"##يس": 118,
"##نا": 119,
"الب": 120,
"##قه": 121,
"##يل": 122,
"##من": 123,
"##عه": 124,
"الر": 125,
"##قي": 126,
"##رك": 127,
"##نت": 128,
"##اه": 129,
"وت": 130,
"##مد": 131,
"##قد": 132,
"##في": 133,
"##وي": 134,
"الو": 135,
"##اره": 136,
"الن": 137,
"##مل": 138,
"##مر": 139,
"##ته": 140,
"##لس": 141,
"اع": 142,
"##يم": 143,
"وق": 144,
"الاس": 145,
"##راء": 146,
"##وري": 147,
"الخ": 148,
"مح": 149,
"##جه": 150,
"##ءيس": 151,
"##بي": 152,
"##به": 153,
"##ني": 154,
"##صر": 155,
"##عد": 156,
"##كن": 157,
"##وق": 158,
"ام": 159,
"##لال": 160,
"##هد": 161,
"##وس": 162,
"اس": 163,
"##بد": 164,
"##بر": 165,
"##حه": 166,
"##تي": 167,
"##لام": 168,
"##مال": 169,
"##لم": 170,
"الص": 171,
"الث": 172,
"##كر": 173,
"##تم": 174,
"##فا": 175,
"من": 176,
"##ده": 177,
"المت": 178,
"است": 179,
"الام": 180,
"##وله": 181,
"##اني": 182,
"##قت": 183,
"##ما": 184,
"##ريق": 185,
"##حت": 186,
"الك": 187,
"##سي": 188,
"اي": 189,
"##قل": 190,
"##جم": 191,
"##با": 192,
"##اص": 193,
"##دي": 194,
"##فه": 195,
"او": 196,
"##ضي": 197,
"##وع": 198,
"اك": 199,
"بن": 200,
"##وات": 201,
"##شر": 202,
"##طه": 203,
"##كه": 204,
"##بار": 205,
"##زي": 206,
"##نه": 207,
"مس": 208,
"##تح": 209,
"##لك": 210,
"وك": 211,
"وي": 212,
"اب": 213,
"اخ": 214,
"##وض": 215,
"خلال": 216,
"##ادي": 217,
"##عت": 218,
"##شار": 219,
"##صل": 220,
"##الي": 221,
"##قب": 222,
"سي": 223,
"##اله": 224,
"##رت": 225,
"##اري": 226,
"وم": 227,
"وقال": 228,
"الل": 229,
"المس": 230,
"الان": 231,
"##اده": 232,
"##ولي": 233,
"انه": 234,
"##خل": 235,
"##هر": 236,
"مد": 237,
"##اج": 238,
"عبد": 239,
"##دد": 240,
"##زاء": 241,
"##وب": 242,
"##يره": 243,
"اج": 244,
"##دم": 245,
"##عود": 246,
"مع": 247,
"مت": 248,
"##قا": 249,
"##وف": 250,
"##اي": 251,
"##وره": 252,
"##حي": 253,
"العام": 254,
"المن": 255,
"بر": 256,
"##رض": 257,
"##انيه": 258,
"لم": 259,
"##سم": 260,
"##صري": 261,
"وز": 262,
"تع": 263,
"##طر": 264,
"##كو": 265,
"##ديد": 266,
"بد": 267,
"##ضاف": 268,
"المد": 269,
"##كل": 270,
"الاخ": 271,
"##ريك": 272,
"##جلس": 273,
"##كون": 274,
"##اح": 275,
"##عب": 276,
"##تر": 277,
"##حده": 278,
"##انت": 279,
"اليوم": 280,
"##خص": 281,
"##طين": 282,
"والم": 283,
"##زه": 284,
"وب": 285,
"اف": 286,
"##ثر": 287,
"##سه": 288,
"العرب": 289,
"##ويه": 290,
"تم": 291,
"لت": 292,
"الرءيس": 293,
"الشر": 294,
"##طل": 295,
"##ينه": 296,
"##سب": 297,
"##ند": 298,
"محمد": 299,
"رءيس": 300,
"عام": 301,
"##عا": 302,
"##طقه": 303,
"##لسطين": 304,
"##عل": 305,
"##وج": 306,
"وان": 307,
"##الم": 308,
"##وز": 309,
"الجزاء": 310,
"بم": 311,
"##صد": 312,
"يت": 313,
"##قر": 314,
"##ابه": 315,
"##نظ": 316,
"##يش": 317,
"##اعه": 318,
"##يين": 319,
"##يب": 320,
"##اك": 321,
"اح": 322,
"##تها": 323,
"مر": 324,
"##اعب": 325,
"قال": 326,
"##ركه": 327,
"اله": 328,
"##از": 329,
"##طن": 330,
"الط": 331,
"##تل": 332,
"المح": 333,
"الز": 334,
"وح": 335,
"##عم": 336,
"وس": 337,
"الله": 338,
"الغ": 339,
"اد": 340,
"##قات": 341,
"##رف": 342,
"##وه": 343,
"المع": 344,
"##ارات": 345,
"يوم": 346,
"حس": 347,
"##وان": 348,
"وع": 349,
"##نتخ": 350,
"بان": 351,
"مء": 352,
"##ليه": 353,
"##ذا": 354,
"بت": 355,
"##رين": 356,
"##كومه": 357,
"##فر": 358,
"للم": 359,
"بش": 360,
"##كت": 361,
"تح": 362,
"##ذلك": 363,
"وج": 364,
"الما": 365,
"##ءه": 366,
"فر": 367,
"مست": 368,
"##يان": 369,
"##لاث": 370,
"##يلي": 371,
"الاست": 372,
"بل": 373,
"##ناء": 374,
"المتحده": 375,
"##قيه": 376,
"مش": 377,
"##قاء": 378,
"##زال": 379,
"الاه": 380,
"##يع": 381,
"##اخل": 382,
"##اليه": 383,
"السعود": 384,
"##وريا": 385,
"المر": 386,
"الامريك": 387,
"وه": 388,
"وفي": 389,
"##ضاء": 390,
"##فت": 391,
"##ارج": 392,
"با": 393,
"تق": 394,
"الوز": 395,
"##ضه": 396,
"##حدث": 397,
"ات": 398,
"البل": 399,
"الجم": 400,
"##باراه": 401,
"##اسه": 402,
"##رات": 403,
"##ابع": 404,
"الامن": 405,
"جم": 406,
"الاول": 407,
"بح": 408,
"وف": 409,
"##راءيل": 410,
"##عي": 411,
"##كري": 412,
"##كم": 413,
"مصر": 414,
"##قط": 415,
"##واجه": 416,
"##مالك": 417,
"##رد": 418,
"##وريه": 419,
"##بيه": 420,
"##نس": 421,
"احد": 422,
"الاسلام": 423,
"الاهلي": 424,
"عدد": 425,
"الاع": 426,
"اكثر": 427,
"الجزاءر": 428,
"##ليم": 429,
"قر": 430,
"يكن": 431,
"##ضيه": 432,
"##طال": 433,
"##اصه": 434,
"##ركز": 435,
"##بل": 436,
"العالم": 437,
"الفلسطين": 438,
"حتي": 439,
"يع": 440,
"##زب": 441,
"##بت": 442,
"##يران": 443,
"الفر": 444,
"اق": 445,
"والت": 446,
"##فع": 447,
"##هدف": 448,
"المست": 449,
"مجلس": 450,
"اعل": 451,
"امام": 452,
"##نظيم": 453,
"ون": 454,
"##دا": 455,
"##قبل": 456,
"لا": 457,
"ول": 458,
"اص": 459,
"موق": 460,
"##جل": 461,
"المصري": 462,
"الدول": 463,
"##تحاد": 464,
"##ربع": 465,
"##وت": 466,
"خط": 467,
"##نيه": 468,
"الماضي": 469,
"مص": 470,
"##جد": 471,
"##امه": 472,
"##دري": 473,
"##ضع": 474,
"واضاف": 475,
"التح": 476,
"##ابات": 477,
"السل": 478,
"حم": 479,
"##صف": 480,
"الاف": 481,
"##ظام": 482,
"##انه": 483,
"مخ": 484,
"##اءل": 485,
"##يف": 486,
"تر": 487,
"##ضا": 488,
"##غرب": 489,
"##ملكه": 490,
"قاء": 491,
"تش": 492,
"السي": 493,
"تزال": 494,
"سوريا": 495,
"##جي": 496,
"انت": 497,
"##اسيه": 498,
"##اسي": 499
}
}
}