guymorlan committed on
Commit
d5646eb
1 Parent(s): 9de7328

add tavbert-ar model

Browse files
Files changed (5) hide show
  1. config.json +24 -25
  2. model.safetensors +2 -2
  3. special_tokens_map.json +5 -42
  4. tokenizer.json +452 -0
  5. tokenizer_config.json +27 -21
config.json CHANGED
@@ -1,43 +1,42 @@
1
  {
2
- "_name_or_path": "/home/etherx/arabic2tashkeel/tashkeela_pretraining/checkpoint-84392",
3
  "architectures": [
4
- "TranslitModel"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
- "bos_token_id": 57344,
8
- "downsampling_rate": 4,
9
- "eos_token_id": 57345,
 
10
  "hidden_act": "gelu",
11
  "hidden_dropout_prob": 0.1,
12
  "hidden_size": 768,
13
  "id2label": {
14
- "0": "SHADDA",
15
- "1": "FATHA",
16
- "2": "KASRA",
17
- "3": "DAMMA",
18
- "4": "SUKKUN"
19
  },
20
  "initializer_range": 0.02,
21
  "intermediate_size": 3072,
22
  "label2id": {
23
- "SHADDA": 0,
24
- "FATHA": 1,
25
- "KASRA": 2,
26
- "DAMMA": 3,
27
- "SUKKUN": 4
28
  },
29
- "layer_norm_eps": 1e-12,
30
- "local_transformer_stride": 128,
31
- "max_position_embeddings": 16384,
32
- "model_type": "canine",
33
  "num_attention_heads": 12,
34
- "num_hash_buckets": 16384,
35
- "num_hash_functions": 8,
36
  "num_hidden_layers": 12,
37
- "pad_token_id": 0,
 
38
  "torch_dtype": "float32",
39
  "transformers_version": "4.38.1",
40
- "type_vocab_size": 16,
41
- "upsampling_kernel_size": 4,
42
- "use_cache": true
43
  }
 
1
  {
2
+ "_name_or_path": "tau/tavbert-ar",
3
  "architectures": [
4
+ "RobertaForTokenClassification"
5
  ],
6
  "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
  "hidden_act": "gelu",
12
  "hidden_dropout_prob": 0.1,
13
  "hidden_size": 768,
14
  "id2label": {
15
+ "0": "LABEL_0",
16
+ "1": "LABEL_1",
17
+ "2": "LABEL_2",
18
+ "3": "LABEL_3",
19
+ "4": "LABEL_4"
20
  },
21
  "initializer_range": 0.02,
22
  "intermediate_size": 3072,
23
  "label2id": {
24
+ "LABEL_0": 0,
25
+ "LABEL_1": 1,
26
+ "LABEL_2": 2,
27
+ "LABEL_3": 3,
28
+ "LABEL_4": 4
29
  },
30
+ "layer_norm_eps": 1e-05,
31
+ "max_position_embeddings": 2050,
32
+ "model_type": "roberta",
 
33
  "num_attention_heads": 12,
 
 
34
  "num_hidden_layers": 12,
35
+ "pad_token_id": 1,
36
+ "position_embedding_type": "absolute",
37
  "torch_dtype": "float32",
38
  "transformers_version": "4.38.1",
39
+ "type_vocab_size": 2,
40
+ "use_cache": true,
41
+ "vocab_size": 302
42
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a1ee9625ef023ef8e36f4292c8d8f979b80a5f66cf7ce44a395231e378145c9
3
- size 528377156
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81dbe43961509b495e3dbfbc084c8900fbe2c5a8b93c5bd169b7d204f462434d
3
+ size 349857052
special_tokens_map.json CHANGED
@@ -1,44 +1,7 @@
1
  {
2
- "bos_token": {
3
- "content": "",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "",
18
- "lstrip": false,
19
- "normalized": true,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "mask_token": {
24
- "content": "",
25
- "lstrip": true,
26
- "normalized": true,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "pad_token": {
31
- "content": "\u0000",
32
- "lstrip": false,
33
- "normalized": true,
34
- "rstrip": false,
35
- "single_word": false
36
- },
37
- "sep_token": {
38
- "content": "",
39
- "lstrip": false,
40
- "normalized": true,
41
- "rstrip": false,
42
- "single_word": false
43
- }
44
  }
 
1
  {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  }
tokenizer.json ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": {
5
+ "strategy": "BatchLongest",
6
+ "direction": "Right",
7
+ "pad_to_multiple_of": null,
8
+ "pad_id": 1,
9
+ "pad_type_id": 0,
10
+ "pad_token": "[PAD]"
11
+ },
12
+ "added_tokens": [
13
+ {
14
+ "id": 0,
15
+ "content": "[CLS]",
16
+ "single_word": false,
17
+ "lstrip": false,
18
+ "rstrip": false,
19
+ "normalized": false,
20
+ "special": true
21
+ },
22
+ {
23
+ "id": 1,
24
+ "content": "[PAD]",
25
+ "single_word": false,
26
+ "lstrip": false,
27
+ "rstrip": false,
28
+ "normalized": false,
29
+ "special": true
30
+ },
31
+ {
32
+ "id": 2,
33
+ "content": "[SEP]",
34
+ "single_word": false,
35
+ "lstrip": false,
36
+ "rstrip": false,
37
+ "normalized": false,
38
+ "special": true
39
+ },
40
+ {
41
+ "id": 3,
42
+ "content": "[UNK]",
43
+ "single_word": false,
44
+ "lstrip": false,
45
+ "rstrip": false,
46
+ "normalized": false,
47
+ "special": true
48
+ },
49
+ {
50
+ "id": 301,
51
+ "content": "[MASK]",
52
+ "single_word": false,
53
+ "lstrip": false,
54
+ "rstrip": false,
55
+ "normalized": false,
56
+ "special": true
57
+ }
58
+ ],
59
+ "normalizer": null,
60
+ "pre_tokenizer": {
61
+ "type": "Split",
62
+ "pattern": {
63
+ "String": ""
64
+ },
65
+ "behavior": "Isolated",
66
+ "invert": false
67
+ },
68
+ "post_processor": {
69
+ "type": "TemplateProcessing",
70
+ "single": [
71
+ {
72
+ "SpecialToken": {
73
+ "id": "[CLS]",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "Sequence": {
79
+ "id": "A",
80
+ "type_id": 0
81
+ }
82
+ },
83
+ {
84
+ "SpecialToken": {
85
+ "id": "[SEP]",
86
+ "type_id": 0
87
+ }
88
+ }
89
+ ],
90
+ "pair": [
91
+ {
92
+ "SpecialToken": {
93
+ "id": "[CLS]",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "Sequence": {
99
+ "id": "A",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "SpecialToken": {
105
+ "id": "[SEP]",
106
+ "type_id": 0
107
+ }
108
+ },
109
+ {
110
+ "Sequence": {
111
+ "id": "B",
112
+ "type_id": 1
113
+ }
114
+ },
115
+ {
116
+ "SpecialToken": {
117
+ "id": "[SEP]",
118
+ "type_id": 1
119
+ }
120
+ }
121
+ ],
122
+ "special_tokens": {
123
+ "[CLS]": {
124
+ "id": "[CLS]",
125
+ "ids": [
126
+ 0
127
+ ],
128
+ "tokens": [
129
+ "[CLS]"
130
+ ]
131
+ },
132
+ "[SEP]": {
133
+ "id": "[SEP]",
134
+ "ids": [
135
+ 2
136
+ ],
137
+ "tokens": [
138
+ "[SEP]"
139
+ ]
140
+ }
141
+ }
142
+ },
143
+ "decoder": null,
144
+ "model": {
145
+ "type": "WordLevel",
146
+ "vocab": {
147
+ "[CLS]": 0,
148
+ "[PAD]": 1,
149
+ "[SEP]": 2,
150
+ "[UNK]": 3,
151
+ "unused0": 4,
152
+ "unused1": 5,
153
+ "unused2": 6,
154
+ "unused3": 7,
155
+ "unused4": 8,
156
+ "unused5": 9,
157
+ "unused6": 10,
158
+ "unused7": 11,
159
+ "unused8": 12,
160
+ "unused9": 13,
161
+ "unused10": 14,
162
+ "unused11": 15,
163
+ "unused12": 16,
164
+ "unused13": 17,
165
+ "unused14": 18,
166
+ "unused15": 19,
167
+ "unused16": 20,
168
+ "unused17": 21,
169
+ "unused18": 22,
170
+ "unused19": 23,
171
+ "unused20": 24,
172
+ "unused21": 25,
173
+ "unused22": 26,
174
+ "unused23": 27,
175
+ "unused24": 28,
176
+ "unused25": 29,
177
+ "unused26": 30,
178
+ "unused27": 31,
179
+ "unused28": 32,
180
+ "unused29": 33,
181
+ "unused30": 34,
182
+ "unused31": 35,
183
+ "unused32": 36,
184
+ "unused33": 37,
185
+ "unused34": 38,
186
+ "unused35": 39,
187
+ "unused36": 40,
188
+ "unused37": 41,
189
+ "unused38": 42,
190
+ "unused39": 43,
191
+ "unused40": 44,
192
+ "unused41": 45,
193
+ "unused42": 46,
194
+ "unused43": 47,
195
+ "unused44": 48,
196
+ "unused45": 49,
197
+ "unused46": 50,
198
+ "unused47": 51,
199
+ "unused48": 52,
200
+ "unused49": 53,
201
+ "unused50": 54,
202
+ "unused51": 55,
203
+ "unused52": 56,
204
+ "unused53": 57,
205
+ "unused54": 58,
206
+ "unused55": 59,
207
+ "unused56": 60,
208
+ "unused57": 61,
209
+ "unused58": 62,
210
+ "unused59": 63,
211
+ "unused60": 64,
212
+ "unused61": 65,
213
+ "unused62": 66,
214
+ "unused63": 67,
215
+ "unused64": 68,
216
+ "unused65": 69,
217
+ "unused66": 70,
218
+ "unused67": 71,
219
+ "unused68": 72,
220
+ "unused69": 73,
221
+ "unused70": 74,
222
+ "unused71": 75,
223
+ "unused72": 76,
224
+ "unused73": 77,
225
+ "unused74": 78,
226
+ "unused75": 79,
227
+ "unused76": 80,
228
+ "unused77": 81,
229
+ "unused78": 82,
230
+ "unused79": 83,
231
+ "unused80": 84,
232
+ "unused81": 85,
233
+ "unused82": 86,
234
+ "unused83": 87,
235
+ "unused84": 88,
236
+ "unused85": 89,
237
+ "unused86": 90,
238
+ "unused87": 91,
239
+ "unused88": 92,
240
+ "unused89": 93,
241
+ "unused90": 94,
242
+ "unused91": 95,
243
+ "unused92": 96,
244
+ "unused93": 97,
245
+ "unused94": 98,
246
+ "unused95": 99,
247
+ "unused96": 100,
248
+ "unused97": 101,
249
+ "unused98": 102,
250
+ "unused99": 103,
251
+ " ": 104,
252
+ "!": 105,
253
+ "\"": 106,
254
+ "#": 107,
255
+ "$": 108,
256
+ "%": 109,
257
+ "&": 110,
258
+ "'": 111,
259
+ "(": 112,
260
+ ")": 113,
261
+ "*": 114,
262
+ "+": 115,
263
+ ",": 116,
264
+ "-": 117,
265
+ ".": 118,
266
+ "/": 119,
267
+ "0": 120,
268
+ "1": 121,
269
+ "2": 122,
270
+ "3": 123,
271
+ "4": 124,
272
+ "5": 125,
273
+ "6": 126,
274
+ "7": 127,
275
+ "8": 128,
276
+ "9": 129,
277
+ ":": 130,
278
+ ";": 131,
279
+ "<": 132,
280
+ "=": 133,
281
+ ">": 134,
282
+ "?": 135,
283
+ "@": 136,
284
+ "A": 137,
285
+ "B": 138,
286
+ "C": 139,
287
+ "D": 140,
288
+ "E": 141,
289
+ "F": 142,
290
+ "G": 143,
291
+ "H": 144,
292
+ "I": 145,
293
+ "J": 146,
294
+ "K": 147,
295
+ "L": 148,
296
+ "M": 149,
297
+ "N": 150,
298
+ "O": 151,
299
+ "P": 152,
300
+ "Q": 153,
301
+ "R": 154,
302
+ "S": 155,
303
+ "T": 156,
304
+ "U": 157,
305
+ "V": 158,
306
+ "W": 159,
307
+ "X": 160,
308
+ "Y": 161,
309
+ "Z": 162,
310
+ "[": 163,
311
+ "\\": 164,
312
+ "]": 165,
313
+ "^": 166,
314
+ "_": 167,
315
+ "a": 168,
316
+ "b": 169,
317
+ "c": 170,
318
+ "d": 171,
319
+ "e": 172,
320
+ "f": 173,
321
+ "g": 174,
322
+ "h": 175,
323
+ "i": 176,
324
+ "j": 177,
325
+ "k": 178,
326
+ "l": 179,
327
+ "m": 180,
328
+ "n": 181,
329
+ "o": 182,
330
+ "p": 183,
331
+ "q": 184,
332
+ "r": 185,
333
+ "s": 186,
334
+ "t": 187,
335
+ "u": 188,
336
+ "v": 189,
337
+ "w": 190,
338
+ "x": 191,
339
+ "y": 192,
340
+ "z": 193,
341
+ "{": 194,
342
+ "|": 195,
343
+ "}": 196,
344
+ "~": 197,
345
+ "«": 198,
346
+ "°": 199,
347
+ "·": 200,
348
+ "»": 201,
349
+ "é": 202,
350
+ "а": 203,
351
+ "в": 204,
352
+ "д": 205,
353
+ "е": 206,
354
+ "и": 207,
355
+ "к": 208,
356
+ "л": 209,
357
+ "м": 210,
358
+ "н": 211,
359
+ "о": 212,
360
+ "п": 213,
361
+ "р": 214,
362
+ "с": 215,
363
+ "т": 216,
364
+ "،": 217,
365
+ "؛": 218,
366
+ "؟": 219,
367
+ "ء": 220,
368
+ "آ": 221,
369
+ "أ": 222,
370
+ "ؤ": 223,
371
+ "إ": 224,
372
+ "ئ": 225,
373
+ "ا": 226,
374
+ "ب": 227,
375
+ "ة": 228,
376
+ "ت": 229,
377
+ "ث": 230,
378
+ "ج": 231,
379
+ "ح": 232,
380
+ "خ": 233,
381
+ "د": 234,
382
+ "ذ": 235,
383
+ "ر": 236,
384
+ "ز": 237,
385
+ "س": 238,
386
+ "ش": 239,
387
+ "ص": 240,
388
+ "ض": 241,
389
+ "ط": 242,
390
+ "ظ": 243,
391
+ "ع": 244,
392
+ "غ": 245,
393
+ "ـ": 246,
394
+ "ف": 247,
395
+ "ق": 248,
396
+ "ك": 249,
397
+ "ل": 250,
398
+ "م": 251,
399
+ "ن": 252,
400
+ "ه": 253,
401
+ "و": 254,
402
+ "ى": 255,
403
+ "ي": 256,
404
+ "ً": 257,
405
+ "ٌ": 258,
406
+ "ٍ": 259,
407
+ "َ": 260,
408
+ "ُ": 261,
409
+ "ِ": 262,
410
+ "ّ": 263,
411
+ "ْ": 264,
412
+ "٠": 265,
413
+ "١": 266,
414
+ "٢": 267,
415
+ "٣": 268,
416
+ "٤": 269,
417
+ "٥": 270,
418
+ "٦": 271,
419
+ "٧": 272,
420
+ "٨": 273,
421
+ "٩": 274,
422
+ "٪": 275,
423
+ "پ": 276,
424
+ "ک": 277,
425
+ "گ": 278,
426
+ "ھ": 279,
427
+ "ی": 280,
428
+ "​": 281,
429
+ "‌": 282,
430
+ "‎": 283,
431
+ "‏": 284,
432
+ "–": 285,
433
+ "—": 286,
434
+ "‘": 287,
435
+ "’": 288,
436
+ "“": 289,
437
+ "”": 290,
438
+ "•": 291,
439
+ "…": 292,
440
+ "‪": 293,
441
+ "‫": 294,
442
+ "‬": 295,
443
+ "‭": 296,
444
+ "‮": 297,
445
+ "﴾": 298,
446
+ "﴿": 299,
447
+ "�": 300,
448
+ "[MASK]": 301
449
+ },
450
+ "unk_token": "[UNK]"
451
+ }
452
+ }
tokenizer_config.json CHANGED
@@ -1,46 +1,52 @@
1
  {
2
- "add_prefix_space": false,
3
  "added_tokens_decoder": {
4
  "0": {
5
- "content": "\u0000",
6
  "lstrip": false,
7
- "normalized": true,
8
  "rstrip": false,
9
  "single_word": false,
10
  "special": true
11
  },
12
- "57344": {
13
- "content": "",
14
  "lstrip": false,
15
- "normalized": true,
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
19
  },
20
- "57345": {
21
- "content": "",
22
  "lstrip": false,
23
- "normalized": true,
24
  "rstrip": false,
25
  "single_word": false,
26
  "special": true
27
  },
28
- "57347": {
29
- "content": "",
30
- "lstrip": true,
31
- "normalized": true,
 
 
 
 
 
 
 
 
32
  "rstrip": false,
33
  "single_word": false,
34
  "special": true
35
  }
36
  },
37
- "bos_token": "",
38
  "clean_up_tokenization_spaces": true,
39
- "cls_token": "",
40
- "eos_token": "",
41
- "mask_token": "",
42
- "model_max_length": 2048,
43
- "pad_token": "\u0000",
44
- "sep_token": "",
45
- "tokenizer_class": "CanineTokenizer"
46
  }
 
1
  {
 
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "[CLS]",
5
  "lstrip": false,
6
+ "normalized": false,
7
  "rstrip": false,
8
  "single_word": false,
9
  "special": true
10
  },
11
+ "1": {
12
+ "content": "[PAD]",
13
  "lstrip": false,
14
+ "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
+ "2": {
20
+ "content": "[SEP]",
21
  "lstrip": false,
22
+ "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "3": {
28
+ "content": "[UNK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "301": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  }
43
  },
 
44
  "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "mask_token": "[MASK]",
47
+ "model_max_length": 1000000000000000019884624838656,
48
+ "pad_token": "[PAD]",
49
+ "sep_token": "[SEP]",
50
+ "tokenizer_class": "PreTrainedTokenizerFast",
51
+ "unk_token": "[UNK]"
52
  }