sqrk committed on
Commit
c398318
1 Parent(s): e41445d

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +2 -2
  2. tokenizer_config.json +4 -4
  3. vocab.json +112 -77
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 79,
3
- "<s>": 78
4
  }
 
1
  {
2
+ "</s>": 114,
3
+ "<s>": 113
4
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "76": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "77": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "78": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "79": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "111": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "112": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "113": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": true,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "114": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": true,
vocab.json CHANGED
@@ -1,82 +1,117 @@
1
  {
2
  "ara": {
3
- "0": 1,
4
- "3": 2,
5
- "4": 3,
6
- "[": 4,
7
- "[PAD]": 77,
8
- "[UNK]": 76,
9
- "]": 5,
10
- "a": 6,
11
- "b": 7,
12
- "c": 8,
13
- "d": 9,
14
- "e": 10,
15
- "f": 11,
16
- "g": 12,
17
- "h": 13,
18
- "i": 14,
19
- "j": 15,
20
- "k": 16,
21
- "l": 17,
22
- "m": 18,
23
- "n": 19,
24
- "o": 20,
25
- "p": 21,
26
- "q": 22,
27
- "r": 23,
28
- "s": 24,
29
- "t": 25,
30
- "u": 26,
31
- "v": 27,
32
- "w": 28,
33
- "x": 29,
34
- "y": 30,
35
- "z": 31,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "|": 0,
37
- "؛": 32,
38
- "ء": 33,
39
- "آ": 34,
40
- "أ": 35,
41
- "ؤ": 36,
42
- "إ": 37,
43
- "ئ": 38,
44
- "ا": 39,
45
- "ب": 40,
46
- "ة": 41,
47
- "ت": 42,
48
- "ث": 43,
49
- "ج": 44,
50
- "ح": 45,
51
- "خ": 46,
52
- "د": 47,
53
- "ذ": 48,
54
- "ر": 49,
55
- "ز": 50,
56
- "س": 51,
57
- "ش": 52,
58
- "ص": 53,
59
- "ض": 54,
60
- "ط": 55,
61
- "ظ": 56,
62
- "ع": 57,
63
- "غ": 58,
64
- "ف": 59,
65
- "ق": 60,
66
- "ك": 61,
67
- "ل": 62,
68
- "م": 63,
69
- "ن": 64,
70
- "ه": 65,
71
- "و": 66,
72
- "ى": 67,
73
- "ي": 68,
74
- "ً": 69,
75
- "ٍ": 70,
76
- "َ": 71,
77
- "ُ": 72,
78
- "ِ": 73,
79
- "ی": 74,
80
- "": 75
 
 
 
 
81
  }
82
  }
 
1
  {
2
  "ara": {
3
+ "!": 1,
4
+ "'": 2,
5
+ ",": 3,
6
+ "-": 4,
7
+ ".": 5,
8
+ "0": 6,
9
+ "3": 7,
10
+ "4": 8,
11
+ "?": 9,
12
+ "A": 10,
13
+ "B": 11,
14
+ "C": 12,
15
+ "D": 13,
16
+ "E": 14,
17
+ "F": 15,
18
+ "G": 16,
19
+ "H": 17,
20
+ "I": 18,
21
+ "J": 19,
22
+ "K": 20,
23
+ "L": 21,
24
+ "M": 22,
25
+ "N": 23,
26
+ "O": 24,
27
+ "P": 25,
28
+ "R": 26,
29
+ "S": 27,
30
+ "T": 28,
31
+ "U": 29,
32
+ "V": 30,
33
+ "W": 31,
34
+ "X": 32,
35
+ "Y": 33,
36
+ "Z": 34,
37
+ "[": 35,
38
+ "[PAD]": 112,
39
+ "[UNK]": 111,
40
+ "]": 36,
41
+ "a": 37,
42
+ "b": 38,
43
+ "c": 39,
44
+ "d": 40,
45
+ "e": 41,
46
+ "f": 42,
47
+ "g": 43,
48
+ "h": 44,
49
+ "i": 45,
50
+ "j": 46,
51
+ "k": 47,
52
+ "l": 48,
53
+ "m": 49,
54
+ "n": 50,
55
+ "o": 51,
56
+ "p": 52,
57
+ "q": 53,
58
+ "r": 54,
59
+ "s": 55,
60
+ "t": 56,
61
+ "u": 57,
62
+ "v": 58,
63
+ "w": 59,
64
+ "x": 60,
65
+ "y": 61,
66
+ "z": 62,
67
  "|": 0,
68
+ "،": 63,
69
+ "؛": 64,
70
+ "؟": 65,
71
+ "ء": 66,
72
+ "آ": 67,
73
+ "أ": 68,
74
+ "ؤ": 69,
75
+ "إ": 70,
76
+ "ئ": 71,
77
+ "ا": 72,
78
+ "ب": 73,
79
+ "ة": 74,
80
+ "ت": 75,
81
+ "ث": 76,
82
+ "ج": 77,
83
+ "ح": 78,
84
+ "خ": 79,
85
+ "د": 80,
86
+ "ذ": 81,
87
+ "ر": 82,
88
+ "ز": 83,
89
+ "س": 84,
90
+ "ش": 85,
91
+ "ص": 86,
92
+ "ض": 87,
93
+ "ط": 88,
94
+ "ظ": 89,
95
+ "ع": 90,
96
+ "غ": 91,
97
+ "ف": 92,
98
+ "ق": 93,
99
+ "ك": 94,
100
+ "ل": 95,
101
+ "م": 96,
102
+ "ن": 97,
103
+ "ه": 98,
104
+ "و": 99,
105
+ "ى": 100,
106
+ "ي": 101,
107
+ "ً": 102,
108
+ "ٍ": 103,
109
+ "َ": 104,
110
+ "ُ": 105,
111
+ "ِ": 106,
112
+ "ّ": 107,
113
+ "ی": 108,
114
+ "–": 109,
115
+ "’": 110
116
  }
117
  }