sqrk committed on
Commit
e7a5eba
·
verified ·
1 Parent(s): 1f3a246

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +2 -2
  2. tokenizer_config.json +4 -4
  3. vocab.json +149 -1
added_tokens.json CHANGED
@@ -1,4 +1,4 @@
1
  {
2
- "</s>": 90,
3
- "<s>": 89
4
  }
 
1
  {
2
+ "</s>": 146,
3
+ "<s>": 145
4
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "87": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": false
10
  },
11
- "88": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": false
18
  },
19
- "89": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "90": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "143": {
4
  "content": "[UNK]",
5
  "lstrip": true,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": false
10
  },
11
+ "144": {
12
  "content": "[PAD]",
13
  "lstrip": true,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": false
18
  },
19
+ "145": {
20
  "content": "<s>",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "146": {
28
  "content": "</s>",
29
  "lstrip": false,
30
  "normalized": false,
vocab.json CHANGED
@@ -1 +1,149 @@
1
- {"ara": {"@": 1, "\\": 2, "a": 3, "b": 4, "c": 5, "d": 6, "e": 7, "f": 8, "g": 9, "h": 10, "i": 11, "j": 12, "k": 13, "l": 14, "m": 15, "n": 16, "o": 17, "p": 18, "q": 19, "r": 20, "s": 21, "t": 22, "u": 23, "v": 24, "w": 25, "x": 26, "y": 27, "z": 28, "\u00b7": 29, "\u00e0": 30, "\u00e5": 31, "\u00e7": 32, "\u00e8": 33, "\u00e9": 34, "\u00ea": 35, "\u00fb": 36, "\u0307": 37, "\u061b": 38, "\u0621": 39, "\u0622": 40, "\u0623": 41, "\u0624": 42, "\u0625": 43, "\u0626": 44, "\u0627": 45, "\u0628": 46, "\u0629": 47, "\u062a": 48, "\u062b": 49, "\u062c": 50, "\u062d": 51, "\u062e": 52, "\u062f": 53, "\u0630": 54, "\u0631": 55, "\u0632": 56, "\u0633": 57, "\u0634": 58, "\u0635": 59, "\u0636": 60, "\u0637": 61, "\u0638": 62, "\u0639": 63, "\u063a": 64, "\u0640": 65, "\u0641": 66, "\u0642": 67, "\u0643": 68, "\u0644": 69, "\u0645": 70, "\u0646": 71, "\u0647": 72, "\u0648": 73, "\u0649": 74, "\u064a": 75, "\u064b": 76, "\u064c": 77, "\u064d": 78, "\u064e": 79, "\u064f": 80, "\u0650": 81, "\u0651": 82, "\u0663": 83, "\u0665": 84, "\u0667": 85, "\u2019": 86, "|": 0, "[UNK]": 87, "[PAD]": 88}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ara": {
3
+ "!": 1,
4
+ "\"": 2,
5
+ "#": 3,
6
+ "%": 4,
7
+ "&": 5,
8
+ "'": 6,
9
+ "(": 7,
10
+ ")": 8,
11
+ "+": 9,
12
+ ",": 10,
13
+ "-": 11,
14
+ ".": 12,
15
+ "/": 13,
16
+ "0": 14,
17
+ "1": 15,
18
+ "2": 16,
19
+ "3": 17,
20
+ "4": 18,
21
+ "5": 19,
22
+ "6": 20,
23
+ "7": 21,
24
+ "8": 22,
25
+ "9": 23,
26
+ ":": 24,
27
+ ";": 25,
28
+ "?": 26,
29
+ "@": 27,
30
+ "A": 28,
31
+ "B": 29,
32
+ "C": 30,
33
+ "D": 31,
34
+ "E": 32,
35
+ "F": 33,
36
+ "G": 34,
37
+ "H": 35,
38
+ "I": 36,
39
+ "J": 37,
40
+ "K": 38,
41
+ "L": 39,
42
+ "M": 40,
43
+ "N": 41,
44
+ "O": 42,
45
+ "P": 43,
46
+ "Q": 44,
47
+ "R": 45,
48
+ "S": 46,
49
+ "T": 47,
50
+ "U": 48,
51
+ "V": 49,
52
+ "W": 50,
53
+ "X": 51,
54
+ "Y": 52,
55
+ "Z": 53,
56
+ "[PAD]": 144,
57
+ "[UNK]": 143,
58
+ "\\": 54,
59
+ "a": 55,
60
+ "b": 56,
61
+ "c": 57,
62
+ "d": 58,
63
+ "e": 59,
64
+ "f": 60,
65
+ "g": 61,
66
+ "h": 62,
67
+ "i": 63,
68
+ "j": 64,
69
+ "k": 65,
70
+ "l": 66,
71
+ "m": 67,
72
+ "n": 68,
73
+ "o": 69,
74
+ "p": 70,
75
+ "q": 71,
76
+ "r": 72,
77
+ "s": 73,
78
+ "t": 74,
79
+ "u": 75,
80
+ "v": 76,
81
+ "w": 77,
82
+ "x": 78,
83
+ "y": 79,
84
+ "z": 80,
85
+ "|": 0,
86
+ "·": 81,
87
+ "à": 82,
88
+ "å": 83,
89
+ "ç": 84,
90
+ "è": 85,
91
+ "é": 86,
92
+ "ê": 87,
93
+ "û": 88,
94
+ "İ": 89,
95
+ "،": 90,
96
+ "؛": 91,
97
+ "؟": 92,
98
+ "ء": 93,
99
+ "آ": 94,
100
+ "أ": 95,
101
+ "ؤ": 96,
102
+ "إ": 97,
103
+ "ئ": 98,
104
+ "ا": 99,
105
+ "ب": 100,
106
+ "ة": 101,
107
+ "ت": 102,
108
+ "ث": 103,
109
+ "ج": 104,
110
+ "ح": 105,
111
+ "خ": 106,
112
+ "د": 107,
113
+ "ذ": 108,
114
+ "ر": 109,
115
+ "ز": 110,
116
+ "س": 111,
117
+ "ش": 112,
118
+ "ص": 113,
119
+ "ض": 114,
120
+ "ط": 115,
121
+ "ظ": 116,
122
+ "ع": 117,
123
+ "غ": 118,
124
+ "ـ": 119,
125
+ "ف": 120,
126
+ "ق": 121,
127
+ "ك": 122,
128
+ "ل": 123,
129
+ "م": 124,
130
+ "ن": 125,
131
+ "ه": 126,
132
+ "و": 127,
133
+ "ى": 128,
134
+ "ي": 129,
135
+ "ً": 130,
136
+ "ٌ": 131,
137
+ "ٍ": 132,
138
+ "َ": 133,
139
+ "ُ": 134,
140
+ "ِ": 135,
141
+ "ّ": 136,
142
+ "٣": 137,
143
+ "٥": 138,
144
+ "٧": 139,
145
+ "’": 140,
146
+ "“": 141,
147
+ "”": 142
148
+ }
149
+ }