adibvafa committed on
Commit
f8cf390
1 Parent(s): 905b4a8

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +9 -0
  2. tokenizer.json +250 -0
  3. tokenizer_config.json +54 -0
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[UNK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[CLS]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SEP]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[PAD]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "Sequence",
54
+ "normalizers": [
55
+ {
56
+ "type": "Lowercase"
57
+ }
58
+ ]
59
+ },
60
+ "pre_tokenizer": {
61
+ "type": "Sequence",
62
+ "pretokenizers": [
63
+ {
64
+ "type": "Split",
65
+ "pattern": {
66
+ "String": " "
67
+ },
68
+ "behavior": "Isolated",
69
+ "invert": false
70
+ },
71
+ {
72
+ "type": "Whitespace"
73
+ }
74
+ ]
75
+ },
76
+ "post_processor": {
77
+ "type": "TemplateProcessing",
78
+ "single": [
79
+ {
80
+ "SpecialToken": {
81
+ "id": "[CLS]",
82
+ "type_id": 0
83
+ }
84
+ },
85
+ {
86
+ "Sequence": {
87
+ "id": "A",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "SpecialToken": {
93
+ "id": "[SEP]",
94
+ "type_id": 0
95
+ }
96
+ }
97
+ ],
98
+ "pair": [
99
+ {
100
+ "SpecialToken": {
101
+ "id": "[CLS]",
102
+ "type_id": 0
103
+ }
104
+ },
105
+ {
106
+ "Sequence": {
107
+ "id": "A",
108
+ "type_id": 0
109
+ }
110
+ },
111
+ {
112
+ "SpecialToken": {
113
+ "id": "[SEP]",
114
+ "type_id": 0
115
+ }
116
+ },
117
+ {
118
+ "Sequence": {
119
+ "id": "B",
120
+ "type_id": 1
121
+ }
122
+ },
123
+ {
124
+ "SpecialToken": {
125
+ "id": "[SEP]",
126
+ "type_id": 1
127
+ }
128
+ }
129
+ ],
130
+ "special_tokens": {
131
+ "[CLS]": {
132
+ "id": "[CLS]",
133
+ "ids": [
134
+ 1
135
+ ],
136
+ "tokens": [
137
+ "[CLS]"
138
+ ]
139
+ },
140
+ "[SEP]": {
141
+ "id": "[SEP]",
142
+ "ids": [
143
+ 2
144
+ ],
145
+ "tokens": [
146
+ "[SEP]"
147
+ ]
148
+ }
149
+ }
150
+ },
151
+ "decoder": null,
152
+ "model": {
153
+ "type": "WordPiece",
154
+ "unk_token": "[UNK]",
155
+ "continuing_subword_prefix": "##",
156
+ "max_input_chars_per_word": 100,
157
+ "vocab": {
158
+ "[UNK]": 0,
159
+ "[CLS]": 1,
160
+ "[SEP]": 2,
161
+ "[PAD]": 3,
162
+ "[MASK]": 4,
163
+ "a_unk": 5,
164
+ "c_unk": 6,
165
+ "d_unk": 7,
166
+ "e_unk": 8,
167
+ "f_unk": 9,
168
+ "g_unk": 10,
169
+ "h_unk": 11,
170
+ "i_unk": 12,
171
+ "k_unk": 13,
172
+ "l_unk": 14,
173
+ "m_unk": 15,
174
+ "n_unk": 16,
175
+ "p_unk": 17,
176
+ "q_unk": 18,
177
+ "r_unk": 19,
178
+ "s_unk": 20,
179
+ "t_unk": 21,
180
+ "v_unk": 22,
181
+ "w_unk": 23,
182
+ "y_unk": 24,
183
+ "__unk": 25,
184
+ "k_aaa": 26,
185
+ "n_aac": 27,
186
+ "k_aag": 28,
187
+ "n_aat": 29,
188
+ "t_aca": 30,
189
+ "t_acc": 31,
190
+ "t_acg": 32,
191
+ "t_act": 33,
192
+ "r_aga": 34,
193
+ "s_agc": 35,
194
+ "r_agg": 36,
195
+ "s_agt": 37,
196
+ "i_ata": 38,
197
+ "i_atc": 39,
198
+ "m_atg": 40,
199
+ "i_att": 41,
200
+ "q_caa": 42,
201
+ "h_cac": 43,
202
+ "q_cag": 44,
203
+ "h_cat": 45,
204
+ "p_cca": 46,
205
+ "p_ccc": 47,
206
+ "p_ccg": 48,
207
+ "p_cct": 49,
208
+ "r_cga": 50,
209
+ "r_cgc": 51,
210
+ "r_cgg": 52,
211
+ "r_cgt": 53,
212
+ "l_cta": 54,
213
+ "l_ctc": 55,
214
+ "l_ctg": 56,
215
+ "l_ctt": 57,
216
+ "e_gaa": 58,
217
+ "d_gac": 59,
218
+ "e_gag": 60,
219
+ "d_gat": 61,
220
+ "a_gca": 62,
221
+ "a_gcc": 63,
222
+ "a_gcg": 64,
223
+ "a_gct": 65,
224
+ "g_gga": 66,
225
+ "g_ggc": 67,
226
+ "g_ggg": 68,
227
+ "g_ggt": 69,
228
+ "v_gta": 70,
229
+ "v_gtc": 71,
230
+ "v_gtg": 72,
231
+ "v_gtt": 73,
232
+ "__taa": 74,
233
+ "y_tac": 75,
234
+ "__tag": 76,
235
+ "y_tat": 77,
236
+ "s_tca": 78,
237
+ "s_tcc": 79,
238
+ "s_tcg": 80,
239
+ "s_tct": 81,
240
+ "__tga": 82,
241
+ "c_tgc": 83,
242
+ "w_tgg": 84,
243
+ "c_tgt": 85,
244
+ "l_tta": 86,
245
+ "f_ttc": 87,
246
+ "l_ttg": 88,
247
+ "f_ttt": 89
248
+ }
249
+ }
250
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "[CLS]",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "[CLS]",
47
+ "eos_token": "[SEP]",
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 1000000000000000019884624838656,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "tokenizer_class": "PreTrainedTokenizerFast",
53
+ "unk_token": "[UNK]"
54
+ }