English
pszemraj committed on
Commit
ba51bc8
1 Parent(s): affb7f0

update tokenizer

Browse files
Files changed (4) hide show
  1. README.md +2 -2
  2. special_tokens_map.json +7 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +34 -41
README.md CHANGED
@@ -1,9 +1,9 @@
1
  ---
 
 
2
  language:
3
  - en
4
  license: apache-2.0
5
- datasets:
6
- - BEE-spoke-data/bees-internal
7
  ---
8
 
9
  # BeeTokenizer
 
1
  ---
2
+ datasets:
3
+ - BEE-spoke-data/bees-internal
4
  language:
5
  - en
6
  license: apache-2.0
 
 
7
  ---
8
 
9
  # BeeTokenizer
special_tokens_map.json CHANGED
@@ -1,4 +1,11 @@
1
  {
 
 
 
 
 
 
 
2
  "cls_token": {
3
  "content": "<cls>",
4
  "lstrip": false,
 
1
  {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
  "cls_token": {
10
  "content": "<cls>",
11
  "lstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
- "content": "<s>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
@@ -17,14 +17,6 @@
17
  "special": true
18
  },
19
  "2": {
20
- "content": "</s>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
  "content": "<unk>",
29
  "lstrip": false,
30
  "normalized": false,
@@ -32,7 +24,7 @@
32
  "single_word": false,
33
  "special": true
34
  },
35
- "4": {
36
  "content": "<cls>",
37
  "lstrip": false,
38
  "normalized": false,
@@ -40,7 +32,7 @@
40
  "single_word": false,
41
  "special": true
42
  },
43
- "5": {
44
  "content": "<sep>",
45
  "lstrip": false,
46
  "normalized": false,
@@ -48,7 +40,7 @@
48
  "single_word": false,
49
  "special": true
50
  },
51
- "6": {
52
  "content": "<mask>",
53
  "lstrip": false,
54
  "normalized": false,
@@ -56,7 +48,7 @@
56
  "single_word": false,
57
  "special": true
58
  },
59
- "7": {
60
  "content": "<|endoftext|>",
61
  "lstrip": false,
62
  "normalized": false,
@@ -64,7 +56,7 @@
64
  "single_word": false,
65
  "special": true
66
  },
67
- "32072": {
68
  "content": " ",
69
  "lstrip": false,
70
  "normalized": true,
@@ -72,7 +64,7 @@
72
  "single_word": false,
73
  "special": false
74
  },
75
- "32073": {
76
  "content": " ",
77
  "lstrip": false,
78
  "normalized": true,
@@ -80,7 +72,7 @@
80
  "single_word": false,
81
  "special": false
82
  },
83
- "32074": {
84
  "content": " ",
85
  "lstrip": false,
86
  "normalized": true,
@@ -88,7 +80,7 @@
88
  "single_word": false,
89
  "special": false
90
  },
91
- "32075": {
92
  "content": " ",
93
  "lstrip": false,
94
  "normalized": true,
@@ -96,7 +88,7 @@
96
  "single_word": false,
97
  "special": false
98
  },
99
- "32076": {
100
  "content": " ",
101
  "lstrip": false,
102
  "normalized": true,
@@ -104,7 +96,7 @@
104
  "single_word": false,
105
  "special": false
106
  },
107
- "32077": {
108
  "content": " ",
109
  "lstrip": false,
110
  "normalized": true,
@@ -112,7 +104,7 @@
112
  "single_word": false,
113
  "special": false
114
  },
115
- "32078": {
116
  "content": " ",
117
  "lstrip": false,
118
  "normalized": true,
@@ -120,7 +112,7 @@
120
  "single_word": false,
121
  "special": false
122
  },
123
- "32079": {
124
  "content": " ",
125
  "lstrip": false,
126
  "normalized": true,
@@ -128,7 +120,7 @@
128
  "single_word": false,
129
  "special": false
130
  },
131
- "32080": {
132
  "content": " ",
133
  "lstrip": false,
134
  "normalized": true,
@@ -136,7 +128,7 @@
136
  "single_word": false,
137
  "special": false
138
  },
139
- "32081": {
140
  "content": " ",
141
  "lstrip": false,
142
  "normalized": true,
@@ -144,7 +136,7 @@
144
  "single_word": false,
145
  "special": false
146
  },
147
- "32082": {
148
  "content": " ",
149
  "lstrip": false,
150
  "normalized": true,
@@ -152,7 +144,7 @@
152
  "single_word": false,
153
  "special": false
154
  },
155
- "32083": {
156
  "content": " ",
157
  "lstrip": false,
158
  "normalized": true,
@@ -160,7 +152,7 @@
160
  "single_word": false,
161
  "special": false
162
  },
163
- "32084": {
164
  "content": " ",
165
  "lstrip": false,
166
  "normalized": true,
@@ -168,7 +160,7 @@
168
  "single_word": false,
169
  "special": false
170
  },
171
- "32085": {
172
  "content": " ",
173
  "lstrip": false,
174
  "normalized": true,
@@ -176,7 +168,7 @@
176
  "single_word": false,
177
  "special": false
178
  },
179
- "32086": {
180
  "content": " ",
181
  "lstrip": false,
182
  "normalized": true,
@@ -184,7 +176,7 @@
184
  "single_word": false,
185
  "special": false
186
  },
187
- "32087": {
188
  "content": " ",
189
  "lstrip": false,
190
  "normalized": true,
@@ -192,7 +184,7 @@
192
  "single_word": false,
193
  "special": false
194
  },
195
- "32088": {
196
  "content": " ",
197
  "lstrip": false,
198
  "normalized": true,
@@ -200,7 +192,7 @@
200
  "single_word": false,
201
  "special": false
202
  },
203
- "32089": {
204
  "content": " ",
205
  "lstrip": false,
206
  "normalized": true,
@@ -208,7 +200,7 @@
208
  "single_word": false,
209
  "special": false
210
  },
211
- "32090": {
212
  "content": " ",
213
  "lstrip": false,
214
  "normalized": true,
@@ -216,7 +208,7 @@
216
  "single_word": false,
217
  "special": false
218
  },
219
- "32091": {
220
  "content": " ",
221
  "lstrip": false,
222
  "normalized": true,
@@ -224,7 +216,7 @@
224
  "single_word": false,
225
  "special": false
226
  },
227
- "32092": {
228
  "content": " ",
229
  "lstrip": false,
230
  "normalized": true,
@@ -232,7 +224,7 @@
232
  "single_word": false,
233
  "special": false
234
  },
235
- "32093": {
236
  "content": " ",
237
  "lstrip": false,
238
  "normalized": true,
@@ -240,7 +232,7 @@
240
  "single_word": false,
241
  "special": false
242
  },
243
- "32094": {
244
  "content": " ",
245
  "lstrip": false,
246
  "normalized": true,
@@ -248,7 +240,7 @@
248
  "single_word": false,
249
  "special": false
250
  },
251
- "32095": {
252
  "content": " ",
253
  "lstrip": false,
254
  "normalized": true,
@@ -256,7 +248,7 @@
256
  "single_word": false,
257
  "special": false
258
  },
259
- "32096": {
260
  "content": " ",
261
  "lstrip": false,
262
  "normalized": true,
@@ -264,7 +256,7 @@
264
  "single_word": false,
265
  "special": false
266
  },
267
- "32097": {
268
  "content": " ",
269
  "lstrip": false,
270
  "normalized": true,
@@ -272,7 +264,7 @@
272
  "single_word": false,
273
  "special": false
274
  },
275
- "32098": {
276
  "content": " ",
277
  "lstrip": false,
278
  "normalized": true,
@@ -280,7 +272,7 @@
280
  "single_word": false,
281
  "special": false
282
  },
283
- "32099": {
284
  "content": " ",
285
  "lstrip": false,
286
  "normalized": true,
@@ -289,6 +281,7 @@
289
  "special": false
290
  }
291
  },
 
292
  "clean_up_tokenization_spaces": true,
293
  "cls_token": "<cls>",
294
  "eos_token": "<|endoftext|>",
 
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "<bos>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
 
17
  "special": true
18
  },
19
  "2": {
 
 
 
 
 
 
 
 
20
  "content": "<unk>",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "3": {
28
  "content": "<cls>",
29
  "lstrip": false,
30
  "normalized": false,
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "4": {
36
  "content": "<sep>",
37
  "lstrip": false,
38
  "normalized": false,
 
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "5": {
44
  "content": "<mask>",
45
  "lstrip": false,
46
  "normalized": false,
 
48
  "single_word": false,
49
  "special": true
50
  },
51
+ "6": {
52
  "content": "<|endoftext|>",
53
  "lstrip": false,
54
  "normalized": false,
 
56
  "single_word": false,
57
  "special": true
58
  },
59
+ "31972": {
60
  "content": " ",
61
  "lstrip": false,
62
  "normalized": true,
 
64
  "single_word": false,
65
  "special": false
66
  },
67
+ "31973": {
68
  "content": " ",
69
  "lstrip": false,
70
  "normalized": true,
 
72
  "single_word": false,
73
  "special": false
74
  },
75
+ "31974": {
76
  "content": " ",
77
  "lstrip": false,
78
  "normalized": true,
 
80
  "single_word": false,
81
  "special": false
82
  },
83
+ "31975": {
84
  "content": " ",
85
  "lstrip": false,
86
  "normalized": true,
 
88
  "single_word": false,
89
  "special": false
90
  },
91
+ "31976": {
92
  "content": " ",
93
  "lstrip": false,
94
  "normalized": true,
 
96
  "single_word": false,
97
  "special": false
98
  },
99
+ "31977": {
100
  "content": " ",
101
  "lstrip": false,
102
  "normalized": true,
 
104
  "single_word": false,
105
  "special": false
106
  },
107
+ "31978": {
108
  "content": " ",
109
  "lstrip": false,
110
  "normalized": true,
 
112
  "single_word": false,
113
  "special": false
114
  },
115
+ "31979": {
116
  "content": " ",
117
  "lstrip": false,
118
  "normalized": true,
 
120
  "single_word": false,
121
  "special": false
122
  },
123
+ "31980": {
124
  "content": " ",
125
  "lstrip": false,
126
  "normalized": true,
 
128
  "single_word": false,
129
  "special": false
130
  },
131
+ "31981": {
132
  "content": " ",
133
  "lstrip": false,
134
  "normalized": true,
 
136
  "single_word": false,
137
  "special": false
138
  },
139
+ "31982": {
140
  "content": " ",
141
  "lstrip": false,
142
  "normalized": true,
 
144
  "single_word": false,
145
  "special": false
146
  },
147
+ "31983": {
148
  "content": " ",
149
  "lstrip": false,
150
  "normalized": true,
 
152
  "single_word": false,
153
  "special": false
154
  },
155
+ "31984": {
156
  "content": " ",
157
  "lstrip": false,
158
  "normalized": true,
 
160
  "single_word": false,
161
  "special": false
162
  },
163
+ "31985": {
164
  "content": " ",
165
  "lstrip": false,
166
  "normalized": true,
 
168
  "single_word": false,
169
  "special": false
170
  },
171
+ "31986": {
172
  "content": " ",
173
  "lstrip": false,
174
  "normalized": true,
 
176
  "single_word": false,
177
  "special": false
178
  },
179
+ "31987": {
180
  "content": " ",
181
  "lstrip": false,
182
  "normalized": true,
 
184
  "single_word": false,
185
  "special": false
186
  },
187
+ "31988": {
188
  "content": " ",
189
  "lstrip": false,
190
  "normalized": true,
 
192
  "single_word": false,
193
  "special": false
194
  },
195
+ "31989": {
196
  "content": " ",
197
  "lstrip": false,
198
  "normalized": true,
 
200
  "single_word": false,
201
  "special": false
202
  },
203
+ "31990": {
204
  "content": " ",
205
  "lstrip": false,
206
  "normalized": true,
 
208
  "single_word": false,
209
  "special": false
210
  },
211
+ "31991": {
212
  "content": " ",
213
  "lstrip": false,
214
  "normalized": true,
 
216
  "single_word": false,
217
  "special": false
218
  },
219
+ "31992": {
220
  "content": " ",
221
  "lstrip": false,
222
  "normalized": true,
 
224
  "single_word": false,
225
  "special": false
226
  },
227
+ "31993": {
228
  "content": " ",
229
  "lstrip": false,
230
  "normalized": true,
 
232
  "single_word": false,
233
  "special": false
234
  },
235
+ "31994": {
236
  "content": " ",
237
  "lstrip": false,
238
  "normalized": true,
 
240
  "single_word": false,
241
  "special": false
242
  },
243
+ "31995": {
244
  "content": " ",
245
  "lstrip": false,
246
  "normalized": true,
 
248
  "single_word": false,
249
  "special": false
250
  },
251
+ "31996": {
252
  "content": " ",
253
  "lstrip": false,
254
  "normalized": true,
 
256
  "single_word": false,
257
  "special": false
258
  },
259
+ "31997": {
260
  "content": " ",
261
  "lstrip": false,
262
  "normalized": true,
 
264
  "single_word": false,
265
  "special": false
266
  },
267
+ "31998": {
268
  "content": " ",
269
  "lstrip": false,
270
  "normalized": true,
 
272
  "single_word": false,
273
  "special": false
274
  },
275
+ "31999": {
276
  "content": " ",
277
  "lstrip": false,
278
  "normalized": true,
 
281
  "special": false
282
  }
283
  },
284
+ "bos_token": "<bos>",
285
  "clean_up_tokenization_spaces": true,
286
  "cls_token": "<cls>",
287
  "eos_token": "<|endoftext|>",