OwOOwO commited on
Commit
a645da1
1 Parent(s): f2de4dc

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +0 -0
  2. special_tokens_map.json +39 -4
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +290 -20
  5. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,27 +1,62 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
- "content": "<s>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "</s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "</s>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
21
  "single_word": false
22
  },
23
  "unk_token": {
24
- "content": "<unk>",
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ "<|reg_extra|>",
4
+ "<|endoftext|>",
5
+ "<|fim_prefix|>",
6
+ "<|fim_middle|>",
7
+ "<|fim_suffix|>",
8
+ "<|fim_pad|>",
9
+ "<gh_stars>",
10
+ "<filename>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<empty_output>",
19
+ "<commit_before>",
20
+ "<commit_msg>",
21
+ "<commit_after>",
22
+ "<reponame>",
23
+ "<|endofprompt|>",
24
+ "<|im_start|>",
25
+ "<|im_end|>",
26
+ "<|pause|>",
27
+ "<|reg0|>",
28
+ "<|reg1|>",
29
+ "<|reg2|>",
30
+ "<|reg3|>",
31
+ "<|reg4|>",
32
+ "<|reg5|>",
33
+ "<|reg6|>",
34
+ "<|reg7|>",
35
+ "<|extra0|>"
36
+ ],
37
  "bos_token": {
38
+ "content": "<|endoftext|>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
42
  "single_word": false
43
  },
44
  "eos_token": {
45
+ "content": "<|endoftext|>",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
49
  "single_word": false
50
  },
51
  "pad_token": {
52
+ "content": "<|endoftext|>",
53
  "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
56
  "single_word": false
57
  },
58
  "unk_token": {
59
+ "content": "<|endoftext|>",
60
  "lstrip": false,
61
  "normalized": false,
62
  "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,25 +1,265 @@
1
  {
2
  "add_bos_token": true,
3
- "add_eos_token": false,
4
  "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
  "lstrip": false,
8
  "normalized": false,
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
- "1": {
14
- "content": "<s>",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  },
21
- "2": {
22
- "content": "</s>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  "lstrip": false,
24
  "normalized": false,
25
  "rstrip": false,
@@ -27,21 +267,51 @@
27
  "special": true
28
  }
29
  },
30
- "additional_special_tokens": [],
31
- "bos_token": "<s>",
32
- "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
33
- "clean_up_tokenization_spaces": false,
34
- "eos_token": "</s>",
35
- "legacy": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "max_length": 2048,
37
- "model_max_length": 1000000000000000019884624838656,
38
- "pad_token": "</s>",
39
- "sp_model_kwargs": {},
40
- "spaces_between_special_tokens": false,
41
  "stride": 0,
42
- "tokenizer_class": "LlamaTokenizer",
43
  "truncation_side": "right",
44
  "truncation_strategy": "longest_first",
45
- "unk_token": "<unk>",
46
- "use_default_system_prompt": false
47
  }
 
1
  {
2
  "add_bos_token": true,
3
+ "add_prefix_space": false,
4
  "added_tokens_decoder": {
5
+ "100256": {
6
+ "content": "<|reg_extra|>",
7
  "lstrip": false,
8
  "normalized": false,
9
  "rstrip": false,
10
  "single_word": false,
11
  "special": true
12
  },
13
+ "100257": {
14
+ "content": "<|endoftext|>",
15
  "lstrip": false,
16
  "normalized": false,
17
  "rstrip": false,
18
  "single_word": false,
19
  "special": true
20
  },
21
+ "100258": {
22
+ "content": "<|fim_prefix|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "100259": {
30
+ "content": "<|fim_middle|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "100260": {
38
+ "content": "<|fim_suffix|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "100261": {
46
+ "content": "<|fim_pad|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "100262": {
54
+ "content": "<gh_stars>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "100263": {
62
+ "content": "<filename>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "100264": {
70
+ "content": "<issue_start>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "100265": {
78
+ "content": "<issue_comment>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "100266": {
86
+ "content": "<issue_closed>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "100267": {
94
+ "content": "<jupyter_start>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "100268": {
102
+ "content": "<jupyter_text>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "100269": {
110
+ "content": "<jupyter_code>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "100270": {
118
+ "content": "<jupyter_output>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "100271": {
126
+ "content": "<empty_output>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "100272": {
134
+ "content": "<commit_before>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "100273": {
142
+ "content": "<commit_msg>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "100274": {
150
+ "content": "<commit_after>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "100275": {
158
+ "content": "<reponame>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "100276": {
166
+ "content": "<|endofprompt|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "100277": {
174
+ "content": "<|im_start|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "100278": {
182
+ "content": "<|im_end|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "100279": {
190
+ "content": "<|pause|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "100280": {
198
+ "content": "<|reg0|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "100281": {
206
+ "content": "<|reg1|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "100282": {
214
+ "content": "<|reg2|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "100283": {
222
+ "content": "<|reg3|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "100284": {
230
+ "content": "<|reg4|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "100285": {
238
+ "content": "<|reg5|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "100286": {
246
+ "content": "<|reg6|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "100287": {
254
+ "content": "<|reg7|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "100288": {
262
+ "content": "<|extra0|>",
263
  "lstrip": false,
264
  "normalized": false,
265
  "rstrip": false,
 
267
  "special": true
268
  }
269
  },
270
+ "additional_special_tokens": [
271
+ "<|reg_extra|>",
272
+ "<|endoftext|>",
273
+ "<|fim_prefix|>",
274
+ "<|fim_middle|>",
275
+ "<|fim_suffix|>",
276
+ "<|fim_pad|>",
277
+ "<gh_stars>",
278
+ "<filename>",
279
+ "<issue_start>",
280
+ "<issue_comment>",
281
+ "<issue_closed>",
282
+ "<jupyter_start>",
283
+ "<jupyter_text>",
284
+ "<jupyter_code>",
285
+ "<jupyter_output>",
286
+ "<empty_output>",
287
+ "<commit_before>",
288
+ "<commit_msg>",
289
+ "<commit_after>",
290
+ "<reponame>",
291
+ "<|endofprompt|>",
292
+ "<|im_start|>",
293
+ "<|im_end|>",
294
+ "<|pause|>",
295
+ "<|reg0|>",
296
+ "<|reg1|>",
297
+ "<|reg2|>",
298
+ "<|reg3|>",
299
+ "<|reg4|>",
300
+ "<|reg5|>",
301
+ "<|reg6|>",
302
+ "<|reg7|>",
303
+ "<|extra0|>"
304
+ ],
305
+ "bos_token": "<|endoftext|>",
306
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
307
+ "clean_up_tokenization_spaces": true,
308
+ "eos_token": "<|endoftext|>",
309
  "max_length": 2048,
310
+ "model_max_length": 2048,
311
+ "pad_token": "<|endoftext|>",
 
 
312
  "stride": 0,
313
+ "tokenizer_class": "GPT2Tokenizer",
314
  "truncation_side": "right",
315
  "truncation_strategy": "longest_first",
316
+ "unk_token": "<|endoftext|>"
 
317
  }
vocab.json ADDED
The diff for this file is too large to render. See raw diff