whizzzzkid committed
Commit 24f38b3
1 Parent(s): d93547f

Upload tokenizer

Files changed (3):
  1. special_tokens_map.json +63 -4
  2. tokenizer.json +2 -2
  3. tokenizer_config.json +40 -14
special_tokens_map.json CHANGED
@@ -1,6 +1,65 @@
 {
-  "bos_token": "<|end|>",
-  "eos_token": "<|end|>",
-  "pad_token": "<|end|>",
-  "unk_token": "<|end|>"
+  "additional_special_tokens": [
+    "<|reg_extra|>",
+    "<|endoftext|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>",
+    "<|fim_pad|>",
+    "<gh_stars>",
+    "<filename>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>",
+    "<|endofprompt|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|pause|>",
+    "<|reg0|>",
+    "<|reg1|>",
+    "<|reg2|>",
+    "<|reg3|>",
+    "<|reg4|>",
+    "<|reg5|>",
+    "<|reg6|>",
+    "<|reg7|>",
+    "<|extra0|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
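For reference, a minimal sketch of how the remapped special tokens could be checked after this commit, using the Hugging Face `transformers` API; the repo id below is a hypothetical placeholder, since the diff does not name the repository:

```python
# Sketch: verify the special-token remapping from <|end|> to <|endoftext|>.
# "your-org/your-model" is a placeholder for this repository's actual id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-org/your-model")

# bos/eos/pad/unk now all resolve to <|endoftext|>.
assert tok.eos_token == "<|endoftext|>"
assert tok.bos_token == tok.pad_token == tok.unk_token == tok.eos_token

# The StarCoder-style structural tokens are registered as additional special
# tokens, so the tokenizer keeps them atomic instead of splitting them.
for t in ["<|fim_prefix|>", "<gh_stars>", "<reponame>", "<|im_start|>"]:
    assert tok.convert_tokens_to_ids(t) != tok.unk_token_id
```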
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:86bb7b272d5e2649c6013ed8cbf564802f0c36cede60687af4192dc3f8d1acfb
-size 4239708
+oid sha256:290f60278265b590e6e79137d425f9aa8c19d934e08b6ac7a6229e55a1bdaa27
+size 4239524
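tokenizer.json is tracked with Git LFS, so the diff above only swaps the pointer file. A small sketch, assuming the real blob has been pulled locally, for confirming it matches the new pointer:

```python
# Sketch: check the downloaded tokenizer.json against the new LFS pointer.
import hashlib
import os

NEW_OID = "290f60278265b590e6e79137d425f9aa8c19d934e08b6ac7a6229e55a1bdaa27"

with open("tokenizer.json", "rb") as f:
    assert hashlib.sha256(f.read()).hexdigest() == NEW_OID

assert os.path.getsize("tokenizer.json") == 4239524
```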
tokenizer_config.json CHANGED
@@ -264,27 +264,53 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    },
-    "100289": {
-      "content": "<|end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
-  "additional_special_tokens": [],
-  "bos_token": "<|end|>",
+  "additional_special_tokens": [
+    "<|reg_extra|>",
+    "<|endoftext|>",
+    "<|fim_prefix|>",
+    "<|fim_middle|>",
+    "<|fim_suffix|>",
+    "<|fim_pad|>",
+    "<gh_stars>",
+    "<filename>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<empty_output>",
+    "<commit_before>",
+    "<commit_msg>",
+    "<commit_after>",
+    "<reponame>",
+    "<|endofprompt|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|pause|>",
+    "<|reg0|>",
+    "<|reg1|>",
+    "<|reg2|>",
+    "<|reg3|>",
+    "<|reg4|>",
+    "<|reg5|>",
+    "<|reg6|>",
+    "<|reg7|>",
+    "<|extra0|>"
+  ],
+  "bos_token": "<|endoftext|>",
   "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
   "clean_up_tokenization_spaces": true,
-  "eos_token": "<|end|>",
+  "eos_token": "<|endoftext|>",
   "max_length": 2048,
-  "model_max_length": 512,
-  "pad_token": "<|end|>",
+  "model_max_length": 2048,
+  "pad_token": "<|endoftext|>",
   "stride": 0,
   "tokenizer_class": "GPT2Tokenizer",
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
-  "unk_token": "<|end|>"
+  "unk_token": "<|endoftext|>"
 }
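The chat_template in this config wraps each turn in a role marker and terminates it with the (now <|endoftext|>) eos_token; model_max_length is also raised from 512 to 2048 to match max_length. A short usage sketch, reusing the `tok` object loaded above with illustrative messages:

```python
# Sketch: render a conversation through this config's chat_template.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize this diff."},
]
prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # emits the trailing '<|assistant|>' cue
)
print(prompt)
# Roughly:
# <|system|>
# You are a helpful assistant.<|endoftext|>
# <|user|>
# Summarize this diff.<|endoftext|>
# <|assistant|>
```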