TokenBender committed on
Commit
96b9b67
1 Parent(s): eb180e7

Upload 4 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 49153,
3
+ "<|im_start|>": 49152
4
+ }
special_tokens_map.json CHANGED
@@ -1,58 +1,23 @@
1
  {
2
  "additional_special_tokens": [
3
- "<|endoftext|>",
4
- "<fim_prefix>",
5
- "<fim_middle>",
6
- "<fim_suffix>",
7
- "<fim_pad>",
8
- "<repo_name>",
9
- "<file_sep>",
10
- "<issue_start>",
11
- "<issue_comment>",
12
- "<issue_closed>",
13
- "<jupyter_start>",
14
- "<jupyter_text>",
15
- "<jupyter_code>",
16
- "<jupyter_output>",
17
- "<jupyter_script>",
18
- "<empty_output>",
19
- "<code_to_intermediate>",
20
- "<intermediate_to_code>",
21
- "<pr>",
22
- "<pr_status>",
23
- "<pr_is_merged>",
24
- "<pr_base>",
25
- "<pr_file>",
26
- "<pr_base_code>",
27
- "<pr_diff>",
28
- "<pr_diff_hunk>",
29
- "<pr_comment>",
30
- "<pr_event_id>",
31
- "<pr_review>",
32
- "<pr_review_state>",
33
- "<pr_review_comment>",
34
- "<pr_in_reply_to_review_id>",
35
- "<pr_in_reply_to_comment_id>",
36
- "<pr_diff_hunk_comment_line>",
37
- "<NAME>",
38
- "<EMAIL>",
39
- "<KEY>",
40
- "<PASSWORD>"
41
  ],
42
- "bos_token": {
43
- "content": "<|endoftext|>",
44
- "lstrip": false,
45
- "normalized": false,
46
- "rstrip": false,
47
- "single_word": false
48
- },
49
- "eos_token": {
50
- "content": "<|endoftext|>",
51
- "lstrip": false,
52
- "normalized": false,
53
- "rstrip": false,
54
- "single_word": false
55
- },
56
  "unk_token": {
57
  "content": "<|endoftext|>",
58
  "lstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
+ {
4
+ "content": "<|im_start|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|im_end|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ],
18
+ "bos_token": "<|im_start|>",
19
+ "eos_token": "<|im_end|>",
20
+ "pad_token": "<|im_end|>",
 
 
 
 
 
 
 
 
 
 
 
21
  "unk_token": {
22
  "content": "<|endoftext|>",
23
  "lstrip": false,
tokenizer.json CHANGED
@@ -344,6 +344,24 @@
344
  "rstrip": false,
345
  "normalized": false,
346
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  }
348
  ],
349
  "normalizer": null,
 
344
  "rstrip": false,
345
  "normalized": false,
346
  "special": true
347
+ },
348
+ {
349
+ "id": 49152,
350
+ "content": "<|im_start|>",
351
+ "single_word": false,
352
+ "lstrip": false,
353
+ "rstrip": false,
354
+ "normalized": false,
355
+ "special": true
356
+ },
357
+ {
358
+ "id": 49153,
359
+ "content": "<|im_end|>",
360
+ "single_word": false,
361
+ "lstrip": false,
362
+ "rstrip": false,
363
+ "normalized": false,
364
+ "special": true
365
  }
366
  ],
367
  "normalizer": null,
tokenizer_config.json CHANGED
@@ -304,52 +304,34 @@
304
  "rstrip": false,
305
  "single_word": false,
306
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  }
308
  },
309
  "additional_special_tokens": [
310
- "<|endoftext|>",
311
- "<fim_prefix>",
312
- "<fim_middle>",
313
- "<fim_suffix>",
314
- "<fim_pad>",
315
- "<repo_name>",
316
- "<file_sep>",
317
- "<issue_start>",
318
- "<issue_comment>",
319
- "<issue_closed>",
320
- "<jupyter_start>",
321
- "<jupyter_text>",
322
- "<jupyter_code>",
323
- "<jupyter_output>",
324
- "<jupyter_script>",
325
- "<empty_output>",
326
- "<code_to_intermediate>",
327
- "<intermediate_to_code>",
328
- "<pr>",
329
- "<pr_status>",
330
- "<pr_is_merged>",
331
- "<pr_base>",
332
- "<pr_file>",
333
- "<pr_base_code>",
334
- "<pr_diff>",
335
- "<pr_diff_hunk>",
336
- "<pr_comment>",
337
- "<pr_event_id>",
338
- "<pr_review>",
339
- "<pr_review_state>",
340
- "<pr_review_comment>",
341
- "<pr_in_reply_to_review_id>",
342
- "<pr_in_reply_to_comment_id>",
343
- "<pr_diff_hunk_comment_line>",
344
- "<NAME>",
345
- "<EMAIL>",
346
- "<KEY>",
347
- "<PASSWORD>"
348
  ],
349
- "bos_token": "<|endoftext|>",
 
350
  "clean_up_tokenization_spaces": true,
351
- "eos_token": "<|endoftext|>",
352
  "model_max_length": 1000000000000000019884624838656,
 
353
  "tokenizer_class": "GPT2Tokenizer",
354
  "unk_token": "<|endoftext|>",
355
  "vocab_size": 49152
 
304
  "rstrip": false,
305
  "single_word": false,
306
  "special": true
307
+ },
308
+ "49152": {
309
+ "content": "<|im_start|>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "49153": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
  }
324
  },
325
  "additional_special_tokens": [
326
+ "<|im_start|>",
327
+ "<|im_end|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  ],
329
+ "bos_token": "<|im_start|>",
330
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
331
  "clean_up_tokenization_spaces": true,
332
+ "eos_token": "<|im_end|>",
333
  "model_max_length": 1000000000000000019884624838656,
334
+ "pad_token": "<|im_end|>",
335
  "tokenizer_class": "GPT2Tokenizer",
336
  "unk_token": "<|endoftext|>",
337
  "vocab_size": 49152