perlthoughts committed
Commit fc163ef
1 Parent(s): 07c021c

Upload tokenizer
added_tokens.json CHANGED
@@ -1,4 +1,5 @@
 {
-  "<|im_end|>": 32000,
-  "<|im_start|>": 32001
+  "<sep>": 32002,
+  "<|end_of_turn|>": 32000,
+  "<|pad_0|>": 32001
 }
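A quick way to sanity-check the remapped ids after this commit (a minimal sketch; "path/to/this/repo" stands in for the actual repo id):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/to/this/repo")
    for token in ("<|end_of_turn|>", "<|pad_0|>", "<sep>"):
        print(token, tok.convert_tokens_to_ids(token))
    # Should print 32000, 32001, 32002, matching added_tokens.json above.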
special_tokens_map.json CHANGED
@@ -1,8 +1,7 @@
 {
   "additional_special_tokens": [
-    "<unk>",
-    "<s>",
-    "</s>"
+    "<|end_of_turn|>",
+    "<|pad_0|>"
   ],
   "bos_token": {
     "content": "<s>",
@@ -12,7 +11,21 @@
     "single_word": false
   },
   "eos_token": {
-    "content": "</s>",
+    "content": "<|end_of_turn|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|end_of_turn|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "<sep>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
@@ -32,7 +32,7 @@
     },
     {
       "id": 32000,
-      "content": "<|im_end|>",
+      "content": "<|end_of_turn|>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -41,7 +41,16 @@
     },
     {
       "id": 32001,
-      "content": "<|im_start|>",
+      "content": "<|pad_0|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 32002,
+      "content": "<sep>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
tokenizer_config.json CHANGED
@@ -25,7 +25,7 @@
       "special": true
     },
     "32000": {
-      "content": "<|im_end|>",
+      "content": "<|end_of_turn|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -33,7 +33,15 @@
       "special": true
     },
     "32001": {
-      "content": "<|im_start|>",
+      "content": "<|pad_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<sep>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -42,19 +50,18 @@
     }
   },
   "additional_special_tokens": [
-    "<unk>",
-    "<s>",
-    "</s>"
+    "<|end_of_turn|>",
+    "<|pad_0|>"
   ],
   "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
-  "device_map": {
-    "": "cuda"
-  },
-  "eos_token": "</s>",
+  "device_map": "auto",
+  "eos_token": "<|end_of_turn|>",
   "legacy": true,
   "model_max_length": 1000000000000000019884624838656,
-  "pad_token": null,
+  "pad_token": "<|end_of_turn|>",
+  "sep_token": "<sep>",
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",