sabber committed on
Commit
bd1812e
1 Parent(s): 641c179

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +12 -16
  2. tokenizer.json +29 -2
  3. tokenizer_config.json +36 -5
special_tokens_map.json CHANGED
@@ -1,19 +1,9 @@
1
  {
2
  "additional_special_tokens": [
3
- {
4
- "content": "<|im_start|>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "<|im_end|>",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- }
17
  ],
18
  "bos_token": {
19
  "content": "<s>",
@@ -23,13 +13,19 @@
23
  "single_word": false
24
  },
25
  "eos_token": {
26
- "content": "</s>",
 
 
 
 
 
 
 
27
  "lstrip": false,
28
  "normalized": false,
29
  "rstrip": false,
30
  "single_word": false
31
  },
32
- "pad_token": "</s>",
33
  "unk_token": {
34
  "content": "<unk>",
35
  "lstrip": false,
 
1
  {
2
  "additional_special_tokens": [
3
+ "<|im_start|>system",
4
+ "<|im_start|>user",
5
+ "<|im_start|>assistant",
6
+ "<|im_end|>"
 
 
 
 
 
 
 
 
 
 
7
  ],
8
  "bos_token": {
9
  "content": "<s>",
 
13
  "single_word": false
14
  },
15
  "eos_token": {
16
+ "content": "<|im_end|>",
17
+ "lstrip": false,
18
+ "normalized": false,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ },
22
+ "pad_token": {
23
+ "content": "[PAD]",
24
  "lstrip": false,
25
  "normalized": false,
26
  "rstrip": false,
27
  "single_word": false
28
  },
 
29
  "unk_token": {
30
  "content": "<unk>",
31
  "lstrip": false,
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
- "max_length": 2200,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
@@ -37,7 +37,7 @@
37
  },
38
  {
39
  "id": 32000,
40
- "content": "<|im_start|>",
41
  "single_word": false,
42
  "lstrip": false,
43
  "rstrip": false,
@@ -46,6 +46,33 @@
46
  },
47
  {
48
  "id": 32001,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  "content": "<|im_end|>",
50
  "single_word": false,
51
  "lstrip": false,
 
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
+ "max_length": 2048,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
 
37
  },
38
  {
39
  "id": 32000,
40
+ "content": "[PAD]",
41
  "single_word": false,
42
  "lstrip": false,
43
  "rstrip": false,
 
46
  },
47
  {
48
  "id": 32001,
49
+ "content": "<|im_start|>system",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ },
56
+ {
57
+ "id": 32002,
58
+ "content": "<|im_start|>user",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ },
65
+ {
66
+ "id": 32003,
67
+ "content": "<|im_start|>assistant",
68
+ "single_word": false,
69
+ "lstrip": false,
70
+ "rstrip": false,
71
+ "normalized": false,
72
+ "special": true
73
+ },
74
+ {
75
+ "id": 32004,
76
  "content": "<|im_end|>",
77
  "single_word": false,
78
  "lstrip": false,
tokenizer_config.json CHANGED
@@ -27,7 +27,7 @@
27
  "special": true
28
  },
29
  "32000": {
30
- "content": "<|im_start|>",
31
  "lstrip": false,
32
  "normalized": false,
33
  "rstrip": false,
@@ -35,6 +35,30 @@
35
  "special": true
36
  },
37
  "32001": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  "content": "<|im_end|>",
39
  "lstrip": false,
40
  "normalized": false,
@@ -44,19 +68,26 @@
44
  }
45
  },
46
  "additional_special_tokens": [
47
- "<|im_start|>",
 
 
48
  "<|im_end|>"
49
  ],
50
  "bos_token": "<s>",
51
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
52
  "clean_up_tokenization_spaces": false,
53
- "eos_token": "</s>",
54
  "legacy": true,
 
55
  "model_max_length": 1000000000000000019884624838656,
56
- "pad_token": "</s>",
 
57
  "sp_model_kwargs": {},
58
  "spaces_between_special_tokens": false,
 
59
  "tokenizer_class": "LlamaTokenizer",
 
 
60
  "unk_token": "<unk>",
61
  "use_default_system_prompt": false
62
  }
 
27
  "special": true
28
  },
29
  "32000": {
30
+ "content": "[PAD]",
31
  "lstrip": false,
32
  "normalized": false,
33
  "rstrip": false,
 
35
  "special": true
36
  },
37
  "32001": {
38
+ "content": "<|im_start|>system",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<|im_start|>user",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "32003": {
54
+ "content": "<|im_start|>assistant",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "32004": {
62
  "content": "<|im_end|>",
63
  "lstrip": false,
64
  "normalized": false,
 
68
  }
69
  },
70
  "additional_special_tokens": [
71
+ "<|im_start|>system",
72
+ "<|im_start|>user",
73
+ "<|im_start|>assistant",
74
  "<|im_end|>"
75
  ],
76
  "bos_token": "<s>",
77
+ "chat_template": "\n {% for message in messages %}\n {% if message['role'] == 'user' %}\n {{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}\n {% elif message['role'] == 'system' %}\n {{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}\n {% elif message['role'] == 'assistant' %}\n {{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n {% endif %}\n {% endfor %}\n ",
78
  "clean_up_tokenization_spaces": false,
79
+ "eos_token": "<|im_end|>",
80
  "legacy": true,
81
+ "max_length": 2048,
82
  "model_max_length": 1000000000000000019884624838656,
83
+ "pad_token": "[PAD]",
84
+ "padding_side": "right",
85
  "sp_model_kwargs": {},
86
  "spaces_between_special_tokens": false,
87
+ "stride": 0,
88
  "tokenizer_class": "LlamaTokenizer",
89
+ "truncation_side": "right",
90
+ "truncation_strategy": "longest_first",
91
  "unk_token": "<unk>",
92
  "use_default_system_prompt": false
93
  }