cthiriet committed on
Commit
52a5b8c
1 Parent(s): 49696ec

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +7 -1
  2. tokenizer.json +45 -42
  3. tokenizer_config.json +149 -1
special_tokens_map.json CHANGED
@@ -16,5 +16,11 @@
16
  "<start_assistant>",
17
  "<end_message>"
18
  ],
19
- "eos_token": "<end_message>"
 
 
 
 
 
 
20
  }
 
16
  "<start_assistant>",
17
  "<end_message>"
18
  ],
19
+ "eos_token": {
20
+ "content": "<end_message>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ }
26
  }
tokenizer.json CHANGED
@@ -5,147 +5,147 @@
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
- "special": true,
9
  "content": ">>TITLE<<",
10
  "single_word": false,
11
  "lstrip": false,
12
  "rstrip": false,
13
- "normalized": false
 
14
  },
15
  {
16
  "id": 1,
17
- "special": true,
18
  "content": ">>ABSTRACT<<",
19
  "single_word": false,
20
  "lstrip": false,
21
  "rstrip": false,
22
- "normalized": false
 
23
  },
24
  {
25
  "id": 2,
26
- "special": true,
27
  "content": ">>INTRODUCTION<<",
28
  "single_word": false,
29
  "lstrip": false,
30
  "rstrip": false,
31
- "normalized": false
 
32
  },
33
  {
34
  "id": 3,
35
- "special": true,
36
  "content": ">>SUMMARY<<",
37
  "single_word": false,
38
  "lstrip": false,
39
  "rstrip": false,
40
- "normalized": false
 
41
  },
42
  {
43
  "id": 4,
44
- "special": true,
45
  "content": ">>COMMENT<<",
46
  "single_word": false,
47
  "lstrip": false,
48
  "rstrip": false,
49
- "normalized": false
 
50
  },
51
  {
52
  "id": 5,
53
- "special": true,
54
  "content": ">>ANSWER<<",
55
  "single_word": false,
56
  "lstrip": false,
57
  "rstrip": false,
58
- "normalized": false
 
59
  },
60
  {
61
  "id": 6,
62
- "special": true,
63
  "content": ">>QUESTION<<",
64
  "single_word": false,
65
  "lstrip": false,
66
  "rstrip": false,
67
- "normalized": false
 
68
  },
69
  {
70
  "id": 7,
71
- "special": true,
72
  "content": ">>DOMAIN<<",
73
  "single_word": false,
74
  "lstrip": false,
75
  "rstrip": false,
76
- "normalized": false
 
77
  },
78
  {
79
  "id": 8,
80
- "special": true,
81
  "content": ">>PREFIX<<",
82
  "single_word": false,
83
  "lstrip": false,
84
  "rstrip": false,
85
- "normalized": false
 
86
  },
87
  {
88
  "id": 9,
89
- "special": true,
90
  "content": ">>SUFFIX<<",
91
  "single_word": false,
92
  "lstrip": false,
93
  "rstrip": false,
94
- "normalized": false
 
95
  },
96
  {
97
  "id": 10,
98
- "special": true,
99
  "content": ">>MIDDLE<<",
100
  "single_word": false,
101
  "lstrip": false,
102
  "rstrip": false,
103
- "normalized": false
 
104
  },
105
  {
106
  "id": 11,
107
- "special": true,
108
  "content": "<|endoftext|>",
109
  "single_word": false,
110
  "lstrip": false,
111
  "rstrip": false,
112
- "normalized": false
 
113
  },
114
  {
115
- "id": 65023,
116
- "special": true,
117
- "content": "<start_system>",
118
  "single_word": false,
119
  "lstrip": false,
120
  "rstrip": false,
121
- "normalized": false
 
122
  },
123
  {
124
- "id": 65022,
125
- "special": true,
126
- "content": "<start_user>",
127
  "single_word": false,
128
  "lstrip": false,
129
  "rstrip": false,
130
- "normalized": false
 
131
  },
132
  {
133
- "id": 65021,
134
- "special": true,
135
- "content": "<start_assistant>",
136
  "single_word": false,
137
  "lstrip": false,
138
  "rstrip": false,
139
- "normalized": false
 
140
  },
141
  {
142
- "id": 65020,
143
- "special": true,
144
- "content": "<end_message>",
145
  "single_word": false,
146
  "lstrip": false,
147
  "rstrip": false,
148
- "normalized": false
 
149
  }
150
  ],
151
  "normalizer": null,
@@ -159,7 +159,8 @@
159
  {
160
  "type": "ByteLevel",
161
  "add_prefix_space": false,
162
- "trim_offsets": true
 
163
  },
164
  {
165
  "type": "Digits",
@@ -179,7 +180,8 @@
179
  "decoder": {
180
  "type": "ByteLevel",
181
  "add_prefix_space": true,
182
- "trim_offsets": true
 
183
  },
184
  "model": {
185
  "type": "BPE",
@@ -188,6 +190,7 @@
188
  "continuing_subword_prefix": null,
189
  "end_of_word_suffix": null,
190
  "fuse_unk": false,
 
191
  "vocab": {
192
  ">>TITLE<<": 0,
193
  ">>ABSTRACT<<": 1,
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
8
  "content": ">>TITLE<<",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
  },
15
  {
16
  "id": 1,
 
17
  "content": ">>ABSTRACT<<",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
  },
24
  {
25
  "id": 2,
 
26
  "content": ">>INTRODUCTION<<",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
  },
33
  {
34
  "id": 3,
 
35
  "content": ">>SUMMARY<<",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
  },
42
  {
43
  "id": 4,
 
44
  "content": ">>COMMENT<<",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
  },
51
  {
52
  "id": 5,
 
53
  "content": ">>ANSWER<<",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
  },
60
  {
61
  "id": 6,
 
62
  "content": ">>QUESTION<<",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
  },
69
  {
70
  "id": 7,
 
71
  "content": ">>DOMAIN<<",
72
  "single_word": false,
73
  "lstrip": false,
74
  "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
  },
78
  {
79
  "id": 8,
 
80
  "content": ">>PREFIX<<",
81
  "single_word": false,
82
  "lstrip": false,
83
  "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
  },
87
  {
88
  "id": 9,
 
89
  "content": ">>SUFFIX<<",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
93
+ "normalized": false,
94
+ "special": true
95
  },
96
  {
97
  "id": 10,
 
98
  "content": ">>MIDDLE<<",
99
  "single_word": false,
100
  "lstrip": false,
101
  "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
  },
105
  {
106
  "id": 11,
 
107
  "content": "<|endoftext|>",
108
  "single_word": false,
109
  "lstrip": false,
110
  "rstrip": false,
111
+ "normalized": false,
112
+ "special": true
113
  },
114
  {
115
+ "id": 65020,
116
+ "content": "<end_message>",
 
117
  "single_word": false,
118
  "lstrip": false,
119
  "rstrip": false,
120
+ "normalized": false,
121
+ "special": true
122
  },
123
  {
124
+ "id": 65021,
125
+ "content": "<start_assistant>",
 
126
  "single_word": false,
127
  "lstrip": false,
128
  "rstrip": false,
129
+ "normalized": false,
130
+ "special": true
131
  },
132
  {
133
+ "id": 65022,
134
+ "content": "<start_user>",
 
135
  "single_word": false,
136
  "lstrip": false,
137
  "rstrip": false,
138
+ "normalized": false,
139
+ "special": true
140
  },
141
  {
142
+ "id": 65023,
143
+ "content": "<start_system>",
 
144
  "single_word": false,
145
  "lstrip": false,
146
  "rstrip": false,
147
+ "normalized": false,
148
+ "special": true
149
  }
150
  ],
151
  "normalizer": null,
 
159
  {
160
  "type": "ByteLevel",
161
  "add_prefix_space": false,
162
+ "trim_offsets": true,
163
+ "use_regex": true
164
  },
165
  {
166
  "type": "Digits",
 
180
  "decoder": {
181
  "type": "ByteLevel",
182
  "add_prefix_space": true,
183
+ "trim_offsets": true,
184
+ "use_regex": true
185
  },
186
  "model": {
187
  "type": "BPE",
 
190
  "continuing_subword_prefix": null,
191
  "end_of_word_suffix": null,
192
  "fuse_unk": false,
193
+ "byte_fallback": false,
194
  "vocab": {
195
  ">>TITLE<<": 0,
196
  ">>ABSTRACT<<": 1,
tokenizer_config.json CHANGED
@@ -1,7 +1,155 @@
1
  {
2
  "add_prefix_space": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "eos_token": "<end_message>",
4
  "model_max_length": 8192,
5
- "special_tokens_map_file": null,
6
  "tokenizer_class": "PreTrainedTokenizerFast"
7
  }
 
1
  {
2
  "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": ">>TITLE<<",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": ">>ABSTRACT<<",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": ">>INTRODUCTION<<",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": ">>SUMMARY<<",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": ">>COMMENT<<",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": ">>ANSWER<<",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": ">>QUESTION<<",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": ">>DOMAIN<<",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": ">>PREFIX<<",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": ">>SUFFIX<<",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": ">>MIDDLE<<",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<|endoftext|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "65020": {
101
+ "content": "<end_message>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "65021": {
109
+ "content": "<start_assistant>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "65022": {
117
+ "content": "<start_user>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "65023": {
125
+ "content": "<start_system>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ }
132
+ },
133
+ "additional_special_tokens": [
134
+ ">>TITLE<<",
135
+ ">>ABSTRACT<<",
136
+ ">>INTRODUCTION<<",
137
+ ">>SUMMARY<<",
138
+ ">>COMMENT<<",
139
+ ">>ANSWER<<",
140
+ ">>QUESTION<<",
141
+ ">>DOMAIN<<",
142
+ ">>PREFIX<<",
143
+ ">>SUFFIX<<",
144
+ ">>MIDDLE<<",
145
+ "<start_system>",
146
+ "<start_user>",
147
+ "<start_assistant>",
148
+ "<end_message>"
149
+ ],
150
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_user>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'system' %}{{ '<start_system>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'assistant' %}{{ '<start_assistant>' + message['content'] + '<end_message>' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<start_assistant>' }}{% endif %}{% endfor %}",
151
+ "clean_up_tokenization_spaces": true,
152
  "eos_token": "<end_message>",
153
  "model_max_length": 8192,
 
154
  "tokenizer_class": "PreTrainedTokenizerFast"
155
  }