rajammanabrolu committed on
Commit
062a60f
1 Parent(s): 2177a48

Upload tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|im_end|>": 100279,
3
+ "<|im_start|>": 100278,
4
+ "<|pad|>": 100277
5
+ }
special_tokens_map.json CHANGED
@@ -3,8 +3,26 @@
3
  "<|im_start|>",
4
  "<|im_end|>"
5
  ],
6
- "bos_token": "<|endoftext|>",
7
- "eos_token": "<|endoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
8
  "pad_token": "<|pad|>",
9
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
10
  }
 
3
  "<|im_start|>",
4
  "<|im_end|>"
5
  ],
6
+ "bos_token": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
  "pad_token": "<|pad|>",
21
+ "unk_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ }
28
  }
tiktoken.py CHANGED
@@ -1,11 +1,14 @@
1
  # Copyright 2022 MosaicML LLM Foundry authors
2
  # SPDX-License-Identifier: Apache-2.0
3
 
 
4
  from typing import Any, Dict, List, Optional, Tuple, Union
5
 
6
  import torch
7
  from transformers import PreTrainedTokenizer
8
 
 
 
9
 
10
  class TiktokenTokenizerWrapper(PreTrainedTokenizer):
11
  """A thin wrapper around tiktoken to make it compatible with Hugging Face.
@@ -22,11 +25,12 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
22
  encoding_name: Optional[str] = None,
23
  add_bos_token: bool = False,
24
  add_eos_token: bool = False,
 
25
  unk_token: Optional[str] = '<|endoftext|>',
26
  eos_token: Optional[str] = '<|endoftext|>',
27
  bos_token: Optional[str] = '<|endoftext|>',
28
  pad_token: Optional[str] = None,
29
- **kwargs: Dict[str, Any]):
30
  """Constructor creates a tiktoken tokenizer to use as the underlying.
31
 
32
  tokenizer.
@@ -38,6 +42,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
38
  Either model_name or encoding_name must be set, but not both.
39
  add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
40
  add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
 
41
  unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
42
  eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
43
  bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
@@ -49,6 +54,23 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
49
  raise ImportError(
50
  'You need to install tiktoken to use TiktokenTokenizerWrapper.')
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  if model_name is not None and encoding_name is not None:
53
  raise ValueError(
54
  'You need to specify either model_name or encoding_name, not both.'
@@ -69,11 +91,13 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
69
 
70
  self.add_bos_token = add_bos_token
71
  self.add_eos_token = add_eos_token
 
72
 
73
  super().__init__(model_name=model_name,
74
  encoding_name=encoding_name,
75
  add_bos_token=add_bos_token,
76
  add_eos_token=add_eos_token,
 
77
  unk_token=unk_token,
78
  eos_token=eos_token,
79
  bos_token=bos_token,
@@ -89,8 +113,39 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
89
  def is_fast(self) -> bool:
90
  return False
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def get_vocab(self) -> Dict[str, int]:
93
- """Returns vocab as a dict."""
 
 
 
 
 
 
 
 
 
 
94
  vocab = {}
95
  for i in range(self.vocab_size):
96
  try:
@@ -101,6 +156,24 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
101
  except KeyError:
102
  pass
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  return vocab
105
 
106
  def _tokenize(self, text: str) -> List[int]:
@@ -155,7 +228,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
155
  """
156
  if isinstance(ids, int):
157
  if ids in self.added_tokens_decoder:
158
- return self.added_tokens_decoder[ids]
159
 
160
  return self._convert_id_to_token(ids)
161
 
@@ -171,7 +244,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
171
  if index in self.added_tokens_decoder:
172
  tokens.append(self.encoding.decode(current_stream))
173
  current_stream = []
174
- tokens.append(self.added_tokens_decoder[index])
175
  else:
176
  current_stream.append(index)
177
 
 
1
  # Copyright 2022 MosaicML LLM Foundry authors
2
  # SPDX-License-Identifier: Apache-2.0
3
 
4
+ import warnings
5
  from typing import Any, Dict, List, Optional, Tuple, Union
6
 
7
  import torch
8
  from transformers import PreTrainedTokenizer
9
 
10
+ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
11
+
12
 
13
  class TiktokenTokenizerWrapper(PreTrainedTokenizer):
14
  """A thin wrapper around tiktoken to make it compatible with Hugging Face.
 
25
  encoding_name: Optional[str] = None,
26
  add_bos_token: bool = False,
27
  add_eos_token: bool = False,
28
+ use_default_system_prompt: bool = False,
29
  unk_token: Optional[str] = '<|endoftext|>',
30
  eos_token: Optional[str] = '<|endoftext|>',
31
  bos_token: Optional[str] = '<|endoftext|>',
32
  pad_token: Optional[str] = None,
33
+ **kwargs: Any):
34
  """Constructor creates a tiktoken tokenizer to use as the underlying.
35
 
36
  tokenizer.
 
42
  Either model_name or encoding_name must be set, but not both.
43
  add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
44
  add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
45
+ use_default_system_prompt (bool, optional): Use the default system prompt or not. Defaults to False.
46
  unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
47
  eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
48
  bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
 
54
  raise ImportError(
55
  'You need to install tiktoken to use TiktokenTokenizerWrapper.')
56
 
57
+ # Workaround to make tiktokenizer picklable.
58
+ # https://github.com/huggingface/datasets/issues/5536#issuecomment-1682309347
59
+ # There is an open PR from HF to add this to tiktoken: https://github.com/openai/tiktoken/pull/181
60
+ import copyreg
61
+ import functools
62
+
63
+ from tiktoken import Encoding # type: ignore (thirdParty)
64
+
65
+ def pickle_Encoding(enc: Encoding):
66
+ return (functools.partial(Encoding,
67
+ enc.name,
68
+ pat_str=enc._pat_str,
69
+ mergeable_ranks=enc._mergeable_ranks,
70
+ special_tokens=enc._special_tokens), ())
71
+
72
+ copyreg.pickle(Encoding, pickle_Encoding)
73
+
74
  if model_name is not None and encoding_name is not None:
75
  raise ValueError(
76
  'You need to specify either model_name or encoding_name, not both.'
 
91
 
92
  self.add_bos_token = add_bos_token
93
  self.add_eos_token = add_eos_token
94
+ self.use_default_system_prompt = use_default_system_prompt
95
 
96
  super().__init__(model_name=model_name,
97
  encoding_name=encoding_name,
98
  add_bos_token=add_bos_token,
99
  add_eos_token=add_eos_token,
100
+ use_default_system_prompt=use_default_system_prompt,
101
  unk_token=unk_token,
102
  eos_token=eos_token,
103
  bos_token=bos_token,
 
113
  def is_fast(self) -> bool:
114
  return False
115
 
116
+ @property
117
+ def default_chat_template(self):
118
+ """Chat ML Template for User/Assistant.
119
+
120
+ Pinning default Chat ML template in case defaults change.
121
+ """
122
+ template = (
123
+ "{% set system_message = '' %}"
124
+ '{% if USE_DEFAULT_PROMPT == true %}'
125
+ "{{'<|im_start|>system\n' + 'DEFAULT_SYSTEM_PROMPT'}}"
126
+ '{% endif %}'
127
+ '{% for message in messages %}'
128
+ "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
129
+ '{% endfor %}')
130
+ template = template.replace(
131
+ 'USE_DEFAULT_PROMPT',
132
+ 'true' if self.use_default_system_prompt else 'false')
133
+ template = template.replace('DEFAULT_SYSTEM_PROMPT',
134
+ DEFAULT_SYSTEM_PROMPT)
135
+ return template
136
+
137
  def get_vocab(self) -> Dict[str, int]:
138
+ """Returns vocab as a dict.
139
+
140
+ Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers.
141
+ Most uses do not need to use get_vocab, so this is not a priority to fix.
142
+ """
143
+ warnings.warn(
144
+ 'get_vocab does not work properly with TiktokenTokenizerWrapper. Please do not rely on it being perfectly correct.'
145
+ +
146
+ ' It will be called once init just to get the size of the vocab inside the base class.'
147
+ )
148
+
149
  vocab = {}
150
  for i in range(self.vocab_size):
151
  try:
 
156
  except KeyError:
157
  pass
158
 
159
+ # As far as I can tell, we don't require get_vocab to completely work,
160
+ # but when using additional_special_tokens, Hugging Face determines the next
161
+ # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
162
+ extra_id_index = 0
163
+ candidate_extra_id = f'<extra_id_{extra_id_index}>'
164
+ indices_to_fill_in = {i for i in range(self.vocab_size)} - set(
165
+ vocab.values())
166
+
167
+ # Add enough indices to make get_vocab() the right length
168
+ for index_to_add in indices_to_fill_in:
169
+ # Make sure we don't overwrite a token that already exists
170
+ while candidate_extra_id in vocab:
171
+ extra_id_index += 1
172
+ candidate_extra_id = f'<extra_id_{extra_id_index}>'
173
+
174
+ # Get an index to add and add the item
175
+ vocab[candidate_extra_id] = index_to_add
176
+
177
  return vocab
178
 
179
  def _tokenize(self, text: str) -> List[int]:
 
228
  """
229
  if isinstance(ids, int):
230
  if ids in self.added_tokens_decoder:
231
+ return str(self.added_tokens_decoder[ids])
232
 
233
  return self._convert_id_to_token(ids)
234
 
 
244
  if index in self.added_tokens_decoder:
245
  tokens.append(self.encoding.decode(current_stream))
246
  current_stream = []
247
+ tokens.append(str(self.added_tokens_decoder[index]))
248
  else:
249
  current_stream.append(index)
250
 
tokenizer_config.json CHANGED
@@ -2,6 +2,72 @@
2
  "add_bos_token": false,
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "additional_special_tokens": [
6
  "<|im_start|>",
7
  "<|im_end|>"
@@ -20,5 +86,6 @@
20
  "model_name": "gpt-4",
21
  "pad_token": "<|pad|>",
22
  "tokenizer_class": "TiktokenTokenizerWrapper",
23
- "unk_token": "<|endoftext|>"
 
24
  }
 
2
  "add_bos_token": false,
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "100257": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "100258": {
15
+ "content": "<|fim_prefix|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "100259": {
23
+ "content": "<|fim_middle|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "100260": {
31
+ "content": "<|fim_suffix|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "100276": {
39
+ "content": "<|endofprompt|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "100277": {
47
+ "content": "<|pad|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "100278": {
55
+ "content": "<|im_start|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "100279": {
63
+ "content": "<|im_end|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ }
70
+ },
71
  "additional_special_tokens": [
72
  "<|im_start|>",
73
  "<|im_end|>"
 
86
  "model_name": "gpt-4",
87
  "pad_token": "<|pad|>",
88
  "tokenizer_class": "TiktokenTokenizerWrapper",
89
+ "unk_token": "<|endoftext|>",
90
+ "use_default_system_prompt": false
91
  }