yangapku commited on
Commit
69bd8ac
1 Parent(s): 04df5dd

refactor tokenization and update readme

Browse files
README.md CHANGED
@@ -61,7 +61,7 @@ We show an example of multi-turn interaction with Qwen-7B-Chat in the following
61
  from transformers import AutoModelForCausalLM, AutoTokenizer
62
  from transformers.generation import GenerationConfig
63
 
64
- # Note: our tokenizer rejects attacks and so that you cannot input special tokens like <|endoftext|> or it will throw an error.
65
  # To remove the strategy, you can add `allowed_special`, which accepts the string "all" or a `set` of special tokens.
66
  # For example: tokens = tokenizer(text, allowed_special="all")
67
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
 
61
  from transformers import AutoModelForCausalLM, AutoTokenizer
62
  from transformers.generation import GenerationConfig
63
 
64
+ # Note: The default behavior now has injection attack prevention off.
65
  # To remove the strategy, you can add `allowed_special`, which accepts the string "all" or a `set` of special tokens.
66
  # For example: tokens = tokenizer(text, allowed_special="all")
67
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
qwen_generation_utils.py CHANGED
@@ -135,8 +135,8 @@ def make_context(
135
 
136
  def _tokenize_str(role, content):
137
  return f"{role}\n{content}", tokenizer.encode(
138
- role
139
- ) + nl_tokens + tokenizer.encode(content)
140
 
141
  system_text, system_tokens_part = _tokenize_str("system", system)
142
  system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
 
135
 
136
  def _tokenize_str(role, content):
137
  return f"{role}\n{content}", tokenizer.encode(
138
+ role, allowed_special=set()
139
+ ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
140
 
141
  system_text, system_tokens_part = _tokenize_str("system", system)
142
  system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
tokenization_qwen.py CHANGED
@@ -5,164 +5,120 @@
5
 
6
  """Tokenization classes for QWen."""
7
 
8
- from __future__ import absolute_import, division, print_function, unicode_literals
9
-
10
- import json
11
  import logging
12
  import os
13
  import unicodedata
14
- from io import open
15
- import base64
16
- import tiktoken
17
- from typing import List, Optional, Tuple, Union
18
 
 
19
  from transformers import PreTrainedTokenizer, AddedToken
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
23
  VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  class QWenTokenizer(PreTrainedTokenizer):
26
  """QWen tokenizer."""
27
 
28
- """NOTE: This tokenizer will not handle special tokens to avoid injection attacks"""
29
-
30
  vocab_files_names = VOCAB_FILES_NAMES
31
 
32
  def __init__(
33
  self,
34
  vocab_file,
35
  errors="replace",
36
- max_len=None,
37
- unk_token="<|endoftext|>",
38
- bos_token="<|endoftext|>",
39
- eos_token="<|endoftext|>",
40
- pad_token=None,
41
- add_prefix_space=False,
42
- add_bos_token=False,
43
- add_more_sp_tokens=True,
44
  **kwargs,
45
  ):
46
- bos_token = (
47
- AddedToken(bos_token, lstrip=False, rstrip=False)
48
- if isinstance(bos_token, str)
49
- else bos_token
50
- )
51
- eos_token = (
52
- AddedToken(eos_token, lstrip=False, rstrip=False)
53
- if isinstance(eos_token, str)
54
- else eos_token
55
- )
56
- unk_token = (
57
- AddedToken(unk_token, lstrip=False, rstrip=False)
58
- if isinstance(unk_token, str)
59
- else unk_token
60
- )
61
- pad_token = (
62
- AddedToken(pad_token, lstrip=False, rstrip=False)
63
- if isinstance(pad_token, str)
64
- else pad_token
65
- )
66
- super().__init__(
67
- errors=errors,
68
- unk_token=unk_token,
69
- bos_token=bos_token,
70
- eos_token=eos_token,
71
- pad_token=pad_token,
72
- add_prefix_space=add_prefix_space,
73
- add_bos_token=add_bos_token,
74
- )
75
- self.add_bos_token = add_bos_token
76
- self.max_len = max_len if max_len is not None else int(1e12)
77
 
78
  self.errors = errors # how to handle errors in decoding
79
 
80
- name = "Qwen"
81
- ENDOFTEXT = "<|endoftext|>"
82
- IMSTART = "<|im_start|>"
83
- IMEND = "<|im_end|>"
84
- if add_more_sp_tokens:
85
- special_tokens = (
86
- ENDOFTEXT,
87
- IMSTART,
88
- IMEND,
89
- "<R>",
90
- "<S>",
91
- "<X>",
92
- "<mask>",
93
- "<sep>",
94
- ) + tuple([f"<extra_{i}>" for i in range(200)])
95
- else:
96
- special_tokens = (ENDOFTEXT, IMSTART, IMEND)
97
-
98
- PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
99
-
100
- def load_tiktoken_bpe(tiktoken_bpe_file: str) -> "dict[bytes, int]":
101
- contents = open(tiktoken_bpe_file, "rb").read()
102
- return {
103
- base64.b64decode(token): int(rank)
104
- for token, rank in (
105
- line.split() for line in contents.splitlines() if line
106
- )
107
- }
108
-
109
- mergeable_ranks = load_tiktoken_bpe(vocab_file)
110
- special_tokens = {
111
  token: index
112
- for index, token in enumerate(special_tokens, start=len(mergeable_ranks))
 
 
113
  }
114
- self.special_tokens = special_tokens
115
  enc = tiktoken.Encoding(
116
- name,
117
  pat_str=PAT_STR,
118
- mergeable_ranks=mergeable_ranks,
119
- special_tokens=special_tokens,
120
  )
121
  assert (
122
- len(mergeable_ranks) + len(special_tokens) == enc.n_vocab
123
- ), f"{len(mergeable_ranks) + len(special_tokens)} != {enc.n_vocab} in encoding"
124
 
125
- self.mergeable_ranks = mergeable_ranks
126
- self.encoder = self.mergeable_ranks
127
- self.decoder = {v: k for k, v in self.encoder.items()}
128
  self.decoder.update({v: k for k, v in self.special_tokens.items()})
 
129
  self.tokenizer = enc # type: tiktoken.Encoding
 
130
  self.eod_id = self.tokenizer.eot_token
131
- self.im_start_id = special_tokens[IMSTART]
132
- self.im_end_id = special_tokens[IMEND]
133
 
134
- def __len__(self):
135
  return self.tokenizer.n_vocab
136
 
137
- def get_vocab(self):
138
  return self.mergeable_ranks
139
 
140
- def convert_tokens_to_ids(self, tokens):
 
 
141
  ids = []
142
- # Remove support for py2
143
- if isinstance(tokens, str):
144
  if tokens in self.special_tokens:
145
  return self.special_tokens[tokens]
146
  else:
147
- return self.encoder.get(tokens)
148
  for token in tokens:
149
  if token in self.special_tokens:
150
  ids.append(self.special_tokens[token])
151
  else:
152
- ids.append(self.encoder.get(token))
153
- if len(ids) > self.max_len:
154
- logger.warning(
155
- "Token indices sequence length is longer than the specified maximum "
156
- " sequence length for this model ({} > {}). Running this"
157
- " sequence through the model will result in indexing errors".format(
158
- len(ids), self.max_len
159
- )
160
- )
161
  return ids
162
 
 
 
 
 
 
 
 
 
 
163
  def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
164
  """
165
- Save only the vocabulary of the tokenizer (vocabulary + added tokens).
166
 
167
  Returns:
168
  `Tuple(str)`: Paths to the files saved.
@@ -174,76 +130,81 @@ class QWenTokenizer(PreTrainedTokenizer):
174
  w.write(line)
175
  return (file_path,)
176
 
177
- def tokenize(self, text: str, **kwargs) -> List[str]:
 
 
 
 
 
 
178
  """
179
- Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.
180
 
181
  Args:
182
  text (`str`):
183
  The sequence to be encoded.
 
 
 
 
 
 
 
184
  kwargs (additional keyword arguments, *optional*):
185
  Will be passed to the underlying model specific encode method.
186
- Tiktoken allows users to allow the tokenization of special tokens with the following args:
187
- `allowed_special`: set to 'all' or a `set` of special tokens.
188
- `disallowed_special`: set to 'all' or a `Collection` of special tokens. NOT RECOMMENDED, AS IT MAY BE CONFLICTED WITH `allowed_special`.
189
 
190
  Returns:
191
- `List[str]`: The list of tokens.
192
  """
193
  tokens = []
194
  text = unicodedata.normalize("NFC", text)
195
 
196
- for t in self.tokenizer.encode(text, **kwargs):
 
 
 
197
  tokens.append(self.decoder[t])
198
-
199
  return tokens
200
 
201
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
202
  """
203
- Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
204
- often want to remove sub-word tokenization artifacts at the same time.
205
  """
206
- text = "".join(tokens)
207
- text = bytearray([self.byte_decoder[c] for c in text]).decode(
208
- "utf-8", errors=self.errors
209
- )
 
 
 
 
 
 
 
 
 
 
210
  return text
211
-
212
  @property
213
  def vocab_size(self):
214
  return self.tokenizer.n_vocab
215
 
216
- def _convert_id_to_token(self, index: int) -> str:
217
- if index >= self.tokenizer.n_vocab:
218
- return self.unk_token
219
- return self.tokenizer.decode([index])
220
-
221
- def _convert_token_to_id(self, token: str) -> int:
222
- """Converts a token to an id using the vocab."""
223
- return self.encoder.get(
224
- token.encode("UTF-8"),
225
- self.tokenizer.encode(self.unk_token, allowed_special="all")[0],
226
- )
227
-
228
- @property
229
- def all_special_tokens(self) -> List[str]:
230
- """
231
- `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
232
-
233
- Convert tokens of `tokenizers.AddedToken` type to string.
234
- """
235
- all_toks = [str(s) for s in self.special_tokens.keys()]
236
- return all_toks
237
-
238
- @property
239
- def all_special_ids(self) -> List[int]:
240
- """
241
- `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
242
- """
243
- all_ids = [v for v in self.special_tokens.values()]
244
- return all_ids
245
-
246
- def _tokenize(self, text, **kwargs):
247
  """
248
  Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
249
  vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
@@ -261,5 +222,5 @@ class QWenTokenizer(PreTrainedTokenizer):
261
  if isinstance(token_ids, int):
262
  token_ids = [token_ids]
263
  if skip_special_tokens:
264
- token_ids = [i for i in token_ids if i not in self.all_special_ids]
265
- return self.tokenizer.decode(token_ids)
 
5
 
6
  """Tokenization classes for QWen."""
7
 
8
+ import base64
 
 
9
  import logging
10
  import os
11
  import unicodedata
12
+ from typing import Collection, Dict, List, Set, Tuple, Union
 
 
 
13
 
14
+ import tiktoken
15
  from transformers import PreTrainedTokenizer, AddedToken
16
 
17
  logger = logging.getLogger(__name__)
18
 
19
+
20
  VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
21
 
22
+ PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
23
+ ENDOFTEXT = "<|endoftext|>"
24
+ IMSTART = "<|im_start|>"
25
+ IMEND = "<|im_end|>"
26
+ # as the default behavior is changed to allow special tokens in
27
+ # regular texts, the surface forms of special tokens need to be
28
+ # as different as possible to minimize the impact
29
+ EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
30
+ SPECIAL_TOKENS = (
31
+ ENDOFTEXT,
32
+ IMSTART,
33
+ IMEND,
34
+ ) + EXTRAS
35
+
36
+
37
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
38
+ contents = open(tiktoken_bpe_file, "rb").read()
39
+ return {
40
+ base64.b64decode(token): int(rank)
41
+ for token, rank in (line.split() for line in contents.splitlines() if line)
42
+ }
43
+
44
  class QWenTokenizer(PreTrainedTokenizer):
45
  """QWen tokenizer."""
46
 
 
 
47
  vocab_files_names = VOCAB_FILES_NAMES
48
 
49
  def __init__(
50
  self,
51
  vocab_file,
52
  errors="replace",
 
 
 
 
 
 
 
 
53
  **kwargs,
54
  ):
55
+ super().__init__(**kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  self.errors = errors # how to handle errors in decoding
58
 
59
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int]
60
+ self.special_tokens = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  token: index
62
+ for index, token in enumerate(
63
+ SPECIAL_TOKENS, start=len(self.mergeable_ranks)
64
+ )
65
  }
66
+
67
  enc = tiktoken.Encoding(
68
+ "Qwen",
69
  pat_str=PAT_STR,
70
+ mergeable_ranks=self.mergeable_ranks,
71
+ special_tokens=self.special_tokens,
72
  )
73
  assert (
74
+ len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
75
+ ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
76
 
77
+ self.decoder = {
78
+ v: k for k, v in self.mergeable_ranks.items()
79
+ } # type: dict[int, bytes|str]
80
  self.decoder.update({v: k for k, v in self.special_tokens.items()})
81
+
82
  self.tokenizer = enc # type: tiktoken.Encoding
83
+
84
  self.eod_id = self.tokenizer.eot_token
85
+ self.im_start_id = self.special_tokens[IMSTART]
86
+ self.im_end_id = self.special_tokens[IMEND]
87
 
88
+ def __len__(self) -> int:
89
  return self.tokenizer.n_vocab
90
 
91
+ def get_vocab(self) -> Dict[bytes, int]:
92
  return self.mergeable_ranks
93
 
94
+ def convert_tokens_to_ids(
95
+ self, tokens: Union[bytes, str, List[Union[bytes, str]]]
96
+ ) -> List[int]:
97
  ids = []
98
+ if isinstance(tokens, (str, bytes)):
 
99
  if tokens in self.special_tokens:
100
  return self.special_tokens[tokens]
101
  else:
102
+ return self.mergeable_ranks.get(tokens)
103
  for token in tokens:
104
  if token in self.special_tokens:
105
  ids.append(self.special_tokens[token])
106
  else:
107
+ ids.append(self.mergeable_ranks.get(token))
 
 
 
 
 
 
 
 
108
  return ids
109
 
110
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
111
+ if not special_tokens and new_tokens:
112
+ raise ValueError('Adding regular tokens is not supported')
113
+ for token in new_tokens:
114
+ surface_form = token.content if isinstance(token, AddedToken) else token
115
+ if surface_form not in SPECIAL_TOKENS:
116
+ raise ValueError('Adding unknown special tokens is not supported')
117
+ return 0
118
+
119
  def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
120
  """
121
+ Save only the vocabulary of the tokenizer (vocabulary).
122
 
123
  Returns:
124
  `Tuple(str)`: Paths to the files saved.
 
130
  w.write(line)
131
  return (file_path,)
132
 
133
+ def tokenize(
134
+ self,
135
+ text: str,
136
+ allowed_special: Union[Set, str] = "all",
137
+ disallowed_special: Union[Collection, str] = (),
138
+ **kwargs,
139
+ ) -> List[Union[bytes, str]]:
140
  """
141
+ Converts a string in a sequence of tokens.
142
 
143
  Args:
144
  text (`str`):
145
  The sequence to be encoded.
146
+ allowed_special (`Literal["all"]` or `set`):
147
+ The surface forms of the tokens to be encoded as special tokens in regular texts.
148
+ Default to "all".
149
+ disallowed_special (`Literal["all"]` or `Collection`):
150
+ The surface forms of the tokens that should not be in regular texts and trigger errors.
151
+ Default to an empty tuple.
152
+
153
  kwargs (additional keyword arguments, *optional*):
154
  Will be passed to the underlying model specific encode method.
 
 
 
155
 
156
  Returns:
157
+ `List[bytes|str]`: The list of tokens.
158
  """
159
  tokens = []
160
  text = unicodedata.normalize("NFC", text)
161
 
162
+ # this implementation takes a detour: text -> token id -> token surface forms
163
+ for t in self.tokenizer.encode(
164
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
165
+ ):
166
  tokens.append(self.decoder[t])
 
167
  return tokens
168
 
169
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
170
  """
171
+ Converts a sequence of tokens in a single string.
 
172
  """
173
+ text = ""
174
+ temp = b""
175
+ for t in tokens:
176
+ if isinstance(t, str):
177
+ if temp:
178
+ text += temp.decode("utf-8", errors=self.errors)
179
+ temp = b""
180
+ text += t
181
+ elif isinstance(t, bytes):
182
+ temp += t
183
+ else:
184
+ raise TypeError("token should only be of type types or str")
185
+ if temp:
186
+ text += temp.decode("utf-8", errors=self.errors)
187
  return text
188
+
189
  @property
190
  def vocab_size(self):
191
  return self.tokenizer.n_vocab
192
 
193
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
194
+ """Converts an id to a token, special tokens included"""
195
+ if index in self.decoder:
196
+ return self.decoder[index]
197
+ raise ValueError("unknown ids")
198
+
199
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
200
+ """Converts a token to an id using the vocab, special tokens included"""
201
+ if token in self.special_tokens:
202
+ return self.special_tokens[token]
203
+ if token in self.mergeable_ranks:
204
+ return self.mergeable_ranks[token]
205
+ raise ValueError("unknown token")
206
+
207
+ def _tokenize(self, text: str, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  """
209
  Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
210
  vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
 
222
  if isinstance(token_ids, int):
223
  token_ids = [token_ids]
224
  if skip_special_tokens:
225
+ token_ids = [i for i in token_ids if i < self.eod_id]
226
+ return self.tokenizer.decode(token_ids, errors=self.errors)
tokenizer_config.json CHANGED
@@ -1,6 +1,5 @@
1
  {
2
- "remove_space": false,
3
- "do_lower_case": false,
4
  "tokenizer_class": "QWenTokenizer",
5
  "auto_map": {
6
  "AutoTokenizer": [
 
1
  {
2
+ "model_max_length": 8192,
 
3
  "tokenizer_class": "QWenTokenizer",
4
  "auto_map": {
5
  "AutoTokenizer": [