yangapku committed
Commit 10173d4
1 Parent(s): 405556d

implement _convert_id_to_token

Files changed (1):
  1. tokenization_qwen.py +28 -7
tokenization_qwen.py CHANGED
@@ -78,7 +78,7 @@ class QWenTokenizer(PreTrainedTokenizer):
 
         self.errors = errors # how to handle errors in decoding
 
-        name = "QWen"
+        name = "Qwen"
         ENDOFTEXT = "<|endoftext|>"
         IMSTART = "<|im_start|>"
         IMEND = "<|im_end|>"
@@ -181,10 +181,6 @@ class QWenTokenizer(PreTrainedTokenizer):
         Args:
             text (`str`):
                 The sequence to be encoded.
-            pair (`str`, *optional*):
-                A second sequence to be encoded with the first.
-            add_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to add the special tokens associated with the corresponding model.
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific encode method. See details in
                 [`~PreTrainedTokenizerBase.__call__`]
@@ -214,7 +210,31 @@ class QWenTokenizer(PreTrainedTokenizer):
         return self.tokenizer.n_vocab
 
     def _convert_id_to_token(self, index: int) -> str:
-        raise NotImplementedError
+        if index >= self.tokenizer.n_vocab:
+            return self.unk_token
+        return self.tokenizer.decode([index])
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token to an id using the vocab."""
+        return self.encoder.get(token.encode('UTF-8'), self.tokenizer.encode(self.unk_token, allowed_special='all')[0])
+
+    @property
+    def all_special_tokens(self) -> List[str]:
+        """
+        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
+
+        Convert tokens of `tokenizers.AddedToken` type to string.
+        """
+        all_toks = [str(s) for s in self.special_tokens.keys()]
+        return all_toks
+
+    @property
+    def all_special_ids(self) -> List[int]:
+        """
+        `List[int]`: List the ids of the special tokens(`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
+        """
+        all_ids = [v for v in self.special_tokens.values()]
+        return all_ids
 
     def _tokenize(self, text, **kwargs):
         """
@@ -229,9 +249,10 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: bool = None,
         **kwargs,
     ) -> str:
         if isinstance(token_ids, int):
             token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i not in self.all_special_ids]
         return self.tokenizer.decode(token_ids)
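Similarly, a sketch of the new skip_special_tokens path in the decode override, under the same assumptions as above: ids present in all_special_ids (derived from self.special_tokens) are filtered out before tiktoken decodes the rest.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# ChatML-style input; allowed_special="all" lets tiktoken emit the
# special-token ids instead of raising on the markers.
ids = tok.tokenizer.encode("<|im_start|>hello<|im_end|>", allowed_special="all")

print(tok.decode(ids))                            # markers kept
print(tok.decode(ids, skip_special_tokens=True))  # markers dropped: hello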
 