zxdu20 committed on
Commit
3a99d79
1 Parent(s): 53f0197

Always add gmask in token ids

Browse files
Files changed (1) hide show
  1. tokenization_chatglm.py +3 -14
tokenization_chatglm.py CHANGED
@@ -326,22 +326,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
326
  Returns:
327
  `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
328
  """
329
- mask_ids = self.sp_tokenizer[self.mask_token]
330
- gmask_ids = self.sp_tokenizer[self.gmask_token]
331
  eos_id = self.sp_tokenizer[self.eos_token]
332
- if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
333
- token_ids_0 += [gmask_ids]
334
-
335
- if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
336
- token_ids_0 += [self.sp_tokenizer[self.end_token]]
337
-
338
- token_ids_0 += [self.sp_tokenizer[self.bos_token]]
339
-
340
  if token_ids_1 is not None:
341
- if not token_ids_1 or token_ids_1[-1] != eos_id:
342
- token_ids_1 += [eos_id]
343
- token_ids_0 += token_ids_1
344
-
345
  return token_ids_0
346
 
347
  def _pad(
 
326
  Returns:
327
  `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
328
  """
329
+ gmask_id = self.sp_tokenizer[self.gmask_token]
 
330
  eos_id = self.sp_tokenizer[self.eos_token]
331
+ token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
 
 
 
 
 
 
 
332
  if token_ids_1 is not None:
333
+ token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
 
 
 
334
  return token_ids_0
335
 
336
  def _pad(