zxdu20 committed
Commit 53f0197
1 Parent(s): eb55ff0
Files changed (1):
  1. tokenization_chatglm.py +8 -0
tokenization_chatglm.py CHANGED
@@ -176,6 +176,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             mask_token='[MASK]',
             gmask_token='[gMASK]',
             padding_side="left",
+            pad_token="<pad>",
+            unk_token="<unk>",
             num_image_tokens=20000,
             **kwargs
     ) -> None:
@@ -188,6 +190,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             end_token=end_token,
             mask_token=mask_token,
             gmask_token=gmask_token,
+            pad_token=pad_token,
+            unk_token=unk_token,
             num_image_tokens=num_image_tokens,
             **kwargs
         )
@@ -402,6 +406,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             encoded_inputs["attention_mask"] = attention_mask
 
         if "position_ids" not in encoded_inputs:
+            if bos_token_id in required_input:
+                context_length = required_input.index(bos_token_id)
+            else:
+                context_length = seq_length
             position_ids = np.arange(seq_length, dtype=np.int64)
             mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
             if mask_token in required_input:
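
Not part of the commit, but a minimal usage sketch of what the change affects, assuming this file belongs to the THUDM/chatglm-6b repo (the repo id is an assumption) and is loaded with trust_remote_code plus whatever extra tokenizer dependencies the repo declares. With pad_token and unk_token now forwarded to PreTrainedTokenizer.__init__, the corresponding special-token attributes resolve instead of being unset; padding a batch exercises the patched _pad, which now derives context_length from the position of the bos token when one is present.

from transformers import AutoTokenizer

# Repo id is an assumption; adjust to wherever this tokenization_chatglm.py lives.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

# pad_token / unk_token are now passed through to PreTrainedTokenizer.__init__,
# so these attributes resolve to "<pad>" / "<unk>" instead of being unset.
print(tokenizer.pad_token, tokenizer.unk_token)

# Left-padding a batch goes through the patched _pad(): if a bos token appears in
# the encoded input, context_length is taken from its position; otherwise it falls
# back to the full sequence length before position_ids are built.
batch = tokenizer(["Hello", "Hello, how are you today?"], padding=True)
print(list(batch.keys()))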