zxdu20 committed on
Commit
9163f7e
1 Parent(s): 649466f

Fix eos token in tokenizer

Browse files
Files changed (2) hide show
  1. tokenization_chatglm.py +12 -12
  2. tokenizer_config.json +2 -2
tokenization_chatglm.py CHANGED
@@ -171,8 +171,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
171
  do_lower_case=False,
172
  remove_space=False,
173
  bos_token='<sop>',
174
- eos_token='</s>',
175
- eop_token='<eop>',
176
  mask_token='[MASK]',
177
  gmask_token='[gMASK]',
178
  padding_side="left",
@@ -185,7 +185,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
185
  padding_side=padding_side,
186
  bos_token=bos_token,
187
  eos_token=eos_token,
188
- eop_token=eop_token,
189
  mask_token=mask_token,
190
  gmask_token=gmask_token,
191
  num_image_tokens=num_image_tokens,
@@ -198,7 +198,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
198
 
199
  self.bos_token = bos_token
200
  self.eos_token = eos_token
201
- self.eop_token = eop_token
202
  self.mask_token = mask_token
203
  self.gmask_token = gmask_token
204
 
@@ -213,14 +213,14 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
213
  return self.convert_tokens_to_ids(self.gmask_token)
214
 
215
  @property
216
- def eop_token_id(self) -> Optional[int]:
217
  """
218
- `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
219
  set.
220
  """
221
- if self.eop_token is None:
222
  return None
223
- return self.convert_tokens_to_ids(self.eop_token)
224
 
225
  @property
226
  def vocab_size(self):
@@ -324,18 +324,18 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
324
  """
325
  mask_ids = self.sp_tokenizer[self.mask_token]
326
  gmask_ids = self.sp_tokenizer[self.gmask_token]
327
- eop_id = self.sp_tokenizer[self.eop_token]
328
  if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
329
  token_ids_0 += [gmask_ids]
330
 
331
  if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
332
- token_ids_0 += [self.sp_tokenizer[self.eos_token]]
333
 
334
  token_ids_0 += [self.sp_tokenizer[self.bos_token]]
335
 
336
  if token_ids_1 is not None:
337
- if not token_ids_1 or token_ids_1[-1] != eop_id:
338
- token_ids_1 += [eop_id]
339
  token_ids_0 += token_ids_1
340
 
341
  return token_ids_0
171
  do_lower_case=False,
172
  remove_space=False,
173
  bos_token='<sop>',
174
+ eos_token='<eop>',
175
+ end_token='</s>',
176
  mask_token='[MASK]',
177
  gmask_token='[gMASK]',
178
  padding_side="left",
185
  padding_side=padding_side,
186
  bos_token=bos_token,
187
  eos_token=eos_token,
188
+ end_token=end_token,
189
  mask_token=mask_token,
190
  gmask_token=gmask_token,
191
  num_image_tokens=num_image_tokens,
198
 
199
  self.bos_token = bos_token
200
  self.eos_token = eos_token
201
+ self.end_token = end_token
202
  self.mask_token = mask_token
203
  self.gmask_token = gmask_token
204
 
213
  return self.convert_tokens_to_ids(self.gmask_token)
214
 
215
  @property
216
+ def end_token_id(self) -> Optional[int]:
217
  """
218
+ `Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
219
  set.
220
  """
221
+ if self.end_token is None:
222
  return None
223
+ return self.convert_tokens_to_ids(self.end_token)
224
 
225
  @property
226
  def vocab_size(self):
324
  """
325
  mask_ids = self.sp_tokenizer[self.mask_token]
326
  gmask_ids = self.sp_tokenizer[self.gmask_token]
327
+ eos_id = self.sp_tokenizer[self.eos_token]
328
  if mask_ids not in token_ids_0 and gmask_ids not in token_ids_0:
329
  token_ids_0 += [gmask_ids]
330
 
331
  if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
332
+ token_ids_0 += [self.sp_tokenizer[self.end_token]]
333
 
334
  token_ids_0 += [self.sp_tokenizer[self.bos_token]]
335
 
336
  if token_ids_1 is not None:
337
+ if not token_ids_1 or token_ids_1[-1] != eos_id:
338
+ token_ids_1 += [eos_id]
339
  token_ids_0 += token_ids_1
340
 
341
  return token_ids_0
tokenizer_config.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "name_or_path": "THUDM/chatglm-6b-int4",
3
  "bos_token": "<sop>",
4
- "eop_token": "<eop>",
5
- "eos_token": "</s>",
6
  "gmask_token": "[gMASK]",
7
  "mask_token": "[MASK]",
8
  "pad_token": "<pad>",
1
  {
2
  "name_or_path": "THUDM/chatglm-6b-int4",
3
  "bos_token": "<sop>",
4
+ "eos_token": "<eop>",
5
+ "end_token": "</s>",
6
  "gmask_token": "[gMASK]",
7
  "mask_token": "[MASK]",
8
  "pad_token": "<pad>",