baichuan7B

#26
by wttRucer - opened
Files changed (1) hide show
  1. tokenization_baichuan.py +6 -7
tokenization_baichuan.py CHANGED
@@ -71,12 +71,6 @@ class BaiChuanTokenizer(PreTrainedTokenizer):
71
  eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
72
  unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
73
  pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
74
- self.vocab_file = vocab_file
75
- self.add_bos_token = add_bos_token
76
- self.add_eos_token = add_eos_token
77
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
78
- self.sp_model.Load(vocab_file)
79
-
80
  super().__init__(
81
  bos_token=bos_token,
82
  eos_token=eos_token,
@@ -88,6 +82,11 @@ class BaiChuanTokenizer(PreTrainedTokenizer):
88
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
89
  **kwargs,
90
  )
 
 
 
 
 
91
 
92
  def __getstate__(self):
93
  state = self.__dict__.copy()
@@ -248,4 +247,4 @@ class BaiChuanTokenizer(PreTrainedTokenizer):
248
  if token_ids_1 is not None:
249
  output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
250
 
251
- return output
 
71
  eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
72
  unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
73
  pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
 
 
 
 
 
 
74
  super().__init__(
75
  bos_token=bos_token,
76
  eos_token=eos_token,
 
82
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
83
  **kwargs,
84
  )
85
+ self.vocab_file = vocab_file
86
+ self.add_bos_token = add_bos_token
87
+ self.add_eos_token = add_eos_token
88
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
89
+ self.sp_model.Load(vocab_file)
90
 
91
  def __getstate__(self):
92
  state = self.__dict__.copy()
 
247
  if token_ids_1 is not None:
248
  output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
249
 
250
+ return output