curtis-sun committed
Commit
c5b3968
1 Parent(s): 8069b5c

update tokenizer according to https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/discussions/18

Files changed (1):
  1. tokenization_baichuan.py +5 -5
tokenization_baichuan.py CHANGED
@@ -68,6 +68,11 @@ class BaichuanTokenizer(PreTrainedTokenizer):
             if isinstance(pad_token, str)
             else pad_token
         )
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -79,11 +84,6 @@ class BaichuanTokenizer(PreTrainedTokenizer):
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)

     def __getstate__(self):
         state = self.__dict__.copy()
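The reorder matters because recent transformers releases resolve special and added tokens inside PreTrainedTokenizer.__init__, which calls back into the subclass's vocabulary methods; for this tokenizer those methods depend on self.sp_model, so the SentencePiece model must be loaded before super().__init__() runs, which is the failure the linked discussion reports. A minimal smoke test for the patched tokenizer, assuming transformers and sentencepiece are installed and the hub repo is reachable; the sample string is illustrative:

from transformers import AutoTokenizer

# trust_remote_code is required because tokenization_baichuan.py ships
# inside the model repo. With sp_model loaded before super().__init__(),
# this no longer fails inside the base constructor.
tokenizer = AutoTokenizer.from_pretrained(
    "baichuan-inc/Baichuan2-13B-Chat",
    use_fast=False,
    trust_remote_code=True,
)

# Round-trip a short string to confirm the SentencePiece model is usable.
ids = tokenizer("hello world")["input_ids"]
print(tokenizer.decode(ids))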