Nanobit committed on
Commit
e0b7eea
1 Parent(s): 43856c0

Fix(tokenizer): Set rstrip,lstrip,norm to False (#678)

Browse files
Files changed (1) hide show
  1. src/axolotl/utils/models.py +10 -2
src/axolotl/utils/models.py CHANGED
@@ -11,6 +11,7 @@ from optimum.bettertransformer import BetterTransformer
11
  from peft import PeftConfig, prepare_model_for_kbit_training
12
  from peft.tuners.lora import QuantLinear
13
  from transformers import ( # noqa: F401
 
14
  AutoConfig,
15
  AutoModelForCausalLM,
16
  AutoTokenizer,
@@ -82,9 +83,16 @@ def load_tokenizer(cfg):
82
 
83
  if cfg.special_tokens:
84
  for k, val in cfg.special_tokens.items():
85
- tokenizer.add_special_tokens({k: val})
 
 
86
  if cfg.tokens:
87
- tokenizer.add_tokens(list(cfg.tokens))
 
 
 
 
 
88
 
89
  return tokenizer
90
 
 
11
  from peft import PeftConfig, prepare_model_for_kbit_training
12
  from peft.tuners.lora import QuantLinear
13
  from transformers import ( # noqa: F401
14
+ AddedToken,
15
  AutoConfig,
16
  AutoModelForCausalLM,
17
  AutoTokenizer,
 
83
 
84
  if cfg.special_tokens:
85
  for k, val in cfg.special_tokens.items():
86
+ tokenizer.add_special_tokens(
87
+ {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
88
+ )
89
  if cfg.tokens:
90
+ tokenizer.add_tokens(
91
+ [
92
+ AddedToken(token, rstrip=False, lstrip=False, normalized=False)
93
+ for token in cfg.tokens
94
+ ]
95
+ )
96
 
97
  return tokenizer
98