from transformers import LlamaTokenizerFast
import json
import os


def fn_load_tokenizer_llama(
    max_seq_length,
    dir_tokenizer: str = "./tokenizer.json",
    # dir_tokenizer: str = os.path.abspath(os.path.join(os.getcwd(), '..', "models_mtr/tokenizer.json")),  # for JUP
    add_eos_token: bool = True,
):
    # Build a fast Llama tokenizer from a local tokenizer.json file.
    # NOTE: the special-token strings were stripped from the source
    # (angle-bracket tokens lost to HTML-style sanitizing); the standard
    # Llama values "<s>", "</s>", "<unk>" are restored here as assumptions.
    tokenizer = LlamaTokenizerFast(
        tokenizer_file=dir_tokenizer,
        model_max_length=max_seq_length,
        padding_side="right",
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        add_eos_token=add_eos_token,
    )
    # Register the remaining special tokens; the angle-bracket forms below
    # are likewise assumed, as the originals were stripped.
    tokenizer.add_special_tokens(
        {
            "pad_token": "<pad>",
            "sep_token": "<sep>",
            "cls_token": "<cls>",
            "mask_token": "<mask>",
        }
    )
    # tokenizer.add_special_tokens({"pad_token": "<pad>"})
    return tokenizer


def fn_load_descriptor_list(
    key_descriptor_list,
    dir_descriptor_list,
):
    # Read a JSON file and return the list stored under the given key.
    with open(dir_descriptor_list, "r") as js:
        list_descriptor = json.load(js)[key_descriptor_list]
    return list_descriptor
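
# Example usage: a minimal sketch showing how the two loaders above might be
# called together. The sequence length, the descriptor-file path, the
# "descriptors" key, and the sample input string are hypothetical
# placeholders, not values taken from this repository.
if __name__ == "__main__":
    tokenizer = fn_load_tokenizer_llama(max_seq_length=512)
    print(tokenizer)  # prints vocab size and registered special tokens

    # Hypothetical descriptor file and key, for illustration only:
    # list_descriptor = fn_load_descriptor_list(
    #     key_descriptor_list="descriptors",
    #     dir_descriptor_list="./descriptor_list.json",
    # )

    # Tokenize a placeholder string; padding="max_length" pads up to
    # model_max_length (512 here) using the <pad> token registered above.
    encoded = tokenizer("CCO", padding="max_length", truncation=True)
    print(len(encoded["input_ids"]))  # 512 after padding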