import os
import argparse

from transformers import LlamaTokenizer

# Force the pure-Python protobuf implementation (a common workaround for
# protobuf descriptor errors when loading SentencePiece-based slow tokenizers).
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
def update_tokenizer(set_chatml, tokenizer_path="./tokenizer.model"):
    # Reload the tokenizer from the pretrained path
    custom_tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)

    # Show the current BOS/EOS tokens before any changes
    print(f"Current BOS token: {custom_tokenizer.bos_token} (id: {custom_tokenizer.bos_token_id})")
    print(f"Current EOS token: {custom_tokenizer.eos_token} (id: {custom_tokenizer.eos_token_id})")

    # Report the total vocabulary size
    vocab_size = len(custom_tokenizer)
    print(f"Total vocabulary size: {vocab_size}")
    # Set bos_token and eos_token according to the --set_chatml flag
    if set_chatml:
        print("Setting ChatML markers as BOS/EOS")
        custom_tokenizer.bos_token = "<|im_start|>"
        custom_tokenizer.eos_token = "<|im_end|>"
        print("BOS and EOS tokens set to '<|im_start|>' and '<|im_end|>' for ChatML.")
    else:
        print(f"Default BOS token: {custom_tokenizer.bos_token}, EOS token: {custom_tokenizer.eos_token}")
    startoftext_id = custom_tokenizer.convert_tokens_to_ids('<|startoftext|>')
    endoftext_id = custom_tokenizer.convert_tokens_to_ids('<|endoftext|>')
    print(f"Current <|startoftext|> id: {startoftext_id}")
    print(f"Current <|endoftext|> id: {endoftext_id}")

    startofs_id = custom_tokenizer.convert_tokens_to_ids('<s>')
    endofs_id = custom_tokenizer.convert_tokens_to_ids('</s>')
    print(f"Current <s> id: {startofs_id}")
    print(f"Current </s> id: {endofs_id}")
    # Report the final BOS/EOS tokens and their ids
    bos_id = custom_tokenizer.bos_token_id
    eos_id = custom_tokenizer.eos_token_id
    bos_token = custom_tokenizer.convert_ids_to_tokens(bos_id)
    eos_token = custom_tokenizer.convert_ids_to_tokens(eos_id)
    print(f"Final BOS token: {bos_token} (id: {bos_id})")
    print(f"Final EOS token: {eos_token} (id: {eos_id})")

    print("all_special_tokens:", custom_tokenizer.all_special_tokens)
    print("all_special_ids:", custom_tokenizer.all_special_ids)
    print("special_tokens_map:", custom_tokenizer.special_tokens_map)

    output_hf_dir = "./save_tokenizer"
    custom_tokenizer.save_pretrained(output_hf_dir)
    print(f"Tokenizer has been saved to {output_hf_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Update the tokenizer, optionally setting BOS/EOS to the ChatML format")
    parser.add_argument("--set_chatml", action="store_true", help="If set, use the ChatML markers for the BOS and EOS tokens.")
    args = parser.parse_args()

    # Update the tokenizer according to the command-line arguments
    update_tokenizer(args.set_chatml)
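
# Example invocations (assuming this file is saved as update_tokenizer.py;
# the filename is illustrative, not taken from the source):
#   python update_tokenizer.py               # inspect and re-save with the default BOS/EOS
#   python update_tokenizer.py --set_chatml  # switch BOS/EOS to the ChatML markers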