import os
import argparse

from transformers import LlamaTokenizer
import sentencepiece as spm

# Force the pure-Python protobuf implementation; this is a common workaround
# for protobuf descriptor errors when working with SentencePiece tokenizers.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"


def update_tokenizer(set_chatml, tokenizer_dir="./tokenizer.model"):
    """Inspect a Llama tokenizer and optionally switch BOS/EOS to the ChatML tokens."""
    custom_tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir)

    print(f"Current BOS token: {custom_tokenizer.bos_token} (id: {custom_tokenizer.bos_token_id})")
    print(f"Current EOS token: {custom_tokenizer.eos_token} (id: {custom_tokenizer.eos_token_id})")

    vocab_size = len(custom_tokenizer)
    print(f"Total vocabulary size: {vocab_size}")

    if set_chatml:
        print("Setting ChatML tokens as BOS/EOS")
        # Note: if these tokens are not already in the vocabulary, their ids
        # will resolve to the unknown-token id.
        custom_tokenizer.bos_token = "<|im_start|>"
        custom_tokenizer.eos_token = "<|im_end|>"
        print("BOS and EOS tokens set to '<|im_start|>' and '<|im_end|>' for ChatML.")
    else:
        print(f"Default BOS token: {custom_tokenizer.bos_token}, EOS token: {custom_tokenizer.eos_token}")

    # Ids of the <|startoftext|> / <|endoftext|> tokens (unk id if absent from the vocabulary).
    startoftext_id = custom_tokenizer.convert_tokens_to_ids('<|startoftext|>')
    endoftext_id = custom_tokenizer.convert_tokens_to_ids('<|endoftext|>')

    print(f"Current <|startoftext|> id: {startoftext_id}")
    print(f"Current <|endoftext|> id: {endoftext_id}")

    # Ids of the default Llama boundary tokens.
    startofs_id = custom_tokenizer.convert_tokens_to_ids('<s>')
    endofs_id = custom_tokenizer.convert_tokens_to_ids('</s>')

    print(f"Current <s> id: {startofs_id}")
    print(f"Current </s> id: {endofs_id}")

    # Report the final BOS/EOS mapping after any changes.
    bos_id = custom_tokenizer.bos_token_id
    eos_id = custom_tokenizer.eos_token_id
    bos_token = custom_tokenizer.convert_ids_to_tokens(bos_id)
    eos_token = custom_tokenizer.convert_ids_to_tokens(eos_id)
    print(f"Final BOS token: {bos_token} (id: {bos_id})")
    print(f"Final EOS token: {eos_token} (id: {eos_id})")

    print("all_special_tokens:\n\n", custom_tokenizer.all_special_tokens)
    print("all_special_ids:\n\n", custom_tokenizer.all_special_ids)
    print("special_tokens_map:\n\n", custom_tokenizer.special_tokens_map)

    # Save the updated tokenizer to a separate directory so the original files are kept intact.
    save_dir = "./save_tokenizer"
    custom_tokenizer.save_pretrained(save_dir)
    print(f"Tokenizer has been saved to {save_dir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Update the tokenizer, optionally setting BOS/EOS to the ChatML format")
    parser.add_argument("--set_chatml", action="store_true", help="If set, use the ChatML BOS and EOS tokens.")
    args = parser.parse_args()

    update_tokenizer(args.set_chatml)
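
# Example usage (a sketch; the script file name is assumed, and a SentencePiece
# tokenizer is expected at the default ./tokenizer.model path; output goes to ./save_tokenizer):
#   python update_tokenizer.py                # inspect and re-save with the default BOS/EOS
#   python update_tokenizer.py --set_chatml   # switch BOS/EOS to <|im_start|> / <|im_end|>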