"""Tokenizer type: HFTokenizer.

Sources:
- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer
"""
import json

import ftfy

from gpt_nexo_20b.tokenizer import build_tokenizer


class Encoder(object):
    """Thin wrapper that builds a tokenizer from ``args`` and encodes text.

    Mirrors the Encoder in gpt-neox's ``tools/preprocess_data.py``.
    """

    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Use the Encoder class itself as a container for global data, so
        # the tokenizer is shared (e.g. across multiprocessing workers).
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        """Tokenize ``text`` and return the list of token ids.

        If ``args.ftfy`` is truthy, the text is first repaired with
        ``ftfy.fix_text``.
        """
        if self.args.ftfy:
            text = ftfy.fix_text(text)
        # NOTE(review): the original created an unused ``ids = {}`` dict
        # here; removed as dead code.
        return Encoder.tokenizer.tokenize(text)


class HFConfig:
    """Arguments for building an HFTokenizer.

    Open question from the original author: what is ``jsonl_keys`` for?

    Corresponding config file:
    https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml
        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """

    def __init__(self):
        self.append_eod = True
        self.ftfy = False
        self.keep_empty = False
        self.log_interval = 100
        self.make_vocab_size_divisible_by = 128
        self.model_parallel_size = 1
        self.padded_vocab_size = 50304
        self.rank = 0
        self.tokenizer_type = 'HFTokenizer'
        self.vocab_file = '20B_tokenizer.json'


class GPTConfig:
    """Arguments for building a GPT2BPETokenizer.

    Corresponding config file:
    https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml
        "vocab-file": "data/gpt2-vocab.json",
        "merge-file": "data/gpt2-merges.txt",
        "tokenizer_type": Default = GPT2BPETokenizer  # default value
    """

    def __init__(self):
        self.input = './data/enwik8/enwik8.zip'
        self.merge_file = './data/gpt2-merges.txt'
        self.workers = 1


class BERTConfig:
    """Apparently unsupported combination:

        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """
    pass


def test():
    """Smoke-test the HFTokenizer: encode a mixed Chinese/English string
    and print each token id with its detokenized text."""
    args = HFConfig()
    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    encoder.initializer()
    tokens = encoder.encode("中国\ngood job一个人去哪里")
    # Expected leading ids: 13609 -> 中, 23197 -> 国
    print(tokens)
    for token in tokens:
        print(token, Encoder.tokenizer.detokenize([token]))


def convert_vocab():
    """Re-dump the tokenizer JSON with unescaped (human-readable) unicode."""
    # Use context managers so both file handles are closed deterministically
    # (the original passed bare open(...) calls and leaked the handles).
    with open("20B_tokenizer.json", "r", encoding="utf-8") as f_in:
        vocab = json.load(f_in)
    with open("20B_tokenizer.zh.json", "w", encoding="utf-8") as f_out:
        json.dump(vocab, f_out, ensure_ascii=False, indent=2)


def dump_vocab():
    """Write every vocab token to ``20B.vocab.txt``, one per line."""
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    with open("20B.vocab.txt", "w", encoding="utf-8") as f_out:
        for token in tokenizer.vocab:
            f_out.write(token + "\n")


# Sample output of test() — token id followed by its detokenized text:
"""
13609 中
23197 国
187
12311 good
2628 job
27896 一个
13484 人
44781 去
20833 �
105 �
42013 里
"""

if __name__ == "__main__":
    test()
    # convert_vocab()
    # dump_vocab()