File size: 2,964 Bytes
751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
"""
Tokenizer type: HFTokenizer

## Run

## Sources
- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer
"""
import json
import ftfy
from gpt_nexo_20b.tokenizer import build_tokenizer
class Encoder:
    """Encode text into token ids with a tokenizer shared at class level.

    The tokenizer is stored on the class (``Encoder.tokenizer``) rather than
    on an instance, so every ``Encoder`` in the process uses the same one.
    ``initializer`` (or an explicit assignment to ``Encoder.tokenizer``) must
    run before ``encode`` is called.
    """

    def __init__(self, args):
        """Keep the configuration object.

        Args:
            args: Configuration object. ``encode`` reads ``args.ftfy``;
                ``initializer`` hands the whole object to ``build_tokenizer``.
        """
        self.args = args

    def initializer(self):
        """Build the tokenizer from ``self.args`` and publish it on the class."""
        # Use Encoder class as a container for global data
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        """Tokenize ``text`` and return whatever the tokenizer produces.

        Args:
            text: The string to encode.

        Returns:
            The result of ``Encoder.tokenizer.tokenize(text)``.

        Raises:
            AttributeError: If ``Encoder.tokenizer`` has not been set yet.
        """
        if self.args.ftfy:
            # Optional clean-up pass through ftfy before tokenizing.
            text = ftfy.fix_text(text)
        return Encoder.tokenizer.tokenize(text)
class HFConfig:
    """Argument object selecting the ``HFTokenizer`` with ``20B_tokenizer.json``.

    Open question left by the original author: what is ``jsonl_keys`` for?

    Corresponding config file:
    https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml
        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """

    def __init__(self):
        """Populate the instance with the fixed default settings."""
        defaults = {
            "append_eod": True,
            "ftfy": False,
            "keep_empty": False,
            "log_interval": 100,
            "make_vocab_size_divisible_by": 128,
            "model_parallel_size": 1,
            "padded_vocab_size": 50304,
            "rank": 0,
            "tokenizer_type": 'HFTokenizer',
            "vocab_file": '20B_tokenizer.json',
        }
        # Assign in table order so the instance attributes are created in
        # the same sequence as before.
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)
class GPTConfig:
    """Argument object holding the GPT-2 style input, merges file and worker count.

    Corresponding config file:
    https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml
        "vocab-file": "data/gpt2-vocab.json",
        "merge-file": "data/gpt2-merges.txt",
        "tokenizer_type": Default = GPT2BPETokenizer  # the default value
    """

    def __init__(self):
        """Populate the instance with the fixed default settings."""
        settings = (
            ("input", './data/enwik8/enwik8.zip'),
            ("merge_file", './data/gpt2-merges.txt'),
            ("workers", 1),
        )
        for attr_name, attr_value in settings:
            setattr(self, attr_name, attr_value)
class BERTConfig:
    """Placeholder for a BERT tokenizer configuration; appears to be unsupported.

        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """
def test():
    """Smoke-test the HFTokenizer on a mixed Chinese/English sample.

    Prints the vocabulary size, the token ids produced for the sample string,
    and then each id next to the text it detokenizes to.
    """
    args = HFConfig()
    encoder = Encoder(args)
    # Build the tokenizer exactly once: initializer() publishes it as
    # Encoder.tokenizer, so a separate build_tokenizer() call just to read
    # the vocabulary size would load the same tokenizer a second time.
    encoder.initializer()
    tokenizer = Encoder.tokenizer
    print(f"Vocab size: {tokenizer.vocab_size}")
    tokens = encoder.encode("中国\ngood job一个人去哪里")
    # 13609 中
    # 23197 国
    print(tokens)
    for token in tokens:
        # Detokenize one id at a time to show what each token stands for.
        print(token, tokenizer.detokenize([token]))
def convert_vocab(src_path="20B_tokenizer.json", dst_path="20B_tokenizer.zh.json"):
    """Re-serialize a tokenizer JSON file with non-ASCII text left unescaped.

    The input is parsed as JSON and written back with ``ensure_ascii=False``
    and a two-space indent, so characters such as Chinese appear literally
    instead of as ``\\uXXXX`` escapes.

    Args:
        src_path: JSON file to read. Defaults to ``20B_tokenizer.json``.
        dst_path: File to write. Defaults to ``20B_tokenizer.zh.json``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
    """
    # Context managers guarantee both handles are closed (and the output is
    # flushed) even when parsing or serializing raises.
    with open(src_path, "r", encoding="utf-8") as f_in:
        vocab = json.load(f_in)
    with open(dst_path, "w", encoding="utf-8") as f_out:
        json.dump(vocab, f_out, ensure_ascii=False, indent=2)
def dump_vocab():
    """Write every entry of the HFTokenizer vocabulary to ``20B.vocab.txt``.

    Prints the vocabulary size first, then writes one token per line in the
    order that iterating ``tokenizer.vocab`` yields them.
    """
    tokenizer = build_tokenizer(HFConfig())
    print(f"Vocab size: {tokenizer.vocab_size}")
    with open("20B.vocab.txt", "w", encoding="utf-8") as f_out:
        f_out.writelines(token + "\n" for token in tokenizer.vocab)
"""
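Sample output of test() for the input above (token id, then its decoded text):

13609 中
23197 国
187
12311 good
2628 job
27896 一个
13484 人
44781 去
20833 �
105 �
42013 里

Notes:
- Token 187 is the line break in the input, so its decoded text is not visible.
- Tokens 20833 and 105 sit where the input has the single character 哪; they
  appear to be two partial pieces of it, each of which decodes on its own
  to the replacement character.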
"""
if __name__ == "__main__":
    # Default action when run as a script: the tokenizer smoke test.
    test()
    # Alternative one-off utilities, left disabled; uncomment to run them.
    # convert_vocab()
    # dump_vocab()