""" | |
tokenizer类型:HFTokenizer | |
## Run | |
## 来源 | |
- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py | |
- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer | |
""" | |
import json

import ftfy

from gpt_nexo_20b.tokenizer import build_tokenizer
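
# For reference: gpt-neox's HFTokenizer appears to be a thin wrapper around the
# huggingface `tokenizers` library, so 20B_tokenizer.json can presumably also be
# loaded directly. A minimal sketch, not used by the rest of this script:
def load_tokenizer_directly(vocab_file='20B_tokenizer.json'):
    from tokenizers import Tokenizer
    tok = Tokenizer.from_file(vocab_file)
    ids = tok.encode("good job").ids  # list of token ids
    text = tok.decode(ids)            # round-trip back to a string
    return ids, text
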
class Encoder(object):
    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Use the Encoder class itself as a container for global data,
        # so each worker process builds its own tokenizer once.
        Encoder.tokenizer = build_tokenizer(self.args)

    def encode(self, text):
        if self.args.ftfy:
            text = ftfy.fix_text(text)
        text_ids = Encoder.tokenizer.tokenize(text)
        return text_ids
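
# Why the tokenizer is stored on the Encoder *class*: in the upstream
# preprocess_data.py the Encoder instance is shipped to worker processes and
# `initializer` rebuilds the tokenizer once per worker. A rough sketch of that
# usage pattern (the jsonl path and worker count here are illustrative):
def encode_file_parallel(args, path="data.jsonl", workers=2):
    import multiprocessing
    encoder = Encoder(args)
    with open(path, "r", encoding="utf-8") as fin:
        with multiprocessing.Pool(workers, initializer=encoder.initializer) as pool:
            # Each worker tokenizes one raw line; collect the lists of token ids.
            return list(pool.imap(encoder.encode, fin, 25))
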
class HFConfig:
    """
    What is jsonl_keys for? (In preprocess_data.py it presumably selects which
    JSON fields of each input line get tokenized.)
    Corresponding config file: https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml
        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """

    def __init__(self):
        self.append_eod = True
        self.ftfy = False
        self.keep_empty = False
        self.log_interval = 100
        self.make_vocab_size_divisible_by = 128
        self.model_parallel_size = 1
        self.padded_vocab_size = 50304
        self.rank = 0
        self.tokenizer_type = 'HFTokenizer'
        self.vocab_file = '20B_tokenizer.json'
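
# How padded_vocab_size relates to make_vocab_size_divisible_by: Megatron rounds
# the raw vocab size up to the next multiple of
# make_vocab_size_divisible_by * model_parallel_size, roughly as sketched below.
# With a multiple of 128 * 1, any raw size in 50177..50304 pads to 50304.
def pad_vocab_size(orig_vocab_size, make_vocab_size_divisible_by=128, model_parallel_size=1):
    multiple = make_vocab_size_divisible_by * model_parallel_size
    padded = orig_vocab_size
    while padded % multiple != 0:
        padded += 1
    return padded
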
class GPTConfig:
    """
    Corresponding config file: https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml
        "vocab-file": "data/gpt2-vocab.json",
        "merge-file": "data/gpt2-merges.txt",
        "tokenizer_type": Default = GPT2BPETokenizer  # default value
    """

    def __init__(self):
        self.input = './data/enwik8/enwik8.zip'
        self.merge_file = './data/gpt2-merges.txt'
        self.workers = 1
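
# GPTConfig above only carries part of what build_tokenizer reads. Judging from
# the gpt-neox build_tokenizer, selecting the GPT2 BPE tokenizer would presumably
# also require at least the fields below (values are illustrative, mirroring
# HFConfig; this class is a sketch, not taken from the original script):
class GPT2BPEConfig(GPTConfig):
    def __init__(self):
        super().__init__()
        self.tokenizer_type = 'GPT2BPETokenizer'
        self.vocab_file = './data/gpt2-vocab.json'
        self.rank = 0
        self.make_vocab_size_divisible_by = 128
        self.model_parallel_size = 1
        self.append_eod = True
        self.ftfy = False
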
class BERTConfig:
    """Seems not to be supported.
        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
        "tokenizer_type": "HFTokenizer",
    """
    pass
def test():
    args = HFConfig()
    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    encoder.initializer()
    tokens = encoder.encode("中国\ngood job一个人去哪里")
    # Expected first two ids:
    # 13609 中
    # 23197 国
    print(tokens)
    # Decode token by token to see how the text was segmented.
    for token in tokens:
        print(token, Encoder.tokenizer.detokenize([token]))
def convert_vocab():
    # Re-dump the tokenizer json with ensure_ascii=False so non-ASCII tokens
    # (e.g. Chinese) are human-readable in the output file.
    vocab = json.load(open("20B_tokenizer.json", "r", encoding="utf-8"))
    json.dump(vocab, open("20B_tokenizer.zh.json", "w", encoding="utf-8"), ensure_ascii=False, indent=2)
def dump_vocab():
    # Write every token of the vocabulary to a plain-text file, one per line.
    args = HFConfig()
    tokenizer = build_tokenizer(args)
    print(f"Vocab size: {tokenizer.vocab_size}")
    with open("20B.vocab.txt", "w", encoding="utf-8") as f_out:
        for token in tokenizer.vocab:
            f_out.write(token + "\n")
""" | |
13609 中 | |
23197 国 | |
187 | |
12311 good | |
2628 job | |
27896 一个 | |
13484 人 | |
44781 去 | |
20833 � | |
105 � | |
42013 里 | |
""" | |
if __name__ == "__main__": | |
test() | |
# convert_vocab() | |
# dump_vocab() |