Spaces:

eson
/

tokenizer-arena

Running

App Files Files Community

tokenizer-arena / vocab /gpt_nexo_20b /test_gpt_neox_20b.py

eson

update

751936e 10 months ago

raw history blame

No virus

2.96 kB

	"""

	Tokenizer type: HFTokenizer


	## Run



	## Sources

	- https://github.com/EleutherAI/gpt-neox/blob/main/tools/preprocess_data.py
	- https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/tokenizer

	"""

	import json
	import ftfy
	from gpt_nexo_20b.tokenizer import build_tokenizer


	class Encoder:
	    """Tokenize text with a tokenizer that is shared through the class itself.

	    The tokenizer is stored as ``Encoder.tokenizer`` (a class attribute), not on
	    the instance. It is only assigned by ``initializer``, so that method (or an
	    explicit assignment to ``Encoder.tokenizer``) must happen before ``encode``.
	    """

	    def __init__(self, args):
	        """Store ``args``.

	        Args:
	            args: settings object; ``args.ftfy`` is read by ``encode`` and the
	                whole object is handed to ``build_tokenizer`` by ``initializer``.
	        """
	        self.args = args

	    def initializer(self):
	        """Build the tokenizer from ``self.args`` and publish it on the class."""
	        # Use Encoder class as a container for global data
	        Encoder.tokenizer = build_tokenizer(self.args)

	    def encode(self, text):
	        """Return the result of ``Encoder.tokenizer.tokenize`` for ``text``.

	        Args:
	            text: the string to tokenize.

	        Returns:
	            Whatever ``Encoder.tokenizer.tokenize`` returns for the (possibly
	            ftfy-repaired) text.

	        Raises:
	            AttributeError: if ``Encoder.tokenizer`` has not been set yet.
	        """
	        if self.args.ftfy:
	            # Optional clean-up pass, enabled by the ``ftfy`` flag on args.
	            text = ftfy.fix_text(text)
	        return Encoder.tokenizer.tokenize(text)


	class HFConfig:
	    """Settings bundle that selects the ``HFTokenizer`` with ``20B_tokenizer.json``.

	    Open question left by the original author: what is ``jsonl_keys`` for?

	    Matching upstream config file:
	    https://github.com/EleutherAI/gpt-neox/blob/main/configs/20B.yml
	        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
	        "tokenizer_type": "HFTokenizer",
	    """

	    def __init__(self):
	        """Populate every setting with its fixed value."""
	        settings = {
	            "append_eod": True,
	            "ftfy": False,
	            "keep_empty": False,
	            "log_interval": 100,
	            "make_vocab_size_divisible_by": 128,
	            "model_parallel_size": 1,
	            "padded_vocab_size": 50304,
	            "rank": 0,
	            "tokenizer_type": 'HFTokenizer',
	            "vocab_file": '20B_tokenizer.json',
	        }
	        # Applied in declaration order so the instance attributes keep their order.
	        for key, value in settings.items():
	            setattr(self, key, value)


	class GPTConfig:
	    """Settings bundle carrying the GPT-2 merges path, an input archive and a worker count.

	    Matching upstream config file:
	    https://github.com/EleutherAI/gpt-neox/blob/main/configs/local_setup.yml
	        "vocab-file": "data/gpt2-vocab.json",
	        "merge-file": "data/gpt2-merges.txt",

	        "tokenizer_type": Default = GPT2BPETokenizer  # the default value

	    NOTE(review): neither ``vocab_file`` nor ``tokenizer_type`` is set here --
	    confirm whether this object is meant to be passed to ``build_tokenizer``.
	    """

	    def __init__(self):
	        """Set the three fixed attributes."""
	        vars(self).update(
	            input='./data/enwik8/enwik8.zip',
	            merge_file='./data/gpt2-merges.txt',
	            workers=1,
	        )

	class BERTConfig:
	    """Placeholder with no settings; the original author notes this seems unsupported.

	        "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
	        "tokenizer_type": "HFTokenizer",
	    """


	def test():
	    """Print the vocab size, then tokenize a mixed Chinese/English sample.

	    The full id list is printed first, followed by one line per id showing the
	    id and the text that ``Encoder.tokenizer.detokenize`` returns for it alone.
	    """
	    config = HFConfig()
	    enc = Encoder(config)

	    # A standalone tokenizer is built only to report the vocabulary size; the
	    # encoder gets its own tokenizer from initializer() right after.
	    tokenizer = build_tokenizer(config)
	    print(f"Vocab size: {tokenizer.vocab_size}")
	    enc.initializer()

	    token_ids = enc.encode("中国\ngood job一个人去哪里")
	    # Reference ids noted by the original author: 13609 -> 中, 23197 -> 国
	    print(token_ids)

	    detokenize = Encoder.tokenizer.detokenize
	    for token_id in token_ids:
	        # Decode each id on its own to see what a single token maps to.
	        print(token_id, detokenize([token_id]))



	def convert_vocab(src_path="20B_tokenizer.json", dst_path="20B_tokenizer.zh.json"):
	    """Re-serialize a tokenizer JSON file with non-ASCII characters left readable.

	    The content is loaded and written back unchanged; only the formatting
	    differs (``ensure_ascii=False`` and a 2-space indent).

	    Args:
	        src_path: JSON file to read (UTF-8). Defaults to the original
	            hard-coded input name.
	        dst_path: file to write (UTF-8); an existing file is overwritten.
	            Defaults to the original hard-coded output name.

	    Raises:
	        OSError: if either file cannot be opened.
	        json.JSONDecodeError: if ``src_path`` does not contain valid JSON.
	    """
	    # Context managers close both handles, so the output is flushed to disk
	    # by the time the function returns.
	    with open(src_path, "r", encoding="utf-8") as f_in:
	        vocab = json.load(f_in)
	    with open(dst_path, "w", encoding="utf-8") as f_out:
	        json.dump(vocab, f_out, ensure_ascii=False, indent=2)


	def dump_vocab():
	    """Print the vocab size and write each entry of ``tokenizer.vocab`` to ``20B.vocab.txt``.

	    Entries are written one per line, in the order that iterating
	    ``tokenizer.vocab`` yields them.
	    """
	    config = HFConfig()
	    tokenizer = build_tokenizer(config)
	    print(f"Vocab size: {tokenizer.vocab_size}")
	    with open("20B.vocab.txt", "w", encoding="utf-8") as f_out:
	        f_out.writelines(token + "\n" for token in tokenizer.vocab)

	"""
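	Output printed by test(): token id, then the text decoded from that id alone.
	Id 187 decodes to the line break in the sample, hence the empty line after it.
	Ids 20833 and 105 each print as the replacement character; presumably they
	are the two byte-level pieces of 哪, the one input character that has no
	line of its own below.

	13609 中
	23197 国
	187

	12311 good
	2628 job
	27896 一个
	13484 人
	44781 去
	20833 �
	105 �
	42013 里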
	"""



	if __name__ == "__main__":
	    # Default action when run as a script: the tokenization demo.
	    test()
	    # Alternative entry points, kept disabled by the original author:
	    # convert_vocab()
	    # dump_vocab()