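"""Export a "mock" copy of 20B_tokenizer_chinese.json in which every regular
vocab entry's token string is replaced by its id (added/special tokens keep
their original entries), then verify the result still loads with tokenizers.
"""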
import copy
import json
from tokenizers import Tokenizer

def export_mock_tokenizer():
    """Rewrite the vocab in place and save it as a *.mock.json sibling file."""
    input_path = "20B_tokenizer_chinese.json"
    with open(input_path, "r", encoding="utf-8") as f_in:
        tokenizer = json.load(f_in)

    vocab = tokenizer["model"]["vocab"]
    added_tokens = [token["id"] for token in tokenizer["added_tokens"]]

    # Iterate over a deep copy because the loop mutates `vocab` while walking it.
    for k, v in copy.deepcopy(vocab).items():
        if v not in added_tokens:
            # Replace the token string with its id rendered as a string;
            # added (special) tokens keep their original entries.
            vocab[str(v)] = v
            vocab.pop(k)

    out_path = input_path.replace(".json", ".mock.json")
    with open(out_path, "w", encoding="utf-8") as f_out:
        f_out.write(json.dumps(tokenizer, ensure_ascii=False, indent=2))

def mock2():
    # Unused placeholder.
    pass

def load_mock_tokenizer():
    """Sanity check: the mocked JSON must still parse as a valid tokenizer."""
    tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.mock.json")
    return tokenizer

if __name__ == "__main__":
    export_mock_tokenizer()
    tokenizer = load_mock_tokenizer()
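    # Usage sketch (an addition, not in the original script; "hello world" is
    # an arbitrary sample string): assuming the mocked file loads cleanly,
    # report its vocab size and encode a sample; the printed tokens should be
    # id strings rather than text pieces.
    print(tokenizer.get_vocab_size())
    print(tokenizer.encode("hello world").tokens)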