# idiomify/main_upload_tokenizer.py
# Uploads a BartTokenizer extended with the idiom special tokens to wandb,
# so that the tokenizer is fetchable directly from wandb as an artifact.
import shutil

import wandb
from transformers import BartTokenizer

from idiomify.fetchers import fetch_config
from idiomify.paths import ROOT_DIR


def main():
    config = fetch_config()['tokenizer']
    tokenizer = BartTokenizer.from_pretrained(config['bart'])
    tokenizer.add_special_tokens({
        "additional_special_tokens": ["<idiom>", "</idiom>"],  # mark the beginning and the end of an idiom
    })
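    # note: any model paired with this tokenizer must grow its embedding matrix to
    # match the enlarged vocabulary, e.g. model.resize_token_embeddings(len(tokenizer))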
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        # the local directory to save the tokenizer files to
        tok_dir = ROOT_DIR / "tokenizer"
        tokenizer.save_pretrained(tok_dir)
        artifact = wandb.Artifact(name="tokenizer", type="other",
                                  description=config['description'],
                                  metadata=config)
        artifact.add_dir(tok_dir)
        # log the artifact, aliasing it with both "latest" and the configured version
        run.log_artifact(artifact, aliases=["latest", config['ver']])
        # clean up the local copy; the files now live on wandb
        shutil.rmtree(tok_dir)
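

# The artifact logged above can be pulled back down in a consuming script.
# Below is a minimal sketch of that fetch path (an assumed usage example, not
# part of the original file): the entity, project, artifact name, and the
# "latest" alias mirror the upload in main(); use_artifact() and download()
# are standard wandb API calls.
def fetch_tokenizer() -> BartTokenizer:
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        artifact = run.use_artifact("tokenizer:latest")
        tok_dir = artifact.download()  # downloads the artifact's files to a local dir
        return BartTokenizer.from_pretrained(tok_dir)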


if __name__ == '__main__':
    main()