import shutil

import wandb
from transformers import BartTokenizer

from idiomify.fetchers import fetch_config
from idiomify.paths import ROOT_DIR


def main():
    config = fetch_config()['tokenizer']
    tokenizer = BartTokenizer.from_pretrained(config['bart'])
    # register the idiom boundary markers as special tokens,
    # so that the BPE tokenizer never splits them apart
    tokenizer.add_special_tokens({
        "additional_special_tokens": ["<idiom>", "</idiom>"],  # beginning and end of an idiom
    })
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        # the local directory to save the tokenizer files to
        tok_dir = ROOT_DIR / "tokenizer"
        tokenizer.save_pretrained(tok_dir)
        artifact = wandb.Artifact(name="tokenizer", type="other",
                                  description=config['description'], metadata=config)
        artifact.add_dir(str(tok_dir))
        # log the artifact under both the "latest" alias and the versioned alias
        run.log_artifact(artifact, aliases=["latest", config['ver']])
        # clean up the local copy once the upload is done
        shutil.rmtree(tok_dir)


if __name__ == '__main__':
    main()
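
# --- Usage sketch (an illustration, not part of this script) ---
# A downstream consumer could fetch the logged tokenizer back from W&B as
# sketched below; the artifact path follows the entity/project/name:alias
# scheme used above. Kept commented out so it does not run with the upload.
#
#   import wandb
#   from transformers import BartTokenizer
#
#   with wandb.init(entity="eubinecto", project="idiomify") as run:
#       artifact = run.use_artifact("eubinecto/idiomify/tokenizer:latest")
#       tok_dir = artifact.download()  # downloads the saved files to a local cache
#       tokenizer = BartTokenizer.from_pretrained(tok_dir)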