File size: 997 Bytes
12f548d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import wandb
import shutil
from transformers import BartTokenizer
from idiomify.fetchers import fetch_config
from idiomify.paths import ROOT_DIR


def main():
    """Build the idiom-aware BART tokenizer and publish it as a W&B artifact.

    Steps:
      1. Read the ``tokenizer`` section of the project config.
      2. Load the pretrained ``BartTokenizer`` named by ``config['bart']`` and
         register ``<idiom>`` / ``</idiom>`` as additional special tokens.
      3. Save the tokenizer to ``ROOT_DIR / "tokenizer"``, add that directory
         to a ``tokenizer`` artifact and log it under the aliases ``latest``
         and ``config['ver']``.
      4. Remove the local staging directory, whether or not the steps above
         succeeded.
    """
    config = fetch_config()['tokenizer']
    tokenizer = BartTokenizer.from_pretrained(config['bart'])
    tokenizer.add_special_tokens({
        "additional_special_tokens": ["<idiom>", "</idiom>"],  # beginning and end of an idiom
    })

    with wandb.init(entity="eubinecto", project="idiomify") as run:
        # local staging directory the tokenizer files are written to before upload
        tok_dir = ROOT_DIR / "tokenizer"
        try:
            tokenizer.save_pretrained(tok_dir)
            artifact = wandb.Artifact(name="tokenizer", type="other", description=config['description'],
                                      metadata=config)
            artifact.add_dir(tok_dir)
            # then, we just log them here.
            run.log_artifact(artifact, aliases=["latest", config['ver']])
        finally:
            # Always clean up the staging directory, even when saving or logging
            # fails. Cleanup is best-effort (ignore_errors=True) so that a missing
            # or partially written directory cannot mask the original exception.
            shutil.rmtree(tok_dir, ignore_errors=True)


# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()