eubinecto committed on
Commit 210581d
Parents: 70f038c, 12f548d

Merge pull request #8 from eubinecto/issue-7


[#7] tokenizer:t-1-1. The tokenizer with the idiom special tokens is …

config.yaml CHANGED
@@ -1,3 +1,4 @@
+# for training an idiomifier
 idiomifier:
   ver: m-1-2
   desc: just overfitting the model, but on the entire PIE dataset.
@@ -21,3 +22,9 @@ literal2idiomatic:
   seed: 104
   boi_token: <idiom>
   eoi_token: </idiom>
+tokenizer:
+  ver: t-1-1
+  description: A pretrained BartTokenizer. The idiom special tokens are pre-added.
+  bart: facebook/bart-base
+  boi_token: <idiom>
+  eoi_token: </idiom>
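The new tokenizer block sits alongside the other entries in config.yaml, so scripts can read it through the existing fetch_config helper. A minimal sketch of consuming it (using only what this diff defines):

from idiomify.fetchers import fetch_config

# fetch_config() returns the whole YAML as a dict; index into the tokenizer block
config = fetch_config()['tokenizer']
print(config['ver'])   # t-1-1
print(config['bart'])  # facebook/bart-base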
explore/explore_fetch_tokenizer.py ADDED
@@ -0,0 +1,28 @@
+from idiomify.fetchers import fetch_tokenizer
+
+
+def main():
+    tokenizer = fetch_tokenizer("t-1-1")
+    print(tokenizer.bos_token)
+    print(tokenizer.cls_token)
+    print(tokenizer.eos_token)
+    print(tokenizer.sep_token)
+    print(tokenizer.mask_token)
+    print(tokenizer.pad_token)
+    print(tokenizer.unk_token)
+    print(tokenizer.additional_special_tokens)  # the idiom special tokens should show up here
+
+
+"""
+<s>
+<s>
+</s>
+</s>
+<mask>
+<pad>
+<unk>
+['<idiom>', '</idiom>']
+"""
+
+if __name__ == '__main__':
+    main()
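The printout above only confirms that the special tokens are registered. To see what that buys, the sketch below rebuilds an equivalent tokenizer locally (bypassing the W&B artifact) and checks that the idiom markers survive tokenization whole instead of being split into subwords; the example sentence is illustrative only:

from transformers import BartTokenizer

# reproduce the t-1-1 tokenizer locally rather than fetching the artifact
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
tokenizer.add_special_tokens({"additional_special_tokens": ["<idiom>", "</idiom>"]})

tokens = tokenizer.tokenize("He <idiom> kicked the bucket </idiom> yesterday.")
# '<idiom>' and '</idiom>' each appear as a single token in the output
print(tokens)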
idiomify/fetchers.py CHANGED
@@ -2,11 +2,11 @@ import yaml
 import wandb
 from os import path
 import pandas as pd
-from typing import Tuple, List
+from typing import Tuple
 from wandb.sdk.wandb_run import Run
-from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, idiomifier_dir
+from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, idiomifier_dir, tokenizer_dir
 from idiomify.urls import PIE_URL
-from transformers import AutoModelForSeq2SeqLM, AutoConfig
+from transformers import AutoModelForSeq2SeqLM, AutoConfig, BartTokenizer
 from idiomify.models import Idiomifier


@@ -64,6 +64,16 @@ def fetch_idiomifier(ver: str, run: Run = None) -> Idiomifier:
     return model


+def fetch_tokenizer(ver: str, run: Run = None) -> BartTokenizer:
+    if run:
+        artifact = run.use_artifact(f"tokenizer:{ver}", type="other")
+    else:
+        artifact = wandb.Api().artifact(f"eubinecto/idiomify/tokenizer:{ver}", type="other")
+    artifact_dir = artifact.download(root=tokenizer_dir(ver))
+    tokenizer = BartTokenizer.from_pretrained(artifact_dir)
+    return tokenizer
+
+
 def fetch_config() -> dict:
     with open(str(CONFIG_YAML), 'r', encoding="utf-8") as fh:
         return yaml.safe_load(fh)
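One caveat worth noting: fetch_tokenizer returns a vocabulary two tokens larger than stock facebook/bart-base, so any model that consumes its ids must have its embedding matrix resized to match. A sketch of the intended pairing (hypothetical usage, not part of this commit):

from transformers import AutoModelForSeq2SeqLM
from idiomify.fetchers import fetch_tokenizer

tokenizer = fetch_tokenizer("t-1-1")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")
# grow the embedding table to cover <idiom> and </idiom>
model.resize_token_embeddings(len(tokenizer))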
idiomify/paths.py CHANGED
@@ -15,3 +15,8 @@ def literal2idiomatic(ver: str) -> Path:
 
 def idiomifier_dir(ver: str) -> Path:
     return ARTIFACTS_DIR / f"idiomifier_{ver}"
+
+
+def tokenizer_dir(ver: str) -> Path:
+    return ARTIFACTS_DIR / f"tokenizer_{ver}"
+
main_upload_tokenizer.py ADDED
@@ -0,0 +1,29 @@
+import wandb
+import shutil
+from transformers import BartTokenizer
+from idiomify.fetchers import fetch_config
+from idiomify.paths import ROOT_DIR
+
+
+def main():
+    config = fetch_config()['tokenizer']
+    tokenizer = BartTokenizer.from_pretrained(config['bart'])
+    tokenizer.add_special_tokens({
+        "additional_special_tokens": ["<idiom>", "</idiom>"],  # beginning and end of an idiom
+    })
+
+    with wandb.init(entity="eubinecto", project="idiomify") as run:
+        # the local directory to save the tokenizer files to
+        tok_dir = ROOT_DIR / "tokenizer"
+        tokenizer.save_pretrained(tok_dir)
+        artifact = wandb.Artifact(name="tokenizer", type="other", description=config['description'],
+                                  metadata=config)
+        artifact.add_dir(tok_dir)
+        # log the artifact to the run
+        run.log_artifact(artifact, aliases=["latest", config['ver']])
+        # don't forget to remove the local copy afterwards
+        shutil.rmtree(tok_dir)
+
+
+if __name__ == '__main__':
+    main()
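Once the script has run, the upload can be sanity-checked by pulling the artifact straight back with the new fetcher. A quick round-trip sketch, assuming W&B credentials are configured:

from idiomify.fetchers import fetch_tokenizer

tokenizer = fetch_tokenizer("t-1-1")
assert tokenizer.additional_special_tokens == ['<idiom>', '</idiom>']
# each marker should map to a single id, confirming it was registered as one token
assert len(tokenizer.encode("<idiom>", add_special_tokens=False)) == 1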