Merge pull request #8 from eubinecto/issue-7
[#7] tokenizer:t-1-1. The tokenizer with the idiom special tokens is …
- config.yaml +7 -0
- explore/explore_fetch_tokenizer.py +28 -0
- idiomify/fetchers.py +13 -3
- idiomify/paths.py +5 -0
- main_upload_tokenizer.py +29 -0
config.yaml
CHANGED
@@ -1,3 +1,4 @@
+# for training an idiomifier
 idiomifier:
   ver: m-1-2
   desc: just overfitting the model, but on the entire PIE dataset.
@@ -21,3 +22,9 @@ literal2idiomatic:
   seed: 104
   boi_token: <idiom>
   eoi_token: </idiom>
+tokenizer:
+  ver: t-1-1
+  description: A pretrained BartTokenizer. The idiom special tokens are pre-added.
+  bart: facebook/bart-base
+  boi_token: <idiom>
+  eoi_token: </idiom>
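For reference, this new tokenizer block is read through fetch_config() (see idiomify/fetchers.py below); a minimal sketch of how it is consumed:

    from idiomify.fetchers import fetch_config

    config = fetch_config()['tokenizer']
    print(config['ver'])   # t-1-1
    print(config['bart'])  # facebook/bart-base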
explore/explore_fetch_tokenizer.py
ADDED
@@ -0,0 +1,28 @@
+from idiomify.fetchers import fetch_tokenizer
+
+
+def main():
+    tokenizer = fetch_tokenizer("t-1-1")
+    print(tokenizer.bos_token)
+    print(tokenizer.cls_token)
+    print(tokenizer.eos_token)
+    print(tokenizer.sep_token)
+    print(tokenizer.mask_token)
+    print(tokenizer.pad_token)
+    print(tokenizer.unk_token)
+    print(tokenizer.additional_special_tokens)  # this should have been added
+
+
+"""
+<s>
+<s>
+</s>
+</s>
+<mask>
+<pad>
+<unk>
+['<idiom>', '</idiom>']
+"""
+
+if __name__ == '__main__':
+    main()
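A further check worth running (not part of this PR, just a sketch): confirm that each idiom token maps to a single vocabulary id rather than being split into sub-words:

    ids = tokenizer.convert_tokens_to_ids(["<idiom>", "</idiom>"])
    print(ids)  # two fresh ids appended past the original vocab, e.g. [50265, 50266]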
idiomify/fetchers.py
CHANGED
@@ -2,11 +2,11 @@ import yaml
 import wandb
 from os import path
 import pandas as pd
-from typing import Tuple
+from typing import Tuple
 from wandb.sdk.wandb_run import Run
-from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, idiomifier_dir
+from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, idiomifier_dir, tokenizer_dir
 from idiomify.urls import PIE_URL
-from transformers import AutoModelForSeq2SeqLM, AutoConfig
+from transformers import AutoModelForSeq2SeqLM, AutoConfig, BartTokenizer
 from idiomify.models import Idiomifier
 
 
@@ -64,6 +64,16 @@ def fetch_idiomifier(ver: str, run: Run = None) -> Idiomifier:
     return model
 
 
+def fetch_tokenizer(ver: str, run: Run = None) -> BartTokenizer:
+    if run:
+        artifact = run.use_artifact(f"tokenizer:{ver}", type="other")
+    else:
+        artifact = wandb.Api().artifact(f"eubinecto/idiomify/tokenizer:{ver}", type="other")
+    artifact_dir = artifact.download(root=tokenizer_dir(ver))
+    tokenizer = BartTokenizer.from_pretrained(artifact_dir)
+    return tokenizer
+
+
 def fetch_config() -> dict:
     with open(str(CONFIG_YAML), 'r', encoding="utf-8") as fh:
         return yaml.safe_load(fh)
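fetch_tokenizer mirrors the other fetchers: pass a Run to resolve the artifact inside an active W&B run (so it is recorded as an input to the run), or call it standalone to go through the public API. A minimal usage sketch, assuming tokenizer:t-1-1 has been uploaded:

    import wandb
    from idiomify.fetchers import fetch_tokenizer

    # standalone: resolves eubinecto/idiomify/tokenizer:t-1-1 via wandb.Api()
    tokenizer = fetch_tokenizer("t-1-1")

    # within a run: the artifact is logged as a dependency of the run
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        tokenizer = fetch_tokenizer("t-1-1", run)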
idiomify/paths.py
CHANGED
@@ -15,3 +15,8 @@ def literal2idiomatic(ver: str) -> Path:
 
 def idiomifier_dir(ver: str) -> Path:
     return ARTIFACTS_DIR / f"idiomifier_{ver}"
+
+
+def tokenizer_dir(ver: str) -> Path:
+    return ARTIFACTS_DIR / f"tokenizer_{ver}"
+
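The new helper just mirrors idiomifier_dir, giving fetch_tokenizer a per-version download root:

    from idiomify.paths import tokenizer_dir
    print(tokenizer_dir("t-1-1"))  # <ARTIFACTS_DIR>/tokenizer_t-1-1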
main_upload_tokenizer.py
ADDED
@@ -0,0 +1,29 @@
+import wandb
+import shutil
+from transformers import BartTokenizer
+from idiomify.fetchers import fetch_config
+from idiomify.paths import ROOT_DIR
+
+
+def main():
+    config = fetch_config()['tokenizer']
+    tokenizer = BartTokenizer.from_pretrained(config['bart'])
+    tokenizer.add_special_tokens({
+        "additional_special_tokens": ["<idiom>", "</idiom>"],  # beginning and end of an idiom
+    })
+
+    with wandb.init(entity="eubinecto", project="idiomify") as run:
+        # the local directory to save the tokenizer files to
+        tok_dir = ROOT_DIR / "tokenizer"
+        tokenizer.save_pretrained(tok_dir)
+        artifact = wandb.Artifact(name="tokenizer", type="other", description=config['description'],
+                                  metadata=config)
+        artifact.add_dir(tok_dir)
+        # then, we just log them here.
+        run.log_artifact(artifact, aliases=["latest", config['ver']])
+        # don't forget to remove the local copy
+        shutil.rmtree(tok_dir)
+
+
+if __name__ == '__main__':
+    main()
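One caveat for downstream training (an assumption about how this tokenizer will be consumed, not something this PR does): after add_special_tokens, the BART embedding matrix no longer covers the two new ids, so the model has to be resized before fine-tuning; roughly:

    from transformers import AutoModelForSeq2SeqLM

    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")
    model.resize_token_embeddings(len(tokenizer))  # make room for <idiom> and </idiom>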