Commit 1bf3d62
eubinecto committed
1 Parent(s): cfa482d
first mvp done, removing wandb
- config.yaml +46 -0
- explore/explore_bert_base_multilingual_tokenizer.py +44 -0
- explore/explore_bert_base_tokenizer.py +45 -0
- explore/explore_fetch_idiom2def.py +15 -0
- explore/explore_fetch_idioms.py +9 -0
- explore/explore_fetch_wisdom2def.py +0 -15
- idiomify/datamodules.py +75 -0
- idiomify/fetchers.py +24 -9
- idiomify/models.py +269 -4
- idiomify/paths.py +7 -2
- idiomify/tensors.py +55 -0
- main_train.py +67 -0
- requirements.txt +0 -68
- wandb/latest-run +0 -1
- wandb/run-20220120_131057-39a70no5/files/conda-environment.yaml +0 -82
- wandb/run-20220120_131057-39a70no5/files/config.yaml +0 -21
- wandb/run-20220120_131057-39a70no5/files/diff.patch +0 -77
- wandb/run-20220120_131057-39a70no5/files/requirements.txt +0 -62
- wandb/run-20220120_131057-39a70no5/files/wandb-metadata.json +0 -31
- wandb/run-20220120_131057-39a70no5/files/wandb-summary.json +0 -1
- wandb/run-20220120_131057-39a70no5/run-39a70no5.wandb +0 -0
- wandb/run-20220120_131124-isjyx9fs/files/conda-environment.yaml +0 -82
- wandb/run-20220120_131124-isjyx9fs/files/config.yaml +0 -21
- wandb/run-20220120_131124-isjyx9fs/files/diff.patch +0 -77
- wandb/run-20220120_131124-isjyx9fs/files/requirements.txt +0 -62
- wandb/run-20220120_131124-isjyx9fs/files/wandb-metadata.json +0 -31
- wandb/run-20220120_131124-isjyx9fs/files/wandb-summary.json +0 -1
- wandb/run-20220120_131124-isjyx9fs/run-isjyx9fs.wandb +0 -0
config.yaml
CHANGED
@@ -0,0 +1,46 @@
+alpha:
+  eng2eng:
+    bert: bert-base-uncased
+    desc:
+    seed: 410
+    idioms_ver: c
+    idiom2def_ver: c
+    k: 11
+    lr: 0.00001
+    max_epochs: 200
+    batch_size: 64
+    shuffle: true
+  kor2eng:
+    bert: bert-base-multilingual-uncased
+    desc:
+    seed: 410
+    idioms_ver: c
+    idiom2def_ver: d
+    k: 11
+    lr: 0.00001
+    max_epochs: 200
+    batch_size: 64
+    num_workers: 4
+    shuffle: true
+gamma:
+  eng2eng:
+    bert: bert-base-uncased
+    seed: 410
+    idioms_ver: c
+    idiom2def_ver: c
+    k: 11
+    lr: 0.00001
+    max_epochs: 200
+    batch_size: 64
+    shuffle: true
+  kor2eng:
+    bert: bert-base-multilingual-uncased
+    seed: 410
+    idioms_ver: c
+    idiom2def_ver: d
+    k: 11
+    lr: 0.00001
+    max_epochs: 200
+    batch_size: 64
+    num_workers: 4
+    shuffle: true
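For reference, main_train.py below reads this file through fetch_config() and indexes it as model -> version -> hyperparameters. A minimal sketch of loading it by hand, assuming config.yaml sits in the working directory:

import yaml

# a minimal sketch, assuming config.yaml is in the current working directory
with open("config.yaml", "r", encoding="utf-8") as fh:
    config = yaml.safe_load(fh)

# model -> version -> hyperparameters, the same way main_train.py indexes it
print(config["alpha"]["eng2eng"]["bert"])  # bert-base-uncased
print(config["gamma"]["kor2eng"]["lr"])    # 1e-05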
explore/explore_bert_base_multilingual_tokenizer.py
ADDED
@@ -0,0 +1,44 @@
+from idiomify.fetchers import fetch_idiom2def
+from transformers import AutoTokenizer, BertTokenizer, BertTokenizerFast
+
+
+def main():
+    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
+    idiom2def = fetch_idiom2def("d")  # eng2kor
+
+    for idiom, definition in idiom2def:
+        print(tokenizer.decode(tokenizer(idiom)['input_ids']),
+              tokenizer.decode(tokenizer(definition)['input_ids']))
+
+    # right, the tokenizer knows Korean, which is great.
+    """
+    /opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo/bin/python /Users/eubinecto/Desktop/Projects/Toy/idiomify-demo/explore/explore_mbert_tokenizer.py
+    [CLS] beat around the bush [SEP] [CLS] 불쾌하거나 민감한 주제에 대해 직접적으로 이야기하는 것을 피하기 위해 모호하거나 완곡하게 말한다. [SEP]
+    [CLS] beat around the bush [SEP] [CLS] 단어나 태도가 우회적이다 [SEP]
+    [CLS] beat around the bush [SEP] [CLS] 우물쭈물하다 [SEP]
+    [CLS] beat around the bush [SEP] [CLS] 우회적으로 접근하다 [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] 칭찬으로 가장한 모욕적이거나 부정적인 논평 [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] 의도하지 않거나 애매한 칭찬 [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] 누군가를 칭찬하는 것 같지만 비판으로도 이해될 수 있는 말 [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] 남을 기쁘게 하는 말 같지만 모욕이 될 수도 있는 말 [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] 감탄하는 듯 하면서도 모욕으로 이해될 수 있는 말 [SEP]
+    [CLS] steer clear of [SEP] [CLS] 누군가나 뭔가를 피하다 [SEP]
+    [CLS] steer clear of [SEP] [CLS] 떨어져 지내다 [SEP]
+    [CLS] steer clear of [SEP] [CLS] 피하거나 멀리하도록 주의하다 [SEP]
+    [CLS] steer clear of [SEP] [CLS] 불쾌하거나 위험하거나 문제를 일으킬 것 같은 사람이나 물건을 피하다 [SEP]
+    [CLS] steer clear of [SEP] [CLS] 일부러 피하다 [SEP]
+    [CLS] dish it out [SEP] [CLS] 가혹한 생각, 비판, 또는 모욕의 목소리를 내는 것. [SEP]
+    [CLS] dish it out [SEP] [CLS] 누군가 또는 무언가에 대해 험담하는 것 [SEP]
+    [CLS] dish it out [SEP] [CLS] 어떤 것을 주거나 정보나 당신의 의견과 같은 것을 말하는 것 [SEP]
+    [CLS] dish it out [SEP] [CLS] 다른 사람을 쉽게 비판하지만 다른 사람이 자신을 비판할때는 좋아하지 않음 [SEP]
+    [CLS] dish it out [SEP] [CLS] 다른 사람을 비판하다 [SEP]
+    [CLS] make headway [SEP] [CLS] 성취하고자 하는 어떤 것에 진척이 생기다 [SEP]
+    [CLS] make headway [SEP] [CLS] 특히 이것이 느리거나 어려울 때, 진전을 이루다. [SEP]
+    [CLS] make headway [SEP] [CLS] 전진하다 [SEP]
+    [CLS] make headway [SEP] [CLS] 앞으로 나아가거나 진전을 이루다 [SEP]
+    [CLS] make headway [SEP] [CLS] 성공하기 시작하다 [SEP]
+    """
+
+
+if __name__ == '__main__':
+    main()
explore/explore_bert_base_tokenizer.py
ADDED
@@ -0,0 +1,45 @@
+from idiomify.fetchers import fetch_idiom2def
+from transformers import AutoTokenizer, BertTokenizer, BertTokenizerFast
+
+
+def main():
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    idiom2def = fetch_idiom2def("c")  # eng2eng
+    for idiom, definition in idiom2def:
+        print(tokenizer.decode(tokenizer(idiom)['input_ids']),
+              tokenizer.decode(tokenizer(definition)['input_ids']))
+
+    """
+    /opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo/bin/python /Users/eubinecto/Desktop/Projects/Toy/idiomify-demo/explore/explore_bert_base_tokenizer.py
+    Downloading: 100%|██████████| 226k/226k [00:00<00:00, 298kB/s]
+    Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 8.27kB/s]
+    Downloading: 100%|██████████| 455k/455k [00:01<00:00, 449kB/s]
+    [CLS] beat around the bush [SEP] [CLS] to speak vaguely or euphemistically so as to avoid talkingdirectly about an unpleasant or sensitive topic [SEP]
+    [CLS] beat around the bush [SEP] [CLS] indirection in word or deed [SEP]
+    [CLS] beat around the bush [SEP] [CLS] to shilly - shally [SEP]
+    [CLS] beat around the bush [SEP] [CLS] to approach something in a roundabout way [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] an insulting or negative comment disguised as praise. [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] an unintended or ambiguous compliment. [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] a remark which seems to be praising someone or something but which could also be understood as criticism [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] a remark that seems to say something pleasant about a person but could also be an insult [SEP]
+    [CLS] backhanded compliment [SEP] [CLS] a remark that seems to express admiration but could also be understood as an insult [SEP]
+    [CLS] steer clear of [SEP] [CLS] to avoid someone or something. [SEP]
+    [CLS] steer clear of [SEP] [CLS] stay away from [SEP]
+    [CLS] steer clear of [SEP] [CLS] take care to avoid or keep away from [SEP]
+    [CLS] steer clear of [SEP] [CLS] to avoid someone or something that seems unpleasant, dangerous, or likely to cause problems [SEP]
+    [CLS] steer clear of [SEP] [CLS] deliberately avoid someone [SEP]
+    [CLS] dish it out [SEP] [CLS] to voice harsh thoughts, criticisms, or insults. [SEP]
+    [CLS] dish it out [SEP] [CLS] to gossip about someone or something [SEP]
+    [CLS] dish it out [SEP] [CLS] to give something, or to tell something such as information or your opinions [SEP]
+    [CLS] dish it out [SEP] [CLS] someone easily criticizes other people but does not like it when other people criticize him or her [SEP]
+    [CLS] dish it out [SEP] [CLS] to criticize other people [SEP]
+    [CLS] make headway [SEP] [CLS] make progress with something that you are trying to achieve. [SEP]
+    [CLS] make headway [SEP] [CLS] make progress, especially when this is slow or difficult [SEP]
+    [CLS] make headway [SEP] [CLS] to advance. [SEP]
+    [CLS] make headway [SEP] [CLS] to move forward or make progress [SEP]
+    [CLS] make headway [SEP] [CLS] to begin to succeed [SEP]
+    """
+
+
+if __name__ == '__main__':
+    main()
explore/explore_fetch_idiom2def.py
ADDED
@@ -0,0 +1,15 @@
+from idiomify.fetchers import fetch_idiom2def
+
+
+def main():
+    idiom2def = fetch_idiom2def("c")
+    for idiom, definition in idiom2def:
+        print(idiom, definition)
+
+    idiom2def = fetch_idiom2def("d")
+    for idiom, definition in idiom2def:
+        print(idiom, definition)
+
+
+if __name__ == '__main__':
+    main()
explore/explore_fetch_idioms.py
ADDED
@@ -0,0 +1,9 @@
+from idiomify.fetchers import fetch_idioms
+
+
+def main():
+    print(fetch_idioms("c"))
+
+
+if __name__ == '__main__':
+    main()
explore/explore_fetch_wisdom2def.py
DELETED
@@ -1,15 +0,0 @@
-from idiomify.fetchers import fetch_wisdom2def
-
-
-def main():
-    df = fetch_wisdom2def("c")
-    for idx, row in df.iterrows():
-        print(row[0], row[1])
-
-    df = fetch_wisdom2def("d")
-    for idx, row in df.iterrows():
-        print(row[0], row[1])
-
-
-if __name__ == '__main__':
-    main()
idiomify/datamodules.py
ADDED
@@ -0,0 +1,75 @@
+import torch
+from typing import Tuple, Optional, List
+from torch.utils.data import Dataset, DataLoader
+from pytorch_lightning import LightningDataModule
+from transformers import BertTokenizer
+from idiomify.fetchers import fetch_idiom2def
+from idiomify import tensors as T
+
+
+class IdiomifyDataset(Dataset):
+    def __init__(self,
+                 X: torch.Tensor,
+                 y: torch.Tensor):
+        self.X = X
+        self.y = y
+
+    def __len__(self) -> int:
+        """
+        Returns the size of the dataset.
+        """
+        return self.y.shape[0]
+
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.LongTensor]:
+        """
+        Returns the features & the label at the given index.
+        """
+        return self.X[idx], self.y[idx]
+
+
+class IdiomifyDataModule(LightningDataModule):
+
+    # boilerplate - just ignore these
+    def test_dataloader(self):
+        pass
+
+    def val_dataloader(self):
+        pass
+
+    def predict_dataloader(self):
+        pass
+
+    def __init__(self,
+                 config: dict,
+                 tokenizer: BertTokenizer,
+                 idioms: List[str]):
+        super().__init__()
+        self.config = config
+        self.tokenizer = tokenizer
+        self.idioms = idioms
+        # --- to be downloaded & built --- #
+        self.idiom2def: Optional[List[Tuple[str, str]]] = None
+        self.dataset: Optional[IdiomifyDataset] = None
+
+    def prepare_data(self):
+        """
+        prepare: download all the data needed for this from wandb to local.
+        """
+        self.idiom2def = fetch_idiom2def(self.config['idiom2def_ver'])
+
+    def setup(self, stage: Optional[str] = None):
+        """
+        set up the builders.
+        """
+        # --- build the dataset --- #
+        X = T.inputs([definition for _, definition in self.idiom2def], self.tokenizer, self.config['k'])
+        y = T.targets(self.idioms)
+        self.dataset = IdiomifyDataset(X, y)
+
+    def train_dataloader(self) -> DataLoader:
+        return DataLoader(self.dataset, batch_size=self.config['batch_size'],
+                          shuffle=self.config['shuffle'], num_workers=self.config['num_workers'])
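A minimal sketch of driving the datamodule by hand, outside a Trainer. The config keys are the eng2eng entries from config.yaml plus num_workers, and it assumes you are logged into wandb, since the fetchers download artifacts from eubinecto/idiomify-demo:

from transformers import BertTokenizer
from idiomify.fetchers import fetch_idioms
from idiomify.datamodules import IdiomifyDataModule

# a sketch, assuming the eng2eng settings from config.yaml
config = {"idiom2def_ver": "c", "k": 11, "batch_size": 64, "shuffle": True, "num_workers": 4}
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
idioms = fetch_idioms("c")
datamodule = IdiomifyDataModule(config, tokenizer, idioms)
datamodule.prepare_data()  # downloads idiom2def from wandb
datamodule.setup()         # builds X (N, 5, L) and y (N,)
for X, y in datamodule.train_dataloader():
    print(X.shape, y.shape)
    break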
idiomify/fetchers.py
CHANGED
@@ -1,18 +1,34 @@
+from typing import Tuple, List
+import yaml
 import wandb
 import pandas as pd
-from transformers import BertTokenizer
 from idiomify.models import Alpha, Gamma
-from idiomify.paths import
+from idiomify.paths import idiom2def_dir, CONFIG_YAML, idioms_dir
 
 
 # dataset
-def
-    artifact = wandb.Api().artifact(f"eubinecto/idiomify-demo/
-    artifact_path =
+def fetch_idiom2def(ver: str) -> List[Tuple[str, str]]:
+    artifact = wandb.Api().artifact(f"eubinecto/idiomify-demo/idiom2def:{ver}", type="dataset")
+    artifact_path = idiom2def_dir(ver)
     artifact.download(root=str(artifact_path))
     tsv_path = artifact_path / "all.tsv"
     df = pd.read_csv(str(tsv_path), delimiter="\t")
-    return
+    return [
+        (row[0], row[1])
+        for _, row in df.iterrows()
+    ]
+
+
+def fetch_idioms(ver: str) -> List[str]:
+    artifact = wandb.Api().artifact(f"eubinecto/idiomify-demo/idioms:{ver}", type="dataset")
+    artifact_path = idioms_dir(ver)
+    artifact.download(root=str(artifact_path))
+    tsv_path = artifact_path / "all.tsv"
+    df = pd.read_csv(str(tsv_path), delimiter="\t")
+    return [
+        row[0]
+        for _, row in df.iterrows()
+    ]
 
 
 # models
@@ -25,6 +41,5 @@ def fetch_gamma(ver: str) -> Gamma:
 
 
 def fetch_config() -> dict:
-
-
-
+    with open(str(CONFIG_YAML), 'r', encoding="utf-8") as fh:
+        return yaml.safe_load(fh)
idiomify/models.py
CHANGED
@@ -1,9 +1,274 @@
-class
-
-
-
+"""
+The reverse dictionary models below are based off of: https://github.com/yhcc/BertForRD/blob/master/mono/model/bert.py
+"""
+from typing import Tuple, List, Optional
+import torch
+import pytorch_lightning as pl
+from transformers.models.bert.modeling_bert import BertForMaskedLM
+from torch.nn import functional as F
+
+
+class RD(pl.LightningModule):
+    """
+    @eubinecto
+    The superclass of all the reverse-dictionaries. This class houses any methods that are required by
+    whatever reverse-dictionaries we define.
+    """
+
+    # --- boilerplate; the loaders are defined in datamodules, so we don't define them here.
+    # passing them to avoid warnings --- #
+    def train_dataloader(self):
+        pass
+
+    def test_dataloader(self):
+        pass
+
+    def val_dataloader(self):
+        pass
+
+    def predict_dataloader(self):
+        pass
+
+    def __init__(self, mlm: BertForMaskedLM, wisdom2subwords: torch.Tensor, k: int, lr: float):  # noqa
+        """
+        :param mlm: a bert model for masked language modeling
+        :param wisdom2subwords: (|W|, K)
+        """
+        super().__init__()
+        # -- hyper params --- #
+        # should be saved to self.hparams
+        # https://github.com/PyTorchLightning/pytorch-lightning/issues/4390#issue-730493746
+        self.save_hyperparameters(ignore=["mlm", "wisdom2subwords"])
+        # -- the only neural network we need -- #
+        self.mlm = mlm
+        # --- to be used for getting H_k --- #
+        self.wisdom_mask: Optional[torch.Tensor] = None  # (N, L)
+        # --- to be used for getting H_desc --- #
+        self.desc_mask: Optional[torch.Tensor] = None  # (N, L)
+        # -- constant tensors -- #
+        self.register_buffer("wisdom2subwords", wisdom2subwords)  # (|W|, K)
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        :param X: (N, 5, L);
+        (num samples, 0=input_ids/1=token_type_ids/2=attention_mask/3=wisdom_mask/4=desc_mask, the maximum length)
+        :return: H_all (N, L, H); (num samples, the maximum length, the hidden size)
+        """
+        input_ids = X[:, 0]  # (N, 5, L) -> (N, L)
+        token_type_ids = X[:, 1]  # (N, 5, L) -> (N, L)
+        attention_mask = X[:, 2]  # (N, 5, L) -> (N, L)
+        self.wisdom_mask = X[:, 3]  # (N, 5, L) -> (N, L)
+        self.desc_mask = X[:, 4]  # (N, 5, L) -> (N, L)
+        H_all = self.mlm.bert.forward(input_ids, attention_mask, token_type_ids)[0]  # -> (N, L, H)
+        return H_all
+
+    def H_k(self, H_all: torch.Tensor) -> torch.Tensor:
+        """
+        You may want to override this. (e.g. RDGamma - the k's could be anywhere)
+        :param H_all: (N, L, H)
+        :return H_k: (N, K, H)
+        """
+        N, _, H = H_all.size()
+        # refer to: wisdomify/examples/explore_masked_select.py
+        wisdom_mask = self.wisdom_mask.unsqueeze(2).expand(H_all.shape)  # (N, L) -> (N, L, 1) -> (N, L, H)
+        H_k = torch.masked_select(H_all, wisdom_mask.bool())  # (N, L, H), (N, L, H) -> (N * K * H)
+        H_k = H_k.reshape(N, self.hparams['k'], H)  # (N * K * H) -> (N, K, H)
+        return H_k
+
+    def H_desc(self, H_all: torch.Tensor) -> torch.Tensor:
+        """
+        :param H_all: (N, L, H)
+        :return H_desc: (N, L - (K + 3), H)
+        """
+        N, L, H = H_all.size()
+        desc_mask = self.desc_mask.unsqueeze(2).expand(H_all.shape)
+        H_desc = torch.masked_select(H_all, desc_mask.bool())  # (N, L, H), (N, L, H) -> (N * (L - (K + 3)) * H)
+        H_desc = H_desc.reshape(N, L - (self.hparams['k'] + 3), H)  # -> (N, L - (K + 3), H)
+        return H_desc
+
+    def S_wisdom_literal(self, H_k: torch.Tensor) -> torch.Tensor:
+        """
+        To be used for both RDAlpha & RDBeta.
+        :param H_k: (N, K, H)
+        :return: S_wisdom_literal (N, |W|)
+        """
+        S_vocab = self.mlm.cls(H_k)  # bmm; (N, K, H) * (H, |V|) -> (N, K, |V|)
+        indices = self.wisdom2subwords.T.repeat(S_vocab.shape[0], 1, 1)  # (|W|, K) -> (N, K, |W|)
+        S_wisdom_literal = S_vocab.gather(dim=-1, index=indices)  # (N, K, |V|) -> (N, K, |W|)
+        S_wisdom_literal = S_wisdom_literal.sum(dim=1)  # (N, K, |W|) -> (N, |W|)
+        return S_wisdom_literal
+
+    def S_wisdom(self, H_all: torch.Tensor) -> torch.Tensor:
+        """
+        :param H_all: (N, L, H)
+        :return S_wisdom: (N, |W|)
+        """
+        raise NotImplementedError("An RD class must implement S_wisdom")
+
+    def P_wisdom(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        :param X: (N, 5, L)
+        :return P_wisdom: (N, |W|), normalized over dim 1.
+        """
+        H_all = self.forward(X)  # (N, 5, L) -> (N, L, H)
+        S_wisdom = self.S_wisdom(H_all)  # (N, L, H) -> (N, |W|)
+        P_wisdom = F.softmax(S_wisdom, dim=1)  # (N, |W|) -> (N, |W|)
+        return P_wisdom
+
+    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> dict:
+        X, y = batch
+        H_all = self.forward(X)  # (N, 5, L) -> (N, L, H)
+        S_wisdom = self.S_wisdom(H_all)  # (N, L, H) -> (N, |W|)
+        loss = F.cross_entropy(S_wisdom, y)  # (N, |W|), (N,) -> (N,)
+        loss = loss.sum()  # (N,) -> (1,)
+        # so that the metrics accumulate over the course of this epoch
+        # why dict? - just a boilerplate
+        return {
+            # you cannot change the keyword for the loss
+            "loss": loss,
+        }
+
+    def on_train_batch_end(self, outputs: dict, *args, **kwargs) -> None:
+        # watch the loss for this batch
+        self.log("Train/Loss", outputs['loss'])
+
+    def training_epoch_end(self, outputs: List[dict]) -> None:
+        # to see an average performance over the batches in this specific epoch
+        avg_loss = torch.stack([output['loss'] for output in outputs]).mean()
+        self.log("Train/Average Loss", avg_loss)
+
+    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> dict:
+        return self.training_step(batch, batch_idx)
+
+    def on_validation_batch_end(self, outputs: dict, *args, **kwargs) -> None:
+        self.log("Validation/Loss", outputs['loss'])
+
+    def validation_epoch_end(self, outputs: List[dict]) -> None:
+        # to see an average performance over the batches in this specific epoch
+        avg_loss = torch.stack([output['loss'] for output in outputs]).mean()
+        self.log("Validation/Average Loss", avg_loss)
+
+    def configure_optimizers(self) -> torch.optim.Optimizer:
+        """
+        Instantiates and returns the optimizer to be used for this model
+        e.g. torch.optim.Adam
+        """
+        # The authors used Adam, so we might as well use it.
+        return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
+
+    @classmethod
+    def name(cls) -> str:
+        return cls.__name__.lower()
+
+
+class Alpha(RD):
+    """
+    @eubinecto
+    The first prototype.
+    S_wisdom = S_wisdom_literal
+    trained on: wisdom2def only.
+    """
+
+    def S_wisdom(self, H_all: torch.Tensor) -> torch.Tensor:
+        H_k = self.H_k(H_all)  # (N, L, H) -> (N, K, H)
+        S_wisdom = self.S_wisdom_literal(H_k)  # (N, K, H) -> (N, |W|)
+        return S_wisdom
+
+
+class BiLSTMPooler(torch.nn.Module):
+    def __init__(self, hidden_size: int):
+        super().__init__()
+        self.lstm = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size // 2, batch_first=True,
+                                  num_layers=1, bidirectional=True)
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        hiddens, _ = self.lstm(X)
+        return hiddens[:, -1]
+
+
+class Gamma(RD):
+    """
+    @eubinecto
+    S_wisdom = S_wisdom_literal + S_wisdom_figurative
+    but the way we get S_wisdom_figurative is much simplified, compared with RDBeta.
+    """
+
+    def __init__(self, mlm: BertForMaskedLM, wisdom2subwords: torch.Tensor, k: int, lr: float):
+        super().__init__(mlm, wisdom2subwords, k, lr)
+        # a pooler is a multilayer perceptron that pools wisdom_embeddings from wisdom2subwords_embeddings
+        self.pooler = BiLSTMPooler(self.mlm.config.hidden_size)
+        # --- to be used to compute attentions --- #
+        self.attention_mask: Optional[torch.Tensor] = None
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        :param X: (N, 5, L);
+        (num samples, 0=input_ids/1=token_type_ids/2=attention_mask/3=wisdom_mask/4=desc_mask, the maximum length)
+        :return: H_all (N, L, H)
+        """
+        input_ids = X[:, 0]  # (N, 5, L) -> (N, L)
+        token_type_ids = X[:, 1]  # (N, 5, L) -> (N, L)
+        self.attention_mask = X[:, 2]  # (N, 5, L) -> (N, L)
+        self.wisdom_mask = X[:, 3]  # (N, 5, L) -> (N, L)
+        self.desc_mask = X[:, 4]  # (N, 5, L) -> (N, L)
+        H_all = self.mlm.bert.forward(input_ids, self.attention_mask, token_type_ids)[0]  # -> (N, L, H)
+        return H_all
+
+    def H_desc_attention_mask(self, attention_mask: torch.Tensor) -> torch.Tensor:
+        """
+        this is needed to mask the padding tokens
+        :param attention_mask: (N, L)
+        """
+        N, L = attention_mask.size()
+        H_desc_attention_mask = torch.masked_select(attention_mask, self.desc_mask.bool())
+        H_desc_attention_mask = H_desc_attention_mask.reshape(N, L - (self.hparams['k'] + 3))
+        return H_desc_attention_mask
+
+    def S_wisdom(self, H_all: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        S_wisdom_literal = self.S_wisdom_literal(self.H_k(H_all))
+        S_wisdom_figurative = self.S_wisdom_figurative(H_all)
+        S_wisdom = S_wisdom_literal + S_wisdom_figurative
+        return S_wisdom, S_wisdom_literal, S_wisdom_figurative
+
+    def S_wisdom_figurative(self, H_all: torch.Tensor) -> torch.Tensor:
+        # --- draw the embeddings for wisdoms from the embeddings of wisdom2subwords -- #
+        # this is to use as little of the newly initialised weights as possible
+        wisdom2subwords_embeddings = self.mlm.bert \
+            .embeddings.word_embeddings(self.wisdom2subwords)  # (W, K) -> (W, K, H)
+        wisdom_embeddings = self.pooler(wisdom2subwords_embeddings).squeeze()  # (W, K, H) -> (W, H)
+        # --- draw H_wisdom from H_desc with attention --- #
+        H_cls = H_all[:, 0]  # (N, L, H) -> (N, H)
+        H_desc = self.H_desc(H_all)  # (N, L, H) -> (N, D, H)
+        H_desc_attention_mask = self.H_desc_attention_mask(self.attention_mask)  # (N, L) -> (N, D)
+        scores = torch.einsum("...h,...dh->...d", H_cls, H_desc)  # (N, D)
+        # ignore the padding tokens
+        scores = torch.masked_fill(scores, H_desc_attention_mask != 1, float("-inf"))  # (N, D)
+        attentions = torch.softmax(scores, dim=1)  # over D
+        H_wisdom = torch.einsum("...d,...dh->...h", attentions, H_desc)  # -> (N, H)
+        # --- now compare H_wisdom with all the wisdoms --- #
+        S_wisdom_figurative = torch.einsum("...h,wh->...w", H_wisdom, wisdom_embeddings)  # (N, H) * (W, H) -> (N, W)
+        return S_wisdom_figurative
+
+    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> dict:
+        X, y = batch
+        H_all = self.forward(X)  # (N, 5, L) -> (N, L, H)
+        S_wisdom, S_wisdom_literal, S_wisdom_figurative = self.S_wisdom(H_all)  # (N, L, H) -> (N, |W|)
+        loss_all = F.cross_entropy(S_wisdom, y).sum()  # (N, |W|), (N,) -> (1,)
+        loss_literal = F.cross_entropy(S_wisdom_literal, y).sum()  # (N, |W|), (N,) -> (1,)
+        loss_figurative = F.cross_entropy(S_wisdom_figurative, y).sum()  # (N, |W|), (N,) -> (1,)
+        loss = loss_all + loss_literal + loss_figurative  # unweighted multi-task learning
+        return {
+            # you cannot change the keyword for the loss
+            "loss": loss,
+        }
+
+    def P_wisdom(self, X: torch.Tensor) -> torch.Tensor:
+        """
+        :param X: (N, 5, L)
+        :return P_wisdom: (N, |W|), normalized over dim 1.
+        """
+        H_all = self.forward(X)  # (N, 5, L) -> (N, L, H)
+        S_wisdom, _, _ = self.S_wisdom(H_all)  # (N, L, H) -> (N, |W|)
+        P_wisdom = F.softmax(S_wisdom, dim=1)  # (N, |W|) -> (N, |W|)
+        return P_wisdom
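H_k above pulls the K hidden vectors sitting under the [MASK] slots out of H_all with a masked_select-then-reshape trick. A toy-sized sketch of the same idea, with shapes made up purely for illustration:

import torch

# toy shapes: N=2 samples, L=6 tokens, H=4 hidden dims, K=3 masked slots per sample
N, L, H, K = 2, 6, 4, 3
H_all = torch.randn(N, L, H)
# a wisdom_mask marking exactly K positions per row, as tensors.inputs() builds it
wisdom_mask = torch.tensor([[1, 1, 1, 0, 0, 0],
                            [0, 1, 1, 1, 0, 0]])
mask = wisdom_mask.unsqueeze(2).expand(H_all.shape)  # (N, L) -> (N, L, H)
H_k = torch.masked_select(H_all, mask.bool()).reshape(N, K, H)  # flat -> (N, K, H)
print(H_k.shape)  # torch.Size([2, 3, 4])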
idiomify/paths.py
CHANGED
@@ -2,10 +2,15 @@ from pathlib import Path
 
 ROOT_DIR = Path(__file__).resolve().parent.parent
 ARTIFACTS_DIR = ROOT_DIR / "artifacts"
+CONFIG_YAML = ROOT_DIR / "config.yaml"
 
 
-def
-    return ARTIFACTS_DIR / f"
+def idiom2def_dir(ver: str) -> Path:
+    return ARTIFACTS_DIR / f"idiom2def_{ver}"
+
+
+def idioms_dir(ver: str) -> Path:
+    return ARTIFACTS_DIR / f"idioms_{ver}"
 
 
 def alpha_dir(ver: str) -> Path:
idiomify/tensors.py
ADDED
@@ -0,0 +1,55 @@
+"""
+all the functions for building tensors are defined here.
+builders must accept device as one of the parameters.
+"""
+import torch
+from typing import List
+from transformers import BertTokenizer
+
+
+def wisdom2subwords(idioms: List[str], tokenizer: BertTokenizer, k: int) -> torch.Tensor:
+    mask_id = tokenizer.mask_token_id
+    pad_id = tokenizer.pad_token_id
+    # temporarily disable single-token status of the wisdoms
+    wisdoms = [idiom.split(" ") for idiom in idioms]
+    encodings = tokenizer(text=wisdoms,
+                          add_special_tokens=False,
+                          # should set this to True, as we already have the wisdoms split.
+                          is_split_into_words=True,
+                          padding='max_length',
+                          max_length=k,  # set to k
+                          return_tensors="pt")
+    input_ids = encodings['input_ids']
+    input_ids[input_ids == pad_id] = mask_id  # replace them with masks
+    return input_ids
+
+
+def inputs(definitions: List[str], tokenizer: BertTokenizer, k: int) -> torch.Tensor:
+    lefts = [" ".join(["[MASK]"] * k)] * len(definitions)
+    encodings = tokenizer(text=lefts,
+                          text_pair=definitions,
+                          return_tensors="pt",
+                          add_special_tokens=True,
+                          truncation=True,
+                          padding=True,
+                          verbose=True)
+    input_ids: torch.Tensor = encodings['input_ids']
+    cls_id: int = tokenizer.cls_token_id
+    sep_id: int = tokenizer.sep_token_id
+    mask_id: int = tokenizer.mask_token_id
+
+    wisdom_mask = torch.where(input_ids == mask_id, 1, 0)
+    desc_mask = torch.where(((input_ids != cls_id) & (input_ids != sep_id) & (input_ids != mask_id)), 1, 0)
+    return torch.stack([input_ids,
+                        encodings['token_type_ids'],
+                        encodings['attention_mask'],
+                        wisdom_mask,
+                        desc_mask], dim=1)
+
+
+def targets(idioms: List[str]) -> torch.Tensor:
+    return torch.LongTensor([
+        idioms.index(idiom)
+        for idiom in idioms
+    ])
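To see what inputs() actually stacks: each sample becomes a (5, L) tensor of input_ids, token_type_ids, attention_mask, wisdom_mask and desc_mask, where the first segment is k [MASK] slots and the second is the definition. A quick sketch (bert-base-uncased and k=11 are the eng2eng settings from config.yaml):

from transformers import BertTokenizer
from idiomify import tensors as T

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
X = T.inputs(["to avoid someone or something"], tokenizer, 11)
print(X.shape)               # (1, 5, L)
print(X[0, 3].sum().item())  # 11: one 1 per [MASK] slot in the wisdom_mask channel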
main_train.py
CHANGED
@@ -0,0 +1,67 @@
+import os
+import torch.cuda
+import wandb
+import argparse
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger
+from termcolor import colored
+from transformers import BertForMaskedLM, BertTokenizer
+from idiomify.datamodules import IdiomifyDataModule
+from idiomify.fetchers import fetch_config, fetch_idioms
+from idiomify.models import Alpha, Gamma
+from idiomify.paths import ROOT_DIR
+from idiomify import tensors as T
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("entity", type=str)
+    parser.add_argument("--model", type=str, default="alpha")
+    parser.add_argument("--ver", type=str, default="eng2eng")
+    parser.add_argument("--num_workers", type=int, default=os.cpu_count())
+    parser.add_argument("--log_every_n_steps", type=int, default=1)
+    parser.add_argument("--fast_dev_run", action="store_true", default=False)
+    parser.add_argument("--upload", dest='upload', action='store_true', default=False)
+    args = parser.parse_args()
+    config = fetch_config()[args.model][args.ver]
+    config.update(vars(args))
+    if not config['upload']:
+        print(colored("WARNING: YOU CHOSE NOT TO UPLOAD. NOTHING BUT LOGS WILL BE SAVED TO WANDB", color="red"))
+
+    # prepare arguments
+    mlm = BertForMaskedLM.from_pretrained(config['bert'])
+    tokenizer = BertTokenizer.from_pretrained(config['bert'])
+    idioms = fetch_idioms(config['idioms_ver'])
+    wisdom2subwords = T.wisdom2subwords(idioms, tokenizer, config['k'])
+    # choose the model to train
+    if config['model'] == Alpha.name():
+        rd = Alpha(mlm, wisdom2subwords, config['k'], config['lr'])
+    elif config['model'] == Gamma.name():
+        rd = Gamma(mlm, wisdom2subwords, config['k'], config['lr'])
+    else:
+        raise ValueError
+    # prepare datamodule
+    datamodule = IdiomifyDataModule(config, tokenizer, idioms)
+
+    with wandb.init(entity=config['entity'], project="idiomify_demo", config=config) as run:
+        logger = WandbLogger(log_model=False)
+        trainer = pl.Trainer(max_epochs=config['max_epochs'],
+                             fast_dev_run=config['fast_dev_run'],
+                             log_every_n_steps=config['log_every_n_steps'],
+                             gpus=torch.cuda.device_count(),
+                             default_root_dir=str(ROOT_DIR),
+                             logger=logger)
+        # start training
+        trainer.fit(model=rd, datamodule=datamodule)
+        # upload the model to wandb only if the training is properly done #
+        if not config['fast_dev_run'] and trainer.current_epoch == config['max_epochs'] - 1:
+            ckpt_path = ROOT_DIR / "rd.ckpt"
+            trainer.save_checkpoint(str(ckpt_path))
+            artifact = wandb.Artifact(name=config['model'], type="model", metadata=config)
+            artifact.add_file(str(ckpt_path))
+            run.log_artifact(artifact, aliases=["latest", config['ver']])
+            os.remove(str(ckpt_path))  # make sure you remove it after you are done with uploading it
+
+
+if __name__ == '__main__':
+    main()
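Going by the argparse definitions above, a run would presumably be launched as python main_train.py your-wandb-entity --model alpha --ver eng2eng, with --upload added to push the final checkpoint as a wandb artifact and --fast_dev_run for a quick smoke test.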
requirements.txt
DELETED
@@ -1,68 +0,0 @@
-absl-py==1.0.0
-aiohttp==3.8.1
-aiosignal==1.2.0
-async-timeout==4.0.2
-attrs==21.4.0
-cachetools==4.2.4
-certifi==2021.10.8
-charset-normalizer==2.0.10
-click==8.0.3
-configparser==5.2.0
-docker-pycreds==0.4.0
-filelock==3.4.2
-frozenlist==1.3.0
-fsspec==2022.1.0
-future==0.18.2
-gitdb==4.0.9
-GitPython==3.1.26
-google-auth==2.3.3
-google-auth-oauthlib==0.4.6
-grpcio==1.43.0
-huggingface-hub==0.4.0
-idna==3.3
-importlib-metadata==4.10.1
-joblib==1.1.0
-Markdown==3.3.6
-multidict==5.2.0
-numpy==1.22.1
-oauthlib==3.1.1
-packaging==21.3
-pathtools==0.1.2
-promise==2.3
-protobuf==3.19.3
-psutil==5.9.0
-pyasn1==0.4.8
-pyasn1-modules==0.2.8
-pyDeprecate==0.3.1
-pyparsing==3.0.6
-python-dateutil==2.8.2
-pytorch-lightning==1.5.8
-PyYAML==6.0
-regex==2022.1.18
-requests==2.27.1
-requests-oauthlib==1.3.0
-rsa==4.8
-sacremoses==0.0.47
-sentry-sdk==1.5.2
-shortuuid==1.0.8
-six==1.16.0
-smmap==5.0.0
-subprocess32==3.5.4
-tensorboard==2.7.0
-tensorboard-data-server==0.6.1
-tensorboard-plugin-wit==1.8.1
-termcolor==1.1.0
-tokenizers==0.10.3
-torch==1.10.1
-torchmetrics==0.7.0
-tqdm==4.62.3
-transformers==4.15.0
-typing_extensions==4.0.1
-urllib3==1.26.8
-wandb==0.12.9
-Werkzeug==2.0.2
-yarl==1.7.2
-yaspin==2.1.0
-zipp==3.7.0
-
-pandas~=1.3.5
wandb/latest-run
DELETED
@@ -1 +0,0 @@
-run-20220120_133013-zhqz22ma
wandb/run-20220120_131057-39a70no5/files/conda-environment.yaml
DELETED
@@ -1,82 +0,0 @@
-name: idiomify-demo
-channels:
-  - conda-forge
-dependencies:
-  - bzip2=1.0.8=h3422bc3_4
-  - ca-certificates=2021.10.8=h4653dfc_0
-  - libffi=3.4.2=h3422bc3_5
-  - libzlib=1.2.11=hee7b306_1013
-  - ncurses=6.3=hc470f4d_0
-  - openssl=3.0.0=h3422bc3_2
-  - pip=21.3.1=pyhd8ed1ab_0
-  - python=3.9.9=h43b31ca_0_cpython
-  - python_abi=3.9=2_cp39
-  - readline=8.1=hedafd6a_0
-  - setuptools=60.5.0=py39h2804cbe_0
-  - sqlite=3.37.0=h72a2b83_0
-  - tk=8.6.11=he1e0b03_1
-  - tzdata=2021e=he74cb21_0
-  - wheel=0.37.1=pyhd8ed1ab_0
-  - xz=5.2.5=h642e427_1
-  - zlib=1.2.11=hee7b306_1013
-  - pip:
-    - absl-py==1.0.0
-    - aiohttp==3.8.1
-    - aiosignal==1.2.0
-    - async-timeout==4.0.2
-    - attrs==21.4.0
-    - cachetools==4.2.4
-    - certifi==2021.10.8
-    - charset-normalizer==2.0.10
-    - click==8.0.3
-    - configparser==5.2.0
-    - docker-pycreds==0.4.0
-    - frozenlist==1.3.0
-    - fsspec==2022.1.0
-    - future==0.18.2
-    - gitdb==4.0.9
-    - gitpython==3.1.26
-    - google-auth==2.3.3
-    - google-auth-oauthlib==0.4.6
-    - grpcio==1.43.0
-    - idna==3.3
-    - importlib-metadata==4.10.1
-    - markdown==3.3.6
-    - multidict==5.2.0
-    - numpy==1.22.1
-    - oauthlib==3.1.1
-    - packaging==21.3
-    - pathtools==0.1.2
-    - promise==2.3
-    - protobuf==3.19.3
-    - psutil==5.9.0
-    - pyasn1==0.4.8
-    - pyasn1-modules==0.2.8
-    - pydeprecate==0.3.1
-    - pyparsing==3.0.6
-    - python-dateutil==2.8.2
-    - pytorch-lightning==1.5.8
-    - pyyaml==6.0
-    - requests==2.27.1
-    - requests-oauthlib==1.3.0
-    - rsa==4.8
-    - sentry-sdk==1.5.2
-    - shortuuid==1.0.8
-    - six==1.16.0
-    - smmap==5.0.0
-    - subprocess32==3.5.4
-    - tensorboard==2.7.0
-    - tensorboard-data-server==0.6.1
-    - tensorboard-plugin-wit==1.8.1
-    - termcolor==1.1.0
-    - torch==1.10.1
-    - torchmetrics==0.7.0
-    - tqdm==4.62.3
-    - typing-extensions==4.0.1
-    - urllib3==1.26.8
-    - wandb==0.12.9
-    - werkzeug==2.0.2
-    - yarl==1.7.2
-    - yaspin==2.1.0
-    - zipp==3.7.0
-prefix: /opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo
wandb/run-20220120_131057-39a70no5/files/config.yaml
DELETED
@@ -1,21 +0,0 @@
-wandb_version: 1
-
-_wandb:
-  desc: null
-  value:
-    cli_version: 0.12.9
-    is_jupyter_run: false
-    is_kaggle_kernel: false
-    python_version: 3.9.9
-    start_time: 1642651857
-    t:
-      3:
-      - 16
-      4: 3.9.9
-      5: 0.12.9
-      8:
-      - 4
-      - 5
-path:
-  desc: null
-  value: artifacts/wisdom2def_c.tsv
wandb/run-20220120_131057-39a70no5/files/diff.patch
DELETED
@@ -1,77 +0,0 @@
-diff --git a/README.md b/README.md
-index f7b5541..167966c 100644
---- a/README.md
-+++ b/README.md
-@@ -1,2 +1,7 @@
- # idiomify-demo
- Cross-lingual reverse dictionary of English idioms
-+
-+
-+## Requirements
-+- wandb
-+- pytorch-lightning
-
-diff --git a/artifacts/wisdom2def_c.tsv b/artifacts/wisdom2def_c.tsv
-new file mode 100644
-index 0000000..324d169
---- /dev/null
-+++ b/artifacts/wisdom2def_c.tsv
-@@ -0,0 +1,25 @@
-+beat around the bush	To fail to come to the important point about something
-+beat around the bush	To speak vaguely or euphemistically so as to avoid talkingdirectly about an unpleasant or sensitive topic
-+beat around the bush	Indirection in word or deed
-+beat around the bush	to shilly-shally
-+beat around the bush	to approach something in a roundabout way
-+backhanded compliment	An insulting or negative comment disguised as praise.
-+backhanded compliment	an unintended or ambiguous compliment.
-+backhanded compliment	a remark which seems to be praising someone or something but which could also be understood as criticism
-+backhanded compliment	a remark that seems to say something pleasant about a person but could also be an insult
-+backhanded compliment	a remark that seems to express admiration but could also be understood as an insult
-+steer clear of	To avoid someone or something.
-+steer clear of	Stay away from
-+steer clear of	take care to avoid or keep away from
-+steer clear of	to avoid someone or something that seems unpleasant, dangerous, or likely to cause problems
-+steer clear of	deliberately avoid someone
-+dish it out	To voice harsh thoughts, criticisms, or insults.
-+dish it out	To gossip about someone or something
-+dish it out	To give something, or to tell something such as information or your opinions
-+dish it out	someone easily criticizes other people but does not like it when other people criticize him or her
-+dish it out	to criticize other people
-+make headway	make progress with something that you are trying to achieve.
-+make headway	make progress, especially when this is slow or difficult
-+make headway	To advance.
-+make headway	to move forward or make progress
-+make headway	to begin to succeed
-
-diff --git a/artifacts/wisdom2def_d.tsv b/artifacts/wisdom2def_d.tsv
-new file mode 100644
-index 0000000..74549d8
---- /dev/null
-+++ b/artifacts/wisdom2def_d.tsv
-@@ -0,0 +1,25 @@
-+beat around the bush	어떤 것에 대해 중요한 요점을 찾지 못하는 것
-+beat around the bush	불쾌하거나 민감한 주제에 대해 직접적으로 이야기하는 것을 피하기 위해 모호하거나 완곡하게 말한다.
-+beat around the bush	단어나 태도가 우회적이다
-+beat around the bush	우물쭈물하다
-+beat around the bush	우회적으로 접근하다
-+backhanded compliment	칭찬으로 가장한 모욕적이거나 부정적인 논평
-+backhanded compliment	의도하지 않거나 애매한 칭찬
-+backhanded compliment	누군가를 칭찬하는 것 같지만 비판으로도 이해될 수 있는 말
-+backhanded compliment	남을 기쁘게 하는 말 같지만 모욕이 될 수도 있는 말
-+backhanded compliment	감탄하는 듯 하면서도 모욕으로 이해될 수 있는 말
-+steer clear of	누군가나 뭔가를 피하다
-+steer clear of	떨어져 지내다
-+steer clear of	피하거나 멀리하도록 주의하다
-+steer clear of	불쾌하거나 위험하거나 문제를 일으킬 것 같은 사람이나 물건을 피하다
-+steer clear of	일부러 피하다
-+dish it out	가혹한 생각, 비판, 또는 모욕의 목소리를 내는 것.
-+dish it out	누군가 또는 무언가에 대해 험담하는 것
-+dish it out	어떤 것을 주거나 정보나 당신의 의견과 같은 것을 말하는 것
-+dish it out	다른 사람을 쉽게 비판하지만 다른 사람이 자신을 비판할때는 좋아하지 않음
-+dish it out	다른 사람을 비판하다
-+make headway	성취하고자 하는 어떤 것에 진척이 생기다
-+make headway	특히 이것이 느리거나 어려울 때, 진전을 이루다.
-+make headway	전진하다
-+make headway	앞으로 나아가거나 진전을 이루다
-+make headway	성공하기 시작하다
-
wandb/run-20220120_131057-39a70no5/files/requirements.txt
DELETED
@@ -1,62 +0,0 @@
-absl-py==1.0.0
-aiohttp==3.8.1
-aiosignal==1.2.0
-async-timeout==4.0.2
-attrs==21.4.0
-cachetools==4.2.4
-certifi==2021.10.8
-charset-normalizer==2.0.10
-click==8.0.3
-configparser==5.2.0
-docker-pycreds==0.4.0
-frozenlist==1.3.0
-fsspec==2022.1.0
-future==0.18.2
-gitdb==4.0.9
-gitpython==3.1.26
-google-auth-oauthlib==0.4.6
-google-auth==2.3.3
-grpcio==1.43.0
-idna==3.3
-importlib-metadata==4.10.1
-markdown==3.3.6
-multidict==5.2.0
-numpy==1.22.1
-oauthlib==3.1.1
-packaging==21.3
-pathtools==0.1.2
-pip==21.3.1
-promise==2.3
-protobuf==3.19.3
-psutil==5.9.0
-pyasn1-modules==0.2.8
-pyasn1==0.4.8
-pydeprecate==0.3.1
-pyparsing==3.0.6
-python-dateutil==2.8.2
-pytorch-lightning==1.5.8
-pyyaml==6.0
-requests-oauthlib==1.3.0
-requests==2.27.1
-rsa==4.8
-sentry-sdk==1.5.2
-setuptools==60.5.0
-shortuuid==1.0.8
-six==1.16.0
-smmap==5.0.0
-subprocess32==3.5.4
-tensorboard-data-server==0.6.1
-tensorboard-plugin-wit==1.8.1
-tensorboard==2.7.0
-termcolor==1.1.0
-torch==1.10.1
-torchmetrics==0.7.0
-tqdm==4.62.3
-typing-extensions==4.0.1
-urllib3==1.26.8
-wandb==0.12.9
-werkzeug==2.0.2
-wheel==0.37.1
-yarl==1.7.2
-yaspin==2.1.0
-zipp==3.7.0
wandb/run-20220120_131057-39a70no5/files/wandb-metadata.json
DELETED
@@ -1,31 +0,0 @@
-{
-    "os": "macOS-12.1-arm64-arm-64bit",
-    "python": "3.9.9",
-    "heartbeatAt": "2022-01-20T04:10:57.955060",
-    "startedAt": "2022-01-20T04:10:57.174003",
-    "docker": null,
-    "cpu_count": 8,
-    "cuda": null,
-    "args": [
-        "artifact",
-        "put",
-        "artifacts/wisdom2def_c.tsv",
-        "-n",
-        "wisdom2def",
-        "-t",
-        "dataset",
-        "-a",
-        "c"
-    ],
-    "state": "running",
-    "program": "/opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo/bin/wandb",
-    "git": {
-        "remote": "https://github.com/eubinecto/idiomify-demo.git",
-        "commit": "db5933850fd03c3e44c527c7aa110880a26d8499"
-    },
-    "email": "eubinecto",
-    "root": "/Users/eubinecto/Desktop/Projects/Toy/idiomify-demo",
-    "host": "Eu-Bins-MacBook-Air.local",
-    "username": "eubinecto",
-    "executable": "/opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo/bin/python3.9"
-}
wandb/run-20220120_131057-39a70no5/files/wandb-summary.json
DELETED
@@ -1 +0,0 @@
-{"_wandb": {"runtime": 4}}
wandb/run-20220120_131057-39a70no5/run-39a70no5.wandb
DELETED
Binary file (991 Bytes)
wandb/run-20220120_131124-isjyx9fs/files/conda-environment.yaml
DELETED
@@ -1,82 +0,0 @@
-name: idiomify-demo
-channels:
-  - conda-forge
-dependencies:
-  - bzip2=1.0.8=h3422bc3_4
-  - ca-certificates=2021.10.8=h4653dfc_0
-  - libffi=3.4.2=h3422bc3_5
-  - libzlib=1.2.11=hee7b306_1013
-  - ncurses=6.3=hc470f4d_0
-  - openssl=3.0.0=h3422bc3_2
-  - pip=21.3.1=pyhd8ed1ab_0
-  - python=3.9.9=h43b31ca_0_cpython
-  - python_abi=3.9=2_cp39
-  - readline=8.1=hedafd6a_0
-  - setuptools=60.5.0=py39h2804cbe_0
-  - sqlite=3.37.0=h72a2b83_0
-  - tk=8.6.11=he1e0b03_1
-  - tzdata=2021e=he74cb21_0
-  - wheel=0.37.1=pyhd8ed1ab_0
-  - xz=5.2.5=h642e427_1
-  - zlib=1.2.11=hee7b306_1013
-  - pip:
-    - absl-py==1.0.0
-    - aiohttp==3.8.1
-    - aiosignal==1.2.0
-    - async-timeout==4.0.2
-    - attrs==21.4.0
-    - cachetools==4.2.4
-    - certifi==2021.10.8
-    - charset-normalizer==2.0.10
-    - click==8.0.3
-    - configparser==5.2.0
-    - docker-pycreds==0.4.0
-    - frozenlist==1.3.0
-    - fsspec==2022.1.0
-    - future==0.18.2
-    - gitdb==4.0.9
-    - gitpython==3.1.26
-    - google-auth==2.3.3
-    - google-auth-oauthlib==0.4.6
-    - grpcio==1.43.0
-    - idna==3.3
-    - importlib-metadata==4.10.1
-    - markdown==3.3.6
-    - multidict==5.2.0
-    - numpy==1.22.1
-    - oauthlib==3.1.1
-    - packaging==21.3
-    - pathtools==0.1.2
-    - promise==2.3
-    - protobuf==3.19.3
-    - psutil==5.9.0
-    - pyasn1==0.4.8
-    - pyasn1-modules==0.2.8
-    - pydeprecate==0.3.1
-    - pyparsing==3.0.6
-    - python-dateutil==2.8.2
-    - pytorch-lightning==1.5.8
-    - pyyaml==6.0
-    - requests==2.27.1
-    - requests-oauthlib==1.3.0
-    - rsa==4.8
-    - sentry-sdk==1.5.2
-    - shortuuid==1.0.8
-    - six==1.16.0
-    - smmap==5.0.0
-    - subprocess32==3.5.4
-    - tensorboard==2.7.0
-    - tensorboard-data-server==0.6.1
-    - tensorboard-plugin-wit==1.8.1
-    - termcolor==1.1.0
-    - torch==1.10.1
-    - torchmetrics==0.7.0
-    - tqdm==4.62.3
-    - typing-extensions==4.0.1
-    - urllib3==1.26.8
-    - wandb==0.12.9
-    - werkzeug==2.0.2
-    - yarl==1.7.2
-    - yaspin==2.1.0
-    - zipp==3.7.0
-prefix: /opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo
wandb/run-20220120_131124-isjyx9fs/files/config.yaml
DELETED
@@ -1,21 +0,0 @@
-wandb_version: 1
-
-_wandb:
-  desc: null
-  value:
-    cli_version: 0.12.9
-    is_jupyter_run: false
-    is_kaggle_kernel: false
-    python_version: 3.9.9
-    start_time: 1642651884
-    t:
-      3:
-      - 16
-      4: 3.9.9
-      5: 0.12.9
-      8:
-      - 4
-      - 5
-path:
-  desc: null
-  value: artifacts/wisdom2def_d.tsv
wandb/run-20220120_131124-isjyx9fs/files/diff.patch
DELETED
@@ -1,77 +0,0 @@
-diff --git a/README.md b/README.md
-index f7b5541..167966c 100644
---- a/README.md
-+++ b/README.md
-@@ -1,2 +1,7 @@
- # idiomify-demo
- Cross-lingual reverse dictionary of English idioms
-+
-+
-+## Requirements
-+- wandb
-+- pytorch-lightning
-
-diff --git a/artifacts/wisdom2def_c.tsv b/artifacts/wisdom2def_c.tsv
-new file mode 100644
-index 0000000..324d169
---- /dev/null
-+++ b/artifacts/wisdom2def_c.tsv
-@@ -0,0 +1,25 @@
-+beat around the bush	To fail to come to the important point about something
-+beat around the bush	To speak vaguely or euphemistically so as to avoid talkingdirectly about an unpleasant or sensitive topic
-+beat around the bush	Indirection in word or deed
-+beat around the bush	to shilly-shally
-+beat around the bush	to approach something in a roundabout way
-+backhanded compliment	An insulting or negative comment disguised as praise.
-+backhanded compliment	an unintended or ambiguous compliment.
-+backhanded compliment	a remark which seems to be praising someone or something but which could also be understood as criticism
-+backhanded compliment	a remark that seems to say something pleasant about a person but could also be an insult
-+backhanded compliment	a remark that seems to express admiration but could also be understood as an insult
-+steer clear of	To avoid someone or something.
-+steer clear of	Stay away from
-+steer clear of	take care to avoid or keep away from
-+steer clear of	to avoid someone or something that seems unpleasant, dangerous, or likely to cause problems
-+steer clear of	deliberately avoid someone
-+dish it out	To voice harsh thoughts, criticisms, or insults.
-+dish it out	To gossip about someone or something
-+dish it out	To give something, or to tell something such as information or your opinions
-+dish it out	someone easily criticizes other people but does not like it when other people criticize him or her
-+dish it out	to criticize other people
-+make headway	make progress with something that you are trying to achieve.
-+make headway	make progress, especially when this is slow or difficult
-+make headway	To advance.
-+make headway	to move forward or make progress
-+make headway	to begin to succeed
-
-diff --git a/artifacts/wisdom2def_d.tsv b/artifacts/wisdom2def_d.tsv
-new file mode 100644
-index 0000000..74549d8
---- /dev/null
-+++ b/artifacts/wisdom2def_d.tsv
-@@ -0,0 +1,25 @@
-+beat around the bush	어떤 것에 대해 중요한 요점을 찾지 못하는 것
-+beat around the bush	불쾌하거나 민감한 주제에 대해 직접적으로 이야기하는 것을 피하기 위해 모호하거나 완곡하게 말한다.
-+beat around the bush	단어나 태도가 우회적이다
-+beat around the bush	우물쭈물하다
-+beat around the bush	우회적으로 접근하다
-+backhanded compliment	칭찬으로 가장한 모욕적이거나 부정적인 논평
-+backhanded compliment	의도하지 않거나 애매한 칭찬
-+backhanded compliment	누군가를 칭찬하는 것 같지만 비판으로도 이해될 수 있는 말
-+backhanded compliment	남을 기쁘게 하는 말 같지만 모욕이 될 수도 있는 말
-+backhanded compliment	감탄하는 듯 하면서도 모욕으로 이해될 수 있는 말
-+steer clear of	누군가나 뭔가를 피하다
-+steer clear of	떨어져 지내다
-+steer clear of	피하거나 멀리하도록 주의하다
-+steer clear of	불쾌하거나 위험하거나 문제를 일으킬 것 같은 사람이나 물건을 피하다
-+steer clear of	일부러 피하다
-+dish it out	가혹한 생각, 비판, 또는 모욕의 목소리를 내는 것.
-+dish it out	누군가 또는 무언가에 대해 험담하는 것
-+dish it out	어떤 것을 주거나 정보나 당신의 의견과 같은 것을 말하는 것
-+dish it out	다른 사람을 쉽게 비판하지만 다른 사람이 자신을 비판할때는 좋아하지 않음
-+dish it out	다른 사람을 비판하다
-+make headway	성취하고자 하는 어떤 것에 진척이 생기다
-+make headway	특히 이것이 느리거나 어려울 때, 진전을 이루다.
-+make headway	전진하다
-+make headway	앞으로 나아가거나 진전을 이루다
-+make headway	성공하기 시작하다
-
wandb/run-20220120_131124-isjyx9fs/files/requirements.txt
DELETED
@@ -1,62 +0,0 @@
-absl-py==1.0.0
-aiohttp==3.8.1
-aiosignal==1.2.0
-async-timeout==4.0.2
-attrs==21.4.0
-cachetools==4.2.4
-certifi==2021.10.8
-charset-normalizer==2.0.10
-click==8.0.3
-configparser==5.2.0
-docker-pycreds==0.4.0
-frozenlist==1.3.0
-fsspec==2022.1.0
-future==0.18.2
-gitdb==4.0.9
-gitpython==3.1.26
-google-auth-oauthlib==0.4.6
-google-auth==2.3.3
-grpcio==1.43.0
-idna==3.3
-importlib-metadata==4.10.1
-markdown==3.3.6
-multidict==5.2.0
-numpy==1.22.1
-oauthlib==3.1.1
-packaging==21.3
-pathtools==0.1.2
-pip==21.3.1
-promise==2.3
-protobuf==3.19.3
-psutil==5.9.0
-pyasn1-modules==0.2.8
-pyasn1==0.4.8
-pydeprecate==0.3.1
-pyparsing==3.0.6
-python-dateutil==2.8.2
-pytorch-lightning==1.5.8
-pyyaml==6.0
-requests-oauthlib==1.3.0
-requests==2.27.1
-rsa==4.8
-sentry-sdk==1.5.2
-setuptools==60.5.0
-shortuuid==1.0.8
-six==1.16.0
-smmap==5.0.0
-subprocess32==3.5.4
-tensorboard-data-server==0.6.1
-tensorboard-plugin-wit==1.8.1
-tensorboard==2.7.0
-termcolor==1.1.0
-torch==1.10.1
-torchmetrics==0.7.0
-tqdm==4.62.3
-typing-extensions==4.0.1
-urllib3==1.26.8
-wandb==0.12.9
-werkzeug==2.0.2
-wheel==0.37.1
-yarl==1.7.2
-yaspin==2.1.0
-zipp==3.7.0
wandb/run-20220120_131124-isjyx9fs/files/wandb-metadata.json
DELETED
@@ -1,31 +0,0 @@
-{
-    "os": "macOS-12.1-arm64-arm-64bit",
-    "python": "3.9.9",
-    "heartbeatAt": "2022-01-20T04:11:25.393449",
-    "startedAt": "2022-01-20T04:11:24.663767",
-    "docker": null,
-    "cpu_count": 8,
-    "cuda": null,
-    "args": [
-        "artifact",
-        "put",
-        "artifacts/wisdom2def_d.tsv",
-        "-n",
-        "wisdom2def",
-        "-t",
-        "dataset",
-        "-a",
-        "d"
-    ],
-    "state": "running",
-    "program": "/opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo/bin/wandb",
-    "git": {
-        "remote": "https://github.com/eubinecto/idiomify-demo.git",
-        "commit": "db5933850fd03c3e44c527c7aa110880a26d8499"
-    },
-    "email": "eubinecto",
-    "root": "/Users/eubinecto/Desktop/Projects/Toy/idiomify-demo",
-    "host": "Eu-Bins-MacBook-Air.local",
-    "username": "eubinecto",
-    "executable": "/opt/homebrew/Caskroom/miniforge/base/envs/idiomify-demo/bin/python3.9"
-}
wandb/run-20220120_131124-isjyx9fs/files/wandb-summary.json
DELETED
@@ -1 +0,0 @@
-{"_wandb": {"runtime": 4}}
wandb/run-20220120_131124-isjyx9fs/run-isjyx9fs.wandb
DELETED
Binary file (990 Bytes)