eubinecto committed
Commit dfe3d0b
2 Parent(s): 2bd8a1e e589e3d

Merge pull request #3 from eubinecto/issue_2

README.md CHANGED
@@ -1,13 +1,22 @@
  # Idiomify
+ [![Open in Streamlit](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://share.streamlit.io/eubinecto/idiomify/issue_2/main_deploy.py)
 
- A human-inspired Idiomifier based on BERT
+ Grammarly for idioms. A human-inspired Idiomifier based on BART.
 
- <img width="807" alt="image" src="https://user-images.githubusercontent.com/56193069/153775460-5ca04edd-e788-442d-b0f1-e780dc0a5724.png">
+ <img width="764" alt="image" src="https://user-images.githubusercontent.com/56193069/156941205-830b53aa-a3e6-4263-be03-e568124a256e.png">
 
- ## Requirements
- - wandb
- - pytorch-lightning
- - transformers
- - pandas
+ ## Versions
+ ### models
+ format: `m-a-b`
+ - a: indicates a change in the architecture, or a revision of the final product
+ - b: indicates a different version of the same architecture (with a different set of hyperparameters)
+
+ ### datasets
+ format: `d-a-b`
+ - a: indicates a change in the dataset we use
+ - b: indicates a different version of the dataset
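Note: these version tags are exactly what the fetchers introduced below consume; a minimal usage sketch, assuming the `m-1-2` and `d-1-2` artifacts are available on the `eubinecto/idiomify` wandb project:

```python
from idiomify.fetchers import fetch_idiomifier, fetch_literal2idiomatic, fetch_idioms

model = fetch_idiomifier("m-1-2")                     # architecture 1, hyperparameter set 2
train_df, test_df = fetch_literal2idiomatic("d-1-2")  # dataset 1 (PIE), version 2
idioms = fetch_idioms("d-1-2")                        # idioms in the d-1-2 training set
```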
config.yaml CHANGED
@@ -1,8 +1,21 @@
- tag011:
-   desc: just overfitting
+ idiomifier:
+   ver: m-1-2
+   desc: just overfitting the model, but on the entire PIE dataset.
    bart: facebook/bart-base
    lr: 0.0001
-   literal2idiomatic_ver: tag01
-   max_epochs: 100
-   batch_size: 100
-   shuffle: true
+   literal2idiomatic_ver: d-1-2
+   idioms_ver: d-1-2
+   max_epochs: 2
+   batch_size: 40
+   shuffle: true
+   seed: 104
+
+ # for building & uploading datasets or the tokenizer
+ idioms:
+   ver: d-1-2
+   description: the set of idioms in the training set of literal2idiomatic_d-1-2.
+ literal2idiomatic:
+   ver: d-1-2
+   description: PIE data split into train & test sets (80/20). There is no validation set because no hyperparameter tuning is planned.
+   train_ratio: 0.8
+   seed: 104
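The training and upload scripts read this file through `fetch_config()` and pick out the block they need; a minimal sketch of the access pattern:

```python
from idiomify.fetchers import fetch_config

config = fetch_config()['idiomifier']
print(config['ver'])                    # m-1-2
print(config['literal2idiomatic_ver'])  # d-1-2, the dataset the model trains on
```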
explore/explore_bart_logits_shape.py CHANGED
@@ -1,6 +1,6 @@
  from transformers import BartTokenizer, BartForConditionalGeneration
 
- from data import IdiomifyDataModule
+ from datamodules import IdiomifyDataModule
 
 
  CONFIG = {
explore/{explore_fetch_seq2seq.py → explore_fetch_idiomifier.py} RENAMED
@@ -1,8 +1,8 @@
- from idiomify.fetchers import fetch_seq2seq
+ from idiomify.fetchers import fetch_idiomifier
 
 
  def main():
-     model = fetch_seq2seq("overfit")
+     model = fetch_idiomifier("m-1-2")
      print(model.bart.config)
 
 
explore/{explore_fetch_seq2seq_predict.py → explore_fetch_idiomifier_predict.py} RENAMED
@@ -1,10 +1,10 @@
  from transformers import BartTokenizer
  from builders import SourcesBuilder
- from fetchers import fetch_seq2seq
+ from fetchers import fetch_idiomifier
 
 
  def main():
-     model = fetch_seq2seq("overfit")
+     model = fetch_idiomifier("m-1-2")
      tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
      lit2idi = [
          ("my man", ""),
explore/explore_fetch_idioms.py CHANGED
@@ -2,7 +2,7 @@ from idiomify.fetchers import fetch_idioms
 
 
  def main():
-     print(fetch_idioms("pie_v0"))
+     print(fetch_idioms("d-1-2"))
 
 
  if __name__ == '__main__':
explore/explore_fetch_literal2idiomatic.py CHANGED
@@ -2,8 +2,9 @@ from idiomify.fetchers import fetch_literal2idiomatic
 
 
  def main():
-     for src, tgt in fetch_literal2idiomatic("pie_v0"):
-         print(src, "->", tgt)
+     train_df, test_df = fetch_literal2idiomatic("d-1-2")
+     print(train_df.size)  # 12408 rows
+     print(test_df.size)  # 3102 rows
 
 
  if __name__ == '__main__':
explore/explore_fetch_pie.py CHANGED
@@ -3,11 +3,9 @@ from idiomify.fetchers import fetch_pie
 
 
  def main():
-     for idx, row in enumerate(fetch_pie()):
-         print(idx, row)
-         # the first 105 = V0.
-         if idx == 105:
-             break
+     pie_df = fetch_pie()
+     for idx, row in pie_df.iterrows():
+         print(row)
 
 
  if __name__ == '__main__':
explore/explore_fetch_pie_df_select.py ADDED
@@ -0,0 +1,12 @@
+ from fetchers import fetch_pie
+
+
+ def main():
+     pie_df = fetch_pie()
+     print(pie_df.columns)
+     pie_df = pie_df[["Literal_Sent", "Idiomatic_Sent"]]
+     print(pie_df.head(5))
+
+
+ if __name__ == '__main__':
+     main()
explore/explore_idiomifydatamodule.py CHANGED
@@ -1,9 +1,9 @@
  from transformers import BartTokenizer
- from idiomify.data import IdiomifyDataModule
+ from idiomify.datamodules import IdiomifyDataModule
 
 
  CONFIG = {
-     "literal2idiomatic_ver": "pie_v0",
+     "literal2idiomatic_ver": "d-1-2",
      "batch_size": 20,
      "num_workers": 4,
      "shuffle": True
@@ -11,7 +11,7 @@ CONFIG = {
 
 
  def main():
-     tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+     tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
      datamodule = IdiomifyDataModule(CONFIG, tokenizer)
      datamodule.prepare_data()
      datamodule.setup()
@@ -20,6 +20,14 @@ def main():
          print(srcs.shape)
          print(tgts_r.shape)
          print(tgts.shape)
+         break
+
+     for batch in datamodule.test_dataloader():
+         srcs, tgts_r, tgts = batch
+         print(srcs.shape)
+         print(tgts_r.shape)
+         print(tgts.shape)
+         break
 
 
  if __name__ == '__main__':
explore/explore_torchmetrics_bleu.py ADDED
@@ -0,0 +1,28 @@
+
+ from torchmetrics import BLEUScore
+ from transformers import BartTokenizer
+
+
+ pairs = [
+     ("I knew you could do it", "I knew you could do it"),
+     ("I knew you could do it", "you knew you could do it")
+ ]
+
+
+ def main():
+     tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
+     metric = BLEUScore()
+     preds = tokenizer([pred for pred, _ in pairs])['input_ids']
+     targets = tokenizer([target for _, target in pairs])['input_ids']
+     print(preds)
+     print(targets)
+     print(metric(preds, targets))
+     # arghhh, so the BLEU score does not support tensors...
+     """
+     AttributeError: 'int' object has no attribute 'split'
+     """
+     # let's just go for the accuracies then.
+
+
+ if __name__ == '__main__':
+     main()
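The `AttributeError` above comes from feeding token ids to a metric that expects raw text: `torchmetrics.BLEUScore` whitespace-splits its inputs itself. A minimal sketch of the string-based call that should work (assuming torchmetrics ≥ 0.8 conventions: predictions first, then one list of reference sentences per prediction):

```python
from torchmetrics import BLEUScore

metric = BLEUScore()
preds = ["I knew you could do it"]      # one string per prediction
target = [["I knew you could do it"]]   # one list of references per prediction
print(metric(preds, target))            # tensor(1.) for an exact match
```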
idiomify/builders.py CHANGED
@@ -55,9 +55,9 @@ class SourcesBuilder(TensorBuilder):
                            padding=True,
                            truncation=True,
                            add_special_tokens=True)
-         src = torch.stack([encodings['input_ids'],
-                            encodings['attention_mask']], dim=1)  # (N, 2, L)
-         return src  # (N, 2, L)
+         srcs = torch.stack([encodings['input_ids'],
+                             encodings['attention_mask']], dim=1)  # (N, 2, L)
+         return srcs  # (N, 2, L)
 
 
  class TargetsRightShiftedBuilder(TensorBuilder):
@@ -83,5 +83,3 @@ class TargetsBuilder(TensorBuilder):
          ], return_tensors="pt", add_special_tokens=False, padding=True, truncation=True)
          tgts = encodings['input_ids']
          return tgts  # (N, L)
-
-
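For context, the `(N, 2, L)` layout that `SourcesBuilder` returns packs `input_ids` and `attention_mask` into a single tensor, which the model later unpacks as `srcs[:, 0]` and `srcs[:, 1]` (see `pipeline.py` below). A minimal sketch of that layout:

```python
import torch
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
encodings = tokenizer(["my man"], return_tensors="pt", padding=True)
srcs = torch.stack([encodings['input_ids'],
                    encodings['attention_mask']], dim=1)
print(srcs.shape)  # torch.Size([1, 2, L])
print(srcs[:, 0])  # input_ids       (N, L)
print(srcs[:, 1])  # attention_mask  (N, L)
```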
idiomify/{data.py → datamodules.py} RENAMED
@@ -1,9 +1,9 @@
  import torch
- from typing import Tuple, Optional, List
+ import pandas as pd
  from torch.utils.data import Dataset, DataLoader
+ from typing import Tuple, Optional
  from pytorch_lightning import LightningDataModule
  from wandb.sdk.wandb_run import Run
-
  from idiomify.fetchers import fetch_literal2idiomatic
  from idiomify.builders import SourcesBuilder, TargetsBuilder, TargetsRightShiftedBuilder
  from transformers import BartTokenizer
@@ -38,9 +38,6 @@ class IdiomifyDataset(Dataset):
  class IdiomifyDataModule(LightningDataModule):
 
      # boilerplate - just ignore these
-     def test_dataloader(self):
-         pass
-
      def val_dataloader(self):
          pass
 
@@ -56,23 +53,37 @@ class IdiomifyDataModule(LightningDataModule):
          self.tokenizer = tokenizer
          self.run = run
          # --- to be downloaded & built --- #
-         self.literal2idiomatic: Optional[List[Tuple[str, str]]] = None
-         self.dataset: Optional[IdiomifyDataset] = None
+         self.train_df: Optional[pd.DataFrame] = None
+         self.test_df: Optional[pd.DataFrame] = None
+         self.train_dataset: Optional[IdiomifyDataset] = None
+         self.test_dataset: Optional[IdiomifyDataset] = None
 
      def prepare_data(self):
          """
          prepare: download all data needed for this from wandb to local.
          """
-         self.literal2idiomatic = fetch_literal2idiomatic(self.config['literal2idiomatic_ver'], self.run)
+         self.train_df, self.test_df = fetch_literal2idiomatic(self.config['literal2idiomatic_ver'], self.run)
 
      def setup(self, stage: Optional[str] = None):
          # --- set up the builders --- #
          # build the datasets
-         srcs = SourcesBuilder(self.tokenizer)(self.literal2idiomatic)
-         tgts_r = TargetsRightShiftedBuilder(self.tokenizer)(self.literal2idiomatic)
-         tgts = TargetsBuilder(self.tokenizer)(self.literal2idiomatic)
-         self.dataset = IdiomifyDataset(srcs, tgts_r, tgts)
+         self.train_dataset = self.build_dataset(self.train_df)
+         self.test_dataset = self.build_dataset(self.test_df)
+
+     def build_dataset(self, df: pd.DataFrame) -> IdiomifyDataset:
+         literal2idiomatic = [
+             (row['Literal_Sent'], row['Idiomatic_Sent'])
+             for _, row in df.iterrows()
+         ]
+         srcs = SourcesBuilder(self.tokenizer)(literal2idiomatic)
+         tgts_r = TargetsRightShiftedBuilder(self.tokenizer)(literal2idiomatic)
+         tgts = TargetsBuilder(self.tokenizer)(literal2idiomatic)
+         return IdiomifyDataset(srcs, tgts_r, tgts)
 
      def train_dataloader(self) -> DataLoader:
-         return DataLoader(self.dataset, batch_size=self.config['batch_size'],
+         return DataLoader(self.train_dataset, batch_size=self.config['batch_size'],
                            shuffle=self.config['shuffle'], num_workers=self.config['num_workers'])
+
+     def test_dataloader(self) -> DataLoader:
+         return DataLoader(self.test_dataset, batch_size=self.config['batch_size'],
+                           shuffle=False, num_workers=self.config['num_workers'])
idiomify/fetchers.py CHANGED
@@ -1,25 +1,19 @@
- import csv
- from os import path
  import yaml
  import wandb
- import requests
+ from os import path
+ import pandas as pd
  from typing import Tuple, List
  from wandb.sdk.wandb_run import Run
- from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, seq2seq_dir
+ from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, idiomifier_dir
  from idiomify.urls import PIE_URL
  from transformers import AutoModelForSeq2SeqLM, AutoConfig
- from idiomify.models import Seq2Seq
+ from idiomify.models import Idiomifier
 
 
- def fetch_pie() -> list:
-     text = requests.get(PIE_URL).text
-     lines = (line for line in text.split("\n") if line)
-     reader = csv.reader(lines)
-     next(reader)  # skip the header
-     return [
-         row
-         for row in reader
-     ]
+ # --- from the web --- #
+ def fetch_pie() -> pd.DataFrame:
+     # fetch & parse it directly from the web
+     return pd.read_csv(PIE_URL)
 
 
  # --- from wandb --- #
@@ -39,7 +33,7 @@ def fetch_idioms(ver: str, run: Run = None) -> List[str]:
          return [line.strip() for line in fh]
 
 
- def fetch_literal2idiomatic(ver: str, run: Run = None) -> List[Tuple[str, str]]:
+ def fetch_literal2idiomatic(ver: str, run: Run = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
      # if run object is given, we track the lineage of the data.
      # if not, we get the dataset via wandb Api.
      if run:
@@ -47,23 +41,28 @@ def fetch_literal2idiomatic(ver: str, run: Run = None) -> List[Tuple[str, str]]:
      else:
          artifact = wandb.Api().artifact(f"eubinecto/idiomify/literal2idiomatic:{ver}", type="dataset")
      artifact_dir = artifact.download(root=literal2idiomatic(ver))
-     tsv_path = path.join(artifact_dir, "all.tsv")
-     with open(tsv_path, 'r') as fh:
-         reader = csv.reader(fh, delimiter="\t")
-         return [(row[0], row[1]) for row in reader]
+     train_path = path.join(artifact_dir, "train.tsv")
+     test_path = path.join(artifact_dir, "test.tsv")
+     train_df = pd.read_csv(train_path, sep="\t")
+     test_df = pd.read_csv(test_path, sep="\t")
+     return train_df, test_df
 
 
- def fetch_seq2seq(ver: str, run: Run = None) -> Seq2Seq:
+ def fetch_idiomifier(ver: str, run: Run = None) -> Idiomifier:
+     """
+     you may want to change the name to Idiomifier.
+     The current Idiomifier then turns into a pipeline.
+     """
      if run:
-         artifact = run.use_artifact(f"seq2seq:{ver}", type="model")
+         artifact = run.use_artifact(f"idiomifier:{ver}", type="model")
      else:
-         artifact = wandb.Api().artifact(f"eubinecto/idiomify/seq2seq:{ver}", type="model")
+         artifact = wandb.Api().artifact(f"eubinecto/idiomify/idiomifier:{ver}", type="model")
      config = artifact.metadata
-     artifact_dir = artifact.download(root=seq2seq_dir(ver))
+     artifact_dir = artifact.download(root=idiomifier_dir(ver))
      ckpt_path = path.join(artifact_dir, "model.ckpt")
      bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
-     alpha = Seq2Seq.load_from_checkpoint(ckpt_path, bart=bart)
-     return alpha
+     model = Idiomifier.load_from_checkpoint(ckpt_path, bart=bart)
+     return model
 
 
  def fetch_config() -> dict:
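The `run` parameter decides how an artifact is fetched: inside a `wandb.init()` context, `run.use_artifact()` records the artifact as an input to the run (lineage tracking), while outside a run the public `wandb.Api()` is queried. A sketch of both modes, assuming the `d-1-2` artifact exists:

```python
import wandb
from idiomify.fetchers import fetch_literal2idiomatic

# outside a run: fetched via wandb.Api(), no lineage recorded
train_df, test_df = fetch_literal2idiomatic("d-1-2")

# inside a run: use_artifact() marks the dataset as an input to this run
with wandb.init(entity="eubinecto", project="idiomify") as run:
    train_df, test_df = fetch_literal2idiomatic("d-1-2", run)
```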
idiomify/metrics.py DELETED
@@ -1,4 +0,0 @@
- """
- you may want to include bleu score.
- and more metrics for paraphrasing.
- """
idiomify/models.py CHANGED
@@ -7,17 +7,19 @@ from torch.nn import functional as F
  import pytorch_lightning as pl
  from transformers import BartForConditionalGeneration, BartTokenizer
  from idiomify.builders import SourcesBuilder
+ from torchmetrics import Accuracy
 
-
- # for training
- class Seq2Seq(pl.LightningModule):  # noqa
+ class Idiomifier(pl.LightningModule):  # noqa
      """
      the baseline is in here.
      """
      def __init__(self, bart: BartForConditionalGeneration, lr: float, bos_token_id: int, pad_token_id: int):  # noqa
          super().__init__()
-         self.bart = bart
          self.save_hyperparameters(ignore=["bart"])
+         self.bart = bart
+         # metrics (using accuracies as of right now)
+         self.acc_train = Accuracy(ignore_index=pad_token_id)
+         self.acc_test = Accuracy(ignore_index=pad_token_id)
 
      def forward(self, srcs: torch.Tensor, tgts_r: torch.Tensor) -> torch.Tensor:
          """
@@ -38,10 +40,10 @@ class Seq2Seq(pl.LightningModule):  # noqa
 
      def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]) -> dict:
          srcs, tgts_r, tgts = batch  # (N, 2, L_s), (N, 2, L_t), (N, 2, L_t)
-         logits = self.forward(srcs, tgts_r)  # -> (N, L, |V|)
-         logits = logits.transpose(1, 2)  # (N, L, |V|) -> (N, |V|, L)
+         logits = self.forward(srcs, tgts_r).transpose(1, 2)  # ... -> (N, L, |V|) -> (N, |V|, L)
          loss = F.cross_entropy(logits, tgts, ignore_index=self.hparams['pad_token_id'])\
              .sum()  # (N, L, |V|), (N, L) -> (N,) -> (1,)
+         self.acc_train.update(logits.detach(), target=tgts.detach())
          return {
              "loss": loss
          }
@@ -49,6 +51,19 @@ class Seq2Seq(pl.LightningModule):  # noqa
      def on_train_batch_end(self, outputs: dict, *args, **kwargs):
          self.log("Train/Loss", outputs['loss'])
 
+     def on_train_epoch_end(self, *args, **kwargs) -> None:
+         self.log("Train/Accuracy", self.acc_train.compute())
+         self.acc_train.reset()
+
+     def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], *args, **kwargs):
+         srcs, tgts_r, tgts = batch  # (N, 2, L_s), (N, 2, L_t), (N, 2, L_t)
+         logits = self.forward(srcs, tgts_r).transpose(1, 2)  # ... -> (N, L, |V|) -> (N, |V|, L)
+         self.acc_test.update(logits.detach(), target=tgts.detach())
+
+     def on_test_epoch_end(self, *args, **kwargs) -> None:
+         self.log("Test/Accuracy", self.acc_test.compute())
+         self.acc_test.reset()
+
      def configure_optimizers(self) -> torch.optim.Optimizer:
          """
          Instantiates and returns the optimizer to be used for this model
@@ -57,22 +72,3 @@ class Seq2Seq(pl.LightningModule):  # noqa
          # The authors used Adam, so we might as well use it as well.
          return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])
 
-
- # for inference
- class Idiomifier:
-
-     def __init__(self, model: Seq2Seq, tokenizer: BartTokenizer):
-         self.model = model
-         self.builder = SourcesBuilder(tokenizer)
-         self.model.eval()
-
-     def __call__(self, src: str, max_length=100) -> str:
-         srcs = self.builder(literal2idiomatic=[(src, "")])
-         pred_ids = self.model.bart.generate(
-             inputs=srcs[:, 0],  # (N, 2, L) -> (N, L)
-             attention_mask=srcs[:, 1],  # (N, 2, L) -> (N, L)
-             decoder_start_token_id=self.model.hparams['bos_token_id'],
-             max_length=max_length,
-         ).squeeze()  # -> (N, L_t) -> (L_t)
-         tgt = self.builder.tokenizer.decode(pred_ids, skip_special_tokens=True)
-         return tgt
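A note on the metric shapes: `torchmetrics.Accuracy` accepts multi-dimensional multi-class input, so the `(N, |V|, L)` logits pair directly with the `(N, L)` targets, and `ignore_index=pad_token_id` keeps padding positions out of the score. A toy sketch (vocab of 5 and pad id 1 are assumed for illustration):

```python
import torch
from torchmetrics import Accuracy

acc = Accuracy(ignore_index=1)          # ignore the pad id
logits = torch.randn(2, 5, 7)           # (N, |V|, L)
targets = torch.randint(0, 5, (2, 7))   # (N, L)
acc.update(logits, target=targets)
print(acc.compute())                    # fraction of non-pad positions predicted correctly
```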
idiomify/paths.py CHANGED
@@ -6,12 +6,12 @@ CONFIG_YAML = ROOT_DIR / "config.yaml"
 
 
  def idioms_dir(ver: str) -> Path:
-     return ARTIFACTS_DIR / f"idioms-{ver}"
+     return ARTIFACTS_DIR / f"idioms_{ver}"
 
 
  def literal2idiomatic(ver: str) -> Path:
-     return ARTIFACTS_DIR / f"literal2idiomatic-{ver}"
+     return ARTIFACTS_DIR / f"literal2idiomatic_{ver}"
 
 
- def seq2seq_dir(ver: str) -> Path:
-     return ARTIFACTS_DIR / f"seq2seq-{ver}"
+ def idiomifier_dir(ver: str) -> Path:
+     return ARTIFACTS_DIR / f"idiomifier_{ver}"
idiomify/pipeline.py ADDED
@@ -0,0 +1,22 @@
+ from typing import List
+ from transformers import BartTokenizer
+ from idiomify.builders import SourcesBuilder
+ from idiomify.models import Idiomifier
+
+
+ class Pipeline:
+
+     def __init__(self, model: Idiomifier, tokenizer: BartTokenizer):
+         self.model = model
+         self.builder = SourcesBuilder(tokenizer)
+
+     def __call__(self, sents: List[str], max_length=100) -> List[str]:
+         srcs = self.builder(literal2idiomatic=[(sent, "") for sent in sents])
+         pred_ids = self.model.bart.generate(
+             inputs=srcs[:, 0],  # (N, 2, L) -> (N, L)
+             attention_mask=srcs[:, 1],  # (N, 2, L) -> (N, L)
+             decoder_start_token_id=self.model.hparams['bos_token_id'],
+             max_length=max_length,
+         )  # -> (N, L_t)
+         tgts = self.builder.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
+         return tgts
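A usage sketch mirroring `main_infer.py` below (assuming the `m-1-2` checkpoint is on wandb); note that calling `model.eval()` now falls to the caller, since `Pipeline.__init__` no longer does it the way the old inference class did:

```python
from transformers import BartTokenizer
from idiomify.fetchers import fetch_config, fetch_idiomifier
from idiomify.pipeline import Pipeline

config = fetch_config()['idiomifier']
model = fetch_idiomifier(config['ver'])
model.eval()  # disable dropout before generating
tokenizer = BartTokenizer.from_pretrained(config['bart'])
pipeline = Pipeline(model, tokenizer)
print(pipeline(["I knew you could do it"]))
```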
idiomify/preprocess.py ADDED
@@ -0,0 +1,31 @@
+ from typing import Tuple
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+
+ def upsample(df: pd.DataFrame, seed: int) -> pd.DataFrame:
+     # TODO: implement upsampling later
+     return df
+
+
+ def cleanse(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     :param df:
+     :return:
+     """
+     # TODO: implement cleansing
+     return df
+
+
+ def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
+     """
+     stratified-split the given df into two df's.
+     """
+     total = len(df)
+     ratio_size = int(total * ratio)
+     other_size = total - ratio_size
+     ratio_df, other_df = train_test_split(df, train_size=ratio_size,
+                                           stratify=df['Idiom'],
+                                           test_size=other_size, random_state=seed,
+                                           shuffle=True)
+     return ratio_df, other_df
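A toy sanity check for `stratified_split`: stratifying on the `Idiom` column keeps each idiom represented in both splits at the given ratio (column names as in the PIE data):

```python
import pandas as pd
from idiomify.preprocess import stratified_split

df = pd.DataFrame({
    "Idiom": ["beat around the bush"] * 10 + ["under the weather"] * 10,
    "Literal_Sent": [f"literal {i}" for i in range(20)],
    "Idiomatic_Sent": [f"idiomatic {i}" for i in range(20)],
})
train_df, test_df = stratified_split(df, ratio=0.8, seed=104)
print(len(train_df), len(test_df))       # 16 4
print(train_df["Idiom"].value_counts())  # 8 of each idiom
```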
main_deploy.py ADDED
@@ -0,0 +1,43 @@
+ """
+ we deploy the pipeline via streamlit.
+ """
+ from typing import Tuple, List
+ import streamlit as st
+ from transformers import BartTokenizer
+ from idiomify.fetchers import fetch_config, fetch_idiomifier, fetch_idioms
+ from idiomify.pipeline import Pipeline
+ from idiomify.models import Idiomifier
+
+
+ @st.cache(allow_output_mutation=True)
+ def fetch_resources() -> Tuple[dict, Idiomifier, BartTokenizer, List[str]]:
+     config = fetch_config()['idiomifier']
+     model = fetch_idiomifier(config['ver'])
+     idioms = fetch_idioms(config['idioms_ver'])
+     tokenizer = BartTokenizer.from_pretrained(config['bart'])
+     return config, model, tokenizer, idioms
+
+
+ def main():
+     # fetch a pre-trained model
+     config, model, tokenizer, idioms = fetch_resources()
+     pipeline = Pipeline(model, tokenizer)
+     st.title("Idiomify Demo")
+     st.markdown(f"Author: `Eu-Bin KIM`")
+     st.markdown(f"Version: `{config['ver']}`")
+     text = st.text_area("Type sentences here",
+                         value="Just remember there will always be a hope even when things look black")
+     with st.sidebar:
+         st.subheader("Supported idioms")
+         st.write(" / ".join(idioms))
+
+     if st.button(label="Idiomify"):
+         with st.spinner("Please wait..."):
+             sents = [sent for sent in text.split(".") if sent]
+             sents = pipeline(sents, max_length=200)
+             # display the idiomified sentences
+             st.write(". ".join(sents))
+
+
+ if __name__ == '__main__':
+     main()
main_eval.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+ import argparse
+ import os
+ import wandb
+ import pytorch_lightning as pl
+ from pytorch_lightning.loggers import WandbLogger
+ from transformers import BartTokenizer
+ from idiomify.datamodules import IdiomifyDataModule
+ from idiomify.fetchers import fetch_config, fetch_idiomifier
+ from idiomify.paths import ROOT_DIR
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--num_workers", type=int, default=os.cpu_count())
+     parser.add_argument("--fast_dev_run", action="store_true", default=False)
+     args = parser.parse_args()
+     config = fetch_config()['idiomifier']
+     config.update(vars(args))
+     tokenizer = BartTokenizer.from_pretrained(config['bart'])
+     # prepare the datamodule
+     with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
+         model = fetch_idiomifier(config['ver'], run)  # fetch a pre-trained model
+         datamodule = IdiomifyDataModule(config, tokenizer, run)
+         logger = WandbLogger(log_model=False)
+         trainer = pl.Trainer(fast_dev_run=config['fast_dev_run'],
+                              gpus=torch.cuda.device_count(),
+                              default_root_dir=str(ROOT_DIR),
+                              logger=logger)
+         trainer.test(model, datamodule)
+
+
+ if __name__ == '__main__':
+     main()
main_infer.py CHANGED
@@ -1,23 +1,26 @@
+ """
+ This is just a simple sanity check on the inference.
+ """
  import argparse
- from idiomify.models import Idiomifier
- from idiomify.fetchers import fetch_config, fetch_seq2seq
+ from idiomify.pipeline import Pipeline
+ from idiomify.fetchers import fetch_config, fetch_idiomifier
  from transformers import BartTokenizer
 
 
  def main():
      parser = argparse.ArgumentParser()
-     parser.add_argument("--ver", type=str, default="tag011")
-     parser.add_argument("--src", type=str,
+     parser.add_argument("--sent", type=str,
                          default="If there's any good to losing my job,"
                                  " it's that I'll now be able to go to school full-time and finish my degree earlier.")
      args = parser.parse_args()
-     config = fetch_config()[args.ver]
+     config = fetch_config()['idiomifier']
      config.update(vars(args))
-     model = fetch_seq2seq(config['ver'])
+     model = fetch_idiomifier(config['ver'])
+     model.eval()  # this is crucial
      tokenizer = BartTokenizer.from_pretrained(config['bart'])
-     idiomifier = Idiomifier(model, tokenizer)
-     src = config['src']
-     tgt = idiomifier(src=config['src'])
+     pipeline = Pipeline(model, tokenizer)
+     src = config['sent']
+     tgt = pipeline(sents=[config['sent']])
      print(src, "\n->", tgt)
 
 
main_train.py CHANGED
@@ -6,29 +6,27 @@ import pytorch_lightning as pl
  from termcolor import colored
  from pytorch_lightning.loggers import WandbLogger
  from transformers import BartTokenizer, BartForConditionalGeneration
- from idiomify.data import IdiomifyDataModule
+ from idiomify.datamodules import IdiomifyDataModule
  from idiomify.fetchers import fetch_config
- from idiomify.models import Seq2Seq
+ from idiomify.models import Idiomifier
  from idiomify.paths import ROOT_DIR
 
 
  def main():
      parser = argparse.ArgumentParser()
-     parser.add_argument("--ver", type=str, default="tag011")
      parser.add_argument("--num_workers", type=int, default=os.cpu_count())
      parser.add_argument("--log_every_n_steps", type=int, default=1)
      parser.add_argument("--fast_dev_run", action="store_true", default=False)
      parser.add_argument("--upload", dest='upload', action='store_true', default=False)
      args = parser.parse_args()
-     config = fetch_config()[args.ver]
+     config = fetch_config()['idiomifier']
      config.update(vars(args))
      if not config['upload']:
          print(colored("WARNING: YOU CHOSE NOT TO UPLOAD. NOTHING BUT LOGS WILL BE SAVED TO WANDB", color="red"))
-
      # prepare the model
      bart = BartForConditionalGeneration.from_pretrained(config['bart'])
      tokenizer = BartTokenizer.from_pretrained(config['bart'])
-     model = Seq2Seq(bart, config['lr'], tokenizer.bos_token_id, tokenizer.pad_token_id)
+     model = Idiomifier(bart, config['lr'], tokenizer.bos_token_id, tokenizer.pad_token_id)
      # prepare the datamodule
      with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
          datamodule = IdiomifyDataModule(config, tokenizer, run)
@@ -46,7 +44,7 @@ def main():
      if not config['fast_dev_run'] and trainer.current_epoch == config['max_epochs'] - 1:
          ckpt_path = ROOT_DIR / "model.ckpt"
          trainer.save_checkpoint(str(ckpt_path))
-         artifact = wandb.Artifact(name="seq2seq", type="model", metadata=config)
+         artifact = wandb.Artifact(name="idiomifier", type="model", metadata=config)
          artifact.add_file(str(ckpt_path))
          run.log_artifact(artifact, aliases=["latest", config['ver']])
          os.remove(str(ckpt_path))  # make sure you remove it after you are done with uploading it
main_upload_idioms.py CHANGED
@@ -1,35 +1,31 @@
  """
- Here, what should you do here?
- just upload all idioms here - name it as epie.
+ will do this when I need to.
+ Is it absolutely necessary to keep track of idioms separately?
  """
  import os
- from idiomify.paths import ROOT_DIR
- from idiomify.fetchers import fetch_pie
- import argparse
  import wandb
+ from idiomify.fetchers import fetch_literal2idiomatic, fetch_config
+ from idiomify.paths import ROOT_DIR
 
 
  def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--ver", type=str, default="tag01")
-     config = vars(parser.parse_args())
-
-     # get the idioms here
-     if config['ver'] == "tag01":
-         # only the first 106, and this is for piloting
-         idioms = set([row[0] for row in fetch_pie()[:106]])
-     else:
-         raise NotImplementedError
-     idioms = list(idioms)
+     config = fetch_config()['idioms']
+     train_df, _ = fetch_literal2idiomatic(config['ver'])
+     idioms = train_df['Idiom'].tolist()
+     idioms = list(set(idioms))
 
-     with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
-         artifact = wandb.Artifact(name="idioms", type="dataset")
+     with wandb.init(entity="eubinecto", project="idiomify") as run:
+         # the path to write the idioms to
          txt_path = ROOT_DIR / "all.txt"
          with open(txt_path, 'w') as fh:
              for idiom in idioms:
                  fh.write(idiom + "\n")
+         artifact = wandb.Artifact(name="idioms", type="dataset", description=config['description'],
+                                   metadata=config)
          artifact.add_file(txt_path)
+         # then, log the artifact
          run.log_artifact(artifact, aliases=["latest", config['ver']])
+         # don't forget to remove the local file
         os.remove(txt_path)
 
 
main_upload_literal2idiomatic.py CHANGED
@@ -1,39 +1,40 @@
  """
- Here, what should you do here?
- just upload all idioms here - name it as epie.
+ literal2idiomatic ver: d-1-2
  """
- import csv
  import os
  from idiomify.paths import ROOT_DIR
- from idiomify.fetchers import fetch_pie
- import argparse
+ from idiomify.fetchers import fetch_pie, fetch_config
+ from idiomify.preprocess import upsample, cleanse, stratified_split
  import wandb
 
 
  def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--ver", type=str, default="tag01")
-     config = vars(parser.parse_args())
 
-     # get the idioms here
-     if config['ver'] == "tag01":
-         # only the first 106, and we use this just for piloting
-         literal2idiom = [
-             (row[3], row[2]) for row in fetch_pie()[:106]
-         ]
-     else:
-         raise NotImplementedError
-
-     with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
-         artifact = wandb.Artifact(name="literal2idiomatic", type="dataset")
-         tsv_path = ROOT_DIR / "all.tsv"
-         with open(tsv_path, 'w') as fh:
-             writer = csv.writer(fh, delimiter="\t")
-             for row in literal2idiom:
-                 writer.writerow(row)
-         artifact.add_file(tsv_path)
+     # here, we use all of the PIE data, split into train & test
+     pie_df = fetch_pie()
+     config = fetch_config()['literal2idiomatic']
+     train_df, test_df = pie_df.pipe(cleanse)\
+         .pipe(upsample, seed=config['seed'])\
+         .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
+     # select only the columns we need (no need for the csv library anymore)
+     train_df = train_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
+     test_df = test_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
+     dfs = (train_df, test_df)
+     with wandb.init(entity="eubinecto", project="idiomify") as run:
+         # the paths to write the datasets to
+         train_path = ROOT_DIR / "train.tsv"
+         test_path = ROOT_DIR / "test.tsv"
+         paths = (train_path, test_path)
+         artifact = wandb.Artifact(name="literal2idiomatic", type="dataset", description=config['description'],
+                                   metadata=config)
+         for tsv_path, df in zip(paths, dfs):
+             df.to_csv(tsv_path, sep="\t")
+             artifact.add_file(tsv_path)
+         # then, log the artifact
          run.log_artifact(artifact, aliases=["latest", config['ver']])
+         # don't forget to remove the local files
+         for tsv_path in paths:
+             os.remove(tsv_path)
 
 
  if __name__ == '__main__':
requirements.txt CHANGED
@@ -1,3 +1,7 @@
  pytorch-lightning==1.5.10
  transformers==4.16.2
- wandb==0.12.10
+ wandb==0.12.10
+ scikit-learn==1.0.2
+ pandas==1.3.5
+ streamlit==1.7.0
+ watchdog==2.1.6