eubinecto committed
Commit 642d911
1 Parent(s): f49863b

[#2] evaluating m-1-2 works. config.yaml simplified.

config.yaml CHANGED
@@ -1,20 +1,19 @@
-train:
+idiomifier:
   ver: m-1-2
   desc: just overfitting the model, but on the entire PIE dataset.
   bart: facebook/bart-base
   lr: 0.0001
   literal2idiomatic_ver: d-1-2
-  max_epochs: 100
-  batch_size: 100
+  max_epochs: 2
+  batch_size: 40
   shuffle: true

-# for building & uploading datasets or others
-upload:
-  idioms:
-    ver: d-1-2
-    description: the set of idioms in the training set of literal2idiomatic_d-1-2.
-  literal2idiomatic:
-    ver: d-1-2
-    description: PIE data split into train & test set (80 / 20 split). There is no validation set because I don't intend to do any hyperparameter tuning on this thing.
-    train_ratio: 0.8
-    seed: 104
+# for building & uploading datasets or tokenizer
+idioms:
+  ver: d-1-2
+  description: the set of idioms in the training set of literal2idiomatic_d-1-2.
+literal2idiomatic:
+  ver: d-1-2
+  description: PIE data split into train & test set (80 / 20 split). There is no validation set because I don't intend to do any hyperparameter tuning on this thing.
+  train_ratio: 0.8
+  seed: 104
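With the upload: level gone, every script now indexes the config by a single top-level key. fetch_config itself is not touched by this diff; a minimal sketch of what it presumably does (the path resolution here is an assumption, not taken from the repository):

import yaml
from pathlib import Path


def fetch_config() -> dict:
    # assumed location: config.yaml at the repository root
    config_yaml = Path(__file__).resolve().parent.parent / "config.yaml"
    with open(config_yaml) as fh:
        return yaml.safe_load(fh)


# flat lookups after this commit:
# fetch_config()['idiomifier'], fetch_config()['idioms'], fetch_config()['literal2idiomatic']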
explore/explore_bart_logits_shape.py CHANGED
@@ -1,6 +1,6 @@
 from transformers import BartTokenizer, BartForConditionalGeneration

-from data import IdiomifyDataModule
+from datamodules import IdiomifyDataModule


 CONFIG = {
explore/explore_idiomifydatamodule.py CHANGED
@@ -1,5 +1,5 @@
 from transformers import BartTokenizer
-from idiomify.data import IdiomifyDataModule
+from idiomify.datamodules import IdiomifyDataModule


 CONFIG = {
idiomify/{data.py → datamodules.py} RENAMED
@@ -84,6 +84,6 @@ class IdiomifyDataModule(LightningDataModule):
         return DataLoader(self.train_dataset, batch_size=self.config['batch_size'],
                           shuffle=self.config['shuffle'], num_workers=self.config['num_workers'])

-    def test_dataloader(self):
+    def test_dataloader(self) -> DataLoader:
         return DataLoader(self.test_dataset, batch_size=self.config['batch_size'],
                           shuffle=False, num_workers=self.config['num_workers'])
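For reference, the datamodule is built from the same config dict plus a tokenizer and an optional wandb run, as main_eval.py below does. A minimal construction sketch; the exact set of required config keys beyond those visible in this diff and in config.yaml is an assumption:

from transformers import BartTokenizer
from idiomify.datamodules import IdiomifyDataModule

# keys mirror config.yaml above; num_workers normally comes from the CLI
config = {"literal2idiomatic_ver": "d-1-2", "batch_size": 40,
          "shuffle": True, "num_workers": 2}
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
datamodule = IdiomifyDataModule(config, tokenizer, None)  # run=None: sketch only
datamodule.prepare_data()  # standard LightningDataModule lifecycle
datamodule.setup()
loader = datamodule.test_dataloader()  # now annotated to return a DataLoader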
idiomify/fetchers.py CHANGED
@@ -53,9 +53,9 @@ def fetch_idiomifier(ver: str, run: Run = None) -> Idiomifier:
     The current Idiomifier then turns into a pipeline.
     """
     if run:
-        artifact = run.use_artifact(f"seq2seq:{ver}", type="model")
+        artifact = run.use_artifact(f"idiomifier:{ver}", type="model")
     else:
-        artifact = wandb.Api().artifact(f"eubinecto/idiomify/seq2seq:{ver}", type="model")
+        artifact = wandb.Api().artifact(f"eubinecto/idiomify/idiomifier:{ver}", type="model")
     config = artifact.metadata
     artifact_dir = artifact.download(root=seq2seq_dir(ver))
     ckpt_path = path.join(artifact_dir, "model.ckpt")
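Both branches are exercised by the scripts in this commit; roughly, the two call sites look like this (config keys as in config.yaml above):

import wandb
from idiomify.fetchers import fetch_config, fetch_idiomifier

config = fetch_config()['idiomifier']

# inside an active run (main_eval.py): artifact usage is recorded on the run
with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
    model = fetch_idiomifier(config['ver'], run)

# outside a run (main_infer.py): falls back to wandb.Api()
model = fetch_idiomifier(config['ver'])
model.eval()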
idiomify/models.py CHANGED
@@ -48,19 +48,19 @@ class Idiomifier(pl.LightningModule):  # noqa
             "loss": loss
         }

-    def on_train_batch_end(self, outputs: dict, **kwargs):
+    def on_train_batch_end(self, outputs: dict, *args, **kwargs):
         self.log("Train/Loss", outputs['loss'])

     def on_train_epoch_end(self, *args, **kwargs) -> None:
         self.log("Train/Accuracy", self.acc_train.compute())
         self.acc_train.reset()

-    def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], **kwargs):
+    def test_step(self, batch: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], *args, **kwargs):
         srcs, tgts_r, tgts = batch  # (N, 2, L_s), (N, 2, L_t), (N, 2, L_t)
         logits = self.forward(srcs, tgts_r).transpose(1, 2)  # ... -> (N, L, |V|) -> (N, |V|, L)
         self.acc_test.update(logits.detach(), target=tgts.detach())

-    def on_test_end(self):
+    def on_test_epoch_end(self, *args, **kwargs) -> None:
         self.log("Test/Accuracy", self.acc_test.compute())
         self.acc_test.reset()

@@ -72,21 +72,3 @@ class Idiomifier(pl.LightningModule):  # noqa
         # The authors used Adam, so we might as well use it as well.
         return torch.optim.AdamW(self.parameters(), lr=self.hparams['lr'])

-
-# for inference
-class Pipeline:
-
-    def __init__(self, model: Idiomifier, tokenizer: BartTokenizer):
-        self.model = model
-        self.builder = SourcesBuilder(tokenizer)
-
-    def __call__(self, src: str, max_length=100) -> str:
-        srcs = self.builder(literal2idiomatic=[(src, "")])
-        pred_ids = self.model.bart.generate(
-            inputs=srcs[:, 0],  # (N, 2, L) -> (N, L)
-            attention_mask=srcs[:, 1],  # (N, 2, L) -> (N, L)
-            decoder_start_token_id=self.model.hparams['bos_token_id'],
-            max_length=max_length,
-        ).squeeze()  # -> (N, L_t) -> (L_t)
-        tgt = self.builder.tokenizer.decode(pred_ids, skip_special_tokens=True)
-        return tgt
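Two things motivate these hunks: PyTorch Lightning passes extra positional arguments to these hooks (batch and batch_idx to on_train_batch_end, batch_idx to test_step), which a bare **kwargs cannot absorb, and self.log is not supported from on_test_end, hence the move to on_test_epoch_end. The accuracy bookkeeping follows the usual torchmetrics update/compute/reset cycle; a self-contained sketch, with vocabulary size and shapes illustrative only:

import torch
from torchmetrics import Accuracy

# torchmetrics >= 0.11 API shown; older releases configured this differently
acc = Accuracy(task="multiclass", num_classes=100)
logits = torch.randn(2, 100, 7)          # (N, |V|, L): class dim second, as after transpose(1, 2)
targets = torch.randint(0, 100, (2, 7))  # (N, L)
acc.update(logits, target=targets)       # accumulate statistics batch by batch
print(acc.compute())                     # aggregate once per epoch, as in on_test_epoch_end
acc.reset()                              # clear state for the next epoch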
idiomify/pipeline.py ADDED
@@ -0,0 +1,24 @@
+
+# for inference
+from transformers import BartTokenizer
+
+from builders import SourcesBuilder
+from models import Idiomifier
+
+
+class Pipeline:
+
+    def __init__(self, model: Idiomifier, tokenizer: BartTokenizer):
+        self.model = model
+        self.builder = SourcesBuilder(tokenizer)
+
+    def __call__(self, src: str, max_length=100) -> str:
+        srcs = self.builder(literal2idiomatic=[(src, "")])
+        pred_ids = self.model.bart.generate(
+            inputs=srcs[:, 0],  # (N, 2, L) -> (N, L)
+            attention_mask=srcs[:, 1],  # (N, 2, L) -> (N, L)
+            decoder_start_token_id=self.model.hparams['bos_token_id'],
+            max_length=max_length,
+        ).squeeze()  # -> (N, L_t) -> (L_t)
+        tgt = self.builder.tokenizer.decode(pred_ids, skip_special_tokens=True)
+        return tgt
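A usage sketch, mirroring main_infer.py below. The import path idiomify.pipeline is assumed from the file's new location (main_infer.py in this commit still imports Pipeline from idiomify.models), and the input sentence is illustrative:

from transformers import BartTokenizer
from idiomify.fetchers import fetch_config, fetch_idiomifier
from idiomify.pipeline import Pipeline  # assumed path after the move

config = fetch_config()['idiomifier']
model = fetch_idiomifier(config['ver'])
model.eval()  # crucial: generation should not run with dropout active
tokenizer = BartTokenizer.from_pretrained(config['bart'])
pipeline = Pipeline(model, tokenizer)
print(pipeline(src="He criticized the plan without having read it."))  # -> idiomatic paraphrase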
main_eval.py CHANGED
@@ -5,22 +5,22 @@ import wandb
 import pytorch_lightning as pl
 from pytorch_lightning.loggers import WandbLogger
 from transformers import BartTokenizer
-from idiomify.data import IdiomifyDataModule
+from idiomify.datamodules import IdiomifyDataModule
 from idiomify.fetchers import fetch_config, fetch_idiomifier
-from paths import ROOT_DIR
+from idiomify.paths import ROOT_DIR


 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--num_workers", type=int, default=os.cpu_count())
+    parser.add_argument("--fast_dev_run", action="store_true", default=False)
     args = parser.parse_args()
-    config = fetch_config()['train']
+    config = fetch_config()['idiomifier']
     config.update(vars(args))
-    # prepare the model
     tokenizer = BartTokenizer.from_pretrained(config['bart'])
     # prepare the datamodule
     with wandb.init(entity="eubinecto", project="idiomify", config=config) as run:
-        model = fetch_idiomifier(config['ver'], run)
+        model = fetch_idiomifier(config['ver'], run)  # fetch a pre-trained model
         datamodule = IdiomifyDataModule(config, tokenizer, run)
         logger = WandbLogger(log_model=False)
         trainer = pl.Trainer(fast_dev_run=config['fast_dev_run'],
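The hunk cuts off at the Trainer construction; the remaining arguments and the test call are not shown in this diff, but the standard Lightning tail for a script of this shape would be:

        # sketch of the unshown tail; actual Trainer arguments may differ
        trainer = pl.Trainer(fast_dev_run=config['fast_dev_run'],
                             logger=logger)
        trainer.test(model=model, datamodule=datamodule)  # drives test_step / on_test_epoch_end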
main_infer.py CHANGED
@@ -1,5 +1,5 @@
 import argparse
-from idiomify.models import Idiomifier, Pipeline
+from idiomify.models import Pipeline
 from idiomify.fetchers import fetch_config, fetch_idiomifier
 from transformers import BartTokenizer

@@ -10,14 +10,14 @@ def main():
                         default="If there's any good to loosing my job,"
                                 " it's that I'll now be able to go to school full-time and finish my degree earlier.")
     args = parser.parse_args()
-    config = fetch_config()['infer']
+    config = fetch_config()['idiomifier']
     config.update(vars(args))
     model = fetch_idiomifier(config['ver'])
     model.eval()  # this is crucial
     tokenizer = BartTokenizer.from_pretrained(config['bart'])
-    idiomifier = Pipeline(model, tokenizer)
+    pipeline = Pipeline(model, tokenizer)
     src = config['src']
-    tgt = idiomifier(src=config['src'])
+    tgt = pipeline(src=config['src'])
     print(src, "\n->", tgt)

main_train.py CHANGED
@@ -6,7 +6,7 @@ import pytorch_lightning as pl
 from termcolor import colored
 from pytorch_lightning.loggers import WandbLogger
 from transformers import BartTokenizer, BartForConditionalGeneration
-from idiomify.data import IdiomifyDataModule
+from idiomify.datamodules import IdiomifyDataModule
 from idiomify.fetchers import fetch_config
 from idiomify.models import Idiomifier
 from idiomify.paths import ROOT_DIR
@@ -19,7 +19,7 @@ def main():
     parser.add_argument("--fast_dev_run", action="store_true", default=False)
     parser.add_argument("--upload", dest='upload', action='store_true', default=False)
     args = parser.parse_args()
-    config = fetch_config()['train']
+    config = fetch_config()['idiomifier']
     config.update(vars(args))
     if not config['upload']:
         print(colored("WARNING: YOU CHOSE NOT TO UPLOAD. NOTHING BUT LOGS WILL BE SAVED TO WANDB", color="red"))
main_upload_idioms.py CHANGED
@@ -9,7 +9,7 @@ from idiomify.paths import ROOT_DIR


 def main():
-    config = fetch_config()['upload']['idioms']
+    config = fetch_config()['idioms']
     train_df, _ = fetch_literal2idiomatic(config['ver'])
     idioms = train_df['Idiom'].tolist()
     idioms = list(set(idioms))
main_upload_literal2idiomatic.py CHANGED
@@ -12,7 +12,7 @@ def main():

     # here, we use all of them, while splitting them into train & test
     pie_df = fetch_pie()
-    config = fetch_config()['upload']['literal2idiomatic']
+    config = fetch_config()['literal2idiomatic']
     train_df, test_df = pie_df.pipe(cleanse)\
         .pipe(upsample, seed=config['seed'])\
         .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])