eubinecto committed on
Commit 207cddf
1 Parent(s): 3646bbf

saving this branch

explore/explore_fetch_epie.py ADDED
@@ -0,0 +1,27 @@
+
+ from idiomify.fetchers import fetch_epie
+
+
+ def main():
+     epie = fetch_epie()
+     idioms = set([
+         idiom
+         for idiom, _, _ in epie
+     ])
+
+     # so, what do you want? do you want to build an idiom-masked language model?
+     for idiom, context, tag in epie:
+         print(context)
+
+     for idx, idiom in enumerate(idioms):
+         print(idx, idiom)
+
+     # isn't it better to just leave the idiom there, and have the model guess what it means?
+     # in that case, it may be better to use a generative model.
+     # but what would happen if you let it just guess?
+     # the problem with non-masking is that you give the model the answer.
+     # what you should rather do is something like... find similar words.
+
+
+ if __name__ == '__main__':
+     main()
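A rough sketch of the "idiom-masked language modelling" idea mused about in the comments above, not part of the commit: mask out the idiom span of each context and let a masked LM fill it back in. It assumes the EPIE tag lines are token-level BIO tags in which non-idiom tokens are tagged "O"; the tokenizer checkpoint is also an arbitrary choice.

from transformers import BertTokenizer
from idiomify.fetchers import fetch_epie


def mask_idiom(context: str, tag_line: str, mask_token: str) -> str:
    # replace every token that is tagged as part of an idiom with the mask token
    words = context.split(" ")
    tags = tag_line.split(" ")
    return " ".join(mask_token if tag != "O" else word
                    for word, tag in zip(words, tags))


if __name__ == '__main__':
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
    for idiom, context, tag_line in fetch_epie()[:3]:
        print(idiom, "->", mask_idiom(context, tag_line, tokenizer.mask_token))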
explore/explore_fetch_epie_counts.py ADDED
@@ -0,0 +1,20 @@
+
+ from idiomify.fetchers import fetch_epie
+
+
+ def main():
+     # fetch_epie yields (idiom, context, tag) triples
+     idioms = set([
+         idiom
+         for idiom, _, _ in fetch_epie()
+     ])
+     contexts = [
+         context
+         for _, context, _ in fetch_epie()
+     ]
+     print("Total number of idioms:", len(idioms))
+     # what I need for now is to build a datamodule out of this
+     print("Total number of contexts:", len(contexts))
+
+
+ if __name__ == '__main__':
+     main()
explore/explore_idiom2subwords.py ADDED
File without changes
idiomify/builders.py ADDED
@@ -0,0 +1,84 @@
+ """
+ All the classes for building tensors are defined here.
+ A builder is initialised with a tokenizer and returns a tensor when called.
+ """
+ import torch
+ from typing import List, Tuple
+ from transformers import BertTokenizer
+
+
+ class TensorBuilder:
+
+     def __init__(self, tokenizer: BertTokenizer):
+         self.tokenizer = tokenizer
+
+     def __call__(self, *args, **kwargs) -> torch.Tensor:
+         raise NotImplementedError
+
+
+ class Idiom2SubwordsBuilder(TensorBuilder):
+
+     def __call__(self, idioms: List[str], k: int) -> torch.Tensor:
+         mask_id = self.tokenizer.mask_token_id
+         pad_id = self.tokenizer.pad_token_id
+         # split the idioms into words, so the tokenizer does not treat them as single tokens
+         idioms = [idiom.split(" ") for idiom in idioms]
+         encodings = self.tokenizer(text=idioms,
+                                    add_special_tokens=False,
+                                    # must be True, as the idioms are already split into words
+                                    is_split_into_words=True,
+                                    padding='max_length',
+                                    max_length=k,  # pad or truncate to k subwords
+                                    return_tensors="pt")
+         input_ids = encodings['input_ids']
+         input_ids[input_ids == pad_id] = mask_id  # replace the padding tokens with masks
+         return input_ids
+
+
+ class Idiom2DefBuilder(TensorBuilder):
+
+     def __call__(self, idiom2def: List[Tuple[str, str]], k: int) -> torch.Tensor:
+         defs = [definition for _, definition in idiom2def]
+         lefts = [" ".join(["[MASK]"] * k)] * len(defs)
+         encodings = self.tokenizer(text=lefts,
+                                    text_pair=defs,
+                                    return_tensors="pt",
+                                    add_special_tokens=True,
+                                    truncation=True,
+                                    padding=True,
+                                    verbose=True)
+         input_ids: torch.Tensor = encodings['input_ids']
+         cls_id: int = self.tokenizer.cls_token_id
+         sep_id: int = self.tokenizer.sep_token_id
+         mask_id: int = self.tokenizer.mask_token_id
+         wisdom_mask = torch.where(input_ids == mask_id, 1, 0)
+         desc_mask = torch.where(((input_ids != cls_id) & (input_ids != sep_id) & (input_ids != mask_id)), 1, 0)
+         return torch.stack([input_ids,
+                             encodings['token_type_ids'],
+                             encodings['attention_mask'],
+                             wisdom_mask,
+                             desc_mask], dim=1)
+
+
+ class Idiom2ContextBuilder(TensorBuilder):
+
+     def __call__(self, idiom2context: List[Tuple[str, str]]) -> torch.Tensor:
+         contexts = [context for _, context in idiom2context]
+         encodings = self.tokenizer(text=contexts,
+                                    return_tensors="pt",
+                                    add_special_tokens=True,
+                                    truncation=True,
+                                    padding=True,
+                                    verbose=True)
+         return torch.stack([encodings['input_ids'],
+                             encodings['token_type_ids'],
+                             encodings['attention_mask']], dim=1)
+
+
+ class TargetsBuilder(TensorBuilder):
+
+     def __call__(self, idiom2sent: List[Tuple[str, str]], idioms: List[str]) -> torch.Tensor:
+         return torch.LongTensor([
+             idioms.index(idiom)
+             for idiom, _ in idiom2sent
+         ])
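A minimal usage sketch of the builders above, not part of the commit; the tokenizer checkpoint, the example idioms and the value of k are assumptions chosen for illustration.

from transformers import BertTokenizer
from idiomify.builders import Idiom2SubwordsBuilder, Idiom2ContextBuilder, TargetsBuilder

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
idioms = ["beat around the bush", "under the weather"]          # assumed label space
idiom2context = [("under the weather", "I am feeling a bit under the weather today.")]

idiom2subwords = Idiom2SubwordsBuilder(tokenizer)(idioms, k=5)   # (|W|, K)
X = Idiom2ContextBuilder(tokenizer)(idiom2context)               # (N, 3, L)
y = TargetsBuilder(tokenizer)(idiom2context, idioms)             # (N,)
print(idiom2subwords.shape, X.shape, y.shape)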
idiomify/datamodules.py CHANGED
@@ -2,8 +2,8 @@ import torch
  from typing import Tuple, Optional, List
  from torch.utils.data import Dataset, DataLoader
  from pytorch_lightning import LightningDataModule
- from idiomify.fetchers import fetch_idiom2def
- from idiomify import tensors as T
+ from idiomify.fetchers import fetch_idiom2def, fetch_epie
+ from idiomify.builders import Idiom2DefBuilder, Idiom2ContextBuilder, TargetsBuilder
  from transformers import BertTokenizer


@@ -30,7 +30,7 @@ class IdiomifyDataset(Dataset):
          return self.X[idx], self.y[idx]


- class IdiomifyDataModule(LightningDataModule):
+ class Idiom2DefDataModule(LightningDataModule):

      # boilerplate - just ignore these
      def test_dataloader(self):
@@ -66,10 +66,50 @@ class IdiomifyDataModule(LightningDataModule):
          """
          # --- set up the builders --- #
          # build the datasets
-         X = T.inputs(self.idiom2def, self.tokenizer, self.config['k'])
-         y = T.targets(self.idiom2def, self.idioms)
+         X = Idiom2DefBuilder(self.tokenizer)(self.idiom2def, self.config['k'])
+         y = TargetsBuilder(self.tokenizer)(self.idiom2def, self.idioms)
          self.dataset = IdiomifyDataset(X, y)

      def train_dataloader(self) -> DataLoader:
          return DataLoader(self.dataset, batch_size=self.config['batch_size'],
                            shuffle=self.config['shuffle'], num_workers=self.config['num_workers'])
+
+
+ class Idiom2ContextsDataModule(LightningDataModule):
+
+     # boilerplate - just ignore these
+     def test_dataloader(self):
+         pass
+
+     def val_dataloader(self):
+         pass
+
+     def predict_dataloader(self):
+         pass
+
+     def __init__(self, config: dict, tokenizer: BertTokenizer, idioms: List[str]):
+         super().__init__()
+         self.config = config
+         self.tokenizer = tokenizer
+         self.idioms = idioms
+         self.idiom2context: Optional[List[Tuple[str, str]]] = None
+         self.dataset: Optional[IdiomifyDataset] = None
+
+     def prepare_data(self):
+         """
+         prepare: download all the data needed for this module.
+         """
+         # fetch_epie yields (idiom, context, tag) triples
+         self.idiom2context = [
+             (idiom, context)
+             for idiom, context, _ in fetch_epie()
+         ]
+
+     def setup(self, stage: Optional[str] = None):
+         # build the datasets
+         X = Idiom2ContextBuilder(self.tokenizer)(self.idiom2context)
+         y = TargetsBuilder(self.tokenizer)(self.idiom2context, self.idioms)
+         self.dataset = IdiomifyDataset(X, y)
+
+     def train_dataloader(self):
+         return DataLoader(self.dataset, batch_size=self.config['batch_size'],
+                           shuffle=self.config['shuffle'], num_workers=self.config['num_workers'])
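A minimal sketch of how the new Idiom2ContextsDataModule could be driven, not part of the commit; the config values and the tokenizer checkpoint are assumptions.

from transformers import BertTokenizer
from idiomify.datamodules import Idiom2ContextsDataModule
from idiomify.fetchers import fetch_epie

config = {"batch_size": 32, "shuffle": True, "num_workers": 2}   # assumed values
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # assumed checkpoint
idioms = sorted({idiom for idiom, _, _ in fetch_epie()})         # the label space for TargetsBuilder

datamodule = Idiom2ContextsDataModule(config, tokenizer, idioms)
datamodule.prepare_data()   # pulls the EPIE triples
datamodule.setup()
for X, y in datamodule.train_dataloader():
    print(X.shape, y.shape)  # (batch_size, 3, L), (batch_size,)
    break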
idiomify/fetchers.py CHANGED
@@ -1,13 +1,47 @@
  import csv
  import yaml
  import wandb
+ import requests
  from typing import Tuple, List
- from idiomify.models import Alpha, Gamma, RD
+
+ from wandb.sdk.wandb_run import Run
+
+ from idiomify.models import Alpha, RD
  from idiomify.paths import idiom2def_dir, CONFIG_YAML, idioms_dir, alpha_dir
- from idiomify import tensors as T
+ from idiomify.urls import (
+     EPIE_IMMUTABLE_IDIOMS_URL,
+     EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL,
+     EPIE_IMMUTABLE_IDIOMS_TAGS_URL,
+     EPIE_MUTABLE_IDIOMS_URL,
+     EPIE_MUTABLE_IDIOMS_CONTEXTS_URL,
+     EPIE_MUTABLE_IDIOMS_TAGS_URL
+ )
+ from idiomify.builders import Idiom2SubwordsBuilder
  from transformers import AutoModelForMaskedLM, AutoConfig, BertTokenizer


+ # sources for the dataset
+ def fetch_epie() -> List[Tuple[str, str, str]]:
+     # returns (idiom, context, tag) triples from both the immutable ("static")
+     # and the mutable ("formal") portions of the EPIE corpus
+     idioms = requests.get(EPIE_IMMUTABLE_IDIOMS_URL).text \
+              + requests.get(EPIE_MUTABLE_IDIOMS_URL).text
+     contexts = requests.get(EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL).text \
+                + requests.get(EPIE_MUTABLE_IDIOMS_CONTEXTS_URL).text
+     tags = requests.get(EPIE_IMMUTABLE_IDIOMS_TAGS_URL).text \
+            + requests.get(EPIE_MUTABLE_IDIOMS_TAGS_URL).text
+     return list(zip(idioms.strip().split("\n"),
+                     contexts.strip().split("\n"),
+                     tags.strip().split("\n")))
+
+
+ # you should somehow get this from wandb (not implemented yet)
+ def fetch_idiom2context(ver: str, run: Run = None) -> List[Tuple[str, str]]:
+     """
+     include run if you want to track the lineage
+     """
+     if run:
+         pass
+
+
  # dataset
  def fetch_idiom2def(ver: str) -> List[Tuple[str, str]]:
      artifact = wandb.Api().artifact(f"eubinecto/idiomify-demo/idiom2def:{ver}", type="dataset")
@@ -45,7 +79,7 @@ def fetch_rd(model: str, ver: str) -> RD:
      ckpt_path = artifact_path / "rd.ckpt"
      idioms = fetch_idioms(config['idioms_ver'])
      tokenizer = BertTokenizer.from_pretrained(config['bert'])
-     idiom2subwords = T.idiom2subwords(idioms, tokenizer, config['k'])
+     idiom2subwords = Idiom2SubwordsBuilder(tokenizer)(idioms, config['k'])
      if model == Alpha.name():
          rd = Alpha.load_from_checkpoint(str(ckpt_path), mlm=mlm, idiom2subwords=idiom2subwords)
      elif model == Gamma.name():
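fetch_idiom2context is still a stub. A hedged sketch of how it could mirror fetch_idiom2def once an idiom2context artifact exists on wandb; the artifact name, the tsv filename and the file layout are assumptions, not part of the commit.

import csv
import wandb
from typing import List, Tuple
from wandb.sdk.wandb_run import Run


def fetch_idiom2context_sketch(ver: str, run: Run = None) -> List[Tuple[str, str]]:
    name = f"eubinecto/idiomify-demo/idiom2context:{ver}"
    if run:
        # fetching through the run records the artifact in the run's lineage
        artifact = run.use_artifact(name, type="dataset")
    else:
        artifact = wandb.Api().artifact(name, type="dataset")
    artifact_dir = artifact.download()
    with open(f"{artifact_dir}/idiom2context.tsv") as fh:
        reader = csv.reader(fh, delimiter="\t")
        return [(row[0], row[1]) for row in reader]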
idiomify/models.py CHANGED
@@ -174,101 +174,3 @@ class Alpha(RD):
          H_k = self.H_k(H_all)  # (N, L, H) -> (N, K, H)
          S_wisdom = self.S_wisdom_literal(H_k)  # (N, K, H) -> (N, |W|)
          return S_wisdom
-
-
- class BiLSTMPooler(torch.nn.Module):
-     def __init__(self, hidden_size: int):
-         super().__init__()
-         self.lstm = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size // 2, batch_first=True,
-                                   num_layers=1, bidirectional=True)
-
-     def forward(self, X: torch.Tensor) -> torch.Tensor:
-         hiddens, _ = self.lstm(X)
-         return hiddens[:, -1]
-
-
- class Gamma(RD):
-     """
-     @eubinecto
-     S_wisdom = S_wisdom_literal + S_wisdom_figurative
-     but the way we get S_wisdom_figurative is much simplified, compared with RDBeta.
-     """
-
-     def __init__(self, mlm: BertForMaskedLM, idiom2subwords: torch.Tensor, k: int, lr: float):
-         super().__init__(mlm, idiom2subwords, k, lr)
-         # a BiLSTM pooler that pools wisdom_embeddings out of idiom2subwords_embeddings
-         self.pooler = BiLSTMPooler(self.mlm.config.hidden_size)
-         # --- to be used to compute attentions --- #
-         self.attention_mask: Optional[torch.Tensor] = None
-
-     def forward(self, X: torch.Tensor) -> torch.Tensor:
-         """
-         :param X: (N, 5, L);
-         (num samples, 0=input_ids/1=token_type_ids/2=attention_mask/3=wisdom_mask/4=desc_mask, the maximum length)
-         :return: (N, L, H); (num samples, the maximum length, the hidden size)
-         """
-         input_ids = X[:, 0]  # (N, 5, L) -> (N, L)
-         token_type_ids = X[:, 1]  # (N, 5, L) -> (N, L)
-         self.attention_mask = X[:, 2]  # (N, 5, L) -> (N, L)
-         self.wisdom_mask = X[:, 3]  # (N, 5, L) -> (N, L)
-         self.desc_mask = X[:, 4]  # (N, 5, L) -> (N, L)
-         H_all = self.mlm.bert.forward(input_ids, self.attention_mask, token_type_ids)[0]  # -> (N, L, H)
-         return H_all
-
-     def H_desc_attention_mask(self, attention_mask: torch.Tensor) -> torch.Tensor:
-         """
-         this is needed to mask the padding tokens
-         :param attention_mask: (N, L)
-         """
-         N, L = attention_mask.size()
-         H_desc_attention_mask = torch.masked_select(attention_mask, self.desc_mask.bool())
-         H_desc_attention_mask = H_desc_attention_mask.reshape(N, L - (self.hparams['k'] + 3))
-         return H_desc_attention_mask
-
-     def S_wisdom(self, H_all: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-         S_wisdom_literal = self.S_wisdom_literal(self.H_k(H_all))
-         S_wisdom_figurative = self.S_wisdom_figurative(H_all)
-         S_wisdom = S_wisdom_literal + S_wisdom_figurative
-         return S_wisdom, S_wisdom_literal, S_wisdom_figurative
-
-     def S_wisdom_figurative(self, H_all: torch.Tensor) -> torch.Tensor:
-         # --- draw the embeddings for wisdoms from the embeddings of idiom2subwords --- #
-         # this is to use as few newly initialised weights as possible
-         idiom2subwords_embeddings = self.mlm.bert \
-             .embeddings.word_embeddings(self.idiom2subwords)  # (W, K) -> (W, K, H)
-         wisdom_embeddings = self.pooler(idiom2subwords_embeddings).squeeze()  # (W, K, H) -> (W, H)
-         # --- draw H_wisdom from H_desc with attention --- #
-         H_cls = H_all[:, 0]  # (N, L, H) -> (N, H)
-         H_desc = self.H_desc(H_all)  # (N, L, H) -> (N, D, H)
-         H_desc_attention_mask = self.H_desc_attention_mask(self.attention_mask)  # (N, L) -> (N, D)
-         scores = torch.einsum("...h,...dh->...d", H_cls, H_desc)  # (N, D)
-         # ignore the padding tokens
-         scores = torch.masked_fill(scores, H_desc_attention_mask != 1, float("-inf"))  # (N, D)
-         attentions = torch.softmax(scores, dim=1)  # softmax over D
-         H_wisdom = torch.einsum("...d,...dh->...h", attentions, H_desc)  # -> (N, H)
-         # --- now compare H_wisdom with all the wisdoms --- #
-         S_wisdom_figurative = torch.einsum("...h,wh->...w", H_wisdom, wisdom_embeddings)  # (N, H) * (W, H) -> (N, W)
-         return S_wisdom_figurative
-
-     def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> dict:
-         X, y = batch
-         H_all = self.forward(X)  # -> (N, L, H)
-         S_wisdom, S_wisdom_literal, S_wisdom_figurative = self.S_wisdom(H_all)  # (N, L, H) -> (N, |W|)
-         loss_all = F.cross_entropy(S_wisdom, y).sum()  # (N, |W|), (N,) -> (1,)
-         loss_literal = F.cross_entropy(S_wisdom_literal, y).sum()  # (N, |W|), (N,) -> (1,)
-         loss_figurative = F.cross_entropy(S_wisdom_figurative, y).sum()  # (N, |W|), (N,) -> (1,)
-         loss = loss_all + loss_literal + loss_figurative  # unweighted multi-task learning
-         return {
-             # you cannot change the keyword for the loss
-             "loss": loss,
-         }
-
-     def P_wisdom(self, X: torch.Tensor) -> torch.Tensor:
-         """
-         :param X: (N, 5, L)
-         :return P_wisdom: (N, |W|), normalized over dim 1.
-         """
-         H_all = self.forward(X)  # -> (N, L, H)
-         S_wisdom, _, _ = self.S_wisdom(H_all)  # (N, L, H) -> (N, W)
-         P_wisdom = F.softmax(S_wisdom, dim=1)  # (N, W) -> (N, W)
-         return P_wisdom
idiomify/tensors.py DELETED
@@ -1,56 +0,0 @@
- """
- all the functions for building tensors are defined here.
- builders must accept device as one of the parameters.
- """
- import torch
- from typing import List, Tuple
- from transformers import BertTokenizer
-
-
- def idiom2subwords(idioms: List[str], tokenizer: BertTokenizer, k: int) -> torch.Tensor:
-     mask_id = tokenizer.mask_token_id
-     pad_id = tokenizer.pad_token_id
-     # temporarily disable single-token status of the idioms
-     idioms = [idiom.split(" ") for idiom in idioms]
-     encodings = tokenizer(text=idioms,
-                           add_special_tokens=False,
-                           # should set this to True, as we already have the idioms split.
-                           is_split_into_words=True,
-                           padding='max_length',
-                           max_length=k,  # set to k
-                           return_tensors="pt")
-     input_ids = encodings['input_ids']
-     input_ids[input_ids == pad_id] = mask_id  # replace them with masks
-     return input_ids
-
-
- def inputs(idiom2def: List[Tuple[str, str]], tokenizer: BertTokenizer, k: int) -> torch.Tensor:
-     defs = [definition for _, definition in idiom2def]
-     lefts = [" ".join(["[MASK]"] * k)] * len(defs)
-     encodings = tokenizer(text=lefts,
-                           text_pair=defs,
-                           return_tensors="pt",
-                           add_special_tokens=True,
-                           truncation=True,
-                           padding=True,
-                           verbose=True)
-     input_ids: torch.Tensor = encodings['input_ids']
-     cls_id: int = tokenizer.cls_token_id
-     sep_id: int = tokenizer.sep_token_id
-     mask_id: int = tokenizer.mask_token_id
-
-     wisdom_mask = torch.where(input_ids == mask_id, 1, 0)
-     desc_mask = torch.where(((input_ids != cls_id) & (input_ids != sep_id) & (input_ids != mask_id)), 1, 0)
-     return torch.stack([input_ids,
-                         encodings['token_type_ids'],
-                         encodings['attention_mask'],
-                         wisdom_mask,
-                         desc_mask], dim=1)
-
-
- def targets(idiom2def: List[Tuple[str, str]], idioms: List[str]) -> torch.Tensor:
-     return torch.LongTensor([
-         idioms.index(idiom)
-         for idiom, _ in idiom2def
-     ])
-
idiomify/urls.py ADDED
@@ -0,0 +1,11 @@
+ # EPIE dataset
+ EPIE_IMMUTABLE_IDIOMS_TAGS_URL = "https://raw.githubusercontent.com/prateeksaxena2809/EPIE_Corpus/master/Static_Idioms_Corpus/Static_Idioms_Tags.txt"  # noqa
+ EPIE_IMMUTABLE_IDIOMS_URL = "https://raw.githubusercontent.com/prateeksaxena2809/EPIE_Corpus/master/Static_Idioms_Corpus/Static_Idioms_Candidates.txt"  # noqa
+ EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL = "https://raw.githubusercontent.com/prateeksaxena2809/EPIE_Corpus/master/Static_Idioms_Corpus/Static_Idioms_Words.txt"  # noqa
+ EPIE_MUTABLE_IDIOMS_TAGS_URL = "https://raw.githubusercontent.com/prateeksaxena2809/EPIE_Corpus/master/Formal_Idioms_Corpus/Formal_Idioms_Tags.txt"  # noqa
+ EPIE_MUTABLE_IDIOMS_URL = "https://raw.githubusercontent.com/prateeksaxena2809/EPIE_Corpus/master/Formal_Idioms_Corpus/Formal_Idioms_Candidates.txt"  # noqa
+ # this must point at the raw file (not the github "blob" page), otherwise fetch_epie parses HTML
+ EPIE_MUTABLE_IDIOMS_CONTEXTS_URL = "https://raw.githubusercontent.com/prateeksaxena2809/EPIE_Corpus/master/Formal_Idioms_Corpus/Formal_Idioms_Words.txt"  # noqa
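A quick sanity check worth running against these URLs, not part of the commit: fetch_epie zips the three files of each split line by line, and zip silently truncates to the shortest input, so the files should be line-aligned.

import requests
from idiomify.urls import (
    EPIE_IMMUTABLE_IDIOMS_URL,
    EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL,
    EPIE_IMMUTABLE_IDIOMS_TAGS_URL,
)


def n_lines(url: str) -> int:
    return len(requests.get(url).text.strip().split("\n"))


# the candidates, words and tags files of the static split should have the same length
assert n_lines(EPIE_IMMUTABLE_IDIOMS_URL) \
    == n_lines(EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL) \
    == n_lines(EPIE_IMMUTABLE_IDIOMS_TAGS_URL)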
main_train.py CHANGED
@@ -6,7 +6,7 @@ import pytorch_lightning as pl
  from pytorch_lightning.loggers import WandbLogger
  from termcolor import colored
  from transformers import BertForMaskedLM, BertTokenizer
- from idiomify.datamodules import IdiomifyDataModule
+ from idiomify.datamodules import Idiom2DefDataModule
  from idiomify.fetchers import fetch_config, fetch_idioms
  from idiomify.models import Alpha, Gamma
  from idiomify.paths import ROOT_DIR
@@ -40,7 +40,7 @@ def main():
      else:
          raise ValueError
      # prepare datamodule
-     datamodule = IdiomifyDataModule(config, tokenizer, idioms)
+     datamodule = Idiom2DefDataModule(config, tokenizer, idioms)

      with wandb.init(entity="eubinecto", project="idiomify-demo", config=config) as run:
          logger = WandbLogger(log_model=False)
main_upload_idiom2context.py ADDED
@@ -0,0 +1,11 @@
+ """
+ Build and upload an idiom2context dataset to wandb.
+ """
+
+
+ def main():
+     pass
+
+
+ if __name__ == '__main__':
+     main()
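A hedged sketch of what main() could eventually do, not part of the commit: build the (idiom, context) pairs from EPIE and log them as a wandb artifact. The artifact name, the tsv layout and the job_type are assumptions modelled on the existing eubinecto/idiomify-demo datasets.

import csv
import wandb
from idiomify.fetchers import fetch_epie


def main():
    # fetch_epie yields (idiom, context, tag) triples; keep only (idiom, context)
    idiom2context = [(idiom, context) for idiom, context, _ in fetch_epie()]
    with wandb.init(entity="eubinecto", project="idiomify-demo", job_type="upload") as run:
        artifact = wandb.Artifact("idiom2context", type="dataset")
        with artifact.new_file("idiom2context.tsv", mode="w") as fh:
            writer = csv.writer(fh, delimiter="\t")
            writer.writerows(idiom2context)
        run.log_artifact(artifact)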
main_upload_idioms.py ADDED
@@ -0,0 +1,13 @@
+ """
+ Build & upload the idioms to wandb. Here,
+ ver a: compatible with the first version
+ ver b:
+ """
+
+
+ def main():
+     pass
+
+
+ if __name__ == '__main__':
+     main()
main_upload_tokenizer.py ADDED
@@ -0,0 +1,13 @@
+ """
+ Build & upload a tokenizer to wandb.
+ You need this if you want to add more tokens to it.
+ """
+
+
+ def main():
+     # TODO: fetch the dataset from wandb first!
+     pass
+
+
+ if __name__ == '__main__':
+     main()
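A hedged sketch of what this script might grow into, not part of the commit: register each idiom as an extra token and push the tokenizer as a wandb artifact. The tokenizer checkpoint, the idioms version "a" and the artifact name are assumptions.

import wandb
from transformers import BertTokenizer
from idiomify.fetchers import fetch_idioms


def main():
    idioms = fetch_idioms("a")  # assumed version
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
    tokenizer.add_tokens(idioms)  # each idiom becomes a single token
    with wandb.init(entity="eubinecto", project="idiomify-demo", job_type="upload") as run:
        artifact = wandb.Artifact("tokenizer", type="other")
        tokenizer.save_pretrained("tokenizer")  # writes vocab.txt & configs locally
        artifact.add_dir("tokenizer")
        run.log_artifact(artifact)

Note that any model trained with the extra tokens would also need its embeddings resized (resize_token_embeddings) before use.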
requirements.txt CHANGED
@@ -1,66 +1,3 @@
- absl-py==1.0.0
- aiohttp==3.8.1
- aiosignal==1.2.0
- async-timeout==4.0.2
- attrs==21.4.0
- cachetools==4.2.4
- certifi==2021.10.8
- charset-normalizer==2.0.10
- click==8.0.3
- configparser==5.2.0
- docker-pycreds==0.4.0
- filelock==3.4.2
- frozenlist==1.3.0
- fsspec==2022.1.0
- future==0.18.2
- gitdb==4.0.9
- GitPython==3.1.26
- google-auth==2.3.3
- google-auth-oauthlib==0.4.6
- grpcio==1.43.0
- huggingface-hub==0.4.0
- idna==3.3
- importlib-metadata==4.10.1
- joblib==1.1.0
- Markdown==3.3.6
- multidict==5.2.0
- numpy==1.22.1
- oauthlib==3.1.1
- packaging==21.3
- pathtools==0.1.2
- promise==2.3
- protobuf==3.19.3
- psutil==5.9.0
- pyasn1==0.4.8
- pyasn1-modules==0.2.8
- pyDeprecate==0.3.1
- pyparsing==3.0.6
- python-dateutil==2.8.2
- pytorch-lightning==1.5.8
- PyYAML==6.0
- regex==2022.1.18
- requests==2.27.1
- requests-oauthlib==1.3.0
- rsa==4.8
- sacremoses==0.0.47
- sentry-sdk==1.5.2
- shortuuid==1.0.8
- six==1.16.0
- smmap==5.0.0
- subprocess32==3.5.4
- tensorboard==2.7.0
- tensorboard-data-server==0.6.1
- tensorboard-plugin-wit==1.8.1
- termcolor==1.1.0
- tokenizers==0.10.3
- torch==1.10.1
- torchmetrics==0.7.0
- tqdm==4.62.3
- transformers==4.15.0
- typing_extensions==4.0.1
- urllib3==1.26.8
- wandb==0.12.9
- Werkzeug==2.0.2
- yarl==1.7.2
- yaspin==2.1.0
- zipp==3.7.0
+ pytorch-lightning==1.5.10
+ transformers==4.16.2
+ wandb==0.12.10