Merge pull request #6 from eubinecto/issue-5
Files changed:

- config.yaml (+7 -5)
- explore/explore_fetch_idioms.py (+1 -1)
- explore/explore_fetch_pie_annotate.py (+14 -0)
- explore/explore_list_index.py (+13 -0)
- explore/explore_upload_idioms_groupby.py (+22 -0)
- idiomify/fetchers.py (+3 -4)
- idiomify/preprocess.py (+31 -0)
- main_upload_idioms.py (+6 -8)
- main_upload_literal2idiomatic.py (+4 -3)
config.yaml (CHANGED)

```diff
@@ -12,10 +12,12 @@ idiomifier:
 
 # for building & uploading datasets or tokenizer
 idioms:
-  ver: d-1-2
-  description: the set of idioms in the training set of literal2idiomatic_d-1-2
+  ver: d-1-3
+  description: the set of idioms in the training set of literal2idiomatic_d-1-3. Definitions of them are added as well.
 literal2idiomatic:
-  ver: d-1-2
-  description: …
+  ver: d-1-3
+  description: The idioms are annotated with <idiom> & </idiom>.
   train_ratio: 0.8
-  seed: 104
+  seed: 104
+  boi_token: <idiom>
+  eoi_token: </idiom>
```
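The two new `boi_token` / `eoi_token` keys make the idiom markers configurable instead of hard-coding `<idiom>` / `</idiom>` at every call site. A minimal sketch of consuming them, assuming `fetch_config()` is a thin wrapper around PyYAML (an assumption — the actual implementation is not part of this diff):

```python
# Hypothetical stand-in for fetch_config(): parse config.yaml directly.
import yaml

with open("config.yaml") as fh:
    config = yaml.safe_load(fh)

l2i = config["literal2idiomatic"]
print(l2i["ver"])        # d-1-3
print(l2i["boi_token"])  # <idiom>
print(l2i["eoi_token"])  # </idiom>
```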
explore/explore_fetch_idioms.py (CHANGED)

```diff
@@ -2,7 +2,7 @@ from idiomify.fetchers import fetch_idioms
 
 
 def main():
-    print(fetch_idioms("d-1-2"))
+    print(fetch_idioms("d-1-3"))
 
 
 if __name__ == '__main__':
```
explore/explore_fetch_pie_annotate.py (ADDED)

```python
from idiomify.fetchers import fetch_pie
from preprocess import annotate


def main():
    pie_df = fetch_pie()
    pie_df = pie_df.pipe(annotate, boi_token="<idiom>", eoi_token="</idiom>")
    for _, row in pie_df.iterrows():
        print(row['Idiomatic_Sent'])


if __name__ == '__main__':
    main()
```
explore/explore_list_index.py (ADDED)

```python
def main():
    labels = ["O", "O", "B", "O", "I", "I", "O", "I", "O", "O"]
    boi_idx = labels.index("B")
    eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)
    print(boi_idx, eoi_idx)
    print(labels[boi_idx])
    print(labels[eoi_idx])


if __name__ == '__main__':
    main()
```
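This script checks the indexing trick that `annotate` in idiomify/preprocess.py relies on: `labels.index("B")` finds the first B-tag from the left, and reversing the list turns the position of the last I-tag into a negative index that is still valid on the original list. A worked check of that identity (a standalone sketch, not part of the PR):

```python
labels = ["O", "O", "B", "O", "I", "I", "O", "I", "O", "O"]

boi_idx = labels.index("B")                             # 2, the first "B"
# a reversed position r corresponds to -(r + 1) on the original list
eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)  # -3, the last "I"

assert labels[boi_idx] == "B"
assert labels[eoi_idx] == "I"
assert eoi_idx == 7 - len(labels)  # i.e. the last "I" sits at index 7
```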
explore/explore_upload_idioms_groupby.py (ADDED)

```python
from idiomify.fetchers import fetch_literal2idiomatic, fetch_config


def main():
    config = fetch_config()['literal2idiomatic']
    train_df, _ = fetch_literal2idiomatic(config['ver'])
    idioms_df = train_df[['Idiom', "Sense"]]
    idioms_df = idioms_df.groupby('Idiom').agg({'Sense': lambda x: list(set(x))})
    print(idioms_df.head(5))
    for idx, row in idioms_df.iterrows():
        print(row['Sense'])

    """
    ['to arrange something in a manner that either someone will gain a wrong disadvantage or a person would get an unfair advantage']
    ['Used in general to refer an experience or talent or ability or position, which would be useful or beneficial for a person, his life and his future.']
    ['to be very easy to see or notice']
    [' to reach a logical conclusion']
    ['to start doing something over from the beginning']
    """


if __name__ == '__main__':
    main()
```
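The `groupby` + `agg` pair collapses the sentence-level rows of the training set into one row per idiom, deduplicating the repeated senses via `set`. A self-contained toy version of the same aggregation (made-up data):

```python
import pandas as pd

df = pd.DataFrame({
    "Idiom": ["stick out", "stick out", "stick out"],
    "Sense": ["to be very easy to see or notice",
              "to be very easy to see or notice",
              "to be noticeable"],
})
# one row per idiom; duplicated senses collapse into a deduplicated list
idioms_df = df.groupby("Idiom").agg({"Sense": lambda x: list(set(x))})
print(idioms_df.loc["stick out", "Sense"])
# ['to be very easy to see or notice', 'to be noticeable']  (order may vary)
```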
idiomify/fetchers.py (CHANGED)

```diff
@@ -17,7 +17,7 @@ def fetch_pie() -> pd.DataFrame:
 
 
 # --- from wandb --- #
-def fetch_idioms(ver: str, run: Run = None) -> List[str]:
+def fetch_idioms(ver: str, run: Run = None) -> pd.DataFrame:
     """
     why do you need this? -> you need this to have access to the idiom embeddings.
     """
@@ -28,9 +28,8 @@ def fetch_idioms(ver: str, run: Run = None) -> List[str]:
     else:
         artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
     artifact_dir = artifact.download(root=idioms_dir(ver))
-    …
-    …
-    return [line.strip() for line in fh]
+    tsv_path = path.join(artifact_dir, "all.tsv")
+    return pd.read_csv(tsv_path, sep="\t")
 
 
 def fetch_literal2idiomatic(ver: str, run: Run = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
```
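One thing to watch in the new TSV round trip: `main_upload_idioms.py` writes a `Sense` column whose cells are Python lists, and `DataFrame.to_csv` serializes those cells as their `repr` strings, so the frame returned by `fetch_idioms` carries strings rather than lists. If real lists are needed downstream, they have to be re-parsed; the parsing step below is a suggestion, not part of this PR:

```python
import ast
import pandas as pd

# toy stand-in for the uploaded all.tsv
df = pd.DataFrame({"Idiom": ["keep an eye on"],
                   "Sense": [["keep a watch on something or someone closely"]]})
df.to_csv("all.tsv", sep="\t")

loaded = pd.read_csv("all.tsv", sep="\t")
print(type(loaded["Sense"][0]))  # <class 'str'>: the list came back as its repr

# re-parse the repr string into an actual list
loaded["Sense"] = loaded["Sense"].apply(ast.literal_eval)
print(type(loaded["Sense"][0]))  # <class 'list'>
```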
idiomify/preprocess.py (CHANGED)

```diff
@@ -17,6 +17,36 @@ def cleanse(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def annotate(df: pd.DataFrame, boi_token: str, eoi_token: str) -> pd.DataFrame:
+    """
+    e.g.
+    given a row like this:
+    Idiom            keep an eye on
+    Sense            keep a watch on something or someone closely
+    Idiomatic_Sent   He had put on a lot of weight lately , so he started keeping an eye on what he ate .
+    Literal_Sent     He had put on a lot of weight lately , so he started to watch what he ate .
+    Idiomatic_Label  O O O O O O O O O O O O O B I I O O O O O
+    Literal_Label    O O O O O O O O O O O O O B I O O O O
+
+    use Idiomatic_Label to replace Idiomatic_Sent with:
+    He had put on a lot of weight lately , so he started <idiom> keeping an eye on </idiom> what he ate .
+    """
+    for idx, row in df.iterrows():
+        tokens = row['Idiomatic_Sent'].split(" ")
+        labels = row["Idiomatic_Label"].split(" ")
+        if "B" in labels:
+            boi_idx = labels.index("B")
+            if "I" in labels:
+                eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)
+                tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]}"
+                tokens[eoi_idx] = f"{tokens[eoi_idx]} {eoi_token}"
+            else:
+                tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]} {eoi_token}"
+            row['Idiomatic_Sent'] = " ".join(tokens)
+
+    return df
+
+
 def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     stratified-split the given df into two df's.
@@ -29,3 +59,4 @@ def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
                                          test_size=other_size, random_state=seed,
                                          shuffle=True)
     return ratio_df, other_df
+
```
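The splicing logic is easiest to sanity-check in isolation. Below is a self-contained sketch of the same B/I logic on a single token/label pair, without depending on the idiomify package (names here are illustrative):

```python
def annotate_tokens(tokens, labels, boi_token="<idiom>", eoi_token="</idiom>"):
    # same idea as annotate: the first "B" opens the idiom, the last "I" closes it
    if "B" in labels:
        boi_idx = labels.index("B")
        if "I" in labels:
            # negative index of the last "I", valid on the original list
            eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)
            tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]}"
            tokens[eoi_idx] = f"{tokens[eoi_idx]} {eoi_token}"
        else:
            # single-token idiom: wrap it on both sides
            tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]} {eoi_token}"
    return " ".join(tokens)


sent = "so he started keeping an eye on what he ate ."
labels = "O O O B I I I O O O O".split(" ")
print(annotate_tokens(sent.split(" "), labels))
# so he started <idiom> keeping an eye on </idiom> what he ate .
```

One caveat worth flagging: pandas documents `iterrows` as yielding copies, so whether `row['Idiomatic_Sent'] = ...` actually writes back into `df` deserves a test; `df.at[idx, 'Idiomatic_Sent'] = ...` would be the unambiguous way to persist the edit.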
main_upload_idioms.py (CHANGED)

```diff
@@ -11,22 +11,20 @@ from idiomify.paths import ROOT_DIR
 def main():
     config = fetch_config()['idioms']
     train_df, _ = fetch_literal2idiomatic(config['ver'])
-    …
-    …
+    idioms_df = train_df[['Idiom', "Sense"]]
+    idioms_df = idioms_df.groupby('Idiom').agg({'Sense': lambda x: list(set(x))})
 
     with wandb.init(entity="eubinecto", project="idiomify") as run:
         # the paths to write datasets in
-        …
-        …
-        for idiom in idioms:
-            fh.write(idiom + "\n")
+        tsv_path = ROOT_DIR / "all.tsv"
+        idioms_df.to_csv(tsv_path, sep="\t")
         artifact = wandb.Artifact(name="idioms", type="dataset", description=config['description'],
                                   metadata=config)
-        artifact.add_file(…)
+        artifact.add_file(tsv_path)
         # then, we just log them here.
         run.log_artifact(artifact, aliases=["latest", config['ver']])
         # don't forget to remove them
-        os.remove(…)
+        os.remove(tsv_path)
 
 
 if __name__ == '__main__':
```
main_upload_literal2idiomatic.py (CHANGED)

```diff
@@ -4,7 +4,7 @@ literal2idiomatic ver: d-1-2
 import os
 from idiomify.paths import ROOT_DIR
 from idiomify.fetchers import fetch_pie, fetch_config
-from idiomify.preprocess import upsample, cleanse, stratified_split
+from idiomify.preprocess import upsample, cleanse, stratified_split, annotate
 import wandb
 
 
@@ -15,10 +15,11 @@ def main():
     config = fetch_config()['literal2idiomatic']
     train_df, test_df = pie_df.pipe(cleanse)\
                               .pipe(upsample, seed=config['seed'])\
+                              .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
                               .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
     # why don't you just "select" the columns? yeah, stop using csv library. just select them.
-    train_df = train_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
-    test_df = test_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
+    train_df = train_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
+    test_df = test_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
     dfs = (train_df, test_df)
     with wandb.init(entity="eubinecto", project="idiomify") as run:
         # the paths to write datasets in
```
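For readers new to the `.pipe` chain: `df.pipe(f, **kwargs)` is simply `f(df, **kwargs)`, which is what lets `annotate` slot in between `upsample` and `stratified_split` without nesting calls. A tiny illustration with a made-up step:

```python
import pandas as pd


def append_token(df: pd.DataFrame, token: str) -> pd.DataFrame:
    # toy stand-in for a step like annotate; returns a new frame
    return df.assign(Sent=df["Sent"] + " " + token)


df = pd.DataFrame({"Sent": ["hello"]})
# df.pipe(f, **kwargs) is equivalent to f(df, **kwargs)
assert df.pipe(append_token, token="!").equals(append_token(df, token="!"))
```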