eubinecto committed
Commit 70f038c
2 Parent(s): 927768a 370afc1

Merge pull request #6 from eubinecto/issue-5

config.yaml CHANGED
@@ -12,10 +12,12 @@ idiomifier:
 
 # for building & uploading datasets or tokenizer
 idioms:
-  ver: d-1-2
-  description: the set of idioms in the training set of literal2idiomatic_d-1-2.
+  ver: d-1-3
+  description: the set of idioms in the training set of literal2idiomatic_d-1-3. Their definitions are included as well.
 literal2idiomatic:
-  ver: d-1-2
-  description: PIE data split into train & test sets (80/20 split). There is no validation set because I don't intend to do any hyperparameter tuning.
+  ver: d-1-3
+  description: The idioms are annotated with <idiom> & </idiom>.
   train_ratio: 0.8
   seed: 104
+  boi_token: <idiom>
+  eoi_token: </idiom>
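
The scripts below read this block through fetch_config(); presumably that is little more than a YAML parse of this file. A minimal sketch of what the access looks like, assuming PyYAML and a config.yaml at the repository root (load_config is a hypothetical stand-in for the real fetcher):

import yaml

def load_config(path: str = "config.yaml") -> dict:
    # hypothetical stand-in for idiomify.fetchers.fetch_config
    with open(path, "r") as fh:
        return yaml.safe_load(fh)

config = load_config()["literal2idiomatic"]
print(config["ver"])        # d-1-3
print(config["boi_token"])  # <idiom>
print(config["eoi_token"])  # </idiom>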
explore/explore_fetch_idioms.py CHANGED
@@ -2,7 +2,7 @@ from idiomify.fetchers import fetch_idioms
 
 
 def main():
-    print(fetch_idioms("d-1-2"))
+    print(fetch_idioms("d-1-3"))
 
 
 if __name__ == '__main__':
explore/explore_fetch_pie_annotate.py ADDED
@@ -0,0 +1,14 @@
+
+from idiomify.fetchers import fetch_pie
+from idiomify.preprocess import annotate
+
+
+def main():
+    pie_df = fetch_pie()
+    pie_df = pie_df.pipe(annotate, boi_token="<idiom>", eoi_token="</idiom>")
+    for _, row in pie_df.iterrows():
+        print(row['Idiomatic_Sent'])
+
+
+if __name__ == '__main__':
+    main()
explore/explore_list_index.py ADDED
@@ -0,0 +1,13 @@
+
+
+def main():
+    labels = ["O", "O", "B", "O", "I", "I", "O", "I", "O", "O"]
+    boi_idx = labels.index("B")
+    eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)
+    print(boi_idx, eoi_idx)
+    print(labels[boi_idx])
+    print(labels[eoi_idx])
+
+
+if __name__ == '__main__':
+    main()
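
For this label sequence, labels.index("B") returns 2 and the reversed-list trick returns -3, the position of the last "I" counted from the end: labels[2] is "B" and labels[-3] is "I". These are the same two indices annotate() in idiomify/preprocess.py uses to decide where to splice in the boi/eoi tokens.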
explore/explore_upload_idioms_groupby.py ADDED
@@ -0,0 +1,22 @@
+from idiomify.fetchers import fetch_literal2idiomatic, fetch_config
+
+
+def main():
+    config = fetch_config()['literal2idiomatic']
+    train_df, _ = fetch_literal2idiomatic(config['ver'])
+    idioms_df = train_df[['Idiom', "Sense"]]
+    idioms_df = idioms_df.groupby('Idiom').agg({'Sense': lambda x: list(set(x))})
+    print(idioms_df.head(5))
+    for idx, row in idioms_df.iterrows():
+        print(row['Sense'])
+
+    """
+    ['to arrange something in a manner that either someone will gain a wrong disadvantage or a person would get an unfair advantage']
+    ['Used in general to refer an experience or talent or ability or position, which would be useful or beneficial for a person, his life and his future.']
+    ['to be very easy to see or notice']
+    [' to reach a logical conclusion']
+    ['to start doing something over from the beginning']
+    """
+
+if __name__ == '__main__':
+    main()
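
The groupby/agg pattern above collapses duplicate (Idiom, Sense) rows into one list of unique senses per idiom. A self-contained sketch on made-up rows (the idioms and senses here are illustrative, not taken from PIE):

import pandas as pd

df = pd.DataFrame({
    "Idiom": ["keep an eye on", "keep an eye on", "back to square one"],
    "Sense": ["watch closely", "watch closely", "start over from the beginning"],
})
# duplicate senses collapse into a single-element list per idiom
idioms_df = df.groupby("Idiom").agg({"Sense": lambda x: list(set(x))})
print(idioms_df)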
idiomify/fetchers.py CHANGED
@@ -17,7 +17,7 @@ def fetch_pie() -> pd.DataFrame:
 
 
 # --- from wandb --- #
-def fetch_idioms(ver: str, run: Run = None) -> List[str]:
+def fetch_idioms(ver: str, run: Run = None) -> pd.DataFrame:
     """
     why do you need this? -> you need this to have access to the idiom embeddings.
     """
@@ -28,9 +28,8 @@ def fetch_idioms(ver: str, run: Run = None) -> pd.DataFrame:
     else:
         artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
     artifact_dir = artifact.download(root=idioms_dir(ver))
-    txt_path = path.join(artifact_dir, "all.txt")
-    with open(txt_path, 'r') as fh:
-        return [line.strip() for line in fh]
+    tsv_path = path.join(artifact_dir, "all.tsv")
+    return pd.read_csv(tsv_path, sep="\t")
 
 
 def fetch_literal2idiomatic(ver: str, run: Run = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
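
One caveat for callers of the new fetch_idioms: to_csv serializes the list-valued Sense column as its Python repr (e.g. "['to be very easy to see or notice']"), so pd.read_csv hands back strings rather than lists. A sketch of recovering the lists on the caller's side, assuming the all.tsv layout produced by main_upload_idioms.py below:

import ast
from idiomify.fetchers import fetch_idioms

idioms_df = fetch_idioms("d-1-3")
# parse the stringified lists back into real lists
idioms_df["Sense"] = idioms_df["Sense"].apply(ast.literal_eval)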
idiomify/preprocess.py CHANGED
@@ -17,6 +17,38 @@ def cleanse(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def annotate(df: pd.DataFrame, boi_token: str, eoi_token: str) -> pd.DataFrame:
+    """
+    e.g. given a row like this:
+    Idiom            keep an eye on
+    Sense            keep a watch on something or someone closely
+    Idiomatic_Sent   He had put on a lot of weight lately , so he started keeping an eye on what he ate .
+    Literal_Sent     He had put on a lot of weight lately , so he started to watch what he ate .
+    Idiomatic_Label  O O O O O O O O O O O O O B I I O O O O O
+    Literal_Label    O O O O O O O O O O O O O B I O O O O
+
+    use Idiomatic_Label to rewrite Idiomatic_Sent as:
+    He had put on a lot of weight lately , so he started <idiom> keeping an eye on </idiom> what he ate .
+    """
+    for idx, row in df.iterrows():
+        tokens = row['Idiomatic_Sent'].split(" ")
+        labels = row["Idiomatic_Label"].split(" ")
+        if "B" in labels:
+            boi_idx = labels.index("B")
+            if "I" in labels:
+                # multi-token idiom: the last "I" marks where it ends
+                eoi_idx = -1 * (list(reversed(labels)).index("I") + 1)
+                tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]}"
+                tokens[eoi_idx] = f"{tokens[eoi_idx]} {eoi_token}"
+            else:
+                # single-token idiom: wrap the "B" token on both sides
+                tokens[boi_idx] = f"{boi_token} {tokens[boi_idx]} {eoi_token}"
+        # write back with .at; mutating the row yielded by iterrows()
+        # only changes a copy and leaves df itself untouched
+        df.at[idx, 'Idiomatic_Sent'] = " ".join(tokens)
+    return df
+
+
 def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     stratified-split the given df into two df's.
@@ -29,3 +61,4 @@ def stratified_split(df: pd.DataFrame, ratio: float, seed: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
                                          test_size=other_size, random_state=seed,
                                          shuffle=True)
     return ratio_df, other_df
+
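
To make the splice concrete, a minimal round trip over a single-row frame; the sentence mirrors the docstring example, and only the annotate import is assumed:

import pandas as pd
from idiomify.preprocess import annotate

df = pd.DataFrame([{
    "Idiomatic_Sent": "he started keeping an eye on what he ate .",
    "Idiomatic_Label": "O O B I I I O O O O",
}])
df = annotate(df, boi_token="<idiom>", eoi_token="</idiom>")
print(df.at[0, "Idiomatic_Sent"])
# he started <idiom> keeping an eye on </idiom> what he ate .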
main_upload_idioms.py CHANGED
@@ -11,22 +11,20 @@ from idiomify.paths import ROOT_DIR
 def main():
     config = fetch_config()['idioms']
     train_df, _ = fetch_literal2idiomatic(config['ver'])
-    idioms = train_df['Idiom'].tolist()
-    idioms = list(set(idioms))
+    idioms_df = train_df[['Idiom', "Sense"]]
+    idioms_df = idioms_df.groupby('Idiom').agg({'Sense': lambda x: list(set(x))})
 
     with wandb.init(entity="eubinecto", project="idiomify") as run:
         # the paths to write datasets in
-        txt_path = ROOT_DIR / "all.txt"
-        with open(txt_path, 'w') as fh:
-            for idiom in idioms:
-                fh.write(idiom + "\n")
+        tsv_path = ROOT_DIR / "all.tsv"
+        idioms_df.to_csv(tsv_path, sep="\t")
         artifact = wandb.Artifact(name="idioms", type="dataset", description=config['description'],
                                   metadata=config)
-        artifact.add_file(txt_path)
+        artifact.add_file(tsv_path)
         # then, we just log them here.
         run.log_artifact(artifact, aliases=["latest", config['ver']])
         # don't forget to remove them
-        os.remove(txt_path)
+        os.remove(tsv_path)
 
 
 if __name__ == '__main__':
main_upload_literal2idiomatic.py CHANGED
@@ -4,7 +4,7 @@ literal2idiomatic ver: d-1-2
 import os
 from idiomify.paths import ROOT_DIR
 from idiomify.fetchers import fetch_pie, fetch_config
-from idiomify.preprocess import upsample, cleanse, stratified_split
+from idiomify.preprocess import upsample, cleanse, stratified_split, annotate
 import wandb
 
 
@@ -15,10 +15,11 @@ def main():
     config = fetch_config()['literal2idiomatic']
     train_df, test_df = pie_df.pipe(cleanse)\
                               .pipe(upsample, seed=config['seed'])\
+                              .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
                               .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
     # no need for the csv library here; just select the columns we want.
-    train_df = train_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
-    test_df = test_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
+    train_df = train_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
+    test_df = test_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
     dfs = (train_df, test_df)
     with wandb.init(entity="eubinecto", project="idiomify") as run:
         # the paths to write datasets in
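
Note the ordering in the pipe chain: annotate runs before stratified_split, so the <idiom> & </idiom> markers end up in both the train and test splits, and pulling boi_token & eoi_token from the config keeps the markers in step with the d-1-3 version recorded there.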