[#5] idioms:d-1-3 is done. Added multiple senses of the same idiom as a list
- config.yaml +2 -2
- explore/explore_fetch_idioms.py +1 -1
- explore/explore_upload_idioms_groupby.py +22 -0
- idiomify/fetchers.py +3 -4
- main_upload_idioms.py +6 -8
- main_upload_literal2idiomatic.py +2 -2
config.yaml
CHANGED
@@ -12,8 +12,8 @@ idiomifier:
 
 # for building & uploading datasets or tokenizer
 idioms:
-  ver: d-1-
-  description: the set of idioms in the traning set of literal2idiomatic_d-1-
+  ver: d-1-3
+  description: the set of idioms in the traning set of literal2idiomatic_d-1-3. Definitions of them are added as well.
 literal2idiomatic:
   ver: d-1-3
   description: The idioms are annotated with <idiom> & </idiom>.
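For orientation, the ver fields above are what the upload and fetch scripts below read via fetch_config(). A minimal sketch of how such a loader could work, assuming fetch_config simply parses config.yaml with PyYAML (the repo's actual implementation may differ):

import yaml  # PyYAML

def fetch_config() -> dict:
    # parse config.yaml at the project root into a nested dict
    with open("config.yaml") as fh:
        return yaml.safe_load(fh)

# e.g. fetch_config()['idioms']['ver'] == 'd-1-3'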
explore/explore_fetch_idioms.py
CHANGED
@@ -2,7 +2,7 @@ from idiomify.fetchers import fetch_idioms
 
 
 def main():
-    print(fetch_idioms("d-1-
+    print(fetch_idioms("d-1-3"))
 
 
 if __name__ == '__main__':
explore/explore_upload_idioms_groupby.py
ADDED
@@ -0,0 +1,22 @@
+from idiomify.fetchers import fetch_literal2idiomatic, fetch_config
+
+
+def main():
+    config = fetch_config()['literal2idiomatic']
+    train_df, _ = fetch_literal2idiomatic(config['ver'])
+    idioms_df = train_df[['Idiom', "Sense"]]
+    idioms_df = idioms_df.groupby('Idiom').agg({'Sense': lambda x: list(set(x))})
+    print(idioms_df.head(5))
+    for idx, row in idioms_df.iterrows():
+        print(row['Sense'])
+
+    """
+    ['to arrange something in a manner that either someone will gain a wrong disadvantage or a person would get an unfair advantage']
+    ['Used in general to refer an experience or talent or ability or position, which would be useful or beneficial for a person, his life and his future.']
+    ['to be very easy to see or notice']
+    [' to reach a logical conclusion']
+    ['to start doing something over from the beginning']
+    """
+
+if __name__ == '__main__':
+    main()
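The core of this exploration is the groupby-aggregate step: rows sharing an idiom collapse into one row whose Sense cell is a de-duplicated list, which is how "multiple senses of the same idiom" become a list. A self-contained sketch with made-up toy data:

import pandas as pd

train_df = pd.DataFrame({
    "Idiom": ["call it a day", "call it a day", "stick out like a sore thumb"],
    "Sense": ["to stop working", "to stop working", "to be very easy to see or notice"],
})
# set() drops duplicate senses per idiom; list() keeps the cell easy to serialise
idioms_df = train_df.groupby("Idiom").agg({"Sense": lambda x: list(set(x))})
print(idioms_df)  # one row per idiom, each holding a list of its distinct senses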
idiomify/fetchers.py
CHANGED
@@ -17,7 +17,7 @@ def fetch_pie() -> pd.DataFrame:
 
 
 # --- from wandb --- #
-def fetch_idioms(ver: str, run: Run = None) -> List[str]:
+def fetch_idioms(ver: str, run: Run = None) -> pd.DataFrame:
     """
     why do you need this? -> you need this to have access to the idiom embeddings.
     """
@@ -28,9 +28,8 @@ def fetch_idioms(ver: str, run: Run = None) -> List[str]:
     else:
         artifact = wandb.Api().artifact(f"eubinecto/idiomify/idioms:{ver}", type="dataset")
         artifact_dir = artifact.download(root=idioms_dir(ver))
-
-
-        return [line.strip() for line in fh]
+    tsv_path = path.join(artifact_dir, "all.tsv")
+    return pd.read_csv(tsv_path, sep="\t")
 
 
 def fetch_literal2idiomatic(ver: str, run: Run = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
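One caveat with the DataFrame-returning fetch_idioms: pandas writes a list-valued Sense cell as its string repr, and read_csv returns it as a plain string rather than a list, so a consumer that wants the actual list back has to parse it, e.g. with ast.literal_eval. A sketch of the round trip (illustration only, not code from this commit):

import ast
import pandas as pd

df = pd.DataFrame({"Idiom": ["call it a day"], "Sense": [["to stop working"]]})
df.to_csv("all.tsv", sep="\t", index=False)
df = pd.read_csv("all.tsv", sep="\t")
print(type(df["Sense"][0]))              # <class 'str'>, not a list
print(ast.literal_eval(df["Sense"][0]))  # ['to stop working'] as a real list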
main_upload_idioms.py
CHANGED
@@ -11,22 +11,20 @@ from idiomify.paths import ROOT_DIR
 def main():
     config = fetch_config()['idioms']
     train_df, _ = fetch_literal2idiomatic(config['ver'])
-
-
+    idioms_df = train_df[['Idiom', "Sense"]]
+    idioms_df = idioms_df.groupby('Idiom').agg({'Sense': lambda x: list(set(x))})
 
     with wandb.init(entity="eubinecto", project="idiomify") as run:
         # the paths to write datasets in
-
-
-        for idiom in idioms:
-            fh.write(idiom + "\n")
+        tsv_path = ROOT_DIR / "all.tsv"
+        idioms_df.to_csv(tsv_path, sep="\t")
         artifact = wandb.Artifact(name="idioms", type="dataset", description=config['description'],
                                   metadata=config)
-        artifact.add_file(
+        artifact.add_file(tsv_path)
         # then, we just log them here.
         run.log_artifact(artifact, aliases=["latest", config['ver']])
         # don't forget to remove them
-        os.remove(
+        os.remove(tsv_path)
 
 
 if __name__ == '__main__':
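The write-upload-remove pattern above works, but the trailing os.remove(tsv_path) is easy to forget. A hypothetical alternative, not part of this commit, is to stage the file in a temporary directory that cleans itself up; the wandb calls mirror the ones in main_upload_idioms.py:

import tempfile
from pathlib import Path

import pandas as pd
import wandb

def upload_idioms(idioms_df: pd.DataFrame, ver: str, description: str) -> None:
    with tempfile.TemporaryDirectory() as tmp_dir:
        tsv_path = Path(tmp_dir) / "all.tsv"
        idioms_df.to_csv(tsv_path, sep="\t")
        with wandb.init(entity="eubinecto", project="idiomify") as run:
            artifact = wandb.Artifact(name="idioms", type="dataset", description=description)
            artifact.add_file(str(tsv_path))
            run.log_artifact(artifact, aliases=["latest", ver])
        # the temporary directory (and all.tsv with it) is removed here automatically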
main_upload_literal2idiomatic.py
CHANGED
@@ -18,8 +18,8 @@ def main():
         .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
         .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
     # why don't you just "select" the columns? yeah, stop using csv library. just select them.
-    train_df = train_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
-    test_df = test_df[["Idiom", "Literal_Sent", "Idiomatic_Sent"]]
+    train_df = train_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
+    test_df = test_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
     dfs = (train_df, test_df)
     with wandb.init(entity="eubinecto", project="idiomify") as run:
         # the paths to write datasets in
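stratified_split itself is outside this diff; for context, a hypothetical implementation with the same signature could shuffle once and take the first ratio share of each idiom's rows for the train set, so that every idiom appears on both sides of the split (a sketch, not the repo's code):

import pandas as pd

def stratified_split(df: pd.DataFrame, ratio: float, seed: int):
    shuffled = df.sample(frac=1.0, random_state=seed)  # one global shuffle
    # per idiom, the first `ratio` share goes to train
    train_df = shuffled.groupby("Idiom", group_keys=False)\
                       .apply(lambda g: g.head(max(1, int(len(g) * ratio))))
    test_df = shuffled.drop(train_df.index)
    return train_df, test_df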