File size: 1,769 Bytes
e9d1a5a
e3c7b5a
e9d1a5a
 
 
e3c7b5a
fca50f9
e9d1a5a
 
 
 
 
e3c7b5a
 
642d911
e3c7b5a
 
fca50f9
e3c7b5a
 
370afc1
 
e3c7b5a
 
 
 
 
 
 
 
 
 
 
 
e9d1a5a
e3c7b5a
 
 
e9d1a5a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""
literal2idiomatic ver: d-1-2
"""
import os
from idiomify.paths import ROOT_DIR
from idiomify.fetchers import fetch_pie, fetch_config
from idiomify.preprocess import upsample, cleanse, stratified_split, annotate
import wandb


def main():

    # here, we use all of them, while splitting them into train & test
    pie_df = fetch_pie()
    config = fetch_config()['literal2idiomatic']
    train_df, test_df = pie_df.pipe(cleanse)\
                              .pipe(upsample, seed=config['seed'])\
                              .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
                              .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])
    # why don't you just "select"  the columns? yeah, stop using csv library. just select them.
    train_df = train_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
    test_df = test_df[["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]]
    dfs = (train_df, test_df)
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        # the paths to write datasets in
        train_path = ROOT_DIR / "train.tsv"
        test_path = ROOT_DIR / "test.tsv"
        paths = (train_path, test_path)
        artifact = wandb.Artifact(name="literal2idiomatic", type="dataset", description=config['description'],
                                  metadata=config)
        for tsv_path, df in zip(paths, dfs):
            df.to_csv(tsv_path, sep="\t")
            artifact.add_file(tsv_path)
        # then, we just log them here.
        run.log_artifact(artifact, aliases=["latest", config['ver']])
        # don't forget to remove them
        for tsv_path in paths:
            os.remove(tsv_path)


if __name__ == '__main__':
    main()