|
""" |
|
literal2idiomatic ver: d-1-2 |
|
""" |
|
import os |
|
from idiomify.paths import ROOT_DIR |
|
from idiomify.fetchers import fetch_pie, fetch_config |
|
from idiomify.preprocess import upsample, cleanse, stratified_split, annotate |
|
import wandb |
|
|
|
|
|
def main():
    """Build the literal2idiomatic dataset and upload it to W&B as an artifact.

    Fetches the PIE dataset, cleanses / upsamples / annotates it, splits it
    into train & test sets, writes both splits as TSV files, logs them as a
    single "literal2idiomatic" dataset artifact, and removes the local TSV
    files afterwards (even when the upload fails).
    """
    pie_df = fetch_pie()
    # config slice for this dataset (seed, boi/eoi tokens, train_ratio, ver, ...)
    config = fetch_config()['literal2idiomatic']
    train_df, test_df = pie_df.pipe(cleanse)\
                              .pipe(upsample, seed=config['seed'])\
                              .pipe(annotate, boi_token=config['boi_token'], eoi_token=config['eoi_token'])\
                              .pipe(stratified_split, ratio=config['train_ratio'], seed=config['seed'])

    # keep only the columns the downstream consumers need, in a fixed order
    columns = ["Idiom", "Sense", "Literal_Sent", "Idiomatic_Sent"]
    train_df = train_df[columns]
    test_df = test_df[columns]
    dfs = (train_df, test_df)
    with wandb.init(entity="eubinecto", project="idiomify") as run:
        train_path = ROOT_DIR / "train.tsv"
        test_path = ROOT_DIR / "test.tsv"
        paths = (train_path, test_path)
        artifact = wandb.Artifact(name="literal2idiomatic", type="dataset",
                                  description=config['description'],
                                  metadata=config)
        try:
            for tsv_path, df in zip(paths, dfs):
                # NOTE(review): to_csv writes the row index as an extra first
                # column by default — confirm downstream readers expect it.
                df.to_csv(tsv_path, sep="\t")
                artifact.add_file(tsv_path)
            run.log_artifact(artifact, aliases=["latest", config['ver']])
        finally:
            # always clean up the local TSVs, even if writing or logging failed;
            # guard with exists() since an early failure may leave some unwritten
            for tsv_path in paths:
                if tsv_path.exists():
                    os.remove(tsv_path)
|
|
|
|
|
# Run the dataset-building pipeline only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
|
|