File size: 623 Bytes
1c6ec0a
2c042c1
52aeedb
33908fd
987e02d
8717423
1b154c5
9951e30
27edbdc
e1f665f
6570db2
fbbe319
d17a51b
fc0bcb4
255f974
0f73b4b
e83f5ad
d9476c0
e7870b2
 
438f138
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pandas as pd
import yaml
import os


def process_data(split="train"):
    """Subsample one raw dataset split and move it to the processed folder.

    Reads ``data/raw/<split>.csv``, relabels its three columns, draws a
    random sample whose relative size is the ``split`` value from
    ``params.yml``, writes the sample to ``data/processed/<split>.csv`` and
    only then deletes the raw file.

    Args:
        split: Base name (without ``.csv``) of the file to process.

    Raises:
        FileNotFoundError: If ``params.yml`` or the raw CSV is missing.
        KeyError: If ``params.yml`` has no ``split`` key.
        ValueError: If the raw CSV does not have exactly three columns.
    """
    with open("params.yml") as f:
        params = yaml.safe_load(f)

    raw_path = "data/raw/{}.csv".format(split)
    processed_path = "data/processed/{}.csv".format(split)

    df = pd.read_csv(raw_path)
    df.columns = ["Unnamed: 0", "input_text", "output_text"]
    # NOTE(review): replace=True samples WITH replacement, so rows may be
    # duplicated in the output even when frac <= 1 -- confirm this is
    # intended. Kept as-is to preserve the existing output.
    df = df.sample(frac=params["split"], replace=True, random_state=1)

    # Write the processed file BEFORE deleting the raw one: if the write
    # fails (missing directory, full disk, ...) the raw data must survive.
    os.makedirs(os.path.dirname(processed_path), exist_ok=True)
    df.to_csv(processed_path)

    # The raw file is consumed by this step; tolerate it having vanished
    # between the read above and this point.
    try:
        os.remove(raw_path)
    except FileNotFoundError:
        pass


if __name__ == "__main__":
    # Process every dataset split in turn: train, then test, then validation.
    for split_name in ("train", "test", "validation"):
        process_data(split=split_name)