"""Resample the raw dataset splits and write them to ``data/processed``."""

import os

import pandas as pd
import yaml


def process_data(split: str = "train") -> None:
    """Resample one raw CSV split and write it to ``data/processed``.

    Reads ``params.yml`` from the current working directory and uses its
    ``"split"`` entry as the ``frac`` argument of ``DataFrame.sample``.
    The raw file ``data/raw/<split>.csv`` is loaded, its columns are renamed
    to ``["Unnamed: 0", "input_text", "output_text"]``, the rows are sampled
    with replacement using a fixed seed, and the result is written to
    ``data/processed/<split>.csv``. The raw file is deleted afterwards.

    Args:
        split: Base name (without ``.csv``) of the file to process.

    Raises:
        FileNotFoundError: If ``params.yml`` or the raw CSV is missing.
        KeyError: If ``params.yml`` has no ``"split"`` entry.
        ValueError: If the raw CSV does not have exactly three columns.
    """
    with open("params.yml") as f:
        params = yaml.safe_load(f)

    raw_path = "data/raw/{}.csv".format(split)
    processed_dir = "data/processed"
    processed_path = "{}/{}.csv".format(processed_dir, split)

    df = pd.read_csv(raw_path)
    df.columns = ["Unnamed: 0", "input_text", "output_text"]

    # NOTE(review): replace=True means rows can be drawn more than once, so
    # the output may contain duplicates -- confirm this is intended.
    df = df.sample(frac=params["split"], replace=True, random_state=1)

    # Make sure the destination exists so a fresh checkout does not fail on
    # the write below.
    os.makedirs(processed_dir, exist_ok=True)

    # Write the processed file BEFORE deleting the raw one: if the write
    # fails, the raw data is still on disk and the run can be retried.
    # NOTE(review): the DataFrame index is written as an extra leading column
    # next to "Unnamed: 0"; kept as-is so the output format is unchanged.
    df.to_csv(processed_path)

    # The raw file is consumed by this step; tolerate it having disappeared
    # in the meantime instead of checking for existence first.
    try:
        os.remove(raw_path)
    except FileNotFoundError:
        pass


if __name__ == "__main__":
    for split_name in ("train", "test", "validation"):
        process_data(split=split_name)