# T5-Summarization / src / data / make_dataset.py
# Last commit: "Update make_dataset.py" (5e8234e) by Gagan Bhatia
# File size at that commit: 771 Bytes
import yaml
from datasets import load_dataset
import pandas as pd
import os
import pprint
def make_dataset(
    dataset="cnn_dailymail",
    split="train",
    config_name="3.0.0",
    output_dir="data/raw",
):
    """Download one split of a summarisation dataset and save it as CSV.

    The split is fetched with ``datasets.load_dataset`` and its ``article``
    and ``highlights`` columns are written to ``<output_dir>/<split>.csv``.
    The CSV is written with pandas' default settings, so it includes the
    row-index column.

    Args:
        dataset: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load; also used as the CSV file stem.
        config_name: Dataset configuration passed as the second positional
            argument to ``load_dataset`` (``"3.0.0"`` by default).
        output_dir: Directory the CSV is written to; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Keep the loaded data under its own name instead of rebinding the
    # ``dataset`` argument, so the identifier stays available.
    loaded = load_dataset(dataset, config_name, split=split)

    df = pd.DataFrame(
        {
            "article": loaded["article"],
            "highlights": loaded["highlights"],
        }
    )
    df.to_csv("{}/{}.csv".format(output_dir, split))
if __name__ == "__main__":
    # Read the run configuration; its "data" entry names the dataset to export.
    with open("params.yml") as config_file:
        params = yaml.safe_load(config_file)

    # Echo the loaded configuration before doing any work.
    pprint.pprint(params)

    # Export every split, one CSV per split, in the same order as before.
    for split_name in ("train", "test", "validation"):
        make_dataset(dataset=params["data"], split=split_name)