T5-Summarization / src /data /make_dataset.py
Gagan Bhatia
Update make_dataset.py
bf1265d
raw
history blame
No virus
772 Bytes
import yaml
from datasets import load_dataset
import pandas as pd
import os
import pprint
def make_dataset(dataset='cnn_dailymail', split='train'):
"""make dataset for summarisation"""
if not os.path.exists('data/raw'):
os.makedirs('data/raw')
dataset = load_dataset(dataset, '3.0.0', split=split)
df = pd.DataFrame()
df['article'] = dataset['article']
df['highlights'] = dataset['highlights']
df.to_csv('data/raw/{}.csv'.format(split))
if __name__ == '__main__':
with open("params.yml") as f:
params = yaml.safe_load(f)
pprint.pprint(params)
make_dataset(dataset=params['data'], split='train')
make_dataset(dataset=params['data'], split='test')
make_dataset(dataset=params['data'], split='validation')