"""Arxiv dataset."""

import json
import os

import datasets


_CITATION = """
@article{Cohan_2018,
    title={A Discourse-Aware Attention Model for Abstractive Summarization of
           Long Documents},
    url={http://dx.doi.org/10.18653/v1/n18-2097},
    DOI={10.18653/v1/n18-2097},
    journal={Proceedings of the 2018 Conference of the North American Chapter of
             the Association for Computational Linguistics: Human Language
             Technologies, Volume 2 (Short Papers)},
    publisher={Association for Computational Linguistics},
    author={Cohan, Arman and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Kim, Seokhwan and Chang, Walter and Goharian, Nazli},
    year={2018}
}
"""

_DESCRIPTION = """
A summarization dataset built from scientific papers on arXiv, pairing each
paper's full text with its abstract. The length of the documents makes the
dataset a challenging testbed for abstractive summarization.
"""

_HOMEPAGE = "https://github.com/armancohan/long-summarization"

_LICENSE = "Apache-2.0 License"

_URL = "https://archive.org/download/armancohan-long-summarization-paper-code/arxiv-dataset.zip"
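
# The archive extracts to an `arxiv-dataset/` directory containing
# `train.txt`, `val.txt`, and `test.txt`; each line of those files is one
# paper serialized as a JSON object (parsed in `_generate_examples` below).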


class SummertimeArxiv(datasets.GeneratorBasedBuilder):
    """Arxiv long summarization dataset."""

    VERSION = datasets.Version("1.0.0")

    # A single default configuration; the dataset has no variants.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "article_id": datasets.Value("string"),
                # The article body and the abstract are stored as lists of
                # sentence strings rather than as single blobs of text.
                "article_text": [datasets.Value("string")],
                "abstract_text": [datasets.Value("string")],
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # Download and extract the archive once, then point each split at
        # its file inside the extracted directory.
        path = dl_manager.download_and_extract(_URL)
        path = os.path.join(path, "arxiv-dataset")

        trainpath = os.path.join(path, "train.txt")
        valpath = os.path.join(path, "val.txt")
        testpath = os.path.join(path, "test.txt")

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": trainpath, "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": valpath, "split": "val"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": testpath, "split": "test"},
            ),
        ]
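
    # Illustrative shape of one parsed line (an assumption inferred from the
    # feature schema above; the field values here are hypothetical):
    #   {"article_id": "1512.00001",
    #    "article_text": ["first sentence of the paper .", "second sentence ."],
    #    "abstract_text": ["first sentence of the abstract ."]}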
    def _generate_examples(self, filepath, split):
        """Yields examples."""
        with open(filepath, "r") as f:
            for line in f:
                # Each line of the split file is one paper as a JSON object.
                instance = json.loads(line)

                entry = {
                    "article_id": instance["article_id"],
                    "article_text": instance["article_text"],
                    "abstract_text": instance["abstract_text"],
                }

                # The article id is unique, so it doubles as the example key.
                yield entry["article_id"], entry
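

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original loader: `load_dataset`
    # accepts a path to a local script, so assuming this file is saved as
    # `summertime_arxiv.py`, the splits can be materialized like this. The
    # filename is an assumption; adjust the path to wherever the script lives.
    dataset = datasets.load_dataset("./summertime_arxiv.py")
    print(dataset["train"][0]["article_id"])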
|