aliabd
full demo working with old gradio
7e3e85d
raw history blame
No virus
3.63 kB
import os
import datasets
"""Scisummnet dataset."""
# BibTeX entries for the dataset; handed to `datasets.DatasetInfo(citation=...)`
# in the builder's `_info()`.
_CITATION = """
@InProceedings{yasunaga&al.19.scisumm,
title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models for Scientific Paper Summarization with Citation Networks},
author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev},
booktitle = {Proceedings of AAAI 2019},
year = {2019}
}
@InProceedings{yasunaga&al.17,
title = {Graph-based Neural Multi-Document Summarization},
author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.},
booktitle = {Proceedings of CoNLL 2017},
year = {2017}
}
"""
# Free-text description of the dataset; handed to
# `datasets.DatasetInfo(description=...)` in `_info()`.
_DESCRIPTION = """
A summary of scientific papers should ideally incorporate the impact of the papers on the research community
reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm),
the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain.
"""
# Project homepage reported in the dataset info.
_HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/"
# License string reported in the dataset info.
_LICENSE = "CC BY-SA 4.0"
# Archive that `_split_generators()` downloads and extracts. Despite the plural
# name this is a single URL string, not a collection.
_URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"
class SummertimeScisummnet(datasets.GeneratorBasedBuilder):
    """Dataset builder for the ScisummNet release 1.1 archive.

    Exposes a single TRAIN split. Each example carries four string features:
    the entry folder name, the raw paper XML, the raw citing-sentence
    annotation JSON text, and the gold summary text. Nothing is parsed here;
    file contents are passed through verbatim.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing features and metadata."""
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "document_xml": datasets.Value("string"),
                # The feature is named after the file it is read from; the
                # ".json" suffix is part of the public column name.
                "citing_sentences_annotated.json": datasets.Value("string"),
                "summary": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive and return the TRAIN split.

        Args:
            dl_manager: download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A one-element list with the TRAIN ``SplitGenerator``.
        """
        path = dl_manager.download_and_extract(_URLs)
        trainpath = os.path.join(
            path, "scisummnet_release1.1__20190413", "top1000_complete"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"extraction_path": trainpath, "split": "train"},
            )
        ]

    def _generate_examples(self, extraction_path, split):
        """Yield ``(key, example)`` pairs, one per entry folder.

        Args:
            extraction_path: directory containing one sub-folder per entry.
            split: split name passed through ``gen_kwargs``; not used here,
                kept so the signature matches the kwargs that are supplied.

        Yields:
            Tuples of ``(entry_number, example_dict)`` where every value in
            ``example_dict`` is the UTF-8 decoded text of the source file.

        Raises:
            OSError: if an entry folder is missing one of its expected files.
        """
        # os.listdir() returns names in arbitrary order; sort so the generated
        # dataset is ordered identically on every platform and run.
        for folder in sorted(os.listdir(extraction_path)):
            entry_dir = os.path.join(extraction_path, folder)
            # Ignore stray non-directory entries (e.g. OS metadata files)
            # instead of failing when their sub-paths are opened.
            if not os.path.isdir(entry_dir):
                continue

            # Feature name -> path components relative to the entry folder.
            sources = {
                "document_xml": ("Documents_xml", folder + ".xml"),
                "citing_sentences_annotated.json": (
                    "citing_sentences_annotated.json",
                ),
                "summary": ("summary", folder + ".gold.txt"),
            }

            entry = {"entry_number": folder}
            for feature, parts in sources.items():
                file_path = os.path.join(entry_dir, *parts)
                with open(file_path, "r", encoding="utf-8") as f:
                    entry[feature] = f.read()

            yield entry["entry_number"], entry