Spaces:
Build error
Build error
File size: 3,634 Bytes
7e3e85d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import os
import datasets
"""Scisummnet dataset."""
# BibTeX entries handed to datasets.DatasetInfo as the dataset citation
# (two @InProceedings records).
_CITATION = """
@InProceedings{yasunaga&al.19.scisumm,
title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models for Scientific Paper Summarization with Citation Networks},
author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev},
booktitle = {Proceedings of AAAI 2019},
year = {2019}
}
@InProceedings{yasunaga&al.17,
title = {Graph-based Neural Multi-Document Summarization},
author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.},
booktitle = {Proceedings of CoNLL 2017},
year = {2017}
}
"""
# Free-text description handed to datasets.DatasetInfo.
_DESCRIPTION = """
A summary of scientific papers should ideally incorporate the impact of the papers on the research community
reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm),
the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain.
"""
# Homepage URL reported in the dataset metadata.
_HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/"
# License string reported in the dataset metadata.
_LICENSE = "CC BY-SA 4.0"
# Archive that _split_generators downloads and extracts. Despite the plural
# name this is a single URL string, not a collection.
_URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"
class SummertimeScisummnet(datasets.GeneratorBasedBuilder):
    """Dataset builder for the ScisummNet release 1.1 archive.

    Every example corresponds to one entry folder under ``top1000_complete``
    and exposes the contents of three files from that folder as raw,
    unparsed strings: the document XML, the annotated citing sentences
    JSON, and the gold summary text.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing features and metadata."""
        # All fields are stored as plain strings; file contents are not parsed.
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "document_xml": datasets.Value("string"),
                "citing_sentences_annotated.json": datasets.Value("string"),
                "summary": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive and return its single train split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used here.

        Returns:
            A one-element list holding the ``TRAIN`` split generator.
        """
        path = dl_manager.download_and_extract(_URLs)
        trainpath = os.path.join(
            path, "scisummnet_release1.1__20190413", "top1000_complete"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"extraction_path": trainpath, "split": "train"},
            )
        ]

    @staticmethod
    def _read_text(path):
        """Return the full contents of the UTF-8 text file at ``path``."""
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def _generate_examples(self, extraction_path, split):
        """Yield ``(key, example)`` pairs, one per entry folder.

        Args:
            extraction_path: Directory whose sub-directories are the entry
                folders; each folder name doubles as the example key.
            split: Split name received from ``_split_generators``. It is not
                used because only one split is produced.

        Yields:
            Tuples of the folder name and a dict with the keys declared in
            ``_info``.

        Raises:
            OSError: If an entry folder lacks one of the three expected files.
        """
        # os.listdir returns entries in arbitrary order; sort so that example
        # order is reproducible across machines and filesystems.
        for folder in sorted(os.listdir(extraction_path)):
            entry_dir = os.path.join(extraction_path, folder)
            # Stray non-directory entries (e.g. OS metadata files) are not
            # dataset entries and would otherwise fail when opened as folders.
            if not os.path.isdir(entry_dir):
                continue

            entry = {
                "entry_number": folder,
                "document_xml": self._read_text(
                    os.path.join(entry_dir, "Documents_xml", folder + ".xml")
                ),
                "citing_sentences_annotated.json": self._read_text(
                    os.path.join(entry_dir, "citing_sentences_annotated.json")
                ),
                "summary": self._read_text(
                    os.path.join(entry_dir, "summary", folder + ".gold.txt")
                ),
            }
            yield entry["entry_number"], entry
|