"""Scisummnet dataset."""

import os

import datasets

_CITATION = """
@InProceedings{yasunaga&al.19.scisumm,
  title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models
           for Scientific Paper Summarization with Citation Networks},
  author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and
            Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev},
  booktitle = {Proceedings of AAAI 2019},
  year = {2019}
}
@InProceedings{yasunaga&al.17,
  title = {Graph-based Neural Multi-Document Summarization},
  author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and
            Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.},
  booktitle = {Proceedings of CoNLL 2017},
  year = {2017}
}
"""

_DESCRIPTION = """
A summary of scientific papers should ideally incorporate the impact of the papers on the research community reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm), the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain. 
"""

_HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/"

_LICENSE = "CC BY-SA 4.0"

_URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"


def _read_text(path):
    """Return the full contents of *path* decoded as UTF-8."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


class SummertimeScisummnet(datasets.GeneratorBasedBuilder):
    """ScisummNet dataset builder.

    Each example is one paper directory from the ``top1000_complete``
    release: the paper XML, its annotated citing sentences (raw JSON
    text), and the gold summary.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the dataset metadata (features, citation, license, homepage)."""
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "document_xml": datasets.Value("string"),
                # Key intentionally keeps the source filename, including
                # the ".json" suffix, for parity with the release layout.
                "citing_sentences_annotated.json": datasets.Value("string"),
                "summary": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive; the corpus has a single train split."""
        path = dl_manager.download_and_extract(_URLs)
        trainpath = os.path.join(
            path, "scisummnet_release1.1__20190413", "top1000_complete"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs are passed to _generate_examples.
                gen_kwargs={"extraction_path": trainpath, "split": "train"},
            )
        ]

    def _generate_examples(self, extraction_path, split):
        """Yield ``(key, example)`` pairs, one per paper directory.

        Directories are visited in sorted order so example order is
        deterministic across filesystems (``os.listdir`` order is
        arbitrary and would otherwise change dataset fingerprints).
        """
        for folder in sorted(os.listdir(extraction_path)):
            base = os.path.join(extraction_path, folder)
            entry = {
                "entry_number": folder,
                "document_xml": _read_text(
                    os.path.join(base, "Documents_xml", folder + ".xml")
                ),
                "citing_sentences_annotated.json": _read_text(
                    os.path.join(base, "citing_sentences_annotated.json")
                ),
                "summary": _read_text(
                    os.path.join(base, "summary", folder + ".gold.txt")
                ),
            }
            yield entry["entry_number"], entry