File size: 3,634 Bytes
7e3e85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import datasets


"""Scisummnet dataset."""


# BibTeX entries attached to the dataset metadata (passed as `citation=`
# to `datasets.DatasetInfo` in `_info`).
_CITATION = """
@InProceedings{yasunaga&al.19.scisumm,
    title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models for Scientific Paper Summarization with Citation Networks},
    author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev},
    booktitle = {Proceedings of AAAI 2019},
    year = {2019}
}
@InProceedings{yasunaga&al.17,
  title = {Graph-based Neural Multi-Document Summarization},
  author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.},
  booktitle = {Proceedings of CoNLL 2017},
  year = {2017}
}
"""

# Human-readable description attached to the dataset metadata (passed as
# `description=` to `datasets.DatasetInfo` in `_info`).
_DESCRIPTION = """
A summary of scientific papers should ideally incorporate the impact of the papers on the research community
reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm),
the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain.
"""

# Project page reported as the dataset homepage.
_HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/"

# License string reported in the dataset metadata.
_LICENSE = "CC BY-SA 4.0"

# Single zip archive (release 1.1) that `_split_generators` hands to the
# download manager for download and extraction.
_URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"


class SummertimeScisummnet(datasets.GeneratorBasedBuilder):
    """Dataset builder for the ScisummNet release 1.1 archive.

    Each example corresponds to one paper folder of the release and exposes
    four string fields: the folder name (``entry_number``), the raw document
    XML (``document_xml``), the raw annotated citing-sentences JSON
    (``citing_sentences_annotated.json``) and the gold summary text
    (``summary``). File contents are returned unparsed.

    Only a single ``train`` split is produced.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing features and metadata."""
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "document_xml": datasets.Value("string"),
                "citing_sentences_annotated.json": datasets.Value("string"),
                "summary": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive and return the split generators.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download_and_extract`` result is used as the extraction
                root directory.

        Returns:
            A one-element list holding the ``train`` split generator, which
            points at the ``top1000_complete`` directory of the release.
        """
        path = dl_manager.download_and_extract(_URLs)
        trainpath = os.path.join(
            path, "scisummnet_release1.1__20190413", "top1000_complete"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"extraction_path": trainpath, "split": "train"},
            )
        ]

    @staticmethod
    def _read_utf8(*path_parts):
        """Return the full UTF-8 decoded text of the file at the joined path."""
        with open(os.path.join(*path_parts), "r", encoding="utf-8") as f:
            return f.read()

    def _generate_examples(self, extraction_path, split):
        """Yield ``(key, example)`` pairs, one per paper folder.

        Args:
            extraction_path: Directory whose sub-directories are the
                individual paper folders.
            split: Split name forwarded via ``gen_kwargs``; not used here
                because only one split exists.

        Yields:
            Tuples of the folder name and a dict with the keys declared in
            ``_info``.

        Raises:
            OSError: If ``extraction_path`` cannot be listed or one of the
                three expected files of a paper folder cannot be read.
        """
        # os.listdir() returns names in arbitrary order; sorting makes the
        # order of generated examples reproducible across filesystems.
        for folder in sorted(os.listdir(extraction_path)):
            paper_dir = os.path.join(extraction_path, folder)

            # Stray non-directory entries (e.g. OS metadata files) are not
            # paper folders; opening files "inside" them would fail.
            if not os.path.isdir(paper_dir):
                continue

            entry = {
                "entry_number": folder,
                "document_xml": self._read_utf8(
                    paper_dir, "Documents_xml", folder + ".xml"
                ),
                "citing_sentences_annotated.json": self._read_utf8(
                    paper_dir, "citing_sentences_annotated.json"
                ),
                "summary": self._read_utf8(
                    paper_dir, "summary", folder + ".gold.txt"
                ),
            }

            yield folder, entry