File size: 4,275 Bytes
7e3e85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import json
import datasets


"""Summscreen dataset."""


# BibTeX entry for the SummScreen paper; passed to `datasets.DatasetInfo`
# as the `citation` field in `SummertimeSummscreen._info`.
_CITATION = """
@article{DBLP:journals/corr/abs-2104-07091,
  author    = {Mingda Chen and
               Zewei Chu and
               Sam Wiseman and
               Kevin Gimpel},
  title     = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization},
  journal   = {CoRR},
  volume    = {abs/2104.07091},
  year      = {2021},
  url       = {https://arxiv.org/abs/2104.07091},
  archivePrefix = {arXiv},
  eprint    = {2104.07091},
  timestamp = {Mon, 19 Apr 2021 16:45:47 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""

# Human-readable summary shown in the dataset card / `DatasetInfo.description`.
# (The previous text described the unrelated CL-Scisumm shared task.)
_DESCRIPTION = """
SummScreen is an abstractive summarization dataset made of pairs of TV series transcripts
and human-written recaps. The transcripts and recaps are collected from two sources,
ForeverDreaming and TVMegaSite.
"""

# Upstream project page, reported as `DatasetInfo.homepage`.
_HOMEPAGE = "https://github.com/mingdachen/SummScreen"

# License string reported as `DatasetInfo.license`.
_LICENSE = "MIT License"

# Google Drive link to the archive that is downloaded and extracted by
# `dl_manager.download_and_extract` in `_split_generators`.
_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF"


class SummertimeSummscreen(datasets.GeneratorBasedBuilder):
    """Dataset builder for SummScreen.

    Each example pairs a transcript (a sequence of strings) with a recap
    string. Files from the ``ForeverDreaming`` and ``TVMegaSite``
    sub-directories of the extracted archive are read one after the other
    into the same train / validation / test splits.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` holding the metadata and feature schema."""
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "transcript": datasets.features.Sequence(datasets.Value("string")),
                "recap": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and declare the train/validation/test splits.

        Args:
            dl_manager: download manager handed in by the ``datasets``
                library; only ``download_and_extract`` is called on it.

        Returns:
            A list of three ``datasets.SplitGenerator`` objects. Each one's
            ``gen_kwargs`` carries ``filepaths`` as a
            ``(root_dir, [relative_json_paths])`` tuple and ``split`` as the
            file-name tag (``"train"``, ``"dev"`` or ``"test"``).
        """
        path = dl_manager.download_and_extract(_URLs)
        path = os.path.join(path, "SummScreen")

        # (sub-directory, file-name prefix) of every sub-corpus, in read order.
        sources = (("ForeverDreaming", "fd"), ("TVMegaSite", "tms"))

        # (split identifier, tag used in the JSON file names).
        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "dev"),
            (datasets.Split.TEST, "test"),
        )

        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepaths": (
                        path,
                        [
                            os.path.join(directory, prefix + "_" + tag + ".json")
                            for directory, prefix in sources
                        ],
                    ),
                    "split": tag,
                },
            )
            for split_name, tag in splits
        ]

    def _generate_examples(self, filepaths, split):
        """Yield ``(key, example)`` pairs read from the split's JSON-lines files.

        Args:
            filepaths: ``(root_dir, relative_paths)`` tuple as built in
                ``_split_generators``; every relative path is joined onto
                ``root_dir`` and read line by line.
            split: tag of the split being generated. It is not used here but
                is part of the ``gen_kwargs`` passed by ``_split_generators``.

        Yields:
            Tuples of the record's ``filename`` value (used as the example
            key) and a dict with ``entry_number``, ``transcript`` and
            ``recap``.
        """
        path, relative_filepaths = filepaths
        for filepath in relative_filepaths:
            extraction_path = os.path.join(path, filepath)

            # Decode as UTF-8 explicitly so the result does not depend on the
            # platform's default locale encoding.
            with open(extraction_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Tolerate blank lines (e.g. a trailing newline at the end
                    # of the file) instead of failing inside json.loads.
                    if not line.strip():
                        continue

                    # Strip "@@ " markers from the raw line before parsing.
                    # NOTE(review): presumably subword-continuation markers
                    # left by tokenization — confirm against the data.
                    instance = json.loads(line.replace("@@ ", ""))

                    entry = {
                        "entry_number": instance["filename"],
                        "transcript": instance["Transcript"],
                        # Recap is a single string in list
                        "recap": instance["Recap"][0],
                    }

                    yield entry["entry_number"], entry