Spaces:

aliabd
/

SummerTime

Build error

File size: 4,275 Bytes

7e3e85d

import os
import json
import datasets


"""Summscreen dataset."""


# BibTeX entry for the SummScreen paper; surfaced through DatasetInfo.citation.
_CITATION = """
@article{DBLP:journals/corr/abs-2104-07091,
  author    = {Mingda Chen and
               Zewei Chu and
               Sam Wiseman and
               Kevin Gimpel},
  title     = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization},
  journal   = {CoRR},
  volume    = {abs/2104.07091},
  year      = {2021},
  url       = {https://arxiv.org/abs/2104.07091},
  archivePrefix = {arXiv},
  eprint    = {2104.07091},
  timestamp = {Mon, 19 Apr 2021 16:45:47 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""

# Human-readable summary surfaced through DatasetInfo.description.
# The previous text described the CL-Scisumm scientific-paper task, which is
# a different dataset; it now describes what this script actually loads.
_DESCRIPTION = """
SummScreen is a dataset for abstractive screenplay summarization. Each instance pairs the transcript of a
TV series episode with a human-written recap of that episode. The data comes from two sources,
ForeverDreaming and TVMegaSite, each of which provides train, dev and test files.
"""

_HOMEPAGE = "https://github.com/mingdachen/SummScreen"

_LICENSE = "MIT License"

# Location of the archive downloaded and extracted by the builder.
_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF"


class SummertimeSummscreen(datasets.GeneratorBasedBuilder):
    """Dataset builder that loads SummScreen transcript/recap pairs.

    Examples are read from the per-split JSON-lines files of the
    ``ForeverDreaming`` and ``TVMegaSite`` folders inside the downloaded
    archive and exposed as train, validation and test splits.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing the features and metadata."""
        features = datasets.Features(
            {
                "entry_number": datasets.Value("string"),
                "transcript": datasets.features.Sequence(datasets.Value("string")),
                "recap": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and declare the three splits.

        Args:
            dl_manager: Download manager used to fetch and extract ``_URLs``.

        Returns:
            A list with the train, validation and test ``SplitGenerator``
            objects. Each one forwards to ``_generate_examples`` a
            ``filepaths`` pair ``(root, relative_paths)`` and a ``split``
            label ("train", "dev" or "test").
        """
        extracted = dl_manager.download_and_extract(_URLs)
        root = os.path.join(extracted, "SummScreen")

        def subset_files(suffix):
            # One file per source folder, in a fixed order
            # (ForeverDreaming first, then TVMegaSite).
            return [
                os.path.join("ForeverDreaming", "fd_{}.json".format(suffix)),
                os.path.join("TVMegaSite", "tms_{}.json".format(suffix)),
            ]

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepaths": (root, subset_files("train")), "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepaths": (root, subset_files("dev")), "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepaths": (root, subset_files("test")), "split": "test"},
            ),
        ]

    def _generate_examples(self, filepaths, split):
        """Yield ``(key, example)`` pairs for one split.

        Args:
            filepaths: A ``(root, relative_paths)`` pair; every relative path
                is joined onto ``root`` and read as a file holding one JSON
                object per line.
            split: Split label supplied through ``gen_kwargs``; it is not
                used when reading the files.

        Yields:
            Tuples ``(entry_number, example)`` where ``example`` has the keys
            ``entry_number``, ``transcript`` and ``recap``.
        """
        path, relative_filepaths = filepaths
        for filepath in relative_filepaths:
            extraction_path = os.path.join(path, filepath)

            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default locale encoding.
            with open(extraction_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Drop every "@@ " marker from the raw text before parsing
                    # (presumably tokenization joiners -- confirm against the
                    # dataset documentation).
                    processed_line = line.replace("@@ ", "")

                    # A blank line (e.g. a trailing newline at end of file)
                    # holds no record and would make json.loads fail.
                    if not processed_line.strip():
                        continue

                    instance = json.loads(processed_line)

                    entry = {
                        "entry_number": instance["filename"],
                        "transcript": instance["Transcript"],
                        # Recap is a single string in list
                        "recap": instance["Recap"][0],
                    }

                    yield entry["entry_number"], entry