"""SummScreen dataset."""

import os
import json

import datasets
_CITATION = """
@article{DBLP:journals/corr/abs-2104-07091,
author = {Mingda Chen and
Zewei Chu and
Sam Wiseman and
Kevin Gimpel},
title = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization},
journal = {CoRR},
volume = {abs/2104.07091},
year = {2021},
url = {https://arxiv.org/abs/2104.07091},
archivePrefix = {arXiv},
eprint = {2104.07091},
timestamp = {Mon, 19 Apr 2021 16:45:47 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
_DESCRIPTION = """
SummScreen is an abstractive summarization dataset of pairs of TV series
transcripts and human-written recaps, collected from two sources:
ForeverDreaming (fd) and TVMegaSite (tms). Each example pairs a full episode
transcript with its recap.
"""
_HOMEPAGE = "https://github.com/mingdachen/SummScreen"
_LICENSE = "MIT License"
_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF"
class SummertimeSummscreen(datasets.GeneratorBasedBuilder):
"""Summscreen dataset."""
VERSION = datasets.Version("1.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="summscreen", version=VERSION),
    ]
def _info(self):
features = datasets.Features(
{
"entry_number": datasets.Value("string"),
"transcript": datasets.features.Sequence(datasets.Value("string")),
"recap": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
supervised_keys=None,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)
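    # Example record under this schema (illustrative values only, not taken
    # from the corpus):
    #   {
    #       "entry_number": "fd_example_0001",
    #       "transcript": ["[Scene: ...]", "CHARACTER: line of dialogue ..."],
    #       "recap": "A human-written summary of the episode.",
    #   }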
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # Download the Google Drive archive and point at the extracted
        # SummScreen/ directory, which holds the ForeverDreaming and
        # TVMegaSite subfolders.
        path = dl_manager.download_and_extract(_URLs)
        path = os.path.join(path, "SummScreen")
trainpath_fd = os.path.join("ForeverDreaming", "fd_train.json")
trainpath_tms = os.path.join("TVMegaSite", "tms_train.json")
trainpaths = [trainpath_fd, trainpath_tms]
devpath_fd = os.path.join("ForeverDreaming", "fd_dev.json")
devpath_tms = os.path.join("TVMegaSite", "tms_dev.json")
devpaths = [devpath_fd, devpath_tms]
testpath_fd = os.path.join("ForeverDreaming", "fd_test.json")
testpath_tms = os.path.join("TVMegaSite", "tms_test.json")
testpaths = [testpath_fd, testpath_tms]
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepaths": (path, trainpaths), "split": "train"},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepaths": (path, devpaths), "split": "dev"},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
# These kwargs will be passed to _generate_examples
gen_kwargs={"filepaths": (path, testpaths), "split": "test"},
),
]
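    # Each "filepaths" value above is a (root_path, [relative JSON paths])
    # tuple, unpacked in _generate_examples below.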
def _generate_examples(self, filepaths, split):
"""Yields examples."""
path, relative_filepaths = filepaths
for filepath in relative_filepaths:
extraction_path = os.path.join(path, filepath)
            with open(extraction_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Each line is one JSON record; strip the "@@ " markers
                    # (subword-tokenization artifacts) before parsing.
                    processed_line = line.replace("@@ ", "")
                    instance = json.loads(processed_line)
                    entry = {
                        "entry_number": instance["filename"],
                        "transcript": instance["Transcript"],
                        # Recap is stored as a single string inside a list.
                        "recap": instance["Recap"][0],
                    }
                    yield entry["entry_number"], entry
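

if __name__ == "__main__":
    # Minimal usage sketch (illustration only, not part of the builder):
    # load this script locally through datasets.load_dataset. __file__
    # resolves to this script's own path on disk; the split names match the
    # SplitGenerators defined above.
    dataset = datasets.load_dataset(__file__)
    print(dataset["train"][0]["entry_number"])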