"""SummScreen dataset."""

import json
import os

import datasets

_CITATION = """
@article{DBLP:journals/corr/abs-2104-07091,
  author    = {Mingda Chen and
               Zewei Chu and
               Sam Wiseman and
               Kevin Gimpel},
  title     = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization},
  journal   = {CoRR},
  volume    = {abs/2104.07091},
  year      = {2021},
  url       = {https://arxiv.org/abs/2104.07091},
  archivePrefix = {arXiv},
  eprint    = {2104.07091},
  timestamp = {Mon, 19 Apr 2021 16:45:47 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""

_DESCRIPTION = """
SummScreen is an abstractive summarization dataset consisting of pairs of
community-contributed TV series transcripts and human-written recaps, drawn
from two sources: ForeverDreaming (fd) and TVMegaSite (tms).
"""

_HOMEPAGE = "https://github.com/mingdachen/SummScreen"

_LICENSE = "MIT License"

# Single Google Drive archive containing both the ForeverDreaming and
# TVMegaSite portions of the dataset.
_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF"


class SummertimeSummscreen(datasets.GeneratorBasedBuilder):
    """SummScreen dataset."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]
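
    # A single default configuration: both sub-corpora (ForeverDreaming and
    # TVMegaSite) are always loaded together, in every split.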
    def _info(self):
        features = datasets.Features(
            {
                # Unique identifier, taken from the source filename.
                "entry_number": datasets.Value("string"),
                # Episode transcript as a sequence of lines (dialogue and
                # scene directions).
                "transcript": datasets.features.Sequence(datasets.Value("string")),
                # Human-written recap serving as the reference summary.
                "recap": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
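
    # Shape of one yielded (key, example) pair (values invented purely for
    # illustration; real IDs come from the "filename" field of the source JSON):
    #
    #   ("fd_example_01", {
    #       "entry_number": "fd_example_01",
    #       "transcript": ["[Scene: a hospital room]", "JOHN: How is she?"],
    #       "recap": "John learns the outcome of the surgery ...",
    #   })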
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        path = dl_manager.download_and_extract(_URLs)
        path = os.path.join(path, "SummScreen")
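        # Layout inside the extracted archive, as implied by the relative
        # paths below:
        #
        #   SummScreen/ForeverDreaming/fd_{train,dev,test}.json
        #   SummScreen/TVMegaSite/tms_{train,dev,test}.json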
        trainpath_fd = os.path.join("ForeverDreaming", "fd_train.json")
        trainpath_tms = os.path.join("TVMegaSite", "tms_train.json")
        trainpaths = [trainpath_fd, trainpath_tms]

        devpath_fd = os.path.join("ForeverDreaming", "fd_dev.json")
        devpath_tms = os.path.join("TVMegaSite", "tms_dev.json")
        devpaths = [devpath_fd, devpath_tms]

        testpath_fd = os.path.join("ForeverDreaming", "fd_test.json")
        testpath_tms = os.path.join("TVMegaSite", "tms_test.json")
        testpaths = [testpath_fd, testpath_tms]

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepaths": (path, trainpaths), "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepaths": (path, devpaths), "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepaths": (path, testpaths), "split": "test"},
            ),
        ]
    def _generate_examples(self, filepaths, split):
        """Yields examples keyed by source filename."""
        path, relative_filepaths = filepaths
        for filepath in relative_filepaths:
            extraction_path = os.path.join(path, filepath)

            with open(extraction_path, "r") as f:
                for line in f:
                    # Each line is a standalone JSON record; the released
                    # files carry "@@ "-style subword separators, which are
                    # stripped to restore plain text before parsing.
                    processed_line = line.replace("@@ ", "")
                    instance = json.loads(processed_line)

                    entry = {}
                    entry["entry_number"] = instance["filename"]
                    entry["transcript"] = instance["Transcript"]
                    # "Recap" is stored as a list; only the first entry is
                    # used as the reference summary.
                    entry["recap"] = instance["Recap"][0]

                    yield entry["entry_number"], entry
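

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the loader). Passing a local script
# path to `load_dataset` works with classic versions of `datasets`; newer
# releases may additionally require `trust_remote_code=True`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    dataset = datasets.load_dataset(__file__)
    print(dataset)  # per-split sizes
    print(dataset["train"][0]["entry_number"])  # ID of the first example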