import os | |
import json | |
import datasets | |
"""Summscreen dataset.""" | |
_CITATION = """ | |
@article{DBLP:journals/corr/abs-2104-07091, | |
author = {Mingda Chen and | |
Zewei Chu and | |
Sam Wiseman and | |
Kevin Gimpel}, | |
title = {SummScreen: {A} Dataset for Abstractive Screenplay Summarization}, | |
journal = {CoRR}, | |
volume = {abs/2104.07091}, | |
year = {2021}, | |
url = {https://arxiv.org/abs/2104.07091}, | |
archivePrefix = {arXiv}, | |
eprint = {2104.07091}, | |
timestamp = {Mon, 19 Apr 2021 16:45:47 +0200}, | |
biburl = {https://dblp.org/rec/journals/corr/abs-2104-07091.bib}, | |
bibsource = {dblp computer science bibliography, https://dblp.org} | |
} | |
""" | |
_DESCRIPTION = """ | |
A summary of scientific papers should ideally incorporate the impact of the papers on the research community | |
reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm), | |
the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain. | |
""" | |
_HOMEPAGE = "https://github.com/mingdachen/SummScreen" | |
_LICENSE = "MIT Licencse" | |
_URLs = "https://drive.google.com/uc?id=1BvdIllGBo9d2-bzXQRzWuJXB04XPVmfF" | |
class SummertimeSummscreen(datasets.GeneratorBasedBuilder):
    """SummScreen dataset: TV episode transcripts paired with human-written recaps."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(),
    ]

    def _info(self):
        """Return dataset metadata (features, description, license, citation)."""
        features = datasets.Features(
            {
                # Unique identifier for the episode (the source filename).
                "entry_number": datasets.Value("string"),
                # Transcript as a sequence of lines.
                "transcript": datasets.features.Sequence(datasets.Value("string")),
                # Human-written recap (a single string).
                "recap": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract the archive and return train/dev/test SplitGenerators.

        The archive contains two sub-corpora (ForeverDreaming and TVMegaSite);
        every split reads from both.
        """
        path = dl_manager.download_and_extract(_URLs)
        path = os.path.join(path, "SummScreen")

        def relative_paths(split):
            # Per-split JSON-lines files for the two sub-corpora.
            return [
                os.path.join("ForeverDreaming", "fd_{}.json".format(split)),
                os.path.join("TVMegaSite", "tms_{}.json".format(split)),
            ]

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepaths": (path, relative_paths("train")), "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepaths": (path, relative_paths("dev")), "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"filepaths": (path, relative_paths("test")), "split": "test"},
            ),
        ]

    def _generate_examples(self, filepaths, split):
        """Yield (key, example) pairs from the JSON-lines files of one split.

        Args:
            filepaths: tuple of (extraction_root, list of relative file paths).
            split: split name ("train"/"dev"/"test"); unused here, kept to
                match the gen_kwargs contract of _split_generators.
        """
        path, relative_filepaths = filepaths
        for filepath in relative_filepaths:
            extraction_path = os.path.join(path, filepath)
            # Explicit UTF-8: the default encoding is platform-dependent and
            # would corrupt non-ASCII transcript text on some systems.
            with open(extraction_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Skip blank lines (e.g. a trailing newline at EOF) so
                    # json.loads does not raise on empty input.
                    if not line.strip():
                        continue
                    # "@@ " is a subword-join marker in the released files;
                    # strip it before parsing.
                    instance = json.loads(line.replace("@@ ", ""))
                    entry = {
                        "entry_number": instance["filename"],
                        "transcript": instance["Transcript"],
                        # Recap is stored as a single-element list.
                        "recap": instance["Recap"][0],
                    }
                    yield entry["entry_number"], entry