|
from os import path |
|
from tqdm import tqdm |
|
from typing import List, Generator, Optional, Union |
|
|
|
from datasets import Dataset |
|
|
|
from dataset.st_dataset import SummInstance, SummDataset |
|
|
|
|
|
|
|
# Absolute path of the directory containing this file.
FILE_DIRECTORY_PATH = path.dirname(path.realpath(__file__))

# Directory holding the custom builder scripts for datasets that are not
# hosted on the huggingface hub (see the classes with huggingface_dataset=False).
BASE_NONHUGGINGFACE_DATASETS_PATH = path.join(

    FILE_DIRECTORY_PATH, "non_huggingface_datasets_builders"

)
|
|
|
|
|
|
|
|
|
|
|
class CnndmDataset(SummDataset):
    """
    The CNN/DM dataset: news articles paired with bullet-point highlight
    summaries, loaded from the huggingface hub.
    """

    dataset_name = "CNN/DailyMail"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/cnn_dailymail"

    def __init__(self):
        # Pin dataset version 3.0.0 for reproducibility.
        super().__init__(
            dataset_args=(
                "cnn_dailymail",
                "3.0.0",
            )
        )

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Wraps every raw record of the split into a SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            yield SummInstance(
                source=record["article"], summary=record["highlights"]
            )
|
|
|
|
|
class MultinewsDataset(SummDataset):
    """
    The Multi News dataset: clusters of related news articles paired with a
    single human-written summary (multi-document summarization).
    """

    dataset_name = "Multinews"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = True

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/multi_news"

    def __init__(self):
        super().__init__(dataset_args=("multi_news",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Splits each record's concatenated source documents apart and wraps
        them with their summary into a SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            # The raw field joins the cluster's articles with "|||||";
            # filter(None, ...) discards empty fragments from the split.
            docs: list = list(filter(None, record["document"].split("|||||")))
            yield SummInstance(source=docs, summary=record["summary"])
|
|
|
|
|
class SamsumDataset(SummDataset):
    """
    The SAMsum Dataset: messenger-style dialogues paired with short
    abstractive summaries.
    """

    dataset_name = "Samsum"

    is_query_based = False
    is_dialogue_based = True
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/samsum"

    def __init__(self):
        super().__init__(dataset_args=("samsum",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Splits each raw dialogue into utterances and wraps it with its
        summary into a SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            # Utterances in the raw dialogue string are CRLF-separated.
            turns: List = record["dialogue"].split("\r\n")
            yield SummInstance(source=turns, summary=record["summary"])
|
|
|
|
|
class XsumDataset(SummDataset):
    """
    The Xsum Dataset: BBC articles paired with single-sentence, highly
    abstractive summaries.
    """

    dataset_name = "Xsum"

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/xsum"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    def __init__(self):
        super().__init__(dataset_args=("xsum",))

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Wraps every raw record of the split into a SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            yield SummInstance(
                source=record["document"], summary=record["summary"]
            )
|
|
|
|
|
class PubmedqaDataset(SummDataset):
    """
    The Pubmed QA dataset: biomedical research questions with supporting
    contexts and long-form answers (query-based summarization).
    """

    dataset_name = "Pubmedqa"

    is_query_based = True
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/pubmed_qa"

    def __init__(self, seed=None):
        # NOTE: `seed` is accepted for interface compatibility but is not
        # used when loading this huggingface-hosted configuration.
        super().__init__(
            dataset_args=(
                "pubmed_qa",
                "pqa_artificial",
            )
        )

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Joins each record's contexts into one source string and wraps it with
        the long answer (summary) and question (query) into a SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            yield SummInstance(
                source=" ".join(record["context"]["contexts"]),
                summary=record["long_answer"],
                query=record["question"],
            )
|
|
|
|
|
class MlsumDataset(SummDataset):
    """
    The MLsum Dataset - A multi-lingual dataset featuring 5 languages
    Includes 1.5 million news articles and their corresponding summaries

    "de" - German
    "es" - Spanish
    "fr" - French
    "ru" - Russian
    "tu" - Turkish
    """

    dataset_name = "MlSum"

    is_query_based = False
    is_dialogue_based = False
    is_multi_document = False

    huggingface_dataset = True
    huggingface_page = "https://huggingface.co/datasets/mlsum"
    supported_languages = ["de", "es", "fr", "ru", "tu"]

    mlsum_instantiation_guide = """The languages supported for the Mlsum Dataset are:
                de - German
                es - Spanish
                fr - French
                ru - Russian
                tu - Turkish

                Examples to instantiate the dataset:
                1. Dataset with only one language
                   dataset = MlsumDataset({language_token})
                   dataset = MlsumDataset("es")
                   dataset = MlsumDataset("tu")...

                2. Dataset with a multiple languages
                   dataset = MlsumDataset({list of language_token})
                   dataset = MlsumDataset(["es","de"])
                   dataset = MlsumDataset(["es","de", "tu"])...

                3. Dataset with all supported languages (default)
                   dataset = MlsumDataset(all)
                   dataset = MlsumDataset()
                """

    def __init__(self, languages: Optional[Union[str, List[str]]] = "all"):
        super().__init__(dataset_args=(languages,))

    def _load_dataset_safe(self, languages: Optional[Union[str, List[str]]]):
        """
        Overrides the parent class method
        Method loads multiple datasets of different languages provided in :param languages:
        It then concatenates these datasets into one combined dataset
        :rtype: datasetDict containing the combined dataset
        :param languages: Optional, either a string or list of strings specifying the languages
            to load
        """
        print(MlsumDataset.mlsum_instantiation_guide)

        # Resolve the requested languages to a concrete list.
        # BUGFIX: validation was previously wrapped in `assert`, which is
        # stripped when Python runs with -O, silently skipping language
        # validation. `is_supported` raises ValueError on its own, so it is
        # called directly instead.
        if languages == "all":
            selected_languages = MlsumDataset.supported_languages
        elif isinstance(languages, list):
            for language in languages:
                self.is_supported(language)
            selected_languages = languages
        else:
            self.is_supported(languages)
            selected_languages = [languages]

        # Load each per-language dataset, then merge them into one.
        language_datasets = []
        for language in selected_languages:
            dataset = super()._load_dataset_safe(
                "mlsum",
                language,
            )
            language_datasets.append(dataset)

        mlsum_dataset = self._concatenate_dataset_dicts(language_datasets)

        return mlsum_dataset

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object
        :param dataset: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            article: List = instance["text"]
            summary: str = instance["summary"]
            summ_instance = SummInstance(source=article, summary=summary)

            yield summ_instance

    def is_supported(self, language: str):
        """
        Checks whether the requested langues is supported
        :param language: string containing the requested language
        :rtype bool:
        :raises ValueError: if the language is not one of the supported codes
        """
        if language not in MlsumDataset.supported_languages:
            print(MlsumDataset.mlsum_instantiation_guide)
            raise ValueError(
                f"The language(s): '{language}' entered is not supported. See above message for usage info"
            )
        else:
            return True
|
|
|
|
|
|
|
|
|
|
|
class ScisummnetDataset(SummDataset):
    """
    The SciSummNet dataset. As a dataset not included by huggingface, we need to do manually download, set basic
    information for the dataset
    """

    dataset_name = "ScisummNet"

    version = "1.1.0"
    description = (
        "A summary of scientific papers should ideally incorporate the impact of the papers on the "
        "research community reflected by citations. To facilitate research in citation-aware scientific "
        "paper summarization (Scisumm), the CL-Scisumm shared task has been organized since 2014 for "
        "papers in the computational linguistics and NLP domain."
    )

    is_dialogue_based = False
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    # Loaded via a local builder script rather than the huggingface hub.
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self, seed=None):
        # NOTE: `seed` is accepted for interface compatibility but unused.
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Pairs each paper's XML body and annotated citing sentences with the
        gold summary inside a SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            # The source is two documents: the paper itself plus its
            # citing sentences.
            sources: List = [
                record["document_xml"],
                record["citing_sentences_annotated.json"],
            ]
            yield SummInstance(source=sources, summary=record["summary"])
|
|
|
|
|
class SummscreenDataset(SummDataset):
    """
    The SummScreen dataset. As a dataset not included by huggingface, we need to do manually download, set basic
    information for the dataset
    """

    dataset_name = "Summscreen"

    version = "1.1.0"
    is_dialogue_based = True
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    # Loaded via a local builder script rather than the huggingface hub.
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self, seed=None):
        # NOTE: `seed` is accepted for interface compatibility but unused.
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Pairs each episode transcript with its recap (the summary) inside a
        SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            transcript: List = record["transcript"]
            yield SummInstance(source=transcript, summary=record["recap"])
|
|
|
|
|
class QMsumDataset(SummDataset):
    """
    QMSum Dataset
    """

    dataset_name = "QMsum"
    description = """
    QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task,
    which consists of 1,808 query-summary pairs over 232 meetings in multiple domains.
    """

    is_dialogue_based = True
    is_multi_document = False
    is_query_based = True

    huggingface_dataset = False
    # Loaded via a local builder script rather than the huggingface hub.
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self):
        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method
        This method processes the data contained in the dataset
        and puts each data instance into a SummInstance object
        :param dataset: a train/validation/test dataset
        :rtype: a generator yielding SummInstance objects
        """
        for instance in tqdm(data):
            # PERF: the formatted transcript is identical for every query on
            # the same meeting, so build it once per meeting instead of once
            # per query (previously rebuilt inside the inner loop).
            meeting: List = [
                utterance["speaker"] + " : " + utterance["content"]
                for utterance in instance["meeting_transcripts"]
            ]
            for query_set in (
                instance["general_query_list"] + instance["specific_query_list"]
            ):
                query: str = query_set["query"]
                summary: str = query_set["answer"]
                summ_instance = SummInstance(
                    source=meeting, summary=summary, query=query
                )

                yield summ_instance
|
|
|
|
|
class ArxivDataset(SummDataset):
    """
    The Arxiv Dataset
    """

    dataset_name = "Arxiv_longsummarization"
    description = """
    A summarization dataset comprised of pairs of scientific papers.
    The dataset provides a challenging testbed for abstractive summarization.
    It contains papers and their abstracts.
    """

    is_dialogue_based = False
    is_multi_document = False
    is_query_based = False

    huggingface_dataset = False
    # Loaded via a local builder script rather than the huggingface hub.
    builder_script_path = path.join(
        BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
    )

    def __init__(self):
        # Warn the user up front: the download is large and extraction
        # needs significant disk space.
        print(
            "*****************",
            "***Attention***",
            "This dataset is quite large (approx 5Gb and will need about 15 Gb for the extraction process",
            "Cancel/interrupt the download if size and time constraints will not be met",
            "*****************",
            sep="\n",
        )

        super().__init__()

    def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
        """
        Overrides the SummDataset '_process_data()' method.
        Pairs each paper's body text with its joined abstract sentences
        inside a SummInstance.

        :param data: a train/validation/test dataset split
        :rtype: a generator yielding SummInstance objects
        """
        for record in tqdm(data):
            # The abstract is stored as a list of sentences; join it into
            # one summary string.
            yield SummInstance(
                source=record["article_text"],
                summary=" ".join(record["abstract_text"]),
            )
|
|