# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The Tweet Eval Datasets"""


import datasets


_CITATION = """\
@inproceedings{barbieri2020tweeteval,
  title={{TweetEval:Unified Benchmark and Comparative Evaluation for Tweet Classification}},
  author={Barbieri, Francesco and Camacho-Collados, Jose and Espinosa-Anke, Luis and Neves, Leonardo},
  booktitle={Proceedings of Findings of EMNLP},
  year={2020}
}
"""

_DESCRIPTION = """\
TweetEval consists of seven heterogenous tasks in Twitter, all framed as multi-class tweet classification. All tasks have been unified into the same benchmark, with each dataset presented in the same format and with fixed training, validation and test splits.
"""

_HOMEPAGE = "https://github.com/cardiffnlp/tweeteval"

_LICENSE = ""

URL = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/"

# File-name building blocks. The nesting order (split outer, kind inner)
# fixes the key order of every per-task URL dict:
# train_text, train_labels, test_text, test_labels, val_text, val_labels.
_SPLIT_PREFIXES = ("train", "test", "val")
_FILE_KINDS = ("text", "labels")

# Tasks whose files live directly under URL + "<task>/".
_SINGLE_TASKS = ("emoji", "emotion", "hate", "irony", "offensive", "sentiment")

# Stance topics; their files live under URL + "stance/<topic>/".
_STANCE_TOPICS = ("abortion", "atheism", "climate", "feminist", "hillary")


def _task_urls(task_dir):
    """Return the six download URLs for the task stored under ``task_dir``.

    Args:
        task_dir: Path of the task relative to ``URL``, without a trailing
            slash (e.g. ``"emoji"`` or ``"stance/abortion"``).

    Returns:
        A dict mapping ``"<split>_<kind>"`` (e.g. ``"train_text"``) to the
        URL ``URL + task_dir + "/<split>_<kind>.txt"``.
    """
    return {
        f"{split}_{kind}": f"{URL}{task_dir}/{split}_{kind}.txt"
        for split in _SPLIT_PREFIXES
        for kind in _FILE_KINDS
    }


# Task name -> URL dict; "stance" is one level deeper (topic -> URL dict).
# Insertion order matters: it determines the order of BUILDER_CONFIGS.
_URLs = {task: _task_urls(task) for task in _SINGLE_TASKS}
_URLs["stance"] = {topic: _task_urls(f"stance/{topic}") for topic in _STANCE_TOPICS}

# Class-label names per task type. The labels read from the label files are
# integers, so the position of a name in its list is significant.
_LABEL_NAMES = {
    "stance": ["none", "against", "favor"],
    "sentiment": ["negative", "neutral", "positive"],
    "offensive": ["non-offensive", "offensive"],
    "irony": ["non_irony", "irony"],
    "hate": ["non-hate", "hate"],
    # Written as ASCII escapes so the emoji cannot be corrupted if this file
    # is ever re-saved with the wrong encoding.
    "emoji": [
        "\u2764",  # red heart
        "\U0001F60D",  # smiling face with heart-eyes
        "\U0001F602",  # face with tears of joy
        "\U0001F495",  # two hearts
        "\U0001F525",  # fire
        "\U0001F60A",  # smiling face with smiling eyes
        "\U0001F60E",  # smiling face with sunglasses
        "\u2728",  # sparkles
        "\U0001F499",  # blue heart
        "\U0001F618",  # face blowing a kiss
        "\U0001F4F7",  # camera
        "\U0001F1FA\U0001F1F8",  # flag: United States (regional indicators U + S)
        "\u2600",  # sun
        "\U0001F49C",  # purple heart
        "\U0001F609",  # winking face
        "\U0001F4AF",  # hundred points
        "\U0001F601",  # beaming face with smiling eyes
        "\U0001F384",  # Christmas tree
        "\U0001F4F8",  # camera with flash
        "\U0001F61C",  # winking face with tongue
    ],
    "emotion": ["anger", "joy", "optimism", "sadness"],
}


class TweetEvalConfig(datasets.BuilderConfig):
    """BuilderConfig for a single TweetEval task.

    The config name is ``"<type>"`` for every task except stance, where it is
    ``"stance_<sub_type>"``.

    Args:
        type: Task identifier (a key of ``_URLs``). The parameter shadows the
            ``type`` builtin; the name is kept because callers pass it by
            keyword.
        sub_type: Stance topic (a key of ``_URLs["stance"]``); only used when
            ``type`` is ``"stance"``.
        *args, **kwargs: Forwarded to ``datasets.BuilderConfig``.
    """

    def __init__(self, *args, type=None, sub_type=None, **kwargs):
        config_name = f"{type}_{sub_type}" if type == "stance" else f"{type}"
        super().__init__(*args, name=config_name, **kwargs)
        self.type = type
        self.sub_type = sub_type


class TweetEval(datasets.GeneratorBasedBuilder):
    """TweetEval Dataset."""

    # One config per non-stance task, followed by one config per stance topic.
    BUILDER_CONFIGS = [
        TweetEvalConfig(
            type=task,
            sub_type=None,
            version=datasets.Version("1.1.0"),
            description=f"This part of my dataset covers {task} part of TweetEval Dataset.",
        )
        for task in _URLs
        if task != "stance"
    ] + [
        TweetEvalConfig(
            type="stance",
            sub_type=topic,
            version=datasets.Version("1.1.0"),
            description=f"This part of my dataset covers stance_{topic} part of TweetEval Dataset.",
        )
        for topic in _URLs["stance"]
    ]

    def _info(self):
        """Return the DatasetInfo (features and metadata) for the selected config.

        Each example has a string ``text`` and a ``label`` ClassLabel whose
        names depend on ``self.config.type``.
        """
        # Task types without their own entry fall back to the emotion labels.
        names = _LABEL_NAMES.get(self.config.type, _LABEL_NAMES["emotion"])

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                    # Copy so the shared module-level list is never handed out.
                    "label": datasets.features.ClassLabel(names=list(names)),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        Downloads the six files of the selected task and returns the train,
        test and validation split generators, in that order.
        """
        if self.config.type != "stance":
            my_urls = _URLs[self.config.type]
        else:
            my_urls = _URLs[self.config.type][self.config.sub_type]
        # Indexed below with the same keys as ``my_urls``.
        data_dir = dl_manager.download_and_extract(my_urls)

        splits = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "val"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "text_path": data_dir[f"{prefix}_text"],
                    "labels_path": data_dir[f"{prefix}_labels"],
                },
            )
            for split_name, prefix in splits
        ]

    def _generate_examples(self, text_path, labels_path):
        """Yields examples.

        Args:
            text_path: Path to a UTF-8 file read as one tweet per line.
            labels_path: Path to a UTF-8 file read as one integer label per
                line; line ``i`` labels line ``i`` of ``text_path``.

        Yields:
            ``(index, {"text": str, "label": int})`` pairs, with surrounding
            whitespace stripped from both fields.

        Raises:
            IndexError: If the text file has more lines than the labels file.
            ValueError: If a label line is not an integer.
        """
        with open(text_path, encoding="utf-8") as text_file, open(labels_path, encoding="utf-8") as labels_file:
            labels = labels_file.readlines()
            # Stream the tweets instead of loading the whole file in memory.
            for idx, text in enumerate(text_file):
                yield idx, {"text": text.strip(), "label": int(labels[idx].strip())}