shivam
/

split-test

Model card Files Files and versions Community

shivam commited on Mar 17, 2022

Commit

b4bb178

1 Parent(s): 6305210

Initial commit

Browse files

Files changed (3) hide show

split-test.py +249 -0
text.en +3 -0
text.hi +3 -0

split-test.py ADDED Viewed

	@@ -0,0 +1,249 @@

+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The Tweet Eval Datasets"""
+import datasets
+_CITATION = """\
+@inproceedings{barbieri2020tweeteval,
+  title={{TweetEval:Unified Benchmark and Comparative Evaluation for Tweet Classification}},
+  author={Barbieri, Francesco and Camacho-Collados, Jose and Espinosa-Anke, Luis and Neves, Leonardo},
+  booktitle={Proceedings of Findings of EMNLP},
+  year={2020}
+}
+"""
+_DESCRIPTION = """\
+TweetEval consists of seven heterogenous tasks in Twitter, all framed as multi-class tweet classification. All tasks have been unified into the same benchmark, with each dataset presented in the same format and with fixed training, validation and test splits.
+"""
+_HOMEPAGE = "https://github.com/cardiffnlp/tweeteval"
+_LICENSE = ""
+URL = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/"
+_URLs = {
+    "emoji": {
+        "train_text": URL + "emoji/train_text.txt",
+        "train_labels": URL + "emoji/train_labels.txt",
+        "test_text": URL + "emoji/test_text.txt",
+        "test_labels": URL + "emoji/test_labels.txt",
+        "val_text": URL + "emoji/val_text.txt",
+        "val_labels": URL + "emoji/val_labels.txt",
+    },
+    "emotion": {
+        "train_text": URL + "emotion/train_text.txt",
+        "train_labels": URL + "emotion/train_labels.txt",
+        "test_text": URL + "emotion/test_text.txt",
+        "test_labels": URL + "emotion/test_labels.txt",
+        "val_text": URL + "emotion/val_text.txt",
+        "val_labels": URL + "emotion/val_labels.txt",
+    },
+    "hate": {
+        "train_text": URL + "hate/train_text.txt",
+        "train_labels": URL + "hate/train_labels.txt",
+        "test_text": URL + "hate/test_text.txt",
+        "test_labels": URL + "hate/test_labels.txt",
+        "val_text": URL + "hate/val_text.txt",
+        "val_labels": URL + "hate/val_labels.txt",
+    },
+    "irony": {
+        "train_text": URL + "irony/train_text.txt",
+        "train_labels": URL + "irony/train_labels.txt",
+        "test_text": URL + "irony/test_text.txt",
+        "test_labels": URL + "irony/test_labels.txt",
+        "val_text": URL + "irony/val_text.txt",
+        "val_labels": URL + "irony/val_labels.txt",
+    },
+    "offensive": {
+        "train_text": URL + "offensive/train_text.txt",
+        "train_labels": URL + "offensive/train_labels.txt",
+        "test_text": URL + "offensive/test_text.txt",
+        "test_labels": URL + "offensive/test_labels.txt",
+        "val_text": URL + "offensive/val_text.txt",
+        "val_labels": URL + "offensive/val_labels.txt",
+    },
+    "sentiment": {
+        "train_text": URL + "sentiment/train_text.txt",
+        "train_labels": URL + "sentiment/train_labels.txt",
+        "test_text": URL + "sentiment/test_text.txt",
+        "test_labels": URL + "sentiment/test_labels.txt",
+        "val_text": URL + "sentiment/val_text.txt",
+        "val_labels": URL + "sentiment/val_labels.txt",
+    },
+    "stance": {
+        "abortion": {
+            "train_text": URL + "stance/abortion/train_text.txt",
+            "train_labels": URL + "stance/abortion/train_labels.txt",
+            "test_text": URL + "stance/abortion/test_text.txt",
+            "test_labels": URL + "stance/abortion/test_labels.txt",
+            "val_text": URL + "stance/abortion/val_text.txt",
+            "val_labels": URL + "stance/abortion/val_labels.txt",
+        },
+        "atheism": {
+            "train_text": URL + "stance/atheism/train_text.txt",
+            "train_labels": URL + "stance/atheism/train_labels.txt",
+            "test_text": URL + "stance/atheism/test_text.txt",
+            "test_labels": URL + "stance/atheism/test_labels.txt",
+            "val_text": URL + "stance/atheism/val_text.txt",
+            "val_labels": URL + "stance/atheism/val_labels.txt",
+        },
+        "climate": {
+            "train_text": URL + "stance/climate/train_text.txt",
+            "train_labels": URL + "stance/climate/train_labels.txt",
+            "test_text": URL + "stance/climate/test_text.txt",
+            "test_labels": URL + "stance/climate/test_labels.txt",
+            "val_text": URL + "stance/climate/val_text.txt",
+            "val_labels": URL + "stance/climate/val_labels.txt",
+        },
+        "feminist": {
+            "train_text": URL + "stance/feminist/train_text.txt",
+            "train_labels": URL + "stance/feminist/train_labels.txt",
+            "test_text": URL + "stance/feminist/test_text.txt",
+            "test_labels": URL + "stance/feminist/test_labels.txt",
+            "val_text": URL + "stance/feminist/val_text.txt",
+            "val_labels": URL + "stance/feminist/val_labels.txt",
+        },
+        "hillary": {
+            "train_text": URL + "stance/hillary/train_text.txt",
+            "train_labels": URL + "stance/hillary/train_labels.txt",
+            "test_text": URL + "stance/hillary/test_text.txt",
+            "test_labels": URL + "stance/hillary/test_labels.txt",
+            "val_text": URL + "stance/hillary/val_text.txt",
+            "val_labels": URL + "stance/hillary/val_labels.txt",
+        },
+    },
+}
+class TweetEvalConfig(datasets.BuilderConfig):
+    def __init__(self, *args, type=None, sub_type=None, **kwargs):
+        super().__init__(
+            *args,
+            name=f"{type}" if type != "stance" else f"{type}_{sub_type}",
+            **kwargs,
+        )
+        self.type = type
+        self.sub_type = sub_type
+class TweetEval(datasets.GeneratorBasedBuilder):
+    """TweetEval Dataset."""
+    BUILDER_CONFIGS = [
+        TweetEvalConfig(
+            type=key,
+            sub_type=None,
+            version=datasets.Version("1.1.0"),
+            description=f"This part of my dataset covers {key} part of TweetEval Dataset.",
+        )
+        for key in list(_URLs.keys())
+        if key != "stance"
+    ] + [
+        TweetEvalConfig(
+            type="stance",
+            sub_type=key,
+            version=datasets.Version("1.1.0"),
+            description=f"This part of my dataset covers stance_{key} part of TweetEval Dataset.",
+        )
+        for key in list(_URLs["stance"].keys())
+    ]
+    def _info(self):
+        if self.config.type == "stance":
+            names = ["none", "against", "favor"]
+        elif self.config.type == "sentiment":
+            names = ["negative", "neutral", "positive"]
+        elif self.config.type == "offensive":
+            names = ["non-offensive", "offensive"]
+        elif self.config.type == "irony":
+            names = ["non_irony", "irony"]
+        elif self.config.type == "hate":
+            names = ["non-hate", "hate"]
+        elif self.config.type == "emoji":
+            names = [
+                "❤",
+                "😍",
+                "😂",
+                "💕",
+                "🔥",
+                "😊",
+                "😎",
+                "✨",
+                "💙",
+                "😘",
+                "📷",
+                "🇺🇸",
+                "☀",
+                "💜",
+                "😉",
+                "💯",
+                "😁",
+                "🎄",
+                "📸",
+                "😜",
+            ]
+        else:
+            names = ["anger", "joy", "optimism", "sadness"]
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=names)}
+            ),
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        if self.config.type != "stance":
+            my_urls = _URLs[self.config.type]
+        else:
+            my_urls = _URLs[self.config.type][self.config.sub_type]
+        data_dir = dl_manager.download_and_extract(my_urls)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"text_path": data_dir["train_text"], "labels_path": data_dir["train_labels"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"text_path": data_dir["test_text"], "labels_path": data_dir["test_labels"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"text_path": data_dir["val_text"], "labels_path": data_dir["val_labels"]},
+            ),
+        ]
+    def _generate_examples(self, text_path, labels_path):
+        """Yields examples."""
+        with open(text_path, encoding="utf-8") as f:
+            texts = f.readlines()
+        with open(labels_path, encoding="utf-8") as f:
+            labels = f.readlines()
+        for i, text in enumerate(texts):
+            yield i, {"text": text.strip(), "label": int(labels[i].strip())}

text.en ADDED Viewed

	@@ -0,0 +1,3 @@

+english
+tree
+tall

text.hi ADDED Viewed

	@@ -0,0 +1,3 @@

+hindi
+ped
+uncha