shivam committed on
Commit
b4bb178
·
1 Parent(s): 6305210

Initial commit

Browse files
Files changed (3) hide show
  1. split-test.py +249 -0
  2. text.en +3 -0
  3. text.hi +3 -0
split-test.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """The Tweet Eval Datasets"""
16
+
17
+
18
+ import datasets
19
+
20
+
21
# BibTeX entry handed to DatasetInfo as the citation.
_CITATION = """\
@inproceedings{barbieri2020tweeteval,
title={{TweetEval:Unified Benchmark and Comparative Evaluation for Tweet Classification}},
author={Barbieri, Francesco and Camacho-Collados, Jose and Espinosa-Anke, Luis and Neves, Leonardo},
booktitle={Proceedings of Findings of EMNLP},
year={2020}
}
"""

# Free-text summary handed to DatasetInfo as the description.
_DESCRIPTION = """\
TweetEval consists of seven heterogenous tasks in Twitter, all framed as multi-class tweet classification. All tasks have been unified into the same benchmark, with each dataset presented in the same format and with fixed training, validation and test splits.
"""

_HOMEPAGE = "https://github.com/cardiffnlp/tweeteval"

# Left empty: no license string is declared in this script.
_LICENSE = ""

# Common prefix of every data file URL below.
URL = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/"

# Tasks whose files sit directly under URL, in the order their configs are listed.
_TASKS = ("emoji", "emotion", "hate", "irony", "offensive", "sentiment")

# Topics of the "stance" task; each has its own sub-folder under "stance/".
_STANCE_TOPICS = ("abortion", "atheism", "climate", "feminist", "hillary")

# Dictionary keys (and file stems) of the six files every folder provides.
_FILE_KEYS = (
    "train_text",
    "train_labels",
    "test_text",
    "test_labels",
    "val_text",
    "val_labels",
)


def _file_urls(folder):
    """Return a dict mapping each file key to its ``.txt`` URL inside *folder*."""
    return {file_key: URL + folder + "/" + file_key + ".txt" for file_key in _FILE_KEYS}


# Flat tasks map straight to their six files; "stance" nests one such
# mapping per topic and is inserted last so key order stays unchanged.
_URLs = {task: _file_urls(task) for task in _TASKS}
_URLs["stance"] = {topic: _file_urls("stance/" + topic) for topic in _STANCE_TOPICS}
132
+
133
+
134
class TweetEvalConfig(datasets.BuilderConfig):
    """Builder configuration identifying one TweetEval task.

    The configuration name is derived from the task: ``"<type>"`` for every
    task except ``"stance"``, which becomes ``"stance_<sub_type>"``.
    """

    def __init__(self, *args, type=None, sub_type=None, **kwargs):
        """Create the configuration.

        Args:
            *args: Positional arguments forwarded to ``datasets.BuilderConfig``.
            type: Task name; stored on the instance as ``type``.
            sub_type: Topic of the ``"stance"`` task; stored as ``sub_type``
                and only used in the name when ``type`` is ``"stance"``.
            **kwargs: Keyword arguments forwarded to ``datasets.BuilderConfig``.
        """
        if type == "stance":
            config_name = f"{type}_{sub_type}"
        else:
            config_name = f"{type}"
        super().__init__(*args, name=config_name, **kwargs)
        self.type = type
        self.sub_type = sub_type
143
+
144
+
145
class TweetEval(datasets.GeneratorBasedBuilder):
    """Dataset builder for the TweetEval tweet-classification tasks.

    One configuration is created per top-level key of ``_URLs``; the nested
    ``"stance"`` entry contributes one configuration per topic instead.
    Every configuration has a string ``text`` feature and a ``label`` class
    label, in train, test and validation splits.
    """

    BUILDER_CONFIGS = [
        TweetEvalConfig(
            type=key,
            sub_type=None,
            version=datasets.Version("1.1.0"),
            description=f"This part of my dataset covers {key} part of TweetEval Dataset.",
        )
        for key in _URLs
        if key != "stance"
    ] + [
        TweetEvalConfig(
            type="stance",
            sub_type=key,
            version=datasets.Version("1.1.0"),
            description=f"This part of my dataset covers stance_{key} part of TweetEval Dataset.",
        )
        for key in _URLs["stance"]
    ]

    # Class-label names per task. The list order is what ClassLabel receives,
    # so it must not be changed.
    _LABEL_NAMES = {
        "stance": ["none", "against", "favor"],
        "sentiment": ["negative", "neutral", "positive"],
        "offensive": ["non-offensive", "offensive"],
        "irony": ["non_irony", "irony"],
        "hate": ["non-hate", "hate"],
        "emoji": [
            "❤",
            "😍",
            "😂",
            "💕",
            "🔥",
            "😊",
            "😎",
            "✨",
            "💙",
            "😘",
            "📷",
            "🇺🇸",
            "☀",
            "💜",
            "😉",
            "💯",
            "😁",
            "🎄",
            "📸",
            "😜",
        ],
        "emotion": ["anger", "joy", "optimism", "sadness"],
    }

    def _info(self):
        """Return the ``DatasetInfo`` for the active configuration.

        The label names are chosen from ``self.config.type``. Any type that
        is not listed in ``_LABEL_NAMES`` falls back to the emotion labels,
        as the previous if/elif chain did.
        """
        names = self._LABEL_NAMES.get(self.config.type, self._LABEL_NAMES["emotion"])
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                    # Pass a copy so the shared class-level list is never handed out.
                    "label": datasets.features.ClassLabel(names=list(names)),
                }
            ),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the train, test and validation ``SplitGenerator`` objects.

        Args:
            dl_manager: Download manager; its ``download_and_extract`` is
                called with the URL mapping of the active configuration and
                the result is indexed by the same six file keys.
        """
        if self.config.type != "stance":
            my_urls = _URLs[self.config.type]
        else:
            # Stance URLs are nested one level deeper, keyed by topic.
            my_urls = _URLs[self.config.type][self.config.sub_type]
        data_dir = dl_manager.download_and_extract(my_urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"text_path": data_dir["train_text"], "labels_path": data_dir["train_labels"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"text_path": data_dir["test_text"], "labels_path": data_dir["test_labels"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={"text_path": data_dir["val_text"], "labels_path": data_dir["val_labels"]},
            ),
        ]

    def _generate_examples(self, text_path, labels_path):
        """Yield ``(index, example)`` pairs from two line-aligned files.

        Line ``i`` of ``labels_path`` is parsed as the integer label of line
        ``i`` of ``text_path``. Both files are read one line at a time rather
        than loaded whole.

        Args:
            text_path: Path of the UTF-8 file holding one text per line.
            labels_path: Path of the UTF-8 file holding one integer per line.

        Yields:
            Tuples of the zero-based line index and a dict with the stripped
            ``text`` and the integer ``label``.

        Raises:
            IndexError: If the labels file has fewer lines than the text file.
            ValueError: If a label line is not an integer.
        """
        with open(text_path, encoding="utf-8") as text_file, open(labels_path, encoding="utf-8") as labels_file:
            for idx, text in enumerate(text_file):
                label_line = labels_file.readline()
                # readline() returns "" only at end of file; a blank line is "\n".
                if not label_line:
                    raise IndexError(f"no label for line {idx} of {text_path} in {labels_path}")
                yield idx, {"text": text.strip(), "label": int(label_line.strip())}
text.en ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ english
2
+ tree
3
+ tall
text.hi ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ hindi
2
+ ped
3
+ uncha