Upload 2 files
Browse files- bigbench.py +288 -0
- mmlu.py +160 -0
bigbench.py
ADDED
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
|
3 |
+
# Lint as: python3
|
4 |
+
"""bigbench datasets"""
|
5 |
+
|
6 |
+
from __future__ import absolute_import, division, print_function
|
7 |
+
|
8 |
+
import json
|
9 |
+
import os
|
10 |
+
import textwrap
|
11 |
+
import six
|
12 |
+
import datasets
|
13 |
+
|
14 |
+
|
15 |
+
# BibTeX entry for the BIG-bench paper. bigbench_Config stores a dedented copy
# of it as `citation`, and the builder passes it on to DatasetInfo.
CITATION = r"""
@article{srivastava2022beyond,
title={Beyond the imitation game: Quantifying and extrapolating the capabilities of language models},
author={Srivastava, Aarohi and Rastogi, Abhinav and Rao, Abhishek and Shoeb, Abu Awal Md and Abid, Abubakar and Fisch, Adam and Brown, Adam R and Santoro, Adam and Gupta, Aditya and Garriga-Alonso, Adri{\`a} and others},
journal={arXiv preprint arXiv:2206.04615},
year={2022}
}
"""

# Dataset-level description passed to DatasetInfo.
DESCRIPTION = """\
bigbench json tasks
"""

# Zip archive downloaded and extracted by the builder. The builder looks for
# `<config name>/train.jsonl` and `<config name>/validation.jsonl` inside it.
DATA_URL = "https://www.dropbox.com/s/cjdywlalikdb1c6/bigbench.zip?dl=1"
|
29 |
+
|
30 |
+
CONFIGS=['abstract_narrative_understanding',
|
31 |
+
'anachronisms',
|
32 |
+
'analogical_similarity',
|
33 |
+
'analytic_entailment',
|
34 |
+
'arithmetic',
|
35 |
+
'ascii_word_recognition',
|
36 |
+
'authorship_verification',
|
37 |
+
'auto_categorization',
|
38 |
+
'auto_debugging',
|
39 |
+
'bbq_lite_json',
|
40 |
+
'bridging_anaphora_resolution_barqa',
|
41 |
+
'causal_judgment',
|
42 |
+
'cause_and_effect',
|
43 |
+
'checkmate_in_one',
|
44 |
+
'chess_state_tracking',
|
45 |
+
'chinese_remainder_theorem',
|
46 |
+
'cifar10_classification',
|
47 |
+
'code_line_description',
|
48 |
+
'codenames',
|
49 |
+
'color',
|
50 |
+
'common_morpheme',
|
51 |
+
'conceptual_combinations',
|
52 |
+
'conlang_translation',
|
53 |
+
'contextual_parametric_knowledge_conflicts',
|
54 |
+
'crash_blossom',
|
55 |
+
'crass_ai',
|
56 |
+
'cryobiology_spanish',
|
57 |
+
'cryptonite',
|
58 |
+
'cs_algorithms',
|
59 |
+
'dark_humor_detection',
|
60 |
+
'date_understanding',
|
61 |
+
'disambiguation_qa',
|
62 |
+
'discourse_marker_prediction',
|
63 |
+
'disfl_qa',
|
64 |
+
'dyck_languages',
|
65 |
+
'elementary_math_qa',
|
66 |
+
'emoji_movie',
|
67 |
+
'emojis_emotion_prediction',
|
68 |
+
'empirical_judgments',
|
69 |
+
'english_proverbs',
|
70 |
+
'english_russian_proverbs',
|
71 |
+
'entailed_polarity',
|
72 |
+
'entailed_polarity_hindi',
|
73 |
+
'epistemic_reasoning',
|
74 |
+
'evaluating_information_essentiality',
|
75 |
+
'fact_checker',
|
76 |
+
'fantasy_reasoning',
|
77 |
+
'few_shot_nlg',
|
78 |
+
'figure_of_speech_detection',
|
79 |
+
'formal_fallacies_syllogisms_negation',
|
80 |
+
'gem',
|
81 |
+
'gender_inclusive_sentences_german',
|
82 |
+
'general_knowledge',
|
83 |
+
'geometric_shapes',
|
84 |
+
'goal_step_wikihow',
|
85 |
+
'gre_reading_comprehension',
|
86 |
+
'hhh_alignment',
|
87 |
+
'hindi_question_answering',
|
88 |
+
'hindu_knowledge',
|
89 |
+
'hinglish_toxicity',
|
90 |
+
'human_organs_senses',
|
91 |
+
'hyperbaton',
|
92 |
+
'identify_math_theorems',
|
93 |
+
'identify_odd_metaphor',
|
94 |
+
'implicatures',
|
95 |
+
'implicit_relations',
|
96 |
+
'indic_cause_and_effect',
|
97 |
+
'intent_recognition',
|
98 |
+
'international_phonetic_alphabet_nli',
|
99 |
+
'international_phonetic_alphabet_transliterate',
|
100 |
+
'intersect_geometry',
|
101 |
+
'irony_identification',
|
102 |
+
'kanji_ascii',
|
103 |
+
'kannada',
|
104 |
+
'key_value_maps',
|
105 |
+
'known_unknowns',
|
106 |
+
'language_games',
|
107 |
+
'language_identification',
|
108 |
+
'linguistic_mappings',
|
109 |
+
'linguistics_puzzles',
|
110 |
+
'list_functions',
|
111 |
+
'logic_grid_puzzle',
|
112 |
+
'logical_args',
|
113 |
+
'logical_deduction',
|
114 |
+
'logical_fallacy_detection',
|
115 |
+
'logical_sequence',
|
116 |
+
'mathematical_induction',
|
117 |
+
'matrixshapes',
|
118 |
+
'medical_questions_russian',
|
119 |
+
'metaphor_boolean',
|
120 |
+
'metaphor_understanding',
|
121 |
+
'minute_mysteries_qa',
|
122 |
+
'misconceptions',
|
123 |
+
'misconceptions_russian',
|
124 |
+
'mnist_ascii',
|
125 |
+
'modified_arithmetic',
|
126 |
+
'moral_permissibility',
|
127 |
+
'movie_dialog_same_or_different',
|
128 |
+
'movie_recommendation',
|
129 |
+
'mult_data_wrangling',
|
130 |
+
'navigate',
|
131 |
+
'nonsense_words_grammar',
|
132 |
+
'novel_concepts',
|
133 |
+
'object_counting',
|
134 |
+
'odd_one_out',
|
135 |
+
'operators',
|
136 |
+
'paragraph_segmentation',
|
137 |
+
'parsinlu_qa',
|
138 |
+
'parsinlu_reading_comprehension',
|
139 |
+
'penguins_in_a_table',
|
140 |
+
'periodic_elements',
|
141 |
+
'persian_idioms',
|
142 |
+
'phrase_relatedness',
|
143 |
+
'physical_intuition',
|
144 |
+
'physics',
|
145 |
+
'physics_questions',
|
146 |
+
'play_dialog_same_or_different',
|
147 |
+
'polish_sequence_labeling',
|
148 |
+
'presuppositions_as_nli',
|
149 |
+
'qa_wikidata',
|
150 |
+
'question_selection',
|
151 |
+
'real_or_fake_text',
|
152 |
+
'reasoning_about_colored_objects',
|
153 |
+
'repeat_copy_logic',
|
154 |
+
'rephrase',
|
155 |
+
'rhyming',
|
156 |
+
'riddle_sense',
|
157 |
+
'ruin_names',
|
158 |
+
'salient_translation_error_detection',
|
159 |
+
'scientific_press_release',
|
160 |
+
'semantic_parsing_in_context_sparc',
|
161 |
+
'semantic_parsing_spider',
|
162 |
+
'sentence_ambiguity',
|
163 |
+
'similarities_abstraction',
|
164 |
+
'simp_turing_concept',
|
165 |
+
'simple_arithmetic_json',
|
166 |
+
'simple_arithmetic_json_multiple_choice',
|
167 |
+
'simple_arithmetic_json_subtasks',
|
168 |
+
'simple_arithmetic_multiple_targets_json',
|
169 |
+
'simple_ethical_questions',
|
170 |
+
'simple_text_editing',
|
171 |
+
'snarks',
|
172 |
+
'social_iqa',
|
173 |
+
'social_support',
|
174 |
+
'sports_understanding',
|
175 |
+
'strange_stories',
|
176 |
+
'strategyqa',
|
177 |
+
'sufficient_information',
|
178 |
+
'suicide_risk',
|
179 |
+
'swahili_english_proverbs',
|
180 |
+
'swedish_to_german_proverbs',
|
181 |
+
'symbol_interpretation',
|
182 |
+
'tellmewhy',
|
183 |
+
'temporal_sequences',
|
184 |
+
'tense',
|
185 |
+
'timedial',
|
186 |
+
'topical_chat',
|
187 |
+
'tracking_shuffled_objects',
|
188 |
+
'understanding_fables',
|
189 |
+
'undo_permutation',
|
190 |
+
'unit_conversion',
|
191 |
+
'unit_interpretation',
|
192 |
+
'unnatural_in_context_learning',
|
193 |
+
'vitaminc_fact_verification',
|
194 |
+
'what_is_the_tao',
|
195 |
+
'which_wiki_edit',
|
196 |
+
'winowhy',
|
197 |
+
'word_sorting',
|
198 |
+
'word_unscrambling']
|
199 |
+
|
200 |
+
class bigbench_Config(datasets.BuilderConfig):
    """BuilderConfig for a single BIG-bench JSON task."""

    def __init__(
        self,
        text_features,
        label_classes=None,
        process_label=lambda x: x,
        **kwargs,
    ):
        """Create the configuration for one task.

        Args:
            text_features: `dict[string, string]`, stored on the config as
                `text_features`. The builder in this file does not read it.
            label_classes: accepted so existing call sites keep working; the
                value is not stored or used.
            process_label: accepted so existing call sites keep working; the
                value is not stored or used.
            **kwargs: keyword arguments forwarded to `datasets.BuilderConfig`
                (for example `name`).
        """
        super().__init__(version=datasets.Version("1.0.0", ""), **kwargs)

        self.text_features = text_features
        self.data_url = DATA_URL
        # The builder joins this onto the extraction directory, so each task's
        # files are looked up in a folder named after the config.
        self.data_dir = self.name
        self.citation = textwrap.dedent(CITATION)
        # NOTE: assigned after super().__init__(), so this replaces any
        # `description` supplied through kwargs.
        self.description = ""
        self.url = "https://github.com/google/BIG-bench"
|
231 |
+
|
232 |
+
|
233 |
+
class bigbench(datasets.GeneratorBasedBuilder):
    """Dataset builder for the BIG-bench JSON tasks (one config per task)."""

    BUILDER_CONFIG_CLASS = bigbench_Config

    BUILDER_CONFIGS = [
        bigbench_Config(
            name=name,
            text_features={"inputs": "inputs"},
        )
        for name in CONFIGS
    ]

    def _info(self):
        """Return the DatasetInfo (features, homepage, citation) for the config."""
        features = {
            "inputs": datasets.Value("string"),
            "targets": datasets.features.Sequence(datasets.Value("string")),
            "multiple_choice_targets": datasets.features.Sequence(datasets.Value("string")),
            "multiple_choice_scores": datasets.features.Sequence(datasets.Value("int32")),
            "idx": datasets.Value("int32"),
        }
        return datasets.DatasetInfo(
            description=DESCRIPTION,
            features=datasets.Features(features),
            homepage=self.config.url,
            # The config already holds the dedented BibTeX entry; appending
            # CITATION a second time only duplicated it.
            citation=self.config.citation,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the two splits.

        Args:
            dl_manager: download manager supplied by `datasets`.

        Returns:
            A list with the TRAIN and VALIDATION `SplitGenerator`s, each
            pointing at the matching `.jsonl` file in the task directory.
        """
        dl_dir = dl_manager.download_and_extract(self.config.data_url)
        data_dir = os.path.join(dl_dir, self.config.data_dir)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_file": os.path.join(data_dir, "train.jsonl"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_file": os.path.join(data_dir, "validation.jsonl"),
                    "split": "validation",
                },
            ),
        ]

    def _generate_examples(self, data_file, split):
        """Yield `(key, example)` pairs from a JSON-lines file.

        Args:
            data_file: path of the `.jsonl` file to read.
            split: split name passed through `gen_kwargs`; not used here.

        Yields:
            Tuples of the zero-based line index and the dict decoded from
            that line.
        """
        with open(data_file, "r", encoding="utf-8") as f:
            for id_, line in enumerate(f):
                if not line.strip():
                    # A blank line (e.g. a trailing newline at end of file)
                    # is not valid JSON; skip it instead of crashing.
                    continue
                yield id_, json.loads(line)
|
mmlu.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
|
17 |
+
import csv
|
18 |
+
|
19 |
+
import datasets
|
20 |
+
|
21 |
+
|
22 |
+
# BibTeX entry for the MMLU paper; passed to DatasetInfo as `citation`.
_CITATION = """\
@article{hendryckstest2021,
title={Measuring Massive Multitask Language Understanding},
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
journal={Proceedings of the International Conference on Learning Representations (ICLR)},
year={2021}
}
"""

# Dataset-level description passed to DatasetInfo.
_DESCRIPTION = """\
This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge, covering 57 tasks including elementary mathematics, US history, computer science, law, and more.
"""

# Project homepage passed to DatasetInfo.
_HOMEPAGE = "https://github.com/hendrycks/test"

# Archive downloaded by the builder and read member by member via
# `dl_manager.iter_archive`.
_URL = "https://www.dropbox.com/s/nv4z13trkpq80bj/mmlu.tar?dl=1"
|
38 |
+
|
39 |
+
_SUBJECTS = [
|
40 |
+
"abstract_algebra",
|
41 |
+
"anatomy",
|
42 |
+
"astronomy",
|
43 |
+
"business_ethics",
|
44 |
+
"clinical_knowledge",
|
45 |
+
"college_biology",
|
46 |
+
"college_chemistry",
|
47 |
+
"college_computer_science",
|
48 |
+
"college_mathematics",
|
49 |
+
"college_medicine",
|
50 |
+
"college_physics",
|
51 |
+
"computer_security",
|
52 |
+
"conceptual_physics",
|
53 |
+
"econometrics",
|
54 |
+
"electrical_engineering",
|
55 |
+
"elementary_mathematics",
|
56 |
+
"formal_logic",
|
57 |
+
"global_facts",
|
58 |
+
"high_school_biology",
|
59 |
+
"high_school_chemistry",
|
60 |
+
"high_school_computer_science",
|
61 |
+
"high_school_european_history",
|
62 |
+
"high_school_geography",
|
63 |
+
"high_school_government_and_politics",
|
64 |
+
"high_school_macroeconomics",
|
65 |
+
"high_school_mathematics",
|
66 |
+
"high_school_microeconomics",
|
67 |
+
"high_school_physics",
|
68 |
+
"high_school_psychology",
|
69 |
+
"high_school_statistics",
|
70 |
+
"high_school_us_history",
|
71 |
+
"high_school_world_history",
|
72 |
+
"human_aging",
|
73 |
+
"human_sexuality",
|
74 |
+
"international_law",
|
75 |
+
"jurisprudence",
|
76 |
+
"logical_fallacies",
|
77 |
+
"machine_learning",
|
78 |
+
"management",
|
79 |
+
"marketing",
|
80 |
+
"medical_genetics",
|
81 |
+
"miscellaneous",
|
82 |
+
"moral_disputes",
|
83 |
+
"moral_scenarios",
|
84 |
+
"nutrition",
|
85 |
+
"philosophy",
|
86 |
+
"prehistory",
|
87 |
+
"professional_accounting",
|
88 |
+
"professional_law",
|
89 |
+
"professional_medicine",
|
90 |
+
"professional_psychology",
|
91 |
+
"public_relations",
|
92 |
+
"security_studies",
|
93 |
+
"sociology",
|
94 |
+
"us_foreign_policy",
|
95 |
+
"virology",
|
96 |
+
"world_religions",
|
97 |
+
]
|
98 |
+
|
99 |
+
|
100 |
+
class HendrycksTest(datasets.GeneratorBasedBuilder):
    """Massive multitask multiple-choice test consisting of 57 tasks."""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=sub, version=datasets.Version("1.0.0"), description=f"Hendrycks Test Subject {sub}"
        )
        for sub in _SUBJECTS
    ]

    def _info(self):
        """Return the DatasetInfo: question text, answer choices and A-D label."""
        features = datasets.Features(
            {
                "question": datasets.Value("string"),
                "choices": datasets.features.Sequence(datasets.Value("string")),
                "answer": datasets.features.ClassLabel(num_classes=4, names=["A", "B", "C", "D"]),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the SplitGenerators for the test, validation and dev splits.

        Args:
            dl_manager: download manager supplied by `datasets`.

        Returns:
            One `SplitGenerator` per split. Each gets its own
            `iter_archive` iterator over the downloaded archive and the
            name of the folder (`test`, `val`, `dev`) to read from.
        """
        archive = dl_manager.download(_URL)
        # (split object, folder / file-suffix used inside the archive)
        splits = (
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "val"),
            (datasets.Split("dev"), "dev"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={"iter_archive": dl_manager.iter_archive(archive), "split": folder},
            )
            for split_name, folder in splits
        ]

    def _generate_examples(self, iter_archive, split):
        """Yield examples as (key, example) tuples.

        Args:
            iter_archive: iterable of `(path, file object)` pairs for the
                members of the archive.
            split: folder / file-suffix of the split (`test`, `val`, `dev`).

        Yields:
            `("<member index>_<row index>", example)` where the example takes
            column 0 as the question, columns 1-4 as the choices and column 5
            as the answer.
        """
        split_dir = f"data/{split}/"
        target = f"{self.config.name}_{split}.csv"
        for id_file, (path, file) in enumerate(iter_archive):
            if split_dir not in path or target not in path:
                continue
            # Archive members are read as bytes; decode lazily line by line.
            lines = (line.decode("utf-8") for line in file)
            for id_line, data in enumerate(csv.reader(lines)):
                if not data:
                    # csv.reader returns an empty list for a blank line;
                    # indexing it would raise IndexError.
                    continue
                yield f"{id_file}_{id_line}", {"question": data[0], "choices": data[1:5], "answer": data[5]}
            # The matching file has been consumed; no need to scan the rest
            # of the archive.
            break
|