nicholasKluge commited on
Commit
0449a5c
·
verified ·
1 Parent(s): 4b97981

Upload results-pt.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. results-pt.json +1303 -0
results-pt.json ADDED
@@ -0,0 +1,1303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "assin2_rte": {
4
+ "f1_macro,all": 0.577927165258549,
5
+ "f1_macro_stderr,all": 0.0071561886831375205,
6
+ "acc,all": 0.6004901960784313,
7
+ "acc_stderr,all": 0.006996270367086327,
8
+ "alias": "assin2_rte"
9
+ },
10
+ "assin2_sts": {
11
+ "pearson,all": 0.01994583242718194,
12
+ "pearson_stderr,all": 0.014030097851152527,
13
+ "mse,all": 2.0782883986928105,
14
+ "mse_stderr,all": "N/A",
15
+ "alias": "assin2_sts"
16
+ },
17
+ "bluex": {
18
+ "acc,all": 0.24756606397774686,
19
+ "acc_stderr,all": 0.009256090690166377,
20
+ "acc,exam_id__USP_2020": 0.23214285714285715,
21
+ "acc_stderr,exam_id__USP_2020": 0.03265301549767108,
22
+ "acc,exam_id__UNICAMP_2022": 0.3076923076923077,
23
+ "acc_stderr,exam_id__UNICAMP_2022": 0.04284625323101336,
24
+ "acc,exam_id__USP_2019": 0.15,
25
+ "acc_stderr,exam_id__USP_2019": 0.032604803052830006,
26
+ "acc,exam_id__UNICAMP_2019": 0.26,
27
+ "acc_stderr,exam_id__UNICAMP_2019": 0.035805342426308485,
28
+ "acc,exam_id__UNICAMP_2023": 0.3488372093023256,
29
+ "acc_stderr,exam_id__UNICAMP_2023": 0.041919014018416384,
30
+ "acc,exam_id__UNICAMP_2021_2": 0.3137254901960784,
31
+ "acc_stderr,exam_id__UNICAMP_2021_2": 0.037486879456641045,
32
+ "acc,exam_id__USP_2023": 0.18181818181818182,
33
+ "acc_stderr,exam_id__USP_2023": 0.03349033495084427,
34
+ "acc,exam_id__USP_2021": 0.23076923076923078,
35
+ "acc_stderr,exam_id__USP_2021": 0.03381852906171476,
36
+ "acc,exam_id__UNICAMP_2021_1": 0.2391304347826087,
37
+ "acc_stderr,exam_id__UNICAMP_2021_1": 0.03638828981438008,
38
+ "acc,exam_id__UNICAMP_2020": 0.32727272727272727,
39
+ "acc_stderr,exam_id__UNICAMP_2020": 0.03653884692218891,
40
+ "acc,exam_id__USP_2022": 0.16326530612244897,
41
+ "acc_stderr,exam_id__USP_2022": 0.03041886398004624,
42
+ "acc,exam_id__USP_2024": 0.1951219512195122,
43
+ "acc_stderr,exam_id__USP_2024": 0.03573567350069458,
44
+ "acc,exam_id__UNICAMP_2024": 0.28888888888888886,
45
+ "acc_stderr,exam_id__UNICAMP_2024": 0.038994404039252965,
46
+ "acc,exam_id__USP_2018": 0.16666666666666666,
47
+ "acc_stderr,exam_id__USP_2018": 0.029375798861409928,
48
+ "acc,exam_id__UNICAMP_2018": 0.2962962962962963,
49
+ "acc_stderr,exam_id__UNICAMP_2018": 0.035803514128866995,
50
+ "alias": "bluex"
51
+ },
52
+ "enem_challenge": {
53
+ "alias": "enem",
54
+ "acc,all": 0.19174247725682295,
55
+ "acc_stderr,all": 0.006016154421005065,
56
+ "acc,exam_id__2014": 0.14678899082568808,
57
+ "acc_stderr,exam_id__2014": 0.019587599223040625,
58
+ "acc,exam_id__2017": 0.1896551724137931,
59
+ "acc_stderr,exam_id__2017": 0.02093347844131792,
60
+ "acc,exam_id__2016_2": 0.21951219512195122,
61
+ "acc_stderr,exam_id__2016_2": 0.021542806426202234,
62
+ "acc,exam_id__2016": 0.23140495867768596,
63
+ "acc_stderr,exam_id__2016": 0.022131076893750434,
64
+ "acc,exam_id__2012": 0.1724137931034483,
65
+ "acc_stderr,exam_id__2012": 0.020164330045300043,
66
+ "acc,exam_id__2010": 0.2222222222222222,
67
+ "acc_stderr,exam_id__2010": 0.02213114984638704,
68
+ "acc,exam_id__2022": 0.16541353383458646,
69
+ "acc_stderr,exam_id__2022": 0.018647718361639067,
70
+ "acc,exam_id__2015": 0.23529411764705882,
71
+ "acc_stderr,exam_id__2015": 0.02241173308841978,
72
+ "acc,exam_id__2011": 0.18803418803418803,
73
+ "acc_stderr,exam_id__2011": 0.020817448020547174,
74
+ "acc,exam_id__2013": 0.23148148148148148,
75
+ "acc_stderr,exam_id__2013": 0.02345320376483438,
76
+ "acc,exam_id__2009": 0.13043478260869565,
77
+ "acc_stderr,exam_id__2009": 0.018075482087183004,
78
+ "acc,exam_id__2023": 0.17037037037037037,
79
+ "acc_stderr,exam_id__2023": 0.018694761764683765
80
+ },
81
+ "faquad_nli": {
82
+ "f1_macro,all": 0.4396551724137931,
83
+ "f1_macro_stderr,all": 0.0035796984729087084,
84
+ "acc,all": 0.7846153846153846,
85
+ "acc_stderr,all": 0.011396120309131327,
86
+ "alias": "faquad_nli"
87
+ },
88
+ "hatebr_offensive": {
89
+ "alias": "hatebr_offensive_binary",
90
+ "f1_macro,all": 0.5373169584712172,
91
+ "f1_macro_stderr,all": 0.009509765977605292,
92
+ "acc,all": 0.5642857142857143,
93
+ "acc_stderr,all": 0.009363640166058677
94
+ },
95
+ "oab_exams": {
96
+ "acc,all": 0.2528473804100228,
97
+ "acc_stderr,all": 0.005369584144606133,
98
+ "acc,exam_id__2012-06a": 0.2875,
99
+ "acc_stderr,exam_id__2012-06a": 0.029268820937911468,
100
+ "acc,exam_id__2012-09": 0.2857142857142857,
101
+ "acc_stderr,exam_id__2012-09": 0.02981338225884132,
102
+ "acc,exam_id__2013-10": 0.1375,
103
+ "acc_stderr,exam_id__2013-10": 0.022269660648017626,
104
+ "acc,exam_id__2011-04": 0.2625,
105
+ "acc_stderr,exam_id__2011-04": 0.02838995857974862,
106
+ "acc,exam_id__2018-25": 0.2,
107
+ "acc_stderr,exam_id__2018-25": 0.02584730185242964,
108
+ "acc,exam_id__2011-05": 0.2625,
109
+ "acc_stderr,exam_id__2011-05": 0.028394014157416495,
110
+ "acc,exam_id__2012-07": 0.275,
111
+ "acc_stderr,exam_id__2012-07": 0.028825122025371252,
112
+ "acc,exam_id__2014-13": 0.2375,
113
+ "acc_stderr,exam_id__2014-13": 0.027472596578792648,
114
+ "acc,exam_id__2012-06": 0.2375,
115
+ "acc_stderr,exam_id__2012-06": 0.027444121032139803,
116
+ "acc,exam_id__2015-16": 0.3125,
117
+ "acc_stderr,exam_id__2015-16": 0.03003339727126325,
118
+ "acc,exam_id__2017-22": 0.25,
119
+ "acc_stderr,exam_id__2017-22": 0.027997530519651585,
120
+ "acc,exam_id__2017-24": 0.275,
121
+ "acc_stderr,exam_id__2017-24": 0.028929264616280162,
122
+ "acc,exam_id__2014-14": 0.2125,
123
+ "acc_stderr,exam_id__2014-14": 0.02645590211050805,
124
+ "acc,exam_id__2011-03": 0.25252525252525254,
125
+ "acc_stderr,exam_id__2011-03": 0.02522016343392691,
126
+ "acc,exam_id__2014-15": 0.2564102564102564,
127
+ "acc_stderr,exam_id__2014-15": 0.02850809134358431,
128
+ "acc,exam_id__2016-19": 0.2948717948717949,
129
+ "acc_stderr,exam_id__2016-19": 0.029792524691902637,
130
+ "acc,exam_id__2012-08": 0.2375,
131
+ "acc_stderr,exam_id__2012-08": 0.02756426923964389,
132
+ "acc,exam_id__2015-17": 0.24358974358974358,
133
+ "acc_stderr,exam_id__2015-17": 0.028007851240875885,
134
+ "acc,exam_id__2015-18": 0.2875,
135
+ "acc_stderr,exam_id__2015-18": 0.02920616482631226,
136
+ "acc,exam_id__2010-01": 0.25882352941176473,
137
+ "acc_stderr,exam_id__2010-01": 0.027343286705705846,
138
+ "acc,exam_id__2016-20a": 0.2125,
139
+ "acc_stderr,exam_id__2016-20a": 0.026410470748996454,
140
+ "acc,exam_id__2016-21": 0.2625,
141
+ "acc_stderr,exam_id__2016-21": 0.028563711821251347,
142
+ "acc,exam_id__2016-20": 0.225,
143
+ "acc_stderr,exam_id__2016-20": 0.02689442014065515,
144
+ "acc,exam_id__2017-23": 0.275,
145
+ "acc_stderr,exam_id__2017-23": 0.028728458302479908,
146
+ "acc,exam_id__2013-12": 0.2875,
147
+ "acc_stderr,exam_id__2013-12": 0.02914378722659053,
148
+ "acc,exam_id__2010-02": 0.27,
149
+ "acc_stderr,exam_id__2010-02": 0.02563525101778004,
150
+ "acc,exam_id__2013-11": 0.225,
151
+ "acc_stderr,exam_id__2013-11": 0.027058590192888813,
152
+ "alias": "oab_exams"
153
+ },
154
+ "portuguese_hate_speech": {
155
+ "alias": "portuguese_hate_speech_binary",
156
+ "f1_macro,all": 0.300060277449729,
157
+ "f1_macro_stderr,all": 0.010227555978721074,
158
+ "acc,all": 0.33137485311398357,
159
+ "acc_stderr,all": 0.011379474523059378
160
+ },
161
+ "tweetsentbr": {
162
+ "f1_macro,all": 0.2072905953605302,
163
+ "f1_macro_stderr,all": 0.0024889383614545776,
164
+ "acc,all": 0.45124378109452734,
165
+ "acc_stderr,all": 0.007861876742012653,
166
+ "alias": "tweetsentbr"
167
+ }
168
+ },
169
+ "configs": {
170
+ "assin2_rte": {
171
+ "task": "assin2_rte",
172
+ "group": [
173
+ "pt_benchmark",
174
+ "assin2"
175
+ ],
176
+ "dataset_path": "assin2",
177
+ "test_split": "test",
178
+ "fewshot_split": "train",
179
+ "doc_to_text": "Premissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:",
180
+ "doc_to_target": "{{['Não', 'Sim'][entailment_judgment]}}",
181
+ "description": "Abaixo estão pares de premissa e hipótese. Para cada par, indique se a hipótese pode ser inferida a partir da premissa, responda apenas com \"Sim\" ou \"Não\".\n\n",
182
+ "target_delimiter": " ",
183
+ "fewshot_delimiter": "\n\n",
184
+ "fewshot_config": {
185
+ "sampler": "id_sampler",
186
+ "sampler_config": {
187
+ "id_list": [
188
+ 1,
189
+ 3251,
190
+ 2,
191
+ 3252,
192
+ 3,
193
+ 4,
194
+ 5,
195
+ 6,
196
+ 3253,
197
+ 7,
198
+ 3254,
199
+ 3255,
200
+ 3256,
201
+ 8,
202
+ 9,
203
+ 10,
204
+ 3257,
205
+ 11,
206
+ 3258,
207
+ 12,
208
+ 13,
209
+ 14,
210
+ 15,
211
+ 3259,
212
+ 3260,
213
+ 3261,
214
+ 3262,
215
+ 3263,
216
+ 16,
217
+ 17,
218
+ 3264,
219
+ 18,
220
+ 3265,
221
+ 3266,
222
+ 3267,
223
+ 19,
224
+ 20,
225
+ 3268,
226
+ 3269,
227
+ 21,
228
+ 3270,
229
+ 3271,
230
+ 22,
231
+ 3272,
232
+ 3273,
233
+ 23,
234
+ 3274,
235
+ 24,
236
+ 25,
237
+ 3275
238
+ ],
239
+ "id_column": "sentence_pair_id"
240
+ }
241
+ },
242
+ "num_fewshot": 15,
243
+ "metric_list": [
244
+ {
245
+ "metric": "f1_macro",
246
+ "aggregation": "f1_macro",
247
+ "higher_is_better": true
248
+ },
249
+ {
250
+ "metric": "acc",
251
+ "aggregation": "acc",
252
+ "higher_is_better": true
253
+ }
254
+ ],
255
+ "output_type": "generate_until",
256
+ "generation_kwargs": {
257
+ "max_gen_toks": 32,
258
+ "do_sample": false,
259
+ "temperature": 0.0,
260
+ "top_k": null,
261
+ "top_p": null,
262
+ "until": [
263
+ "\n\n"
264
+ ]
265
+ },
266
+ "repeats": 1,
267
+ "filter_list": [
268
+ {
269
+ "name": "all",
270
+ "filter": [
271
+ {
272
+ "function": "find_similar_label",
273
+ "labels": [
274
+ "Sim",
275
+ "Não"
276
+ ]
277
+ },
278
+ {
279
+ "function": "take_first"
280
+ }
281
+ ]
282
+ }
283
+ ],
284
+ "should_decontaminate": false,
285
+ "metadata": {
286
+ "version": 1.1
287
+ }
288
+ },
289
+ "assin2_sts": {
290
+ "task": "assin2_sts",
291
+ "group": [
292
+ "pt_benchmark",
293
+ "assin2"
294
+ ],
295
+ "dataset_path": "assin2",
296
+ "test_split": "test",
297
+ "fewshot_split": "train",
298
+ "doc_to_text": "Frase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:",
299
+ "doc_to_target": "<function assin2_float_to_pt_str at 0x14879d55b600>",
300
+ "description": "Abaixo estão pares de frases que você deve avaliar o grau de similaridade. Dê uma pontuação entre 1,0 e 5,0, sendo 1,0 pouco similar e 5,0 muito similar.\n\n",
301
+ "target_delimiter": " ",
302
+ "fewshot_delimiter": "\n\n",
303
+ "fewshot_config": {
304
+ "sampler": "id_sampler",
305
+ "sampler_config": {
306
+ "id_list": [
307
+ 1,
308
+ 3251,
309
+ 2,
310
+ 3252,
311
+ 3,
312
+ 4,
313
+ 5,
314
+ 6,
315
+ 3253,
316
+ 7,
317
+ 3254,
318
+ 3255,
319
+ 3256,
320
+ 8,
321
+ 9,
322
+ 10,
323
+ 3257,
324
+ 11,
325
+ 3258,
326
+ 12,
327
+ 13,
328
+ 14,
329
+ 15,
330
+ 3259,
331
+ 3260,
332
+ 3261,
333
+ 3262,
334
+ 3263,
335
+ 16,
336
+ 17,
337
+ 3264,
338
+ 18,
339
+ 3265,
340
+ 3266,
341
+ 3267,
342
+ 19,
343
+ 20,
344
+ 3268,
345
+ 3269,
346
+ 21,
347
+ 3270,
348
+ 3271,
349
+ 22,
350
+ 3272,
351
+ 3273,
352
+ 23,
353
+ 3274,
354
+ 24,
355
+ 25,
356
+ 3275
357
+ ],
358
+ "id_column": "sentence_pair_id"
359
+ }
360
+ },
361
+ "num_fewshot": 10,
362
+ "metric_list": [
363
+ {
364
+ "metric": "pearson",
365
+ "aggregation": "pearsonr",
366
+ "higher_is_better": true
367
+ },
368
+ {
369
+ "metric": "mse",
370
+ "aggregation": "mean_squared_error",
371
+ "higher_is_better": false
372
+ }
373
+ ],
374
+ "output_type": "generate_until",
375
+ "generation_kwargs": {
376
+ "max_gen_toks": 32,
377
+ "do_sample": false,
378
+ "temperature": 0.0,
379
+ "top_k": null,
380
+ "top_p": null,
381
+ "until": [
382
+ "\n\n"
383
+ ]
384
+ },
385
+ "repeats": 1,
386
+ "filter_list": [
387
+ {
388
+ "name": "all",
389
+ "filter": [
390
+ {
391
+ "function": "number_filter",
392
+ "type": "float",
393
+ "range_min": 1.0,
394
+ "range_max": 5.0,
395
+ "on_outside_range": "clip",
396
+ "fallback": 5.0
397
+ },
398
+ {
399
+ "function": "take_first"
400
+ }
401
+ ]
402
+ }
403
+ ],
404
+ "should_decontaminate": false,
405
+ "metadata": {
406
+ "version": 1.1
407
+ }
408
+ },
409
+ "bluex": {
410
+ "task": "bluex",
411
+ "group": [
412
+ "pt_benchmark",
413
+ "vestibular"
414
+ ],
415
+ "dataset_path": "eduagarcia-temp/BLUEX_without_images",
416
+ "test_split": "train",
417
+ "fewshot_split": "train",
418
+ "doc_to_text": "<function enem_doc_to_text at 0x14879d55ab60>",
419
+ "doc_to_target": "{{answerKey}}",
420
+ "description": "As perguntas a seguir são questões de múltipla escolha de provas de vestibular de universidades brasileiras, selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
421
+ "target_delimiter": " ",
422
+ "fewshot_delimiter": "\n\n",
423
+ "fewshot_config": {
424
+ "sampler": "id_sampler",
425
+ "sampler_config": {
426
+ "id_list": [
427
+ "USP_2018_3",
428
+ "UNICAMP_2018_2",
429
+ "USP_2018_35",
430
+ "UNICAMP_2018_16",
431
+ "USP_2018_89"
432
+ ],
433
+ "id_column": "id",
434
+ "exclude_from_task": true
435
+ }
436
+ },
437
+ "num_fewshot": 3,
438
+ "metric_list": [
439
+ {
440
+ "metric": "acc",
441
+ "aggregation": "acc",
442
+ "higher_is_better": true
443
+ }
444
+ ],
445
+ "output_type": "generate_until",
446
+ "generation_kwargs": {
447
+ "max_gen_toks": 32,
448
+ "do_sample": false,
449
+ "temperature": 0.0,
450
+ "top_k": null,
451
+ "top_p": null,
452
+ "until": [
453
+ "\n\n"
454
+ ]
455
+ },
456
+ "repeats": 1,
457
+ "filter_list": [
458
+ {
459
+ "name": "all",
460
+ "filter": [
461
+ {
462
+ "function": "normalize_spaces"
463
+ },
464
+ {
465
+ "function": "remove_accents"
466
+ },
467
+ {
468
+ "function": "find_choices",
469
+ "choices": [
470
+ "A",
471
+ "B",
472
+ "C",
473
+ "D",
474
+ "E"
475
+ ],
476
+ "regex_patterns": [
477
+ "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
478
+ "\\b([ABCDE])\\.",
479
+ "\\b([ABCDE]) ?[.):-]",
480
+ "\\b([ABCDE])$",
481
+ "\\b([ABCDE])\\b"
482
+ ]
483
+ },
484
+ {
485
+ "function": "take_first"
486
+ }
487
+ ],
488
+ "group_by": {
489
+ "column": "exam_id"
490
+ }
491
+ }
492
+ ],
493
+ "should_decontaminate": true,
494
+ "doc_to_decontamination_query": "<function enem_doc_to_text at 0x14879d55ae80>",
495
+ "metadata": {
496
+ "version": 1.1
497
+ }
498
+ },
499
+ "enem_challenge": {
500
+ "task": "enem_challenge",
501
+ "task_alias": "enem",
502
+ "group": [
503
+ "pt_benchmark",
504
+ "vestibular"
505
+ ],
506
+ "dataset_path": "eduagarcia/enem_challenge",
507
+ "test_split": "train",
508
+ "fewshot_split": "train",
509
+ "doc_to_text": "<function enem_doc_to_text at 0x14879d55b060>",
510
+ "doc_to_target": "{{answerKey}}",
511
+ "description": "As perguntas a seguir são questões de múltipla escolha do Exame Nacional do Ensino Médio (ENEM), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
512
+ "target_delimiter": " ",
513
+ "fewshot_delimiter": "\n\n",
514
+ "fewshot_config": {
515
+ "sampler": "id_sampler",
516
+ "sampler_config": {
517
+ "id_list": [
518
+ "2022_21",
519
+ "2022_88",
520
+ "2022_143"
521
+ ],
522
+ "id_column": "id",
523
+ "exclude_from_task": true
524
+ }
525
+ },
526
+ "num_fewshot": 3,
527
+ "metric_list": [
528
+ {
529
+ "metric": "acc",
530
+ "aggregation": "acc",
531
+ "higher_is_better": true
532
+ }
533
+ ],
534
+ "output_type": "generate_until",
535
+ "generation_kwargs": {
536
+ "max_gen_toks": 32,
537
+ "do_sample": false,
538
+ "temperature": 0.0,
539
+ "top_k": null,
540
+ "top_p": null,
541
+ "until": [
542
+ "\n\n"
543
+ ]
544
+ },
545
+ "repeats": 1,
546
+ "filter_list": [
547
+ {
548
+ "name": "all",
549
+ "filter": [
550
+ {
551
+ "function": "normalize_spaces"
552
+ },
553
+ {
554
+ "function": "remove_accents"
555
+ },
556
+ {
557
+ "function": "find_choices",
558
+ "choices": [
559
+ "A",
560
+ "B",
561
+ "C",
562
+ "D",
563
+ "E"
564
+ ],
565
+ "regex_patterns": [
566
+ "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
567
+ "\\b([ABCDE])\\.",
568
+ "\\b([ABCDE]) ?[.):-]",
569
+ "\\b([ABCDE])$",
570
+ "\\b([ABCDE])\\b"
571
+ ]
572
+ },
573
+ {
574
+ "function": "take_first"
575
+ }
576
+ ],
577
+ "group_by": {
578
+ "column": "exam_id"
579
+ }
580
+ }
581
+ ],
582
+ "should_decontaminate": true,
583
+ "doc_to_decontamination_query": "<function enem_doc_to_text at 0x14879d55b240>",
584
+ "metadata": {
585
+ "version": 1.1
586
+ }
587
+ },
588
+ "faquad_nli": {
589
+ "task": "faquad_nli",
590
+ "group": [
591
+ "pt_benchmark"
592
+ ],
593
+ "dataset_path": "ruanchaves/faquad-nli",
594
+ "test_split": "test",
595
+ "fewshot_split": "train",
596
+ "doc_to_text": "Pergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?",
597
+ "doc_to_target": "{{['Não', 'Sim'][label]}}",
598
+ "description": "Abaixo estão pares de pergunta e resposta. Para cada par, você deve julgar se a resposta responde à pergunta de maneira satisfatória e aparenta estar correta. Escreva apenas \"Sim\" ou \"Não\".\n\n",
599
+ "target_delimiter": " ",
600
+ "fewshot_delimiter": "\n\n",
601
+ "fewshot_config": {
602
+ "sampler": "first_n",
603
+ "sampler_config": {
604
+ "fewshot_indices": [
605
+ 1893,
606
+ 949,
607
+ 663,
608
+ 105,
609
+ 1169,
610
+ 2910,
611
+ 2227,
612
+ 2813,
613
+ 974,
614
+ 558,
615
+ 1503,
616
+ 1958,
617
+ 2918,
618
+ 601,
619
+ 1560,
620
+ 984,
621
+ 2388,
622
+ 995,
623
+ 2233,
624
+ 1982,
625
+ 165,
626
+ 2788,
627
+ 1312,
628
+ 2285,
629
+ 522,
630
+ 1113,
631
+ 1670,
632
+ 323,
633
+ 236,
634
+ 1263,
635
+ 1562,
636
+ 2519,
637
+ 1049,
638
+ 432,
639
+ 1167,
640
+ 1394,
641
+ 2022,
642
+ 2551,
643
+ 2194,
644
+ 2187,
645
+ 2282,
646
+ 2816,
647
+ 108,
648
+ 301,
649
+ 1185,
650
+ 1315,
651
+ 1420,
652
+ 2436,
653
+ 2322,
654
+ 766
655
+ ]
656
+ }
657
+ },
658
+ "num_fewshot": 15,
659
+ "metric_list": [
660
+ {
661
+ "metric": "f1_macro",
662
+ "aggregation": "f1_macro",
663
+ "higher_is_better": true
664
+ },
665
+ {
666
+ "metric": "acc",
667
+ "aggregation": "acc",
668
+ "higher_is_better": true
669
+ }
670
+ ],
671
+ "output_type": "generate_until",
672
+ "generation_kwargs": {
673
+ "max_gen_toks": 32,
674
+ "do_sample": false,
675
+ "temperature": 0.0,
676
+ "top_k": null,
677
+ "top_p": null,
678
+ "until": [
679
+ "\n\n"
680
+ ]
681
+ },
682
+ "repeats": 1,
683
+ "filter_list": [
684
+ {
685
+ "name": "all",
686
+ "filter": [
687
+ {
688
+ "function": "find_similar_label",
689
+ "labels": [
690
+ "Sim",
691
+ "Não"
692
+ ]
693
+ },
694
+ {
695
+ "function": "take_first"
696
+ }
697
+ ]
698
+ }
699
+ ],
700
+ "should_decontaminate": false,
701
+ "metadata": {
702
+ "version": 1.1
703
+ }
704
+ },
705
+ "hatebr_offensive": {
706
+ "task": "hatebr_offensive",
707
+ "task_alias": "hatebr_offensive_binary",
708
+ "group": [
709
+ "pt_benchmark"
710
+ ],
711
+ "dataset_path": "eduagarcia/portuguese_benchmark",
712
+ "dataset_name": "HateBR_offensive_binary",
713
+ "test_split": "test",
714
+ "fewshot_split": "train",
715
+ "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:",
716
+ "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
717
+ "description": "Abaixo contém o texto de comentários de usuários do Instagram em português, sua tarefa é classificar se o texto é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
718
+ "target_delimiter": " ",
719
+ "fewshot_delimiter": "\n\n",
720
+ "fewshot_config": {
721
+ "sampler": "id_sampler",
722
+ "sampler_config": {
723
+ "id_list": [
724
+ 48,
725
+ 44,
726
+ 36,
727
+ 20,
728
+ 3511,
729
+ 88,
730
+ 3555,
731
+ 16,
732
+ 56,
733
+ 3535,
734
+ 60,
735
+ 40,
736
+ 3527,
737
+ 4,
738
+ 76,
739
+ 3579,
740
+ 3523,
741
+ 3551,
742
+ 68,
743
+ 3503,
744
+ 84,
745
+ 3539,
746
+ 64,
747
+ 3599,
748
+ 80,
749
+ 3563,
750
+ 3559,
751
+ 3543,
752
+ 3547,
753
+ 3587,
754
+ 3595,
755
+ 3575,
756
+ 3567,
757
+ 3591,
758
+ 24,
759
+ 96,
760
+ 92,
761
+ 3507,
762
+ 52,
763
+ 72,
764
+ 8,
765
+ 3571,
766
+ 3515,
767
+ 3519,
768
+ 3531,
769
+ 28,
770
+ 32,
771
+ 0,
772
+ 12,
773
+ 3583
774
+ ],
775
+ "id_column": "idx"
776
+ }
777
+ },
778
+ "num_fewshot": 25,
779
+ "metric_list": [
780
+ {
781
+ "metric": "f1_macro",
782
+ "aggregation": "f1_macro",
783
+ "higher_is_better": true
784
+ },
785
+ {
786
+ "metric": "acc",
787
+ "aggregation": "acc",
788
+ "higher_is_better": true
789
+ }
790
+ ],
791
+ "output_type": "generate_until",
792
+ "generation_kwargs": {
793
+ "max_gen_toks": 32,
794
+ "do_sample": false,
795
+ "temperature": 0.0,
796
+ "top_k": null,
797
+ "top_p": null,
798
+ "until": [
799
+ "\n\n"
800
+ ]
801
+ },
802
+ "repeats": 1,
803
+ "filter_list": [
804
+ {
805
+ "name": "all",
806
+ "filter": [
807
+ {
808
+ "function": "find_similar_label",
809
+ "labels": [
810
+ "Sim",
811
+ "Não"
812
+ ]
813
+ },
814
+ {
815
+ "function": "take_first"
816
+ }
817
+ ]
818
+ }
819
+ ],
820
+ "should_decontaminate": false,
821
+ "metadata": {
822
+ "version": 1.0
823
+ }
824
+ },
825
+ "oab_exams": {
826
+ "task": "oab_exams",
827
+ "group": [
828
+ "legal_benchmark",
829
+ "pt_benchmark"
830
+ ],
831
+ "dataset_path": "eduagarcia/oab_exams",
832
+ "test_split": "train",
833
+ "fewshot_split": "train",
834
+ "doc_to_text": "<function doc_to_text at 0x14879d55bd80>",
835
+ "doc_to_target": "{{answerKey}}",
836
+ "description": "As perguntas a seguir são questões de múltipla escolha do Exame de Ordem da Ordem dos Advogados do Brasil (OAB), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\" ou \"D\".\n\n",
837
+ "target_delimiter": " ",
838
+ "fewshot_delimiter": "\n\n",
839
+ "fewshot_config": {
840
+ "sampler": "id_sampler",
841
+ "sampler_config": {
842
+ "id_list": [
843
+ "2010-01_1",
844
+ "2010-01_11",
845
+ "2010-01_13",
846
+ "2010-01_23",
847
+ "2010-01_26",
848
+ "2010-01_28",
849
+ "2010-01_38",
850
+ "2010-01_48",
851
+ "2010-01_58",
852
+ "2010-01_68",
853
+ "2010-01_76",
854
+ "2010-01_83",
855
+ "2010-01_85",
856
+ "2010-01_91",
857
+ "2010-01_99"
858
+ ],
859
+ "id_column": "id",
860
+ "exclude_from_task": true
861
+ }
862
+ },
863
+ "num_fewshot": 3,
864
+ "metric_list": [
865
+ {
866
+ "metric": "acc",
867
+ "aggregation": "acc",
868
+ "higher_is_better": true
869
+ }
870
+ ],
871
+ "output_type": "generate_until",
872
+ "generation_kwargs": {
873
+ "max_gen_toks": 32,
874
+ "do_sample": false,
875
+ "temperature": 0.0,
876
+ "top_k": null,
877
+ "top_p": null,
878
+ "until": [
879
+ "\n\n"
880
+ ]
881
+ },
882
+ "repeats": 1,
883
+ "filter_list": [
884
+ {
885
+ "name": "all",
886
+ "filter": [
887
+ {
888
+ "function": "normalize_spaces"
889
+ },
890
+ {
891
+ "function": "remove_accents"
892
+ },
893
+ {
894
+ "function": "find_choices",
895
+ "choices": [
896
+ "A",
897
+ "B",
898
+ "C",
899
+ "D"
900
+ ],
901
+ "regex_patterns": [
902
+ "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCD])\\b",
903
+ "\\b([ABCD])\\.",
904
+ "\\b([ABCD]) ?[.):-]",
905
+ "\\b([ABCD])$",
906
+ "\\b([ABCD])\\b"
907
+ ]
908
+ },
909
+ {
910
+ "function": "take_first"
911
+ }
912
+ ],
913
+ "group_by": {
914
+ "column": "exam_id"
915
+ }
916
+ }
917
+ ],
918
+ "should_decontaminate": true,
919
+ "doc_to_decontamination_query": "<function doc_to_text at 0x14879d3ac040>",
920
+ "metadata": {
921
+ "version": 1.5
922
+ }
923
+ },
924
+ "portuguese_hate_speech": {
925
+ "task": "portuguese_hate_speech",
926
+ "task_alias": "portuguese_hate_speech_binary",
927
+ "group": [
928
+ "pt_benchmark"
929
+ ],
930
+ "dataset_path": "eduagarcia/portuguese_benchmark",
931
+ "dataset_name": "Portuguese_Hate_Speech_binary",
932
+ "test_split": "test",
933
+ "fewshot_split": "train",
934
+ "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:",
935
+ "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
936
+ "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o texto contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
937
+ "target_delimiter": " ",
938
+ "fewshot_delimiter": "\n\n",
939
+ "fewshot_config": {
940
+ "sampler": "id_sampler",
941
+ "sampler_config": {
942
+ "id_list": [
943
+ 52,
944
+ 50,
945
+ 39,
946
+ 28,
947
+ 3,
948
+ 105,
949
+ 22,
950
+ 25,
951
+ 60,
952
+ 11,
953
+ 66,
954
+ 41,
955
+ 9,
956
+ 4,
957
+ 91,
958
+ 42,
959
+ 7,
960
+ 20,
961
+ 76,
962
+ 1,
963
+ 104,
964
+ 13,
965
+ 67,
966
+ 54,
967
+ 97,
968
+ 27,
969
+ 24,
970
+ 14,
971
+ 16,
972
+ 48,
973
+ 53,
974
+ 40,
975
+ 34,
976
+ 49,
977
+ 32,
978
+ 119,
979
+ 114,
980
+ 2,
981
+ 58,
982
+ 83,
983
+ 18,
984
+ 36,
985
+ 5,
986
+ 6,
987
+ 10,
988
+ 35,
989
+ 38,
990
+ 0,
991
+ 21,
992
+ 46
993
+ ],
994
+ "id_column": "idx"
995
+ }
996
+ },
997
+ "num_fewshot": 25,
998
+ "metric_list": [
999
+ {
1000
+ "metric": "f1_macro",
1001
+ "aggregation": "f1_macro",
1002
+ "higher_is_better": true
1003
+ },
1004
+ {
1005
+ "metric": "acc",
1006
+ "aggregation": "acc",
1007
+ "higher_is_better": true
1008
+ }
1009
+ ],
1010
+ "output_type": "generate_until",
1011
+ "generation_kwargs": {
1012
+ "max_gen_toks": 32,
1013
+ "do_sample": false,
1014
+ "temperature": 0.0,
1015
+ "top_k": null,
1016
+ "top_p": null,
1017
+ "until": [
1018
+ "\n\n"
1019
+ ]
1020
+ },
1021
+ "repeats": 1,
1022
+ "filter_list": [
1023
+ {
1024
+ "name": "all",
1025
+ "filter": [
1026
+ {
1027
+ "function": "find_similar_label",
1028
+ "labels": [
1029
+ "Sim",
1030
+ "Não"
1031
+ ]
1032
+ },
1033
+ {
1034
+ "function": "take_first"
1035
+ }
1036
+ ]
1037
+ }
1038
+ ],
1039
+ "should_decontaminate": false,
1040
+ "metadata": {
1041
+ "version": 1.0
1042
+ }
1043
+ },
1044
+ "tweetsentbr": {
1045
+ "task": "tweetsentbr",
1046
+ "group": [
1047
+ "pt_benchmark"
1048
+ ],
1049
+ "dataset_path": "eduagarcia/tweetsentbr_fewshot",
1050
+ "test_split": "test",
1051
+ "fewshot_split": "train",
1052
+ "doc_to_text": "Texto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:",
1053
+ "doc_to_target": "{{'Positivo' if label == 'Positive' else ('Negativo' if label == 'Negative' else 'Neutro')}}",
1054
+ "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o sentimento do texto é Positivo, Neutro ou Negativo. Responda apenas com uma das opções.\n\n",
1055
+ "target_delimiter": " ",
1056
+ "fewshot_delimiter": "\n\n",
1057
+ "fewshot_config": {
1058
+ "sampler": "first_n"
1059
+ },
1060
+ "num_fewshot": 25,
1061
+ "metric_list": [
1062
+ {
1063
+ "metric": "f1_macro",
1064
+ "aggregation": "f1_macro",
1065
+ "higher_is_better": true
1066
+ },
1067
+ {
1068
+ "metric": "acc",
1069
+ "aggregation": "acc",
1070
+ "higher_is_better": true
1071
+ }
1072
+ ],
1073
+ "output_type": "generate_until",
1074
+ "generation_kwargs": {
1075
+ "max_gen_toks": 32,
1076
+ "do_sample": false,
1077
+ "temperature": 0.0,
1078
+ "top_k": null,
1079
+ "top_p": null,
1080
+ "until": [
1081
+ "\n\n"
1082
+ ]
1083
+ },
1084
+ "repeats": 1,
1085
+ "filter_list": [
1086
+ {
1087
+ "name": "all",
1088
+ "filter": [
1089
+ {
1090
+ "function": "find_similar_label",
1091
+ "labels": [
1092
+ "Positivo",
1093
+ "Neutro",
1094
+ "Negativo"
1095
+ ]
1096
+ },
1097
+ {
1098
+ "function": "take_first"
1099
+ }
1100
+ ]
1101
+ }
1102
+ ],
1103
+ "should_decontaminate": false,
1104
+ "metadata": {
1105
+ "version": 1.0
1106
+ }
1107
+ }
1108
+ },
1109
+ "versions": {
1110
+ "assin2_rte": 1.1,
1111
+ "assin2_sts": 1.1,
1112
+ "bluex": 1.1,
1113
+ "enem_challenge": 1.1,
1114
+ "faquad_nli": 1.1,
1115
+ "hatebr_offensive": 1.0,
1116
+ "oab_exams": 1.5,
1117
+ "portuguese_hate_speech": 1.0,
1118
+ "tweetsentbr": 1.0
1119
+ },
1120
+ "n-shot": {
1121
+ "assin2_rte": 15,
1122
+ "assin2_sts": 10,
1123
+ "bluex": 3,
1124
+ "enem_challenge": 3,
1125
+ "faquad_nli": 15,
1126
+ "hatebr_offensive": 25,
1127
+ "oab_exams": 3,
1128
+ "portuguese_hate_speech": 25,
1129
+ "tweetsentbr": 25
1130
+ },
1131
+ "model_meta": {
1132
+ "truncated": 2,
1133
+ "non_truncated": 14148,
1134
+ "padded": 0,
1135
+ "non_padded": 14150,
1136
+ "fewshots_truncated": 4,
1137
+ "has_chat_template": false,
1138
+ "chat_type": null,
1139
+ "n_gpus": 1,
1140
+ "accelerate_num_process": null,
1141
+ "model_sha": "None",
1142
+ "model_dtype": "torch.bfloat16",
1143
+ "model_memory_footprint": 1260510976,
1144
+ "model_num_parameters": 630253568,
1145
+ "model_is_loaded_in_4bit": null,
1146
+ "model_is_loaded_in_8bit": null,
1147
+ "model_is_quantized": null,
1148
+ "model_device": "cuda:0",
1149
+ "batch_size": 64,
1150
+ "max_length": 2048,
1151
+ "max_ctx_length": 2016,
1152
+ "max_gen_toks": 32
1153
+ },
1154
+ "task_model_meta": {
1155
+ "assin2_rte": {
1156
+ "sample_size": 2448,
1157
+ "truncated": 0,
1158
+ "non_truncated": 2448,
1159
+ "padded": 0,
1160
+ "non_padded": 2448,
1161
+ "fewshots_truncated": 0,
1162
+ "mean_seq_length": 924.4232026143791,
1163
+ "min_seq_length": 909,
1164
+ "max_seq_length": 963,
1165
+ "max_ctx_length": 2016,
1166
+ "max_gen_toks": 32,
1167
+ "mean_original_fewshots_size": 15.0,
1168
+ "mean_effective_fewshot_size": 15.0
1169
+ },
1170
+ "assin2_sts": {
1171
+ "sample_size": 2448,
1172
+ "truncated": 0,
1173
+ "non_truncated": 2448,
1174
+ "padded": 0,
1175
+ "non_padded": 2448,
1176
+ "fewshots_truncated": 0,
1177
+ "mean_seq_length": 659.4232026143791,
1178
+ "min_seq_length": 644,
1179
+ "max_seq_length": 698,
1180
+ "max_ctx_length": 2016,
1181
+ "max_gen_toks": 32,
1182
+ "mean_original_fewshots_size": 10.0,
1183
+ "mean_effective_fewshot_size": 10.0
1184
+ },
1185
+ "bluex": {
1186
+ "sample_size": 719,
1187
+ "truncated": 0,
1188
+ "non_truncated": 719,
1189
+ "padded": 0,
1190
+ "non_padded": 719,
1191
+ "fewshots_truncated": 0,
1192
+ "mean_seq_length": 1170.817802503477,
1193
+ "min_seq_length": 904,
1194
+ "max_seq_length": 1801,
1195
+ "max_ctx_length": 2016,
1196
+ "max_gen_toks": 32,
1197
+ "mean_original_fewshots_size": 3.0,
1198
+ "mean_effective_fewshot_size": 3.0
1199
+ },
1200
+ "enem_challenge": {
1201
+ "sample_size": 1429,
1202
+ "truncated": 2,
1203
+ "non_truncated": 1427,
1204
+ "padded": 0,
1205
+ "non_padded": 1429,
1206
+ "fewshots_truncated": 4,
1207
+ "mean_seq_length": 1007.4177746675997,
1208
+ "min_seq_length": 829,
1209
+ "max_seq_length": 2484,
1210
+ "max_ctx_length": 2016,
1211
+ "max_gen_toks": 32,
1212
+ "mean_original_fewshots_size": 3.0,
1213
+ "mean_effective_fewshot_size": 2.9972008397480754
1214
+ },
1215
+ "faquad_nli": {
1216
+ "sample_size": 650,
1217
+ "truncated": 0,
1218
+ "non_truncated": 650,
1219
+ "padded": 0,
1220
+ "non_padded": 650,
1221
+ "fewshots_truncated": 0,
1222
+ "mean_seq_length": 968.1338461538462,
1223
+ "min_seq_length": 936,
1224
+ "max_seq_length": 1034,
1225
+ "max_ctx_length": 2016,
1226
+ "max_gen_toks": 32,
1227
+ "mean_original_fewshots_size": 15.0,
1228
+ "mean_effective_fewshot_size": 15.0
1229
+ },
1230
+ "hatebr_offensive": {
1231
+ "sample_size": 1400,
1232
+ "truncated": 0,
1233
+ "non_truncated": 1400,
1234
+ "padded": 0,
1235
+ "non_padded": 1400,
1236
+ "fewshots_truncated": 0,
1237
+ "mean_seq_length": 867.4407142857143,
1238
+ "min_seq_length": 852,
1239
+ "max_seq_length": 1061,
1240
+ "max_ctx_length": 2016,
1241
+ "max_gen_toks": 32,
1242
+ "mean_original_fewshots_size": 25.0,
1243
+ "mean_effective_fewshot_size": 25.0
1244
+ },
1245
+ "oab_exams": {
1246
+ "sample_size": 2195,
1247
+ "truncated": 0,
1248
+ "non_truncated": 2195,
1249
+ "padded": 0,
1250
+ "non_padded": 2195,
1251
+ "fewshots_truncated": 0,
1252
+ "mean_seq_length": 832.024145785877,
1253
+ "min_seq_length": 659,
1254
+ "max_seq_length": 1108,
1255
+ "max_ctx_length": 2016,
1256
+ "max_gen_toks": 32,
1257
+ "mean_original_fewshots_size": 3.0,
1258
+ "mean_effective_fewshot_size": 3.0
1259
+ },
1260
+ "portuguese_hate_speech": {
1261
+ "sample_size": 851,
1262
+ "truncated": 0,
1263
+ "non_truncated": 851,
1264
+ "padded": 0,
1265
+ "non_padded": 851,
1266
+ "fewshots_truncated": 0,
1267
+ "mean_seq_length": 1219.021151586369,
1268
+ "min_seq_length": 1192,
1269
+ "max_seq_length": 1255,
1270
+ "max_ctx_length": 2016,
1271
+ "max_gen_toks": 32,
1272
+ "mean_original_fewshots_size": 25.0,
1273
+ "mean_effective_fewshot_size": 25.0
1274
+ },
1275
+ "tweetsentbr": {
1276
+ "sample_size": 2010,
1277
+ "truncated": 0,
1278
+ "non_truncated": 2010,
1279
+ "padded": 0,
1280
+ "non_padded": 2010,
1281
+ "fewshots_truncated": 0,
1282
+ "mean_seq_length": 1154.4194029850746,
1283
+ "min_seq_length": 1137,
1284
+ "max_seq_length": 1211,
1285
+ "max_ctx_length": 2016,
1286
+ "max_gen_toks": 32,
1287
+ "mean_original_fewshots_size": 25.0,
1288
+ "mean_effective_fewshot_size": 25.0
1289
+ }
1290
+ },
1291
+ "config": {
1292
+ "model": "huggingface",
1293
+ "model_args": "pretrained=/lustre/mlnvme/data/asen_hpc-mula/checkpoints-llama/slurm_job_17032104/step_400000",
1294
+ "batch_size": "auto",
1295
+ "batch_sizes": [],
1296
+ "device": "cuda:0",
1297
+ "use_cache": null,
1298
+ "limit": null,
1299
+ "bootstrap_iters": 100000,
1300
+ "gen_kwargs": null
1301
+ },
1302
+ "git_hash": null
1303
+ }