TucanoBR
/

Tucano-1b1-Instruct

+{
+  "results": {
+    "assin2_rte": {
+      "f1_macro,all": 0.33424046164366045,
+      "f1_macro_stderr,all": 0.0032331317090403522,
+      "acc,all": 0.5004084967320261,
+      "acc_stderr,all": 0.007136880329555878,
+      "alias": "assin2_rte"
+    },
+    "assin2_sts": {
+      "pearson,all": 0.008744289928817576,
+      "pearson_stderr,all": 0.0038621293517420523,
+      "mse,all": 2.075714869281045,
+      "mse_stderr,all": "N/A",
+      "alias": "assin2_sts"
+    },
+    "bluex": {
+      "acc,all": 0.24617524339360222,
+      "acc_stderr,all": 0.009273009820013372,
+      "acc,exam_id__UNICAMP_2018": 0.3333333333333333,
+      "acc_stderr,exam_id__UNICAMP_2018": 0.03696774632504438,
+      "acc,exam_id__USP_2024": 0.2682926829268293,
+      "acc_stderr,exam_id__USP_2024": 0.039972396388918856,
+      "acc,exam_id__UNICAMP_2020": 0.2909090909090909,
+      "acc_stderr,exam_id__UNICAMP_2020": 0.035384159248365375,
+      "acc,exam_id__USP_2018": 0.14814814814814814,
+      "acc_stderr,exam_id__USP_2018": 0.027992054941798152,
+      "acc,exam_id__USP_2023": 0.20454545454545456,
+      "acc_stderr,exam_id__USP_2023": 0.03513959526185611,
+      "acc,exam_id__UNICAMP_2024": 0.28888888888888886,
+      "acc_stderr,exam_id__UNICAMP_2024": 0.03899838751808449,
+      "acc,exam_id__UNICAMP_2021_1": 0.17391304347826086,
+      "acc_stderr,exam_id__UNICAMP_2021_1": 0.03230858020606621,
+      "acc,exam_id__USP_2021": 0.23076923076923078,
+      "acc_stderr,exam_id__USP_2021": 0.03381675612721597,
+      "acc,exam_id__UNICAMP_2023": 0.2558139534883721,
+      "acc_stderr,exam_id__UNICAMP_2023": 0.038213963416673546,
+      "acc,exam_id__USP_2022": 0.12244897959183673,
+      "acc_stderr,exam_id__USP_2022": 0.027043927759826267,
+      "acc,exam_id__UNICAMP_2019": 0.32,
+      "acc_stderr,exam_id__UNICAMP_2019": 0.038145051215416,
+      "acc,exam_id__UNICAMP_2021_2": 0.37254901960784315,
+      "acc_stderr,exam_id__UNICAMP_2021_2": 0.03906677079943241,
+      "acc,exam_id__UNICAMP_2022": 0.28205128205128205,
+      "acc_stderr,exam_id__UNICAMP_2022": 0.041637567043025576,
+      "acc,exam_id__USP_2019": 0.15,
+      "acc_stderr,exam_id__USP_2019": 0.03253910568225049,
+      "acc,exam_id__USP_2020": 0.23214285714285715,
+      "acc_stderr,exam_id__USP_2020": 0.03266460453055193,
+      "alias": "bluex"
+    },
+    "enem_challenge": {
+      "alias": "enem",
+      "acc,all": 0.2106368089573128,
+      "acc_stderr,all": 0.006208562275626841,
+      "acc,exam_id__2015": 0.2184873949579832,
+      "acc_stderr,exam_id__2015": 0.021843927587469662,
+      "acc,exam_id__2017": 0.25862068965517243,
+      "acc_stderr,exam_id__2017": 0.023379099757912622,
+      "acc,exam_id__2013": 0.2962962962962963,
+      "acc_stderr,exam_id__2013": 0.02532192915145962,
+      "acc,exam_id__2014": 0.1559633027522936,
+      "acc_stderr,exam_id__2014": 0.020030453529821687,
+      "acc,exam_id__2023": 0.1259259259259259,
+      "acc_stderr,exam_id__2023": 0.016457875974005987,
+      "acc,exam_id__2012": 0.1724137931034483,
+      "acc_stderr,exam_id__2012": 0.020267357896241688,
+      "acc,exam_id__2009": 0.21739130434782608,
+      "acc_stderr,exam_id__2009": 0.02225982853255498,
+      "acc,exam_id__2016_2": 0.22764227642276422,
+      "acc_stderr,exam_id__2016_2": 0.02185602387874544,
+      "acc,exam_id__2016": 0.23140495867768596,
+      "acc_stderr,exam_id__2016": 0.022086006785517493,
+      "acc,exam_id__2011": 0.23931623931623933,
+      "acc_stderr,exam_id__2011": 0.022801682010629237,
+      "acc,exam_id__2022": 0.19548872180451127,
+      "acc_stderr,exam_id__2022": 0.019880873159993897,
+      "acc,exam_id__2010": 0.20512820512820512,
+      "acc_stderr,exam_id__2010": 0.02153099502515524
+    },
+    "faquad_nli": {
+      "f1_macro,all": 0.4396551724137931,
+      "f1_macro_stderr,all": 0.0035796984729087084,
+      "acc,all": 0.7846153846153846,
+      "acc_stderr,all": 0.011396120309131327,
+      "alias": "faquad_nli"
+    },
+    "hatebr_offensive": {
+      "alias": "hatebr_offensive_binary",
+      "f1_macro,all": 0.3333333333333333,
+      "f1_macro_stderr,all": 0.004211692096804388,
+      "acc,all": 0.5,
+      "acc_stderr,all": 0.009474774173878393
+    },
+    "oab_exams": {
+      "acc,all": 0.2619589977220957,
+      "acc_stderr,all": 0.00542893270365873,
+      "acc,exam_id__2010-02": 0.23,
+      "acc_stderr,exam_id__2010-02": 0.02431926877739462,
+      "acc,exam_id__2013-10": 0.25,
+      "acc_stderr,exam_id__2013-10": 0.02796719960202988,
+      "acc,exam_id__2011-04": 0.2625,
+      "acc_stderr,exam_id__2011-04": 0.028328880871541104,
+      "acc,exam_id__2010-01": 0.2235294117647059,
+      "acc_stderr,exam_id__2010-01": 0.02616235337136691,
+      "acc,exam_id__2012-06": 0.2,
+      "acc_stderr,exam_id__2012-06": 0.02572791466083642,
+      "acc,exam_id__2018-25": 0.3,
+      "acc_stderr,exam_id__2018-25": 0.02953762820808716,
+      "acc,exam_id__2015-17": 0.28205128205128205,
+      "acc_stderr,exam_id__2015-17": 0.029426513732261166,
+      "acc,exam_id__2017-23": 0.2125,
+      "acc_stderr,exam_id__2017-23": 0.026411995739050786,
+      "acc,exam_id__2011-03": 0.26262626262626265,
+      "acc_stderr,exam_id__2011-03": 0.0254508380623283,
+      "acc,exam_id__2014-14": 0.3375,
+      "acc_stderr,exam_id__2014-14": 0.03037596639028736,
+      "acc,exam_id__2017-22": 0.1875,
+      "acc_stderr,exam_id__2017-22": 0.025246246572905185,
+      "acc,exam_id__2014-13": 0.2375,
+      "acc_stderr,exam_id__2014-13": 0.027477127021250086,
+      "acc,exam_id__2016-19": 0.28205128205128205,
+      "acc_stderr,exam_id__2016-19": 0.029354066721668577,
+      "acc,exam_id__2015-16": 0.2375,
+      "acc_stderr,exam_id__2015-16": 0.027519155256356734,
+      "acc,exam_id__2013-12": 0.25,
+      "acc_stderr,exam_id__2013-12": 0.0278249882668402,
+      "acc,exam_id__2016-20a": 0.2,
+      "acc_stderr,exam_id__2016-20a": 0.025816282656908076,
+      "acc,exam_id__2017-24": 0.2625,
+      "acc_stderr,exam_id__2017-24": 0.02837759375253264,
+      "acc,exam_id__2012-07": 0.3375,
+      "acc_stderr,exam_id__2012-07": 0.030522583703563626,
+      "acc,exam_id__2016-20": 0.3125,
+      "acc_stderr,exam_id__2016-20": 0.0299179459870414,
+      "acc,exam_id__2012-09": 0.2987012987012987,
+      "acc_stderr,exam_id__2012-09": 0.03013762764385893,
+      "acc,exam_id__2011-05": 0.2375,
+      "acc_stderr,exam_id__2011-05": 0.02745741550678293,
+      "acc,exam_id__2012-06a": 0.3,
+      "acc_stderr,exam_id__2012-06a": 0.02952659271601095,
+      "acc,exam_id__2012-08": 0.2625,
+      "acc_stderr,exam_id__2012-08": 0.028519076021589346,
+      "acc,exam_id__2013-11": 0.3125,
+      "acc_stderr,exam_id__2013-11": 0.03001703499334383,
+      "acc,exam_id__2016-21": 0.25,
+      "acc_stderr,exam_id__2016-21": 0.027889013139077292,
+      "acc,exam_id__2014-15": 0.28205128205128205,
+      "acc_stderr,exam_id__2014-15": 0.029316321078391615,
+      "acc,exam_id__2015-18": 0.275,
+      "acc_stderr,exam_id__2015-18": 0.02878672149179934,
+      "alias": "oab_exams"
+    },
+    "portuguese_hate_speech": {
+      "alias": "portuguese_hate_speech_binary",
+      "f1_macro,all": 0.412292817679558,
+      "f1_macro_stderr,all": 0.0038204811103947646,
+      "acc,all": 0.7015276145710928,
+      "acc_stderr,all": 0.011059935171649318
+    },
+    "tweetsentbr": {
+      "f1_macro,all": 0.4065189088420502,
+      "f1_macro_stderr,all": 0.006016613199350323,
+      "acc,all": 0.5592039800995025,
+      "acc_stderr,all": 0.007831261950396825,
+      "alias": "tweetsentbr"
+    }
+  },
+  "configs": {
+    "assin2_rte": {
+      "task": "assin2_rte",
+      "group": [
+        "pt_benchmark",
+        "assin2"
+      ],
+      "dataset_path": "assin2",
+      "test_split": "test",
+      "fewshot_split": "train",
+      "doc_to_text": "Premissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:",
+      "doc_to_target": "{{['Não', 'Sim'][entailment_judgment]}}",
+      "description": "Abaixo estão pares de premissa e hipótese. Para cada par, indique se a hipótese pode ser inferida a partir da premissa, responda apenas com \"Sim\" ou \"Não\".\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "id_sampler",
+        "sampler_config": {
+          "id_list": [
+            1,
+            3251,
+            2,
+            3252,
+            3,
+            4,
+            5,
+            6,
+            3253,
+            7,
+            3254,
+            3255,
+            3256,
+            8,
+            9,
+            10,
+            3257,
+            11,
+            3258,
+            12,
+            13,
+            14,
+            15,
+            3259,
+            3260,
+            3261,
+            3262,
+            3263,
+            16,
+            17,
+            3264,
+            18,
+            3265,
+            3266,
+            3267,
+            19,
+            20,
+            3268,
+            3269,
+            21,
+            3270,
+            3271,
+            22,
+            3272,
+            3273,
+            23,
+            3274,
+            24,
+            25,
+            3275
+          ],
+          "id_column": "sentence_pair_id"
+        }
+      },
+      "num_fewshot": 15,
+      "metric_list": [
+        {
+          "metric": "f1_macro",
+          "aggregation": "f1_macro",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "find_similar_label",
+              "labels": [
+                "Sim",
+                "Não"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ]
+        }
+      ],
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.1
+      }
+    },
+    "assin2_sts": {
+      "task": "assin2_sts",
+      "group": [
+        "pt_benchmark",
+        "assin2"
+      ],
+      "dataset_path": "assin2",
+      "test_split": "test",
+      "fewshot_split": "train",
+      "doc_to_text": "Frase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:",
+      "doc_to_target": "<function assin2_float_to_pt_str at 0x150c29c5b600>",
+      "description": "Abaixo estão pares de frases que você deve avaliar o grau de similaridade. Dê uma pontuação entre 1,0 e 5,0, sendo 1,0 pouco similar e 5,0 muito similar.\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "id_sampler",
+        "sampler_config": {
+          "id_list": [
+            1,
+            3251,
+            2,
+            3252,
+            3,
+            4,
+            5,
+            6,
+            3253,
+            7,
+            3254,
+            3255,
+            3256,
+            8,
+            9,
+            10,
+            3257,
+            11,
+            3258,
+            12,
+            13,
+            14,
+            15,
+            3259,
+            3260,
+            3261,
+            3262,
+            3263,
+            16,
+            17,
+            3264,
+            18,
+            3265,
+            3266,
+            3267,
+            19,
+            20,
+            3268,
+            3269,
+            21,
+            3270,
+            3271,
+            22,
+            3272,
+            3273,
+            23,
+            3274,
+            24,
+            25,
+            3275
+          ],
+          "id_column": "sentence_pair_id"
+        }
+      },
+      "num_fewshot": 10,
+      "metric_list": [
+        {
+          "metric": "pearson",
+          "aggregation": "pearsonr",
+          "higher_is_better": true
+        },
+        {
+          "metric": "mse",
+          "aggregation": "mean_squared_error",
+          "higher_is_better": false
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "number_filter",
+              "type": "float",
+              "range_min": 1.0,
+              "range_max": 5.0,
+              "on_outside_range": "clip",
+              "fallback": 5.0
+            },
+            {
+              "function": "take_first"
+            }
+          ]
+        }
+      ],
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.1
+      }
+    },
+    "bluex": {
+      "task": "bluex",
+      "group": [
+        "pt_benchmark",
+        "vestibular"
+      ],
+      "dataset_path": "eduagarcia-temp/BLUEX_without_images",
+      "test_split": "train",
+      "fewshot_split": "train",
+      "doc_to_text": "<function enem_doc_to_text at 0x150c29c5ab60>",
+      "doc_to_target": "{{answerKey}}",
+      "description": "As perguntas a seguir são questões de múltipla escolha de provas de vestibular de universidades brasileiras, selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "id_sampler",
+        "sampler_config": {
+          "id_list": [
+            "USP_2018_3",
+            "UNICAMP_2018_2",
+            "USP_2018_35",
+            "UNICAMP_2018_16",
+            "USP_2018_89"
+          ],
+          "id_column": "id",
+          "exclude_from_task": true
+        }
+      },
+      "num_fewshot": 3,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "normalize_spaces"
+            },
+            {
+              "function": "remove_accents"
+            },
+            {
+              "function": "find_choices",
+              "choices": [
+                "A",
+                "B",
+                "C",
+                "D",
+                "E"
+              ],
+              "regex_patterns": [
+                "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
+                "\\b([ABCDE])\\.",
+                "\\b([ABCDE]) ?[.):-]",
+                "\\b([ABCDE])$",
+                "\\b([ABCDE])\\b"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ],
+          "group_by": {
+            "column": "exam_id"
+          }
+        }
+      ],
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "<function enem_doc_to_text at 0x150c29c5ae80>",
+      "metadata": {
+        "version": 1.1
+      }
+    },
+    "enem_challenge": {
+      "task": "enem_challenge",
+      "task_alias": "enem",
+      "group": [
+        "pt_benchmark",
+        "vestibular"
+      ],
+      "dataset_path": "eduagarcia/enem_challenge",
+      "test_split": "train",
+      "fewshot_split": "train",
+      "doc_to_text": "<function enem_doc_to_text at 0x150c29c5b060>",
+      "doc_to_target": "{{answerKey}}",
+      "description": "As perguntas a seguir são questões de múltipla escolha do Exame Nacional do Ensino Médio (ENEM), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "id_sampler",
+        "sampler_config": {
+          "id_list": [
+            "2022_21",
+            "2022_88",
+            "2022_143"
+          ],
+          "id_column": "id",
+          "exclude_from_task": true
+        }
+      },
+      "num_fewshot": 3,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "normalize_spaces"
+            },
+            {
+              "function": "remove_accents"
+            },
+            {
+              "function": "find_choices",
+              "choices": [
+                "A",
+                "B",
+                "C",
+                "D",
+                "E"
+              ],
+              "regex_patterns": [
+                "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b",
+                "\\b([ABCDE])\\.",
+                "\\b([ABCDE]) ?[.):-]",
+                "\\b([ABCDE])$",
+                "\\b([ABCDE])\\b"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ],
+          "group_by": {
+            "column": "exam_id"
+          }
+        }
+      ],
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "<function enem_doc_to_text at 0x150c29c5b240>",
+      "metadata": {
+        "version": 1.1
+      }
+    },
+    "faquad_nli": {
+      "task": "faquad_nli",
+      "group": [
+        "pt_benchmark"
+      ],
+      "dataset_path": "ruanchaves/faquad-nli",
+      "test_split": "test",
+      "fewshot_split": "train",
+      "doc_to_text": "Pergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?",
+      "doc_to_target": "{{['Não', 'Sim'][label]}}",
+      "description": "Abaixo estão pares de pergunta e resposta. Para cada par, você deve julgar se a resposta responde à pergunta de maneira satisfatória e aparenta estar correta. Escreva apenas \"Sim\" ou \"Não\".\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "first_n",
+        "sampler_config": {
+          "fewshot_indices": [
+            1893,
+            949,
+            663,
+            105,
+            1169,
+            2910,
+            2227,
+            2813,
+            974,
+            558,
+            1503,
+            1958,
+            2918,
+            601,
+            1560,
+            984,
+            2388,
+            995,
+            2233,
+            1982,
+            165,
+            2788,
+            1312,
+            2285,
+            522,
+            1113,
+            1670,
+            323,
+            236,
+            1263,
+            1562,
+            2519,
+            1049,
+            432,
+            1167,
+            1394,
+            2022,
+            2551,
+            2194,
+            2187,
+            2282,
+            2816,
+            108,
+            301,
+            1185,
+            1315,
+            1420,
+            2436,
+            2322,
+            766
+          ]
+        }
+      },
+      "num_fewshot": 15,
+      "metric_list": [
+        {
+          "metric": "f1_macro",
+          "aggregation": "f1_macro",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "find_similar_label",
+              "labels": [
+                "Sim",
+                "Não"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ]
+        }
+      ],
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.1
+      }
+    },
+    "hatebr_offensive": {
+      "task": "hatebr_offensive",
+      "task_alias": "hatebr_offensive_binary",
+      "group": [
+        "pt_benchmark"
+      ],
+      "dataset_path": "eduagarcia/portuguese_benchmark",
+      "dataset_name": "HateBR_offensive_binary",
+      "test_split": "test",
+      "fewshot_split": "train",
+      "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:",
+      "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
+      "description": "Abaixo contém o texto de comentários de usuários do Instagram em português, sua tarefa é classificar se o texto é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "id_sampler",
+        "sampler_config": {
+          "id_list": [
+            48,
+            44,
+            36,
+            20,
+            3511,
+            88,
+            3555,
+            16,
+            56,
+            3535,
+            60,
+            40,
+            3527,
+            4,
+            76,
+            3579,
+            3523,
+            3551,
+            68,
+            3503,
+            84,
+            3539,
+            64,
+            3599,
+            80,
+            3563,
+            3559,
+            3543,
+            3547,
+            3587,
+            3595,
+            3575,
+            3567,
+            3591,
+            24,
+            96,
+            92,
+            3507,
+            52,
+            72,
+            8,
+            3571,
+            3515,
+            3519,
+            3531,
+            28,
+            32,
+            0,
+            12,
+            3583
+          ],
+          "id_column": "idx"
+        }
+      },
+      "num_fewshot": 25,
+      "metric_list": [
+        {
+          "metric": "f1_macro",
+          "aggregation": "f1_macro",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "find_similar_label",
+              "labels": [
+                "Sim",
+                "Não"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ]
+        }
+      ],
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.0
+      }
+    },
+    "oab_exams": {
+      "task": "oab_exams",
+      "group": [
+        "legal_benchmark",
+        "pt_benchmark"
+      ],
+      "dataset_path": "eduagarcia/oab_exams",
+      "test_split": "train",
+      "fewshot_split": "train",
+      "doc_to_text": "<function doc_to_text at 0x150c29c5bd80>",
+      "doc_to_target": "{{answerKey}}",
+      "description": "As perguntas a seguir são questões de múltipla escolha do Exame de Ordem da Ordem dos Advogados do Brasil (OAB), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\" ou \"D\".\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "id_sampler",
+        "sampler_config": {
+          "id_list": [
+            "2010-01_1",
+            "2010-01_11",
+            "2010-01_13",
+            "2010-01_23",
+            "2010-01_26",
+            "2010-01_28",
+            "2010-01_38",
+            "2010-01_48",
+            "2010-01_58",
+            "2010-01_68",
+            "2010-01_76",
+            "2010-01_83",
+            "2010-01_85",
+            "2010-01_91",
+            "2010-01_99"
+          ],
+          "id_column": "id",
+          "exclude_from_task": true
+        }
+      },
+      "num_fewshot": 3,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "normalize_spaces"
+            },
+            {
+              "function": "remove_accents"
+            },
+            {
+              "function": "find_choices",
+              "choices": [
+                "A",
+                "B",
+                "C",
+                "D"
+              ],
+              "regex_patterns": [
+                "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCD])\\b",
+                "\\b([ABCD])\\.",
+                "\\b([ABCD]) ?[.):-]",
+                "\\b([ABCD])$",
+                "\\b([ABCD])\\b"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ],
+          "group_by": {
+            "column": "exam_id"
+          }
+        }
+      ],
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "<function doc_to_text at 0x150c29aac040>",
+      "metadata": {
+        "version": 1.5
+      }
+    },
+    "portuguese_hate_speech": {
+      "task": "portuguese_hate_speech",
+      "task_alias": "portuguese_hate_speech_binary",
+      "group": [
+        "pt_benchmark"
+      ],
+      "dataset_path": "eduagarcia/portuguese_benchmark",
+      "dataset_name": "Portuguese_Hate_Speech_binary",
+      "test_split": "test",
+      "fewshot_split": "train",
+      "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:",
+      "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
+      "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o texto contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "id_sampler",
+        "sampler_config": {
+          "id_list": [
+            52,
+            50,
+            39,
+            28,
+            3,
+            105,
+            22,
+            25,
+            60,
+            11,
+            66,
+            41,
+            9,
+            4,
+            91,
+            42,
+            7,
+            20,
+            76,
+            1,
+            104,
+            13,
+            67,
+            54,
+            97,
+            27,
+            24,
+            14,
+            16,
+            48,
+            53,
+            40,
+            34,
+            49,
+            32,
+            119,
+            114,
+            2,
+            58,
+            83,
+            18,
+            36,
+            5,
+            6,
+            10,
+            35,
+            38,
+            0,
+            21,
+            46
+          ],
+          "id_column": "idx"
+        }
+      },
+      "num_fewshot": 25,
+      "metric_list": [
+        {
+          "metric": "f1_macro",
+          "aggregation": "f1_macro",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "find_similar_label",
+              "labels": [
+                "Sim",
+                "Não"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ]
+        }
+      ],
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.0
+      }
+    },
+    "tweetsentbr": {
+      "task": "tweetsentbr",
+      "group": [
+        "pt_benchmark"
+      ],
+      "dataset_path": "eduagarcia/tweetsentbr_fewshot",
+      "test_split": "test",
+      "fewshot_split": "train",
+      "doc_to_text": "Texto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:",
+      "doc_to_target": "{{'Positivo' if label == 'Positive' else ('Negativo' if label == 'Negative' else 'Neutro')}}",
+      "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o sentimento do texto é Positivo, Neutro ou Negativo. Responda apenas com uma das opções.\n\n",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "fewshot_config": {
+        "sampler": "first_n"
+      },
+      "num_fewshot": 25,
+      "metric_list": [
+        {
+          "metric": "f1_macro",
+          "aggregation": "f1_macro",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc",
+          "aggregation": "acc",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "generate_until",
+      "generation_kwargs": {
+        "max_gen_toks": 32,
+        "do_sample": false,
+        "temperature": 0.0,
+        "top_k": null,
+        "top_p": null,
+        "until": [
+          "\n\n"
+        ]
+      },
+      "repeats": 1,
+      "filter_list": [
+        {
+          "name": "all",
+          "filter": [
+            {
+              "function": "find_similar_label",
+              "labels": [
+                "Positivo",
+                "Neutro",
+                "Negativo"
+              ]
+            },
+            {
+              "function": "take_first"
+            }
+          ]
+        }
+      ],
+      "should_decontaminate": false,
+      "metadata": {
+        "version": 1.0
+      }
+    }
+  },
+  "versions": {
+    "assin2_rte": 1.1,
+    "assin2_sts": 1.1,
+    "bluex": 1.1,
+    "enem_challenge": 1.1,
+    "faquad_nli": 1.1,
+    "hatebr_offensive": 1.0,
+    "oab_exams": 1.5,
+    "portuguese_hate_speech": 1.0,
+    "tweetsentbr": 1.0
+  },
+  "n-shot": {
+    "assin2_rte": 15,
+    "assin2_sts": 10,
+    "bluex": 3,
+    "enem_challenge": 3,
+    "faquad_nli": 15,
+    "hatebr_offensive": 25,
+    "oab_exams": 3,
+    "portuguese_hate_speech": 25,
+    "tweetsentbr": 25
+  },
+  "model_meta": {
+    "truncated": 2,
+    "non_truncated": 14148,
+    "padded": 0,
+    "non_padded": 14150,
+    "fewshots_truncated": 4,
+    "has_chat_template": false,
+    "chat_type": null,
+    "n_gpus": 1,
+    "accelerate_num_process": null,
+    "model_sha": "None",
+    "model_dtype": "torch.bfloat16",
+    "model_memory_footprint": 2200116096,
+    "model_num_parameters": 1100056576,
+    "model_is_loaded_in_4bit": null,
+    "model_is_loaded_in_8bit": null,
+    "model_is_quantized": null,
+    "model_device": "cuda:0",
+    "batch_size": 32,
+    "max_length": 2048,
+    "max_ctx_length": 2016,
+    "max_gen_toks": 32
+  },
+  "task_model_meta": {
+    "assin2_rte": {
+      "sample_size": 2448,
+      "truncated": 0,
+      "non_truncated": 2448,
+      "padded": 0,
+      "non_padded": 2448,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 924.4232026143791,
+      "min_seq_length": 909,
+      "max_seq_length": 963,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 15.0,
+      "mean_effective_fewshot_size": 15.0
+    },
+    "assin2_sts": {
+      "sample_size": 2448,
+      "truncated": 0,
+      "non_truncated": 2448,
+      "padded": 0,
+      "non_padded": 2448,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 659.4232026143791,
+      "min_seq_length": 644,
+      "max_seq_length": 698,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 10.0,
+      "mean_effective_fewshot_size": 10.0
+    },
+    "bluex": {
+      "sample_size": 719,
+      "truncated": 0,
+      "non_truncated": 719,
+      "padded": 0,
+      "non_padded": 719,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 1170.817802503477,
+      "min_seq_length": 904,
+      "max_seq_length": 1801,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 3.0,
+      "mean_effective_fewshot_size": 3.0
+    },
+    "enem_challenge": {
+      "sample_size": 1429,
+      "truncated": 2,
+      "non_truncated": 1427,
+      "padded": 0,
+      "non_padded": 1429,
+      "fewshots_truncated": 4,
+      "mean_seq_length": 1007.4177746675997,
+      "min_seq_length": 829,
+      "max_seq_length": 2484,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 3.0,
+      "mean_effective_fewshot_size": 2.9972008397480754
+    },
+    "faquad_nli": {
+      "sample_size": 650,
+      "truncated": 0,
+      "non_truncated": 650,
+      "padded": 0,
+      "non_padded": 650,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 968.1338461538462,
+      "min_seq_length": 936,
+      "max_seq_length": 1034,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 15.0,
+      "mean_effective_fewshot_size": 15.0
+    },
+    "hatebr_offensive": {
+      "sample_size": 1400,
+      "truncated": 0,
+      "non_truncated": 1400,
+      "padded": 0,
+      "non_padded": 1400,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 867.4407142857143,
+      "min_seq_length": 852,
+      "max_seq_length": 1061,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 25.0,
+      "mean_effective_fewshot_size": 25.0
+    },
+    "oab_exams": {
+      "sample_size": 2195,
+      "truncated": 0,
+      "non_truncated": 2195,
+      "padded": 0,
+      "non_padded": 2195,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 832.024145785877,
+      "min_seq_length": 659,
+      "max_seq_length": 1108,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 3.0,
+      "mean_effective_fewshot_size": 3.0
+    },
+    "portuguese_hate_speech": {
+      "sample_size": 851,
+      "truncated": 0,
+      "non_truncated": 851,
+      "padded": 0,
+      "non_padded": 851,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 1219.021151586369,
+      "min_seq_length": 1192,
+      "max_seq_length": 1255,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 25.0,
+      "mean_effective_fewshot_size": 25.0
+    },
+    "tweetsentbr": {
+      "sample_size": 2010,
+      "truncated": 0,
+      "non_truncated": 2010,
+      "padded": 0,
+      "non_padded": 2010,
+      "fewshots_truncated": 0,
+      "mean_seq_length": 1154.4194029850746,
+      "min_seq_length": 1137,
+      "max_seq_length": 1211,
+      "max_ctx_length": 2016,
+      "max_gen_toks": 32,
+      "mean_original_fewshots_size": 25.0,
+      "mean_effective_fewshot_size": 25.0
+    }
+  },
+  "config": {
+    "model": "huggingface",
+    "model_args": "pretrained=/lustre/mlnvme/data/asen_hpc-mula/checkpoints-llama/slurm_job_17066349/step_21084",
+    "batch_size": "auto",
+    "batch_sizes": [],
+    "device": "cuda:0",
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "gen_kwargs": null
+  },
+  "git_hash": null
+}