Tucano-630m / results-pt.json

Upload results-pt.json with huggingface_hub

0449a5c verified 3 months ago

36.3 kB

	{
	"results": {
	"assin2_rte": {
	"f1_macro,all": 0.577927165258549,
	"f1_macro_stderr,all": 0.0071561886831375205,
	"acc,all": 0.6004901960784313,
	"acc_stderr,all": 0.006996270367086327,
	"alias": "assin2_rte"
	},
	"assin2_sts": {
	"pearson,all": 0.01994583242718194,
	"pearson_stderr,all": 0.014030097851152527,
	"mse,all": 2.0782883986928105,
	"mse_stderr,all": "N/A",
	"alias": "assin2_sts"
	},
	"bluex": {
	"acc,all": 0.24756606397774686,
	"acc_stderr,all": 0.009256090690166377,
	"acc,exam_id__USP_2020": 0.23214285714285715,
	"acc_stderr,exam_id__USP_2020": 0.03265301549767108,
	"acc,exam_id__UNICAMP_2022": 0.3076923076923077,
	"acc_stderr,exam_id__UNICAMP_2022": 0.04284625323101336,
	"acc,exam_id__USP_2019": 0.15,
	"acc_stderr,exam_id__USP_2019": 0.032604803052830006,
	"acc,exam_id__UNICAMP_2019": 0.26,
	"acc_stderr,exam_id__UNICAMP_2019": 0.035805342426308485,
	"acc,exam_id__UNICAMP_2023": 0.3488372093023256,
	"acc_stderr,exam_id__UNICAMP_2023": 0.041919014018416384,
	"acc,exam_id__UNICAMP_2021_2": 0.3137254901960784,
	"acc_stderr,exam_id__UNICAMP_2021_2": 0.037486879456641045,
	"acc,exam_id__USP_2023": 0.18181818181818182,
	"acc_stderr,exam_id__USP_2023": 0.03349033495084427,
	"acc,exam_id__USP_2021": 0.23076923076923078,
	"acc_stderr,exam_id__USP_2021": 0.03381852906171476,
	"acc,exam_id__UNICAMP_2021_1": 0.2391304347826087,
	"acc_stderr,exam_id__UNICAMP_2021_1": 0.03638828981438008,
	"acc,exam_id__UNICAMP_2020": 0.32727272727272727,
	"acc_stderr,exam_id__UNICAMP_2020": 0.03653884692218891,
	"acc,exam_id__USP_2022": 0.16326530612244897,
	"acc_stderr,exam_id__USP_2022": 0.03041886398004624,
	"acc,exam_id__USP_2024": 0.1951219512195122,
	"acc_stderr,exam_id__USP_2024": 0.03573567350069458,
	"acc,exam_id__UNICAMP_2024": 0.28888888888888886,
	"acc_stderr,exam_id__UNICAMP_2024": 0.038994404039252965,
	"acc,exam_id__USP_2018": 0.16666666666666666,
	"acc_stderr,exam_id__USP_2018": 0.029375798861409928,
	"acc,exam_id__UNICAMP_2018": 0.2962962962962963,
	"acc_stderr,exam_id__UNICAMP_2018": 0.035803514128866995,
	"alias": "bluex"
	},
	"enem_challenge": {
	"alias": "enem",
	"acc,all": 0.19174247725682295,
	"acc_stderr,all": 0.006016154421005065,
	"acc,exam_id__2014": 0.14678899082568808,
	"acc_stderr,exam_id__2014": 0.019587599223040625,
	"acc,exam_id__2017": 0.1896551724137931,
	"acc_stderr,exam_id__2017": 0.02093347844131792,
	"acc,exam_id__2016_2": 0.21951219512195122,
	"acc_stderr,exam_id__2016_2": 0.021542806426202234,
	"acc,exam_id__2016": 0.23140495867768596,
	"acc_stderr,exam_id__2016": 0.022131076893750434,
	"acc,exam_id__2012": 0.1724137931034483,
	"acc_stderr,exam_id__2012": 0.020164330045300043,
	"acc,exam_id__2010": 0.2222222222222222,
	"acc_stderr,exam_id__2010": 0.02213114984638704,
	"acc,exam_id__2022": 0.16541353383458646,
	"acc_stderr,exam_id__2022": 0.018647718361639067,
	"acc,exam_id__2015": 0.23529411764705882,
	"acc_stderr,exam_id__2015": 0.02241173308841978,
	"acc,exam_id__2011": 0.18803418803418803,
	"acc_stderr,exam_id__2011": 0.020817448020547174,
	"acc,exam_id__2013": 0.23148148148148148,
	"acc_stderr,exam_id__2013": 0.02345320376483438,
	"acc,exam_id__2009": 0.13043478260869565,
	"acc_stderr,exam_id__2009": 0.018075482087183004,
	"acc,exam_id__2023": 0.17037037037037037,
	"acc_stderr,exam_id__2023": 0.018694761764683765
	},
	"faquad_nli": {
	"f1_macro,all": 0.4396551724137931,
	"f1_macro_stderr,all": 0.0035796984729087084,
	"acc,all": 0.7846153846153846,
	"acc_stderr,all": 0.011396120309131327,
	"alias": "faquad_nli"
	},
	"hatebr_offensive": {
	"alias": "hatebr_offensive_binary",
	"f1_macro,all": 0.5373169584712172,
	"f1_macro_stderr,all": 0.009509765977605292,
	"acc,all": 0.5642857142857143,
	"acc_stderr,all": 0.009363640166058677
	},
	"oab_exams": {
	"acc,all": 0.2528473804100228,
	"acc_stderr,all": 0.005369584144606133,
	"acc,exam_id__2012-06a": 0.2875,
	"acc_stderr,exam_id__2012-06a": 0.029268820937911468,
	"acc,exam_id__2012-09": 0.2857142857142857,
	"acc_stderr,exam_id__2012-09": 0.02981338225884132,
	"acc,exam_id__2013-10": 0.1375,
	"acc_stderr,exam_id__2013-10": 0.022269660648017626,
	"acc,exam_id__2011-04": 0.2625,
	"acc_stderr,exam_id__2011-04": 0.02838995857974862,
	"acc,exam_id__2018-25": 0.2,
	"acc_stderr,exam_id__2018-25": 0.02584730185242964,
	"acc,exam_id__2011-05": 0.2625,
	"acc_stderr,exam_id__2011-05": 0.028394014157416495,
	"acc,exam_id__2012-07": 0.275,
	"acc_stderr,exam_id__2012-07": 0.028825122025371252,
	"acc,exam_id__2014-13": 0.2375,
	"acc_stderr,exam_id__2014-13": 0.027472596578792648,
	"acc,exam_id__2012-06": 0.2375,
	"acc_stderr,exam_id__2012-06": 0.027444121032139803,
	"acc,exam_id__2015-16": 0.3125,
	"acc_stderr,exam_id__2015-16": 0.03003339727126325,
	"acc,exam_id__2017-22": 0.25,
	"acc_stderr,exam_id__2017-22": 0.027997530519651585,
	"acc,exam_id__2017-24": 0.275,
	"acc_stderr,exam_id__2017-24": 0.028929264616280162,
	"acc,exam_id__2014-14": 0.2125,
	"acc_stderr,exam_id__2014-14": 0.02645590211050805,
	"acc,exam_id__2011-03": 0.25252525252525254,
	"acc_stderr,exam_id__2011-03": 0.02522016343392691,
	"acc,exam_id__2014-15": 0.2564102564102564,
	"acc_stderr,exam_id__2014-15": 0.02850809134358431,
	"acc,exam_id__2016-19": 0.2948717948717949,
	"acc_stderr,exam_id__2016-19": 0.029792524691902637,
	"acc,exam_id__2012-08": 0.2375,
	"acc_stderr,exam_id__2012-08": 0.02756426923964389,
	"acc,exam_id__2015-17": 0.24358974358974358,
	"acc_stderr,exam_id__2015-17": 0.028007851240875885,
	"acc,exam_id__2015-18": 0.2875,
	"acc_stderr,exam_id__2015-18": 0.02920616482631226,
	"acc,exam_id__2010-01": 0.25882352941176473,
	"acc_stderr,exam_id__2010-01": 0.027343286705705846,
	"acc,exam_id__2016-20a": 0.2125,
	"acc_stderr,exam_id__2016-20a": 0.026410470748996454,
	"acc,exam_id__2016-21": 0.2625,
	"acc_stderr,exam_id__2016-21": 0.028563711821251347,
	"acc,exam_id__2016-20": 0.225,
	"acc_stderr,exam_id__2016-20": 0.02689442014065515,
	"acc,exam_id__2017-23": 0.275,
	"acc_stderr,exam_id__2017-23": 0.028728458302479908,
	"acc,exam_id__2013-12": 0.2875,
	"acc_stderr,exam_id__2013-12": 0.02914378722659053,
	"acc,exam_id__2010-02": 0.27,
	"acc_stderr,exam_id__2010-02": 0.02563525101778004,
	"acc,exam_id__2013-11": 0.225,
	"acc_stderr,exam_id__2013-11": 0.027058590192888813,
	"alias": "oab_exams"
	},
	"portuguese_hate_speech": {
	"alias": "portuguese_hate_speech_binary",
	"f1_macro,all": 0.300060277449729,
	"f1_macro_stderr,all": 0.010227555978721074,
	"acc,all": 0.33137485311398357,
	"acc_stderr,all": 0.011379474523059378
	},
	"tweetsentbr": {
	"f1_macro,all": 0.2072905953605302,
	"f1_macro_stderr,all": 0.0024889383614545776,
	"acc,all": 0.45124378109452734,
	"acc_stderr,all": 0.007861876742012653,
	"alias": "tweetsentbr"
	}
	},
	"configs": {
	"assin2_rte": {
	"task": "assin2_rte",
	"group": [
	"pt_benchmark",
	"assin2"
	],
	"dataset_path": "assin2",
	"test_split": "test",
	"fewshot_split": "train",
	"doc_to_text": "Premissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:",
	"doc_to_target": "{{['Não', 'Sim'][entailment_judgment]}}",
	"description": "Abaixo estão pares de premissa e hipótese. Para cada par, indique se a hipótese pode ser inferida a partir da premissa, responda apenas com \"Sim\" ou \"Não\".\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "id_sampler",
	"sampler_config": {
	"id_list": [
	1,
	3251,
	2,
	3252,
	3,
	4,
	5,
	6,
	3253,
	7,
	3254,
	3255,
	3256,
	8,
	9,
	10,
	3257,
	11,
	3258,
	12,
	13,
	14,
	15,
	3259,
	3260,
	3261,
	3262,
	3263,
	16,
	17,
	3264,
	18,
	3265,
	3266,
	3267,
	19,
	20,
	3268,
	3269,
	21,
	3270,
	3271,
	22,
	3272,
	3273,
	23,
	3274,
	24,
	25,
	3275
	],
	"id_column": "sentence_pair_id"
	}
	},
	"num_fewshot": 15,
	"metric_list": [
	{
	"metric": "f1_macro",
	"aggregation": "f1_macro",
	"higher_is_better": true
	},
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "find_similar_label",
	"labels": [
	"Sim",
	"Não"
	]
	},
	{
	"function": "take_first"
	}
	]
	}
	],
	"should_decontaminate": false,
	"metadata": {
	"version": 1.1
	}
	},
	"assin2_sts": {
	"task": "assin2_sts",
	"group": [
	"pt_benchmark",
	"assin2"
	],
	"dataset_path": "assin2",
	"test_split": "test",
	"fewshot_split": "train",
	"doc_to_text": "Frase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:",
	"doc_to_target": "<function assin2_float_to_pt_str at 0x14879d55b600>",
	"description": "Abaixo estão pares de frases que você deve avaliar o grau de similaridade. Dê uma pontuação entre 1,0 e 5,0, sendo 1,0 pouco similar e 5,0 muito similar.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "id_sampler",
	"sampler_config": {
	"id_list": [
	1,
	3251,
	2,
	3252,
	3,
	4,
	5,
	6,
	3253,
	7,
	3254,
	3255,
	3256,
	8,
	9,
	10,
	3257,
	11,
	3258,
	12,
	13,
	14,
	15,
	3259,
	3260,
	3261,
	3262,
	3263,
	16,
	17,
	3264,
	18,
	3265,
	3266,
	3267,
	19,
	20,
	3268,
	3269,
	21,
	3270,
	3271,
	22,
	3272,
	3273,
	23,
	3274,
	24,
	25,
	3275
	],
	"id_column": "sentence_pair_id"
	}
	},
	"num_fewshot": 10,
	"metric_list": [
	{
	"metric": "pearson",
	"aggregation": "pearsonr",
	"higher_is_better": true
	},
	{
	"metric": "mse",
	"aggregation": "mean_squared_error",
	"higher_is_better": false
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "number_filter",
	"type": "float",
	"range_min": 1.0,
	"range_max": 5.0,
	"on_outside_range": "clip",
	"fallback": 5.0
	},
	{
	"function": "take_first"
	}
	]
	}
	],
	"should_decontaminate": false,
	"metadata": {
	"version": 1.1
	}
	},
	"bluex": {
	"task": "bluex",
	"group": [
	"pt_benchmark",
	"vestibular"
	],
	"dataset_path": "eduagarcia-temp/BLUEX_without_images",
	"test_split": "train",
	"fewshot_split": "train",
	"doc_to_text": "<function enem_doc_to_text at 0x14879d55ab60>",
	"doc_to_target": "{{answerKey}}",
	"description": "As perguntas a seguir são questões de múltipla escolha de provas de vestibular de universidades brasileiras, selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "id_sampler",
	"sampler_config": {
	"id_list": [
	"USP_2018_3",
	"UNICAMP_2018_2",
	"USP_2018_35",
	"UNICAMP_2018_16",
	"USP_2018_89"
	],
	"id_column": "id",
	"exclude_from_task": true
	}
	},
	"num_fewshot": 3,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "normalize_spaces"
	},
	{
	"function": "remove_accents"
	},
	{
	"function": "find_choices",
	"choices": [
	"A",
	"B",
	"C",
	"D",
	"E"
	],
	"regex_patterns": [
	"(?:[Ll]etra\|[Aa]lternativa\|[Rr]esposta\|[Rr]esposta [Cc]orreta\|[Rr]esposta [Cc]orreta e\|[Oo]pcao):? ([ABCDE])\\b",
	"\\b([ABCDE])\\.",
	"\\b([ABCDE]) ?[.):-]",
	"\\b([ABCDE])$",
	"\\b([ABCDE])\\b"
	]
	},
	{
	"function": "take_first"
	}
	],
	"group_by": {
	"column": "exam_id"
	}
	}
	],
	"should_decontaminate": true,
	"doc_to_decontamination_query": "<function enem_doc_to_text at 0x14879d55ae80>",
	"metadata": {
	"version": 1.1
	}
	},
	"enem_challenge": {
	"task": "enem_challenge",
	"task_alias": "enem",
	"group": [
	"pt_benchmark",
	"vestibular"
	],
	"dataset_path": "eduagarcia/enem_challenge",
	"test_split": "train",
	"fewshot_split": "train",
	"doc_to_text": "<function enem_doc_to_text at 0x14879d55b060>",
	"doc_to_target": "{{answerKey}}",
	"description": "As perguntas a seguir são questões de múltipla escolha do Exame Nacional do Ensino Médio (ENEM), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "id_sampler",
	"sampler_config": {
	"id_list": [
	"2022_21",
	"2022_88",
	"2022_143"
	],
	"id_column": "id",
	"exclude_from_task": true
	}
	},
	"num_fewshot": 3,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "normalize_spaces"
	},
	{
	"function": "remove_accents"
	},
	{
	"function": "find_choices",
	"choices": [
	"A",
	"B",
	"C",
	"D",
	"E"
	],
	"regex_patterns": [
	"(?:[Ll]etra\|[Aa]lternativa\|[Rr]esposta\|[Rr]esposta [Cc]orreta\|[Rr]esposta [Cc]orreta e\|[Oo]pcao):? ([ABCDE])\\b",
	"\\b([ABCDE])\\.",
	"\\b([ABCDE]) ?[.):-]",
	"\\b([ABCDE])$",
	"\\b([ABCDE])\\b"
	]
	},
	{
	"function": "take_first"
	}
	],
	"group_by": {
	"column": "exam_id"
	}
	}
	],
	"should_decontaminate": true,
	"doc_to_decontamination_query": "<function enem_doc_to_text at 0x14879d55b240>",
	"metadata": {
	"version": 1.1
	}
	},
	"faquad_nli": {
	"task": "faquad_nli",
	"group": [
	"pt_benchmark"
	],
	"dataset_path": "ruanchaves/faquad-nli",
	"test_split": "test",
	"fewshot_split": "train",
	"doc_to_text": "Pergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?",
	"doc_to_target": "{{['Não', 'Sim'][label]}}",
	"description": "Abaixo estão pares de pergunta e resposta. Para cada par, você deve julgar se a resposta responde à pergunta de maneira satisfatória e aparenta estar correta. Escreva apenas \"Sim\" ou \"Não\".\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n",
	"sampler_config": {
	"fewshot_indices": [
	1893,
	949,
	663,
	105,
	1169,
	2910,
	2227,
	2813,
	974,
	558,
	1503,
	1958,
	2918,
	601,
	1560,
	984,
	2388,
	995,
	2233,
	1982,
	165,
	2788,
	1312,
	2285,
	522,
	1113,
	1670,
	323,
	236,
	1263,
	1562,
	2519,
	1049,
	432,
	1167,
	1394,
	2022,
	2551,
	2194,
	2187,
	2282,
	2816,
	108,
	301,
	1185,
	1315,
	1420,
	2436,
	2322,
	766
	]
	}
	},
	"num_fewshot": 15,
	"metric_list": [
	{
	"metric": "f1_macro",
	"aggregation": "f1_macro",
	"higher_is_better": true
	},
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "find_similar_label",
	"labels": [
	"Sim",
	"Não"
	]
	},
	{
	"function": "take_first"
	}
	]
	}
	],
	"should_decontaminate": false,
	"metadata": {
	"version": 1.1
	}
	},
	"hatebr_offensive": {
	"task": "hatebr_offensive",
	"task_alias": "hatebr_offensive_binary",
	"group": [
	"pt_benchmark"
	],
	"dataset_path": "eduagarcia/portuguese_benchmark",
	"dataset_name": "HateBR_offensive_binary",
	"test_split": "test",
	"fewshot_split": "train",
	"doc_to_text": "Texto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:",
	"doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
	"description": "Abaixo contém o texto de comentários de usuários do Instagram em português, sua tarefa é classificar se o texto é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "id_sampler",
	"sampler_config": {
	"id_list": [
	48,
	44,
	36,
	20,
	3511,
	88,
	3555,
	16,
	56,
	3535,
	60,
	40,
	3527,
	4,
	76,
	3579,
	3523,
	3551,
	68,
	3503,
	84,
	3539,
	64,
	3599,
	80,
	3563,
	3559,
	3543,
	3547,
	3587,
	3595,
	3575,
	3567,
	3591,
	24,
	96,
	92,
	3507,
	52,
	72,
	8,
	3571,
	3515,
	3519,
	3531,
	28,
	32,
	0,
	12,
	3583
	],
	"id_column": "idx"
	}
	},
	"num_fewshot": 25,
	"metric_list": [
	{
	"metric": "f1_macro",
	"aggregation": "f1_macro",
	"higher_is_better": true
	},
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "find_similar_label",
	"labels": [
	"Sim",
	"Não"
	]
	},
	{
	"function": "take_first"
	}
	]
	}
	],
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"oab_exams": {
	"task": "oab_exams",
	"group": [
	"legal_benchmark",
	"pt_benchmark"
	],
	"dataset_path": "eduagarcia/oab_exams",
	"test_split": "train",
	"fewshot_split": "train",
	"doc_to_text": "<function doc_to_text at 0x14879d55bd80>",
	"doc_to_target": "{{answerKey}}",
	"description": "As perguntas a seguir são questões de múltipla escolha do Exame de Ordem da Ordem dos Advogados do Brasil (OAB), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\" ou \"D\".\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "id_sampler",
	"sampler_config": {
	"id_list": [
	"2010-01_1",
	"2010-01_11",
	"2010-01_13",
	"2010-01_23",
	"2010-01_26",
	"2010-01_28",
	"2010-01_38",
	"2010-01_48",
	"2010-01_58",
	"2010-01_68",
	"2010-01_76",
	"2010-01_83",
	"2010-01_85",
	"2010-01_91",
	"2010-01_99"
	],
	"id_column": "id",
	"exclude_from_task": true
	}
	},
	"num_fewshot": 3,
	"metric_list": [
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "normalize_spaces"
	},
	{
	"function": "remove_accents"
	},
	{
	"function": "find_choices",
	"choices": [
	"A",
	"B",
	"C",
	"D"
	],
	"regex_patterns": [
	"(?:[Ll]etra\|[Aa]lternativa\|[Rr]esposta\|[Rr]esposta [Cc]orreta\|[Rr]esposta [Cc]orreta e\|[Oo]pcao):? ([ABCD])\\b",
	"\\b([ABCD])\\.",
	"\\b([ABCD]) ?[.):-]",
	"\\b([ABCD])$",
	"\\b([ABCD])\\b"
	]
	},
	{
	"function": "take_first"
	}
	],
	"group_by": {
	"column": "exam_id"
	}
	}
	],
	"should_decontaminate": true,
	"doc_to_decontamination_query": "<function doc_to_text at 0x14879d3ac040>",
	"metadata": {
	"version": 1.5
	}
	},
	"portuguese_hate_speech": {
	"task": "portuguese_hate_speech",
	"task_alias": "portuguese_hate_speech_binary",
	"group": [
	"pt_benchmark"
	],
	"dataset_path": "eduagarcia/portuguese_benchmark",
	"dataset_name": "Portuguese_Hate_Speech_binary",
	"test_split": "test",
	"fewshot_split": "train",
	"doc_to_text": "Texto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:",
	"doc_to_target": "{{'Sim' if label == 1 else 'Não'}}",
	"description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o texto contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "id_sampler",
	"sampler_config": {
	"id_list": [
	52,
	50,
	39,
	28,
	3,
	105,
	22,
	25,
	60,
	11,
	66,
	41,
	9,
	4,
	91,
	42,
	7,
	20,
	76,
	1,
	104,
	13,
	67,
	54,
	97,
	27,
	24,
	14,
	16,
	48,
	53,
	40,
	34,
	49,
	32,
	119,
	114,
	2,
	58,
	83,
	18,
	36,
	5,
	6,
	10,
	35,
	38,
	0,
	21,
	46
	],
	"id_column": "idx"
	}
	},
	"num_fewshot": 25,
	"metric_list": [
	{
	"metric": "f1_macro",
	"aggregation": "f1_macro",
	"higher_is_better": true
	},
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "find_similar_label",
	"labels": [
	"Sim",
	"Não"
	]
	},
	{
	"function": "take_first"
	}
	]
	}
	],
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	},
	"tweetsentbr": {
	"task": "tweetsentbr",
	"group": [
	"pt_benchmark"
	],
	"dataset_path": "eduagarcia/tweetsentbr_fewshot",
	"test_split": "test",
	"fewshot_split": "train",
	"doc_to_text": "Texto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:",
	"doc_to_target": "{{'Positivo' if label == 'Positive' else ('Negativo' if label == 'Negative' else 'Neutro')}}",
	"description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o sentimento do texto é Positivo, Neutro ou Negativo. Responda apenas com uma das opções.\n\n",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"fewshot_config": {
	"sampler": "first_n"
	},
	"num_fewshot": 25,
	"metric_list": [
	{
	"metric": "f1_macro",
	"aggregation": "f1_macro",
	"higher_is_better": true
	},
	{
	"metric": "acc",
	"aggregation": "acc",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"max_gen_toks": 32,
	"do_sample": false,
	"temperature": 0.0,
	"top_k": null,
	"top_p": null,
	"until": [
	"\n\n"
	]
	},
	"repeats": 1,
	"filter_list": [
	{
	"name": "all",
	"filter": [
	{
	"function": "find_similar_label",
	"labels": [
	"Positivo",
	"Neutro",
	"Negativo"
	]
	},
	{
	"function": "take_first"
	}
	]
	}
	],
	"should_decontaminate": false,
	"metadata": {
	"version": 1.0
	}
	}
	},
	"versions": {
	"assin2_rte": 1.1,
	"assin2_sts": 1.1,
	"bluex": 1.1,
	"enem_challenge": 1.1,
	"faquad_nli": 1.1,
	"hatebr_offensive": 1.0,
	"oab_exams": 1.5,
	"portuguese_hate_speech": 1.0,
	"tweetsentbr": 1.0
	},
	"n-shot": {
	"assin2_rte": 15,
	"assin2_sts": 10,
	"bluex": 3,
	"enem_challenge": 3,
	"faquad_nli": 15,
	"hatebr_offensive": 25,
	"oab_exams": 3,
	"portuguese_hate_speech": 25,
	"tweetsentbr": 25
	},
	"model_meta": {
	"truncated": 2,
	"non_truncated": 14148,
	"padded": 0,
	"non_padded": 14150,
	"fewshots_truncated": 4,
	"has_chat_template": false,
	"chat_type": null,
	"n_gpus": 1,
	"accelerate_num_process": null,
	"model_sha": "None",
	"model_dtype": "torch.bfloat16",
	"model_memory_footprint": 1260510976,
	"model_num_parameters": 630253568,
	"model_is_loaded_in_4bit": null,
	"model_is_loaded_in_8bit": null,
	"model_is_quantized": null,
	"model_device": "cuda:0",
	"batch_size": 64,
	"max_length": 2048,
	"max_ctx_length": 2016,
	"max_gen_toks": 32
	},
	"task_model_meta": {
	"assin2_rte": {
	"sample_size": 2448,
	"truncated": 0,
	"non_truncated": 2448,
	"padded": 0,
	"non_padded": 2448,
	"fewshots_truncated": 0,
	"mean_seq_length": 924.4232026143791,
	"min_seq_length": 909,
	"max_seq_length": 963,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 15.0,
	"mean_effective_fewshot_size": 15.0
	},
	"assin2_sts": {
	"sample_size": 2448,
	"truncated": 0,
	"non_truncated": 2448,
	"padded": 0,
	"non_padded": 2448,
	"fewshots_truncated": 0,
	"mean_seq_length": 659.4232026143791,
	"min_seq_length": 644,
	"max_seq_length": 698,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 10.0,
	"mean_effective_fewshot_size": 10.0
	},
	"bluex": {
	"sample_size": 719,
	"truncated": 0,
	"non_truncated": 719,
	"padded": 0,
	"non_padded": 719,
	"fewshots_truncated": 0,
	"mean_seq_length": 1170.817802503477,
	"min_seq_length": 904,
	"max_seq_length": 1801,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 3.0,
	"mean_effective_fewshot_size": 3.0
	},
	"enem_challenge": {
	"sample_size": 1429,
	"truncated": 2,
	"non_truncated": 1427,
	"padded": 0,
	"non_padded": 1429,
	"fewshots_truncated": 4,
	"mean_seq_length": 1007.4177746675997,
	"min_seq_length": 829,
	"max_seq_length": 2484,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 3.0,
	"mean_effective_fewshot_size": 2.9972008397480754
	},
	"faquad_nli": {
	"sample_size": 650,
	"truncated": 0,
	"non_truncated": 650,
	"padded": 0,
	"non_padded": 650,
	"fewshots_truncated": 0,
	"mean_seq_length": 968.1338461538462,
	"min_seq_length": 936,
	"max_seq_length": 1034,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 15.0,
	"mean_effective_fewshot_size": 15.0
	},
	"hatebr_offensive": {
	"sample_size": 1400,
	"truncated": 0,
	"non_truncated": 1400,
	"padded": 0,
	"non_padded": 1400,
	"fewshots_truncated": 0,
	"mean_seq_length": 867.4407142857143,
	"min_seq_length": 852,
	"max_seq_length": 1061,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 25.0,
	"mean_effective_fewshot_size": 25.0
	},
	"oab_exams": {
	"sample_size": 2195,
	"truncated": 0,
	"non_truncated": 2195,
	"padded": 0,
	"non_padded": 2195,
	"fewshots_truncated": 0,
	"mean_seq_length": 832.024145785877,
	"min_seq_length": 659,
	"max_seq_length": 1108,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 3.0,
	"mean_effective_fewshot_size": 3.0
	},
	"portuguese_hate_speech": {
	"sample_size": 851,
	"truncated": 0,
	"non_truncated": 851,
	"padded": 0,
	"non_padded": 851,
	"fewshots_truncated": 0,
	"mean_seq_length": 1219.021151586369,
	"min_seq_length": 1192,
	"max_seq_length": 1255,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 25.0,
	"mean_effective_fewshot_size": 25.0
	},
	"tweetsentbr": {
	"sample_size": 2010,
	"truncated": 0,
	"non_truncated": 2010,
	"padded": 0,
	"non_padded": 2010,
	"fewshots_truncated": 0,
	"mean_seq_length": 1154.4194029850746,
	"min_seq_length": 1137,
	"max_seq_length": 1211,
	"max_ctx_length": 2016,
	"max_gen_toks": 32,
	"mean_original_fewshots_size": 25.0,
	"mean_effective_fewshot_size": 25.0
	}
	},
	"config": {
	"model": "huggingface",
	"model_args": "pretrained=/lustre/mlnvme/data/asen_hpc-mula/checkpoints-llama/slurm_job_17032104/step_400000",
	"batch_size": "auto",
	"batch_sizes": [],
	"device": "cuda:0",
	"use_cache": null,
	"limit": null,
	"bootstrap_iters": 100000,
	"gen_kwargs": null
	},
	"git_hash": null
	}