{ "results": { "arc_challenge": { "acc,none": 0.5930034129692833, "acc_stderr,none": 0.014356399418009123, "acc_norm,none": 0.6143344709897611, "acc_norm_stderr,none": 0.014224250973257177, "alias": "arc_challenge" }, "arc_easy": { "acc,none": 0.8379629629629629, "acc_stderr,none": 0.007561148218715585, "acc_norm,none": 0.8265993265993266, "acc_norm_stderr,none": 0.007768570412816704, "alias": "arc_easy" }, "gsm8k": { "exact_match,get-answer": 0.5890826383623957, "exact_match_stderr,get-answer": 0.013552132901423226, "alias": "gsm8k" }, "hellaswag": { "acc,none": 0.6665006970722963, "acc_stderr,none": 0.00470499629414501, "acc_norm,none": 0.8445528779127663, "acc_norm_stderr,none": 0.003615898928269306, "alias": "hellaswag" }, "piqa": { "acc,none": 0.8128400435255713, "acc_stderr,none": 0.009100273290473547, "acc_norm,none": 0.8264417845484222, "acc_norm_stderr,none": 0.008836375101386922, "alias": "piqa" }, "truthfulqa_mc2": { "acc,none": 0.6089171791978354, "acc_stderr,none": 0.015669761019363578, "alias": "truthfulqa_mc2" }, "winogrande": { "acc,none": 0.7426992896606156, "acc_stderr,none": 0.012285989618865702, "alias": "winogrande" } }, "configs": { "arc_challenge": { "task": "arc_challenge", "group": [ "ai2_arc" ], "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{choices.label.index(answerKey)}}", "doc_to_choice": "{{choices.text}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", "metadata": [ { "version": 1.0 } ] }, "arc_easy": { "task": "arc_easy", "group": [ 
"ai2_arc" ], "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{choices.label.index(answerKey)}}", "doc_to_choice": "{{choices.text}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", "metadata": [ { "version": 1.0 } ] }, "gsm8k": { "task": "gsm8k", "group": [ "math_word_problems" ], "dataset_path": "gsm8k", "dataset_name": "main", "training_split": "train", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Question: {{question}}\nAnswer:", "doc_to_target": "{{answer}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "num_fewshot": 5, "metric_list": [ { "metric": "exact_match", "aggregation": "mean", "higher_is_better": true, "ignore_case": true, "ignore_punctuation": false, "regexes_to_ignore": [ ",", "\\$", "(?s).*#### " ] } ], "output_type": "generate_until", "generation_kwargs": { "until": [ "\n\n", "Question:" ], "do_sample": false, "temperature": 0.0 }, "repeats": 1, "filter_list": [ { "name": "get-answer", "filter": [ { "function": "regex", "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": [ { "version": 1.0 } ] }, "hellaswag": { "task": "hellaswag", "group": [ "multiple_choice" ], "dataset_path": "hellaswag", "training_split": "train", "validation_split": "validation", "process_docs": "", "doc_to_text": "{{query}}", "doc_to_target": "{{label}}", "doc_to_choice": "choices", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { 
"metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": [ { "version": 1.0 } ] }, "piqa": { "task": "piqa", "dataset_path": "piqa", "training_split": "train", "validation_split": "validation", "doc_to_text": "Question: {{goal}}\nAnswer:", "doc_to_target": "label", "doc_to_choice": "{{[sol1, sol2]}}", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "goal", "metadata": [ { "version": 1.0 } ] }, "truthfulqa_mc2": { "task": "truthfulqa_mc2", "group": [ "truthfulqa" ], "dataset_path": "truthful_qa", "dataset_name": "multiple_choice", "validation_split": "validation", "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", "doc_to_target": 0, "doc_to_choice": "{{mc2_targets.choices}}", "process_results": "", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "question", "metadata": [ { "version": 2.0 } ] }, "winogrande": { "task": "winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "training_split": "train", "validation_split": "validation", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": "", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "sentence", "metadata": [ { "version": 1.0 } ] } }, "versions": { "arc_challenge": "Yaml", "arc_easy": "Yaml", "gsm8k": "Yaml", "hellaswag": "Yaml", "piqa": "Yaml", "truthfulqa_mc2": "Yaml", "winogrande": "Yaml" }, "n-shot": { "arc_challenge": 0, "arc_easy": 0, "gsm8k": 5, "hellaswag": 0, "piqa": 0, "truthfulqa_mc2": 0, "winogrande": 0 }, "config": { "model": "hf", "model_args": "pretrained=Undi95/SolarMaid-v0.1.1", "batch_size": "4", "batch_sizes": [], "device": "cuda:0", "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": "fcfc0c60" }