add eval code
- custom_evaluation_tasks.py +650 -0
- custom_evaluation_utils.py +158 -0
- lighteval_eval_config.yaml +45 -0
- run_evals.py +442 -0
- run_train.py +2 -2
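
This commit adds a lighteval-based evaluation setup for the nanotron Mistral example: custom_evaluation_utils.py defines the CustomEvaluationTask and Metrics helpers, custom_evaluation_tasks.py builds the TASKS_TABLE and TASKS_GROUPS that lighteval imports, lighteval_eval_config.yaml overrides the lighteval section of a training config, and run_evals.py runs the evaluation on a nanotron checkpoint. A typical launch, as documented in the run_evals.py docstring, looks like:

export CUDA_DEVICE_MAX_CONNECTIONS=1  # important for some distributed operations
torchrun --nproc_per_node=8 run_evals.py --checkpoint-config-path ./pretrained/Mistral-7B-v0.1/config.yaml \
    --lighteval-override ./lighteval_eval_config.yaml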
custom_evaluation_tasks.py
ADDED
@@ -0,0 +1,650 @@
# ruff: noqa: F405, F403, F401
"""
Custom evaluation tasks for lighteval

This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
"""
import re
from dataclasses import asdict
from typing import Dict, List, Tuple

from custom_evaluation_utils import *
from lighteval.tasks.requests import Doc

# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on

_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
_TASKS: List[CustomEvaluationTask] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
    CustomEvaluationTask(
        name="hellaswag",
        prompt_function="hellaswag_prompt",
        hf_repo="hellaswag",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="winogrande",
        prompt_function="winogrande",
        hf_repo="winogrande",
        hf_subset="winogrande_xl",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="piqa",
        prompt_function="piqa_harness",
        hf_repo="piqa",
        hf_subset="plain_text",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="siqa",
        prompt_function="siqa_prompt",
        hf_repo="lighteval/siqa",
        hf_subset="default",
        hf_avail_splits=["train", "validation"],
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="openbookqa",
        prompt_function="openbookqa",
        hf_repo="openbookqa",
        hf_subset="main",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="arc:easy",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Easy",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="arc:challenge",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Challenge",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    CustomEvaluationTask(
        name="commonsense_qa",
        prompt_function="commonsense_qa_prompt",
        hf_repo="commonsense_qa",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
]


def commonsense_qa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[f" {c}" for c in line["choices"]["text"]],
        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
        instruction="",
    )


def siqa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["context"] + " " + line["question"],
        choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]],
        gold_index=int(line["label"]) - 1,
        instruction="",
    )


def hellaswag_prompt(line, task_name: str = None):
    def preprocess(text):
        """Comes from AiHarness"""
        # text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} "
    return Doc(
        task_name=task_name,
        query=preprocess(line["activity_label"] + ": " + ctx),
        choices=[" " + preprocess(ending) for ending in line["endings"]],
        gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
        # "metric": "choices_loglikelihood",
    )


# 0 shot for common sense
COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS]
_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING)
_TASKS += COMMON_SENSE_REASONING_TASKS

## WORLD_KNOWLEDGE_TASKS ##

WORLD_KNOWLEDGE_TASKS = [
    CustomEvaluationTask(
        name="trivia_qa",
        prompt_function="triviaqa",
        hf_repo="trivia_qa",
        hf_subset="rc.nocontext",
        metric=[Metrics.quasi_exact_match2],
        generation_size=20,
        stop_sequence=["\n", ".", ","],
    ),
    CustomEvaluationTask(
        name="natural_questions",
        prompt_function="natural_questions_prompt",
        hf_repo="lighteval/natural_questions_clean",
        hf_subset="default",
        metric=[Metrics.quasi_exact_match2],
        generation_size=20,
        stop_sequence=["\n", ".", ","],
    ),
]


def natural_questions_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"] + "?\nAnswer: ",
        choices=[line["short_answers"]],
        gold_index=0,
        instruction="",
    )


WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS]
# WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0|1' for t in WORLD_KNOWLEDGE_TASKS}
_TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING)
_TASKS += WORLD_KNOWLEDGE_TASKS

## Reading comprehension ##

READING_COMP_TASKS = [
    CustomEvaluationTask(
        name="super_glue:boolq",
        prompt_function="boolq_prompt",
        hf_repo="super_glue",
        hf_subset="boolq",
        metric=[Metrics.target_perplexity],
    ),
    CustomEvaluationTask(
        name="quac",
        prompt_function="quac",
        hf_repo="lighteval/quac_helm",
        hf_subset="default",
        metric=[Metrics.quasi_exact_match2],
        generation_size=20,
        stop_sequence=["\n", ".", ","],
    ),
]


def boolq_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=f"{line['passage']}\nQuestion: {line['question'].capitalize()}?\nAnswer:",
        choices=[" No", " Yes"],  # Only gold
        gold_index=int(line["label"]),
    )


READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS]
_TASKS_STRINGS.extend(READING_COMP_STRING)
_TASKS += READING_COMP_TASKS


## MATH ##
class CustomMathEvaluationTask(CustomEvaluationTask):
    """Custom class for math tasks with all the defaults set"""

    def __init__(
        self,
        name,
        prompt_function="math",
        hf_repo="lighteval/MATH",
        hf_subset=None,
        metric=[Metrics.math_quasi_exact_match],
        hf_avail_splits=None,
        evaluation_splits=["test"],
        few_shots_split=None,
        few_shots_select=None,
        suite=["custom"],
        generation_size=40,
        stop_sequence=None,
        output_regex=None,
        frozen=False,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            suite=suite,
            generation_size=generation_size,
            stop_sequence=stop_sequence,
            output_regex=output_regex,
            frozen=frozen,
        )


MATH_TASKS = [
    CustomMathEvaluationTask(name="math:algebra", hf_subset="algebra"),
    CustomMathEvaluationTask(name="math:counting_and_probability", hf_subset="counting_and_probability"),
    CustomMathEvaluationTask(name="math:geometry", hf_subset="geometry"),
    CustomMathEvaluationTask(name="math:intermediate_algebra", hf_subset="intermediate_algebra"),
    CustomMathEvaluationTask(name="math:number_theory", hf_subset="number_theory"),
    CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
    CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
]
GSM8K = CustomEvaluationTask(
    name="gsm8k",
    prompt_function="gsm8k",
    hf_repo="gsm8k",
    hf_subset="main",
    hf_avail_splits=["train", "test"],
    evaluation_splits=["test"],
    metric=[Metrics.perfect_exact_match],
    generation_size=10,
    stop_sequence=["\n"],
)


MATH_STRING = [(t, f"custom|{t.name}|4|1") for t in MATH_TASKS]
GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8|1")]
_TASKS_STRINGS.extend(MATH_STRING)
_TASKS_STRINGS.extend(GSM8K_STRING)
_TASKS += MATH_TASKS + [GSM8K]


## MMLU ##
class CustomMMLUEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
        prompt_function="mmlu_prompt",
        hf_repo="lighteval/mmlu",
        hf_subset=None,
        # metric=[Metrics.loglikelihood_acc_single_token],
        metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
        hf_avail_splits=None,
        evaluation_splits=["test"],
        few_shots_split="dev",
        few_shots_select=None,
        suite=None,
        generation_size=-1,
        stop_sequence=None,
        output_regex=None,
        frozen=False,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            suite=suite,
            generation_size=generation_size,
            stop_sequence=stop_sequence,
            output_regex=output_regex,
            frozen=frozen,
        )


MMLU_TASKS = [
    CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"),
    CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"),
    CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"),
    CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"),
    CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"),
    CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"),
    CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"),
    CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"),
    CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"),
    CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"),
    CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"),
    CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"),
    CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"),
    CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"),
    CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"),
    CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"),
    CustomMMLUEvaluationTask(
        name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics"
    ),
    CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"),
    CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"),
    CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"),
    CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"),
    CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"),
    CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"),
    CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"),
    CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"),
    CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"),
    CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"),
    CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"),
    CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"),
    CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"),
    CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"),
    CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"),
    CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"),
    CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"),
    CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"),
    CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"),
    CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"),
    CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"),
    CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"),
    CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"),
    CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"),
    CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"),
    CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"),
]


def mmlu_harness(line, task_name: str = None):
    topic = line["subject"]
    prompt = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n"
    prompt += line["question"] + "\n"
    prompt += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
    prompt += "Answer:"

    gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
    "__few_shots" in line and line["__few_shots"] is True  # We are adding few shots

    return Doc(
        task_name=task_name,
        query=prompt,
        choices=[" A", " B", " C", " D"],
        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
        gold_index=gold_ix,
        instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
    )


def mmlu_prompt(line, task_name: str = None):
    """MMLU prompt without letters"""
    topic = line["subject"]
    prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: "
    prompt += line["question"] + "\nAnswer:"

    return Doc(
        task_name=task_name,
        query=prompt,
        choices=[f" {c}" for c in line["choices"]],
        gold_index=line["answer"],
        instruction=f"The following are questions about {topic.replace('_', ' ')}.\n",
    )


# MMLU_STRING = {t: f'custom|{t.name}|5|1' for t in MMLU_TASKS}
MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS]
_TASKS_STRINGS.extend(MMLU_STRING)
_TASKS += MMLU_TASKS

## BBH ##


class CustomBBHEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
        prompt_function="bbh_prompt",
        hf_repo="lighteval/big_bench_hard",
        hf_subset=None,
        metric=[Metrics.exact_match],
        hf_avail_splits=["train"],
        evaluation_splits=["train"],
        few_shots_split="train",
        few_shots_select=None,
        suite=None,
        generation_size=4,
        stop_sequence=None,
        output_regex=None,
        frozen=False,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            suite=suite,
            generation_size=generation_size,
            stop_sequence=stop_sequence,
            output_regex=output_regex,
            frozen=frozen,
        )


BBH_TASKS = [
    CustomBBHEvaluationTask(name="bbh:boolean_expressions", hf_subset="boolean_expressions"),
    CustomBBHEvaluationTask(name="bbh:causal_judgement", hf_subset="causal_judgement"),
    CustomBBHEvaluationTask(name="bbh:date_understanding", hf_subset="date_understanding"),
    CustomBBHEvaluationTask(name="bbh:disambiguation_qa", hf_subset="disambiguation_qa"),
    CustomBBHEvaluationTask(name="bbh:dyck_languages", hf_subset="dyck_languages"),
    CustomBBHEvaluationTask(name="bbh:formal_fallacies", hf_subset="formal_fallacies"),
    CustomBBHEvaluationTask(name="bbh:geometric_shapes", hf_subset="geometric_shapes"),
    CustomBBHEvaluationTask(name="bbh:hyperbaton", hf_subset="hyperbaton"),
    CustomBBHEvaluationTask(name="bbh:logical_deduction_five_objects", hf_subset="logical_deduction_five_objects"),
    CustomBBHEvaluationTask(name="bbh:logical_deduction_seven_objects", hf_subset="logical_deduction_seven_objects"),
    CustomBBHEvaluationTask(name="bbh:logical_deduction_three_objects", hf_subset="logical_deduction_three_objects"),
    CustomBBHEvaluationTask(name="bbh:movie_recommendation", hf_subset="movie_recommendation"),
    CustomBBHEvaluationTask(name="bbh:multistep_arithmetic_two", hf_subset="multistep_arithmetic_two"),
    CustomBBHEvaluationTask(name="bbh:navigate", hf_subset="navigate"),
    CustomBBHEvaluationTask(name="bbh:object_counting", hf_subset="object_counting"),
    CustomBBHEvaluationTask(name="bbh:penguins_in_a_table", hf_subset="penguins_in_a_table"),
    CustomBBHEvaluationTask(name="bbh:reasoning_about_colored_objects", hf_subset="reasoning_about_colored_objects"),
    CustomBBHEvaluationTask(name="bbh:ruin_names", hf_subset="ruin_names"),
    CustomBBHEvaluationTask(
        name="bbh:salient_translation_error_detection", hf_subset="salient_translation_error_detection"
    ),
    CustomBBHEvaluationTask(name="bbh:snarks", hf_subset="snarks"),
    CustomBBHEvaluationTask(name="bbh:sports_understanding", hf_subset="sports_understanding"),
    CustomBBHEvaluationTask(name="bbh:temporal_sequences", hf_subset="temporal_sequences"),
    CustomBBHEvaluationTask(
        name="bbh:tracking_shuffled_objects_five_objects", hf_subset="tracking_shuffled_objects_five_objects"
    ),
    CustomBBHEvaluationTask(
        name="bbh:tracking_shuffled_objects_seven_objects", hf_subset="tracking_shuffled_objects_seven_objects"
    ),
    CustomBBHEvaluationTask(
        name="bbh:tracking_shuffled_objects_three_objects", hf_subset="tracking_shuffled_objects_three_objects"
    ),
    CustomBBHEvaluationTask(name="bbh:web_of_lies", hf_subset="web_of_lies"),
    CustomBBHEvaluationTask(name="bbh:word_sorting", hf_subset="word_sorting"),
]


def bbh_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["input"] + "\nAnswer: ",
        choices=[line["target"]],
        gold_index=0,
    )


# BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS}
BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS]
_TASKS_STRINGS.extend(BBH_STRING)
_TASKS += BBH_TASKS


## AGI eval ##
class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
    def __init__(
        self,
        name,
        prompt_function="agi_eval_prompt_no_letters",
        hf_repo="lighteval/agi_eval_en",
        hf_subset=None,
        # metric=[Metrics.loglikelihood_acc_single_token],
        metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
        hf_avail_splits=["train", "validation"],
        evaluation_splits=["train"],
        few_shots_split="validation",
        few_shots_select=None,
        suite=None,
        generation_size=-1,
        stop_sequence=None,
        output_regex=None,
        frozen=False,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            suite=suite,
            generation_size=generation_size,
            stop_sequence=stop_sequence,
            output_regex=output_regex,
            frozen=frozen,
        )


AGIEVAL_TASKS = [
    CustomAGIEvalEvaluationTask(name="agi_eval:aqua_rat", hf_subset="aqua_rat"),
    CustomAGIEvalEvaluationTask(name="agi_eval:logiqa-en", hf_subset="logiqa-en"),
    CustomAGIEvalEvaluationTask(name="agi_eval:lsat-ar", hf_subset="lsat-ar"),
    CustomAGIEvalEvaluationTask(name="agi_eval:lsat-lr", hf_subset="lsat-lr"),
    CustomAGIEvalEvaluationTask(name="agi_eval:lsat-rc", hf_subset="lsat-rc"),
    CustomAGIEvalEvaluationTask(
        name="agi_eval:math",
        hf_subset="math",
        prompt_function="agi_eval_math_prompt",
        metric=[Metrics.exact_match, Metrics.quasi_exact_match2],
        generation_size=40,
    ),
    CustomAGIEvalEvaluationTask(name="agi_eval:sat-en", hf_subset="sat-en"),
    CustomAGIEvalEvaluationTask(name="agi_eval:sat-math", hf_subset="sat-math"),
]


def agi_eval_math_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[line["answer"]],
        gold_index=0,
        instruction="",
    )


def agi_eval_prompt(line, task_name: str = None):
    cleaned_options = [o.replace("(", "").replace(")", " ") for o in line["options"]]
    prompt = "The following are multiple choice questions (with answers).\n\n"
    prompt += line["question"] + "\n" + "\n".join(cleaned_options) + "\n"
    prompt += "Answer: "

    choices = LETTER_INDICES[: len(line["options"])]

    output = Doc(
        query=prompt,
        instruction="The following are multiple choice questions (with answers).\n\n",
    )

    if line["label"]:
        output.choices = choices
        output.gold_index = LETTER_INDICES.index(line["label"].strip())
    else:
        output.choices = [line["answer"]]
        output.gold_index = 0

    return output


def agi_eval_prompt_no_letters(line, task_name: str = None):
    cleaned_options = [
        " " + o.replace("(A)", "").replace("(B)", "").replace("(C)", "").replace("(D)", "").replace("(E)", "")
        for o in line["options"]
    ]

    output = Doc(
        query=line["question"],
        choices=cleaned_options,
        gold_index=LETTER_INDICES.index(line["label"].strip()),
        instruction="",
    )

    return output


# AGIEVAL_STRING = {t: f'custom|{t.name}|5|1' for t in AGIEVAL_TASKS}
AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS]
_TASKS_STRINGS.extend(AGIEVAL_STRING)
_TASKS += AGIEVAL_TASKS


## HUMAN EVAL ##
# human_eval = CustomEvaluationTask(
#         name="human_eval",
#         prompt_function="human_eval",
#         hf_repo="lighteval/human_eval",
#         metric=["human_eval_pass_at_1"],
#     ),


def has_generative_metrics(task: CustomEvaluationTask) -> bool:
    for metric in task.metric:
        if metric in NEEDS_GENERATION_ONLY:
            return True
    return False


EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING])

# Convert to dict for lighteval
TASKS_TABLE = [asdict(task) for task in _TASKS]
# You can have a few pre-organised groups of tasks
TASKS_GROUPS = {
    "all": ",".join(t[1] for t in _TASKS_STRINGS),
    "early-signal": EARLY_SIGNAL_TASKS,
    "non-generatives": ",".join(t for k, t in _TASKS_STRINGS if not has_generative_metrics(k)),
    "generatives": ",".join(t for k, t in _TASKS_STRINGS if has_generative_metrics(k)),
}

if __name__ == "__main__":
    print([t["name"] for t in TASKS_TABLE])
    print(len(TASKS_TABLE))
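
As a quick reference (not part of this commit), the exported objects can be inspected with a minimal sketch like the following, assuming both new files are importable from the working directory:

# hypothetical quick check, mirroring the __main__ block above
import custom_evaluation_tasks as cet

print(len(cet.TASKS_TABLE))                    # number of task configs handed to lighteval
print(cet.TASKS_GROUPS["early-signal"][:120])  # comma-separated "custom|<task>|<fewshot>|1" strings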
custom_evaluation_utils.py
ADDED
@@ -0,0 +1,158 @@
"""
Custom evaluation tasks for lighteval
"""
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional, Tuple, Union


class Metrics(Enum):
    any_target_loglikelihood_acc = auto()
    bert_score = auto()
    bias = auto()
    bits_per_byte = auto()
    bleu = auto()
    bleu_1 = auto()
    bleu_4 = auto()
    byte_perplexity = auto()
    chrf = auto()
    code_eval_APPS = auto()
    code_eval_HE = auto()
    copyright = auto()
    disinformation = auto()
    exact_match = auto()
    exact_set_match = auto()
    extractiveness = auto()
    f1_from_bags = auto()
    f1_quasi = auto()
    f1_sequence = auto()
    f1_set_match = auto()
    faithfulness = auto()
    iou_set_match = auto()
    log_prob = auto()
    loglikelihood_acc = auto()
    loglikelihood_acc_norm = auto()
    loglikelihood_acc_norm_nospace = auto()
    loglikelihood_acc_norm_single_token = auto()
    loglikelihood_acc_single_token = auto()
    loglikelihood_f1 = auto()
    loglikelihood_f1_single_token = auto()
    math_quasi_exact_match = auto()
    mc_taco = auto()
    mcc = auto()
    mcc_single_token = auto()
    mrr = auto()
    mrr_single_token = auto()
    multi_fi_numeric = auto()
    one_choice_loglikelihood_acc = auto()
    perfect_exact_match = auto()
    prediction_perplexity = auto()
    prefix_exact_match = auto()
    prefix_quasi_exact_match = auto()
    quasi_exact_match = auto()
    ranking = auto()
    recall_at_1_single_token = auto()
    recall_at_2_single_token = auto()
    recall_at_1 = auto()
    recall_at_2 = auto()
    rouge = auto()
    rouge_1 = auto()
    rouge_2 = auto()
    rouge_l = auto()
    target_perplexity = auto()
    ter = auto()
    toxicity = auto()
    truthfulqa_mc_metrics = auto()
    word_perplexity = auto()

    def __str__(self):
        return self.name.replace("_at_", "@")


NEEDS_GENERATION_ONLY = [
    "perfect_exact_match",
    "exact_match",
    "quasi_exact_match",
    "quasi_exact_match2",
    "prefix_exact_match",
    "prefix_quasi_exact_match",
    "math_quasi_exact_match",
    "iou_set_match",
    "exact_set_match",
    "f1_sequence",
    "f1_quasi",
    "f1_set_match",
    "f1_from_bags",
    "chrf",
    "ter",
    "rouge",
    "rouge_1",
    "rouge_2",
    "rouge_l",
    "faithfulness",
    "extractiveness",
    "bert_score",
    "bleu",
    "bleu_1",
    "bleu_4",
    "bias",
    "toxicity",
    "code_eval_HE",
    "code_eval_APPS",
    "copyright",
]


@dataclass(unsafe_hash=True)
class CustomEvaluationTask:
    name: str
    prompt_function: str
    hf_repo: str
    hf_subset: str
    metric: Tuple[Union[str, Metrics]]
    hf_avail_splits: Optional[Tuple[str]] = None
    evaluation_splits: Optional[Tuple[str]] = None
    few_shots_split: Optional[str] = None
    few_shots_select: Optional[str] = None
    generation_size: int = -1
    stop_sequence: Optional[Tuple[str]] = None
    output_regex: Optional[str] = None

    frozen: bool = False
    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task

    def __post_init__(self):
        self.metric = [str(m) for m in self.metric]
        if self.suite is None:
            self.suite = ["custom"]
        if self.hf_avail_splits is None:
            self.hf_avail_splits = ["train", "validation", "test"]
        if self.evaluation_splits is None:
            self.evaluation_splits = ["validation"]
        if self.stop_sequence is None:
            self.stop_sequence = ["\n"]

        # Convert list to tuple for hashing
        self.metric = tuple(self.metric)
        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
        self.suite = tuple(self.suite) if self.suite else None
        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None


@dataclass(unsafe_hash=True)
class BigCodeEvaluationTask:
    name: str
    bigcode_task: str
    bigcode_task_kwargs: Optional[dict] = None
    n_samples: int = 1
    prefix: Optional[str] = None

    suite: Tuple[str] = None

    def __post_init__(self):
        if self.suite is None:
            self.suite = ("bigcode",)

        # Convert list to tuple for hashing
        self.suite = tuple(self.suite)
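
As a small illustration (not part of this commit) of how CustomEvaluationTask normalizes its fields in __post_init__ (metrics become strings, lists become tuples, defaults are filled in), using a made-up dataset id:

from dataclasses import asdict
from custom_evaluation_utils import CustomEvaluationTask, Metrics

task = CustomEvaluationTask(
    name="demo",
    prompt_function="demo_prompt",
    hf_repo="org/demo-dataset",  # hypothetical dataset id, for illustration only
    hf_subset="default",
    metric=[Metrics.loglikelihood_acc],
)
print(asdict(task)["metric"])  # ('loglikelihood_acc',)
print(asdict(task)["suite"])   # ('custom',)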
lighteval_eval_config.yaml
ADDED
@@ -0,0 +1,45 @@
checkpoints: null
data: null
experiment_logger: null
general: null
kill_switch_path: null
lighteval:
  batch_size: 24
  checkpoints_path: null
  generation: null
  logging:
    hub_repo_details: null
    hub_repo_results: null
    hub_repo_tensorboard: HuggingFaceBR4/thomwolf-nanotron-mistral-7b
    local_output_path: /scratch/thomwolf/lighteval/nanotron-mistral-7b
    push_details_to_hub: false
    push_results_to_hub: false
    push_results_to_tensorboard: true
    tensorboard_metric_prefix: e
  parallelism:
    dp: 4
    pp: 1
    pp_engine: 1f1b
    recompute_granularity: null
    tp: 2
    tp_linear_async_communication: false
    tp_mode: ALL_REDUCE
  slurm: null
  slurm_script_dir: null
  slurm_template: null
  tasks:
    custom_tasks_file: ./custom_evaluation_tasks.py
    dataset_loading_processes: 8
    max_samples: 1000
    multichoice_continuations_start_space: null
    no_multichoice_continuations_start_space: null
    num_fewshot_seeds: null
    tasks: early-signal
logging: null
model: null
optimizer: null
parallelism: null
profiler: null
s3_upload: null
tokenizer: null
tokens: null
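
Note that "tasks: early-signal" refers to the TASKS_GROUPS key defined in custom_evaluation_tasks.py (the common-sense reasoning suite plus MMLU, all zero-shot), and "max_samples: 1000" caps the number of evaluated examples per task.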
run_evals.py
ADDED
@@ -0,0 +1,442 @@
"""
Nanotron Inference Script

Usage:
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=8 run_evals.py --checkpoint-config-path ./pretrained/Mistral-7B-v0.1/config.yaml \
    --lighteval-override ./lighteval_eval_config.yaml
```
"""
# flake8: noqa: C901
import argparse
import os
import random
import time
from dataclasses import asdict
from pathlib import Path

import numpy as np
import torch
from huggingface_hub import HFSummaryWriter
from lighteval.evaluator import evaluate, make_results_table
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.logging.hierarchical_logger import hlog, htrack, htrack_block
from lighteval.logging.info_loggers import (
    DetailsLogger,
)
from lighteval.models.model_loader import ModelInfo
from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks
from lighteval.tasks.registry import Registry, get_custom_tasks, taskinfo_selector
from nanotron import distributed as dist
from nanotron import logging
from nanotron.config import get_config_from_file
from nanotron.logging import get_logger, log_rank
from nanotron.parallel.context import ParallelContext
from nanotron.utils import local_ranks_zero_first

from brrr.config import BrrrConfig
from brrr.experiment_loggers import flatten_dict, obj_to_markdown
from brrr.s3_checkpoints import fs_copy
from brrr.utils import check_env

from lighteval.models.brrr_models import BRRRModel

from modeling_mistral import MistralForTraining
from config_mistral import MistralConfig

logger = get_logger(__name__)

TOKEN = os.getenv("HF_TOKEN")
CACHE_DIR = os.getenv("HF_HOME", "/scratch")


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint-config-path",
        type=str,
        required=True,
        help="Path to the brrr checkpoint YAML or python config file, potentially on S3",
    )
    parser.add_argument(
        "--lighteval-override",
        type=str,
        help="Path to an optional YAML or python Lighteval config to override part of the checkpoint Lighteval config",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        help="Local or hub path of an optional tokenizer (if not indicated in the checkpoint)",
    )
    parser.add_argument(
        "--s5cmd-path",
        type=str,
        default="/admin/home/thomwolf/miniconda3/envs/b4r/bin/s5cmd",
        help="Path to s5cmd install",
    )
    parser.add_argument(
        "--s5cmd-numworkers",
        type=int,
        default=64,
        help="s5cmd num workers (optional)",
    )
    parser.add_argument(
        "--s5cmd-concurrency",
        type=int,
        default=10,
        help="s5cmd concurrency (optional)",
    )
    parser.add_argument(
        "--cache-dir",
        type=str,
        default="",
        help="Cache directory",
    )

    return parser


def push_results_to_wandb(  # noqa: C901
    config: BrrrConfig, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
):
    # config: BrrrConfig = get_config_from_dict(config, config_class=BrrrConfig)
    lighteval_config = config.lighteval
    try:
        global_step = config.general.step
    except ValueError:
        global_step = 0
    if config.lighteval.logging.tensorboard_metric_prefix is not None:
        prefix = config.lighteval.logging.tensorboard_metric_prefix
    else:
        prefix = "eval"
    output_dir_tb = Path(lighteval_config.logging.local_output_path) / "tb" / (config.general.run + "_" + prefix)
    output_dir_tb.mkdir(parents=True, exist_ok=True)

    os.environ["WANDB_DISABLE_SERVICE"] = "True"
    import wandb

    wandb.tensorboard.patch(root_logdir=config.lighteval.logging.local_output_path)
    hlog("Starting wandb with WANDB_DISABLE_SERVICE=True")
    wandb.init(
        project=config.lighteval.wandb.wandb_project,
        entity=config.lighteval.wandb.wandb_entity,
        name=config.lighteval.wandb.wandb_run_name,
        config=config.as_dict(),
        # sync_tensorboard=True,
        resume=True,
    )
    wb_dict = {}
    bench_averages = {}
    for name, values in results.items():
        splited_name = name.split("|")
        if len(splited_name) == 3:
            _, task_name, _ = splited_name
        else:
            task_name = name
        bench_suite = None
        if ":" in task_name:
            bench_suite = task_name.split(":")[0]  # e.g. MMLU
            hlog(f"bench_suite {bench_suite} in {task_name}")
            for metric, value in values.items():
                if "stderr" in metric:
                    continue
                if bench_suite not in bench_averages:
                    bench_averages[bench_suite] = {}
                bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)]
        hlog(f"Pushing {task_name} {values} to tensorboard")
        for metric, value in values.items():
            if "stderr" in metric:
                wb_dict[f"stderr_{metric}/{task_name}"] = value
            elif bench_suite is not None:
                wb_dict[f"{bench_suite}-{metric}/{task_name}"] = value
            else:
                wb_dict[f"{metric}/{task_name}"] = value
    # e.g. MMLU
    for name, values in bench_averages.items():
        for metric, values in values.items():
            hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
            wb_dict[f"{metric}/{name}"] = sum(values) / len(values)

    for task_name, task_details in details.items():
        if len(task_details) <= 1:
            continue
        columns = list(flatten_dict(asdict(task_details[0])).keys())
        table = wandb.Table(columns=columns)
        table.add_data(*[str(v) for v in flatten_dict(asdict(task_details[0])).values()])
        table.add_data(*[str(v) for v in flatten_dict(asdict(task_details[1])).values()])
        wandb.log({f"eval_details_{task_name}": table}, step=global_step, commit=False)

    wandb.log(dict(wb_dict.items()), step=global_step, commit=True)

    # tb_context.add_text("eval_sizes", obj_to_markdown(sizes), global_step=global_step)

    # We are doing parallel evaluations of multiple checkpoints and recording the steps not in order
    # This messes up with tensorboard, so the easiest is to rename files in the order of the checkpoints
    # See: https://github.com/tensorflow/tensorboard/issues/5958
    # But tensorboardX don't let us control the prefix of the files (only the suffix), so we need to do it ourselves before commiting the files

    hlog(f"Pushed to wandb" f" at {output_dir_tb} and global_step {global_step}")


def push_results_to_tensorboard(  # noqa: C901
    config: BrrrConfig, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
):
    # config: BrrrConfig = get_config_from_dict(config, config_class=BrrrConfig)
    lighteval_config = config.lighteval
    try:
        global_step = config.general.step
    except ValueError:
        global_step = 0
    if config.lighteval.logging.tensorboard_metric_prefix is not None:
        prefix = config.lighteval.logging.tensorboard_metric_prefix
    else:
        prefix = "eval"
    output_dir_tb = Path(lighteval_config.logging.local_output_path) / "tb" / (config.general.run + "_" + prefix)
    output_dir_tb.mkdir(parents=True, exist_ok=True)
    tb_context = HFSummaryWriter(
        logdir=str(output_dir_tb),
        repo_id=lighteval_config.logging.hub_repo_tensorboard,
        repo_private=True,
        path_in_repo="tb",
        commit_every=6000,  # Very long time so that we can change our files names and trigger push ourselves (see below)
    )
    bench_averages = {}
    for name, values in results.items():
        splited_name = name.split("|")
        if len(splited_name) == 3:
            _, task_name, _ = splited_name
        else:
            task_name = name
        bench_suite = None
        if ":" in task_name:
            bench_suite = task_name.split(":")[0]  # e.g. MMLU
            hlog(f"bench_suite {bench_suite} in {task_name}")
            for metric, value in values.items():
                if "stderr" in metric:
                    continue
                if bench_suite not in bench_averages:
                    bench_averages[bench_suite] = {}
                bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)]
        hlog(f"Pushing {task_name} {values} to tensorboard")
        for metric, value in values.items():
            if "stderr" in metric:
                tb_context.add_scalar(f"stderr_{prefix}/{task_name}/{metric}", value, global_step=global_step)
            elif bench_suite is not None:
                tb_context.add_scalar(f"{prefix}_{bench_suite}/{task_name}/{metric}", value, global_step=global_step)
            else:
                tb_context.add_scalar(f"{prefix}/{task_name}/{metric}", value, global_step=global_step)
    # e.g. MMLU
    for name, values in bench_averages.items():
        for metric, values in values.items():
            hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard")
            tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step)

    tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step)
    # tb_context.add_text("eval_sizes", obj_to_markdown(sizes), global_step=global_step)

    for task_name, task_details in details.items():
        tb_context.add_text(
            f"eval_details_{task_name}",
            obj_to_markdown({"0": task_details[0], "1": task_details[1] if len(task_details) > 1 else {}}),
            global_step=global_step,
        )

    # We are doing parallel evaluations of multiple checkpoints and recording the steps not in order
    # This messes up with tensorboard, so the easiest is to rename files in the order of the checkpoints
    # See: https://github.com/tensorflow/tensorboard/issues/5958
    # But tensorboardX don't let us control the prefix of the files (only the suffix), so we need to do it ourselves before commiting the files

    tb_context.close()  # flushes the unfinished write operations
    time.sleep(5)
    files = os.listdir(output_dir_tb)
    for file in files:
        os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))

    # Now we can push to the hub
    tb_context.scheduler.trigger()
    hlog(
        f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
        f" at {output_dir_tb} and global_step {global_step}"
    )


@htrack()
def main(args):
    cache_dir = args.cache_dir or CACHE_DIR
    check_env()

    dist.initialize_torch_distributed()

    with htrack_block("get config"):
        if not args.checkpoint_config_path.endswith(".yaml"):
            raise ValueError("The checkpoint path should point to a YAML file")
        local_config_path = args.checkpoint_config_path
        if args.checkpoint_config_path.startswith("s3:/"):
            local_config_path = args.checkpoint_config_path.replace("s3:/", cache_dir)
            with local_ranks_zero_first():
                if os.environ.get("LOCAL_RANK", None) == "0":
                    os.makedirs(os.path.dirname(local_config_path), exist_ok=True)
                    fs_copy(args.checkpoint_config_path, local_config_path)

        brrr_config: BrrrConfig = get_config_from_file(local_config_path, config_class=BrrrConfig, model_config_class=MistralConfig)

        if args.lighteval_override:
            local_override_path = args.lighteval_override.replace("s3:/", cache_dir)
            if args.lighteval_override.startswith("s3:/"):
                local_override_path = args.lighteval_override.replace("s3:/", cache_dir)
                with local_ranks_zero_first():
                    if os.environ.get("LOCAL_RANK", None) == "0":
                        os.makedirs(os.path.dirname(local_override_path), exist_ok=True)
                        fs_copy(args.lighteval_override, local_override_path)
            lighteval_brrr_config: BrrrConfig = get_config_from_file(local_override_path, config_class=BrrrConfig)
            lighteval_config = lighteval_brrr_config.lighteval
            brrr_config.lighteval = lighteval_config
        else:
            local_override_path = ""
            lighteval_config = brrr_config.lighteval

    parallel_context = ParallelContext(
        tensor_parallel_size=lighteval_config.parallelism.tp,
        pipeline_parallel_size=lighteval_config.parallelism.pp,
        data_parallel_size=lighteval_config.parallelism.dp,
    )

    evaluation_tracker = EvaluationTracker(token=TOKEN)
    evaluation_tracker.general_config_logger.log_args_info(
        num_fewshot_seeds=1,
        override_batch_size=None,
        max_samples=lighteval_config.tasks.max_samples,
        job_id=os.environ.get("SLURM_JOB_ID", None),
        config=brrr_config.as_dict(),
    )

    with htrack_block("Test all gather"):
        hlog("Test gather tensor")
        # Do a first NCCL sync to warmup and try to avoid Timeout after model/data loading
        log_rank(
            f"[TEST] Running NCCL sync for ranks {list(range(parallel_context.world_pg.size()))}",
            logger=logger,
            level=logging.WARNING,
            group=parallel_context.dp_pg,
            rank=0,
        )
        test_tensor = torch.tensor([dist.get_rank(parallel_context.world_pg)], device=torch.device("cuda"))
        test_tensor_list = [torch.zeros_like(test_tensor) for _ in range(parallel_context.world_pg.size())]
        dist.all_gather(test_tensor_list, test_tensor, group=parallel_context.world_pg, async_op=False)
        dist.barrier()
        log_rank(
            f"[TEST] NCCL sync for ranks {[t.item() for t in test_tensor_list]}",
            logger=logger,
            level=logging.WARNING,
            group=parallel_context.dp_pg,
            rank=0,
        )

        del test_tensor_list
        del test_tensor

    with htrack_block("Model loading"):
        # We need to load the model in the main process first to avoid downloading the model multiple times
        model = BRRRModel(
            checkpoint_path=args.checkpoint_config_path.replace("config.yaml", ""),
            model_args=brrr_config.model,
            tokenizer=brrr_config.tokenizer,
            parallel_context=parallel_context,
            parallel_config=lighteval_config.parallelism,
            lighteval_config=lighteval_config,
            batch_size=lighteval_config.batch_size,
            cache_dir=os.environ.get("HF_HOME", "/scratch"),
            debug_one_layer_model=False,
            s5cmd_path=args.s5cmd_path,
            s5cmd_numworkers=args.s5cmd_numworkers,
            s5cmd_concurrency=args.s5cmd_concurrency,
            model_class=MistralForTraining,
        )
        model_info = ModelInfo(model_name=f"{brrr_config.general.run}/{brrr_config.general.step}")
        evaluation_tracker.general_config_logger.log_model_info(model_info)

    with htrack_block("Tasks loading"):
        with local_ranks_zero_first():
            tasks_selection = lighteval_config.tasks.tasks
            if lighteval_config.tasks.custom_tasks_file:
                _, tasks_groups_dict = get_custom_tasks(lighteval_config.tasks.custom_tasks_file)
                if tasks_groups_dict and lighteval_config.tasks.tasks in tasks_groups_dict:
                    tasks_selection = tasks_groups_dict[lighteval_config.tasks.tasks]

            task_names_list, few_shots_dict = taskinfo_selector(tasks_selection)
            task_dict = Registry(cache_dir=cache_dir).get_task_dict(
                task_names_list, custom_tasks_file=lighteval_config.tasks.custom_tasks_file
            )
            # Loading all the dataset in a distributed manner
            LightevalTask.load_datasets(task_dict.values(), lighteval_config.tasks.dataset_loading_processes)

            evaluation_tracker.task_config_logger.log(task_dict)

            hlog("Loading documents, and requests")
            requests, docs = create_requests_from_tasks(
                task_dict=task_dict,
                fewshot_dict=few_shots_dict,
                num_fewshot_seeds=lighteval_config.tasks.num_fewshot_seeds or 1,
                lm=model,
                max_samples=lighteval_config.tasks.max_samples,
                evaluation_tracker=evaluation_tracker,
                use_chat_template=False,
            )

    with htrack_block("Setting seeds and waiting for all processes"):
        hlog(f"setting seed to {1234} for random and numpy")
        random.seed(1234)
        np.random.seed(1234)
        dist.barrier()

    with htrack_block("Evaluation"):
        hlog(f"Evaluate on {len(task_names_list)} tasks.")
        evaluation_tracker = evaluate(
            lm=model,
            requests_dict=requests,
            docs=docs,
            task_dict=task_dict,
            override_bs=lighteval_config.batch_size,
            evaluation_tracker=evaluation_tracker,
        )

    if dist.get_rank(parallel_context.world_pg) == 0:
        with htrack_block("Compiling and saving results"):
            evaluation_tracker.general_config_logger.log_end_time()
            evaluation_tracker.metrics_logger.aggregate(task_dict=task_dict, bootstrap_iters=1000)
            evaluation_tracker.details_logger.aggregate()

            if lighteval_config.logging.local_output_path:
                evaluation_tracker.save(
                    output_dir=lighteval_config.logging.local_output_path,
                    push_results_to_hub=lighteval_config.logging.push_results_to_hub,
                    push_details_to_hub=lighteval_config.logging.push_details_to_hub,
                    public=False,
                    push_results_to_tensorboard=lighteval_config.logging.push_results_to_tensorboard,
                )

            if lighteval_config.logging.push_results_to_tensorboard:
                push_results_to_tensorboard(
                    config=brrr_config,
                    results=evaluation_tracker.metrics_logger.metric_aggregated,
                    details=evaluation_tracker.details_logger.details,
                )
            if lighteval_config.wandb is not None:
                push_results_to_wandb(
                    config=brrr_config,
                    results=evaluation_tracker.metrics_logger.metric_aggregated,
                    details=evaluation_tracker.details_logger.details,
                )

        final_dict = evaluation_tracker.generate_final_dict()

        hlog(make_results_table(final_dict))

        return final_dict


if __name__ == "__main__":
    parser = get_parser()
    args, unknowns = parser.parse_known_args()
    main(args)
run_train.py
CHANGED
@@ -8,11 +8,11 @@ torchrun --nproc_per_node=8 run_train.py --config-file config_tiny_mistral.yaml
 ```
 """
 import argparse
+from nanotron.trainer import DistributedTrainer
 
-from config_tiny_mistral import MistralConfig
 from dataloader import get_dataloader
 from modeling_mistral import MistralForTraining
-from
+from config_tiny_mistral import MistralConfig
 
 
 def get_args():