diff --git a/comment.txt b/comment.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fef8a2a8c8f96c20969774fd9fd2804c2ec1c84 --- /dev/null +++ b/comment.txt @@ -0,0 +1,7 @@ +Job ID: 2498282 + +Git commit: 10e3e0a update alpaca eval gen + +Git branch: * main + +Comment: llama_moe_four_mix_freeze_gate_100 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d09b73212fa0756af8af70819e28e4363bf2e1da --- /dev/null +++ b/config.json @@ -0,0 +1,373 @@ +{ + "_name_or_path": "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new", + "add_weight_norm": false, + "architectures": [ + "LlamaMoEForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_llama_moe.LlamaMoEConfig", + "AutoModel": "modeling_llama_moe_hf.LlamaMoEModel", + "AutoModelForCausalLM": "modeling_llama_moe_hf.LlamaMoEForCausalLM" + }, + "bos_token_id": 1, + "calculator_type": "UniversalCalculator", + "capacity_factor": 1.25, + "drop_tokens": true, + "dropped_padding": "zero", + "eos_token_id": 2, + "gate_add_noise": true, + "gate_balance_loss_weight": 0.01, + "gate_network": "mlp", + "gate_noise_epsilon": 0.01, + "gate_type": "TopKBalancedNoisyGate", + "gate_use_balance": true, + "gate_use_softmax": true, + "gates": "mlp", + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 4096, + "model_type": "llama_moe", + "multiply_gate_scores": true, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_selects": 2, + "pad_token_id": 0, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "score_scale_factor": 4.0, + "size_experts": [ + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 
1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ], + [ + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376, + 1376 + ] + ], + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.36.2", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/configuration_llama_moe.py b/configuration_llama_moe.py new file mode 100644 index 0000000000000000000000000000000000000000..333a778e26d30eb6e79dbc118eefe0d83889afd6 --- /dev/null +++ b/configuration_llama_moe.py @@ -0,0 +1,130 @@ +from transformers.configuration_utils import PretrainedConfig + + +class LlamaMoEConfig(PretrainedConfig): + model_type = "llama_moe" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + # -------- moe expert configs -------- + num_experts=16, + num_selects=4, + size_experts=None, + # -------- moe gate configs -------- + gate_type="TopKBalancedNoisyGate", + gate_network="mlp", + gate_use_softmax=True, + gate_use_balance=True, + gate_balance_loss_weight=1e-2, + gate_add_noise=True, + # TopKBalancedNoisyGate + gate_noise_epsilon=1e-2, + # -------- moe calculator configs -------- + calculator_type="UniversalCalculator", + multiply_gate_scores=True, + score_scale_factor=1.0, + add_weight_norm=False, + # SwitchDropTokenCalculator + drop_tokens=True, + dropped_padding="zero", + capacity_factor=1.25, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + self.num_experts = num_experts + self.num_selects = num_selects + self.size_experts = size_experts + + self.gate_type = gate_type + self.gate_network = gate_network + self.gate_use_softmax = gate_use_softmax + self.gate_use_balance = gate_use_balance + self.gate_balance_loss_weight = gate_balance_loss_weight + self.gate_add_noise = gate_add_noise + self.gate_noise_epsilon = gate_noise_epsilon + + self.calculator_type = calculator_type + self.multiply_gate_scores = multiply_gate_scores + self.score_scale_factor = score_scale_factor + self.add_weight_norm = add_weight_norm + self.drop_tokens = drop_tokens + self.dropped_padding 
= dropped_padding + self.capacity_factor = capacity_factor + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if ( + rope_scaling_factor is None + or not isinstance(rope_scaling_factor, float) + or rope_scaling_factor <= 1.0 + ): + raise ValueError( + f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}" + ) diff --git a/diff.patch b/diff.patch new file mode 100644 index 0000000000000000000000000000000000000000..6b948889322c57f8580d4b80b5c99d8a3d906a11 --- /dev/null +++ b/diff.patch @@ -0,0 +1,863 @@ +diff --git a/.gitignore b/.gitignore +index c243024..8c28ce3 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -175,6 +175,7 @@ debug.py + wandb/ + nohup.out + lm-evaluation-harness/ ++bigcode-evaluation-harness/ + results/**/*.json + results/**/*.jsonl + results/**/*.db +diff --git a/README.md b/README.md +index 8813a32..b276a78 100644 +--- a/README.md ++++ b/README.md +@@ -26,6 +26,11 @@ bash scripts/data.sh + git clone https://github.com/EleutherAI/lm-evaluation-harness.git + cd lm-evaluation-harness + pip install -e . ++# commit: 9cfa52b ++git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git ++cd bigcode-evaluation-harness ++# change `pyext==0.5` in `bigcode-evaluation-harness/requirements.txt`, ref: https://github.com/bigcode-project/bigcode-evaluation-harness/pull/181 ++pip install -e .
+ ``` + + ## 📃 TODO +diff --git a/scripts/eval.sh b/scripts/eval.sh +deleted file mode 100644 +index 4f41b37..0000000 +--- a/scripts/eval.sh ++++ /dev/null +@@ -1,96 +0,0 @@ +-# nohup srun -p MoE --gres gpu:1 bash scripts/eval.sh all /mnt/petrelfs/share_data/quxiaoye/models/Sheared-LLaMA-2.7B True results/Sheared-LLaMA-2.7B 1>logs/eval-all-Sheared-LLaMA-2.7B.log 2>&1 & +- +-mmlu() { +- # MMLU: https://github.com/princeton-nlp/LLM-Shearing/blob/20ebd2645a8ff5fa65874e1347f9891b80e01805/icl_eval/run_eval.sh#L18 +- MODEL=$1 +- TRUST_REMOTE_CODE=$2 +- RESULT_DIR=$3 +- mkdir -p $RESULT_DIR +- +- lm_eval \ +- --model hf \ +- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ +- --tasks mmlu_computer_security,mmlu_high_school_chemistry,mmlu_philosophy,mmlu_elementary_mathematics,mmlu_prehistory,mmlu_formal_logic,mmlu_high_school_mathematics,mmlu_econometrics,mmlu_moral_scenarios,mmlu_college_mathematics,mmlu_high_school_government_and_politics,mmlu_us_foreign_policy,mmlu_high_school_world_history,mmlu_conceptual_physics,mmlu_college_medicine,mmlu_international_law,mmlu_abstract_algebra,mmlu_logical_fallacies,mmlu_machine_learning,mmlu_medical_genetics,mmlu_public_relations,mmlu_college_biology,mmlu_marketing,mmlu_electrical_engineering,mmlu_anatomy,mmlu_high_school_us_history,mmlu_high_school_biology,mmlu_miscellaneous,mmlu_high_school_psychology,mmlu_sociology,mmlu_business_ethics,mmlu_high_school_geography,mmlu_human_aging,mmlu_high_school_statistics,mmlu_moral_disputes,mmlu_professional_psychology,mmlu_global_facts,mmlu_college_physics,mmlu_nutrition,mmlu_high_school_macroeconomics,mmlu_world_religions,mmlu_professional_medicine,mmlu_high_school_computer_science,mmlu_college_chemistry,mmlu_human_sexuality,mmlu_high_school_microeconomics,mmlu_astronomy,mmlu_professional_accounting,mmlu_high_school_european_history,mmlu_jurisprudence,mmlu_professional_law,mmlu_high_school_physics,mmlu_virology,mmlu_management,mmlu_college_computer_science,mmlu_clinical_knowledge,mmlu_security_studies \ +- --num_fewshot 5 \ +- --device cuda:0 \ +- --batch_size auto \ +- --verbosity DEBUG \ +- --output_path $RESULT_DIR/mmlu.json +-} +- +-bbh() { +- # Big Bench Hard (BBH): https://arxiv.org/pdf/2210.09261.pdf +- MODEL=$1 +- TRUST_REMOTE_CODE=$2 +- RESULT_DIR=$3 +- mkdir -p $RESULT_DIR +- +- lm_eval \ +- --log_samples \ +- --model hf \ +- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ +- --tasks bbh_fewshot_boolean_expressions,bbh_fewshot_causal_judgement,bbh_fewshot_date_understanding,bbh_fewshot_disambiguation_qa,bbh_fewshot_dyck_languages,bbh_fewshot_formal_fallacies,bbh_fewshot_geometric_shapes,bbh_fewshot_hyperbaton,bbh_fewshot_logical_deduction_five_objects,bbh_fewshot_logical_deduction_seven_objects,bbh_fewshot_logical_deduction_three_objects,bbh_fewshot_movie_recommendation,bbh_fewshot_multistep_arithmetic_two,bbh_fewshot_navigate,bbh_fewshot_object_counting,bbh_fewshot_penguins_in_a_table,bbh_fewshot_reasoning_about_colored_objects,bbh_fewshot_ruin_names,bbh_fewshot_salient_translation_error_detection,bbh_fewshot_snarks,bbh_fewshot_sports_understanding,bbh_fewshot_temporal_sequences,bbh_fewshot_tracking_shuffled_objects_five_objects,bbh_fewshot_tracking_shuffled_objects_seven_objects,bbh_fewshot_tracking_shuffled_objects_three_objects,bbh_fewshot_web_of_lies,bbh_fewshot_word_sorting \ +- --device cuda:0 \ +- --batch_size auto \ +- --verbosity DEBUG \ +- --output_path $RESULT_DIR/bbh.json +-} +- +-reasoning() { +- MODEL=$1 +- TRUST_REMOTE_CODE=$2 +- RESULT_DIR=$3 +- 
mkdir -p $RESULT_DIR +- +- lm_eval \ +- --log_samples \ +- --model hf \ +- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ +- --tasks gsm8k_cot \ +- --device cuda:0 \ +- --batch_size auto \ +- --verbosity DEBUG \ +- --output_path $RESULT_DIR/reasoning.json +-} +- +-qa() { +- MODEL=$1 +- TRUST_REMOTE_CODE=$2 +- RESULT_DIR=$3 +- mkdir -p $RESULT_DIR +- +- lm_eval \ +- --log_samples \ +- --model hf \ +- --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ +- --tasks arc_easy,arc_challenge,boolq \ +- --num_fewshot 0 \ +- --device cuda:0 \ +- --batch_size auto \ +- --verbosity DEBUG \ +- --output_path $RESULT_DIR/qa.json +-} +- +-EVAL_TASK=$1 +-shift 1 +-start=$(date +%s) +-case $EVAL_TASK in +- mmlu) +- mmlu $* ;; +- bbh) +- bbh $* ;; +- reasoning) +- reasoning $* ;; +- qa) +- qa $* ;; +- all) +- mmlu $* +- bbh $* +- reasoning $* +- qa $* +- ;; +- *) +- echo "$EVAL_TASK not recognized!";; +-esac +-end=$(date +%s) +-echo "Elapsed Time: $(($end-$start)) seconds" +diff --git a/scripts/four_mix/freeze_gate.sh b/scripts/four_mix/freeze_gate.sh +index d94d78c..70afb8e 100644 +--- a/scripts/four_mix/freeze_gate.sh ++++ b/scripts/four_mix/freeze_gate.sh +@@ -83,8 +83,11 @@ num_gpus=4 + + python -m src.eval.gen_mt_ans \ + --model-path $output_dir \ +- --model-id $task_name \ +- --num-gpus-total $num_gpus ++ --model-id $task_name ++ ++ python -m src.eval.gen_alpaca_eval_ans \ ++ --model-path $output_dir \ ++ --model-id $task_name + } + + # nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 --gres=gpu:4 bash "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/scripts/one_data_steps_dynamic.sh" "llama_moe_orca_epochs_cluster_4" "auto" "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" "data/open_orca_clustered/4" "data/open_orca_clustered_eval/4" 1>logs/llama_moe_orca_cluster_4_dynamic.log 2>&1 & +diff --git a/scripts/gen_mt_bench_ans.sh b/scripts/gen_mt_bench_ans.sh +deleted file mode 100644 +index f251644..0000000 +--- a/scripts/gen_mt_bench_ans.sh ++++ /dev/null +@@ -1,32 +0,0 @@ +-#!/usr/bin/bash +- +-#SBATCH --job-name=moe_gen +-#SBATCH --output=logs/%x-%j.log +-#SBATCH --error=logs/%x-%j.log +- +-#SBATCH --partition=MoE +-#SBATCH --ntasks-per-node=1 +-#SBATCH --cpus-per-task=16 +-#SBATCH --mem=64G +- +-#SBATCH --nodes=1 +-#SBATCH --gres=gpu:1 +-#SBATCH --quotatype=auto +- +-{ +- # python -m fastchat.llm_judge.gen_model_answer \ +- # --model-path outputs/sheared_llama_sharegpt/moe_sft-2411306 \ +- # --model-id sheared_llama_sharegpt +- +- # python -m fastchat.llm_judge.gen_model_answer \ +- # --model-path outputs/sheared_llama_uniform_mix/moe_sft-2421072 \ +- # --model-id sheared_llama_uniform_mix +- +- bash scripts/cp_model_files.sh outputs/llama_moe/moe_sft-2409782 +- python -m fastchat.llm_judge.gen_model_answer \ +- --model-path outputs/llama_moe/moe_sft-2409782 \ +- --model-id llama_moe_uniform_mix +-} +- +-# nohup srun -p MoE -n1 -N1 --gres=gpu:1 --quotatype spot python -m fastchat.llm_judge.gen_model_answer --model-path outputs/sheared_llama_sharegpt/moe_sft-2411306 --model-id sheared_llama_sharegpt 1>logs/mt_bench_gen_sheared_llama_sharegpt.log 2>&1 & +-# nohup srun -p MoE -n1 -N1 --gres=gpu:1 --quotatype spot python -m fastchat.llm_judge.gen_model_answer --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/llama_moe_sharegpt/moe_sft-2411309 --model-id llama_moe_sharegpt 1>logs/mt_bench_gen_llama_moe_sharegpt.log 2>&1 & +diff --git a/scripts/multi.sh b/scripts/multi.sh +index bcd83b8..e399761 100644 +--- 
a/scripts/multi.sh ++++ b/scripts/multi.sh +@@ -100,5 +100,8 @@ nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 -- + nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_mt_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_mt_ans-llama_moe_four_mix_uniform.log 2>&1 & + nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_mt_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_mt_ans-sheared_four_mix_uniform.log 2>&1 & + +-nohup srun -p MoE --gres gpu:1 python -m src.eval.get_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_alpaca_eval-llama_moe_four_mix_uniform.log 2>&1 & +-nohup srun -p MoE --gres gpu:1 python -m src.eval.get_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_alpaca_eval-sheared_four_mix_uniform.log 2>&1 & ++nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_alpaca_eval-llama_moe_four_mix_uniform.log 2>&1 & ++nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_alpaca_eval-sheared_four_mix_uniform.log 2>&1 & ++ ++nohup srun -p MoE --gres gpu:1 bash scripts/eval/eval.sh reasoning /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_wo_gate_noise/moe_sft-2492650 True results/llama_moe_four_mix_wo_pad_wo_gate_noise 1>logs/eval-reasoning-llama_moe_four_mix_wo_pad_wo_gate_noise.log 2>&1 & ++nohup srun -p MoE --gres gpu:1 bash scripts/eval/eval.sh reasoning /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad/moe_sft-2491633 True results/llama_moe_four_mix_wo_pad 1>logs/eval-reasoning-llama_moe_four_mix_wo_pad.log 2>&1 & +diff --git a/src/callbacks.py b/src/callbacks.py +index a750f69..e9d0c04 100644 +--- a/src/callbacks.py ++++ b/src/callbacks.py +@@ -6,6 +6,7 @@ import torch + import numpy as np + from loguru import logger + from transformers.trainer_callback import TrainerCallback, TrainerState, TrainerControl ++from transformers.utils import is_flash_attn_2_available + + from src.utils.config import TrainingArguments + from src.utils.io import append_jsonlines +@@ -22,6 +23,7 @@ class AdaptiveSamplingCallback(TrainerCallback): + criterion: Optional[Literal["min", "max", "mean"]] = "mean", + sim_type: Optional[Literal["cos", "l2"]] = "cos", + ): ++ assert is_flash_attn_2_available(), "Make sure you have flash-attn installed" + self.criterion = criterion + self.sim_type = sim_type + self.prob_map = {} +@@ -74,8 +76,8 @@ class AdaptiveSamplingCallback(TrainerCallback): + cls, + ori_weights: np.ndarray, + delta: np.ndarray, +- eta: float = 1.0, +- c: float = 1e-4, ++ eta: float = 10.0, ++ c: float = 5e-2, + ) -> np.ndarray: + def _softmax(vec: np.ndarray) -> np.ndarray: + exps = np.exp(vec - np.max(vec)) +diff 
--git a/src/core/train.py b/src/core/train.py +index 2be5558..9b1f694 100644 +--- a/src/core/train.py ++++ b/src/core/train.py +@@ -7,13 +7,12 @@ from loguru import logger + from src.utils.config import ModelArguments, DataArguments, TrainingArguments + from src.data import ( + SubDirWeightedPackedJsonlDataset, +- get_uniform_sampling_ratio, + fault_tolerance_data_collator, + CachedJsonlDataset, + get_cached_datasets_from_dir, + ) + from src.utils.io import trainer_save_model_safe +-from src.models import LlamaMoEForCausalLM, LlamaMoEConfig ++from src.models import LlamaMoEForCausalLM, LlamaMoEConfig, DeepseekConfig, DeepseekForCausalLM + from src.trainer import GateLoadRecordingTrainer + from src.callbacks import AdaptiveSamplingCallback + +@@ -36,6 +35,9 @@ def get_model_and_tokenizer( + elif model_type == "llama_moe": + ConfigClass = LlamaMoEConfig + ModelClass = LlamaMoEForCausalLM ++ elif model_type == "deepseek": ++ ConfigClass = DeepseekConfig ++ ModelClass = DeepseekForCausalLM + else: + raise ValueError(f"Unknown model type: {model_type}") + +@@ -54,6 +56,21 @@ def get_model_and_tokenizer( + config.update(additional_config) + logger.info("Config ready") + ++ tokenizer = transformers.AutoTokenizer.from_pretrained( ++ model_name_or_path, ++ cache_dir=cache_dir, ++ model_max_length=model_max_length, ++ padding_side=padding_side, ++ use_fast=False, ++ trust_remote_code=trust_remote_code, ++ ) ++ if tokenizer.pad_token is None: ++ if tokenizer.unk_token is not None: ++ tokenizer.pad_token = tokenizer.unk_token ++ else: ++ tokenizer.pad_token = tokenizer.eos_token ++ logger.info(f"tokenizer ready, pad_token: {tokenizer.pad_token}") ++ + # Load model and tokenizer + model = ModelClass.from_pretrained( + model_name_or_path, +@@ -65,18 +82,6 @@ def get_model_and_tokenizer( + ) + logger.info("model ready") + +- tokenizer = transformers.AutoTokenizer.from_pretrained( +- model_name_or_path, +- cache_dir=cache_dir, +- model_max_length=model_max_length, +- padding_side=padding_side, +- use_fast=False, +- trust_remote_code=trust_remote_code, +- ) +- if tokenizer.pad_token != tokenizer.unk_token: +- tokenizer.pad_token = tokenizer.unk_token +- logger.info("tokenizer ready") +- + return model, tokenizer + + +@@ -117,7 +122,9 @@ def train(): + train_dataset = SubDirWeightedPackedJsonlDataset( + data_args.dataset_dir_or_path, + tokenizer, +- prob_map=get_uniform_sampling_ratio(data_args.dataset_dir_or_path), ++ # prob_map=get_uniform_sampling_ratio(data_args.dataset_dir_or_path), ++ # prob_map={"code": 0.25119094959816823, "math": 0.2674581878910902, "orca": 0.243050776175138, "sharegpt": 0.23830008633560357}, ++ prob_map=data_args.prob_map, + seed=training_args.seed, + ) + elif datapath.is_file(): +diff --git a/src/data.py b/src/data.py +index d783a21..a1a8ff7 100644 +--- a/src/data.py ++++ b/src/data.py +@@ -20,6 +20,7 @@ def preprocess( + instances, + tokenizer: transformers.PreTrainedTokenizer, + ) -> Dict: ++ tokenizer_legacy = getattr(tokenizer, "legacy", None) + conv = Conversation() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + +@@ -72,7 +73,7 @@ def preprocess( + # "-2" is hardcoded for the Llama tokenizer to make the offset correct. 
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + +- if i != 0 and not tokenizer.legacy: ++ if i != 0 and not tokenizer_legacy: + # The legacy and non-legacy modes handle special tokens differently + instruction_len -= 1 + +@@ -80,7 +81,7 @@ def preprocess( + target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID + cur_len += turn_len + +- if i != 0 and not tokenizer.legacy: ++ if i != 0 and not tokenizer_legacy: + # The legacy and non-legacy modes handle special tokens differently + cur_len -= 1 + +diff --git a/src/eval/get_alpaca_eval_ans.py b/src/eval/get_alpaca_eval_ans.py +deleted file mode 100644 +index 1ff3e5e..0000000 +--- a/src/eval/get_alpaca_eval_ans.py ++++ /dev/null +@@ -1,113 +0,0 @@ +-import argparse +-from pathlib import Path +- +-import torch +-import datasets +-from tqdm import tqdm +- +-from src.core.train import get_model_and_tokenizer +-from src.utils.conversation import Conversation +-from src.utils.io import dump_json +- +- +-@torch.inference_mode() +-def run_eval(model_path, model_id, max_new_tokens): +- model, tokenizer = get_model_and_tokenizer( +- "auto", +- model_path, +- torch_dtype=torch.bfloat16, +- trust_remote_code=True, +- ) +- model.cuda() +- model.eval() +- +- conv = Conversation() +- outputs = [] +- eval_set = datasets.load_dataset("tatsu-lab/alpaca_eval", "alpaca_eval")["eval"] +- for example in tqdm(eval_set, desc="Eval"): +- conv.append_message(conv.roles[0], example["instruction"]) +- conv.append_message(conv.roles[1], None) +- prompt = conv.get_prompt() +- input_ids = tokenizer([prompt], return_tensors="pt").input_ids +- conv.clear_msg() +- # generate here is a placeholder for your models generations +- output_ids = model.generate( +- input_ids.cuda(), +- do_sample=False, +- temperature=0.0, +- max_new_tokens=max_new_tokens, +- ) +- if model.config.is_encoder_decoder: +- output_ids = output_ids[0] +- else: +- output_ids = output_ids[0][len(input_ids[0]) :] # noqa: E203 +- # be consistent with the template's stop_token_ids +- if conv.stop_token_ids: +- stop_token_ids_index = [ +- i +- for i, id in enumerate(output_ids) +- if id in conv.stop_token_ids +- ] +- if len(stop_token_ids_index) > 0: +- output_ids = output_ids[: stop_token_ids_index[0]] +- +- output = tokenizer.decode( +- output_ids, +- spaces_between_special_tokens=False, +- ) +- if conv.stop_str and isinstance(conv.stop_str, list): +- stop_str_indices = sorted( +- [ +- output.find(stop_str) +- for stop_str in conv.stop_str +- if output.find(stop_str) > 0 +- ] +- ) +- if len(stop_str_indices) > 0: +- output = output[: stop_str_indices[0]] +- elif conv.stop_str and output.find(conv.stop_str) > 0: +- output = output[: output.find(conv.stop_str)] +- +- for special_token in tokenizer.special_tokens_map.values(): +- if isinstance(special_token, list): +- for special_tok in special_token: +- output = output.replace(special_tok, "") +- else: +- output = output.replace(special_token, "") +- output = output.strip() +- +- if conv.name == "xgen" and output.startswith("Assistant:"): +- output = output.replace("Assistant:", "", 1).strip() +- +- example["output"] = output +- outputs.append(example) +- +- outpath = Path("results/alpaca_eval") / f"{model_id}.json" +- dump_json(outputs, outpath, indent=2) +- +- +-if __name__ == "__main__": +- parser = argparse.ArgumentParser() +- parser.add_argument( +- "--model-path", +- type=str, +- required=True, +- help="The path to the weights. 
This can be a local folder or a Hugging Face repo ID.", +- ) +- parser.add_argument( +- "--model-id", type=str, required=True, help="A custom name for the model." +- ) +- parser.add_argument( +- "--max-new-token", +- type=int, +- default=1024, +- help="The maximum number of new generated tokens.", +- ) +- +- args = parser.parse_args() +- +- run_eval( +- model_path=args.model_path, +- model_id=args.model_id, +- max_new_tokens=args.max_new_token, +- ) +diff --git a/src/eval/show.py b/src/eval/show.py +index d500054..ea0c210 100644 +--- a/src/eval/show.py ++++ b/src/eval/show.py +@@ -55,13 +55,13 @@ def collect_results(result_dir: str, verbose: bool = True) -> dict: + avg = sum(vals) / len(vals) + tot_vals.append(avg) + if verbose: +- logger.info(f"task: {name}, num: {len(tasks.split(','))}, avg: {avg:.3%}") ++ logger.info(f"task: {name}, num: {len(tasks.split(','))}, avg: {100 * avg:.3f} %") + + if len(tot_vals) == 0: + tot_avg = 0.0 + else: + tot_avg = sum(tot_vals) / len(tot_vals) +- logger.info(f"total avg: {tot_avg:.3%}") ++ logger.info(f"total avg: {100 * tot_avg:.3f} %") + + + if __name__ == "__main__": +diff --git a/src/models/deepseek/modeling_deepseek.py b/src/models/deepseek/modeling_deepseek.py +index 1dae56e..20498b2 100644 +--- a/src/models/deepseek/modeling_deepseek.py ++++ b/src/models/deepseek/modeling_deepseek.py +@@ -20,6 +20,7 @@ + """ PyTorch DeepSeek model.""" + import math + import warnings ++from dataclasses import dataclass + from typing import List, Optional, Tuple, Union + + import torch +@@ -297,7 +298,7 @@ class DeepseekMLP(nn.Module): + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + +- def forward(self, x): ++ def forward(self, x, **kwargs): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) +@@ -328,7 +329,9 @@ class DeepseekMLP(nn.Module): + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + +- return down_proj ++ bsz, seq_len, _ = x.shape ++ load = torch.zeros(bsz * seq_len, self.config.n_routed_experts) ++ return down_proj, load + + + class MoEGate(nn.Module): +@@ -356,7 +359,10 @@ class MoEGate(nn.Module): + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + + def forward(self, hidden_states): +- bsz, seq_len, h = hidden_states.shape ++ if len(hidden_states.shape) == 2: ++ bsz, h = hidden_states.shape ++ else: ++ bsz, seq_len, h = hidden_states.shape + ### compute gating score + hidden_states = hidden_states.view(-1, h) + logits = F.linear(hidden_states, self.weight, None) +@@ -404,7 +410,10 @@ class MoEGate(nn.Module): + aux_loss = (Pi * fi).sum() * self.alpha + else: + aux_loss = None +- return topk_idx, topk_weight, aux_loss ++ _zeros = torch.zeros_like(logits) ++ _scores_filtered = _zeros.scatter(dim=1, index=topk_idx, src=topk_weight) ++ load = (_scores_filtered > 0).sum(0) ++ return topk_idx, topk_weight, aux_loss, load + + + class AddAuxiliaryLoss(torch.autograd.Function): +@@ -450,10 +459,19 @@ class DeepseekMoE(nn.Module): + config=config, intermediate_size=intermediate_size + ) + +- def forward(self, hidden_states): ++ def forward(self, hidden_states, attention_mask=None): ++ bsz, seq_len, hsz = hidden_states.shape ++ hidden_states = hidden_states.reshape(-1, hsz) ++ flattened_mask = None ++ flattened_shape = None ++ if attention_mask is not None and len(attention_mask.shape) == 2: ++ flattened_mask = 
attention_mask.flatten() ++ flattened_shape = flattened_mask.shape ++ hidden_states = hidden_states[flattened_mask.bool()] ++ + identity = hidden_states + orig_shape = hidden_states.shape +- topk_idx, topk_weight, aux_loss = self.gate(hidden_states) ++ topk_idx, topk_weight, aux_loss, load = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + flat_topk_idx = topk_idx.view(-1) + if self.training: +@@ -472,7 +490,15 @@ class DeepseekMoE(nn.Module): + ).view(*orig_shape) + if self.config.n_shared_experts is not None: + y = y + self.shared_experts(identity) +- return y ++ ++ if flattened_mask is not None: ++ _y = torch.zeros(flattened_shape + (hsz,), dtype=y.dtype, device=y.device) ++ _y[flattened_mask.bool()] = y ++ y = _y ++ ++ y = y.reshape(bsz, seq_len, hsz) ++ ++ return y, load + + @torch.no_grad() + def moe_infer(self, x, flat_expert_indices, flat_expert_weights): +@@ -1163,7 +1189,7 @@ class DeepseekDecoderLayer(nn.Module): + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) +- hidden_states = self.mlp(hidden_states) ++ hidden_states, load = self.mlp(hidden_states, attention_mask=attention_mask) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) +@@ -1174,6 +1200,8 @@ class DeepseekDecoderLayer(nn.Module): + if use_cache: + outputs += (present_key_value,) + ++ outputs += (load,) ++ + return outputs + + +@@ -1220,6 +1248,11 @@ class DeepseekPreTrainedModel(PreTrainedModel): + module.weight.data[module.padding_idx].zero_() + + ++@dataclass ++class BaseMoEModelOutputWithPast(BaseModelOutputWithPast): ++ gate_load: Optional[torch.Tensor] = None ++ ++ + Deepseek_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): +@@ -1429,6 +1462,7 @@ class DeepseekModel(DeepseekPreTrainedModel): + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None ++ gate_load = () + next_decoder_cache = None + + for decoder_layer in self.layers: +@@ -1463,6 +1497,8 @@ class DeepseekModel(DeepseekPreTrainedModel): + if output_attentions: + all_self_attns += (layer_outputs[1],) + ++ gate_load += (layer_outputs[-1],) ++ + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer +@@ -1482,14 +1518,20 @@ class DeepseekModel(DeepseekPreTrainedModel): + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) +- return BaseModelOutputWithPast( ++ return BaseMoEModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, ++ gate_load=gate_load, + ) + + ++@dataclass ++class MoECausalLMOutputWithPast(CausalLMOutputWithPast): ++ gate_load: Optional[torch.Tensor] = None ++ ++ + class DeepseekForCausalLM(DeepseekPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + +@@ -1620,12 +1662,13 @@ class DeepseekForCausalLM(DeepseekPreTrainedModel): + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + +- return CausalLMOutputWithPast( ++ return MoECausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, ++ gate_load=outputs.gate_load, + ) + + def prepare_inputs_for_generation( +diff --git a/src/utils/config.py b/src/utils/config.py +index 3ea5283..d4060d9 100644 +--- 
a/src/utils/config.py ++++ b/src/utils/config.py +@@ -6,6 +6,7 @@ import torch + import transformers + + from src.utils.io import load_json ++from src.data import get_uniform_sampling_ratio + + + @dataclass +@@ -33,7 +34,9 @@ class ModelArguments: + ) + attn_impl: str = field( + default="flash_attention_2", +- metadata={"help": "attention implementation, choice from [eager, flash_attention_2, sdpa] (default: `flash_attention_2`)"} ++ metadata={ ++ "help": "attention implementation, choice from [eager, flash_attention_2, sdpa] (default: `flash_attention_2`)" ++ }, + ) + + def __post_init__(self): +@@ -56,6 +59,18 @@ class DataArguments: + default="data/merged", + metadata={"help": "Path to dataset directory or a single jsonl file"}, + ) ++ prob_map: str = field( ++ default=None, ++ metadata={"help": "Path to the probability map file"}, ++ ) ++ ++ def __post_init__(self): ++ if self.prob_map is not None: ++ if not pathlib.Path(self.prob_map).exists(): ++ raise ValueError(f"Probability map file {self.prob_map} not found") ++ self.prob_map = load_json(self.prob_map) ++ else: ++ self.prob_map = get_uniform_sampling_ratio(self.dataset_dir_or_path) + + + @dataclass +@@ -70,9 +85,7 @@ class TrainingArguments(transformers.TrainingArguments): + ) + max_eval_steps_per_type: int = field( + default=10, +- metadata={ +- "help": "Maximum number of steps to perform during evaluation." +- }, ++ metadata={"help": "Maximum number of steps to perform during evaluation."}, + ) + dynamic_sampling_sim_type: Literal["cos", "l2"] = field( + default="l2", +@@ -88,7 +101,5 @@ class TrainingArguments(transformers.TrainingArguments): + ) + freeze_gate: bool = field( + default=False, +- metadata={ +- "help": "Whether to freeze the gate during training." +- }, ++ metadata={"help": "Whether to freeze the gate during training."}, + ) +diff --git a/src/utils/visualization.py b/src/utils/visualization.py +index 794f6c8..02bd236 100644 +--- a/src/utils/visualization.py ++++ b/src/utils/visualization.py +@@ -180,6 +180,86 @@ def gate_load_stats(model_dir, data_dir, result_dir, update_strategy: str = "cos + ) + + ++def sampling_info_stats(filepath: str, data_type: str, output_dir: str): ++ from pathlib import Path ++ import numpy as np ++ from src.utils.io import load_jsonlines ++ ++ Path(output_dir).mkdir(exist_ok=True, parents=True) ++ ++ data = load_jsonlines(filepath) ++ step2data = {ins["step"]: ins for ins in data} ++ ++ data_types = sorted(data[0]["old_prob_map"].keys()) ++ data_type_idx = data_types.index(data_type) ++ ++ probs = [] ++ loads = [] ++ sims = [] ++ steps = sorted(step2data.keys()) ++ for step in steps: ++ ins = step2data[step] ++ probs.append(ins["old_prob_map"][data_type]) ++ loads.append(ins["name2load"][data_type]) ++ sims.append(ins["sim"][data_type_idx]) ++ ++ # probs ++ fig = plt.figure() ++ ax = fig.add_subplot(111) ++ ax.plot(steps, probs) ++ ax.set_title(f"Sampling Probability of {data_type}") ++ ax.set_xlabel("step") ++ fig.savefig(f"{output_dir}/prob-{data_type}.png") ++ ++ # loads ++ def cv_square(data): ++ return np.var(data, axis=1) / (np.mean(data, axis=1)**2 + 1e-10) ++ ++ fig = plt.figure() ++ ax = fig.add_subplot(111) ++ ax.plot(steps, cv_square(loads)) ++ ax.set_title(f"cv(load)^2 of {data_type}") ++ ax.set_xlabel("step") ++ fig.savefig(f"{output_dir}/load_cv-{data_type}.png") ++ ++ # sims ++ fig = plt.figure() ++ ax = fig.add_subplot(111) ++ ax.plot(steps, np.mean(sims, axis=1)) ++ ax.set_title(f"Mean Similarities with {data_type}") ++ ax.set_xlabel("step") ++ 
fig.savefig(f"{output_dir}/sim-{data_type}.png") ++ ++ ++def test_sampling_convergence(): ++ from collections import defaultdict ++ from src.callbacks import AdaptiveSamplingCallback ++ ++ # freeze gate ++ name2load = {"code": [0.1359794776119403, 0.1333115671641791, 0.12858208955223882, 0.10330223880597016, 0.12544776119402984, 0.12625932835820897, 0.12761194029850748, 0.11950559701492537], "orca": [0.1509941502743006, 0.11721425756978752, 0.1232988815809414, 0.12714439426545024, 0.11256554420634679, 0.14008274482465977, 0.11819552632376563, 0.11050450095474797], "math": [0.15956486572028086, 0.10727138452881943, 0.11506675888262392, 0.10958069091633744, 0.11805010139847842, 0.11915200393871546, 0.13648938539627462, 0.13482480921846976], "sharegpt": [0.15337086599959998, 0.11428233411553493, 0.12873151621889287, 0.1177436980734424, 0.11538123789498336, 0.13793986642403783, 0.12419686111124664, 0.10835362016226212]} # fmt: skip ++ # # dynamic ++ # name2load = {"code": [0.14031716417910448, 0.1310634328358209, 0.12651119402985075, 0.10993470149253731, 0.12196828358208955, 0.12552238805970148, 0.12791977611940297, 0.11676305970149255], "orca": [0.15106234655836084, 0.11803640166095838, 0.12349968175067437, 0.12884551268450883, 0.11344072985178673, 0.1383778377231534, 0.11733170672566907, 0.1094057830448883], "math": [0.16001617686708006, 0.10756444371505268, 0.11391210568886491, 0.114803005615014, 0.11676650216277679, 0.1177863481308685, 0.13630182751708533, 0.13284959030325763], "sharegpt": [0.15440024978412215, 0.113654214863131, 0.12914741653941664, 0.12104040941178769, 0.11470799162832905, 0.13593110446537907, 0.12316259873058931, 0.10795601457724527]} # fmt: skip ++ names = sorted(name2load.keys()) ++ callback = AdaptiveSamplingCallback() ++ callback.prob_map = {"code": 0.25, "math": 0.25, "orca": 0.25, "sharegpt": 0.25} ++ name2probs = defaultdict(list) ++ for _ in range(100): ++ for name in names: ++ name2probs[name].append(callback.prob_map[name]) ++ new_name2prob, _ = callback._update_prob_map(name2load) ++ callback.prob_map = new_name2prob ++ print(f"final prob_map: {callback.prob_map}") ++ ++ fig = plt.figure() ++ ax = fig.add_subplot(111) ++ for name in names: ++ ax.plot(name2probs[name], label=name) ++ ax.legend() ++ ax.set_title("Sampling Probability") ++ ax.set_xlabel("step") ++ fig.savefig("results/sampling_convergence.png") ++ ++ + if __name__ == "__main__": + # gate_load_stats( + # "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new", +@@ -195,12 +275,12 @@ if __name__ == "__main__": + # "results/gate_load_vis_llama_moe_2_8_orca_4clusters", + # ) + +- gate_load_stats( +- "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new", +- "data/four_types_mix/dev", +- "results/debug", +- update_strategy="l2", +- ) ++ # gate_load_stats( ++ # "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new", ++ # "data/four_types_mix/dev", ++ # "results/debug", ++ # update_strategy="l2", ++ # ) + + # gate_load_stats( + # "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new", +@@ -227,3 +307,29 @@ if __name__ == "__main__": + # "results/gate_load_vis_llama_moe_2_8_four_types_mix_l2", + # update_strategy="l2" + # ) ++ ++ # sampling_info_stats( ++ # "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_freeze_gate/moe_sft-2491632/sampling_info/data.jsonl", ++ # "code", ++ # "results/sampling_info/llama_moe_four_mix_wo_pad_freeze_gate/code", ++ # ) ++ ++ # sampling_info_stats( ++ # 
"/mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad/moe_sft-2491633/sampling_info/data.jsonl", ++ # "code", ++ # "results/sampling_info/llama_moe_four_mix_wo_pad/code", ++ # ) ++ ++ # sampling_info_stats( ++ # "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_freeze_gate_wo_gate_noise/moe_sft-2493315/sampling_info/data.jsonl", ++ # "code", ++ # "results/sampling_info/llama_moe_four_mix_wo_pad_freeze_gate_wo_gate_noise/code", ++ # ) ++ ++ # sampling_info_stats( ++ # "outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_wo_gate_noise/moe_sft-2492650/sampling_info/data.jsonl", ++ # "code", ++ # "results/sampling_info/llama_moe_four_mix_wo_pad_wo_gate_noise/code", ++ # ) ++ ++ test_sampling_convergence() diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bf84ec1a28ba89feb07162d95b06633a40b4975f --- /dev/null +++ b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.36.2" +} diff --git a/model-00001-of-00003.safetensors b/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29beccaf47b70d020c6d6d9b7799376ec56bc504 --- /dev/null +++ b/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8919f505f53749e1c46511cd975e9da2c91fbcd8105ad30bc26ea2bb5fec3f38 +size 4996976432 diff --git a/model-00002-of-00003.safetensors b/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8867e0f43b4565f683c43b336330afdcff22e872 --- /dev/null +++ b/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61cb496bd075daa995cf0341587193b2e7a4d5805b4aa561bff4013b1861afff +size 4982823704 diff --git a/model-00003-of-00003.safetensors b/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0241a0fbfe0f8ce4c5b8b6fba7427f4ab0813a8b --- /dev/null +++ b/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21fffb1ada83903f9906325e0244222f88a5a97fdc3ab778e424f940e2d07974 +size 3501371152 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..5651c549a94cadeef10e973b9b2e37a5f20575b9 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,1098 @@ +{ + "metadata": { + "total_size": 13481033728 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.6": 
"model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + 
"model.layers.1.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + 
"model.layers.10.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + 
"model.layers.11.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.11.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.12.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + 
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.13.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.13.self_attn.v_proj.weight": 
"model-00002-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.14.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.3": 
"model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.15.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.0": 
"model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.16.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.5": 
"model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.17.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.2": 
"model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.18.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.calculator.experts.weight_up.7": 
"model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.19.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + 
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.20.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.0": 
"model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.5": 
"model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.2": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.0": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.1": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.2": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.3": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.4": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.5": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.6": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_down.7": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.0": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.1": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.2": 
"model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.3": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.4": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.5": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.6": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_gate.7": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.0": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.1": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.2": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.3": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.4": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.5": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.6": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.calculator.experts.weight_up.7": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate.gate_network.0.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate.gate_network.2.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate.weight_noise.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.rotary_emb.inv_freq": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_gate.7": 
"model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors", + "model.layers.24.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.4": 
"model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors", + "model.layers.25.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate.gate_network.2.weight": 
"model-00003-of-00003.safetensors", + "model.layers.26.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors", + "model.layers.27.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + 
"model.layers.27.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors", + "model.layers.28.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + 
"model.layers.29.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors", + "model.layers.29.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + 
"model.layers.3.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + 
"model.layers.30.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors", + "model.layers.30.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.0": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.1": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.2": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.3": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.4": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.5": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.6": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_down.7": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.0": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.1": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.2": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.3": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.4": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.5": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.6": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_gate.7": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_up.0": "model-00003-of-00003.safetensors", + 
"model.layers.31.mlp.calculator.experts.weight_up.1": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_up.2": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_up.3": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_up.4": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_up.5": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_up.6": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.calculator.experts.weight_up.7": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate.gate_network.0.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate.gate_network.2.weight": "model-00003-of-00003.safetensors", + "model.layers.31.mlp.gate.weight_noise.weight": "model-00003-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.rotary_emb.inv_freq": "model-00003-of-00003.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + 
"model.layers.4.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": 
"model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + 
"model.layers.8.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.0": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.1": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.2": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.3": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.4": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.5": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.6": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_down.7": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_gate.0": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_gate.1": "model-00001-of-00003.safetensors", + 
"model.layers.9.mlp.calculator.experts.weight_gate.2": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_gate.3": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_gate.4": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_gate.5": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_gate.6": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_gate.7": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.0": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.1": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.2": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.3": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.4": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.5": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.6": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.calculator.experts.weight_up.7": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate.gate_network.0.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate.gate_network.2.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate.weight_noise.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.rotary_emb.inv_freq": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.norm.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/modeling_llama_moe_hf.py b/modeling_llama_moe_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..9769c70f0b6897ecd8ebecd6dd913dd57aa334c6 --- /dev/null +++ b/modeling_llama_moe_hf.py @@ -0,0 +1,1690 @@ +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.utils.checkpoint +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions.normal import Normal +from transformers.modeling_outputs import ( + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.activations import ACT2FN +from transformers.utils import ModelOutput, logging +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10 + +from .configuration_llama_moe import LlamaMoEConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = 
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "LlamaMoEConfig"
+
+
+@dataclass
+class CalculatorOutput(ModelOutput):
+    hidden_states: Optional[torch.FloatTensor] = None
+    num_dropped_tokens: Optional[int] = None
+
+
+@dataclass
+class BaseMoEModelOutputWithPast(ModelOutput):
+    """
+    Args:
+        num_dropped_tokens: layer idx to the number of dropped tokens
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    balance_loss: Optional[float] = None
+    num_dropped_tokens: Optional[Tuple[torch.Tensor]] = None
+    gate_load: Optional[Tuple[list]] = None
+    gate_importance: Optional[Tuple[list]] = None
+
+
+@dataclass
+class MoECausalLMOutputWithPast(CausalLMOutputWithPast):
+    balance_loss: Optional[float] = None
+    num_dropped_tokens: Optional[Tuple[int]] = None
+    gate_load: Optional[Tuple[list[torch.Tensor]]] = None
+    gate_importance: Optional[Tuple[list[torch.Tensor]]] = None
+
+
+@dataclass
+class MoEMlpOutput(ModelOutput):
+    hidden_states: Optional[torch.FloatTensor] = None
+    balance_loss: Optional[torch.FloatTensor] = None
+    num_dropped_tokens: Optional[int] = None
+    gate_load: Optional[list] = None
+    gate_importance: Optional[list] = None
+
+
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
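+
+
+# NOTE (editor): the sketch below is illustrative and not part of the original checkpoint
+# code; `_demo_causal_mask` is a hypothetical name added only to show what
+# `_make_causal_mask` produces for a tiny input.
+def _demo_causal_mask():
+    """Minimal sketch: the additive causal mask for a (batch=1, seq_len=4) input."""
+    mask = _make_causal_mask(torch.Size((1, 4)), torch.float32, torch.device("cpu"))
+    # (1, 1, 4, 4): 0 on/below the diagonal, dtype-min above it, so softmax
+    # assigns ~0 probability to future positions.
+    assert mask.shape == (1, 1, 4, 4)
+    assert mask[0, 0, 0, 1].item() == torch.finfo(torch.float32).min  # future blocked
+    assert mask[0, 0, 3, 0].item() == 0.0  # past visible
+    return mask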
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + + +class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + +class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaMoEConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
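+
+
+# NOTE (editor): illustrative sketch, not part of the original checkpoint code;
+# `_demo_repeat_kv` is a hypothetical name. It checks the equivalence stated in the
+# `repeat_kv` docstring above: the expand+reshape is the same as repeat_interleave.
+def _demo_repeat_kv():
+    """Minimal sketch: expanding 2 KV heads to 6 attention heads (n_rep = 3)."""
+    kv = torch.randn(1, 2, 5, 8)  # (batch, num_key_value_heads, seq_len, head_dim)
+    expanded = repeat_kv(kv, n_rep=3)
+    assert expanded.shape == (1, 6, 5, 8)
+    # each KV head is repeated contiguously: [h0, h0, h0, h1, h1, h1]
+    assert torch.equal(expanded, torch.repeat_interleave(kv, repeats=3, dim=1))
+    return expanded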
+
+
+class LlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: LlamaMoEConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self._init_rope()
+
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = LlamaRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+                "Please make sure to use `attention_mask` instead."
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        if self.config.pretraining_tp > 1:
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        if self.config.pretraining_tp > 1:
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
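+
+
+# NOTE (editor): illustrative sketch, not part of the original checkpoint code;
+# `_demo_eager_matches_sdpa` is a hypothetical name. Under the assumptions of no
+# dropout and no attention mask, the eager path above (QK^T / sqrt(d), fp32 softmax,
+# matmul with V) matches torch.nn.functional.scaled_dot_product_attention.
+def _demo_eager_matches_sdpa():
+    q = torch.randn(1, 2, 4, 8)  # (batch, num_heads, seq_len, head_dim)
+    k = torch.randn(1, 2, 4, 8)
+    v = torch.randn(1, 2, 4, 8)
+    attn = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(q.size(-1))
+    attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
+    eager_out = torch.matmul(attn, v)
+    sdpa_out = F.scaled_dot_product_attention(q, k, v)  # non-causal, no mask
+    assert torch.allclose(eager_out, sdpa_out, atol=1e-5)
+    return eager_out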
+
+
+class LlamaFlashAttention2(LlamaAttention):
+    """
+    Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stay
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right
+        # alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this
+        # difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces
+        # a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # LlamaFlashAttention2 attention does not support output_attentions
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. "
+                "Please make sure to use `attention_mask` instead."
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout
+        # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
+        # therefore the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (LlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seem to be silently cast to float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
+        first unpads the input, then computes the attention scores, and finally pads the attention scores back.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details,
+            # please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        return attn_output
+
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
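+
+
+# NOTE (editor): illustrative sketch, not part of the original checkpoint code;
+# `_demo_unpad_data` is a hypothetical name. It shows the bookkeeping that
+# `_upad_input` relies on: `_get_unpad_data` turns a padding mask into flat token
+# indices and cumulative sequence lengths for the varlen flash-attention kernels.
+def _demo_unpad_data():
+    # batch of 2 sequences with lengths 3 and 1 (1 = real token, 0 = padding)
+    attention_mask = torch.tensor([[1, 1, 1, 0], [1, 0, 0, 0]])
+    indices, cu_seqlens, max_seqlen = _get_unpad_data(attention_mask)
+    assert indices.tolist() == [0, 1, 2, 4]  # flat positions of real tokens
+    assert cu_seqlens.tolist() == [0, 3, 4]  # prefix sums of lengths: [0, 3, 3+1]
+    assert max_seqlen == 3
+    return indices, cu_seqlens, max_seqlen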
+
+
+class LlamaSdpaAttention(LlamaAttention):
+    """
+    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `LlamaAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt
+    to the SDPA API.
+    """
+
+    # Adapted from LlamaAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not"
+                " support `output_attentions=True`. Falling back to the manual attention implementation, but specifying"
+                " the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can"
+                ' be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with
+        # custom attn_mask. Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a
+            # causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+LLAMA_ATTENTION_CLASSES = {
+    "eager": LlamaAttention,
+    "flash_attention_2": LlamaFlashAttention2,
+    "sdpa": LlamaSdpaAttention,
+}
+
+
+class TopKBalancedNoisyGate(nn.Module):
+    def __init__(
+        self,
+        input_size,
+        num_experts,
+        num_selects,
+        gate_network="mlp",
+        use_softmax=True,
+        use_balance=True,
+        balance_loss_weight=1e-2,
+        add_noise=True,
+        noise_epsilon=1e-2,
+    ):
+        super(TopKBalancedNoisyGate, self).__init__()
+        assert num_selects <= num_experts
+        self.input_size = input_size
+        self.num_experts = num_experts
+        self.num_selects = num_selects
+
+        self.gate_network_type = gate_network
+        self.gate_network = self.get_gate_network(gate_network, input_size, num_experts)
+
+        self.use_softmax = use_softmax
+        self.softmax = nn.Softmax(1)
+
+        self.use_balance = use_balance
+        self.balance_loss_weight = balance_loss_weight
+
+        # add_noise
+        self.add_noise = add_noise
+        self.noise_epsilon = noise_epsilon
+        self.warned = False
+        if self.add_noise:
+            self.weight_noise = nn.Linear(input_size, num_experts, bias=False)
+            self.weight_noise.weight.data = torch.zeros(
+                (num_experts, input_size),
+                requires_grad=True,
+                device=self.weight_noise.weight.data.device,
+                dtype=self.weight_noise.weight.data.dtype,
+            )
+            self.mean = 0.0
+            self.std = 1.0
+            self.normal = Normal(self.mean, self.std)
+            self.softplus = nn.Softplus()
+
+        self.reset_parameters()
+
+    def get_gate_network(self, gate_type, input_size, num_experts):
+        gate_type = gate_type.lower()
+
+        if gate_type == "linear":
+            gate_network = nn.Linear(input_size, num_experts, bias=False)
+            nn.init.zeros_(gate_network.weight)
+        elif gate_type == "mlp":
+            gate_network = torch.nn.Sequential(
+                torch.nn.Linear(input_size, num_experts, bias=False),
+                torch.nn.Tanh(),
+                torch.nn.Linear(num_experts, num_experts, bias=False),
+            )
+        else:
+            raise ValueError(f'Unexpected gate_type: {gate_type}.')
+
+        return gate_network
+
+    def reset_gate_network(self):
+        if "gate_network_type" not in vars(self):
+            raise KeyError(f"{type(self)} does not have a gate network.")
+        else:
+            self.gate_network = self.get_gate_network(
+                self.gate_network_type, self.input_size, self.num_experts
+            )
+
+    def reset_parameters(self):
+        if self.add_noise:
+            nn.init.zeros_(self.weight_noise.weight)
+            # nn.init.zeros_(self.weight_noise)
+
+    def cv_squared(self, x, eps=1e-10):
+        """The squared coefficient of variation of a sample.
+        Useful as a loss to encourage a positive distribution to be more uniform.
+        Epsilons added for numerical stability.
+        Returns 0 for an empty Tensor.
+        Args:
+            x: a `Tensor`.
+        Returns:
+            a `Scalar`.
+        """
+        if x.shape[0] == 1:
+            return torch.tensor(0.0, device=x.device)
+        return x.float().var() / (x.float().mean() ** 2 + eps)
+
+    def forward(self, x):
+        logits_gate = self.gate_network(x)
+        if self.training and self.add_noise:
+            noise_mm = self.weight_noise(x)
+            noise_control = self.softplus(noise_mm) + self.noise_epsilon
+            logits_noise = torch.randn_like(logits_gate) * noise_control
+            logits = logits_gate + logits_noise
+        else:
+            logits = logits_gate
+
+        # select and sort the top k+1 gate weights
+        top_logits, top_indices = logits.topk(min(self.num_selects + 1, self.num_experts), dim=1)
+        top_k_logits = top_logits[:, : self.num_selects]
+        top_k_indices = top_indices[:, : self.num_selects]
+        top_k_scores = self.softmax(top_k_logits.to(torch.float32)) if self.use_softmax else top_k_logits
+        top_k_scores = top_k_scores.to(logits.dtype)
+
+        zeros = torch.zeros_like(logits, requires_grad=True, device=logits.device)
+        scores_filtered = zeros.scatter(dim=1, index=top_k_indices, src=top_k_scores)  # shape(batch_size, num_experts)
+        importance = scores_filtered.sum(0)  # shape(num_experts)
+
+        if self.training:
+            if self.add_noise and self.num_selects != self.num_experts:
+                batch_size = top_logits.size(0)
+                m = top_logits.size(1)
+                top_values_flat = top_logits.flatten()
+                threshold_positions_if_in = torch.arange(batch_size, device=x.device) * m + self.num_selects
+                threshold_if_in = torch.unsqueeze(torch.gather(top_values_flat, 0, threshold_positions_if_in), 1)
+                is_in = torch.gt(logits_noise, threshold_if_in)
+                threshold_positions_if_out = threshold_positions_if_in - 1
+                threshold_if_out = torch.unsqueeze(torch.gather(top_values_flat, 0, threshold_positions_if_out), 1)
+                # is each value currently in the top k.
+                prob_if_in = self.normal.cdf((logits_gate - threshold_if_in) / noise_control)
+                prob_if_out = self.normal.cdf((logits_gate - threshold_if_out) / noise_control)
+                prob = torch.where(is_in, prob_if_in, prob_if_out)
+                load = prob.sum(0)
+            else:
+                load = (scores_filtered > 0).sum(0)
+                if not self.add_noise and not self.warned:
+                    warnings.warn(
+                        'Gradient-trackable implementation for load calculation is only available when "add_noise=True". '
+                        'Training without noise will block the gradient from the "load" path and lead to inconsistency '
+                        "in optimization objectives."
+                    )
+                    self.warned = True
+        else:
+            load = (scores_filtered > 0).sum(0)
+
+        if self.use_balance:
+            balance_loss = self.cv_squared(importance) + self.cv_squared(load)
+            balance_loss *= self.balance_loss_weight
+        else:
+            balance_loss = torch.tensor(-100.0, device=x.device)
+
+        return {
+            "topK_indices": top_k_indices,
+            "topK_scores": top_k_scores,
+            "balance_loss": balance_loss,
+            "load": load,
+            "importance": importance,
+        }
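+
+
+# NOTE (editor): illustrative sketch, not part of the original checkpoint code;
+# `_demo_noisy_gate` is a hypothetical name. It exercises the gate above on random
+# features: top-2 of 8 experts per token, softmax-normalized scores, and a scalar
+# balance loss built from the squared coefficients of variation of the
+# importance/load vectors (eval mode is used so the gating noise is disabled).
+def _demo_noisy_gate():
+    gate = TopKBalancedNoisyGate(input_size=16, num_experts=8, num_selects=2)
+    gate.eval()  # deterministic: no noise is added outside training
+    out = gate(torch.randn(4, 16))
+    assert out["topK_indices"].shape == (4, 2)
+    assert out["topK_scores"].shape == (4, 2)
+    # per-token scores are softmax-normalized over the selected experts
+    assert torch.allclose(out["topK_scores"].sum(dim=1), torch.ones(4))
+    assert out["balance_loss"].ndim == 0  # scalar auxiliary loss
+    return out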
' + 'Training without noise will block the gradient from "load" path and lead to inconsistency in optimization objectives.') + self.warned = True + else: + load = (scores_filtered > 0).sum(0) + + if self.use_balance: + balance_loss = self.cv_squared(importance) + self.cv_squared(load) + balance_loss *= self.balance_loss_weight + else: + balance_loss = torch.tensor(-100.0, device=x.device) + + return { + "topK_indices": top_k_indices, + "topK_scores": top_k_scores, + "balance_loss": balance_loss, + "load": load, + "importance": importance, + } + + +class LinearGLUExperts(nn.Module): + """ + Modified from transformers.models.llama.modeling_llama.LlamaMLP + """ + + __constants__ = [ + "bias", + "in_features", + "hidden_features", + "out_features", + "hidden_act", + "num_experts", + "size_experts", + ] + + def __init__( + self, + in_features, + hidden_features, + out_features, + hidden_act, + num_experts, + size_experts=None, + bias=True, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super(LinearGLUExperts, self).__init__() + self.in_features = in_features + self.hidden_features = hidden_features + self.out_features = out_features + self.hidden_act = hidden_act + self.num_experts = num_experts + + if size_experts is None: + # all experts share the same number of hidden neurons + assert hidden_features % num_experts == 0 + size_per_expert = hidden_features // num_experts + size_experts = [size_per_expert for _ in range(num_experts)] + else: + # use specified expert sizes + assert ( + len(size_experts) == num_experts + and sum(size_experts) == hidden_features + ) + self.size_experts = size_experts + + self.act_fn = ACT2FN[hidden_act] + + self.weight_gate = nn.ParameterList() + self.weight_up = nn.ParameterList() + self.weight_down = nn.ParameterList() + + for i in range(num_experts): + # this matrix will be transposed when performing linear forwarding + this_expert_weight_gate = nn.Parameter( + torch.empty((size_experts[i], in_features), **factory_kwargs) + ) + # this matrix will be transposed when performing linear forwarding + this_expert_weight_up = nn.Parameter( + torch.empty((size_experts[i], in_features), **factory_kwargs) + ) + # this matrix will be transposed when performing linear forwarding + this_expert_weight_down = nn.Parameter( + torch.empty((out_features, size_experts[i]), **factory_kwargs) + ) + self.weight_gate.append(this_expert_weight_gate) + self.weight_up.append(this_expert_weight_up) + self.weight_down.append(this_expert_weight_down) + + if bias: + self.bias_gate = nn.ParameterList() + self.bias_up = nn.ParameterList() + self.bias_down = nn.ParameterList() + + for i in range(num_experts): + this_expert_bias_gate = nn.Parameter( + torch.empty((size_experts[i],), **factory_kwargs) + ) + this_expert_bias_up = nn.Parameter( + torch.empty((size_experts[i],), **factory_kwargs) + ) + this_expert_bias_down = nn.Parameter( + torch.empty((out_features,), **factory_kwargs) + ) + self.bias_gate.append(this_expert_bias_gate) + self.bias_up.append(this_expert_bias_up) + self.bias_down.append(this_expert_bias_down) + else: + self.register_parameter("bias_gate", None) + self.register_parameter("bias_up", None) + self.register_parameter("bias_down", None) + + self.reset_parameters() + + def reset_parameters(self): + for i in range(self.num_experts): + nn.init.kaiming_uniform_(self.weight_gate[i], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.weight_up[i], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.weight_down[i], a=math.sqrt(5)) + if 
self.bias_gate is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_gate[i]) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias_gate[i], -bound, bound) + if self.bias_up is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_up[i]) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias_up[i], -bound, bound) + if self.bias_down is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight_down[i]) + bound = 1 / math.sqrt(fan_in) + nn.init.uniform_(self.bias_down[i], -bound, bound) + + def forward(self, input, i): + gate = self.act_fn( + F.linear( + input, + self.weight_gate[i], + self.bias_gate[i] if self.bias_gate is not None else None, + ) + ) + up = F.linear( + input, + self.weight_up[i], + self.bias_up[i] if self.bias_up is not None else None, + ) + down = F.linear( + gate * up, + self.weight_down[i], + self.bias_down[i] if self.bias_down is not None else None, + ) + return down + + def extra_repr(self): + return ( + "in_features={}, hidden_features={}, out_features={}, hidden_act={}," + " num_experts={}, size_experts={}, bias={}".format( + self.in_features, + self.hidden_features, + self.out_features, + self.hidden_act, + self.num_experts, + self.size_experts, + self.bias_gate is not None, + ) + ) + + +class UniversalCalculator(nn.Module): + def __init__( + self, + experts: LinearGLUExperts, + multiply_gate_scores=True, + score_scale_factor=1.0, + add_weight_norm: bool = False, + ): + super(UniversalCalculator, self).__init__() + self.experts = experts + # TODO (zhutong): use vmap to boost the training efficiency + # self.experts_vmap = torch.vmap(self.experts) + self.multiply_gate_scores = multiply_gate_scores + self.score_scale_factor = score_scale_factor + self.num_experts = experts.num_experts + self.mlp_norm = None + if multiply_gate_scores and add_weight_norm: + raise NotImplementedError + + def reset_experts(self): + self.experts.reset_parameters() + + def forward( + self, x, topK_indices, topK_scores, expert_batch_size=None, **kwargs + ) -> CalculatorOutput: + batch_size = topK_indices.size(0) # topK_indices: (bsz*seq_len, num_selects) + num_selects = topK_indices.size(1) + topK_indices = topK_indices.flatten() # shape(batch_size*num_selects) + topK_scores = topK_scores.flatten() # shape(batch_size*num_selects) + batch_indices = torch.arange( + batch_size, device=topK_scores.device + ).repeat_interleave(num_selects) + + _, index_sorted_topK_indices = topK_indices.sort(0) + + sorted_topK_scores = topK_scores.index_select(0, index_sorted_topK_indices) + sorted_batch_indices = batch_indices.index_select(0, index_sorted_topK_indices) + + if expert_batch_size is None: + expert_batch_size = topK_indices.bincount( + minlength=self.num_experts + ).tolist() + + sorted_x = x.index_select(0, sorted_batch_indices) + split_x = torch.split(sorted_x, expert_batch_size, dim=0) + + expert_outputs = [ + self.experts(split_x[i], i) + for i in range(self.num_experts) + if split_x[i].shape[0] > 0 + ] + + # (bsz*seq_len*num_selects, hidden_size) + cat_expert_outputs = torch.cat(expert_outputs, 0) + output_dim = cat_expert_outputs.size(1) + if self.multiply_gate_scores: + if self.mlp_norm is None: + cat_expert_outputs = torch.mul( + cat_expert_outputs, + sorted_topK_scores.reshape(-1, 1) * self.score_scale_factor, + ) + # cat_expert_outputs = torch.mul(cat_expert_outputs, sorted_topK_scores.reshape(-1, 1) * 1.0) + else: + cat_expert_outputs = torch.mul( + cat_expert_outputs, sorted_topK_scores.reshape(-1, 1) + ) + 
cat_expert_outputs = self.mlp_norm(cat_expert_outputs) + + zeros = torch.zeros( + (batch_size, output_dim), + device=cat_expert_outputs.device, + dtype=cat_expert_outputs.dtype, + ) + y = zeros.index_add(0, sorted_batch_indices, cat_expert_outputs) + + return CalculatorOutput(hidden_states=y, num_dropped_tokens=torch.tensor(-1.0)) + + +class BaseMoELayer(nn.Module): + def __init__(self): + super(BaseMoELayer, self).__init__() + + self.gate: TopKBalancedNoisyGate + self.calculator: UniversalCalculator + + def _create_gate(self, **kwargs): + self.gate_type = kwargs.get("gate_type", "TopKBalancedNoisyGate") + + if self.gate_type == "TopKBalancedNoisyGate": # noisy gate + self.gate = TopKBalancedNoisyGate( + self.input_size, + self.num_experts, + self.num_selects, + gate_network=kwargs.get("gate_network", "mlp"), + use_softmax=kwargs.get("gate_use_softmax", True), + use_balance=kwargs.get("gate_use_balance", True), + balance_loss_weight=kwargs.get("gate_balance_loss_weight", 1e-2), + add_noise=kwargs.get("gate_add_noise", True), + noise_epsilon=kwargs.get("gate_noise_epsilon", 1e-2), + ) + else: + raise NotImplementedError + + def _create_calculator(self, experts, **kwargs): + self.calculator_type = kwargs.get("calculator_type", "UniversalCalculator") + + if self.calculator_type == "UniversalCalculator": # top K calculator + self.calculator = UniversalCalculator( + experts, + multiply_gate_scores=kwargs.get("multiply_gate_scores", True), + score_scale_factor=kwargs.get("score_scale_factor", 1.0), + add_weight_norm=kwargs.get("add_weight_norm", False), + ) + else: + raise NotImplementedError + + def forward(self, x, attention_mask=None) -> MoEMlpOutput: + original_shape = x.shape[:-1] + x = x.reshape(-1, self.input_size) + flattened_mask = None + if attention_mask is not None and len(attention_mask.shape) == 2: + flattened_mask = attention_mask.flatten() + flattened_shape = flattened_mask.shape + x = x[flattened_mask.bool()] + + gate_outputs: dict = self.gate(x) + calc_outs: CalculatorOutput = self.calculator(x, **gate_outputs) + + y = calc_outs.hidden_states + if flattened_mask is not None: + y = torch.zeros(flattened_shape + (self.output_size,), dtype=x.dtype, device=x.device) # (batch_size*seq_len, output_size) + y[flattened_mask.bool()] = calc_outs.hidden_states # (non_padding_num, output_size) + y = y.reshape(original_shape + (self.output_size,)) + + return MoEMlpOutput( + hidden_states=y, + balance_loss=gate_outputs.get("balance_loss"), + num_dropped_tokens=calc_outs.num_dropped_tokens, + gate_load=gate_outputs.get("load", torch.tensor(-1)), + gate_importance=gate_outputs.get("importance", torch.tensor(-1)), + ) + + def reset_gate_network(self): + self.gate.reset_gate_network() + + def reset_experts(self): + self.calculator.reset_experts() + + +class LinearGLUMoELayer(BaseMoELayer): + def __init__( + self, + input_size, + hidden_size, + output_size, + hidden_act, + num_experts, + num_selects, + size_experts=None, + bias=True, + **kwargs, + ): + super(LinearGLUMoELayer, self).__init__() + assert num_selects <= num_experts + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.hidden_act = hidden_act + self.num_experts = num_experts + self.num_selects = num_selects + self.size_experts = size_experts + self.bias = bias + + experts = LinearGLUExperts( + input_size, + hidden_size, + output_size, + hidden_act, + num_experts, + size_experts=size_experts, + bias=bias, + ) + + self._create_gate(**kwargs) + self._create_calculator(experts, **kwargs) 
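+ + + # Illustrative sketch (for exposition only; not part of the original file): each decoder layer below + # builds this MoE MLP with the sizes from config.json — hidden_size 4096, intermediate_size 11008 + # split into 8 experts of 1376 neurons each, top-2 routing, and score_scale_factor 4.0: + # + #     moe = LinearGLUMoELayer( + #         input_size=4096, hidden_size=11008, output_size=4096, hidden_act="silu", + #         num_experts=8, num_selects=2, size_experts=[1376] * 8, bias=False, + #         gate_type="TopKBalancedNoisyGate", multiply_gate_scores=True, score_scale_factor=4.0, + #     ) + #     out = moe(torch.rand(64, 4096))   # 64 token vectors + #     out.hidden_states.shape           # torch.Size([64, 4096]) + #     out.balance_loss                  # 0.01 * (cv_squared(importance) + cv_squared(load))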
+ + +class LlamaMoEDecoderLayer(nn.Module): + def __init__(self, config: LlamaMoEConfig, layer_index): + super().__init__() + + self.hidden_size = config.hidden_size + # self.self_attn = LlamaAttention(config=config) + self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_index) + + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + gating_config = { + # all gates + "gate_type": config.gate_type, + "gate_network": config.gate_network, + "gate_use_softmax": config.gate_use_softmax, + "gate_use_balance": config.gate_use_balance, + "gate_balance_loss_weight": config.gate_balance_loss_weight, + "gate_add_noise": config.gate_add_noise, + # TopKBalancedNoisyGate + "gate_noise_epsilon": config.gate_noise_epsilon, + } + calculator_config = { + # all calculators + "calculator_type": config.calculator_type, + "multiply_gate_scores": config.multiply_gate_scores, + "score_scale_factor": ( + config.score_scale_factor[layer_index] + if isinstance(config.score_scale_factor, list) + else config.score_scale_factor + ), + "add_weight_norm": config.add_weight_norm, + # SwitchDropTokenCalculator + "drop_tokens": config.drop_tokens, + "dropped_padding": config.dropped_padding, + "capacity_factor": config.capacity_factor, + } + + self.mlp = LinearGLUMoELayer( + input_size=self.hidden_size, + hidden_size=config.intermediate_size, + output_size=self.hidden_size, + hidden_act=config.hidden_act, + num_experts=config.num_experts, + num_selects=config.num_selects, + size_experts=( + config.size_experts[layer_index] + if config.size_experts is not None + else None + ), + bias=False, + **gating_config, + **calculator_config, + ) + + def forward( + self, + hidden_states, + attention_mask=None, + position_ids=None, + past_key_value=None, + output_attentions=False, + use_cache=False, + ) -> tuple: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + mlp_outs: MoEMlpOutput = self.mlp(hidden_states, attention_mask=attention_mask) + hidden_states = residual + mlp_outs.hidden_states + + outputs = ( + hidden_states, + mlp_outs.balance_loss, + mlp_outs.num_dropped_tokens, + mlp_outs.gate_load, + mlp_outs.gate_importance, + ) + if output_attentions: + outputs += (self_attn_weights,) + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class LlamaMoEPreTrainedModel(PreTrainedModel): + config_class = LlamaMoEConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaMoEDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + 
module.weight.data[module.padding_idx].zero_() + + + class LlamaMoEModel(LlamaMoEPreTrainedModel): + def __init__(self, config: LlamaMoEConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [LlamaMoEDecoderLayer(config, i) for i in range(config.num_hidden_layers)] + ) + self._use_sdpa = config._attn_implementation == "sdpa" + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at" + " the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError( + "You have to specify either input_ids or inputs_embeds" + ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._use_sdpa and not output_attentions: + # output_attentions=True cannot be supported when using SDPA, so we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + balance_loss = 0.0 + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + num_dropped_tokens = () + gate_load = () + gate_importance = () + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + if layer_outputs[1] is not None: + balance_loss += layer_outputs[1] + + if use_cache: + next_decoder_cache = layer_outputs[6 if output_attentions else 5] + + if output_attentions: + all_self_attns += (layer_outputs[5],) + + num_dropped_tokens += (layer_outputs[2],) + gate_load += (layer_outputs[3],) + gate_importance += (layer_outputs[4],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseMoEModelOutputWithPast( + last_hidden_state=hidden_states, + balance_loss=balance_loss, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + num_dropped_tokens=num_dropped_tokens, + gate_load=gate_load, + gate_importance=gate_importance, + ) + + def reset_gate_network(self): + for idx, decoder_layer in enumerate(self.layers): + decoder_layer.reset_gate_network() + + def reset_experts(self): + for idx, decoder_layer in enumerate(self.layers): + decoder_layer.reset_experts() + + +class LlamaMoEForCausalLM(LlamaMoEPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = LlamaMoEModel(config) + self.pretraining_tp = config.pretraining_tp + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + past_key_values=None, + inputs_embeds=None, + labels=None, + use_cache=None, + 
output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseMoEModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs.last_hidden_state + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + if outputs.balance_loss is not None and outputs.balance_loss > 0: + loss += outputs.balance_loss + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return MoECausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + num_dropped_tokens=outputs.num_dropped_tokens, + balance_loss=outputs.balance_loss, + gate_load=outputs.gate_load, + gate_importance=outputs.gate_importance, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing inputs_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If past_length is smaller than the length of input_ids, then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
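+ # (with a bounded cache, only the last max_cache_length positions remain cached, so the mask is trimmed to match)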
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def reset_gate_network(self): + self.model.reset_gate_network() + + def reset_experts(self): + self.model.reset_experts() diff --git a/sampling_info/100/load.pdf b/sampling_info/100/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6cc6eb7d5ae651404ed6b300dabd3d8c01dd3cb1 Binary files /dev/null and b/sampling_info/100/load.pdf differ diff --git a/sampling_info/100/prob_map.pdf b/sampling_info/100/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9e3272339d043dbca65b92362b44c3cb228ca0c6 Binary files /dev/null and b/sampling_info/100/prob_map.pdf differ diff --git a/sampling_info/100/sim.pdf b/sampling_info/100/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..cc92d57d5032a2470e386d8b3cfac587cca60503 Binary files /dev/null and b/sampling_info/100/sim.pdf differ diff --git a/sampling_info/1000/load.pdf b/sampling_info/1000/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0b378bd8fb22b57cfd1391db4529ac95ec1e2292 Binary files /dev/null and b/sampling_info/1000/load.pdf differ diff --git a/sampling_info/1000/prob_map.pdf b/sampling_info/1000/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e4d31f8a6f3f6592d1ff85a88f9629c3123c7c3f Binary files /dev/null and b/sampling_info/1000/prob_map.pdf differ diff --git a/sampling_info/1000/sim.pdf b/sampling_info/1000/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2ece3648860b77efcba8cdb60eee529f6c188091 Binary files /dev/null and b/sampling_info/1000/sim.pdf differ diff --git a/sampling_info/1100/load.pdf b/sampling_info/1100/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0e3bc65644ba960eb63153005ab196d63bc55079 Binary files /dev/null and b/sampling_info/1100/load.pdf differ diff --git a/sampling_info/1100/prob_map.pdf b/sampling_info/1100/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c082785ec24e7c8dcdeaefc911cdc4df39aaed9a Binary files /dev/null and b/sampling_info/1100/prob_map.pdf differ diff --git a/sampling_info/1100/sim.pdf b/sampling_info/1100/sim.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..7d957e7d97b1c90f544e6098c9a2d6d12501e502 Binary files /dev/null and b/sampling_info/1100/sim.pdf differ diff --git a/sampling_info/1200/load.pdf b/sampling_info/1200/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7092fef87780daa99c4c27b9b51ee5472efd183d Binary files /dev/null and b/sampling_info/1200/load.pdf differ diff --git a/sampling_info/1200/prob_map.pdf b/sampling_info/1200/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bfe97c1c4f66ea433004745b6fa44deddf219045 Binary files /dev/null and b/sampling_info/1200/prob_map.pdf differ diff --git a/sampling_info/1200/sim.pdf b/sampling_info/1200/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3ce61d170a4f24990feaa9befb5cac71f031c8be Binary files /dev/null and b/sampling_info/1200/sim.pdf differ diff --git a/sampling_info/1300/load.pdf b/sampling_info/1300/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5b1c9b63b145a27c90fcf9478c9b9594c67a4065 Binary files /dev/null and b/sampling_info/1300/load.pdf differ diff --git a/sampling_info/1300/prob_map.pdf b/sampling_info/1300/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..754039ee83685d73a714dd8e6e8cb7e0972025a2 Binary files /dev/null and b/sampling_info/1300/prob_map.pdf differ diff --git a/sampling_info/1300/sim.pdf b/sampling_info/1300/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..482a9db9ddca31346756930f033971d6342700c4 Binary files /dev/null and b/sampling_info/1300/sim.pdf differ diff --git a/sampling_info/1400/load.pdf b/sampling_info/1400/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d42c2a335c2ff9ea94ebc028d1c8a6bfff21a483 Binary files /dev/null and b/sampling_info/1400/load.pdf differ diff --git a/sampling_info/1400/prob_map.pdf b/sampling_info/1400/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..65a5485fa34a7debdefcd6bf974956ce68b5abae Binary files /dev/null and b/sampling_info/1400/prob_map.pdf differ diff --git a/sampling_info/1400/sim.pdf b/sampling_info/1400/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e9d94d9eb05c61342526eb0231d233127b9ec22b Binary files /dev/null and b/sampling_info/1400/sim.pdf differ diff --git a/sampling_info/1500/load.pdf b/sampling_info/1500/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c53bd776397dfea411c5faba63706ef290513d75 Binary files /dev/null and b/sampling_info/1500/load.pdf differ diff --git a/sampling_info/1500/prob_map.pdf b/sampling_info/1500/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1ff8648bbd97165d588b3c92add6d7c2d067983e Binary files /dev/null and b/sampling_info/1500/prob_map.pdf differ diff --git a/sampling_info/1500/sim.pdf b/sampling_info/1500/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..41c96d180634f30300fdc1a88c8df689929a8d9a Binary files /dev/null and b/sampling_info/1500/sim.pdf differ diff --git a/sampling_info/1600/load.pdf b/sampling_info/1600/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8aeb1b86ca976e30de8540b21ae1b5b920e191c9 Binary files /dev/null and b/sampling_info/1600/load.pdf differ diff --git a/sampling_info/1600/prob_map.pdf b/sampling_info/1600/prob_map.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..c7362ceb143796dc7fe04e8e25b8dd3b16cb003b Binary files /dev/null and b/sampling_info/1600/prob_map.pdf differ diff --git a/sampling_info/1600/sim.pdf b/sampling_info/1600/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a84bb1ed29423edfbfdea67a2b930dd3859ed96d Binary files /dev/null and b/sampling_info/1600/sim.pdf differ diff --git a/sampling_info/1700/load.pdf b/sampling_info/1700/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a9e2742970fb57a5eebfc752bcee141313be4e1c Binary files /dev/null and b/sampling_info/1700/load.pdf differ diff --git a/sampling_info/1700/prob_map.pdf b/sampling_info/1700/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..292030ce663427650c24ba48aff5cdda4dfc8a38 Binary files /dev/null and b/sampling_info/1700/prob_map.pdf differ diff --git a/sampling_info/1700/sim.pdf b/sampling_info/1700/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..eee716d1ca601d89a65268ae3aef81d02ec13351 Binary files /dev/null and b/sampling_info/1700/sim.pdf differ diff --git a/sampling_info/1800/load.pdf b/sampling_info/1800/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..eb809e83d770b2845aad8c5d0c0e0c9873caff04 Binary files /dev/null and b/sampling_info/1800/load.pdf differ diff --git a/sampling_info/1800/prob_map.pdf b/sampling_info/1800/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..515a9ecebc6f8ece1e84b7ca3e100c6055f415b7 Binary files /dev/null and b/sampling_info/1800/prob_map.pdf differ diff --git a/sampling_info/1800/sim.pdf b/sampling_info/1800/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..69435e32fb379d1983528c8b2d7dc0845f294f4f Binary files /dev/null and b/sampling_info/1800/sim.pdf differ diff --git a/sampling_info/1900/load.pdf b/sampling_info/1900/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..40debcd3cc5be2756ac4d19c1d34ada8879f7347 Binary files /dev/null and b/sampling_info/1900/load.pdf differ diff --git a/sampling_info/1900/prob_map.pdf b/sampling_info/1900/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5d4abf233eb3bcf034ecb43931ef9b0155a0297b Binary files /dev/null and b/sampling_info/1900/prob_map.pdf differ diff --git a/sampling_info/1900/sim.pdf b/sampling_info/1900/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ffda20f156b1997a7f0a8e3029147fcf61ac4ea4 Binary files /dev/null and b/sampling_info/1900/sim.pdf differ diff --git a/sampling_info/200/load.pdf b/sampling_info/200/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4f8ff3b870efb490db41cb6c4cd682018bd92d80 Binary files /dev/null and b/sampling_info/200/load.pdf differ diff --git a/sampling_info/200/prob_map.pdf b/sampling_info/200/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..07f2d9f09ee8e34417a38de6ac47e374cd494e9b Binary files /dev/null and b/sampling_info/200/prob_map.pdf differ diff --git a/sampling_info/200/sim.pdf b/sampling_info/200/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..dbaf5567518363f2e8af88a1dd03588863c032b3 Binary files /dev/null and b/sampling_info/200/sim.pdf differ diff --git a/sampling_info/2000/load.pdf b/sampling_info/2000/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..437a79244da25b133ac02ab19174b6f22d98c40e Binary files 
/dev/null and b/sampling_info/2000/load.pdf differ diff --git a/sampling_info/2000/prob_map.pdf b/sampling_info/2000/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..395510eb7a42b1f0ccab0b74b00d80de6a071b28 Binary files /dev/null and b/sampling_info/2000/prob_map.pdf differ diff --git a/sampling_info/2000/sim.pdf b/sampling_info/2000/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b8abf711b2dda30df0766071b0f9928bddf729dc Binary files /dev/null and b/sampling_info/2000/sim.pdf differ diff --git a/sampling_info/300/load.pdf b/sampling_info/300/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..38101113c8187950fc7d88f3c6e96606595c32f0 Binary files /dev/null and b/sampling_info/300/load.pdf differ diff --git a/sampling_info/300/prob_map.pdf b/sampling_info/300/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..7ec6cbb6302b5b4e3230654f70ba5ffaa7551094 Binary files /dev/null and b/sampling_info/300/prob_map.pdf differ diff --git a/sampling_info/300/sim.pdf b/sampling_info/300/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5f5ea72c687f21e583af927e304f375a25c8c322 Binary files /dev/null and b/sampling_info/300/sim.pdf differ diff --git a/sampling_info/400/load.pdf b/sampling_info/400/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4886e7616e7ab348e38b2cba5ff09ee1e79c0191 Binary files /dev/null and b/sampling_info/400/load.pdf differ diff --git a/sampling_info/400/prob_map.pdf b/sampling_info/400/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..200f7b2d3698aa8af6f0b044c1dd87475c49d9b5 Binary files /dev/null and b/sampling_info/400/prob_map.pdf differ diff --git a/sampling_info/400/sim.pdf b/sampling_info/400/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..bffb512b45ab99f08a748ad280e37e843c7b31ca Binary files /dev/null and b/sampling_info/400/sim.pdf differ diff --git a/sampling_info/500/load.pdf b/sampling_info/500/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..39bf8545fd5bf7d139031fdab934223e685e2540 Binary files /dev/null and b/sampling_info/500/load.pdf differ diff --git a/sampling_info/500/prob_map.pdf b/sampling_info/500/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..447fcc083909f4050349150e1223ea4211b11c79 Binary files /dev/null and b/sampling_info/500/prob_map.pdf differ diff --git a/sampling_info/500/sim.pdf b/sampling_info/500/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d78538ca00f8267fcf714608508f3bd4b8877cc8 Binary files /dev/null and b/sampling_info/500/sim.pdf differ diff --git a/sampling_info/600/load.pdf b/sampling_info/600/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ad6442d9fa3cccc8559a9637e8157cdc22d47e45 Binary files /dev/null and b/sampling_info/600/load.pdf differ diff --git a/sampling_info/600/prob_map.pdf b/sampling_info/600/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c3241a0c02e0cbb40cf2cb0faa7c0486ad14c91d Binary files /dev/null and b/sampling_info/600/prob_map.pdf differ diff --git a/sampling_info/600/sim.pdf b/sampling_info/600/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4c2608e3e9bbd3ed91de2b68a9b42c2f417a47a2 Binary files /dev/null and b/sampling_info/600/sim.pdf differ diff --git a/sampling_info/700/load.pdf 
b/sampling_info/700/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2297ac8a8f832f14c9f4704e20b8490fe1687cae Binary files /dev/null and b/sampling_info/700/load.pdf differ diff --git a/sampling_info/700/prob_map.pdf b/sampling_info/700/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..07163f14c62b0cbae11ce761bb52943a7821a3ff Binary files /dev/null and b/sampling_info/700/prob_map.pdf differ diff --git a/sampling_info/700/sim.pdf b/sampling_info/700/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..16ac462ad590f4cdc7a44a266e95326416b2a794 Binary files /dev/null and b/sampling_info/700/sim.pdf differ diff --git a/sampling_info/800/load.pdf b/sampling_info/800/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9b4aec4c00e04af291272b24dbb4303fb18e9fe5 Binary files /dev/null and b/sampling_info/800/load.pdf differ diff --git a/sampling_info/800/prob_map.pdf b/sampling_info/800/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..48204b60e01768797624caf568d962f024b3d9e7 Binary files /dev/null and b/sampling_info/800/prob_map.pdf differ diff --git a/sampling_info/800/sim.pdf b/sampling_info/800/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f4c2c79366ee3fecdcba8df83df63e3b3b71015f Binary files /dev/null and b/sampling_info/800/sim.pdf differ diff --git a/sampling_info/900/load.pdf b/sampling_info/900/load.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5b4723a369aff02fe3136d5ae61e03f31327261b Binary files /dev/null and b/sampling_info/900/load.pdf differ diff --git a/sampling_info/900/prob_map.pdf b/sampling_info/900/prob_map.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2a0c23874e999b2e77277f67984bf795ba934180 Binary files /dev/null and b/sampling_info/900/prob_map.pdf differ diff --git a/sampling_info/900/sim.pdf b/sampling_info/900/sim.pdf new file mode 100644 index 0000000000000000000000000000000000000000..98107fdeb60e40150db3472ca04004f8b2debc22 Binary files /dev/null and b/sampling_info/900/sim.pdf differ diff --git a/sampling_info/data.jsonl b/sampling_info/data.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5bb1bd27578bb123f8b010a7f61becc470ce116a --- /dev/null +++ b/sampling_info/data.jsonl @@ -0,0 +1,60 @@ +{"step": 100, "old_prob_map": {"code": 0.25, "sharegpt": 0.25, "orca": 0.25, "math": 0.25}, "new_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "sim": [[0.12586449411651235, 0.1258017498961425, 0.12542206677168527, 0.12547745437725044], [0.1258017498961425, 0.1271137439587301, 0.12608451435551657, 0.12595773669791455], [0.12542206677168527, 0.12608451435551657, 0.12707085949740077, 0.12682451018998525], [0.12547745437725044, 0.12595773669791455, 0.12682451018998525, 0.12667757289382245]], "name2load": {"code": [0.13847222222222222, 0.1329861111111111, 0.12631944444444446, 0.10090277777777779, 0.1232638888888889, 0.12413194444444445, 0.13052083333333334, 0.12340277777777776], "orca": [0.15915094039397892, 0.11213124144734748, 0.1264519619387073, 0.12225121726124175, 0.11131973395283709, 0.14101136110492316, 0.11813003214206154, 0.10955351175890271], "math": [0.15772125744084223, 0.10886997589413094, 0.11236286712254635, 0.10700054115216213, 0.12215280169233039, 0.11964382348600382, 0.13885472524228856, 0.13339400796969544], "sharegpt": 
[0.1538298559263893, 0.11250454013479154, 0.1309021752290246, 0.11805359376891722, 0.11528411154606724, 0.1396192340288147, 0.12180172726905847, 0.10800476209693692]}} +{"step": 100, "old_prob_map": {"code": 0.25, "sharegpt": 0.25, "orca": 0.25, "math": 0.25}, "new_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "sim": [[0.12586449411651235, 0.1258017498961425, 0.12542206677168527, 0.12547745437725044], [0.1258017498961425, 0.1271137439587301, 0.12608451435551657, 0.12595773669791455], [0.12542206677168527, 0.12608451435551657, 0.12707085949740077, 0.12682451018998525], [0.12547745437725044, 0.12595773669791455, 0.12682451018998525, 0.12667757289382245]], "name2load": {"code": [0.13847222222222222, 0.1329861111111111, 0.12631944444444446, 0.10090277777777779, 0.1232638888888889, 0.12413194444444445, 0.13052083333333334, 0.12340277777777776], "orca": [0.15915094039397892, 0.11213124144734748, 0.1264519619387073, 0.12225121726124175, 0.11131973395283709, 0.14101136110492316, 0.11813003214206154, 0.10955351175890271], "math": [0.15772125744084223, 0.10886997589413094, 0.11236286712254635, 0.10700054115216213, 0.12215280169233039, 0.11964382348600382, 0.13885472524228856, 0.13339400796969544], "sharegpt": [0.1538298559263893, 0.11250454013479154, 0.1309021752290246, 0.11805359376891722, 0.11528411154606724, 0.1396192340288147, 0.12180172726905847, 0.10800476209693692]}} +{"step": 100, "old_prob_map": {"code": 0.25, "sharegpt": 0.25, "orca": 0.25, "math": 0.25}, "new_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "sim": [[0.12586449411651235, 0.1258017498961425, 0.12542206677168527, 0.12547745437725044], [0.1258017498961425, 0.1271137439587301, 0.12608451435551657, 0.12595773669791455], [0.12542206677168527, 0.12608451435551657, 0.12707085949740077, 0.12682451018998525], [0.12547745437725044, 0.12595773669791455, 0.12682451018998525, 0.12667757289382245]], "name2load": {"code": [0.13847222222222222, 0.1329861111111111, 0.12631944444444446, 0.10090277777777779, 0.1232638888888889, 0.12413194444444445, 0.13052083333333334, 0.12340277777777776], "orca": [0.15915094039397892, 0.11213124144734748, 0.1264519619387073, 0.12225121726124175, 0.11131973395283709, 0.14101136110492316, 0.11813003214206154, 0.10955351175890271], "math": [0.15772125744084223, 0.10886997589413094, 0.11236286712254635, 0.10700054115216213, 0.12215280169233039, 0.11964382348600382, 0.13885472524228856, 0.13339400796969544], "sharegpt": [0.1538298559263893, 0.11250454013479154, 0.1309021752290246, 0.11805359376891722, 0.11528411154606724, 0.1396192340288147, 0.12180172726905847, 0.10800476209693692]}} +{"step": 200, "old_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "new_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "sim": [[0.12557887008101853, 0.12565037163214773, 0.1255207902385726, 0.12553045151780629], [0.12565037163214773, 0.12691183050468666, 0.12602518187654757, 0.12586824087634546], [0.1255207902385726, 0.12602518187654757, 0.12701431126844212, 0.1267592077148191], [0.12553045151780629, 0.12586824087634546, 0.1267592077148191, 0.12658454559077903]], "name2load": {"code": [0.14038194444444446, 0.12777777777777777, 0.12628472222222223, 0.10847222222222222, 0.1252777777777778, 
0.12333333333333332, 0.1295138888888889, 0.11895833333333335], "orca": [0.15915094039397892, 0.11076281704483977, 0.12624510708716544, 0.12474938739140119, 0.11291092511854374, 0.13927696273430287, 0.11712758170766635, 0.10977627852210163], "math": [0.1569587248487234, 0.10840261720863877, 0.10938653023072761, 0.11396172578344076, 0.12547350814188027, 0.11779898656958725, 0.1381905839523786, 0.1298273232646234], "sharegpt": [0.15361798296944992, 0.11186892126397352, 0.1299941482707131, 0.1208079422091287, 0.1149158561685298, 0.13860527059203356, 0.12090883409338553, 0.10928104443278583]}} +{"step": 200, "old_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "new_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "sim": [[0.12557887008101853, 0.12565037163214773, 0.1255207902385726, 0.12553045151780629], [0.12565037163214773, 0.12691183050468666, 0.12602518187654757, 0.12586824087634546], [0.1255207902385726, 0.12602518187654757, 0.12701431126844212, 0.1267592077148191], [0.12553045151780629, 0.12586824087634546, 0.1267592077148191, 0.12658454559077903]], "name2load": {"code": [0.14038194444444446, 0.12777777777777777, 0.12628472222222223, 0.10847222222222222, 0.1252777777777778, 0.12333333333333332, 0.1295138888888889, 0.11895833333333335], "orca": [0.15915094039397892, 0.11076281704483977, 0.12624510708716544, 0.12474938739140119, 0.11291092511854374, 0.13927696273430287, 0.11712758170766635, 0.10977627852210163], "math": [0.1569587248487234, 0.10840261720863877, 0.10938653023072761, 0.11396172578344076, 0.12547350814188027, 0.11779898656958725, 0.1381905839523786, 0.1298273232646234], "sharegpt": [0.15361798296944992, 0.11186892126397352, 0.1299941482707131, 0.1208079422091287, 0.1149158561685298, 0.13860527059203356, 0.12090883409338553, 0.10928104443278583]}} +{"step": 200, "old_prob_map": {"code": 0.2572303488138314, "math": 0.2588491538502336, "orca": 0.24410416223157866, "sharegpt": 0.23981633510435646}, "new_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "sim": [[0.12557887008101853, 0.12565037163214773, 0.1255207902385726, 0.12553045151780629], [0.12565037163214773, 0.12691183050468666, 0.12602518187654757, 0.12586824087634546], [0.1255207902385726, 0.12602518187654757, 0.12701431126844212, 0.1267592077148191], [0.12553045151780629, 0.12586824087634546, 0.1267592077148191, 0.12658454559077903]], "name2load": {"code": [0.14038194444444446, 0.12777777777777777, 0.12628472222222223, 0.10847222222222222, 0.1252777777777778, 0.12333333333333332, 0.1295138888888889, 0.11895833333333335], "orca": [0.15915094039397892, 0.11076281704483977, 0.12624510708716544, 0.12474938739140119, 0.11291092511854374, 0.13927696273430287, 0.11712758170766635, 0.10977627852210163], "math": [0.1569587248487234, 0.10840261720863877, 0.10938653023072761, 0.11396172578344076, 0.12547350814188027, 0.11779898656958725, 0.1381905839523786, 0.1298273232646234], "sharegpt": [0.15361798296944992, 0.11186892126397352, 0.1299941482707131, 0.1208079422091287, 0.1149158561685298, 0.13860527059203356, 0.12090883409338553, 0.10928104443278583]}} +{"step": 300, "old_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "new_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 
0.23390804147258062, "sharegpt": 0.22292633572109724}, "sim": [[0.12574693769290124, 0.1257249944313256, 0.12580449354595183, 0.1257251789850052], [0.1257249944313256, 0.12691919157975104, 0.12625310413982882, 0.12598091200100756], [0.12580449354595183, 0.12625310413982882, 0.12722391119439105, 0.1268789970410317], [0.1257251789850052, 0.12598091200100756, 0.1268789970410317, 0.12665033434484935]], "name2load": {"code": [0.14305555555555555, 0.13052083333333334, 0.13197916666666668, 0.11215277777777778, 0.11579861111111112, 0.12083333333333333, 0.12854166666666667, 0.11711805555555556], "orca": [0.16174458199408076, 0.11194029850746268, 0.128297743690927, 0.12694523120007636, 0.10855106132450752, 0.1353467205550075, 0.11860738949177353, 0.10856697323616459], "math": [0.1594431052294977, 0.11096079106606975, 0.11413391056230628, 0.1136911497023663, 0.12011118217149602, 0.11592955182761842, 0.13833817090569192, 0.1273921385349535], "sharegpt": [0.15417793292707535, 0.11246418338108882, 0.13297550345050244, 0.1228005569232011, 0.11246418338108882, 0.13609810726825133, 0.12055066790427377, 0.10846886476451834]}} +{"step": 300, "old_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "new_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 0.23390804147258062, "sharegpt": 0.22292633572109724}, "sim": [[0.12574693769290124, 0.1257249944313256, 0.12580449354595183, 0.1257251789850052], [0.1257249944313256, 0.12691919157975104, 0.12625310413982882, 0.12598091200100756], [0.12580449354595183, 0.12625310413982882, 0.12722391119439105, 0.1268789970410317], [0.1257251789850052, 0.12598091200100756, 0.1268789970410317, 0.12665033434484935]], "name2load": {"code": [0.14305555555555555, 0.13052083333333334, 0.13197916666666668, 0.11215277777777778, 0.11579861111111112, 0.12083333333333333, 0.12854166666666667, 0.11711805555555556], "orca": [0.16174458199408076, 0.11194029850746268, 0.128297743690927, 0.12694523120007636, 0.10855106132450752, 0.1353467205550075, 0.11860738949177353, 0.10856697323616459], "math": [0.1594431052294977, 0.11096079106606975, 0.11413391056230628, 0.1136911497023663, 0.12011118217149602, 0.11592955182761842, 0.13833817090569192, 0.1273921385349535], "sharegpt": [0.15417793292707535, 0.11246418338108882, 0.13297550345050244, 0.1228005569232011, 0.11246418338108882, 0.13609810726825133, 0.12055066790427377, 0.10846886476451834]}} +{"step": 300, "old_prob_map": {"code": 0.26041014282145614, "math": 0.2699967341834913, "orca": 0.2388624915646586, "sharegpt": 0.2307306314303939}, "new_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 0.23390804147258062, "sharegpt": 0.22292633572109724}, "sim": [[0.12574693769290124, 0.1257249944313256, 0.12580449354595183, 0.1257251789850052], [0.1257249944313256, 0.12691919157975104, 0.12625310413982882, 0.12598091200100756], [0.12580449354595183, 0.12625310413982882, 0.12722391119439105, 0.1268789970410317], [0.1257251789850052, 0.12598091200100756, 0.1268789970410317, 0.12665033434484935]], "name2load": {"code": [0.14305555555555555, 0.13052083333333334, 0.13197916666666668, 0.11215277777777778, 0.11579861111111112, 0.12083333333333333, 0.12854166666666667, 0.11711805555555556], "orca": [0.16174458199408076, 0.11194029850746268, 0.128297743690927, 0.12694523120007636, 0.10855106132450752, 0.1353467205550075, 0.11860738949177353, 0.10856697323616459], "math": [0.1594431052294977, 0.11096079106606975, 0.11413391056230628, 
0.1136911497023663, 0.12011118217149602, 0.11592955182761842, 0.13833817090569192, 0.1273921385349535], "sharegpt": [0.15417793292707535, 0.11246418338108882, 0.13297550345050244, 0.1228005569232011, 0.11246418338108882, 0.13609810726825133, 0.12055066790427377, 0.10846886476451834]}}
+{"step": 400, "old_prob_map": {"code": 0.262732059091419, "math": 0.2804335637149032, "orca": 0.23390804147258062, "sharegpt": 0.22292633572109724}, "new_prob_map": {"code": 0.26592793408730336, "math": 0.28876391543048435, "orca": 0.22985923187970148, "sharegpt": 0.2154489186025108}, "sim": [[0.1256565586419753, 0.12572712965923813, 0.12552322619754816, 0.12548363978542537], [0.12572712965923813, 0.126813335133358, 0.12601790734538068, 0.1258321498287296], [0.12552322619754816, 0.12601790734538068, 0.1269305361144293, 0.12662832848484412], [0.12548363978542537, 0.1258321498287296, 0.12662832848484412, 0.12640048340498414]], "name2load": {"code": [0.14086805555555554, 0.1329861111111111, 0.12416666666666668, 0.11038194444444444, 0.12090277777777778, 0.1207986111111111, 0.1317361111111111, 0.11815972222222222], "orca": [0.15962829774369092, 0.1129268370302008, 0.12680202399516277, 0.12576774973745344, 0.11252903923877415, 0.13525124908506508, 0.11897336345988607, 0.10812143970976673], "math": [0.15784424656860335, 0.1165936931175284, 0.10643479116446106, 0.1138141388301274, 0.12399763860874699, 0.11735622570964727, 0.13629655138485758, 0.12766271461602793], "sharegpt": [0.15344142217200046, 0.11403305218128253, 0.12936357399410792, 0.12228600831349125, 0.11519835344444893, 0.13466039791759152, 0.12164534484846039, 0.10937184712861696]}}
+{"step": 500, "old_prob_map": {"code": 0.26592793408730336, "math": 0.28876391543048435, "orca": 0.22985923187970148, "sharegpt": 0.2154489186025108}, "new_prob_map": {"code": 0.2669836751180987, "math": 0.2966379337081988, "orca": 0.22764222531614908, "sharegpt": 0.20873616585755356}, "sim": [[0.1259574701003086, 0.12599376240140372, 0.1260121163682299, 0.1259192261326235], [0.12599376240140372, 0.12696372953269297, 0.12641762455469407, 0.12616699032245762], [0.1260121163682299, 0.12641762455469407, 0.12725356468217505, 0.12685254651404831], [0.1259192261326235, 0.12616699032245762, 0.12685254651404831, 0.12656536029397814]], "name2load": {"code": [0.1470138888888889, 0.12184027777777777, 0.12666666666666668, 0.10680555555555557, 0.12201388888888888, 0.12670138888888888, 0.13211805555555556, 0.11684027777777778], "orca": [0.16257200140024822, 0.11082646469146802, 0.12724755752156064, 0.12562454253253985, 0.11042866690004137, 0.13595137319797598, 0.11851191802183113, 0.10883747573433472], "math": [0.15934471392728883, 0.10412259556255228, 0.11241206277365082, 0.11809416047621392, 0.1262114429084469, 0.1197176169626605, 0.1356078122693954, 0.12448959511979142], "sharegpt": [0.15488417611687316, 0.11187396585818636, 0.12985289963275354, 0.1216705678195246, 0.11496630211065821, 0.13559869244118003, 0.12169074619637597, 0.10946264982444812]}}
+{"step": 600, "old_prob_map": {"code": 0.2669836751180987, "math": 0.2966379337081988, "orca": 0.22764222531614908, "sharegpt": 0.20873616585755356}, "new_prob_map": {"code": 0.27003083823938623, "math": 0.3010794462965474, "orca": 0.2251186490107333, "sharegpt": 0.20377106645333304}, "sim": [[0.12566743103780864, 0.12567633173310813, 0.12562479115615952, 0.12557017629595627], [0.12567633173310813, 0.1264801167028205, 0.1260440652850635, 0.12585004924121554], [0.12562479115615952, 0.1260440652850635, 0.12700713538771485, 0.12670045038512148], [0.12557017629595627, 0.12585004924121554, 0.12670045038512148, 0.12648942901261928]], "name2load": {"code": [0.14253472222222222, 0.1292013888888889, 0.12309027777777778, 0.1090625, 0.12319444444444445, 0.12371527777777779, 0.13090277777777778, 0.1182986111111111], "orca": [0.16037615759157303, 0.11216306527066161, 0.12672246443687743, 0.1264996976736785, 0.11119243865958055, 0.13502848232186615, 0.11914839448811379, 0.10886929955764886], "math": [0.1545235401190535, 0.10980469326511537, 0.11351896492350078, 0.11703645397746841, 0.12744133418605796, 0.11740542136075172, 0.13587838835046984, 0.12439120381758254], "sharegpt": [0.15372391944791963, 0.11282739416441342, 0.1299437023285847, 0.1220438677912749, 0.11535473586504702, 0.1362645788772751, 0.1203942854836757, 0.1094475160418096]}}
+{"step": 700, "old_prob_map": {"code": 0.27003083823938623, "math": 0.3010794462965474, "orca": 0.2251186490107333, "sharegpt": 0.20377106645333304}, "new_prob_map": {"code": 0.2721660599163854, "math": 0.3060484802624147, "orca": 0.22232232353028722, "sharegpt": 0.19946313629091267}, "sim": [[0.12591640866126544, 0.12617961093072705, 0.1258935858863112, 0.12586781576217093], [0.12617961093072705, 0.12723727763421092, 0.1265306236290008, 0.12636545218336875], [0.1258935858863112, 0.1265306236290008, 0.12717894129531937, 0.12692870153713742], [0.12586781576217093, 0.12636545218336875, 0.12692870153713742, 0.12677430653529523]], "name2load": {"code": [0.14517361111111113, 0.12329861111111112, 0.12114583333333333, 0.1057638888888889, 0.12111111111111111, 0.13010416666666666, 0.1328125, 0.12059027777777778], "orca": [0.16211055596219331, 0.11036501925341312, 0.12634057855710787, 0.12439932533494574, 0.11015816440187125, 0.1370492951023136, 0.11717531744263757, 0.11240174394551762], "math": [0.1616323117036454, 0.10572145422344664, 0.1116003345304275, 0.11113297584493531, 0.12362867122546367, 0.12348108427215033, 0.1371328774536331, 0.125670290746298], "sharegpt": [0.156226038177489, 0.10993684168045523, 0.12972174018321966, 0.12017232333831065, 0.11424492513822186, 0.13864058275152347, 0.12030348278784456, 0.11075406594293555]}}
+{"step": 800, "old_prob_map": {"code": 0.2721660599163854, "math": 0.3060484802624147, "orca": 0.22232232353028722, "sharegpt": 0.19946313629091267}, "new_prob_map": {"code": 0.2714569825529402, "math": 0.31164467459228024, "orca": 0.22043752359675148, "sharegpt": 0.19646081925802802}, "sim": [[0.1257888744212963, 0.12608671741334734, 0.12586396486826704, 0.12578446662944875], [0.12608671741334734, 0.12703465505802583, 0.1263656558457333, 0.12616774116713667], [0.12586396486826704, 0.1263656558457333, 0.12707414791525715, 0.12679080694508663], [0.12578446662944875, 0.12616774116713667, 0.12679080694508663, 0.1266166860219692]], "name2load": {"code": [0.14774305555555556, 0.12232638888888889, 0.12312500000000001, 0.11069444444444444, 0.12357638888888887, 0.12385416666666667, 0.12944444444444445, 0.1192361111111111], "orca": [0.16113992935111224, 0.11014225249021417, 0.12837730324921234, 0.1275021481080737, 0.11046049072335551, 0.13389873659421442, 0.11617286700824236, 0.1123062724755752], "math": [0.1604516160771388, 0.10520489988685001, 0.11302700841245635, 0.11408471491120187, 0.12601466030402914, 0.12092291041471935, 0.13528804053721652, 0.12500614945638808], "sharegpt": [0.1546823923483595, 0.10977541466564429, 0.13218854675329916, 0.12226582993663988, 0.1145879575446951, 0.1362393559062109, 0.11788207756568062, 0.1123784252794705]}}
+{"step": 900, "old_prob_map": {"code": 0.2714569825529402, "math": 0.31164467459228024, "orca": 0.22043752359675148, "sharegpt": 0.19646081925802802}, "new_prob_map": {"code": 0.2723148904404529, "math": 0.31564146378610697, "orca": 0.21837309802996122, "sharegpt": 0.1936705477434788}, "sim": [[0.12570747251157408, 0.12585408006182255, 0.12568240001131514, 0.12568915602817798], [0.12585408006182255, 0.12659409417087344, 0.1260897289567614, 0.12595491163084996], [0.12568240001131514, 0.1260897289567614, 0.12685691586212447, 0.1266720302140431], [0.12568915602817798, 0.12595491163084996, 0.1266720302140431, 0.12656430150737044]], "name2load": {"code": [0.14270833333333333, 0.12319444444444445, 0.12736111111111112, 0.10944444444444444, 0.12482638888888889, 0.12458333333333334, 0.1323263888888889, 0.11555555555555556], "orca": [0.1588327021608376, 0.11092193616141043, 0.128297743690927, 0.12584730929573879, 0.11176526747923494, 0.1350125704102091, 0.1172389650892658, 0.11208350571237627], "math": [0.1558272248733212, 0.11076400846165199, 0.11388793230678408, 0.11187091061150195, 0.126113051606238, 0.12212820386677817, 0.13646873616372313, 0.12293993211000148], "sharegpt": [0.15380463295532507, 0.11132410508898664, 0.13251140078292103, 0.1214637394567981, 0.1148200088784858, 0.13663787884902537, 0.11847229508858308, 0.11096593889987488]}}
+{"step": 1000, "old_prob_map": {"code": 0.2723148904404529, "math": 0.31564146378610697, "orca": 0.21837309802996122, "sharegpt": 0.1936705477434788}, "new_prob_map": {"code": 0.2722841717572351, "math": 0.3194814726996295, "orca": 0.21690109310035738, "sharegpt": 0.191333262442778}, "sim": [[0.1257504653742284, 0.12595864815133678, 0.12580714111125255, 0.12576848017026065], [0.12595864815133678, 0.1268039313992917, 0.12628966584110354, 0.12609522044107874], [0.12580714111125255, 0.12628966584110354, 0.12696985787479292, 0.12670862298735203], [0.12576848017026065, 0.12609522044107874, 0.12670862298735203, 0.12656973927211368]], "name2load": {"code": [0.14548611111111112, 0.12215277777777778, 0.12652777777777777, 0.11104166666666668, 0.11947916666666668, 0.12413194444444445, 0.13243055555555555, 0.11875], "orca": [0.15951691436209145, 0.10928300926073257, 0.12837730324921234, 0.12732711707984598, 0.11020590013684244, 0.13491709894026668, 0.1167297839162397, 0.1136428730547688], "math": [0.1587297682884833, 0.10852560633639986, 0.11265804102917303, 0.11573276922320068, 0.1213410734491071, 0.12175923648349488, 0.13651793181482758, 0.12473557337531364], "sharegpt": [0.15346160054885183, 0.1094122038823197, 0.1319514508252956, 0.12137293676096694, 0.11503188183542516, 0.13752068283627264, 0.11903728964042132, 0.11221195367044674]}}
+{"step": 1100, "old_prob_map": {"code": 0.2722841717572351, "math": 0.3194814726996295, "orca": 0.21690109310035738, "sharegpt": 0.191333262442778}, "new_prob_map": {"code": 0.2736675593567714, "math": 0.3204557297882405, "orca": 0.21683951539994323, "sharegpt": 0.18903719545504494}, "sim": [[0.12571141493055557, 0.12577790367081548, 0.12564886676249173, 0.1256420827826318], [0.12577790367081548, 0.12634154987995758, 0.12599165116469266, 0.12587112686965984], [0.12564886676249173, 0.12599165116469266, 0.1268617978511225, 0.1266110307265288], [0.1256420827826318, 0.12587112686965984, 0.1266110307265288, 0.12645934080435833]], "name2load": {"code": [0.1420486111111111, 0.12375, 0.12586805555555555, 0.11027777777777778, 0.1209375, 0.1255902777777778, 0.13534722222222223, 0.11618055555555556], "orca": [0.158450816281068, 0.11044457881169843, 0.1288864844222385, 0.12737485281481717, 0.11128791012952295, 0.13451930114884, 0.11695255067943862, 0.11208350571237627], "math": [0.15245732277266685, 0.10923894327741426, 0.11674128007084172, 0.11378954100457518, 0.12471097554976139, 0.12257096472671815, 0.13673931224479755, 0.12375166035322477], "sharegpt": [0.15234674522781388, 0.11115258888575005, 0.13157310625933252, 0.12223556237136286, 0.11503692642963802, 0.13709693692239397, 0.11942067880059731, 0.11113745510311152]}}
+{"step": 1200, "old_prob_map": {"code": 0.2736675593567714, "math": 0.3204557297882405, "orca": 0.21683951539994323, "sharegpt": 0.18903719545504494}, "new_prob_map": {"code": 0.27580974388598356, "math": 0.3222952911181522, "orca": 0.21566611167128363, "sharegpt": 0.1862288533245807}, "sim": [[0.12588290171682098, 0.1261328801867248, 0.1257449084092685, 0.1257601303158813], [0.1261328801867248, 0.12701436399988345, 0.12633952452114355, 0.12623899298926336], [0.1257449084092685, 0.12633952452114355, 0.126932493771256, 0.12671202848577792], [0.1257601303158813, 0.12623899298926336, 0.12671202848577792, 0.12658558590218905]], "name2load": {"code": [0.14434027777777778, 0.12267361111111111, 0.12274305555555555, 0.10788194444444445, 0.12180555555555556, 0.1282986111111111, 0.13565972222222222, 0.11659722222222223], "orca": [0.15932597142220664, 0.10999904528530056, 0.129013779715495, 0.1276453553129873, 0.10996722146198644, 0.13369188174267257, 0.11758902714572128, 0.11276771791363015], "math": [0.15981207261278105, 0.10670536724553549, 0.11440448664338072, 0.11076400846165199, 0.12382545382988143, 0.12453879077089586, 0.13710827962808086, 0.12284154080779258], "sharegpt": [0.15416784373864967, 0.10968461196981318, 0.13148230356350138, 0.12223051777715002, 0.11419447919609348, 0.13658238831268416, 0.11979397877234756, 0.1118638766697607]}}
+{"step": 1300, "old_prob_map": {"code": 0.27580974388598356, "math": 0.3222952911181522, "orca": 0.21566611167128363, "sharegpt": 0.1862288533245807}, "new_prob_map": {"code": 0.27666332592621296, "math": 0.32285034883570185, "orca": 0.2155287691198032, "sharegpt": 0.184957556118282}, "sim": [[0.12582070071373458, 0.12591443526808896, 0.1257385115997836, 0.12572414519351066], [0.12591443526808896, 0.1264367622335894, 0.12607809581270985, 0.12593663363183027], [0.1257385115997836, 0.12607809581270985, 0.1268740739697077, 0.12664481223383642], [0.12572414519351066, 0.12593663363183027, 0.12664481223383642, 0.12650471527925894]], "name2load": {"code": [0.14350694444444445, 0.12170138888888889, 0.12128472222222222, 0.10913194444444445, 0.12020833333333333, 0.13041666666666668, 0.1353125, 0.1184375], "orca": [0.1590713808356936, 0.11033319543009897, 0.1284886866308118, 0.128059065016071, 0.11139929351112242, 0.13307131718804696, 0.11777997008560609, 0.11179709130254908], "math": [0.15393319230580013, 0.11059182368278643, 0.11189550843705415, 0.1153146061888129, 0.12343188862104588, 0.12493235597973137, 0.13607517095488758, 0.12382545382988142], "sharegpt": [0.15327999515718954, 0.11059263892812463, 0.13096271035957865, 0.12303765285120465, 0.1143205940514145, 0.13666814641430242, 0.1194358125832358, 0.11170244965494976]}}
+{"step": 1400, "old_prob_map": {"code": 0.27666332592621296, "math": 0.32285034883570185, "orca": 0.2155287691198032, "sharegpt": 0.184957556118282}, "new_prob_map": {"code": 0.2779217751998221, "math": 0.3233091416247052, "orca": 0.21481783112919145, "sharegpt": 0.18395125204628138}, "sim": [[0.1258207561728395, 0.12601849431926887, 0.1256774280914192, 0.12570311536622633], [0.12601849431926887, 0.12665956090784278, 0.1261289811308243, 0.12604632036294167], [0.1256774280914192, 0.1261289811308243, 0.12685240757999092, 0.126695717235162], [0.12570311536622633, 0.12604632036294167, 0.126695717235162, 0.12662251517588774]], "name2load": {"code": [0.14270833333333333, 0.11940972222222221, 0.1225, 0.1076388888888889, 0.12232638888888889, 0.1287847222222222, 0.13621527777777778, 0.12041666666666667], "orca": [0.1590236451007224, 0.10963307131718804, 0.12936384177195048, 0.12710435031664702, 0.11149476498106482, 0.13260987174999203, 0.11762085096903542, 0.11314960379339974], "math": [0.15526147488562014, 0.10781226939538546, 0.11310080188911302, 0.11105918236827865, 0.12473557337531364, 0.12606385595513359, 0.13543562749052984, 0.12653121464062578], "sharegpt": [0.15414766536179828, 0.10903385931635659, 0.1319363170426571, 0.12240203398038661, 0.11416421163081641, 0.13701622341498848, 0.11928951935106338, 0.11201016990193309]}}
+{"step": 1500, "old_prob_map": {"code": 0.2779217751998221, "math": 0.3233091416247052, "orca": 0.21481783112919145, "sharegpt": 0.18395125204628138}, "new_prob_map": {"code": 0.2794000506707779, "math": 0.32403823816235056, "orca": 0.2141558894834887, "sharegpt": 0.18240582168338285}, "sim": [[0.12583274016203705, 0.12605009740055645, 0.12582082701276845, 0.12585137441370606], [0.12605009740055645, 0.1267408267893199, 0.12630800638000103, 0.12623558657906186], [0.12582082701276845, 0.12630800638000103, 0.12691902867744342, 0.1267594364818609], [0.12585137441370606, 0.12623558657906186, 0.1267594364818609, 0.1266761751407769]], "name2load": {"code": [0.14378472222222222, 0.1182986111111111, 0.12659722222222222, 0.10895833333333334, 0.12434027777777779, 0.12802083333333333, 0.13409722222222223, 0.11590277777777779], "orca": [0.1595010024504344, 0.10934665690736085, 0.1295547847118353, 0.12653152149699265, 0.1111765267479235, 0.1331986124813035, 0.11816185596537568, 0.11252903923877415], "math": [0.15666355094209672, 0.10655778029222217, 0.11602794312982732, 0.11118217149603976, 0.12367786687656812, 0.127219953756088, 0.1357553992227087, 0.12291533428444926], "sharegpt": [0.15482868558053192, 0.10909943904112354, 0.13166390895516367, 0.12214980426974456, 0.11431050486298884, 0.13694559909600873, 0.12017232333831067, 0.1108297348561282]}}
+{"step": 1600, "old_prob_map": {"code": 0.2794000506707779, "math": 0.32403823816235056, "orca": 0.2141558894834887, "sharegpt": 0.18240582168338285}, "new_prob_map": {"code": 0.2804732627789791, "math": 0.3241530780111496, "orca": 0.21407924627956532, "sharegpt": 0.18129441293030601}, "sim": [[0.12593104021990742, 0.12613996914339437, 0.12578566997546028, 0.12582446745900427], [0.12613996914339437, 0.1267514866134596, 0.1262219844422851, 0.12617636192126522], [0.12578566997546028, 0.1262219844422851, 0.12686584735491024, 0.12671370049191125], [0.12582446745900427, 0.12617636192126522, 0.12671370049191125, 0.1266354352939243]], "name2load": {"code": [0.14371527777777776, 0.11833333333333333, 0.12493055555555556, 0.10666666666666666, 0.12430555555555556, 0.12975694444444444, 0.1354861111111111, 0.11680555555555555], "orca": [0.15848264010438212, 0.10853514941285046, 0.1307640899977723, 0.12614963561722303, 0.11184482703752027, 0.13342137924450242, 0.11773223435063487, 0.11307004423511441], "math": [0.15486790967678457, 0.10518030206129778, 0.11784818222069168, 0.10872238894081762, 0.1250799429330447, 0.12795788852265458, 0.13681310572145422, 0.12353027992325477], "sharegpt": [0.15386012349166633, 0.10845373098187981, 0.13279894265305298, 0.12157472052948061, 0.11488558860325275, 0.13725331934299206, 0.1192289842205093, 0.11194459017716614]}}
+{"step": 1700, "old_prob_map": {"code": 0.2804732627789791, "math": 0.3241530780111496, "orca": 0.21407924627956532, "sharegpt": 0.18129441293030601}, "new_prob_map": {"code": 0.2813282261479211, "math": 0.32312641954300747, "orca": 0.21468698073291553, "sharegpt": 0.180858373576156}, "sim": [[0.1259065369405864, 0.126133320897766, 0.12580441674887824, 0.12582797783097024], [0.126133320897766, 0.12669842951396948, 0.126244217829939, 0.12618712353198738], [0.12580441674887824, 0.126244217829939, 0.1268750365940294, 0.12670858269224505], [0.12582797783097024, 0.12618712353198738, 0.12670858269224505, 0.1266261001785833]], "name2load": {"code": [0.14402777777777778, 0.11847222222222221, 0.12458333333333334, 0.10750000000000001, 0.12350694444444445, 0.1295138888888889, 0.13541666666666666, 0.11697916666666666], "orca": [0.15884861407249468, 0.10874200426439233, 0.1294593132418929, 0.12602234032396653, 0.11192438659580563, 0.1341055914457563, 0.11741399611749356, 0.11348375393819815], "math": [0.1546465292468146, 0.10621341073449109, 0.11816795395287057, 0.1085502041619521, 0.12471097554976142, 0.12891720371919124, 0.1362965513848576, 0.1224971712500615], "sharegpt": [0.15373400863634532, 0.10872613906937328, 0.13202711973848824, 0.121403204326244, 0.11506214940070222, 0.13804027604019536, 0.11897675450986725, 0.11203034827878448]}}
+{"step": 1800, "old_prob_map": {"code": 0.2813282261479211, "math": 0.32312641954300747, "orca": 0.21468698073291553, "sharegpt": 0.180858373576156}, "new_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "sim": [[0.1259092978395062, 0.1261164543054394, 0.12576030318379672, 0.1258058630656111], [0.1261164543054394, 0.1266795881628676, 0.12622637671543455, 0.12617597886763374], [0.12576030318379672, 0.12622637671543455, 0.12683592295496837, 0.126692137071093], [0.1258058630656111, 0.12617597886763374, 0.126692137071093, 0.12662839604000165]], "name2load": {"code": [0.14326388888888889, 0.11875, 0.1238888888888889, 0.10666666666666666, 0.12399305555555556, 0.1305902777777778, 0.13545138888888889, 0.11739583333333334], "orca": [0.15853037583935334, 0.108662444706107, 0.1288864844222385, 0.12602234032396653, 0.11216306527066162, 0.13428062247398403, 0.11741399611749356, 0.11404067084619549], "math": [0.1546465292468146, 0.10660697594332662, 0.11784818222069171, 0.10827962808087765, 0.12463718207310474, 0.1288680080680868, 0.13568160574605206, 0.12343188862104591], "sharegpt": [0.1538954356511562, 0.10891278905524839, 0.13152770491141694, 0.12142338270309536, 0.11505710480648937, 0.13823197062028328, 0.11893135316195165, 0.11202025909035876]}}
+{"step": 1900, "old_prob_map": {"code": 0.282973176307897, "math": 0.3216829267748556, "orca": 0.21494254071910104, "sharegpt": 0.18040135619814648}, "new_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "sim": [[0.1259095100308642, 0.12610713531673798, 0.12573655355065466, 0.12578400070512216], [0.12610713531673798, 0.12669060012786607, 0.12622065525322654, 0.12617461950497313], [0.12573655355065466, 0.12622065525322654, 0.12681874560502623, 0.12667662506027874], [0.12578400070512216, 0.12617461950497313, 0.12667662506027874, 0.12661365756551973]], "name2load": {"code": [0.14274305555555555, 0.11902777777777779, 0.1240625, 0.10652777777777778, 0.12375, 0.130625, 0.13607638888888887, 0.1171875], "orca": [0.15827578525284028, 0.10906024249753364, 0.1293956655952646, 0.12605416414728066, 0.11170161983260668, 0.13401011997581389, 0.11773223435063487, 0.11377016834802532], "math": [0.1548925075023368, 0.10665617159443104, 0.11755300831406501, 0.10862399763860874, 0.12451419294534362, 0.12881881241698234, 0.13587838835046978, 0.12306292123776257], "sharegpt": [0.15360789378102424, 0.1088421647362686, 0.13162859679567376, 0.12163021106582185, 0.11487045482061423, 0.13820170305500626, 0.11918862746680656, 0.11203034827878446]}}
+{"step": 2000, "old_prob_map": {"code": 0.28506630267298727, "math": 0.32022976576466383, "orca": 0.21494599194174854, "sharegpt": 0.17975793962060052}, "new_prob_map": {"code": 0.2867481424046919, "math": 0.31854989012606805, "orca": 0.2152416645459714, "sharegpt": 0.1794603029232688}, "sim": [[0.12588450038580248, 0.12610842670257952, 0.12573015618867284, 0.12577819013832278], [0.12610842670257952, 0.12667020379049795, 0.12621567588243646, 0.1261686910458374], [0.12573015618867284, 0.12621567588243646, 0.12683392023051163, 0.12669522454319626], [0.12577819013832278, 0.1261686910458374, 0.12669522454319626, 0.12663247442717893]], "name2load": {"code": [0.1425, 0.11784722222222221, 0.12364583333333334, 0.10694444444444444, 0.12427083333333333, 0.1305902777777778, 0.13600694444444444, 0.11819444444444444], "orca": [0.15818031378289787, 0.10869426852942113, 0.1293956655952646, 0.1262769309104796, 0.11186073894917735, 0.13436018203226935, 0.11782770582057728, 0.11340419437991281], "math": [0.15474492054902347, 0.10672996507108772, 0.11779898656958725, 0.10882078024302652, 0.12458798642200031, 0.12842524720814683, 0.13602597530378316, 0.12286613863334482], "sharegpt": [0.15376427620162234, 0.10894810121473827, 0.13194640623108275, 0.12141833810888251, 0.11465858186367489, 0.13821683683764477, 0.11906755720569838, 0.11197990233665604]}}
diff --git a/sbatch.sh b/sbatch.sh new file mode 100644 index 0000000000000000000000000000000000000000..b6d768922ddc37ba312c356013fbbb2af334bfda --- /dev/null +++ b/sbatch.sh @@ -0,0 +1,93 @@ +#!/usr/bin/bash + +#SBATCH --job-name=moe_sft +#SBATCH --output=logs/%x-%j.log +#SBATCH --error=logs/%x-%j.log + +#SBATCH --partition=MoE +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=16 +#SBATCH --mem=64G + +#SBATCH --nodes=1 +#SBATCH --gres=gpu:4 +#SBATCH --quotatype=auto + +export WANDB_PROJECT="adaptive-moe-sft" +num_gpus=4 + +{ + task_name="llama_moe_four_mix_freeze_gate_100" + model_type="auto" + model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" + dataset_dir_or_path="data/four_types_mix/train" + eval_data_dir="data/four_types_mix/dev" + + comment=$task_name + base_dir="outputs/dynamic_eval_interval_20" + output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID" + mkdir -p $output_dir + scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh + git diff > $output_dir/diff.patch + env > $output_dir/env + echo -e "Job ID: ${SLURM_JOB_ID}\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt + echo "$SLURM_JOB_ID" > $base_dir/latest.jobid + ln -snf $output_dir $base_dir/latest.dir + ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log + + nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) + nodes_array=($nodes) + head_node=${nodes_array[0]} + echo "Node: $head_node" + + torchrun \ + --nnodes 1 \ + --nproc_per_node $num_gpus \ + --node_rank $SLURM_NODEID \ + --rdzv_id $RANDOM \ + --rdzv_backend c10d \ + --rdzv_endpoint $head_node:29522 \ + -m src.core.train \ + --do_train \ + --do_eval \ + --freeze_gate True \ + --eval_data_dir $eval_data_dir \ + --evaluation_strategy steps \ + --eval_steps 100 \ + --max_eval_steps 5 \ + --dynamic_sampling_criterion mean \ + --run_name $task_name \ + --model_type $model_type \ + --model_name_or_path $model_name_or_path \ + --dataset_dir_or_path $dataset_dir_or_path \ + --output_dir $output_dir \ + --deepspeed conf/ds_bf16_zero1.json \ + --bf16 True \ + --tf32 True \ + --torch_dtype bfloat16 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 8 \ + --max_steps 2000 \ + --save_strategy steps \ + --save_steps 9999999999999 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type cosine \ + --logging_steps 1 \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --report_to wandb + + python -m src.eval.gen_mt_ans \ + --model-path $output_dir \ + --model-id $task_name + + python -m src.eval.gen_alpaca_eval_ans \ + --model-path $output_dir \ + --model-id $task_name +} + +# nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 --gres=gpu:4 bash "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/scripts/one_data_steps_dynamic.sh" "llama_moe_orca_epochs_cluster_4" "auto" "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" "data/open_orca_clustered/4" "data/open_orca_clustered_eval/4" 1>logs/llama_moe_orca_cluster_4_dynamic.log 2>&1 &
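Note: the launch flags above pin down the effective batch size and explain why global_step 2000 lands exactly at epoch 1.0 in trainer_state.json below. A quick back-of-the-envelope check, assuming the single node with 4 GPUs that the script requests:

# Arithmetic implied by the flags above (assumes 1 node x 4 GPUs, no skipped steps).
per_device_train_batch_size = 4
num_gpus = 4
gradient_accumulation_steps = 8
effective_batch = per_device_train_batch_size * num_gpus * gradient_accumulation_steps
print(effective_batch)          # 128 sequences per optimizer step
print(effective_batch * 2000)   # 256000 sequences ~= one pass over the training mix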
diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..14761dcf1466dc232bd41de9c21d4c617b15755e --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<unk>", + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a927b63b5663a28db1959828168610a93b3b9f67 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,43 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "<unk>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": true +}
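Note: the special-token strings above were stripped to empty strings in the original dump; they are restored here from the standard LlamaTokenizer vocabulary (ids 0/1/2 are <unk>/<s>/</s>), with the pad token inferred from "pad_token_id": 0 in config.json. A minimal sanity check, assuming this repo is the working directory and transformers plus sentencepiece are installed:

# Sketch: load the tokenizer defined by the three files above and confirm
# the special-token setup. "." is a placeholder for the checkout path.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.eos_token, tok.unk_token, tok.pad_token)  # <s> </s> <unk> <unk>
print(tok.model_max_length, tok.padding_side)                      # 2048 right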
"use_fast": true +} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1ebbeebefa9083fb9e9971d69516859d7072dc8a --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,13470 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.9672, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 6.666666666666667e-07, + "loss": 1.101, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0874, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.0302, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.0754, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.0552, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 2.3333333333333336e-06, + "loss": 1.056, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 2.666666666666667e-06, + "loss": 1.0807, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 3e-06, + "loss": 0.9727, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.0609, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 3.6666666666666666e-06, + "loss": 1.0426, + "step": 11 + }, + { + "epoch": 0.01, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9514, + "step": 12 + }, + { + "epoch": 0.01, + "learning_rate": 4.333333333333334e-06, + "loss": 0.946, + "step": 13 + }, + { + "epoch": 0.01, + "learning_rate": 4.666666666666667e-06, + "loss": 0.9148, + "step": 14 + }, + { + "epoch": 0.01, + "learning_rate": 5e-06, + "loss": 0.9166, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 5.333333333333334e-06, + "loss": 0.8649, + "step": 16 + }, + { + "epoch": 0.01, + "learning_rate": 5.666666666666667e-06, + "loss": 0.9305, + "step": 17 + }, + { + "epoch": 0.01, + "learning_rate": 6e-06, + "loss": 0.8224, + "step": 18 + }, + { + "epoch": 0.01, + "learning_rate": 6.333333333333333e-06, + "loss": 0.8195, + "step": 19 + }, + { + "epoch": 0.01, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8794, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 7e-06, + "loss": 0.7984, + "step": 21 + }, + { + "epoch": 0.01, + "learning_rate": 7.333333333333333e-06, + "loss": 0.8589, + "step": 22 + }, + { + "epoch": 0.01, + "learning_rate": 7.666666666666667e-06, + "loss": 0.7756, + "step": 23 + }, + { + "epoch": 0.01, + "learning_rate": 8.000000000000001e-06, + "loss": 0.8005, + "step": 24 + }, + { + "epoch": 0.01, + "learning_rate": 8.333333333333334e-06, + "loss": 0.8899, + "step": 25 + }, + { + "epoch": 0.01, + "learning_rate": 8.666666666666668e-06, + "loss": 0.8451, + "step": 26 + }, + { + "epoch": 0.01, + "learning_rate": 9e-06, + "loss": 0.8091, + "step": 27 + }, + { + "epoch": 0.01, + "learning_rate": 9.333333333333334e-06, + "loss": 0.8062, + "step": 28 + }, + { + "epoch": 0.01, + "learning_rate": 9.666666666666667e-06, + "loss": 0.8387, + "step": 29 + }, + { + "epoch": 0.01, + "learning_rate": 1e-05, + "loss": 0.774, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 1.0333333333333335e-05, + "loss": 0.7554, + "step": 31 + }, + { + "epoch": 0.02, + "learning_rate": 1.0666666666666667e-05, + "loss": 0.8233, + "step": 32 + }, + { + "epoch": 
0.02, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.7762, + "step": 33 + }, + { + "epoch": 0.02, + "learning_rate": 1.1333333333333334e-05, + "loss": 0.8007, + "step": 34 + }, + { + "epoch": 0.02, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.7668, + "step": 35 + }, + { + "epoch": 0.02, + "learning_rate": 1.2e-05, + "loss": 0.7904, + "step": 36 + }, + { + "epoch": 0.02, + "learning_rate": 1.2333333333333334e-05, + "loss": 0.824, + "step": 37 + }, + { + "epoch": 0.02, + "learning_rate": 1.2666666666666667e-05, + "loss": 0.7715, + "step": 38 + }, + { + "epoch": 0.02, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.7424, + "step": 39 + }, + { + "epoch": 0.02, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.7362, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 1.3666666666666667e-05, + "loss": 0.7583, + "step": 41 + }, + { + "epoch": 0.02, + "learning_rate": 1.4e-05, + "loss": 0.8013, + "step": 42 + }, + { + "epoch": 0.02, + "learning_rate": 1.4333333333333334e-05, + "loss": 0.7942, + "step": 43 + }, + { + "epoch": 0.02, + "learning_rate": 1.4666666666666666e-05, + "loss": 0.7419, + "step": 44 + }, + { + "epoch": 0.02, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.7636, + "step": 45 + }, + { + "epoch": 0.02, + "learning_rate": 1.5333333333333334e-05, + "loss": 0.8152, + "step": 46 + }, + { + "epoch": 0.02, + "learning_rate": 1.5666666666666667e-05, + "loss": 0.7442, + "step": 47 + }, + { + "epoch": 0.02, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.7432, + "step": 48 + }, + { + "epoch": 0.02, + "learning_rate": 1.6333333333333335e-05, + "loss": 0.7055, + "step": 49 + }, + { + "epoch": 0.03, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.7479, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 1.7e-05, + "loss": 0.7984, + "step": 51 + }, + { + "epoch": 0.03, + "learning_rate": 1.7333333333333336e-05, + "loss": 0.7365, + "step": 52 + }, + { + "epoch": 0.03, + "learning_rate": 1.7666666666666668e-05, + "loss": 0.7525, + "step": 53 + }, + { + "epoch": 0.03, + "learning_rate": 1.8e-05, + "loss": 0.7407, + "step": 54 + }, + { + "epoch": 0.03, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.798, + "step": 55 + }, + { + "epoch": 0.03, + "learning_rate": 1.866666666666667e-05, + "loss": 0.7416, + "step": 56 + }, + { + "epoch": 0.03, + "learning_rate": 1.9e-05, + "loss": 0.8083, + "step": 57 + }, + { + "epoch": 0.03, + "learning_rate": 1.9333333333333333e-05, + "loss": 0.7662, + "step": 58 + }, + { + "epoch": 0.03, + "learning_rate": 1.9666666666666666e-05, + "loss": 0.7259, + "step": 59 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999986888082895e-05, + "loss": 0.6824, + "step": 61 + }, + { + "epoch": 0.03, + "learning_rate": 1.999994755236596e-05, + "loss": 0.7889, + "step": 62 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999881992952353e-05, + "loss": 0.7583, + "step": 63 + }, + { + "epoch": 0.03, + "learning_rate": 1.999979021001399e-05, + "loss": 0.7744, + "step": 64 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999672203791564e-05, + "loss": 0.756, + "step": 65 + }, + { + "epoch": 0.03, + "learning_rate": 1.999952797459453e-05, + "loss": 0.7075, + "step": 66 + }, + { + "epoch": 0.03, + "learning_rate": 1.9999357522801125e-05, + "loss": 0.7573, + "step": 67 + }, + { + "epoch": 0.03, + "learning_rate": 1.999916084885832e-05, + "loss": 0.6942, + "step": 68 + }, + { + "epoch": 0.03, + "learning_rate": 1.999893795328188e-05, + 
"loss": 0.7428, + "step": 69 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998688836656322e-05, + "loss": 0.7451, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998413499634927e-05, + "loss": 0.7405, + "step": 71 + }, + { + "epoch": 0.04, + "learning_rate": 1.9998111942939727e-05, + "loss": 0.7407, + "step": 72 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997784167361526e-05, + "loss": 0.7318, + "step": 73 + }, + { + "epoch": 0.04, + "learning_rate": 1.9997430173759876e-05, + "loss": 0.7698, + "step": 74 + }, + { + "epoch": 0.04, + "learning_rate": 1.999704996306308e-05, + "loss": 0.7822, + "step": 75 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996643536268202e-05, + "loss": 0.8002, + "step": 76 + }, + { + "epoch": 0.04, + "learning_rate": 1.9996210894441047e-05, + "loss": 0.7528, + "step": 77 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995752038716166e-05, + "loss": 0.7819, + "step": 78 + }, + { + "epoch": 0.04, + "learning_rate": 1.9995266970296856e-05, + "loss": 0.7695, + "step": 79 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994755690455154e-05, + "loss": 0.7493, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 1.9994218200531823e-05, + "loss": 0.7173, + "step": 81 + }, + { + "epoch": 0.04, + "learning_rate": 1.999365450193638e-05, + "loss": 0.7805, + "step": 82 + }, + { + "epoch": 0.04, + "learning_rate": 1.999306459614705e-05, + "loss": 0.7364, + "step": 83 + }, + { + "epoch": 0.04, + "learning_rate": 1.99924484847108e-05, + "loss": 0.7128, + "step": 84 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991806169243302e-05, + "loss": 0.7227, + "step": 85 + }, + { + "epoch": 0.04, + "learning_rate": 1.9991137651428957e-05, + "loss": 0.8038, + "step": 86 + }, + { + "epoch": 0.04, + "learning_rate": 1.999044293302088e-05, + "loss": 0.8071, + "step": 87 + }, + { + "epoch": 0.04, + "learning_rate": 1.998972201584088e-05, + "loss": 0.7258, + "step": 88 + }, + { + "epoch": 0.04, + "learning_rate": 1.9988974901779482e-05, + "loss": 0.7094, + "step": 89 + }, + { + "epoch": 0.04, + "learning_rate": 1.998820159279591e-05, + "loss": 0.7975, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 1.998740209091807e-05, + "loss": 0.7445, + "step": 91 + }, + { + "epoch": 0.05, + "learning_rate": 1.9986576398242566e-05, + "loss": 0.7527, + "step": 92 + }, + { + "epoch": 0.05, + "learning_rate": 1.998572451693468e-05, + "loss": 0.7194, + "step": 93 + }, + { + "epoch": 0.05, + "learning_rate": 1.998484644922837e-05, + "loss": 0.8186, + "step": 94 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983942197426272e-05, + "loss": 0.7771, + "step": 95 + }, + { + "epoch": 0.05, + "learning_rate": 1.9983011763899674e-05, + "loss": 0.7875, + "step": 96 + }, + { + "epoch": 0.05, + "learning_rate": 1.998205515108853e-05, + "loss": 0.767, + "step": 97 + }, + { + "epoch": 0.05, + "learning_rate": 1.998107236150145e-05, + "loss": 0.7356, + "step": 98 + }, + { + "epoch": 0.05, + "learning_rate": 1.9980063397715685e-05, + "loss": 0.7307, + "step": 99 + }, + { + "epoch": 0.05, + "learning_rate": 1.997902826237712e-05, + "loss": 0.712, + "step": 100 + }, + { + "epoch": 0.05, + "eval_code_gate_load": [ + 199.4, + 191.5, + 181.9, + 145.3, + 177.5, + 178.75, + 187.95, + 177.7 + ], + "eval_code_loss": 0.5201171636581421, + "eval_code_runtime": 1.7839, + "eval_code_samples_per_second": 560.56, + "eval_code_steps_per_second": 35.315, + "step": 100 + }, + { + "epoch": 0.05, + "eval_orca_gate_load": [ + 500.1, + 352.35, + 397.35, + 384.15, + 349.8, + 443.1, + 371.2, + 344.25 + ], + 
"eval_orca_loss": 0.7696288824081421, + "eval_orca_runtime": 2.0215, + "eval_orca_samples_per_second": 494.677, + "eval_orca_steps_per_second": 31.165, + "step": 100 + }, + { + "epoch": 0.05, + "eval_math_gate_load": [ + 320.6, + 221.3, + 228.4, + 217.5, + 248.3, + 243.2, + 282.25, + 271.15 + ], + "eval_math_loss": 0.716601550579071, + "eval_math_runtime": 1.8689, + "eval_math_samples_per_second": 535.077, + "eval_math_steps_per_second": 33.71, + "step": 100 + }, + { + "epoch": 0.05, + "eval_sharegpt_gate_load": [ + 1524.7, + 1115.1, + 1297.45, + 1170.1, + 1142.65, + 1383.85, + 1207.25, + 1070.5 + ], + "eval_sharegpt_loss": 0.7251952886581421, + "eval_sharegpt_runtime": 2.996, + "eval_sharegpt_samples_per_second": 333.775, + "eval_sharegpt_steps_per_second": 21.028, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 1.9977966958200276e-05, + "loss": 0.7711, + "step": 101 + }, + { + "epoch": 0.05, + "learning_rate": 1.997687948796831e-05, + "loss": 0.7064, + "step": 102 + }, + { + "epoch": 0.05, + "learning_rate": 1.9975765854532974e-05, + "loss": 0.7409, + "step": 103 + }, + { + "epoch": 0.05, + "learning_rate": 1.997462606081465e-05, + "loss": 0.7148, + "step": 104 + }, + { + "epoch": 0.05, + "learning_rate": 1.9973460109802306e-05, + "loss": 0.7689, + "step": 105 + }, + { + "epoch": 0.05, + "learning_rate": 1.997226800455352e-05, + "loss": 0.7637, + "step": 106 + }, + { + "epoch": 0.05, + "learning_rate": 1.9971049748194448e-05, + "loss": 0.7224, + "step": 107 + }, + { + "epoch": 0.05, + "learning_rate": 1.9969805343919822e-05, + "loss": 0.7147, + "step": 108 + }, + { + "epoch": 0.05, + "learning_rate": 1.9968534794992947e-05, + "loss": 0.6927, + "step": 109 + }, + { + "epoch": 0.06, + "learning_rate": 1.9967238104745695e-05, + "loss": 0.7395, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 1.996591527657848e-05, + "loss": 0.6984, + "step": 111 + }, + { + "epoch": 0.06, + "learning_rate": 1.9964566313960265e-05, + "loss": 0.7243, + "step": 112 + }, + { + "epoch": 0.06, + "learning_rate": 1.9963191220428552e-05, + "loss": 0.8089, + "step": 113 + }, + { + "epoch": 0.06, + "learning_rate": 1.9961789999589357e-05, + "loss": 0.7965, + "step": 114 + }, + { + "epoch": 0.06, + "learning_rate": 1.996036265511722e-05, + "loss": 0.7219, + "step": 115 + }, + { + "epoch": 0.06, + "learning_rate": 1.995890919075519e-05, + "loss": 0.7092, + "step": 116 + }, + { + "epoch": 0.06, + "learning_rate": 1.9957429610314797e-05, + "loss": 0.7492, + "step": 117 + }, + { + "epoch": 0.06, + "learning_rate": 1.995592391767608e-05, + "loss": 0.7885, + "step": 118 + }, + { + "epoch": 0.06, + "learning_rate": 1.995439211678754e-05, + "loss": 0.7533, + "step": 119 + }, + { + "epoch": 0.06, + "learning_rate": 1.995283421166614e-05, + "loss": 0.6916, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 1.995125020639731e-05, + "loss": 0.6982, + "step": 121 + }, + { + "epoch": 0.06, + "learning_rate": 1.994964010513492e-05, + "loss": 0.7155, + "step": 122 + }, + { + "epoch": 0.06, + "learning_rate": 1.9948003912101274e-05, + "loss": 0.7385, + "step": 123 + }, + { + "epoch": 0.06, + "learning_rate": 1.9946341631587086e-05, + "loss": 0.7199, + "step": 124 + }, + { + "epoch": 0.06, + "learning_rate": 1.9944653267951507e-05, + "loss": 0.68, + "step": 125 + }, + { + "epoch": 0.06, + "learning_rate": 1.9942938825622064e-05, + "loss": 0.8073, + "step": 126 + }, + { + "epoch": 0.06, + "learning_rate": 1.994119830909469e-05, + "loss": 0.705, + "step": 127 + }, + { + "epoch": 0.06, + "learning_rate": 
1.9939431722933678e-05, + "loss": 0.7802, + "step": 128 + }, + { + "epoch": 0.06, + "learning_rate": 1.9937639071771704e-05, + "loss": 0.7184, + "step": 129 + }, + { + "epoch": 0.07, + "learning_rate": 1.993582036030978e-05, + "loss": 0.7413, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 1.9933975593317263e-05, + "loss": 0.7346, + "step": 131 + }, + { + "epoch": 0.07, + "learning_rate": 1.9932104775631847e-05, + "loss": 0.7115, + "step": 132 + }, + { + "epoch": 0.07, + "learning_rate": 1.993020791215953e-05, + "loss": 0.713, + "step": 133 + }, + { + "epoch": 0.07, + "learning_rate": 1.992828500787461e-05, + "loss": 0.7238, + "step": 134 + }, + { + "epoch": 0.07, + "learning_rate": 1.9926336067819686e-05, + "loss": 0.7186, + "step": 135 + }, + { + "epoch": 0.07, + "learning_rate": 1.9924361097105624e-05, + "loss": 0.7158, + "step": 136 + }, + { + "epoch": 0.07, + "learning_rate": 1.9922360100911553e-05, + "loss": 0.7106, + "step": 137 + }, + { + "epoch": 0.07, + "learning_rate": 1.992033308448486e-05, + "loss": 0.7274, + "step": 138 + }, + { + "epoch": 0.07, + "learning_rate": 1.9918280053141144e-05, + "loss": 0.738, + "step": 139 + }, + { + "epoch": 0.07, + "learning_rate": 1.9916201012264255e-05, + "loss": 0.7267, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 1.9914095967306224e-05, + "loss": 0.734, + "step": 141 + }, + { + "epoch": 0.07, + "learning_rate": 1.9911964923787295e-05, + "loss": 0.7932, + "step": 142 + }, + { + "epoch": 0.07, + "learning_rate": 1.990980788729588e-05, + "loss": 0.7568, + "step": 143 + }, + { + "epoch": 0.07, + "learning_rate": 1.990762486348855e-05, + "loss": 0.7244, + "step": 144 + }, + { + "epoch": 0.07, + "learning_rate": 1.9905415858090036e-05, + "loss": 0.7349, + "step": 145 + }, + { + "epoch": 0.07, + "learning_rate": 1.9903180876893195e-05, + "loss": 0.674, + "step": 146 + }, + { + "epoch": 0.07, + "learning_rate": 1.9900919925759e-05, + "loss": 0.7401, + "step": 147 + }, + { + "epoch": 0.07, + "learning_rate": 1.989863301061654e-05, + "loss": 0.7278, + "step": 148 + }, + { + "epoch": 0.07, + "learning_rate": 1.9896320137462984e-05, + "loss": 0.7734, + "step": 149 + }, + { + "epoch": 0.07, + "learning_rate": 1.9893981312363563e-05, + "loss": 0.6926, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 1.989161654145158e-05, + "loss": 0.7339, + "step": 151 + }, + { + "epoch": 0.08, + "learning_rate": 1.9889225830928365e-05, + "loss": 0.7335, + "step": 152 + }, + { + "epoch": 0.08, + "learning_rate": 1.9886809187063285e-05, + "loss": 0.6829, + "step": 153 + }, + { + "epoch": 0.08, + "learning_rate": 1.9884366616193707e-05, + "loss": 0.6921, + "step": 154 + }, + { + "epoch": 0.08, + "learning_rate": 1.988189812472498e-05, + "loss": 0.7695, + "step": 155 + }, + { + "epoch": 0.08, + "learning_rate": 1.987940371913044e-05, + "loss": 0.7079, + "step": 156 + }, + { + "epoch": 0.08, + "learning_rate": 1.9876883405951378e-05, + "loss": 0.7568, + "step": 157 + }, + { + "epoch": 0.08, + "learning_rate": 1.987433719179702e-05, + "loss": 0.7084, + "step": 158 + }, + { + "epoch": 0.08, + "learning_rate": 1.987176508334451e-05, + "loss": 0.7419, + "step": 159 + }, + { + "epoch": 0.08, + "learning_rate": 1.9869167087338908e-05, + "loss": 0.7072, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 1.9866543210593154e-05, + "loss": 0.769, + "step": 161 + }, + { + "epoch": 0.08, + "learning_rate": 1.986389345998806e-05, + "loss": 0.7518, + "step": 162 + }, + { + "epoch": 0.08, + "learning_rate": 1.986121784247229e-05, + "loss": 0.7457, 
+ "step": 163 + }, + { + "epoch": 0.08, + "learning_rate": 1.9858516365062334e-05, + "loss": 0.7448, + "step": 164 + }, + { + "epoch": 0.08, + "learning_rate": 1.9855789034842504e-05, + "loss": 0.7529, + "step": 165 + }, + { + "epoch": 0.08, + "learning_rate": 1.9853035858964907e-05, + "loss": 0.6531, + "step": 166 + }, + { + "epoch": 0.08, + "learning_rate": 1.9850256844649422e-05, + "loss": 0.7162, + "step": 167 + }, + { + "epoch": 0.08, + "learning_rate": 1.9847451999183692e-05, + "loss": 0.6711, + "step": 168 + }, + { + "epoch": 0.08, + "learning_rate": 1.98446213299231e-05, + "loss": 0.687, + "step": 169 + }, + { + "epoch": 0.09, + "learning_rate": 1.9841764844290744e-05, + "loss": 0.7153, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 1.9838882549777426e-05, + "loss": 0.7152, + "step": 171 + }, + { + "epoch": 0.09, + "learning_rate": 1.9835974453941623e-05, + "loss": 0.7557, + "step": 172 + }, + { + "epoch": 0.09, + "learning_rate": 1.983304056440948e-05, + "loss": 0.7054, + "step": 173 + }, + { + "epoch": 0.09, + "learning_rate": 1.983008088887478e-05, + "loss": 0.6988, + "step": 174 + }, + { + "epoch": 0.09, + "learning_rate": 1.9827095435098926e-05, + "loss": 0.6721, + "step": 175 + }, + { + "epoch": 0.09, + "learning_rate": 1.9824084210910924e-05, + "loss": 0.7319, + "step": 176 + }, + { + "epoch": 0.09, + "learning_rate": 1.9821047224207362e-05, + "loss": 0.6729, + "step": 177 + }, + { + "epoch": 0.09, + "learning_rate": 1.9817984482952378e-05, + "loss": 0.7491, + "step": 178 + }, + { + "epoch": 0.09, + "learning_rate": 1.9814895995177653e-05, + "loss": 0.7494, + "step": 179 + }, + { + "epoch": 0.09, + "learning_rate": 1.9811781768982392e-05, + "loss": 0.7351, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 1.9808641812533286e-05, + "loss": 0.7003, + "step": 181 + }, + { + "epoch": 0.09, + "learning_rate": 1.980547613406451e-05, + "loss": 0.7291, + "step": 182 + }, + { + "epoch": 0.09, + "learning_rate": 1.9802284741877674e-05, + "loss": 0.7574, + "step": 183 + }, + { + "epoch": 0.09, + "learning_rate": 1.9799067644341844e-05, + "loss": 0.6798, + "step": 184 + }, + { + "epoch": 0.09, + "learning_rate": 1.9795824849893483e-05, + "loss": 0.7454, + "step": 185 + }, + { + "epoch": 0.09, + "learning_rate": 1.9792556367036432e-05, + "loss": 0.6889, + "step": 186 + }, + { + "epoch": 0.09, + "learning_rate": 1.9789262204341918e-05, + "loss": 0.7128, + "step": 187 + }, + { + "epoch": 0.09, + "learning_rate": 1.978594237044849e-05, + "loss": 0.6948, + "step": 188 + }, + { + "epoch": 0.09, + "learning_rate": 1.9782596874062028e-05, + "loss": 0.733, + "step": 189 + }, + { + "epoch": 0.1, + "learning_rate": 1.977922572395571e-05, + "loss": 0.7165, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 1.9775828928969976e-05, + "loss": 0.7471, + "step": 191 + }, + { + "epoch": 0.1, + "learning_rate": 1.977240649801253e-05, + "loss": 0.7039, + "step": 192 + }, + { + "epoch": 0.1, + "learning_rate": 1.97689584400583e-05, + "loss": 0.6682, + "step": 193 + }, + { + "epoch": 0.1, + "learning_rate": 1.9765484764149413e-05, + "loss": 0.7194, + "step": 194 + }, + { + "epoch": 0.1, + "learning_rate": 1.976198547939518e-05, + "loss": 0.6918, + "step": 195 + }, + { + "epoch": 0.1, + "learning_rate": 1.9758460594972068e-05, + "loss": 0.7341, + "step": 196 + }, + { + "epoch": 0.1, + "learning_rate": 1.9754910120123675e-05, + "loss": 0.7172, + "step": 197 + }, + { + "epoch": 0.1, + "learning_rate": 1.9751334064160708e-05, + "loss": 0.6916, + "step": 198 + }, + { + "epoch": 0.1, + 
"learning_rate": 1.9747732436460955e-05, + "loss": 0.7046, + "step": 199 + }, + { + "epoch": 0.1, + "learning_rate": 1.9744105246469264e-05, + "loss": 0.7297, + "step": 200 + }, + { + "epoch": 0.1, + "eval_code_gate_load": [ + 202.15, + 184.0, + 181.85, + 156.2, + 180.4, + 177.6, + 186.5, + 171.3 + ], + "eval_code_loss": 0.5015624761581421, + "eval_code_runtime": 1.8076, + "eval_code_samples_per_second": 553.219, + "eval_code_steps_per_second": 34.853, + "step": 200 + }, + { + "epoch": 0.1, + "eval_orca_gate_load": [ + 500.1, + 348.05, + 396.7, + 392.0, + 354.8, + 437.65, + 368.05, + 344.95 + ], + "eval_orca_loss": 0.7518554925918579, + "eval_orca_runtime": 2.0082, + "eval_orca_samples_per_second": 497.958, + "eval_orca_steps_per_second": 31.371, + "step": 200 + }, + { + "epoch": 0.1, + "eval_math_gate_load": [ + 319.05, + 220.35, + 222.35, + 231.65, + 255.05, + 239.45, + 280.9, + 263.9 + ], + "eval_math_loss": 0.6756836175918579, + "eval_math_runtime": 1.8563, + "eval_math_samples_per_second": 538.698, + "eval_math_steps_per_second": 33.938, + "step": 200 + }, + { + "epoch": 0.1, + "eval_sharegpt_gate_load": [ + 1522.6, + 1108.8, + 1288.45, + 1197.4, + 1139.0, + 1373.8, + 1198.4, + 1083.15 + ], + "eval_sharegpt_loss": 0.7212890386581421, + "eval_sharegpt_runtime": 3.0067, + "eval_sharegpt_samples_per_second": 332.594, + "eval_sharegpt_steps_per_second": 20.953, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 1.9740452503697518e-05, + "loss": 0.7521, + "step": 201 + }, + { + "epoch": 0.1, + "learning_rate": 1.9736774217724614e-05, + "loss": 0.716, + "step": 202 + }, + { + "epoch": 0.1, + "learning_rate": 1.9733070398196423e-05, + "loss": 0.7111, + "step": 203 + }, + { + "epoch": 0.1, + "learning_rate": 1.9729341054825783e-05, + "loss": 0.7261, + "step": 204 + }, + { + "epoch": 0.1, + "learning_rate": 1.972558619739246e-05, + "loss": 0.7397, + "step": 205 + }, + { + "epoch": 0.1, + "learning_rate": 1.972180583574313e-05, + "loss": 0.7313, + "step": 206 + }, + { + "epoch": 0.1, + "learning_rate": 1.9717999979791356e-05, + "loss": 0.6935, + "step": 207 + }, + { + "epoch": 0.1, + "learning_rate": 1.9714168639517543e-05, + "loss": 0.7706, + "step": 208 + }, + { + "epoch": 0.1, + "learning_rate": 1.9710311824968942e-05, + "loss": 0.7282, + "step": 209 + }, + { + "epoch": 0.1, + "learning_rate": 1.9706429546259592e-05, + "loss": 0.6551, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 1.9702521813570322e-05, + "loss": 0.6824, + "step": 211 + }, + { + "epoch": 0.11, + "learning_rate": 1.9698588637148705e-05, + "loss": 0.7006, + "step": 212 + }, + { + "epoch": 0.11, + "learning_rate": 1.9694630027309035e-05, + "loss": 0.734, + "step": 213 + }, + { + "epoch": 0.11, + "learning_rate": 1.9690645994432307e-05, + "loss": 0.6711, + "step": 214 + }, + { + "epoch": 0.11, + "learning_rate": 1.9686636548966177e-05, + "loss": 0.7231, + "step": 215 + }, + { + "epoch": 0.11, + "learning_rate": 1.9682601701424958e-05, + "loss": 0.7045, + "step": 216 + }, + { + "epoch": 0.11, + "learning_rate": 1.9678541462389564e-05, + "loss": 0.7036, + "step": 217 + }, + { + "epoch": 0.11, + "learning_rate": 1.9674455842507494e-05, + "loss": 0.6919, + "step": 218 + }, + { + "epoch": 0.11, + "learning_rate": 1.9670344852492814e-05, + "loss": 0.7436, + "step": 219 + }, + { + "epoch": 0.11, + "learning_rate": 1.9666208503126115e-05, + "loss": 0.7014, + "step": 220 + }, + { + "epoch": 0.11, + "learning_rate": 1.966204680525449e-05, + "loss": 0.7137, + "step": 221 + }, + { + "epoch": 0.11, + "learning_rate": 
1.9657859769791506e-05, + "loss": 0.6578, + "step": 222 + }, + { + "epoch": 0.11, + "learning_rate": 1.965364740771718e-05, + "loss": 0.6613, + "step": 223 + }, + { + "epoch": 0.11, + "learning_rate": 1.9649409730077934e-05, + "loss": 0.748, + "step": 224 + }, + { + "epoch": 0.11, + "learning_rate": 1.964514674798659e-05, + "loss": 0.7396, + "step": 225 + }, + { + "epoch": 0.11, + "learning_rate": 1.9640858472622316e-05, + "loss": 0.6985, + "step": 226 + }, + { + "epoch": 0.11, + "learning_rate": 1.963654491523062e-05, + "loss": 0.7084, + "step": 227 + }, + { + "epoch": 0.11, + "learning_rate": 1.9632206087123296e-05, + "loss": 0.6876, + "step": 228 + }, + { + "epoch": 0.11, + "learning_rate": 1.9627841999678422e-05, + "loss": 0.719, + "step": 229 + }, + { + "epoch": 0.12, + "learning_rate": 1.9623452664340305e-05, + "loss": 0.6693, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 1.9619038092619465e-05, + "loss": 0.722, + "step": 231 + }, + { + "epoch": 0.12, + "learning_rate": 1.9614598296092603e-05, + "loss": 0.7495, + "step": 232 + }, + { + "epoch": 0.12, + "learning_rate": 1.9610133286402565e-05, + "loss": 0.653, + "step": 233 + }, + { + "epoch": 0.12, + "learning_rate": 1.9605643075258323e-05, + "loss": 0.7022, + "step": 234 + }, + { + "epoch": 0.12, + "learning_rate": 1.960112767443493e-05, + "loss": 0.7374, + "step": 235 + }, + { + "epoch": 0.12, + "learning_rate": 1.9596587095773496e-05, + "loss": 0.6933, + "step": 236 + }, + { + "epoch": 0.12, + "learning_rate": 1.9592021351181163e-05, + "loss": 0.6685, + "step": 237 + }, + { + "epoch": 0.12, + "learning_rate": 1.958743045263106e-05, + "loss": 0.7344, + "step": 238 + }, + { + "epoch": 0.12, + "learning_rate": 1.9582814412162288e-05, + "loss": 0.6979, + "step": 239 + }, + { + "epoch": 0.12, + "learning_rate": 1.957817324187987e-05, + "loss": 0.6684, + "step": 240 + }, + { + "epoch": 0.12, + "learning_rate": 1.957350695395474e-05, + "loss": 0.7081, + "step": 241 + }, + { + "epoch": 0.12, + "learning_rate": 1.956881556062369e-05, + "loss": 0.7117, + "step": 242 + }, + { + "epoch": 0.12, + "learning_rate": 1.956409907418935e-05, + "loss": 0.6398, + "step": 243 + }, + { + "epoch": 0.12, + "learning_rate": 1.9559357507020163e-05, + "loss": 0.7447, + "step": 244 + }, + { + "epoch": 0.12, + "learning_rate": 1.955459087155033e-05, + "loss": 0.7159, + "step": 245 + }, + { + "epoch": 0.12, + "learning_rate": 1.9549799180279793e-05, + "loss": 0.7141, + "step": 246 + }, + { + "epoch": 0.12, + "learning_rate": 1.9544982445774217e-05, + "loss": 0.7247, + "step": 247 + }, + { + "epoch": 0.12, + "learning_rate": 1.9540140680664915e-05, + "loss": 0.6986, + "step": 248 + }, + { + "epoch": 0.12, + "learning_rate": 1.9535273897648857e-05, + "loss": 0.7617, + "step": 249 + }, + { + "epoch": 0.12, + "learning_rate": 1.953038210948861e-05, + "loss": 0.7067, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 1.9525465329012322e-05, + "loss": 0.7115, + "step": 251 + }, + { + "epoch": 0.13, + "learning_rate": 1.952052356911368e-05, + "loss": 0.7017, + "step": 252 + }, + { + "epoch": 0.13, + "learning_rate": 1.9515556842751863e-05, + "loss": 0.6714, + "step": 253 + }, + { + "epoch": 0.13, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.6627, + "step": 254 + }, + { + "epoch": 0.13, + "learning_rate": 1.9505548542802805e-05, + "loss": 0.6834, + "step": 255 + }, + { + "epoch": 0.13, + "learning_rate": 1.950050699546116e-05, + "loss": 0.7105, + "step": 256 + }, + { + "epoch": 0.13, + "learning_rate": 1.949544053414748e-05, + "loss": 
0.7134, + "step": 257 + }, + { + "epoch": 0.13, + "learning_rate": 1.9490349172147964e-05, + "loss": 0.7188, + "step": 258 + }, + { + "epoch": 0.13, + "learning_rate": 1.9485232922814117e-05, + "loss": 0.6334, + "step": 259 + }, + { + "epoch": 0.13, + "learning_rate": 1.9480091799562706e-05, + "loss": 0.7144, + "step": 260 + }, + { + "epoch": 0.13, + "learning_rate": 1.947492581587573e-05, + "loss": 0.6746, + "step": 261 + }, + { + "epoch": 0.13, + "learning_rate": 1.9469734985300373e-05, + "loss": 0.778, + "step": 262 + }, + { + "epoch": 0.13, + "learning_rate": 1.9464519321448988e-05, + "loss": 0.7379, + "step": 263 + }, + { + "epoch": 0.13, + "learning_rate": 1.9459278837999048e-05, + "loss": 0.699, + "step": 264 + }, + { + "epoch": 0.13, + "learning_rate": 1.9454013548693103e-05, + "loss": 0.7559, + "step": 265 + }, + { + "epoch": 0.13, + "learning_rate": 1.9448723467338765e-05, + "loss": 0.6769, + "step": 266 + }, + { + "epoch": 0.13, + "learning_rate": 1.944340860780865e-05, + "loss": 0.709, + "step": 267 + }, + { + "epoch": 0.13, + "learning_rate": 1.9438068984040366e-05, + "loss": 0.7177, + "step": 268 + }, + { + "epoch": 0.13, + "learning_rate": 1.9432704610036448e-05, + "loss": 0.7849, + "step": 269 + }, + { + "epoch": 0.14, + "learning_rate": 1.9427315499864345e-05, + "loss": 0.7355, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 1.9421901667656364e-05, + "loss": 0.686, + "step": 271 + }, + { + "epoch": 0.14, + "learning_rate": 1.9416463127609655e-05, + "loss": 0.6962, + "step": 272 + }, + { + "epoch": 0.14, + "learning_rate": 1.9410999893986157e-05, + "loss": 0.7009, + "step": 273 + }, + { + "epoch": 0.14, + "learning_rate": 1.9405511981112553e-05, + "loss": 0.6892, + "step": 274 + }, + { + "epoch": 0.14, + "learning_rate": 1.9399999403380266e-05, + "loss": 0.6725, + "step": 275 + }, + { + "epoch": 0.14, + "learning_rate": 1.9394462175245382e-05, + "loss": 0.75, + "step": 276 + }, + { + "epoch": 0.14, + "learning_rate": 1.9388900311228636e-05, + "loss": 0.7397, + "step": 277 + }, + { + "epoch": 0.14, + "learning_rate": 1.9383313825915372e-05, + "loss": 0.6471, + "step": 278 + }, + { + "epoch": 0.14, + "learning_rate": 1.9377702733955493e-05, + "loss": 0.7021, + "step": 279 + }, + { + "epoch": 0.14, + "learning_rate": 1.937206705006344e-05, + "loss": 0.6346, + "step": 280 + }, + { + "epoch": 0.14, + "learning_rate": 1.9366406789018127e-05, + "loss": 0.7311, + "step": 281 + }, + { + "epoch": 0.14, + "learning_rate": 1.9360721965662934e-05, + "loss": 0.6929, + "step": 282 + }, + { + "epoch": 0.14, + "learning_rate": 1.9355012594905645e-05, + "loss": 0.705, + "step": 283 + }, + { + "epoch": 0.14, + "learning_rate": 1.9349278691718426e-05, + "loss": 0.7271, + "step": 284 + }, + { + "epoch": 0.14, + "learning_rate": 1.9343520271137764e-05, + "loss": 0.7556, + "step": 285 + }, + { + "epoch": 0.14, + "learning_rate": 1.9337737348264448e-05, + "loss": 0.6825, + "step": 286 + }, + { + "epoch": 0.14, + "learning_rate": 1.9331929938263515e-05, + "loss": 0.6625, + "step": 287 + }, + { + "epoch": 0.14, + "learning_rate": 1.9326098056364224e-05, + "loss": 0.6577, + "step": 288 + }, + { + "epoch": 0.14, + "learning_rate": 1.9320241717860007e-05, + "loss": 0.6815, + "step": 289 + }, + { + "epoch": 0.14, + "learning_rate": 1.9314360938108427e-05, + "loss": 0.7224, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 1.930845573253114e-05, + "loss": 0.6794, + "step": 291 + }, + { + "epoch": 0.15, + "learning_rate": 1.9302526116613863e-05, + "loss": 0.7068, + "step": 292 + }, + { 
+ "epoch": 0.15, + "learning_rate": 1.9296572105906323e-05, + "loss": 0.759, + "step": 293 + }, + { + "epoch": 0.15, + "learning_rate": 1.9290593716022218e-05, + "loss": 0.7061, + "step": 294 + }, + { + "epoch": 0.15, + "learning_rate": 1.928459096263918e-05, + "loss": 0.7049, + "step": 295 + }, + { + "epoch": 0.15, + "learning_rate": 1.9278563861498726e-05, + "loss": 0.7267, + "step": 296 + }, + { + "epoch": 0.15, + "learning_rate": 1.927251242840623e-05, + "loss": 0.7339, + "step": 297 + }, + { + "epoch": 0.15, + "learning_rate": 1.9266436679230866e-05, + "loss": 0.7487, + "step": 298 + }, + { + "epoch": 0.15, + "learning_rate": 1.926033662990558e-05, + "loss": 0.7184, + "step": 299 + }, + { + "epoch": 0.15, + "learning_rate": 1.9254212296427043e-05, + "loss": 0.7435, + "step": 300 + }, + { + "epoch": 0.15, + "eval_code_gate_load": [ + 206.0, + 187.95, + 190.05, + 161.5, + 166.75, + 174.0, + 185.1, + 168.65 + ], + "eval_code_loss": 0.5054687261581421, + "eval_code_runtime": 1.7765, + "eval_code_samples_per_second": 562.908, + "eval_code_steps_per_second": 35.463, + "step": 300 + }, + { + "epoch": 0.15, + "eval_orca_gate_load": [ + 508.25, + 351.75, + 403.15, + 398.9, + 341.1, + 425.3, + 372.7, + 341.15 + ], + "eval_orca_loss": 0.74560546875, + "eval_orca_runtime": 1.9987, + "eval_orca_samples_per_second": 500.33, + "eval_orca_steps_per_second": 31.521, + "step": 300 + }, + { + "epoch": 0.15, + "eval_math_gate_load": [ + 324.1, + 225.55, + 232.0, + 231.1, + 244.15, + 235.65, + 281.2, + 258.95 + ], + "eval_math_loss": 0.660449206829071, + "eval_math_runtime": 1.8524, + "eval_math_samples_per_second": 539.839, + "eval_math_steps_per_second": 34.01, + "step": 300 + }, + { + "epoch": 0.15, + "eval_sharegpt_gate_load": [ + 1528.15, + 1114.7, + 1318.0, + 1217.15, + 1114.7, + 1348.95, + 1194.85, + 1075.1 + ], + "eval_sharegpt_loss": 0.7177734375, + "eval_sharegpt_runtime": 3.0081, + "eval_sharegpt_samples_per_second": 332.432, + "eval_sharegpt_steps_per_second": 20.943, + "step": 300 + }, + { + "epoch": 0.15, + "learning_rate": 1.9248063694855603e-05, + "loss": 0.7142, + "step": 301 + }, + { + "epoch": 0.15, + "learning_rate": 1.924189084131525e-05, + "loss": 0.6985, + "step": 302 + }, + { + "epoch": 0.15, + "learning_rate": 1.923569375199357e-05, + "loss": 0.7139, + "step": 303 + }, + { + "epoch": 0.15, + "learning_rate": 1.922947244314172e-05, + "loss": 0.7499, + "step": 304 + }, + { + "epoch": 0.15, + "learning_rate": 1.922322693107434e-05, + "loss": 0.6487, + "step": 305 + }, + { + "epoch": 0.15, + "learning_rate": 1.9216957232169567e-05, + "loss": 0.6963, + "step": 306 + }, + { + "epoch": 0.15, + "learning_rate": 1.9210663362868956e-05, + "loss": 0.6527, + "step": 307 + }, + { + "epoch": 0.15, + "learning_rate": 1.9204345339677442e-05, + "loss": 0.6664, + "step": 308 + }, + { + "epoch": 0.15, + "learning_rate": 1.9198003179163308e-05, + "loss": 0.7128, + "step": 309 + }, + { + "epoch": 0.15, + "learning_rate": 1.9191636897958123e-05, + "loss": 0.7186, + "step": 310 + }, + { + "epoch": 0.16, + "learning_rate": 1.9185246512756727e-05, + "loss": 0.6744, + "step": 311 + }, + { + "epoch": 0.16, + "learning_rate": 1.9178832040317153e-05, + "loss": 0.671, + "step": 312 + }, + { + "epoch": 0.16, + "learning_rate": 1.917239349746061e-05, + "loss": 0.7735, + "step": 313 + }, + { + "epoch": 0.16, + "learning_rate": 1.916593090107143e-05, + "loss": 0.6821, + "step": 314 + }, + { + "epoch": 0.16, + "learning_rate": 1.9159444268097012e-05, + "loss": 0.6755, + "step": 315 + }, + { + "epoch": 0.16, + 
"learning_rate": 1.91529336155478e-05, + "loss": 0.6606, + "step": 316 + }, + { + "epoch": 0.16, + "learning_rate": 1.9146398960497213e-05, + "loss": 0.7263, + "step": 317 + }, + { + "epoch": 0.16, + "learning_rate": 1.913984032008163e-05, + "loss": 0.6973, + "step": 318 + }, + { + "epoch": 0.16, + "learning_rate": 1.9133257711500318e-05, + "loss": 0.6823, + "step": 319 + }, + { + "epoch": 0.16, + "learning_rate": 1.9126651152015404e-05, + "loss": 0.6701, + "step": 320 + }, + { + "epoch": 0.16, + "learning_rate": 1.9120020658951814e-05, + "loss": 0.6863, + "step": 321 + }, + { + "epoch": 0.16, + "learning_rate": 1.911336624969725e-05, + "loss": 0.7225, + "step": 322 + }, + { + "epoch": 0.16, + "learning_rate": 1.910668794170212e-05, + "loss": 0.7344, + "step": 323 + }, + { + "epoch": 0.16, + "learning_rate": 1.9099985752479505e-05, + "loss": 0.743, + "step": 324 + }, + { + "epoch": 0.16, + "learning_rate": 1.9093259699605125e-05, + "loss": 0.7427, + "step": 325 + }, + { + "epoch": 0.16, + "learning_rate": 1.908650980071726e-05, + "loss": 0.6911, + "step": 326 + }, + { + "epoch": 0.16, + "learning_rate": 1.9079736073516735e-05, + "loss": 0.6815, + "step": 327 + }, + { + "epoch": 0.16, + "learning_rate": 1.9072938535766864e-05, + "loss": 0.681, + "step": 328 + }, + { + "epoch": 0.16, + "learning_rate": 1.9066117205293393e-05, + "loss": 0.6892, + "step": 329 + }, + { + "epoch": 0.17, + "learning_rate": 1.905927209998447e-05, + "loss": 0.7275, + "step": 330 + }, + { + "epoch": 0.17, + "learning_rate": 1.905240323779058e-05, + "loss": 0.7578, + "step": 331 + }, + { + "epoch": 0.17, + "learning_rate": 1.904551063672452e-05, + "loss": 0.6842, + "step": 332 + }, + { + "epoch": 0.17, + "learning_rate": 1.9038594314861328e-05, + "loss": 0.7046, + "step": 333 + }, + { + "epoch": 0.17, + "learning_rate": 1.9031654290338256e-05, + "loss": 0.686, + "step": 334 + }, + { + "epoch": 0.17, + "learning_rate": 1.90246905813547e-05, + "loss": 0.6564, + "step": 335 + }, + { + "epoch": 0.17, + "learning_rate": 1.9017703206172187e-05, + "loss": 0.6683, + "step": 336 + }, + { + "epoch": 0.17, + "learning_rate": 1.9010692183114285e-05, + "loss": 0.7419, + "step": 337 + }, + { + "epoch": 0.17, + "learning_rate": 1.900365753056659e-05, + "loss": 0.6519, + "step": 338 + }, + { + "epoch": 0.17, + "learning_rate": 1.8996599266976658e-05, + "loss": 0.6392, + "step": 339 + }, + { + "epoch": 0.17, + "learning_rate": 1.8989517410853956e-05, + "loss": 0.6843, + "step": 340 + }, + { + "epoch": 0.17, + "learning_rate": 1.898241198076983e-05, + "loss": 0.6899, + "step": 341 + }, + { + "epoch": 0.17, + "learning_rate": 1.8975282995357448e-05, + "loss": 0.7023, + "step": 342 + }, + { + "epoch": 0.17, + "learning_rate": 1.8968130473311732e-05, + "loss": 0.6331, + "step": 343 + }, + { + "epoch": 0.17, + "learning_rate": 1.896095443338935e-05, + "loss": 0.659, + "step": 344 + }, + { + "epoch": 0.17, + "learning_rate": 1.8953754894408617e-05, + "loss": 0.6294, + "step": 345 + }, + { + "epoch": 0.17, + "learning_rate": 1.8946531875249496e-05, + "loss": 0.6617, + "step": 346 + }, + { + "epoch": 0.17, + "learning_rate": 1.89392853948535e-05, + "loss": 0.6872, + "step": 347 + }, + { + "epoch": 0.17, + "learning_rate": 1.8932015472223692e-05, + "loss": 0.7242, + "step": 348 + }, + { + "epoch": 0.17, + "learning_rate": 1.892472212642459e-05, + "loss": 0.6774, + "step": 349 + }, + { + "epoch": 0.17, + "learning_rate": 1.8917405376582144e-05, + "loss": 0.6537, + "step": 350 + }, + { + "epoch": 0.18, + "learning_rate": 1.891006524188368e-05, 
+ "loss": 0.7253, + "step": 351 + }, + { + "epoch": 0.18, + "learning_rate": 1.8902701741577844e-05, + "loss": 0.6531, + "step": 352 + }, + { + "epoch": 0.18, + "learning_rate": 1.889531489497455e-05, + "loss": 0.6655, + "step": 353 + }, + { + "epoch": 0.18, + "learning_rate": 1.8887904721444955e-05, + "loss": 0.7832, + "step": 354 + }, + { + "epoch": 0.18, + "learning_rate": 1.8880471240421365e-05, + "loss": 0.6488, + "step": 355 + }, + { + "epoch": 0.18, + "learning_rate": 1.8873014471397225e-05, + "loss": 0.7323, + "step": 356 + }, + { + "epoch": 0.18, + "learning_rate": 1.8865534433927034e-05, + "loss": 0.6627, + "step": 357 + }, + { + "epoch": 0.18, + "learning_rate": 1.8858031147626326e-05, + "loss": 0.6513, + "step": 358 + }, + { + "epoch": 0.18, + "learning_rate": 1.885050463217159e-05, + "loss": 0.6807, + "step": 359 + }, + { + "epoch": 0.18, + "learning_rate": 1.8842954907300236e-05, + "loss": 0.6281, + "step": 360 + }, + { + "epoch": 0.18, + "learning_rate": 1.883538199281054e-05, + "loss": 0.6593, + "step": 361 + }, + { + "epoch": 0.18, + "learning_rate": 1.8827785908561585e-05, + "loss": 0.6379, + "step": 362 + }, + { + "epoch": 0.18, + "learning_rate": 1.8820166674473217e-05, + "loss": 0.6934, + "step": 363 + }, + { + "epoch": 0.18, + "learning_rate": 1.881252431052599e-05, + "loss": 0.6419, + "step": 364 + }, + { + "epoch": 0.18, + "learning_rate": 1.880485883676111e-05, + "loss": 0.7072, + "step": 365 + }, + { + "epoch": 0.18, + "learning_rate": 1.879717027328039e-05, + "loss": 0.7061, + "step": 366 + }, + { + "epoch": 0.18, + "learning_rate": 1.8789458640246193e-05, + "loss": 0.668, + "step": 367 + }, + { + "epoch": 0.18, + "learning_rate": 1.8781723957881374e-05, + "loss": 0.6409, + "step": 368 + }, + { + "epoch": 0.18, + "learning_rate": 1.8773966246469238e-05, + "loss": 0.6655, + "step": 369 + }, + { + "epoch": 0.18, + "learning_rate": 1.876618552635348e-05, + "loss": 0.7031, + "step": 370 + }, + { + "epoch": 0.19, + "learning_rate": 1.8758381817938126e-05, + "loss": 0.6492, + "step": 371 + }, + { + "epoch": 0.19, + "learning_rate": 1.87505551416875e-05, + "loss": 0.7515, + "step": 372 + }, + { + "epoch": 0.19, + "learning_rate": 1.874270551812614e-05, + "loss": 0.6949, + "step": 373 + }, + { + "epoch": 0.19, + "learning_rate": 1.8734832967838775e-05, + "loss": 0.7355, + "step": 374 + }, + { + "epoch": 0.19, + "learning_rate": 1.8726937511470247e-05, + "loss": 0.6748, + "step": 375 + }, + { + "epoch": 0.19, + "learning_rate": 1.871901916972547e-05, + "loss": 0.6932, + "step": 376 + }, + { + "epoch": 0.19, + "learning_rate": 1.8711077963369377e-05, + "loss": 0.7317, + "step": 377 + }, + { + "epoch": 0.19, + "learning_rate": 1.8703113913226847e-05, + "loss": 0.7287, + "step": 378 + }, + { + "epoch": 0.19, + "learning_rate": 1.8695127040182678e-05, + "loss": 0.7347, + "step": 379 + }, + { + "epoch": 0.19, + "learning_rate": 1.8687117365181514e-05, + "loss": 0.665, + "step": 380 + }, + { + "epoch": 0.19, + "learning_rate": 1.867908490922779e-05, + "loss": 0.6309, + "step": 381 + }, + { + "epoch": 0.19, + "learning_rate": 1.867102969338569e-05, + "loss": 0.6849, + "step": 382 + }, + { + "epoch": 0.19, + "learning_rate": 1.8662951738779077e-05, + "loss": 0.7277, + "step": 383 + }, + { + "epoch": 0.19, + "learning_rate": 1.865485106659145e-05, + "loss": 0.7113, + "step": 384 + }, + { + "epoch": 0.19, + "learning_rate": 1.8646727698065865e-05, + "loss": 0.6555, + "step": 385 + }, + { + "epoch": 0.19, + "learning_rate": 1.863858165450492e-05, + "loss": 0.6328, + "step": 386 + }, 
+ { + "epoch": 0.19, + "learning_rate": 1.863041295727066e-05, + "loss": 0.755, + "step": 387 + }, + { + "epoch": 0.19, + "learning_rate": 1.862222162778454e-05, + "loss": 0.72, + "step": 388 + }, + { + "epoch": 0.19, + "learning_rate": 1.8614007687527374e-05, + "loss": 0.7177, + "step": 389 + }, + { + "epoch": 0.2, + "learning_rate": 1.8605771158039253e-05, + "loss": 0.7039, + "step": 390 + }, + { + "epoch": 0.2, + "learning_rate": 1.8597512060919523e-05, + "loss": 0.6861, + "step": 391 + }, + { + "epoch": 0.2, + "learning_rate": 1.85892304178267e-05, + "loss": 0.672, + "step": 392 + }, + { + "epoch": 0.2, + "learning_rate": 1.8580926250478425e-05, + "loss": 0.6745, + "step": 393 + }, + { + "epoch": 0.2, + "learning_rate": 1.8572599580651415e-05, + "loss": 0.6572, + "step": 394 + }, + { + "epoch": 0.2, + "learning_rate": 1.8564250430181387e-05, + "loss": 0.6489, + "step": 395 + }, + { + "epoch": 0.2, + "learning_rate": 1.8555878820963014e-05, + "loss": 0.7114, + "step": 396 + }, + { + "epoch": 0.2, + "learning_rate": 1.8547484774949865e-05, + "loss": 0.7313, + "step": 397 + }, + { + "epoch": 0.2, + "learning_rate": 1.8539068314154355e-05, + "loss": 0.7268, + "step": 398 + }, + { + "epoch": 0.2, + "learning_rate": 1.8530629460647658e-05, + "loss": 0.698, + "step": 399 + }, + { + "epoch": 0.2, + "learning_rate": 1.8522168236559693e-05, + "loss": 0.7727, + "step": 400 + }, + { + "epoch": 0.2, + "eval_code_gate_load": [ + 202.85, + 191.5, + 178.8, + 158.95, + 174.1, + 173.95, + 189.7, + 170.15 + ], + "eval_code_loss": 0.4867187440395355, + "eval_code_runtime": 1.7854, + "eval_code_samples_per_second": 560.106, + "eval_code_steps_per_second": 35.287, + "step": 400 + }, + { + "epoch": 0.2, + "eval_orca_gate_load": [ + 501.6, + 354.85, + 398.45, + 395.2, + 353.6, + 425.0, + 373.85, + 339.75 + ], + "eval_orca_loss": 0.735644519329071, + "eval_orca_runtime": 2.0136, + "eval_orca_samples_per_second": 496.616, + "eval_orca_steps_per_second": 31.287, + "step": 400 + }, + { + "epoch": 0.2, + "eval_math_gate_load": [ + 320.85, + 237.0, + 216.35, + 231.35, + 252.05, + 238.55, + 277.05, + 259.5 + ], + "eval_math_loss": 0.6509765386581421, + "eval_math_runtime": 1.8609, + "eval_math_samples_per_second": 537.377, + "eval_math_steps_per_second": 33.855, + "step": 400 + }, + { + "epoch": 0.2, + "eval_sharegpt_gate_load": [ + 1520.85, + 1130.25, + 1282.2, + 1212.05, + 1141.8, + 1334.7, + 1205.7, + 1084.05 + ], + "eval_sharegpt_loss": 0.712109386920929, + "eval_sharegpt_runtime": 2.9955, + "eval_sharegpt_samples_per_second": 333.837, + "eval_sharegpt_steps_per_second": 21.032, + "step": 400 + }, + { + "epoch": 0.2, + "learning_rate": 1.8513684664079033e-05, + "loss": 0.736, + "step": 401 + }, + { + "epoch": 0.2, + "learning_rate": 1.8505178765452853e-05, + "loss": 0.7126, + "step": 402 + }, + { + "epoch": 0.2, + "learning_rate": 1.8496650562986888e-05, + "loss": 0.6468, + "step": 403 + }, + { + "epoch": 0.2, + "learning_rate": 1.8488100079045345e-05, + "loss": 0.76, + "step": 404 + }, + { + "epoch": 0.2, + "learning_rate": 1.847952733605088e-05, + "loss": 0.641, + "step": 405 + }, + { + "epoch": 0.2, + "learning_rate": 1.847093235648451e-05, + "loss": 0.656, + "step": 406 + }, + { + "epoch": 0.2, + "learning_rate": 1.8462315162885563e-05, + "loss": 0.7218, + "step": 407 + }, + { + "epoch": 0.2, + "learning_rate": 1.8453675777851627e-05, + "loss": 0.6604, + "step": 408 + }, + { + "epoch": 0.2, + "learning_rate": 1.8445014224038485e-05, + "loss": 0.7317, + "step": 409 + }, + { + "epoch": 0.2, + "learning_rate": 
1.8436330524160048e-05, + "loss": 0.7086, + "step": 410 + }, + { + "epoch": 0.21, + "learning_rate": 1.8427624700988308e-05, + "loss": 0.7205, + "step": 411 + }, + { + "epoch": 0.21, + "learning_rate": 1.8418896777353272e-05, + "loss": 0.7507, + "step": 412 + }, + { + "epoch": 0.21, + "learning_rate": 1.84101467761429e-05, + "loss": 0.6566, + "step": 413 + }, + { + "epoch": 0.21, + "learning_rate": 1.8401374720303054e-05, + "loss": 0.6488, + "step": 414 + }, + { + "epoch": 0.21, + "learning_rate": 1.8392580632837423e-05, + "loss": 0.6858, + "step": 415 + }, + { + "epoch": 0.21, + "learning_rate": 1.8383764536807486e-05, + "loss": 0.6794, + "step": 416 + }, + { + "epoch": 0.21, + "learning_rate": 1.837492645533241e-05, + "loss": 0.7179, + "step": 417 + }, + { + "epoch": 0.21, + "learning_rate": 1.836606641158905e-05, + "loss": 0.6611, + "step": 418 + }, + { + "epoch": 0.21, + "learning_rate": 1.835718442881183e-05, + "loss": 0.6555, + "step": 419 + }, + { + "epoch": 0.21, + "learning_rate": 1.8348280530292712e-05, + "loss": 0.6509, + "step": 420 + }, + { + "epoch": 0.21, + "learning_rate": 1.8339354739381138e-05, + "loss": 0.6516, + "step": 421 + }, + { + "epoch": 0.21, + "learning_rate": 1.833040707948395e-05, + "loss": 0.7115, + "step": 422 + }, + { + "epoch": 0.21, + "learning_rate": 1.8321437574065347e-05, + "loss": 0.6653, + "step": 423 + }, + { + "epoch": 0.21, + "learning_rate": 1.831244624664681e-05, + "loss": 0.6984, + "step": 424 + }, + { + "epoch": 0.21, + "learning_rate": 1.8303433120807043e-05, + "loss": 0.6648, + "step": 425 + }, + { + "epoch": 0.21, + "learning_rate": 1.829439822018192e-05, + "loss": 0.6358, + "step": 426 + }, + { + "epoch": 0.21, + "learning_rate": 1.8285341568464416e-05, + "loss": 0.7535, + "step": 427 + }, + { + "epoch": 0.21, + "learning_rate": 1.827626318940454e-05, + "loss": 0.6195, + "step": 428 + }, + { + "epoch": 0.21, + "learning_rate": 1.8267163106809288e-05, + "loss": 0.6978, + "step": 429 + }, + { + "epoch": 0.21, + "learning_rate": 1.8258041344542567e-05, + "loss": 0.6611, + "step": 430 + }, + { + "epoch": 0.22, + "learning_rate": 1.824889792652513e-05, + "loss": 0.6458, + "step": 431 + }, + { + "epoch": 0.22, + "learning_rate": 1.8239732876734525e-05, + "loss": 0.68, + "step": 432 + }, + { + "epoch": 0.22, + "learning_rate": 1.8230546219205032e-05, + "loss": 0.6779, + "step": 433 + }, + { + "epoch": 0.22, + "learning_rate": 1.822133797802758e-05, + "loss": 0.719, + "step": 434 + }, + { + "epoch": 0.22, + "learning_rate": 1.8212108177349722e-05, + "loss": 0.6448, + "step": 435 + }, + { + "epoch": 0.22, + "learning_rate": 1.8202856841375517e-05, + "loss": 0.7534, + "step": 436 + }, + { + "epoch": 0.22, + "learning_rate": 1.819358399436553e-05, + "loss": 0.6638, + "step": 437 + }, + { + "epoch": 0.22, + "learning_rate": 1.8184289660636715e-05, + "loss": 0.7415, + "step": 438 + }, + { + "epoch": 0.22, + "learning_rate": 1.817497386456238e-05, + "loss": 0.6746, + "step": 439 + }, + { + "epoch": 0.22, + "learning_rate": 1.816563663057211e-05, + "loss": 0.6855, + "step": 440 + }, + { + "epoch": 0.22, + "learning_rate": 1.815627798315172e-05, + "loss": 0.7072, + "step": 441 + }, + { + "epoch": 0.22, + "learning_rate": 1.8146897946843162e-05, + "loss": 0.7095, + "step": 442 + }, + { + "epoch": 0.22, + "learning_rate": 1.81374965462445e-05, + "loss": 0.6492, + "step": 443 + }, + { + "epoch": 0.22, + "learning_rate": 1.81280738060098e-05, + "loss": 0.6364, + "step": 444 + }, + { + "epoch": 0.22, + "learning_rate": 1.8118629750849106e-05, + "loss": 0.7134, 
+ "step": 445 + }, + { + "epoch": 0.22, + "learning_rate": 1.810916440552835e-05, + "loss": 0.6878, + "step": 446 + }, + { + "epoch": 0.22, + "learning_rate": 1.8099677794869297e-05, + "loss": 0.6927, + "step": 447 + }, + { + "epoch": 0.22, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.6148, + "step": 448 + }, + { + "epoch": 0.22, + "learning_rate": 1.808064087710212e-05, + "loss": 0.6622, + "step": 449 + }, + { + "epoch": 0.23, + "learning_rate": 1.8071090619916095e-05, + "loss": 0.7079, + "step": 450 + }, + { + "epoch": 0.23, + "learning_rate": 1.8061519197235835e-05, + "loss": 0.6797, + "step": 451 + }, + { + "epoch": 0.23, + "learning_rate": 1.8051926634161282e-05, + "loss": 0.6875, + "step": 452 + }, + { + "epoch": 0.23, + "learning_rate": 1.804231295584782e-05, + "loss": 0.6777, + "step": 453 + }, + { + "epoch": 0.23, + "learning_rate": 1.8032678187506187e-05, + "loss": 0.6696, + "step": 454 + }, + { + "epoch": 0.23, + "learning_rate": 1.802302235440245e-05, + "loss": 0.7006, + "step": 455 + }, + { + "epoch": 0.23, + "learning_rate": 1.8013345481857903e-05, + "loss": 0.6464, + "step": 456 + }, + { + "epoch": 0.23, + "learning_rate": 1.8003647595249016e-05, + "loss": 0.7272, + "step": 457 + }, + { + "epoch": 0.23, + "learning_rate": 1.799392872000736e-05, + "loss": 0.6675, + "step": 458 + }, + { + "epoch": 0.23, + "learning_rate": 1.7984188881619563e-05, + "loss": 0.6513, + "step": 459 + }, + { + "epoch": 0.23, + "learning_rate": 1.797442810562721e-05, + "loss": 0.707, + "step": 460 + }, + { + "epoch": 0.23, + "learning_rate": 1.79646464176268e-05, + "loss": 0.6863, + "step": 461 + }, + { + "epoch": 0.23, + "learning_rate": 1.7954843843269665e-05, + "loss": 0.6823, + "step": 462 + }, + { + "epoch": 0.23, + "learning_rate": 1.794502040826192e-05, + "loss": 0.6794, + "step": 463 + }, + { + "epoch": 0.23, + "learning_rate": 1.793517613836437e-05, + "loss": 0.6876, + "step": 464 + }, + { + "epoch": 0.23, + "learning_rate": 1.7925311059392472e-05, + "loss": 0.6972, + "step": 465 + }, + { + "epoch": 0.23, + "learning_rate": 1.7915425197216246e-05, + "loss": 0.6594, + "step": 466 + }, + { + "epoch": 0.23, + "learning_rate": 1.7905518577760207e-05, + "loss": 0.6748, + "step": 467 + }, + { + "epoch": 0.23, + "learning_rate": 1.7895591227003316e-05, + "loss": 0.7027, + "step": 468 + }, + { + "epoch": 0.23, + "learning_rate": 1.788564317097889e-05, + "loss": 0.6533, + "step": 469 + }, + { + "epoch": 0.23, + "learning_rate": 1.7875674435774546e-05, + "loss": 0.7014, + "step": 470 + }, + { + "epoch": 0.24, + "learning_rate": 1.786568504753213e-05, + "loss": 0.6529, + "step": 471 + }, + { + "epoch": 0.24, + "learning_rate": 1.7855675032447648e-05, + "loss": 0.6857, + "step": 472 + }, + { + "epoch": 0.24, + "learning_rate": 1.78456444167712e-05, + "loss": 0.6052, + "step": 473 + }, + { + "epoch": 0.24, + "learning_rate": 1.7835593226806902e-05, + "loss": 0.6513, + "step": 474 + }, + { + "epoch": 0.24, + "learning_rate": 1.7825521488912833e-05, + "loss": 0.6751, + "step": 475 + }, + { + "epoch": 0.24, + "learning_rate": 1.7815429229500946e-05, + "loss": 0.6228, + "step": 476 + }, + { + "epoch": 0.24, + "learning_rate": 1.7805316475037016e-05, + "loss": 0.7381, + "step": 477 + }, + { + "epoch": 0.24, + "learning_rate": 1.7795183252040568e-05, + "loss": 0.7245, + "step": 478 + }, + { + "epoch": 0.24, + "learning_rate": 1.7785029587084793e-05, + "loss": 0.6885, + "step": 479 + }, + { + "epoch": 0.24, + "learning_rate": 1.7774855506796497e-05, + "loss": 0.7099, + "step": 480 + }, + { + "epoch": 
0.24, + "learning_rate": 1.7764661037856013e-05, + "loss": 0.7003, + "step": 481 + }, + { + "epoch": 0.24, + "learning_rate": 1.7754446206997152e-05, + "loss": 0.6168, + "step": 482 + }, + { + "epoch": 0.24, + "learning_rate": 1.774421104100712e-05, + "loss": 0.6982, + "step": 483 + }, + { + "epoch": 0.24, + "learning_rate": 1.7733955566726438e-05, + "loss": 0.705, + "step": 484 + }, + { + "epoch": 0.24, + "learning_rate": 1.7723679811048904e-05, + "loss": 0.6737, + "step": 485 + }, + { + "epoch": 0.24, + "learning_rate": 1.771338380092148e-05, + "loss": 0.649, + "step": 486 + }, + { + "epoch": 0.24, + "learning_rate": 1.7703067563344252e-05, + "loss": 0.6895, + "step": 487 + }, + { + "epoch": 0.24, + "learning_rate": 1.7692731125370355e-05, + "loss": 0.724, + "step": 488 + }, + { + "epoch": 0.24, + "learning_rate": 1.768237451410589e-05, + "loss": 0.6591, + "step": 489 + }, + { + "epoch": 0.24, + "learning_rate": 1.767199775670986e-05, + "loss": 0.6185, + "step": 490 + }, + { + "epoch": 0.25, + "learning_rate": 1.7661600880394113e-05, + "loss": 0.634, + "step": 491 + }, + { + "epoch": 0.25, + "learning_rate": 1.7651183912423228e-05, + "loss": 0.7616, + "step": 492 + }, + { + "epoch": 0.25, + "learning_rate": 1.7640746880114505e-05, + "loss": 0.6675, + "step": 493 + }, + { + "epoch": 0.25, + "learning_rate": 1.7630289810837836e-05, + "loss": 0.6559, + "step": 494 + }, + { + "epoch": 0.25, + "learning_rate": 1.7619812732015664e-05, + "loss": 0.6797, + "step": 495 + }, + { + "epoch": 0.25, + "learning_rate": 1.7609315671122912e-05, + "loss": 0.6389, + "step": 496 + }, + { + "epoch": 0.25, + "learning_rate": 1.75987986556869e-05, + "loss": 0.6548, + "step": 497 + }, + { + "epoch": 0.25, + "learning_rate": 1.758826171328727e-05, + "loss": 0.641, + "step": 498 + }, + { + "epoch": 0.25, + "learning_rate": 1.7577704871555924e-05, + "loss": 0.6575, + "step": 499 + }, + { + "epoch": 0.25, + "learning_rate": 1.7567128158176955e-05, + "loss": 0.7441, + "step": 500 + }, + { + "epoch": 0.25, + "eval_code_gate_load": [ + 211.7, + 175.45, + 182.4, + 153.8, + 175.7, + 182.45, + 190.25, + 168.25 + ], + "eval_code_loss": 0.48564451932907104, + "eval_code_runtime": 1.7819, + "eval_code_samples_per_second": 561.202, + "eval_code_steps_per_second": 35.356, + "step": 500 + }, + { + "epoch": 0.25, + "eval_orca_gate_load": [ + 510.85, + 348.25, + 399.85, + 394.75, + 347.0, + 427.2, + 372.4, + 342.0 + ], + "eval_orca_loss": 0.7276366949081421, + "eval_orca_runtime": 2.0064, + "eval_orca_samples_per_second": 498.405, + "eval_orca_steps_per_second": 31.399, + "step": 500 + }, + { + "epoch": 0.25, + "eval_math_gate_load": [ + 323.9, + 211.65, + 228.5, + 240.05, + 256.55, + 243.35, + 275.65, + 253.05 + ], + "eval_math_loss": 0.6341797113418579, + "eval_math_runtime": 1.8482, + "eval_math_samples_per_second": 541.069, + "eval_math_steps_per_second": 34.087, + "step": 500 + }, + { + "epoch": 0.25, + "eval_sharegpt_gate_load": [ + 1535.15, + 1108.85, + 1287.05, + 1205.95, + 1139.5, + 1344.0, + 1206.15, + 1084.95 + ], + "eval_sharegpt_loss": 0.702929675579071, + "eval_sharegpt_runtime": 3.0148, + "eval_sharegpt_samples_per_second": 331.693, + "eval_sharegpt_steps_per_second": 20.897, + "step": 500 + }, + { + "epoch": 0.25, + "learning_rate": 1.7556531600886554e-05, + "loss": 0.6686, + "step": 501 + }, + { + "epoch": 0.25, + "learning_rate": 1.7545915227472967e-05, + "loss": 0.6791, + "step": 502 + }, + { + "epoch": 0.25, + "learning_rate": 1.753527906577638e-05, + "loss": 0.6089, + "step": 503 + }, + { + "epoch": 0.25, + 
"learning_rate": 1.7524623143688905e-05, + "loss": 0.6985, + "step": 504 + }, + { + "epoch": 0.25, + "learning_rate": 1.7513947489154443e-05, + "loss": 0.6266, + "step": 505 + }, + { + "epoch": 0.25, + "learning_rate": 1.7503252130168657e-05, + "loss": 0.653, + "step": 506 + }, + { + "epoch": 0.25, + "learning_rate": 1.749253709477888e-05, + "loss": 0.6769, + "step": 507 + }, + { + "epoch": 0.25, + "learning_rate": 1.748180241108404e-05, + "loss": 0.7099, + "step": 508 + }, + { + "epoch": 0.25, + "learning_rate": 1.74710481072346e-05, + "loss": 0.6426, + "step": 509 + }, + { + "epoch": 0.26, + "learning_rate": 1.7460274211432463e-05, + "loss": 0.6726, + "step": 510 + }, + { + "epoch": 0.26, + "learning_rate": 1.7449480751930915e-05, + "loss": 0.6101, + "step": 511 + }, + { + "epoch": 0.26, + "learning_rate": 1.7438667757034547e-05, + "loss": 0.7545, + "step": 512 + }, + { + "epoch": 0.26, + "learning_rate": 1.7427835255099173e-05, + "loss": 0.6404, + "step": 513 + }, + { + "epoch": 0.26, + "learning_rate": 1.7416983274531777e-05, + "loss": 0.6997, + "step": 514 + }, + { + "epoch": 0.26, + "learning_rate": 1.74061118437904e-05, + "loss": 0.636, + "step": 515 + }, + { + "epoch": 0.26, + "learning_rate": 1.739522099138411e-05, + "loss": 0.6581, + "step": 516 + }, + { + "epoch": 0.26, + "learning_rate": 1.7384310745872896e-05, + "loss": 0.6526, + "step": 517 + }, + { + "epoch": 0.26, + "learning_rate": 1.7373381135867605e-05, + "loss": 0.6856, + "step": 518 + }, + { + "epoch": 0.26, + "learning_rate": 1.7362432190029862e-05, + "loss": 0.6896, + "step": 519 + }, + { + "epoch": 0.26, + "learning_rate": 1.7351463937072008e-05, + "loss": 0.6557, + "step": 520 + }, + { + "epoch": 0.26, + "learning_rate": 1.7340476405757e-05, + "loss": 0.5884, + "step": 521 + }, + { + "epoch": 0.26, + "learning_rate": 1.732946962489836e-05, + "loss": 0.7313, + "step": 522 + }, + { + "epoch": 0.26, + "learning_rate": 1.7318443623360092e-05, + "loss": 0.6795, + "step": 523 + }, + { + "epoch": 0.26, + "learning_rate": 1.7307398430056595e-05, + "loss": 0.6634, + "step": 524 + }, + { + "epoch": 0.26, + "learning_rate": 1.7296334073952606e-05, + "loss": 0.6827, + "step": 525 + }, + { + "epoch": 0.26, + "learning_rate": 1.72852505840631e-05, + "loss": 0.5858, + "step": 526 + }, + { + "epoch": 0.26, + "learning_rate": 1.7274147989453246e-05, + "loss": 0.6678, + "step": 527 + }, + { + "epoch": 0.26, + "learning_rate": 1.72630263192383e-05, + "loss": 0.6171, + "step": 528 + }, + { + "epoch": 0.26, + "learning_rate": 1.7251885602583547e-05, + "loss": 0.7119, + "step": 529 + }, + { + "epoch": 0.27, + "learning_rate": 1.7240725868704218e-05, + "loss": 0.6961, + "step": 530 + }, + { + "epoch": 0.27, + "learning_rate": 1.722954714686541e-05, + "loss": 0.6365, + "step": 531 + }, + { + "epoch": 0.27, + "learning_rate": 1.7218349466382024e-05, + "loss": 0.6798, + "step": 532 + }, + { + "epoch": 0.27, + "learning_rate": 1.7207132856618668e-05, + "loss": 0.6779, + "step": 533 + }, + { + "epoch": 0.27, + "learning_rate": 1.719589734698959e-05, + "loss": 0.7134, + "step": 534 + }, + { + "epoch": 0.27, + "learning_rate": 1.718464296695861e-05, + "loss": 0.6633, + "step": 535 + }, + { + "epoch": 0.27, + "learning_rate": 1.7173369746039026e-05, + "loss": 0.6756, + "step": 536 + }, + { + "epoch": 0.27, + "learning_rate": 1.7162077713793547e-05, + "loss": 0.6724, + "step": 537 + }, + { + "epoch": 0.27, + "learning_rate": 1.7150766899834205e-05, + "loss": 0.7333, + "step": 538 + }, + { + "epoch": 0.27, + "learning_rate": 
1.7139437333822303e-05, + "loss": 0.719, + "step": 539 + }, + { + "epoch": 0.27, + "learning_rate": 1.7128089045468294e-05, + "loss": 0.6325, + "step": 540 + }, + { + "epoch": 0.27, + "learning_rate": 1.711672206453175e-05, + "loss": 0.6804, + "step": 541 + }, + { + "epoch": 0.27, + "learning_rate": 1.7105336420821247e-05, + "loss": 0.6392, + "step": 542 + }, + { + "epoch": 0.27, + "learning_rate": 1.709393214419431e-05, + "loss": 0.6337, + "step": 543 + }, + { + "epoch": 0.27, + "learning_rate": 1.7082509264557333e-05, + "loss": 0.6137, + "step": 544 + }, + { + "epoch": 0.27, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.6187, + "step": 545 + }, + { + "epoch": 0.27, + "learning_rate": 1.705960781612262e-05, + "loss": 0.6155, + "step": 546 + }, + { + "epoch": 0.27, + "learning_rate": 1.7048129307381266e-05, + "loss": 0.7153, + "step": 547 + }, + { + "epoch": 0.27, + "learning_rate": 1.7036632315742464e-05, + "loss": 0.6237, + "step": 548 + }, + { + "epoch": 0.27, + "learning_rate": 1.7025116871355737e-05, + "loss": 0.6675, + "step": 549 + }, + { + "epoch": 0.28, + "learning_rate": 1.7013583004418994e-05, + "loss": 0.6524, + "step": 550 + }, + { + "epoch": 0.28, + "learning_rate": 1.7002030745178455e-05, + "loss": 0.6712, + "step": 551 + }, + { + "epoch": 0.28, + "learning_rate": 1.6990460123928577e-05, + "loss": 0.7206, + "step": 552 + }, + { + "epoch": 0.28, + "learning_rate": 1.6978871171011963e-05, + "loss": 0.6446, + "step": 553 + }, + { + "epoch": 0.28, + "learning_rate": 1.696726391681929e-05, + "loss": 0.7012, + "step": 554 + }, + { + "epoch": 0.28, + "learning_rate": 1.695563839178923e-05, + "loss": 0.6295, + "step": 555 + }, + { + "epoch": 0.28, + "learning_rate": 1.6943994626408365e-05, + "loss": 0.6708, + "step": 556 + }, + { + "epoch": 0.28, + "learning_rate": 1.6932332651211115e-05, + "loss": 0.6714, + "step": 557 + }, + { + "epoch": 0.28, + "learning_rate": 1.692065249677965e-05, + "loss": 0.6664, + "step": 558 + }, + { + "epoch": 0.28, + "learning_rate": 1.6908954193743816e-05, + "loss": 0.6241, + "step": 559 + }, + { + "epoch": 0.28, + "learning_rate": 1.6897237772781046e-05, + "loss": 0.5995, + "step": 560 + }, + { + "epoch": 0.28, + "learning_rate": 1.6885503264616282e-05, + "loss": 0.6553, + "step": 561 + }, + { + "epoch": 0.28, + "learning_rate": 1.6873750700021917e-05, + "loss": 0.6247, + "step": 562 + }, + { + "epoch": 0.28, + "learning_rate": 1.686198010981767e-05, + "loss": 0.6658, + "step": 563 + }, + { + "epoch": 0.28, + "learning_rate": 1.6850191524870548e-05, + "loss": 0.6658, + "step": 564 + }, + { + "epoch": 0.28, + "learning_rate": 1.6838384976094738e-05, + "loss": 0.5976, + "step": 565 + }, + { + "epoch": 0.28, + "learning_rate": 1.682656049445154e-05, + "loss": 0.6962, + "step": 566 + }, + { + "epoch": 0.28, + "learning_rate": 1.6814718110949274e-05, + "loss": 0.618, + "step": 567 + }, + { + "epoch": 0.28, + "learning_rate": 1.6802857856643214e-05, + "loss": 0.6911, + "step": 568 + }, + { + "epoch": 0.28, + "learning_rate": 1.6790979762635497e-05, + "loss": 0.6263, + "step": 569 + }, + { + "epoch": 0.28, + "learning_rate": 1.6779083860075032e-05, + "loss": 0.7031, + "step": 570 + }, + { + "epoch": 0.29, + "learning_rate": 1.6767170180157442e-05, + "loss": 0.6945, + "step": 571 + }, + { + "epoch": 0.29, + "learning_rate": 1.6755238754124965e-05, + "loss": 0.5995, + "step": 572 + }, + { + "epoch": 0.29, + "learning_rate": 1.674328961326637e-05, + "loss": 0.6473, + "step": 573 + }, + { + "epoch": 0.29, + "learning_rate": 1.6731322788916892e-05, + 
"loss": 0.6429, + "step": 574 + }, + { + "epoch": 0.29, + "learning_rate": 1.6719338312458123e-05, + "loss": 0.6871, + "step": 575 + }, + { + "epoch": 0.29, + "learning_rate": 1.6707336215317968e-05, + "loss": 0.7102, + "step": 576 + }, + { + "epoch": 0.29, + "learning_rate": 1.6695316528970517e-05, + "loss": 0.6381, + "step": 577 + }, + { + "epoch": 0.29, + "learning_rate": 1.6683279284936004e-05, + "loss": 0.6506, + "step": 578 + }, + { + "epoch": 0.29, + "learning_rate": 1.6671224514780692e-05, + "loss": 0.6909, + "step": 579 + }, + { + "epoch": 0.29, + "learning_rate": 1.665915225011681e-05, + "loss": 0.6609, + "step": 580 + }, + { + "epoch": 0.29, + "learning_rate": 1.6647062522602474e-05, + "loss": 0.6844, + "step": 581 + }, + { + "epoch": 0.29, + "learning_rate": 1.6634955363941573e-05, + "loss": 0.639, + "step": 582 + }, + { + "epoch": 0.29, + "learning_rate": 1.662283080588373e-05, + "loss": 0.6377, + "step": 583 + }, + { + "epoch": 0.29, + "learning_rate": 1.6610688880224178e-05, + "loss": 0.6264, + "step": 584 + }, + { + "epoch": 0.29, + "learning_rate": 1.65985296188037e-05, + "loss": 0.5948, + "step": 585 + }, + { + "epoch": 0.29, + "learning_rate": 1.6586353053508548e-05, + "loss": 0.6641, + "step": 586 + }, + { + "epoch": 0.29, + "learning_rate": 1.657415921627034e-05, + "loss": 0.6331, + "step": 587 + }, + { + "epoch": 0.29, + "learning_rate": 1.6561948139065997e-05, + "loss": 0.6469, + "step": 588 + }, + { + "epoch": 0.29, + "learning_rate": 1.654971985391764e-05, + "loss": 0.6639, + "step": 589 + }, + { + "epoch": 0.29, + "learning_rate": 1.6537474392892527e-05, + "loss": 0.6445, + "step": 590 + }, + { + "epoch": 0.3, + "learning_rate": 1.6525211788102946e-05, + "loss": 0.6451, + "step": 591 + }, + { + "epoch": 0.3, + "learning_rate": 1.6512932071706153e-05, + "loss": 0.6608, + "step": 592 + }, + { + "epoch": 0.3, + "learning_rate": 1.6500635275904274e-05, + "loss": 0.6746, + "step": 593 + }, + { + "epoch": 0.3, + "learning_rate": 1.6488321432944218e-05, + "loss": 0.6803, + "step": 594 + }, + { + "epoch": 0.3, + "learning_rate": 1.6475990575117603e-05, + "loss": 0.6298, + "step": 595 + }, + { + "epoch": 0.3, + "learning_rate": 1.646364273476067e-05, + "loss": 0.6742, + "step": 596 + }, + { + "epoch": 0.3, + "learning_rate": 1.6451277944254186e-05, + "loss": 0.6271, + "step": 597 + }, + { + "epoch": 0.3, + "learning_rate": 1.6438896236023374e-05, + "loss": 0.6675, + "step": 598 + }, + { + "epoch": 0.3, + "learning_rate": 1.6426497642537826e-05, + "loss": 0.5843, + "step": 599 + }, + { + "epoch": 0.3, + "learning_rate": 1.6414082196311402e-05, + "loss": 0.6476, + "step": 600 + }, + { + "epoch": 0.3, + "eval_code_gate_load": [ + 205.25, + 186.05, + 177.25, + 157.05, + 177.4, + 178.15, + 188.5, + 170.35 + ], + "eval_code_loss": 0.47382813692092896, + "eval_code_runtime": 1.7893, + "eval_code_samples_per_second": 558.879, + "eval_code_steps_per_second": 35.209, + "step": 600 + }, + { + "epoch": 0.3, + "eval_orca_gate_load": [ + 503.95, + 352.45, + 398.2, + 397.5, + 349.4, + 424.3, + 374.4, + 342.1 + ], + "eval_orca_loss": 0.7177734375, + "eval_orca_runtime": 2.0043, + "eval_orca_samples_per_second": 498.93, + "eval_orca_steps_per_second": 31.433, + "step": 600 + }, + { + "epoch": 0.3, + "eval_math_gate_load": [ + 314.1, + 223.2, + 230.75, + 237.9, + 259.05, + 238.65, + 276.2, + 252.85 + ], + "eval_math_loss": 0.45869141817092896, + "eval_math_runtime": 1.8415, + "eval_math_samples_per_second": 543.045, + "eval_math_steps_per_second": 34.212, + "step": 600 + }, + { + "epoch": 
0.3, + "eval_sharegpt_gate_load": [ + 1523.65, + 1118.3, + 1287.95, + 1209.65, + 1143.35, + 1350.6, + 1193.3, + 1084.8 + ], + "eval_sharegpt_loss": 0.6988281011581421, + "eval_sharegpt_runtime": 2.9965, + "eval_sharegpt_samples_per_second": 333.722, + "eval_sharegpt_steps_per_second": 21.024, + "step": 600 + }, + { + "epoch": 0.3, + "learning_rate": 1.640164992990216e-05, + "loss": 0.6594, + "step": 601 + }, + { + "epoch": 0.3, + "learning_rate": 1.638920087591228e-05, + "loss": 0.6526, + "step": 602 + }, + { + "epoch": 0.3, + "learning_rate": 1.637673506698794e-05, + "loss": 0.6331, + "step": 603 + }, + { + "epoch": 0.3, + "learning_rate": 1.6364252535819284e-05, + "loss": 0.6393, + "step": 604 + }, + { + "epoch": 0.3, + "learning_rate": 1.6351753315140285e-05, + "loss": 0.6594, + "step": 605 + }, + { + "epoch": 0.3, + "learning_rate": 1.63392374377287e-05, + "loss": 0.6047, + "step": 606 + }, + { + "epoch": 0.3, + "learning_rate": 1.6326704936405953e-05, + "loss": 0.6231, + "step": 607 + }, + { + "epoch": 0.3, + "learning_rate": 1.6314155844037074e-05, + "loss": 0.646, + "step": 608 + }, + { + "epoch": 0.3, + "learning_rate": 1.6301590193530585e-05, + "loss": 0.6644, + "step": 609 + }, + { + "epoch": 0.3, + "learning_rate": 1.6289008017838447e-05, + "loss": 0.6033, + "step": 610 + }, + { + "epoch": 0.31, + "learning_rate": 1.6276409349955945e-05, + "loss": 0.6628, + "step": 611 + }, + { + "epoch": 0.31, + "learning_rate": 1.626379422292162e-05, + "loss": 0.6536, + "step": 612 + }, + { + "epoch": 0.31, + "learning_rate": 1.6251162669817172e-05, + "loss": 0.6094, + "step": 613 + }, + { + "epoch": 0.31, + "learning_rate": 1.6238514723767372e-05, + "loss": 0.6263, + "step": 614 + }, + { + "epoch": 0.31, + "learning_rate": 1.622585041793999e-05, + "loss": 0.5747, + "step": 615 + }, + { + "epoch": 0.31, + "learning_rate": 1.6213169785545688e-05, + "loss": 0.5773, + "step": 616 + }, + { + "epoch": 0.31, + "learning_rate": 1.6200472859837946e-05, + "loss": 0.6433, + "step": 617 + }, + { + "epoch": 0.31, + "learning_rate": 1.6187759674112972e-05, + "loss": 0.63, + "step": 618 + }, + { + "epoch": 0.31, + "learning_rate": 1.6175030261709615e-05, + "loss": 0.6301, + "step": 619 + }, + { + "epoch": 0.31, + "learning_rate": 1.6162284656009276e-05, + "loss": 0.6516, + "step": 620 + }, + { + "epoch": 0.31, + "learning_rate": 1.6149522890435815e-05, + "loss": 0.6746, + "step": 621 + }, + { + "epoch": 0.31, + "learning_rate": 1.6136744998455477e-05, + "loss": 0.5876, + "step": 622 + }, + { + "epoch": 0.31, + "learning_rate": 1.6123951013576796e-05, + "loss": 0.6151, + "step": 623 + }, + { + "epoch": 0.31, + "learning_rate": 1.6111140969350504e-05, + "loss": 0.6078, + "step": 624 + }, + { + "epoch": 0.31, + "learning_rate": 1.6098314899369446e-05, + "loss": 0.564, + "step": 625 + }, + { + "epoch": 0.31, + "learning_rate": 1.6085472837268504e-05, + "loss": 0.6233, + "step": 626 + }, + { + "epoch": 0.31, + "learning_rate": 1.607261481672448e-05, + "loss": 0.6358, + "step": 627 + }, + { + "epoch": 0.31, + "learning_rate": 1.6059740871456035e-05, + "loss": 0.6307, + "step": 628 + }, + { + "epoch": 0.31, + "learning_rate": 1.6046851035223594e-05, + "loss": 0.6252, + "step": 629 + }, + { + "epoch": 0.32, + "learning_rate": 1.603394534182925e-05, + "loss": 0.6263, + "step": 630 + }, + { + "epoch": 0.32, + "learning_rate": 1.6021023825116672e-05, + "loss": 0.6303, + "step": 631 + }, + { + "epoch": 0.32, + "learning_rate": 1.6008086518971037e-05, + "loss": 0.6099, + "step": 632 + }, + { + "epoch": 0.32, + 
"learning_rate": 1.599513345731892e-05, + "loss": 0.6102, + "step": 633 + }, + { + "epoch": 0.32, + "learning_rate": 1.598216467412822e-05, + "loss": 0.6181, + "step": 634 + }, + { + "epoch": 0.32, + "learning_rate": 1.5969180203408052e-05, + "loss": 0.6137, + "step": 635 + }, + { + "epoch": 0.32, + "learning_rate": 1.5956180079208684e-05, + "loss": 0.6133, + "step": 636 + }, + { + "epoch": 0.32, + "learning_rate": 1.5943164335621418e-05, + "loss": 0.6424, + "step": 637 + }, + { + "epoch": 0.32, + "learning_rate": 1.593013300677853e-05, + "loss": 0.6082, + "step": 638 + }, + { + "epoch": 0.32, + "learning_rate": 1.591708612685316e-05, + "loss": 0.575, + "step": 639 + }, + { + "epoch": 0.32, + "learning_rate": 1.5904023730059227e-05, + "loss": 0.6683, + "step": 640 + }, + { + "epoch": 0.32, + "learning_rate": 1.5890945850651347e-05, + "loss": 0.6528, + "step": 641 + }, + { + "epoch": 0.32, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.6372, + "step": 642 + }, + { + "epoch": 0.32, + "learning_rate": 1.586474378121511e-05, + "loss": 0.6389, + "step": 643 + }, + { + "epoch": 0.32, + "learning_rate": 1.5851619659898623e-05, + "loss": 0.6056, + "step": 644 + }, + { + "epoch": 0.32, + "learning_rate": 1.5838480193391753e-05, + "loss": 0.5766, + "step": 645 + }, + { + "epoch": 0.32, + "learning_rate": 1.582532541615122e-05, + "loss": 0.6306, + "step": 646 + }, + { + "epoch": 0.32, + "learning_rate": 1.5812155362673895e-05, + "loss": 0.6092, + "step": 647 + }, + { + "epoch": 0.32, + "learning_rate": 1.57989700674967e-05, + "loss": 0.6193, + "step": 648 + }, + { + "epoch": 0.32, + "learning_rate": 1.5785769565196543e-05, + "loss": 0.693, + "step": 649 + }, + { + "epoch": 0.33, + "learning_rate": 1.5772553890390196e-05, + "loss": 0.6228, + "step": 650 + }, + { + "epoch": 0.33, + "learning_rate": 1.5759323077734233e-05, + "loss": 0.6016, + "step": 651 + }, + { + "epoch": 0.33, + "learning_rate": 1.5746077161924905e-05, + "loss": 0.6191, + "step": 652 + }, + { + "epoch": 0.33, + "learning_rate": 1.5732816177698097e-05, + "loss": 0.5908, + "step": 653 + }, + { + "epoch": 0.33, + "learning_rate": 1.5719540159829185e-05, + "loss": 0.6295, + "step": 654 + }, + { + "epoch": 0.33, + "learning_rate": 1.5706249143132982e-05, + "loss": 0.5966, + "step": 655 + }, + { + "epoch": 0.33, + "learning_rate": 1.5692943162463628e-05, + "loss": 0.571, + "step": 656 + }, + { + "epoch": 0.33, + "learning_rate": 1.5679622252714507e-05, + "loss": 0.6001, + "step": 657 + }, + { + "epoch": 0.33, + "learning_rate": 1.5666286448818152e-05, + "loss": 0.6675, + "step": 658 + }, + { + "epoch": 0.33, + "learning_rate": 1.565293578574615e-05, + "loss": 0.5932, + "step": 659 + }, + { + "epoch": 0.33, + "learning_rate": 1.5639570298509067e-05, + "loss": 0.6454, + "step": 660 + }, + { + "epoch": 0.33, + "learning_rate": 1.5626190022156328e-05, + "loss": 0.6011, + "step": 661 + }, + { + "epoch": 0.33, + "learning_rate": 1.5612794991776147e-05, + "loss": 0.5868, + "step": 662 + }, + { + "epoch": 0.33, + "learning_rate": 1.5599385242495437e-05, + "loss": 0.5421, + "step": 663 + }, + { + "epoch": 0.33, + "learning_rate": 1.5585960809479698e-05, + "loss": 0.6044, + "step": 664 + }, + { + "epoch": 0.33, + "learning_rate": 1.5572521727932937e-05, + "loss": 0.5905, + "step": 665 + }, + { + "epoch": 0.33, + "learning_rate": 1.5559068033097583e-05, + "loss": 0.6102, + "step": 666 + }, + { + "epoch": 0.33, + "learning_rate": 1.554559976025438e-05, + "loss": 0.5814, + "step": 667 + }, + { + "epoch": 0.33, + "learning_rate": 
1.5532116944722308e-05, + "loss": 0.6065, + "step": 668 + }, + { + "epoch": 0.33, + "learning_rate": 1.5518619621858474e-05, + "loss": 0.5764, + "step": 669 + }, + { + "epoch": 0.34, + "learning_rate": 1.5505107827058038e-05, + "loss": 0.5827, + "step": 670 + }, + { + "epoch": 0.34, + "learning_rate": 1.5491581595754102e-05, + "loss": 0.5963, + "step": 671 + }, + { + "epoch": 0.34, + "learning_rate": 1.547804096341763e-05, + "loss": 0.5861, + "step": 672 + }, + { + "epoch": 0.34, + "learning_rate": 1.546448596555736e-05, + "loss": 0.6071, + "step": 673 + }, + { + "epoch": 0.34, + "learning_rate": 1.5450916637719683e-05, + "loss": 0.6061, + "step": 674 + }, + { + "epoch": 0.34, + "learning_rate": 1.5437333015488586e-05, + "loss": 0.6383, + "step": 675 + }, + { + "epoch": 0.34, + "learning_rate": 1.5423735134485537e-05, + "loss": 0.5513, + "step": 676 + }, + { + "epoch": 0.34, + "learning_rate": 1.5410123030369387e-05, + "loss": 0.6036, + "step": 677 + }, + { + "epoch": 0.34, + "learning_rate": 1.5396496738836292e-05, + "loss": 0.6438, + "step": 678 + }, + { + "epoch": 0.34, + "learning_rate": 1.5382856295619622e-05, + "loss": 0.6068, + "step": 679 + }, + { + "epoch": 0.34, + "learning_rate": 1.536920173648984e-05, + "loss": 0.6384, + "step": 680 + }, + { + "epoch": 0.34, + "learning_rate": 1.535553309725444e-05, + "loss": 0.5917, + "step": 681 + }, + { + "epoch": 0.34, + "learning_rate": 1.5341850413757834e-05, + "loss": 0.5726, + "step": 682 + }, + { + "epoch": 0.34, + "learning_rate": 1.532815372188126e-05, + "loss": 0.5571, + "step": 683 + }, + { + "epoch": 0.34, + "learning_rate": 1.5314443057542703e-05, + "loss": 0.6066, + "step": 684 + }, + { + "epoch": 0.34, + "learning_rate": 1.530071845669678e-05, + "loss": 0.5798, + "step": 685 + }, + { + "epoch": 0.34, + "learning_rate": 1.5286979955334655e-05, + "loss": 0.6412, + "step": 686 + }, + { + "epoch": 0.34, + "learning_rate": 1.5273227589483945e-05, + "loss": 0.5704, + "step": 687 + }, + { + "epoch": 0.34, + "learning_rate": 1.5259461395208628e-05, + "loss": 0.6293, + "step": 688 + }, + { + "epoch": 0.34, + "learning_rate": 1.5245681408608946e-05, + "loss": 0.541, + "step": 689 + }, + { + "epoch": 0.34, + "learning_rate": 1.52318876658213e-05, + "loss": 0.6042, + "step": 690 + }, + { + "epoch": 0.35, + "learning_rate": 1.5218080203018181e-05, + "loss": 0.5809, + "step": 691 + }, + { + "epoch": 0.35, + "learning_rate": 1.5204259056408046e-05, + "loss": 0.5862, + "step": 692 + }, + { + "epoch": 0.35, + "learning_rate": 1.5190424262235241e-05, + "loss": 0.5623, + "step": 693 + }, + { + "epoch": 0.35, + "learning_rate": 1.5176575856779904e-05, + "loss": 0.5797, + "step": 694 + }, + { + "epoch": 0.35, + "learning_rate": 1.516271387635786e-05, + "loss": 0.6085, + "step": 695 + }, + { + "epoch": 0.35, + "learning_rate": 1.5148838357320537e-05, + "loss": 0.5822, + "step": 696 + }, + { + "epoch": 0.35, + "learning_rate": 1.5134949336054866e-05, + "loss": 0.5458, + "step": 697 + }, + { + "epoch": 0.35, + "learning_rate": 1.512104684898319e-05, + "loss": 0.6001, + "step": 698 + }, + { + "epoch": 0.35, + "learning_rate": 1.5107130932563151e-05, + "loss": 0.5995, + "step": 699 + }, + { + "epoch": 0.35, + "learning_rate": 1.5093201623287631e-05, + "loss": 0.6145, + "step": 700 + }, + { + "epoch": 0.35, + "eval_code_gate_load": [ + 209.05, + 177.55, + 174.45, + 152.3, + 174.4, + 187.35, + 191.25, + 173.65 + ], + "eval_code_loss": 0.34501951932907104, + "eval_code_runtime": 1.7766, + "eval_code_samples_per_second": 562.88, + 
"eval_code_steps_per_second": 35.461, + "step": 700 + }, + { + "epoch": 0.35, + "eval_orca_gate_load": [ + 509.4, + 346.8, + 397.0, + 390.9, + 346.15, + 430.65, + 368.2, + 353.2 + ], + "eval_orca_loss": 0.4732421934604645, + "eval_orca_runtime": 1.9952, + "eval_orca_samples_per_second": 501.201, + "eval_orca_steps_per_second": 31.576, + "step": 700 + }, + { + "epoch": 0.35, + "eval_math_gate_load": [ + 328.55, + 214.9, + 226.85, + 225.9, + 251.3, + 251.0, + 278.75, + 255.45 + ], + "eval_math_loss": 0.45625001192092896, + "eval_math_runtime": 1.8447, + "eval_math_samples_per_second": 542.082, + "eval_math_steps_per_second": 34.151, + "step": 700 + }, + { + "epoch": 0.35, + "eval_sharegpt_gate_load": [ + 1548.45, + 1089.65, + 1285.75, + 1191.1, + 1132.35, + 1374.15, + 1192.4, + 1097.75 + ], + "eval_sharegpt_loss": 0.689746081829071, + "eval_sharegpt_runtime": 2.9874, + "eval_sharegpt_samples_per_second": 334.74, + "eval_sharegpt_steps_per_second": 21.089, + "step": 700 + }, + { + "epoch": 0.35, + "learning_rate": 1.507925895768461e-05, + "loss": 0.5587, + "step": 701 + }, + { + "epoch": 0.35, + "learning_rate": 1.5065302972317108e-05, + "loss": 0.5865, + "step": 702 + }, + { + "epoch": 0.35, + "learning_rate": 1.5051333703783069e-05, + "loss": 0.5186, + "step": 703 + }, + { + "epoch": 0.35, + "learning_rate": 1.5037351188715265e-05, + "loss": 0.6164, + "step": 704 + }, + { + "epoch": 0.35, + "learning_rate": 1.5023355463781221e-05, + "loss": 0.586, + "step": 705 + }, + { + "epoch": 0.35, + "learning_rate": 1.5009346565683088e-05, + "loss": 0.5101, + "step": 706 + }, + { + "epoch": 0.35, + "learning_rate": 1.4995324531157569e-05, + "loss": 0.5553, + "step": 707 + }, + { + "epoch": 0.35, + "learning_rate": 1.4981289396975818e-05, + "loss": 0.5443, + "step": 708 + }, + { + "epoch": 0.35, + "learning_rate": 1.4967241199943332e-05, + "loss": 0.589, + "step": 709 + }, + { + "epoch": 0.35, + "learning_rate": 1.4953179976899878e-05, + "loss": 0.6207, + "step": 710 + }, + { + "epoch": 0.36, + "learning_rate": 1.4939105764719369e-05, + "loss": 0.5548, + "step": 711 + }, + { + "epoch": 0.36, + "learning_rate": 1.4925018600309784e-05, + "loss": 0.5938, + "step": 712 + }, + { + "epoch": 0.36, + "learning_rate": 1.4910918520613074e-05, + "loss": 0.5599, + "step": 713 + }, + { + "epoch": 0.36, + "learning_rate": 1.4896805562605052e-05, + "loss": 0.5786, + "step": 714 + }, + { + "epoch": 0.36, + "learning_rate": 1.4882679763295307e-05, + "loss": 0.5052, + "step": 715 + }, + { + "epoch": 0.36, + "learning_rate": 1.4868541159727097e-05, + "loss": 0.5806, + "step": 716 + }, + { + "epoch": 0.36, + "learning_rate": 1.4854389788977266e-05, + "loss": 0.5824, + "step": 717 + }, + { + "epoch": 0.36, + "learning_rate": 1.4840225688156132e-05, + "loss": 0.5878, + "step": 718 + }, + { + "epoch": 0.36, + "learning_rate": 1.4826048894407396e-05, + "loss": 0.5256, + "step": 719 + }, + { + "epoch": 0.36, + "learning_rate": 1.4811859444908053e-05, + "loss": 0.5338, + "step": 720 + }, + { + "epoch": 0.36, + "learning_rate": 1.4797657376868273e-05, + "loss": 0.5115, + "step": 721 + }, + { + "epoch": 0.36, + "learning_rate": 1.4783442727531328e-05, + "loss": 0.5406, + "step": 722 + }, + { + "epoch": 0.36, + "learning_rate": 1.4769215534173476e-05, + "loss": 0.5402, + "step": 723 + }, + { + "epoch": 0.36, + "learning_rate": 1.4754975834103877e-05, + "loss": 0.5703, + "step": 724 + }, + { + "epoch": 0.36, + "learning_rate": 1.4740723664664483e-05, + "loss": 0.5609, + "step": 725 + }, + { + "epoch": 0.36, + "learning_rate": 
1.4726459063229946e-05, + "loss": 0.5399, + "step": 726 + }, + { + "epoch": 0.36, + "learning_rate": 1.4712182067207516e-05, + "loss": 0.5649, + "step": 727 + }, + { + "epoch": 0.36, + "learning_rate": 1.4697892714036959e-05, + "loss": 0.5274, + "step": 728 + }, + { + "epoch": 0.36, + "learning_rate": 1.4683591041190433e-05, + "loss": 0.5253, + "step": 729 + }, + { + "epoch": 0.36, + "learning_rate": 1.4669277086172406e-05, + "loss": 0.4835, + "step": 730 + }, + { + "epoch": 0.37, + "learning_rate": 1.4654950886519563e-05, + "loss": 0.5794, + "step": 731 + }, + { + "epoch": 0.37, + "learning_rate": 1.4640612479800686e-05, + "loss": 0.521, + "step": 732 + }, + { + "epoch": 0.37, + "learning_rate": 1.4626261903616579e-05, + "loss": 0.5273, + "step": 733 + }, + { + "epoch": 0.37, + "learning_rate": 1.4611899195599952e-05, + "loss": 0.5263, + "step": 734 + }, + { + "epoch": 0.37, + "learning_rate": 1.4597524393415336e-05, + "loss": 0.5614, + "step": 735 + }, + { + "epoch": 0.37, + "learning_rate": 1.4583137534758968e-05, + "loss": 0.5693, + "step": 736 + }, + { + "epoch": 0.37, + "learning_rate": 1.4568738657358715e-05, + "loss": 0.5616, + "step": 737 + }, + { + "epoch": 0.37, + "learning_rate": 1.455432779897395e-05, + "loss": 0.5403, + "step": 738 + }, + { + "epoch": 0.37, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.5085, + "step": 739 + }, + { + "epoch": 0.37, + "learning_rate": 1.4525470290445392e-05, + "loss": 0.5168, + "step": 740 + }, + { + "epoch": 0.37, + "learning_rate": 1.4511023715977048e-05, + "loss": 0.587, + "step": 741 + }, + { + "epoch": 0.37, + "learning_rate": 1.4496565311874902e-05, + "loss": 0.5324, + "step": 742 + }, + { + "epoch": 0.37, + "learning_rate": 1.4482095116054421e-05, + "loss": 0.4944, + "step": 743 + }, + { + "epoch": 0.37, + "learning_rate": 1.4467613166462024e-05, + "loss": 0.5339, + "step": 744 + }, + { + "epoch": 0.37, + "learning_rate": 1.4453119501074924e-05, + "loss": 0.5517, + "step": 745 + }, + { + "epoch": 0.37, + "learning_rate": 1.4438614157901073e-05, + "loss": 0.5189, + "step": 746 + }, + { + "epoch": 0.37, + "learning_rate": 1.4424097174979038e-05, + "loss": 0.535, + "step": 747 + }, + { + "epoch": 0.37, + "learning_rate": 1.4409568590377918e-05, + "loss": 0.5303, + "step": 748 + }, + { + "epoch": 0.37, + "learning_rate": 1.4395028442197231e-05, + "loss": 0.5561, + "step": 749 + }, + { + "epoch": 0.38, + "learning_rate": 1.4380476768566825e-05, + "loss": 0.5305, + "step": 750 + }, + { + "epoch": 0.38, + "learning_rate": 1.4365913607646762e-05, + "loss": 0.496, + "step": 751 + }, + { + "epoch": 0.38, + "learning_rate": 1.4351338997627233e-05, + "loss": 0.5268, + "step": 752 + }, + { + "epoch": 0.38, + "learning_rate": 1.433675297672846e-05, + "loss": 0.5, + "step": 753 + }, + { + "epoch": 0.38, + "learning_rate": 1.4322155583200577e-05, + "loss": 0.517, + "step": 754 + }, + { + "epoch": 0.38, + "learning_rate": 1.4307546855323549e-05, + "loss": 0.5244, + "step": 755 + }, + { + "epoch": 0.38, + "learning_rate": 1.429292683140706e-05, + "loss": 0.4792, + "step": 756 + }, + { + "epoch": 0.38, + "learning_rate": 1.4278295549790419e-05, + "loss": 0.5154, + "step": 757 + }, + { + "epoch": 0.38, + "learning_rate": 1.4263653048842461e-05, + "loss": 0.4767, + "step": 758 + }, + { + "epoch": 0.38, + "learning_rate": 1.424899936696143e-05, + "loss": 0.4858, + "step": 759 + }, + { + "epoch": 0.38, + "learning_rate": 1.4234334542574906e-05, + "loss": 0.4936, + "step": 760 + }, + { + "epoch": 0.38, + "learning_rate": 1.4219658614139674e-05, + 
"loss": 0.4957, + "step": 761 + }, + { + "epoch": 0.38, + "learning_rate": 1.4204971620141648e-05, + "loss": 0.5789, + "step": 762 + }, + { + "epoch": 0.38, + "learning_rate": 1.4190273599095761e-05, + "loss": 0.5306, + "step": 763 + }, + { + "epoch": 0.38, + "learning_rate": 1.4175564589545853e-05, + "loss": 0.4988, + "step": 764 + }, + { + "epoch": 0.38, + "learning_rate": 1.4160844630064596e-05, + "loss": 0.5283, + "step": 765 + }, + { + "epoch": 0.38, + "learning_rate": 1.4146113759253362e-05, + "loss": 0.5026, + "step": 766 + }, + { + "epoch": 0.38, + "learning_rate": 1.4131372015742141e-05, + "loss": 0.5237, + "step": 767 + }, + { + "epoch": 0.38, + "learning_rate": 1.411661943818944e-05, + "loss": 0.5602, + "step": 768 + }, + { + "epoch": 0.38, + "learning_rate": 1.4101856065282174e-05, + "loss": 0.528, + "step": 769 + }, + { + "epoch": 0.39, + "learning_rate": 1.4087081935735565e-05, + "loss": 0.5042, + "step": 770 + }, + { + "epoch": 0.39, + "learning_rate": 1.4072297088293043e-05, + "loss": 0.4794, + "step": 771 + }, + { + "epoch": 0.39, + "learning_rate": 1.4057501561726157e-05, + "loss": 0.5531, + "step": 772 + }, + { + "epoch": 0.39, + "learning_rate": 1.4042695394834435e-05, + "loss": 0.4915, + "step": 773 + }, + { + "epoch": 0.39, + "learning_rate": 1.4027878626445339e-05, + "loss": 0.4861, + "step": 774 + }, + { + "epoch": 0.39, + "learning_rate": 1.4013051295414108e-05, + "loss": 0.4889, + "step": 775 + }, + { + "epoch": 0.39, + "learning_rate": 1.3998213440623691e-05, + "loss": 0.5035, + "step": 776 + }, + { + "epoch": 0.39, + "learning_rate": 1.3983365100984633e-05, + "loss": 0.5034, + "step": 777 + }, + { + "epoch": 0.39, + "learning_rate": 1.3968506315434973e-05, + "loss": 0.4847, + "step": 778 + }, + { + "epoch": 0.39, + "learning_rate": 1.3953637122940147e-05, + "loss": 0.504, + "step": 779 + }, + { + "epoch": 0.39, + "learning_rate": 1.3938757562492873e-05, + "loss": 0.483, + "step": 780 + }, + { + "epoch": 0.39, + "learning_rate": 1.3923867673113067e-05, + "loss": 0.5039, + "step": 781 + }, + { + "epoch": 0.39, + "learning_rate": 1.390896749384773e-05, + "loss": 0.4807, + "step": 782 + }, + { + "epoch": 0.39, + "learning_rate": 1.3894057063770841e-05, + "loss": 0.513, + "step": 783 + }, + { + "epoch": 0.39, + "learning_rate": 1.3879136421983265e-05, + "loss": 0.4903, + "step": 784 + }, + { + "epoch": 0.39, + "learning_rate": 1.3864205607612648e-05, + "loss": 0.5104, + "step": 785 + }, + { + "epoch": 0.39, + "learning_rate": 1.3849264659813314e-05, + "loss": 0.4922, + "step": 786 + }, + { + "epoch": 0.39, + "learning_rate": 1.3834313617766146e-05, + "loss": 0.5198, + "step": 787 + }, + { + "epoch": 0.39, + "learning_rate": 1.3819352520678519e-05, + "loss": 0.4577, + "step": 788 + }, + { + "epoch": 0.39, + "learning_rate": 1.380438140778416e-05, + "loss": 0.4697, + "step": 789 + }, + { + "epoch": 0.4, + "learning_rate": 1.378940031834307e-05, + "loss": 0.4832, + "step": 790 + }, + { + "epoch": 0.4, + "learning_rate": 1.3774409291641407e-05, + "loss": 0.4664, + "step": 791 + }, + { + "epoch": 0.4, + "learning_rate": 1.3759408366991391e-05, + "loss": 0.492, + "step": 792 + }, + { + "epoch": 0.4, + "learning_rate": 1.3744397583731204e-05, + "loss": 0.496, + "step": 793 + }, + { + "epoch": 0.4, + "learning_rate": 1.3729376981224869e-05, + "loss": 0.498, + "step": 794 + }, + { + "epoch": 0.4, + "learning_rate": 1.3714346598862168e-05, + "loss": 0.5533, + "step": 795 + }, + { + "epoch": 0.4, + "learning_rate": 1.3699306476058523e-05, + "loss": 0.4858, + "step": 796 + }, + 
{ + "epoch": 0.4, + "learning_rate": 1.3684256652254906e-05, + "loss": 0.511, + "step": 797 + }, + { + "epoch": 0.4, + "learning_rate": 1.3669197166917723e-05, + "loss": 0.5326, + "step": 798 + }, + { + "epoch": 0.4, + "learning_rate": 1.365412805953872e-05, + "loss": 0.5154, + "step": 799 + }, + { + "epoch": 0.4, + "learning_rate": 1.3639049369634878e-05, + "loss": 0.5222, + "step": 800 + }, + { + "epoch": 0.4, + "eval_code_gate_load": [ + 212.75, + 176.15, + 177.3, + 159.4, + 177.95, + 178.35, + 186.4, + 171.7 + ], + "eval_code_loss": 0.35273438692092896, + "eval_code_runtime": 1.7754, + "eval_code_samples_per_second": 563.24, + "eval_code_steps_per_second": 35.484, + "step": 800 + }, + { + "epoch": 0.4, + "eval_orca_gate_load": [ + 506.35, + 346.1, + 403.4, + 400.65, + 347.1, + 420.75, + 365.05, + 352.9 + ], + "eval_orca_loss": 0.45512694120407104, + "eval_orca_runtime": 2.0061, + "eval_orca_samples_per_second": 498.484, + "eval_orca_steps_per_second": 31.405, + "step": 800 + }, + { + "epoch": 0.4, + "eval_math_gate_load": [ + 326.15, + 213.85, + 229.75, + 231.9, + 256.15, + 245.8, + 275.0, + 254.1 + ], + "eval_math_loss": 0.4581542909145355, + "eval_math_runtime": 1.8544, + "eval_math_samples_per_second": 539.26, + "eval_math_steps_per_second": 33.973, + "step": 800 + }, + { + "epoch": 0.4, + "eval_sharegpt_gate_load": [ + 1533.15, + 1088.05, + 1310.2, + 1211.85, + 1135.75, + 1350.35, + 1168.4, + 1113.85 + ], + "eval_sharegpt_loss": 0.5342773199081421, + "eval_sharegpt_runtime": 2.9927, + "eval_sharegpt_samples_per_second": 334.144, + "eval_sharegpt_steps_per_second": 21.051, + "step": 800 + }, + { + "epoch": 0.4, + "learning_rate": 1.3623961136748296e-05, + "loss": 0.5299, + "step": 801 + }, + { + "epoch": 0.4, + "learning_rate": 1.3608863400446113e-05, + "loss": 0.485, + "step": 802 + }, + { + "epoch": 0.4, + "learning_rate": 1.3593756200320373e-05, + "loss": 0.4949, + "step": 803 + }, + { + "epoch": 0.4, + "learning_rate": 1.357863957598796e-05, + "loss": 0.5395, + "step": 804 + }, + { + "epoch": 0.4, + "learning_rate": 1.356351356709045e-05, + "loss": 0.5138, + "step": 805 + }, + { + "epoch": 0.4, + "learning_rate": 1.3548378213294042e-05, + "loss": 0.5286, + "step": 806 + }, + { + "epoch": 0.4, + "learning_rate": 1.3533233554289433e-05, + "loss": 0.5311, + "step": 807 + }, + { + "epoch": 0.4, + "learning_rate": 1.3518079629791725e-05, + "loss": 0.5093, + "step": 808 + }, + { + "epoch": 0.4, + "learning_rate": 1.3502916479540327e-05, + "loss": 0.4513, + "step": 809 + }, + { + "epoch": 0.41, + "learning_rate": 1.3487744143298822e-05, + "loss": 0.4592, + "step": 810 + }, + { + "epoch": 0.41, + "learning_rate": 1.3472562660854902e-05, + "loss": 0.4641, + "step": 811 + }, + { + "epoch": 0.41, + "learning_rate": 1.345737207202023e-05, + "loss": 0.5504, + "step": 812 + }, + { + "epoch": 0.41, + "learning_rate": 1.3442172416630355e-05, + "loss": 0.5057, + "step": 813 + }, + { + "epoch": 0.41, + "learning_rate": 1.3426963734544601e-05, + "loss": 0.4988, + "step": 814 + }, + { + "epoch": 0.41, + "learning_rate": 1.3411746065645961e-05, + "loss": 0.4449, + "step": 815 + }, + { + "epoch": 0.41, + "learning_rate": 1.3396519449841006e-05, + "loss": 0.4707, + "step": 816 + }, + { + "epoch": 0.41, + "learning_rate": 1.3381283927059751e-05, + "loss": 0.476, + "step": 817 + }, + { + "epoch": 0.41, + "learning_rate": 1.3366039537255589e-05, + "loss": 0.5192, + "step": 818 + }, + { + "epoch": 0.41, + "learning_rate": 1.3350786320405145e-05, + "loss": 0.494, + "step": 819 + }, + { + "epoch": 0.41, + 
"learning_rate": 1.3335524316508208e-05, + "loss": 0.4846, + "step": 820 + }, + { + "epoch": 0.41, + "learning_rate": 1.3320253565587602e-05, + "loss": 0.4849, + "step": 821 + }, + { + "epoch": 0.41, + "learning_rate": 1.3304974107689088e-05, + "loss": 0.5147, + "step": 822 + }, + { + "epoch": 0.41, + "learning_rate": 1.328968598288127e-05, + "loss": 0.5149, + "step": 823 + }, + { + "epoch": 0.41, + "learning_rate": 1.3274389231255466e-05, + "loss": 0.5002, + "step": 824 + }, + { + "epoch": 0.41, + "learning_rate": 1.3259083892925633e-05, + "loss": 0.5237, + "step": 825 + }, + { + "epoch": 0.41, + "learning_rate": 1.3243770008028225e-05, + "loss": 0.4745, + "step": 826 + }, + { + "epoch": 0.41, + "learning_rate": 1.3228447616722128e-05, + "loss": 0.5389, + "step": 827 + }, + { + "epoch": 0.41, + "learning_rate": 1.3213116759188525e-05, + "loss": 0.4719, + "step": 828 + }, + { + "epoch": 0.41, + "learning_rate": 1.31977774756308e-05, + "loss": 0.5096, + "step": 829 + }, + { + "epoch": 0.41, + "learning_rate": 1.3182429806274442e-05, + "loss": 0.5167, + "step": 830 + }, + { + "epoch": 0.42, + "learning_rate": 1.3167073791366915e-05, + "loss": 0.494, + "step": 831 + }, + { + "epoch": 0.42, + "learning_rate": 1.3151709471177589e-05, + "loss": 0.5393, + "step": 832 + }, + { + "epoch": 0.42, + "learning_rate": 1.3136336885997591e-05, + "loss": 0.5502, + "step": 833 + }, + { + "epoch": 0.42, + "learning_rate": 1.3120956076139746e-05, + "loss": 0.4955, + "step": 834 + }, + { + "epoch": 0.42, + "learning_rate": 1.3105567081938423e-05, + "loss": 0.5306, + "step": 835 + }, + { + "epoch": 0.42, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.4821, + "step": 836 + }, + { + "epoch": 0.42, + "learning_rate": 1.3074764701950095e-05, + "loss": 0.4851, + "step": 837 + }, + { + "epoch": 0.42, + "learning_rate": 1.305935139693874e-05, + "loss": 0.5209, + "step": 838 + }, + { + "epoch": 0.42, + "learning_rate": 1.3043930069134998e-05, + "loss": 0.4816, + "step": 839 + }, + { + "epoch": 0.42, + "learning_rate": 1.3028500758979507e-05, + "loss": 0.4333, + "step": 840 + }, + { + "epoch": 0.42, + "learning_rate": 1.3013063506933838e-05, + "loss": 0.472, + "step": 841 + }, + { + "epoch": 0.42, + "learning_rate": 1.299761835348038e-05, + "loss": 0.4837, + "step": 842 + }, + { + "epoch": 0.42, + "learning_rate": 1.2982165339122248e-05, + "loss": 0.5189, + "step": 843 + }, + { + "epoch": 0.42, + "learning_rate": 1.296670450438317e-05, + "loss": 0.5207, + "step": 844 + }, + { + "epoch": 0.42, + "learning_rate": 1.2951235889807386e-05, + "loss": 0.4656, + "step": 845 + }, + { + "epoch": 0.42, + "learning_rate": 1.2935759535959528e-05, + "loss": 0.537, + "step": 846 + }, + { + "epoch": 0.42, + "learning_rate": 1.2920275483424538e-05, + "loss": 0.5239, + "step": 847 + }, + { + "epoch": 0.42, + "learning_rate": 1.2904783772807534e-05, + "loss": 0.4885, + "step": 848 + }, + { + "epoch": 0.42, + "learning_rate": 1.2889284444733722e-05, + "loss": 0.5081, + "step": 849 + }, + { + "epoch": 0.42, + "learning_rate": 1.2873777539848284e-05, + "loss": 0.4915, + "step": 850 + }, + { + "epoch": 0.43, + "learning_rate": 1.2858263098816265e-05, + "loss": 0.4385, + "step": 851 + }, + { + "epoch": 0.43, + "learning_rate": 1.2842741162322487e-05, + "loss": 0.5104, + "step": 852 + }, + { + "epoch": 0.43, + "learning_rate": 1.282721177107141e-05, + "loss": 0.5064, + "step": 853 + }, + { + "epoch": 0.43, + "learning_rate": 1.2811674965787058e-05, + "loss": 0.4651, + "step": 854 + }, + { + "epoch": 0.43, + "learning_rate": 
1.279613078721289e-05, + "loss": 0.5029, + "step": 855 + }, + { + "epoch": 0.43, + "learning_rate": 1.2780579276111702e-05, + "loss": 0.5042, + "step": 856 + }, + { + "epoch": 0.43, + "learning_rate": 1.276502047326552e-05, + "loss": 0.5048, + "step": 857 + }, + { + "epoch": 0.43, + "learning_rate": 1.2749454419475486e-05, + "loss": 0.4466, + "step": 858 + }, + { + "epoch": 0.43, + "learning_rate": 1.273388115556177e-05, + "loss": 0.5513, + "step": 859 + }, + { + "epoch": 0.43, + "learning_rate": 1.2718300722363431e-05, + "loss": 0.5231, + "step": 860 + }, + { + "epoch": 0.43, + "learning_rate": 1.2702713160738344e-05, + "loss": 0.4755, + "step": 861 + }, + { + "epoch": 0.43, + "learning_rate": 1.2687118511563075e-05, + "loss": 0.5436, + "step": 862 + }, + { + "epoch": 0.43, + "learning_rate": 1.2671516815732767e-05, + "loss": 0.4676, + "step": 863 + }, + { + "epoch": 0.43, + "learning_rate": 1.2655908114161053e-05, + "loss": 0.5156, + "step": 864 + }, + { + "epoch": 0.43, + "learning_rate": 1.2640292447779932e-05, + "loss": 0.4917, + "step": 865 + }, + { + "epoch": 0.43, + "learning_rate": 1.2624669857539669e-05, + "loss": 0.5152, + "step": 866 + }, + { + "epoch": 0.43, + "learning_rate": 1.2609040384408685e-05, + "loss": 0.4448, + "step": 867 + }, + { + "epoch": 0.43, + "learning_rate": 1.2593404069373452e-05, + "loss": 0.4678, + "step": 868 + }, + { + "epoch": 0.43, + "learning_rate": 1.2577760953438382e-05, + "loss": 0.4526, + "step": 869 + }, + { + "epoch": 0.43, + "learning_rate": 1.2562111077625723e-05, + "loss": 0.4869, + "step": 870 + }, + { + "epoch": 0.44, + "learning_rate": 1.2546454482975454e-05, + "loss": 0.4524, + "step": 871 + }, + { + "epoch": 0.44, + "learning_rate": 1.2530791210545163e-05, + "loss": 0.4979, + "step": 872 + }, + { + "epoch": 0.44, + "learning_rate": 1.251512130140996e-05, + "loss": 0.4931, + "step": 873 + }, + { + "epoch": 0.44, + "learning_rate": 1.2499444796662354e-05, + "loss": 0.4952, + "step": 874 + }, + { + "epoch": 0.44, + "learning_rate": 1.248376173741215e-05, + "loss": 0.476, + "step": 875 + }, + { + "epoch": 0.44, + "learning_rate": 1.2468072164786342e-05, + "loss": 0.5432, + "step": 876 + }, + { + "epoch": 0.44, + "learning_rate": 1.2452376119929009e-05, + "loss": 0.5156, + "step": 877 + }, + { + "epoch": 0.44, + "learning_rate": 1.2436673644001196e-05, + "loss": 0.4924, + "step": 878 + }, + { + "epoch": 0.44, + "learning_rate": 1.2420964778180815e-05, + "loss": 0.4994, + "step": 879 + }, + { + "epoch": 0.44, + "learning_rate": 1.2405249563662539e-05, + "loss": 0.4219, + "step": 880 + }, + { + "epoch": 0.44, + "learning_rate": 1.2389528041657679e-05, + "loss": 0.4999, + "step": 881 + }, + { + "epoch": 0.44, + "learning_rate": 1.23738002533941e-05, + "loss": 0.5012, + "step": 882 + }, + { + "epoch": 0.44, + "learning_rate": 1.2358066240116092e-05, + "loss": 0.4539, + "step": 883 + }, + { + "epoch": 0.44, + "learning_rate": 1.2342326043084268e-05, + "loss": 0.4784, + "step": 884 + }, + { + "epoch": 0.44, + "learning_rate": 1.2326579703575464e-05, + "loss": 0.5036, + "step": 885 + }, + { + "epoch": 0.44, + "learning_rate": 1.2310827262882614e-05, + "loss": 0.5389, + "step": 886 + }, + { + "epoch": 0.44, + "learning_rate": 1.2295068762314661e-05, + "loss": 0.5221, + "step": 887 + }, + { + "epoch": 0.44, + "learning_rate": 1.2279304243196438e-05, + "loss": 0.4863, + "step": 888 + }, + { + "epoch": 0.44, + "learning_rate": 1.2263533746868552e-05, + "loss": 0.47, + "step": 889 + }, + { + "epoch": 0.45, + "learning_rate": 1.2247757314687296e-05, + 
"loss": 0.5165, + "step": 890 + }, + { + "epoch": 0.45, + "learning_rate": 1.2231974988024522e-05, + "loss": 0.4943, + "step": 891 + }, + { + "epoch": 0.45, + "learning_rate": 1.2216186808267544e-05, + "loss": 0.4758, + "step": 892 + }, + { + "epoch": 0.45, + "learning_rate": 1.2200392816819022e-05, + "loss": 0.4999, + "step": 893 + }, + { + "epoch": 0.45, + "learning_rate": 1.2184593055096853e-05, + "loss": 0.5106, + "step": 894 + }, + { + "epoch": 0.45, + "learning_rate": 1.2168787564534078e-05, + "loss": 0.476, + "step": 895 + }, + { + "epoch": 0.45, + "learning_rate": 1.215297638657875e-05, + "loss": 0.5206, + "step": 896 + }, + { + "epoch": 0.45, + "learning_rate": 1.2137159562693839e-05, + "loss": 0.4682, + "step": 897 + }, + { + "epoch": 0.45, + "learning_rate": 1.2121337134357121e-05, + "loss": 0.5406, + "step": 898 + }, + { + "epoch": 0.45, + "learning_rate": 1.2105509143061072e-05, + "loss": 0.4805, + "step": 899 + }, + { + "epoch": 0.45, + "learning_rate": 1.2089675630312755e-05, + "loss": 0.5203, + "step": 900 + }, + { + "epoch": 0.45, + "eval_code_gate_load": [ + 205.5, + 177.4, + 183.4, + 157.6, + 179.75, + 179.4, + 190.55, + 166.4 + ], + "eval_code_loss": 0.35786134004592896, + "eval_code_runtime": 1.7808, + "eval_code_samples_per_second": 561.533, + "eval_code_steps_per_second": 35.377, + "step": 900 + }, + { + "epoch": 0.45, + "eval_orca_gate_load": [ + 499.1, + 348.55, + 403.15, + 395.45, + 351.2, + 424.25, + 368.4, + 352.2 + ], + "eval_orca_loss": 0.46074217557907104, + "eval_orca_runtime": 1.999, + "eval_orca_samples_per_second": 500.239, + "eval_orca_steps_per_second": 31.515, + "step": 900 + }, + { + "epoch": 0.45, + "eval_math_gate_load": [ + 316.75, + 225.15, + 231.5, + 227.4, + 256.35, + 248.25, + 277.4, + 249.9 + ], + "eval_math_loss": 0.4561523497104645, + "eval_math_runtime": 1.8388, + "eval_math_samples_per_second": 543.841, + "eval_math_steps_per_second": 34.262, + "step": 900 + }, + { + "epoch": 0.45, + "eval_sharegpt_gate_load": [ + 1524.45, + 1103.4, + 1313.4, + 1203.9, + 1138.05, + 1354.3, + 1174.25, + 1099.85 + ], + "eval_sharegpt_loss": 0.5375000238418579, + "eval_sharegpt_runtime": 2.9992, + "eval_sharegpt_samples_per_second": 333.421, + "eval_sharegpt_steps_per_second": 21.006, + "step": 900 + }, + { + "epoch": 0.45, + "learning_rate": 1.2073836637633705e-05, + "loss": 0.4337, + "step": 901 + }, + { + "epoch": 0.45, + "learning_rate": 1.2057992206559837e-05, + "loss": 0.4969, + "step": 902 + }, + { + "epoch": 0.45, + "learning_rate": 1.204214237864133e-05, + "loss": 0.5045, + "step": 903 + }, + { + "epoch": 0.45, + "learning_rate": 1.2026287195442503e-05, + "loss": 0.4796, + "step": 904 + }, + { + "epoch": 0.45, + "learning_rate": 1.2010426698541728e-05, + "loss": 0.5411, + "step": 905 + }, + { + "epoch": 0.45, + "learning_rate": 1.199456092953131e-05, + "loss": 0.5126, + "step": 906 + }, + { + "epoch": 0.45, + "learning_rate": 1.197868993001738e-05, + "loss": 0.5036, + "step": 907 + }, + { + "epoch": 0.45, + "learning_rate": 1.1962813741619777e-05, + "loss": 0.4752, + "step": 908 + }, + { + "epoch": 0.45, + "learning_rate": 1.194693240597196e-05, + "loss": 0.5218, + "step": 909 + }, + { + "epoch": 0.46, + "learning_rate": 1.1931045964720882e-05, + "loss": 0.4636, + "step": 910 + }, + { + "epoch": 0.46, + "learning_rate": 1.1915154459526876e-05, + "loss": 0.5427, + "step": 911 + }, + { + "epoch": 0.46, + "learning_rate": 1.189925793206357e-05, + "loss": 0.5337, + "step": 912 + }, + { + "epoch": 0.46, + "learning_rate": 1.188335642401775e-05, + "loss": 
0.5242, + "step": 913 + }, + { + "epoch": 0.46, + "learning_rate": 1.1867449977089264e-05, + "loss": 0.4671, + "step": 914 + }, + { + "epoch": 0.46, + "learning_rate": 1.1851538632990922e-05, + "loss": 0.5247, + "step": 915 + }, + { + "epoch": 0.46, + "learning_rate": 1.1835622433448361e-05, + "loss": 0.4585, + "step": 916 + }, + { + "epoch": 0.46, + "learning_rate": 1.181970142019997e-05, + "loss": 0.5068, + "step": 917 + }, + { + "epoch": 0.46, + "learning_rate": 1.1803775634996735e-05, + "loss": 0.5094, + "step": 918 + }, + { + "epoch": 0.46, + "learning_rate": 1.1787845119602184e-05, + "loss": 0.5033, + "step": 919 + }, + { + "epoch": 0.46, + "learning_rate": 1.177190991579223e-05, + "loss": 0.5635, + "step": 920 + }, + { + "epoch": 0.46, + "learning_rate": 1.1755970065355087e-05, + "loss": 0.4865, + "step": 921 + }, + { + "epoch": 0.46, + "learning_rate": 1.174002561009116e-05, + "loss": 0.5134, + "step": 922 + }, + { + "epoch": 0.46, + "learning_rate": 1.1724076591812919e-05, + "loss": 0.5193, + "step": 923 + }, + { + "epoch": 0.46, + "learning_rate": 1.1708123052344803e-05, + "loss": 0.4283, + "step": 924 + }, + { + "epoch": 0.46, + "learning_rate": 1.1692165033523117e-05, + "loss": 0.5168, + "step": 925 + }, + { + "epoch": 0.46, + "learning_rate": 1.1676202577195901e-05, + "loss": 0.5103, + "step": 926 + }, + { + "epoch": 0.46, + "learning_rate": 1.1660235725222835e-05, + "loss": 0.4611, + "step": 927 + }, + { + "epoch": 0.46, + "learning_rate": 1.164426451947513e-05, + "loss": 0.5215, + "step": 928 + }, + { + "epoch": 0.46, + "learning_rate": 1.1628289001835405e-05, + "loss": 0.5178, + "step": 929 + }, + { + "epoch": 0.47, + "learning_rate": 1.1612309214197599e-05, + "loss": 0.4999, + "step": 930 + }, + { + "epoch": 0.47, + "learning_rate": 1.1596325198466841e-05, + "loss": 0.5103, + "step": 931 + }, + { + "epoch": 0.47, + "learning_rate": 1.1580336996559343e-05, + "loss": 0.4791, + "step": 932 + }, + { + "epoch": 0.47, + "learning_rate": 1.156434465040231e-05, + "loss": 0.4663, + "step": 933 + }, + { + "epoch": 0.47, + "learning_rate": 1.1548348201933799e-05, + "loss": 0.5155, + "step": 934 + }, + { + "epoch": 0.47, + "learning_rate": 1.1532347693102632e-05, + "loss": 0.4827, + "step": 935 + }, + { + "epoch": 0.47, + "learning_rate": 1.151634316586828e-05, + "loss": 0.4985, + "step": 936 + }, + { + "epoch": 0.47, + "learning_rate": 1.150033466220075e-05, + "loss": 0.4664, + "step": 937 + }, + { + "epoch": 0.47, + "learning_rate": 1.1484322224080474e-05, + "loss": 0.4885, + "step": 938 + }, + { + "epoch": 0.47, + "learning_rate": 1.1468305893498204e-05, + "loss": 0.4893, + "step": 939 + }, + { + "epoch": 0.47, + "learning_rate": 1.1452285712454905e-05, + "loss": 0.5168, + "step": 940 + }, + { + "epoch": 0.47, + "learning_rate": 1.1436261722961627e-05, + "loss": 0.5049, + "step": 941 + }, + { + "epoch": 0.47, + "learning_rate": 1.1420233967039423e-05, + "loss": 0.464, + "step": 942 + }, + { + "epoch": 0.47, + "learning_rate": 1.1404202486719205e-05, + "loss": 0.4962, + "step": 943 + }, + { + "epoch": 0.47, + "learning_rate": 1.138816732404167e-05, + "loss": 0.5134, + "step": 944 + }, + { + "epoch": 0.47, + "learning_rate": 1.1372128521057155e-05, + "loss": 0.5082, + "step": 945 + }, + { + "epoch": 0.47, + "learning_rate": 1.1356086119825553e-05, + "loss": 0.4526, + "step": 946 + }, + { + "epoch": 0.47, + "learning_rate": 1.1340040162416197e-05, + "loss": 0.5121, + "step": 947 + }, + { + "epoch": 0.47, + "learning_rate": 1.1323990690907734e-05, + "loss": 0.4691, + "step": 948 + }, + 
{ + "epoch": 0.47, + "learning_rate": 1.1307937747388034e-05, + "loss": 0.4597, + "step": 949 + }, + { + "epoch": 0.47, + "learning_rate": 1.1291881373954066e-05, + "loss": 0.4535, + "step": 950 + }, + { + "epoch": 0.48, + "learning_rate": 1.1275821612711803e-05, + "loss": 0.4899, + "step": 951 + }, + { + "epoch": 0.48, + "learning_rate": 1.1259758505776092e-05, + "loss": 0.5067, + "step": 952 + }, + { + "epoch": 0.48, + "learning_rate": 1.1243692095270565e-05, + "loss": 0.463, + "step": 953 + }, + { + "epoch": 0.48, + "learning_rate": 1.1227622423327501e-05, + "loss": 0.5201, + "step": 954 + }, + { + "epoch": 0.48, + "learning_rate": 1.1211549532087749e-05, + "loss": 0.4925, + "step": 955 + }, + { + "epoch": 0.48, + "learning_rate": 1.119547346370059e-05, + "loss": 0.5388, + "step": 956 + }, + { + "epoch": 0.48, + "learning_rate": 1.1179394260323639e-05, + "loss": 0.4614, + "step": 957 + }, + { + "epoch": 0.48, + "learning_rate": 1.1163311964122733e-05, + "loss": 0.4782, + "step": 958 + }, + { + "epoch": 0.48, + "learning_rate": 1.114722661727182e-05, + "loss": 0.4644, + "step": 959 + }, + { + "epoch": 0.48, + "learning_rate": 1.1131138261952845e-05, + "loss": 0.4268, + "step": 960 + }, + { + "epoch": 0.48, + "learning_rate": 1.1115046940355643e-05, + "loss": 0.4545, + "step": 961 + }, + { + "epoch": 0.48, + "learning_rate": 1.109895269467783e-05, + "loss": 0.4529, + "step": 962 + }, + { + "epoch": 0.48, + "learning_rate": 1.1082855567124693e-05, + "loss": 0.5314, + "step": 963 + }, + { + "epoch": 0.48, + "learning_rate": 1.1066755599909065e-05, + "loss": 0.4394, + "step": 964 + }, + { + "epoch": 0.48, + "learning_rate": 1.105065283525124e-05, + "loss": 0.4943, + "step": 965 + }, + { + "epoch": 0.48, + "learning_rate": 1.1034547315378838e-05, + "loss": 0.4995, + "step": 966 + }, + { + "epoch": 0.48, + "learning_rate": 1.101843908252671e-05, + "loss": 0.5093, + "step": 967 + }, + { + "epoch": 0.48, + "learning_rate": 1.1002328178936813e-05, + "loss": 0.4655, + "step": 968 + }, + { + "epoch": 0.48, + "learning_rate": 1.0986214646858115e-05, + "loss": 0.47, + "step": 969 + }, + { + "epoch": 0.48, + "learning_rate": 1.0970098528546482e-05, + "loss": 0.5542, + "step": 970 + }, + { + "epoch": 0.49, + "learning_rate": 1.0953979866264549e-05, + "loss": 0.4962, + "step": 971 + }, + { + "epoch": 0.49, + "learning_rate": 1.0937858702281631e-05, + "loss": 0.4945, + "step": 972 + }, + { + "epoch": 0.49, + "learning_rate": 1.0921735078873599e-05, + "loss": 0.4754, + "step": 973 + }, + { + "epoch": 0.49, + "learning_rate": 1.090560903832278e-05, + "loss": 0.4328, + "step": 974 + }, + { + "epoch": 0.49, + "learning_rate": 1.088948062291783e-05, + "loss": 0.4566, + "step": 975 + }, + { + "epoch": 0.49, + "learning_rate": 1.087334987495364e-05, + "loss": 0.4559, + "step": 976 + }, + { + "epoch": 0.49, + "learning_rate": 1.0857216836731221e-05, + "loss": 0.5433, + "step": 977 + }, + { + "epoch": 0.49, + "learning_rate": 1.0841081550557577e-05, + "loss": 0.4789, + "step": 978 + }, + { + "epoch": 0.49, + "learning_rate": 1.0824944058745623e-05, + "loss": 0.4684, + "step": 979 + }, + { + "epoch": 0.49, + "learning_rate": 1.0808804403614044e-05, + "loss": 0.4446, + "step": 980 + }, + { + "epoch": 0.49, + "learning_rate": 1.0792662627487207e-05, + "loss": 0.4864, + "step": 981 + }, + { + "epoch": 0.49, + "learning_rate": 1.0776518772695035e-05, + "loss": 0.4218, + "step": 982 + }, + { + "epoch": 0.49, + "learning_rate": 1.0760372881572904e-05, + "loss": 0.4892, + "step": 983 + }, + { + "epoch": 0.49, + 
"learning_rate": 1.0744224996461541e-05, + "loss": 0.4257, + "step": 984 + }, + { + "epoch": 0.49, + "learning_rate": 1.0728075159706881e-05, + "loss": 0.5109, + "step": 985 + }, + { + "epoch": 0.49, + "learning_rate": 1.0711923413659995e-05, + "loss": 0.4799, + "step": 986 + }, + { + "epoch": 0.49, + "learning_rate": 1.069576980067695e-05, + "loss": 0.4825, + "step": 987 + }, + { + "epoch": 0.49, + "learning_rate": 1.0679614363118718e-05, + "loss": 0.4711, + "step": 988 + }, + { + "epoch": 0.49, + "learning_rate": 1.0663457143351044e-05, + "loss": 0.528, + "step": 989 + }, + { + "epoch": 0.49, + "learning_rate": 1.0647298183744359e-05, + "loss": 0.4973, + "step": 990 + }, + { + "epoch": 0.5, + "learning_rate": 1.0631137526673647e-05, + "loss": 0.5144, + "step": 991 + }, + { + "epoch": 0.5, + "learning_rate": 1.061497521451835e-05, + "loss": 0.5067, + "step": 992 + }, + { + "epoch": 0.5, + "learning_rate": 1.0598811289662243e-05, + "loss": 0.5282, + "step": 993 + }, + { + "epoch": 0.5, + "learning_rate": 1.0582645794493337e-05, + "loss": 0.4935, + "step": 994 + }, + { + "epoch": 0.5, + "learning_rate": 1.0566478771403763e-05, + "loss": 0.4667, + "step": 995 + }, + { + "epoch": 0.5, + "learning_rate": 1.055031026278965e-05, + "loss": 0.4847, + "step": 996 + }, + { + "epoch": 0.5, + "learning_rate": 1.0534140311051026e-05, + "loss": 0.4582, + "step": 997 + }, + { + "epoch": 0.5, + "learning_rate": 1.0517968958591705e-05, + "loss": 0.502, + "step": 998 + }, + { + "epoch": 0.5, + "learning_rate": 1.0501796247819176e-05, + "loss": 0.4594, + "step": 999 + }, + { + "epoch": 0.5, + "learning_rate": 1.0485622221144485e-05, + "loss": 0.49, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_code_gate_load": [ + 209.5, + 175.9, + 182.2, + 159.9, + 172.05, + 178.75, + 190.7, + 171.0 + ], + "eval_code_loss": 0.35546875, + "eval_code_runtime": 1.7833, + "eval_code_samples_per_second": 560.749, + "eval_code_steps_per_second": 35.327, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_orca_gate_load": [ + 501.25, + 343.4, + 403.4, + 400.1, + 346.3, + 423.95, + 366.8, + 357.1 + ], + "eval_orca_loss": 0.452880859375, + "eval_orca_runtime": 2.0036, + "eval_orca_samples_per_second": 499.113, + "eval_orca_steps_per_second": 31.444, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_math_gate_load": [ + 322.65, + 220.6, + 229.0, + 235.25, + 246.65, + 247.5, + 277.5, + 253.55 + ], + "eval_math_loss": 0.4518066346645355, + "eval_math_runtime": 1.847, + "eval_math_samples_per_second": 541.417, + "eval_math_steps_per_second": 34.109, + "step": 1000 + }, + { + "epoch": 0.5, + "eval_sharegpt_gate_load": [ + 1521.05, + 1084.45, + 1307.85, + 1203.0, + 1140.15, + 1363.05, + 1179.85, + 1112.2 + ], + "eval_sharegpt_loss": 0.5404297113418579, + "eval_sharegpt_runtime": 3.0138, + "eval_sharegpt_samples_per_second": 331.805, + "eval_sharegpt_steps_per_second": 20.904, + "step": 1000 + }, + { + "epoch": 0.5, + "learning_rate": 1.046944692098213e-05, + "loss": 0.487, + "step": 1001 + }, + { + "epoch": 0.5, + "learning_rate": 1.0453270389749956e-05, + "loss": 0.4748, + "step": 1002 + }, + { + "epoch": 0.5, + "learning_rate": 1.0437092669869025e-05, + "loss": 0.4349, + "step": 1003 + }, + { + "epoch": 0.5, + "learning_rate": 1.0420913803763522e-05, + "loss": 0.4664, + "step": 1004 + }, + { + "epoch": 0.5, + "learning_rate": 1.0404733833860639e-05, + "loss": 0.4951, + "step": 1005 + }, + { + "epoch": 0.5, + "learning_rate": 1.0388552802590461e-05, + "loss": 0.4668, + "step": 1006 + }, + { + "epoch": 0.5, + "learning_rate": 
1.0372370752385854e-05, + "loss": 0.4626, + "step": 1007 + }, + { + "epoch": 0.5, + "learning_rate": 1.0356187725682359e-05, + "loss": 0.4278, + "step": 1008 + }, + { + "epoch": 0.5, + "learning_rate": 1.0340003764918078e-05, + "loss": 0.5119, + "step": 1009 + }, + { + "epoch": 0.51, + "learning_rate": 1.0323818912533561e-05, + "loss": 0.4633, + "step": 1010 + }, + { + "epoch": 0.51, + "learning_rate": 1.0307633210971697e-05, + "loss": 0.461, + "step": 1011 + }, + { + "epoch": 0.51, + "learning_rate": 1.0291446702677598e-05, + "loss": 0.5217, + "step": 1012 + }, + { + "epoch": 0.51, + "learning_rate": 1.0275259430098502e-05, + "loss": 0.4573, + "step": 1013 + }, + { + "epoch": 0.51, + "learning_rate": 1.0259071435683636e-05, + "loss": 0.5034, + "step": 1014 + }, + { + "epoch": 0.51, + "learning_rate": 1.0242882761884132e-05, + "loss": 0.4356, + "step": 1015 + }, + { + "epoch": 0.51, + "learning_rate": 1.02266934511529e-05, + "loss": 0.4805, + "step": 1016 + }, + { + "epoch": 0.51, + "learning_rate": 1.0210503545944522e-05, + "loss": 0.4934, + "step": 1017 + }, + { + "epoch": 0.51, + "learning_rate": 1.0194313088715135e-05, + "loss": 0.4913, + "step": 1018 + }, + { + "epoch": 0.51, + "learning_rate": 1.0178122121922324e-05, + "loss": 0.4323, + "step": 1019 + }, + { + "epoch": 0.51, + "learning_rate": 1.0161930688025018e-05, + "loss": 0.4759, + "step": 1020 + }, + { + "epoch": 0.51, + "learning_rate": 1.0145738829483354e-05, + "loss": 0.4667, + "step": 1021 + }, + { + "epoch": 0.51, + "learning_rate": 1.0129546588758605e-05, + "loss": 0.4452, + "step": 1022 + }, + { + "epoch": 0.51, + "learning_rate": 1.0113354008313025e-05, + "loss": 0.438, + "step": 1023 + }, + { + "epoch": 0.51, + "learning_rate": 1.0097161130609774e-05, + "loss": 0.4632, + "step": 1024 + }, + { + "epoch": 0.51, + "learning_rate": 1.0080967998112787e-05, + "loss": 0.4721, + "step": 1025 + }, + { + "epoch": 0.51, + "learning_rate": 1.0064774653286662e-05, + "loss": 0.4787, + "step": 1026 + }, + { + "epoch": 0.51, + "learning_rate": 1.0048581138596563e-05, + "loss": 0.479, + "step": 1027 + }, + { + "epoch": 0.51, + "learning_rate": 1.003238749650809e-05, + "loss": 0.4629, + "step": 1028 + }, + { + "epoch": 0.51, + "learning_rate": 1.001619376948718e-05, + "loss": 0.4641, + "step": 1029 + }, + { + "epoch": 0.52, + "learning_rate": 1e-05, + "loss": 0.4645, + "step": 1030 + }, + { + "epoch": 0.52, + "learning_rate": 9.98380623051282e-06, + "loss": 0.4603, + "step": 1031 + }, + { + "epoch": 0.52, + "learning_rate": 9.967612503491915e-06, + "loss": 0.4245, + "step": 1032 + }, + { + "epoch": 0.52, + "learning_rate": 9.95141886140344e-06, + "loss": 0.5036, + "step": 1033 + }, + { + "epoch": 0.52, + "learning_rate": 9.935225346713341e-06, + "loss": 0.4646, + "step": 1034 + }, + { + "epoch": 0.52, + "learning_rate": 9.919032001887215e-06, + "loss": 0.4433, + "step": 1035 + }, + { + "epoch": 0.52, + "learning_rate": 9.90283886939023e-06, + "loss": 0.4786, + "step": 1036 + }, + { + "epoch": 0.52, + "learning_rate": 9.886645991686977e-06, + "loss": 0.5202, + "step": 1037 + }, + { + "epoch": 0.52, + "learning_rate": 9.870453411241399e-06, + "loss": 0.4868, + "step": 1038 + }, + { + "epoch": 0.52, + "learning_rate": 9.854261170516648e-06, + "loss": 0.4462, + "step": 1039 + }, + { + "epoch": 0.52, + "learning_rate": 9.838069311974986e-06, + "loss": 0.4161, + "step": 1040 + }, + { + "epoch": 0.52, + "learning_rate": 9.821877878077678e-06, + "loss": 0.4759, + "step": 1041 + }, + { + "epoch": 0.52, + "learning_rate": 9.805686911284867e-06, + 
"loss": 0.5145, + "step": 1042 + }, + { + "epoch": 0.52, + "learning_rate": 9.789496454055482e-06, + "loss": 0.4849, + "step": 1043 + }, + { + "epoch": 0.52, + "learning_rate": 9.773306548847102e-06, + "loss": 0.4507, + "step": 1044 + }, + { + "epoch": 0.52, + "learning_rate": 9.757117238115871e-06, + "loss": 0.4427, + "step": 1045 + }, + { + "epoch": 0.52, + "learning_rate": 9.740928564316369e-06, + "loss": 0.4745, + "step": 1046 + }, + { + "epoch": 0.52, + "learning_rate": 9.724740569901503e-06, + "loss": 0.4943, + "step": 1047 + }, + { + "epoch": 0.52, + "learning_rate": 9.708553297322407e-06, + "loss": 0.4544, + "step": 1048 + }, + { + "epoch": 0.52, + "learning_rate": 9.692366789028308e-06, + "loss": 0.4622, + "step": 1049 + }, + { + "epoch": 0.53, + "learning_rate": 9.676181087466444e-06, + "loss": 0.4576, + "step": 1050 + }, + { + "epoch": 0.53, + "learning_rate": 9.659996235081926e-06, + "loss": 0.4764, + "step": 1051 + }, + { + "epoch": 0.53, + "learning_rate": 9.643812274317644e-06, + "loss": 0.5082, + "step": 1052 + }, + { + "epoch": 0.53, + "learning_rate": 9.627629247614151e-06, + "loss": 0.4467, + "step": 1053 + }, + { + "epoch": 0.53, + "learning_rate": 9.611447197409544e-06, + "loss": 0.4574, + "step": 1054 + }, + { + "epoch": 0.53, + "learning_rate": 9.595266166139366e-06, + "loss": 0.4667, + "step": 1055 + }, + { + "epoch": 0.53, + "learning_rate": 9.579086196236483e-06, + "loss": 0.4444, + "step": 1056 + }, + { + "epoch": 0.53, + "learning_rate": 9.562907330130981e-06, + "loss": 0.4719, + "step": 1057 + }, + { + "epoch": 0.53, + "learning_rate": 9.54672961025005e-06, + "loss": 0.4315, + "step": 1058 + }, + { + "epoch": 0.53, + "learning_rate": 9.530553079017872e-06, + "loss": 0.5312, + "step": 1059 + }, + { + "epoch": 0.53, + "learning_rate": 9.514377778855521e-06, + "loss": 0.4925, + "step": 1060 + }, + { + "epoch": 0.53, + "learning_rate": 9.498203752180827e-06, + "loss": 0.4775, + "step": 1061 + }, + { + "epoch": 0.53, + "learning_rate": 9.482031041408296e-06, + "loss": 0.4874, + "step": 1062 + }, + { + "epoch": 0.53, + "learning_rate": 9.465859688948977e-06, + "loss": 0.5414, + "step": 1063 + }, + { + "epoch": 0.53, + "learning_rate": 9.449689737210352e-06, + "loss": 0.479, + "step": 1064 + }, + { + "epoch": 0.53, + "learning_rate": 9.433521228596237e-06, + "loss": 0.4297, + "step": 1065 + }, + { + "epoch": 0.53, + "learning_rate": 9.417354205506663e-06, + "loss": 0.5173, + "step": 1066 + }, + { + "epoch": 0.53, + "learning_rate": 9.401188710337757e-06, + "loss": 0.4412, + "step": 1067 + }, + { + "epoch": 0.53, + "learning_rate": 9.385024785481653e-06, + "loss": 0.4954, + "step": 1068 + }, + { + "epoch": 0.53, + "learning_rate": 9.368862473326355e-06, + "loss": 0.4635, + "step": 1069 + }, + { + "epoch": 0.54, + "learning_rate": 9.352701816255643e-06, + "loss": 0.5658, + "step": 1070 + }, + { + "epoch": 0.54, + "learning_rate": 9.336542856648958e-06, + "loss": 0.478, + "step": 1071 + }, + { + "epoch": 0.54, + "learning_rate": 9.320385636881283e-06, + "loss": 0.4725, + "step": 1072 + }, + { + "epoch": 0.54, + "learning_rate": 9.30423019932305e-06, + "loss": 0.5015, + "step": 1073 + }, + { + "epoch": 0.54, + "learning_rate": 9.288076586340005e-06, + "loss": 0.5177, + "step": 1074 + }, + { + "epoch": 0.54, + "learning_rate": 9.27192484029312e-06, + "loss": 0.4999, + "step": 1075 + }, + { + "epoch": 0.54, + "learning_rate": 9.255775003538462e-06, + "loss": 0.4933, + "step": 1076 + }, + { + "epoch": 0.54, + "learning_rate": 9.239627118427098e-06, + "loss": 0.4579, + 
"step": 1077 + }, + { + "epoch": 0.54, + "learning_rate": 9.22348122730497e-06, + "loss": 0.4491, + "step": 1078 + }, + { + "epoch": 0.54, + "learning_rate": 9.207337372512797e-06, + "loss": 0.4519, + "step": 1079 + }, + { + "epoch": 0.54, + "learning_rate": 9.19119559638596e-06, + "loss": 0.4515, + "step": 1080 + }, + { + "epoch": 0.54, + "learning_rate": 9.17505594125438e-06, + "loss": 0.4672, + "step": 1081 + }, + { + "epoch": 0.54, + "learning_rate": 9.158918449442425e-06, + "loss": 0.4807, + "step": 1082 + }, + { + "epoch": 0.54, + "learning_rate": 9.142783163268782e-06, + "loss": 0.5425, + "step": 1083 + }, + { + "epoch": 0.54, + "learning_rate": 9.126650125046361e-06, + "loss": 0.4717, + "step": 1084 + }, + { + "epoch": 0.54, + "learning_rate": 9.110519377082174e-06, + "loss": 0.5154, + "step": 1085 + }, + { + "epoch": 0.54, + "learning_rate": 9.094390961677223e-06, + "loss": 0.4954, + "step": 1086 + }, + { + "epoch": 0.54, + "learning_rate": 9.078264921126405e-06, + "loss": 0.43, + "step": 1087 + }, + { + "epoch": 0.54, + "learning_rate": 9.062141297718372e-06, + "loss": 0.4305, + "step": 1088 + }, + { + "epoch": 0.54, + "learning_rate": 9.046020133735455e-06, + "loss": 0.409, + "step": 1089 + }, + { + "epoch": 0.55, + "learning_rate": 9.02990147145352e-06, + "loss": 0.4316, + "step": 1090 + }, + { + "epoch": 0.55, + "learning_rate": 9.013785353141887e-06, + "loss": 0.4189, + "step": 1091 + }, + { + "epoch": 0.55, + "learning_rate": 8.99767182106319e-06, + "loss": 0.5003, + "step": 1092 + }, + { + "epoch": 0.55, + "learning_rate": 8.981560917473292e-06, + "loss": 0.4231, + "step": 1093 + }, + { + "epoch": 0.55, + "learning_rate": 8.965452684621164e-06, + "loss": 0.4326, + "step": 1094 + }, + { + "epoch": 0.55, + "learning_rate": 8.949347164748761e-06, + "loss": 0.4282, + "step": 1095 + }, + { + "epoch": 0.55, + "learning_rate": 8.933244400090937e-06, + "loss": 0.4409, + "step": 1096 + }, + { + "epoch": 0.55, + "learning_rate": 8.91714443287531e-06, + "loss": 0.5219, + "step": 1097 + }, + { + "epoch": 0.55, + "learning_rate": 8.901047305322172e-06, + "loss": 0.4599, + "step": 1098 + }, + { + "epoch": 0.55, + "learning_rate": 8.88495305964436e-06, + "loss": 0.4843, + "step": 1099 + }, + { + "epoch": 0.55, + "learning_rate": 8.868861738047158e-06, + "loss": 0.4487, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_code_gate_load": [ + 204.55, + 178.2, + 181.25, + 158.8, + 174.15, + 180.85, + 194.9, + 167.3 + ], + "eval_code_loss": 0.3551269471645355, + "eval_code_runtime": 1.8096, + "eval_code_samples_per_second": 552.622, + "eval_code_steps_per_second": 34.815, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_orca_gate_load": [ + 497.9, + 347.05, + 405.0, + 400.25, + 349.7, + 422.7, + 367.5, + 352.2 + ], + "eval_orca_loss": 0.45805662870407104, + "eval_orca_runtime": 1.9956, + "eval_orca_samples_per_second": 501.094, + "eval_orca_steps_per_second": 31.569, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_math_gate_load": [ + 309.9, + 222.05, + 237.3, + 231.3, + 253.5, + 249.15, + 277.95, + 251.55 + ], + "eval_math_loss": 0.32890623807907104, + "eval_math_runtime": 1.861, + "eval_math_samples_per_second": 537.357, + "eval_math_steps_per_second": 33.853, + "step": 1100 + }, + { + "epoch": 0.55, + "eval_sharegpt_gate_load": [ + 1510.0, + 1101.7, + 1304.1, + 1211.55, + 1140.2, + 1358.85, + 1183.65, + 1101.55 + ], + "eval_sharegpt_loss": 0.539843738079071, + "eval_sharegpt_runtime": 3.0143, + "eval_sharegpt_samples_per_second": 331.753, + "eval_sharegpt_steps_per_second": 20.9, + "step": 
1100 + }, + { + "epoch": 0.55, + "learning_rate": 8.852773382728184e-06, + "loss": 0.4398, + "step": 1101 + }, + { + "epoch": 0.55, + "learning_rate": 8.836688035877268e-06, + "loss": 0.4278, + "step": 1102 + }, + { + "epoch": 0.55, + "learning_rate": 8.820605739676363e-06, + "loss": 0.4476, + "step": 1103 + }, + { + "epoch": 0.55, + "learning_rate": 8.804526536299413e-06, + "loss": 0.4136, + "step": 1104 + }, + { + "epoch": 0.55, + "learning_rate": 8.788450467912254e-06, + "loss": 0.445, + "step": 1105 + }, + { + "epoch": 0.55, + "learning_rate": 8.772377576672502e-06, + "loss": 0.4633, + "step": 1106 + }, + { + "epoch": 0.55, + "learning_rate": 8.75630790472944e-06, + "loss": 0.4523, + "step": 1107 + }, + { + "epoch": 0.55, + "learning_rate": 8.740241494223911e-06, + "loss": 0.4346, + "step": 1108 + }, + { + "epoch": 0.55, + "learning_rate": 8.724178387288202e-06, + "loss": 0.4706, + "step": 1109 + }, + { + "epoch": 0.56, + "learning_rate": 8.708118626045939e-06, + "loss": 0.4377, + "step": 1110 + }, + { + "epoch": 0.56, + "learning_rate": 8.692062252611973e-06, + "loss": 0.4326, + "step": 1111 + }, + { + "epoch": 0.56, + "learning_rate": 8.676009309092273e-06, + "loss": 0.4979, + "step": 1112 + }, + { + "epoch": 0.56, + "learning_rate": 8.659959837583808e-06, + "loss": 0.4885, + "step": 1113 + }, + { + "epoch": 0.56, + "learning_rate": 8.643913880174449e-06, + "loss": 0.424, + "step": 1114 + }, + { + "epoch": 0.56, + "learning_rate": 8.62787147894285e-06, + "loss": 0.4862, + "step": 1115 + }, + { + "epoch": 0.56, + "learning_rate": 8.611832675958335e-06, + "loss": 0.4256, + "step": 1116 + }, + { + "epoch": 0.56, + "learning_rate": 8.595797513280799e-06, + "loss": 0.4172, + "step": 1117 + }, + { + "epoch": 0.56, + "learning_rate": 8.579766032960582e-06, + "loss": 0.4389, + "step": 1118 + }, + { + "epoch": 0.56, + "learning_rate": 8.563738277038376e-06, + "loss": 0.421, + "step": 1119 + }, + { + "epoch": 0.56, + "learning_rate": 8.5477142875451e-06, + "loss": 0.4555, + "step": 1120 + }, + { + "epoch": 0.56, + "learning_rate": 8.531694106501796e-06, + "loss": 0.4244, + "step": 1121 + }, + { + "epoch": 0.56, + "learning_rate": 8.515677775919528e-06, + "loss": 0.4084, + "step": 1122 + }, + { + "epoch": 0.56, + "learning_rate": 8.499665337799254e-06, + "loss": 0.4406, + "step": 1123 + }, + { + "epoch": 0.56, + "learning_rate": 8.48365683413172e-06, + "loss": 0.4274, + "step": 1124 + }, + { + "epoch": 0.56, + "learning_rate": 8.46765230689737e-06, + "loss": 0.3716, + "step": 1125 + }, + { + "epoch": 0.56, + "learning_rate": 8.451651798066203e-06, + "loss": 0.459, + "step": 1126 + }, + { + "epoch": 0.56, + "learning_rate": 8.43565534959769e-06, + "loss": 0.4536, + "step": 1127 + }, + { + "epoch": 0.56, + "learning_rate": 8.419663003440657e-06, + "loss": 0.4558, + "step": 1128 + }, + { + "epoch": 0.56, + "learning_rate": 8.40367480153316e-06, + "loss": 0.4123, + "step": 1129 + }, + { + "epoch": 0.56, + "learning_rate": 8.387690785802403e-06, + "loss": 0.4408, + "step": 1130 + }, + { + "epoch": 0.57, + "learning_rate": 8.371710998164595e-06, + "loss": 0.481, + "step": 1131 + }, + { + "epoch": 0.57, + "learning_rate": 8.355735480524874e-06, + "loss": 0.3822, + "step": 1132 + }, + { + "epoch": 0.57, + "learning_rate": 8.339764274777165e-06, + "loss": 0.4389, + "step": 1133 + }, + { + "epoch": 0.57, + "learning_rate": 8.3237974228041e-06, + "loss": 0.403, + "step": 1134 + }, + { + "epoch": 0.57, + "learning_rate": 8.307834966476885e-06, + "loss": 0.4676, + "step": 1135 + }, + { + "epoch": 0.57, + 
"learning_rate": 8.291876947655197e-06, + "loss": 0.4541, + "step": 1136 + }, + { + "epoch": 0.57, + "learning_rate": 8.275923408187086e-06, + "loss": 0.4605, + "step": 1137 + }, + { + "epoch": 0.57, + "learning_rate": 8.259974389908842e-06, + "loss": 0.4276, + "step": 1138 + }, + { + "epoch": 0.57, + "learning_rate": 8.244029934644916e-06, + "loss": 0.4232, + "step": 1139 + }, + { + "epoch": 0.57, + "learning_rate": 8.228090084207773e-06, + "loss": 0.4284, + "step": 1140 + }, + { + "epoch": 0.57, + "learning_rate": 8.212154880397817e-06, + "loss": 0.3999, + "step": 1141 + }, + { + "epoch": 0.57, + "learning_rate": 8.196224365003267e-06, + "loss": 0.4149, + "step": 1142 + }, + { + "epoch": 0.57, + "learning_rate": 8.180298579800034e-06, + "loss": 0.5071, + "step": 1143 + }, + { + "epoch": 0.57, + "learning_rate": 8.16437756655164e-06, + "loss": 0.4423, + "step": 1144 + }, + { + "epoch": 0.57, + "learning_rate": 8.148461367009081e-06, + "loss": 0.4525, + "step": 1145 + }, + { + "epoch": 0.57, + "learning_rate": 8.132550022910737e-06, + "loss": 0.4204, + "step": 1146 + }, + { + "epoch": 0.57, + "learning_rate": 8.116643575982254e-06, + "loss": 0.4082, + "step": 1147 + }, + { + "epoch": 0.57, + "learning_rate": 8.100742067936432e-06, + "loss": 0.4174, + "step": 1148 + }, + { + "epoch": 0.57, + "learning_rate": 8.084845540473127e-06, + "loss": 0.4162, + "step": 1149 + }, + { + "epoch": 0.57, + "learning_rate": 8.068954035279121e-06, + "loss": 0.4527, + "step": 1150 + }, + { + "epoch": 0.58, + "learning_rate": 8.053067594028044e-06, + "loss": 0.4525, + "step": 1151 + }, + { + "epoch": 0.58, + "learning_rate": 8.037186258380226e-06, + "loss": 0.4433, + "step": 1152 + }, + { + "epoch": 0.58, + "learning_rate": 8.021310069982624e-06, + "loss": 0.5178, + "step": 1153 + }, + { + "epoch": 0.58, + "learning_rate": 8.005439070468692e-06, + "loss": 0.4018, + "step": 1154 + }, + { + "epoch": 0.58, + "learning_rate": 7.989573301458274e-06, + "loss": 0.4235, + "step": 1155 + }, + { + "epoch": 0.58, + "learning_rate": 7.9737128045575e-06, + "loss": 0.4846, + "step": 1156 + }, + { + "epoch": 0.58, + "learning_rate": 7.957857621358674e-06, + "loss": 0.4281, + "step": 1157 + }, + { + "epoch": 0.58, + "learning_rate": 7.942007793440165e-06, + "loss": 0.4087, + "step": 1158 + }, + { + "epoch": 0.58, + "learning_rate": 7.9261633623663e-06, + "loss": 0.4426, + "step": 1159 + }, + { + "epoch": 0.58, + "learning_rate": 7.91032436968725e-06, + "loss": 0.4638, + "step": 1160 + }, + { + "epoch": 0.58, + "learning_rate": 7.894490856938931e-06, + "loss": 0.4776, + "step": 1161 + }, + { + "epoch": 0.58, + "learning_rate": 7.87866286564288e-06, + "loss": 0.4447, + "step": 1162 + }, + { + "epoch": 0.58, + "learning_rate": 7.862840437306165e-06, + "loss": 0.3879, + "step": 1163 + }, + { + "epoch": 0.58, + "learning_rate": 7.847023613421251e-06, + "loss": 0.4486, + "step": 1164 + }, + { + "epoch": 0.58, + "learning_rate": 7.831212435465925e-06, + "loss": 0.4983, + "step": 1165 + }, + { + "epoch": 0.58, + "learning_rate": 7.815406944903148e-06, + "loss": 0.4229, + "step": 1166 + }, + { + "epoch": 0.58, + "learning_rate": 7.799607183180981e-06, + "loss": 0.4204, + "step": 1167 + }, + { + "epoch": 0.58, + "learning_rate": 7.78381319173246e-06, + "loss": 0.3881, + "step": 1168 + }, + { + "epoch": 0.58, + "learning_rate": 7.768025011975481e-06, + "loss": 0.4176, + "step": 1169 + }, + { + "epoch": 0.58, + "learning_rate": 7.752242685312709e-06, + "loss": 0.4191, + "step": 1170 + }, + { + "epoch": 0.59, + "learning_rate": 
7.736466253131451e-06, + "loss": 0.4608, + "step": 1171 + }, + { + "epoch": 0.59, + "learning_rate": 7.720695756803569e-06, + "loss": 0.446, + "step": 1172 + }, + { + "epoch": 0.59, + "learning_rate": 7.704931237685342e-06, + "loss": 0.3986, + "step": 1173 + }, + { + "epoch": 0.59, + "learning_rate": 7.689172737117389e-06, + "loss": 0.438, + "step": 1174 + }, + { + "epoch": 0.59, + "learning_rate": 7.673420296424541e-06, + "loss": 0.4122, + "step": 1175 + }, + { + "epoch": 0.59, + "learning_rate": 7.657673956915735e-06, + "loss": 0.4458, + "step": 1176 + }, + { + "epoch": 0.59, + "learning_rate": 7.641933759883913e-06, + "loss": 0.4381, + "step": 1177 + }, + { + "epoch": 0.59, + "learning_rate": 7.6261997466059035e-06, + "loss": 0.4505, + "step": 1178 + }, + { + "epoch": 0.59, + "learning_rate": 7.610471958342326e-06, + "loss": 0.4123, + "step": 1179 + }, + { + "epoch": 0.59, + "learning_rate": 7.594750436337467e-06, + "loss": 0.4655, + "step": 1180 + }, + { + "epoch": 0.59, + "learning_rate": 7.579035221819188e-06, + "loss": 0.4205, + "step": 1181 + }, + { + "epoch": 0.59, + "learning_rate": 7.5633263559988035e-06, + "loss": 0.4072, + "step": 1182 + }, + { + "epoch": 0.59, + "learning_rate": 7.547623880070992e-06, + "loss": 0.4442, + "step": 1183 + }, + { + "epoch": 0.59, + "learning_rate": 7.531927835213657e-06, + "loss": 0.4164, + "step": 1184 + }, + { + "epoch": 0.59, + "learning_rate": 7.516238262587851e-06, + "loss": 0.4247, + "step": 1185 + }, + { + "epoch": 0.59, + "learning_rate": 7.500555203337647e-06, + "loss": 0.4238, + "step": 1186 + }, + { + "epoch": 0.59, + "learning_rate": 7.48487869859004e-06, + "loss": 0.4493, + "step": 1187 + }, + { + "epoch": 0.59, + "learning_rate": 7.469208789454838e-06, + "loss": 0.4132, + "step": 1188 + }, + { + "epoch": 0.59, + "learning_rate": 7.4535455170245476e-06, + "loss": 0.4364, + "step": 1189 + }, + { + "epoch": 0.59, + "learning_rate": 7.4378889223742766e-06, + "loss": 0.4417, + "step": 1190 + }, + { + "epoch": 0.6, + "learning_rate": 7.422239046561619e-06, + "loss": 0.3977, + "step": 1191 + }, + { + "epoch": 0.6, + "learning_rate": 7.40659593062655e-06, + "loss": 0.4288, + "step": 1192 + }, + { + "epoch": 0.6, + "learning_rate": 7.390959615591315e-06, + "loss": 0.4328, + "step": 1193 + }, + { + "epoch": 0.6, + "learning_rate": 7.375330142460331e-06, + "loss": 0.3894, + "step": 1194 + }, + { + "epoch": 0.6, + "learning_rate": 7.35970755222007e-06, + "loss": 0.4406, + "step": 1195 + }, + { + "epoch": 0.6, + "learning_rate": 7.344091885838949e-06, + "loss": 0.4512, + "step": 1196 + }, + { + "epoch": 0.6, + "learning_rate": 7.328483184267236e-06, + "loss": 0.3937, + "step": 1197 + }, + { + "epoch": 0.6, + "learning_rate": 7.312881488436928e-06, + "loss": 0.4402, + "step": 1198 + }, + { + "epoch": 0.6, + "learning_rate": 7.297286839261659e-06, + "loss": 0.4219, + "step": 1199 + }, + { + "epoch": 0.6, + "learning_rate": 7.2816992776365714e-06, + "loss": 0.4174, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_code_gate_load": [ + 207.85, + 176.65, + 176.75, + 155.35, + 175.4, + 184.75, + 195.35, + 167.9 + ], + "eval_code_loss": 0.26713865995407104, + "eval_code_runtime": 1.7771, + "eval_code_samples_per_second": 562.706, + "eval_code_steps_per_second": 35.45, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_orca_gate_load": [ + 500.65, + 345.65, + 405.4, + 401.1, + 345.55, + 420.1, + 369.5, + 354.35 + ], + "eval_orca_loss": 0.4488769471645355, + "eval_orca_runtime": 2.0075, + "eval_orca_samples_per_second": 498.135, + 
"eval_orca_steps_per_second": 31.383, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_math_gate_load": [ + 324.85, + 216.9, + 232.55, + 225.15, + 251.7, + 253.15, + 278.7, + 249.7 + ], + "eval_math_loss": 0.30810546875, + "eval_math_runtime": 1.8431, + "eval_math_samples_per_second": 542.566, + "eval_math_steps_per_second": 34.182, + "step": 1200 + }, + { + "epoch": 0.6, + "eval_sharegpt_gate_load": [ + 1528.05, + 1087.15, + 1303.2, + 1211.5, + 1131.85, + 1353.75, + 1187.35, + 1108.75 + ], + "eval_sharegpt_loss": 0.537792980670929, + "eval_sharegpt_runtime": 2.9986, + "eval_sharegpt_samples_per_second": 333.485, + "eval_sharegpt_steps_per_second": 21.01, + "step": 1200 + }, + { + "epoch": 0.6, + "learning_rate": 7.2661188444382345e-06, + "loss": 0.3947, + "step": 1201 + }, + { + "epoch": 0.6, + "learning_rate": 7.250545580524515e-06, + "loss": 0.446, + "step": 1202 + }, + { + "epoch": 0.6, + "learning_rate": 7.234979526734482e-06, + "loss": 0.4238, + "step": 1203 + }, + { + "epoch": 0.6, + "learning_rate": 7.219420723888301e-06, + "loss": 0.4102, + "step": 1204 + }, + { + "epoch": 0.6, + "learning_rate": 7.203869212787112e-06, + "loss": 0.4027, + "step": 1205 + }, + { + "epoch": 0.6, + "learning_rate": 7.188325034212944e-06, + "loss": 0.3781, + "step": 1206 + }, + { + "epoch": 0.6, + "learning_rate": 7.1727882289285915e-06, + "loss": 0.3997, + "step": 1207 + }, + { + "epoch": 0.6, + "learning_rate": 7.157258837677514e-06, + "loss": 0.4436, + "step": 1208 + }, + { + "epoch": 0.6, + "learning_rate": 7.1417369011837355e-06, + "loss": 0.4706, + "step": 1209 + }, + { + "epoch": 0.6, + "learning_rate": 7.126222460151719e-06, + "loss": 0.4107, + "step": 1210 + }, + { + "epoch": 0.61, + "learning_rate": 7.110715555266281e-06, + "loss": 0.3836, + "step": 1211 + }, + { + "epoch": 0.61, + "learning_rate": 7.095216227192467e-06, + "loss": 0.4006, + "step": 1212 + }, + { + "epoch": 0.61, + "learning_rate": 7.0797245165754654e-06, + "loss": 0.4064, + "step": 1213 + }, + { + "epoch": 0.61, + "learning_rate": 7.064240464040472e-06, + "loss": 0.3926, + "step": 1214 + }, + { + "epoch": 0.61, + "learning_rate": 7.048764110192618e-06, + "loss": 0.4111, + "step": 1215 + }, + { + "epoch": 0.61, + "learning_rate": 7.033295495616834e-06, + "loss": 0.379, + "step": 1216 + }, + { + "epoch": 0.61, + "learning_rate": 7.017834660877756e-06, + "loss": 0.3834, + "step": 1217 + }, + { + "epoch": 0.61, + "learning_rate": 7.002381646519625e-06, + "loss": 0.4477, + "step": 1218 + }, + { + "epoch": 0.61, + "learning_rate": 6.986936493066165e-06, + "loss": 0.4177, + "step": 1219 + }, + { + "epoch": 0.61, + "learning_rate": 6.971499241020495e-06, + "loss": 0.4571, + "step": 1220 + }, + { + "epoch": 0.61, + "learning_rate": 6.956069930865005e-06, + "loss": 0.4066, + "step": 1221 + }, + { + "epoch": 0.61, + "learning_rate": 6.940648603061263e-06, + "loss": 0.4223, + "step": 1222 + }, + { + "epoch": 0.61, + "learning_rate": 6.925235298049906e-06, + "loss": 0.4069, + "step": 1223 + }, + { + "epoch": 0.61, + "learning_rate": 6.909830056250527e-06, + "loss": 0.3334, + "step": 1224 + }, + { + "epoch": 0.61, + "learning_rate": 6.894432918061579e-06, + "loss": 0.404, + "step": 1225 + }, + { + "epoch": 0.61, + "learning_rate": 6.8790439238602576e-06, + "loss": 0.403, + "step": 1226 + }, + { + "epoch": 0.61, + "learning_rate": 6.863663114002411e-06, + "loss": 0.3739, + "step": 1227 + }, + { + "epoch": 0.61, + "learning_rate": 6.848290528822417e-06, + "loss": 0.3936, + "step": 1228 + }, + { + "epoch": 0.61, + "learning_rate": 
6.8329262086330864e-06, + "loss": 0.4142, + "step": 1229 + }, + { + "epoch": 0.61, + "learning_rate": 6.8175701937255645e-06, + "loss": 0.4611, + "step": 1230 + }, + { + "epoch": 0.62, + "learning_rate": 6.802222524369202e-06, + "loss": 0.4569, + "step": 1231 + }, + { + "epoch": 0.62, + "learning_rate": 6.786883240811479e-06, + "loss": 0.44, + "step": 1232 + }, + { + "epoch": 0.62, + "learning_rate": 6.771552383277875e-06, + "loss": 0.3902, + "step": 1233 + }, + { + "epoch": 0.62, + "learning_rate": 6.756229991971779e-06, + "loss": 0.467, + "step": 1234 + }, + { + "epoch": 0.62, + "learning_rate": 6.740916107074372e-06, + "loss": 0.389, + "step": 1235 + }, + { + "epoch": 0.62, + "learning_rate": 6.725610768744535e-06, + "loss": 0.4507, + "step": 1236 + }, + { + "epoch": 0.62, + "learning_rate": 6.710314017118734e-06, + "loss": 0.4448, + "step": 1237 + }, + { + "epoch": 0.62, + "learning_rate": 6.695025892310913e-06, + "loss": 0.4476, + "step": 1238 + }, + { + "epoch": 0.62, + "learning_rate": 6.6797464344124045e-06, + "loss": 0.4785, + "step": 1239 + }, + { + "epoch": 0.62, + "learning_rate": 6.664475683491797e-06, + "loss": 0.4106, + "step": 1240 + }, + { + "epoch": 0.62, + "learning_rate": 6.649213679594859e-06, + "loss": 0.4091, + "step": 1241 + }, + { + "epoch": 0.62, + "learning_rate": 6.633960462744415e-06, + "loss": 0.4373, + "step": 1242 + }, + { + "epoch": 0.62, + "learning_rate": 6.618716072940248e-06, + "loss": 0.4227, + "step": 1243 + }, + { + "epoch": 0.62, + "learning_rate": 6.603480550158995e-06, + "loss": 0.4241, + "step": 1244 + }, + { + "epoch": 0.62, + "learning_rate": 6.588253934354039e-06, + "loss": 0.3771, + "step": 1245 + }, + { + "epoch": 0.62, + "learning_rate": 6.5730362654554015e-06, + "loss": 0.4489, + "step": 1246 + }, + { + "epoch": 0.62, + "learning_rate": 6.5578275833696485e-06, + "loss": 0.4633, + "step": 1247 + }, + { + "epoch": 0.62, + "learning_rate": 6.542627927979772e-06, + "loss": 0.3883, + "step": 1248 + }, + { + "epoch": 0.62, + "learning_rate": 6.527437339145097e-06, + "loss": 0.4436, + "step": 1249 + }, + { + "epoch": 0.62, + "learning_rate": 6.5122558567011775e-06, + "loss": 0.4504, + "step": 1250 + }, + { + "epoch": 0.63, + "learning_rate": 6.497083520459674e-06, + "loss": 0.3626, + "step": 1251 + }, + { + "epoch": 0.63, + "learning_rate": 6.481920370208274e-06, + "loss": 0.4091, + "step": 1252 + }, + { + "epoch": 0.63, + "learning_rate": 6.466766445710568e-06, + "loss": 0.3863, + "step": 1253 + }, + { + "epoch": 0.63, + "learning_rate": 6.4516217867059615e-06, + "loss": 0.4081, + "step": 1254 + }, + { + "epoch": 0.63, + "learning_rate": 6.43648643290955e-06, + "loss": 0.3932, + "step": 1255 + }, + { + "epoch": 0.63, + "learning_rate": 6.421360424012039e-06, + "loss": 0.4043, + "step": 1256 + }, + { + "epoch": 0.63, + "learning_rate": 6.406243799679625e-06, + "loss": 0.4142, + "step": 1257 + }, + { + "epoch": 0.63, + "learning_rate": 6.39113659955389e-06, + "loss": 0.3747, + "step": 1258 + }, + { + "epoch": 0.63, + "learning_rate": 6.376038863251706e-06, + "loss": 0.3748, + "step": 1259 + }, + { + "epoch": 0.63, + "learning_rate": 6.360950630365126e-06, + "loss": 0.4162, + "step": 1260 + }, + { + "epoch": 0.63, + "learning_rate": 6.345871940461282e-06, + "loss": 0.4242, + "step": 1261 + }, + { + "epoch": 0.63, + "learning_rate": 6.33080283308228e-06, + "loss": 0.3819, + "step": 1262 + }, + { + "epoch": 0.63, + "learning_rate": 6.315743347745098e-06, + "loss": 0.4009, + "step": 1263 + }, + { + "epoch": 0.63, + "learning_rate": 
6.300693523941481e-06, + "loss": 0.3622, + "step": 1264 + }, + { + "epoch": 0.63, + "learning_rate": 6.2856534011378365e-06, + "loss": 0.4283, + "step": 1265 + }, + { + "epoch": 0.63, + "learning_rate": 6.270623018775135e-06, + "loss": 0.4575, + "step": 1266 + }, + { + "epoch": 0.63, + "learning_rate": 6.255602416268799e-06, + "loss": 0.4358, + "step": 1267 + }, + { + "epoch": 0.63, + "learning_rate": 6.2405916330086106e-06, + "loss": 0.3554, + "step": 1268 + }, + { + "epoch": 0.63, + "learning_rate": 6.225590708358596e-06, + "loss": 0.4037, + "step": 1269 + }, + { + "epoch": 0.64, + "learning_rate": 6.210599681656933e-06, + "loss": 0.3967, + "step": 1270 + }, + { + "epoch": 0.64, + "learning_rate": 6.1956185922158445e-06, + "loss": 0.4144, + "step": 1271 + }, + { + "epoch": 0.64, + "learning_rate": 6.180647479321484e-06, + "loss": 0.4534, + "step": 1272 + }, + { + "epoch": 0.64, + "learning_rate": 6.165686382233856e-06, + "loss": 0.4557, + "step": 1273 + }, + { + "epoch": 0.64, + "learning_rate": 6.150735340186689e-06, + "loss": 0.3985, + "step": 1274 + }, + { + "epoch": 0.64, + "learning_rate": 6.135794392387353e-06, + "loss": 0.4285, + "step": 1275 + }, + { + "epoch": 0.64, + "learning_rate": 6.120863578016736e-06, + "loss": 0.3867, + "step": 1276 + }, + { + "epoch": 0.64, + "learning_rate": 6.1059429362291615e-06, + "loss": 0.4173, + "step": 1277 + }, + { + "epoch": 0.64, + "learning_rate": 6.091032506152274e-06, + "loss": 0.3591, + "step": 1278 + }, + { + "epoch": 0.64, + "learning_rate": 6.076132326886934e-06, + "loss": 0.4235, + "step": 1279 + }, + { + "epoch": 0.64, + "learning_rate": 6.061242437507131e-06, + "loss": 0.3599, + "step": 1280 + }, + { + "epoch": 0.64, + "learning_rate": 6.0463628770598574e-06, + "loss": 0.3687, + "step": 1281 + }, + { + "epoch": 0.64, + "learning_rate": 6.0314936845650296e-06, + "loss": 0.4174, + "step": 1282 + }, + { + "epoch": 0.64, + "learning_rate": 6.016634899015369e-06, + "loss": 0.3781, + "step": 1283 + }, + { + "epoch": 0.64, + "learning_rate": 6.00178655937631e-06, + "loss": 0.3717, + "step": 1284 + }, + { + "epoch": 0.64, + "learning_rate": 5.986948704585895e-06, + "loss": 0.3987, + "step": 1285 + }, + { + "epoch": 0.64, + "learning_rate": 5.972121373554665e-06, + "loss": 0.3913, + "step": 1286 + }, + { + "epoch": 0.64, + "learning_rate": 5.957304605165567e-06, + "loss": 0.4595, + "step": 1287 + }, + { + "epoch": 0.64, + "learning_rate": 5.942498438273849e-06, + "loss": 0.375, + "step": 1288 + }, + { + "epoch": 0.64, + "learning_rate": 5.927702911706961e-06, + "loss": 0.4359, + "step": 1289 + }, + { + "epoch": 0.65, + "learning_rate": 5.912918064264441e-06, + "loss": 0.3799, + "step": 1290 + }, + { + "epoch": 0.65, + "learning_rate": 5.898143934717831e-06, + "loss": 0.4056, + "step": 1291 + }, + { + "epoch": 0.65, + "learning_rate": 5.8833805618105635e-06, + "loss": 0.4361, + "step": 1292 + }, + { + "epoch": 0.65, + "learning_rate": 5.868627984257862e-06, + "loss": 0.4293, + "step": 1293 + }, + { + "epoch": 0.65, + "learning_rate": 5.853886240746643e-06, + "loss": 0.3594, + "step": 1294 + }, + { + "epoch": 0.65, + "learning_rate": 5.839155369935407e-06, + "loss": 0.4376, + "step": 1295 + }, + { + "epoch": 0.65, + "learning_rate": 5.82443541045415e-06, + "loss": 0.3927, + "step": 1296 + }, + { + "epoch": 0.65, + "learning_rate": 5.809726400904242e-06, + "loss": 0.4547, + "step": 1297 + }, + { + "epoch": 0.65, + "learning_rate": 5.795028379858355e-06, + "loss": 0.3932, + "step": 1298 + }, + { + "epoch": 0.65, + "learning_rate": 
5.780341385860333e-06, + "loss": 0.4543, + "step": 1299 + }, + { + "epoch": 0.65, + "learning_rate": 5.765665457425102e-06, + "loss": 0.409, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_code_gate_load": [ + 206.65, + 175.25, + 174.65, + 157.15, + 173.1, + 187.8, + 194.85, + 170.55 + ], + "eval_code_loss": 0.2615722715854645, + "eval_code_runtime": 1.7786, + "eval_code_samples_per_second": 562.233, + "eval_code_steps_per_second": 35.421, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_orca_gate_load": [ + 499.85, + 346.7, + 403.75, + 402.4, + 350.05, + 418.15, + 370.1, + 351.3 + ], + "eval_orca_loss": 0.4473632872104645, + "eval_orca_runtime": 1.9968, + "eval_orca_samples_per_second": 500.804, + "eval_orca_steps_per_second": 31.551, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_math_gate_load": [ + 312.9, + 224.8, + 227.45, + 234.4, + 250.9, + 253.95, + 276.6, + 251.7 + ], + "eval_math_loss": 0.3106445372104645, + "eval_math_runtime": 1.8419, + "eval_math_samples_per_second": 542.921, + "eval_math_steps_per_second": 34.204, + "step": 1300 + }, + { + "epoch": 0.65, + "eval_sharegpt_gate_load": [ + 1519.25, + 1096.15, + 1298.05, + 1219.5, + 1133.1, + 1354.6, + 1183.8, + 1107.15 + ], + "eval_sharegpt_loss": 0.533984363079071, + "eval_sharegpt_runtime": 2.9971, + "eval_sharegpt_samples_per_second": 333.658, + "eval_sharegpt_steps_per_second": 21.02, + "step": 1300 + }, + { + "epoch": 0.65, + "learning_rate": 5.751000633038573e-06, + "loss": 0.4107, + "step": 1301 + }, + { + "epoch": 0.65, + "learning_rate": 5.736346951157544e-06, + "loss": 0.3936, + "step": 1302 + }, + { + "epoch": 0.65, + "learning_rate": 5.721704450209581e-06, + "loss": 0.3587, + "step": 1303 + }, + { + "epoch": 0.65, + "learning_rate": 5.707073168592943e-06, + "loss": 0.3612, + "step": 1304 + }, + { + "epoch": 0.65, + "learning_rate": 5.692453144676451e-06, + "loss": 0.3779, + "step": 1305 + }, + { + "epoch": 0.65, + "learning_rate": 5.677844416799424e-06, + "loss": 0.4237, + "step": 1306 + }, + { + "epoch": 0.65, + "learning_rate": 5.663247023271543e-06, + "loss": 0.4095, + "step": 1307 + }, + { + "epoch": 0.65, + "learning_rate": 5.648661002372769e-06, + "loss": 0.3707, + "step": 1308 + }, + { + "epoch": 0.65, + "learning_rate": 5.63408639235324e-06, + "loss": 0.4192, + "step": 1309 + }, + { + "epoch": 0.66, + "learning_rate": 5.619523231433177e-06, + "loss": 0.4076, + "step": 1310 + }, + { + "epoch": 0.66, + "learning_rate": 5.604971557802769e-06, + "loss": 0.3707, + "step": 1311 + }, + { + "epoch": 0.66, + "learning_rate": 5.590431409622081e-06, + "loss": 0.3618, + "step": 1312 + }, + { + "epoch": 0.66, + "learning_rate": 5.575902825020962e-06, + "loss": 0.4583, + "step": 1313 + }, + { + "epoch": 0.66, + "learning_rate": 5.56138584209893e-06, + "loss": 0.3753, + "step": 1314 + }, + { + "epoch": 0.66, + "learning_rate": 5.546880498925079e-06, + "loss": 0.4015, + "step": 1315 + }, + { + "epoch": 0.66, + "learning_rate": 5.5323868335379775e-06, + "loss": 0.4733, + "step": 1316 + }, + { + "epoch": 0.66, + "learning_rate": 5.517904883945577e-06, + "loss": 0.4065, + "step": 1317 + }, + { + "epoch": 0.66, + "learning_rate": 5.503434688125104e-06, + "loss": 0.391, + "step": 1318 + }, + { + "epoch": 0.66, + "learning_rate": 5.488976284022953e-06, + "loss": 0.413, + "step": 1319 + }, + { + "epoch": 0.66, + "learning_rate": 5.4745297095546125e-06, + "loss": 0.4226, + "step": 1320 + }, + { + "epoch": 0.66, + "learning_rate": 5.460095002604533e-06, + "loss": 0.4065, + "step": 1321 + }, + { + "epoch": 0.66, + "learning_rate": 
5.445672201026054e-06, + "loss": 0.3748, + "step": 1322 + }, + { + "epoch": 0.66, + "learning_rate": 5.431261342641287e-06, + "loss": 0.3484, + "step": 1323 + }, + { + "epoch": 0.66, + "learning_rate": 5.416862465241033e-06, + "loss": 0.3729, + "step": 1324 + }, + { + "epoch": 0.66, + "learning_rate": 5.40247560658467e-06, + "loss": 0.4519, + "step": 1325 + }, + { + "epoch": 0.66, + "learning_rate": 5.3881008044000495e-06, + "loss": 0.3754, + "step": 1326 + }, + { + "epoch": 0.66, + "learning_rate": 5.373738096383423e-06, + "loss": 0.399, + "step": 1327 + }, + { + "epoch": 0.66, + "learning_rate": 5.359387520199317e-06, + "loss": 0.3878, + "step": 1328 + }, + { + "epoch": 0.66, + "learning_rate": 5.3450491134804416e-06, + "loss": 0.3375, + "step": 1329 + }, + { + "epoch": 0.67, + "learning_rate": 5.330722913827594e-06, + "loss": 0.3478, + "step": 1330 + }, + { + "epoch": 0.67, + "learning_rate": 5.3164089588095705e-06, + "loss": 0.4146, + "step": 1331 + }, + { + "epoch": 0.67, + "learning_rate": 5.302107285963045e-06, + "loss": 0.4672, + "step": 1332 + }, + { + "epoch": 0.67, + "learning_rate": 5.287817932792485e-06, + "loss": 0.389, + "step": 1333 + }, + { + "epoch": 0.67, + "learning_rate": 5.273540936770059e-06, + "loss": 0.3968, + "step": 1334 + }, + { + "epoch": 0.67, + "learning_rate": 5.259276335335522e-06, + "loss": 0.4544, + "step": 1335 + }, + { + "epoch": 0.67, + "learning_rate": 5.245024165896126e-06, + "loss": 0.4088, + "step": 1336 + }, + { + "epoch": 0.67, + "learning_rate": 5.2307844658265236e-06, + "loss": 0.3966, + "step": 1337 + }, + { + "epoch": 0.67, + "learning_rate": 5.216557272468675e-06, + "loss": 0.3967, + "step": 1338 + }, + { + "epoch": 0.67, + "learning_rate": 5.202342623131731e-06, + "loss": 0.4211, + "step": 1339 + }, + { + "epoch": 0.67, + "learning_rate": 5.18814055509195e-06, + "loss": 0.3969, + "step": 1340 + }, + { + "epoch": 0.67, + "learning_rate": 5.173951105592605e-06, + "loss": 0.3714, + "step": 1341 + }, + { + "epoch": 0.67, + "learning_rate": 5.1597743118438725e-06, + "loss": 0.4332, + "step": 1342 + }, + { + "epoch": 0.67, + "learning_rate": 5.145610211022738e-06, + "loss": 0.4368, + "step": 1343 + }, + { + "epoch": 0.67, + "learning_rate": 5.131458840272905e-06, + "loss": 0.3699, + "step": 1344 + }, + { + "epoch": 0.67, + "learning_rate": 5.117320236704697e-06, + "loss": 0.3549, + "step": 1345 + }, + { + "epoch": 0.67, + "learning_rate": 5.103194437394952e-06, + "loss": 0.3848, + "step": 1346 + }, + { + "epoch": 0.67, + "learning_rate": 5.089081479386928e-06, + "loss": 0.4111, + "step": 1347 + }, + { + "epoch": 0.67, + "learning_rate": 5.074981399690219e-06, + "loss": 0.4432, + "step": 1348 + }, + { + "epoch": 0.67, + "learning_rate": 5.060894235280637e-06, + "loss": 0.4486, + "step": 1349 + }, + { + "epoch": 0.68, + "learning_rate": 5.046820023100129e-06, + "loss": 0.3844, + "step": 1350 + }, + { + "epoch": 0.68, + "learning_rate": 5.03275880005667e-06, + "loss": 0.393, + "step": 1351 + }, + { + "epoch": 0.68, + "learning_rate": 5.018710603024187e-06, + "loss": 0.3565, + "step": 1352 + }, + { + "epoch": 0.68, + "learning_rate": 5.004675468842436e-06, + "loss": 0.3473, + "step": 1353 + }, + { + "epoch": 0.68, + "learning_rate": 4.990653434316915e-06, + "loss": 0.4119, + "step": 1354 + }, + { + "epoch": 0.68, + "learning_rate": 4.976644536218783e-06, + "loss": 0.3434, + "step": 1355 + }, + { + "epoch": 0.68, + "learning_rate": 4.9626488112847384e-06, + "loss": 0.3702, + "step": 1356 + }, + { + "epoch": 0.68, + "learning_rate": 
4.948666296216938e-06, + "loss": 0.4028, + "step": 1357 + }, + { + "epoch": 0.68, + "learning_rate": 4.934697027682894e-06, + "loss": 0.3527, + "step": 1358 + }, + { + "epoch": 0.68, + "learning_rate": 4.9207410423153925e-06, + "loss": 0.3867, + "step": 1359 + }, + { + "epoch": 0.68, + "learning_rate": 4.9067983767123736e-06, + "loss": 0.4232, + "step": 1360 + }, + { + "epoch": 0.68, + "learning_rate": 4.8928690674368495e-06, + "loss": 0.4539, + "step": 1361 + }, + { + "epoch": 0.68, + "learning_rate": 4.878953151016816e-06, + "loss": 0.4188, + "step": 1362 + }, + { + "epoch": 0.68, + "learning_rate": 4.8650506639451385e-06, + "loss": 0.4357, + "step": 1363 + }, + { + "epoch": 0.68, + "learning_rate": 4.851161642679466e-06, + "loss": 0.3801, + "step": 1364 + }, + { + "epoch": 0.68, + "learning_rate": 4.837286123642141e-06, + "loss": 0.4286, + "step": 1365 + }, + { + "epoch": 0.68, + "learning_rate": 4.823424143220097e-06, + "loss": 0.411, + "step": 1366 + }, + { + "epoch": 0.68, + "learning_rate": 4.809575737764759e-06, + "loss": 0.4003, + "step": 1367 + }, + { + "epoch": 0.68, + "learning_rate": 4.795740943591955e-06, + "loss": 0.4079, + "step": 1368 + }, + { + "epoch": 0.68, + "learning_rate": 4.781919796981818e-06, + "loss": 0.4187, + "step": 1369 + }, + { + "epoch": 0.69, + "learning_rate": 4.7681123341787e-06, + "loss": 0.4027, + "step": 1370 + }, + { + "epoch": 0.69, + "learning_rate": 4.754318591391057e-06, + "loss": 0.3967, + "step": 1371 + }, + { + "epoch": 0.69, + "learning_rate": 4.740538604791371e-06, + "loss": 0.3976, + "step": 1372 + }, + { + "epoch": 0.69, + "learning_rate": 4.726772410516055e-06, + "loss": 0.4034, + "step": 1373 + }, + { + "epoch": 0.69, + "learning_rate": 4.713020044665348e-06, + "loss": 0.4319, + "step": 1374 + }, + { + "epoch": 0.69, + "learning_rate": 4.699281543303222e-06, + "loss": 0.4228, + "step": 1375 + }, + { + "epoch": 0.69, + "learning_rate": 4.685556942457296e-06, + "loss": 0.3096, + "step": 1376 + }, + { + "epoch": 0.69, + "learning_rate": 4.67184627811874e-06, + "loss": 0.4083, + "step": 1377 + }, + { + "epoch": 0.69, + "learning_rate": 4.65814958624217e-06, + "loss": 0.3911, + "step": 1378 + }, + { + "epoch": 0.69, + "learning_rate": 4.6444669027455615e-06, + "loss": 0.4363, + "step": 1379 + }, + { + "epoch": 0.69, + "learning_rate": 4.630798263510162e-06, + "loss": 0.3926, + "step": 1380 + }, + { + "epoch": 0.69, + "learning_rate": 4.617143704380382e-06, + "loss": 0.3413, + "step": 1381 + }, + { + "epoch": 0.69, + "learning_rate": 4.60350326116371e-06, + "loss": 0.414, + "step": 1382 + }, + { + "epoch": 0.69, + "learning_rate": 4.589876969630616e-06, + "loss": 0.3942, + "step": 1383 + }, + { + "epoch": 0.69, + "learning_rate": 4.576264865514467e-06, + "loss": 0.4239, + "step": 1384 + }, + { + "epoch": 0.69, + "learning_rate": 4.562666984511416e-06, + "loss": 0.4214, + "step": 1385 + }, + { + "epoch": 0.69, + "learning_rate": 4.549083362280318e-06, + "loss": 0.3739, + "step": 1386 + }, + { + "epoch": 0.69, + "learning_rate": 4.535514034442644e-06, + "loss": 0.372, + "step": 1387 + }, + { + "epoch": 0.69, + "learning_rate": 4.521959036582372e-06, + "loss": 0.3932, + "step": 1388 + }, + { + "epoch": 0.69, + "learning_rate": 4.508418404245903e-06, + "loss": 0.3606, + "step": 1389 + }, + { + "epoch": 0.69, + "learning_rate": 4.494892172941965e-06, + "loss": 0.4025, + "step": 1390 + }, + { + "epoch": 0.7, + "learning_rate": 4.481380378141528e-06, + "loss": 0.3945, + "step": 1391 + }, + { + "epoch": 0.7, + "learning_rate": 4.467883055277696e-06, + 
"loss": 0.4146, + "step": 1392 + }, + { + "epoch": 0.7, + "learning_rate": 4.454400239745619e-06, + "loss": 0.4055, + "step": 1393 + }, + { + "epoch": 0.7, + "learning_rate": 4.440931966902419e-06, + "loss": 0.3951, + "step": 1394 + }, + { + "epoch": 0.7, + "learning_rate": 4.427478272067066e-06, + "loss": 0.4238, + "step": 1395 + }, + { + "epoch": 0.7, + "learning_rate": 4.414039190520308e-06, + "loss": 0.4598, + "step": 1396 + }, + { + "epoch": 0.7, + "learning_rate": 4.400614757504565e-06, + "loss": 0.4238, + "step": 1397 + }, + { + "epoch": 0.7, + "learning_rate": 4.3872050082238535e-06, + "loss": 0.3732, + "step": 1398 + }, + { + "epoch": 0.7, + "learning_rate": 4.373809977843676e-06, + "loss": 0.4321, + "step": 1399 + }, + { + "epoch": 0.7, + "learning_rate": 4.360429701490935e-06, + "loss": 0.4115, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_code_gate_load": [ + 205.5, + 171.95, + 176.4, + 155.0, + 176.15, + 185.45, + 196.15, + 173.4 + ], + "eval_code_loss": 0.2625488340854645, + "eval_code_runtime": 1.7811, + "eval_code_samples_per_second": 561.449, + "eval_code_steps_per_second": 35.371, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_orca_gate_load": [ + 499.7, + 344.5, + 406.5, + 399.4, + 350.35, + 416.7, + 369.6, + 355.55 + ], + "eval_orca_loss": 0.39360350370407104, + "eval_orca_runtime": 1.9989, + "eval_orca_samples_per_second": 500.287, + "eval_orca_steps_per_second": 31.518, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_math_gate_load": [ + 315.6, + 219.15, + 229.9, + 225.75, + 253.55, + 256.25, + 275.3, + 257.2 + ], + "eval_math_loss": 0.30537110567092896, + "eval_math_runtime": 1.8428, + "eval_math_samples_per_second": 542.647, + "eval_math_steps_per_second": 34.187, + "step": 1400 + }, + { + "epoch": 0.7, + "eval_sharegpt_gate_load": [ + 1527.85, + 1080.7, + 1307.7, + 1213.2, + 1131.55, + 1358.05, + 1182.35, + 1110.2 + ], + "eval_sharegpt_loss": 0.53271484375, + "eval_sharegpt_runtime": 3.0058, + "eval_sharegpt_samples_per_second": 332.695, + "eval_sharegpt_steps_per_second": 20.96, + "step": 1400 + }, + { + "epoch": 0.7, + "learning_rate": 4.34706421425385e-06, + "loss": 0.3626, + "step": 1401 + }, + { + "epoch": 0.7, + "learning_rate": 4.3337135511818514e-06, + "loss": 0.3482, + "step": 1402 + }, + { + "epoch": 0.7, + "learning_rate": 4.320377747285497e-06, + "loss": 0.3597, + "step": 1403 + }, + { + "epoch": 0.7, + "learning_rate": 4.307056837536373e-06, + "loss": 0.4391, + "step": 1404 + }, + { + "epoch": 0.7, + "learning_rate": 4.2937508568670194e-06, + "loss": 0.4752, + "step": 1405 + }, + { + "epoch": 0.7, + "learning_rate": 4.280459840170818e-06, + "loss": 0.4315, + "step": 1406 + }, + { + "epoch": 0.7, + "learning_rate": 4.267183822301903e-06, + "loss": 0.4123, + "step": 1407 + }, + { + "epoch": 0.7, + "learning_rate": 4.2539228380750955e-06, + "loss": 0.3806, + "step": 1408 + }, + { + "epoch": 0.7, + "learning_rate": 4.240676922265774e-06, + "loss": 0.3826, + "step": 1409 + }, + { + "epoch": 0.7, + "learning_rate": 4.2274461096098085e-06, + "loss": 0.4211, + "step": 1410 + }, + { + "epoch": 0.71, + "learning_rate": 4.21423043480346e-06, + "loss": 0.382, + "step": 1411 + }, + { + "epoch": 0.71, + "learning_rate": 4.201029932503303e-06, + "loss": 0.415, + "step": 1412 + }, + { + "epoch": 0.71, + "learning_rate": 4.18784463732611e-06, + "loss": 0.4246, + "step": 1413 + }, + { + "epoch": 0.71, + "learning_rate": 4.17467458384878e-06, + "loss": 0.4311, + "step": 1414 + }, + { + "epoch": 0.71, + "learning_rate": 4.1615198066082475e-06, + "loss": 0.4051, + 
"step": 1415 + }, + { + "epoch": 0.71, + "learning_rate": 4.14838034010138e-06, + "loss": 0.3679, + "step": 1416 + }, + { + "epoch": 0.71, + "learning_rate": 4.135256218784896e-06, + "loss": 0.3997, + "step": 1417 + }, + { + "epoch": 0.71, + "learning_rate": 4.12214747707527e-06, + "loss": 0.4115, + "step": 1418 + }, + { + "epoch": 0.71, + "learning_rate": 4.1090541493486555e-06, + "loss": 0.413, + "step": 1419 + }, + { + "epoch": 0.71, + "learning_rate": 4.095976269940777e-06, + "loss": 0.3356, + "step": 1420 + }, + { + "epoch": 0.71, + "learning_rate": 4.082913873146842e-06, + "loss": 0.3641, + "step": 1421 + }, + { + "epoch": 0.71, + "learning_rate": 4.069866993221473e-06, + "loss": 0.3875, + "step": 1422 + }, + { + "epoch": 0.71, + "learning_rate": 4.056835664378585e-06, + "loss": 0.3374, + "step": 1423 + }, + { + "epoch": 0.71, + "learning_rate": 4.043819920791322e-06, + "loss": 0.3604, + "step": 1424 + }, + { + "epoch": 0.71, + "learning_rate": 4.03081979659195e-06, + "loss": 0.3292, + "step": 1425 + }, + { + "epoch": 0.71, + "learning_rate": 4.017835325871781e-06, + "loss": 0.3759, + "step": 1426 + }, + { + "epoch": 0.71, + "learning_rate": 4.004866542681079e-06, + "loss": 0.4933, + "step": 1427 + }, + { + "epoch": 0.71, + "learning_rate": 3.991913481028965e-06, + "loss": 0.343, + "step": 1428 + }, + { + "epoch": 0.71, + "learning_rate": 3.978976174883329e-06, + "loss": 0.3699, + "step": 1429 + }, + { + "epoch": 0.71, + "learning_rate": 3.966054658170754e-06, + "loss": 0.3808, + "step": 1430 + }, + { + "epoch": 0.72, + "learning_rate": 3.953148964776408e-06, + "loss": 0.3943, + "step": 1431 + }, + { + "epoch": 0.72, + "learning_rate": 3.940259128543967e-06, + "loss": 0.4061, + "step": 1432 + }, + { + "epoch": 0.72, + "learning_rate": 3.927385183275522e-06, + "loss": 0.3791, + "step": 1433 + }, + { + "epoch": 0.72, + "learning_rate": 3.914527162731498e-06, + "loss": 0.3934, + "step": 1434 + }, + { + "epoch": 0.72, + "learning_rate": 3.901685100630554e-06, + "loss": 0.3913, + "step": 1435 + }, + { + "epoch": 0.72, + "learning_rate": 3.888859030649498e-06, + "loss": 0.3934, + "step": 1436 + }, + { + "epoch": 0.72, + "learning_rate": 3.876048986423207e-06, + "loss": 0.4283, + "step": 1437 + }, + { + "epoch": 0.72, + "learning_rate": 3.863255001544526e-06, + "loss": 0.3086, + "step": 1438 + }, + { + "epoch": 0.72, + "learning_rate": 3.8504771095641905e-06, + "loss": 0.358, + "step": 1439 + }, + { + "epoch": 0.72, + "learning_rate": 3.837715343990727e-06, + "loss": 0.3836, + "step": 1440 + }, + { + "epoch": 0.72, + "learning_rate": 3.824969738290386e-06, + "loss": 0.351, + "step": 1441 + }, + { + "epoch": 0.72, + "learning_rate": 3.81224032588703e-06, + "loss": 0.3598, + "step": 1442 + }, + { + "epoch": 0.72, + "learning_rate": 3.7995271401620548e-06, + "loss": 0.4081, + "step": 1443 + }, + { + "epoch": 0.72, + "learning_rate": 3.7868302144543146e-06, + "loss": 0.3872, + "step": 1444 + }, + { + "epoch": 0.72, + "learning_rate": 3.7741495820600128e-06, + "loss": 0.3478, + "step": 1445 + }, + { + "epoch": 0.72, + "learning_rate": 3.7614852762326303e-06, + "loss": 0.4235, + "step": 1446 + }, + { + "epoch": 0.72, + "learning_rate": 3.7488373301828296e-06, + "loss": 0.3883, + "step": 1447 + }, + { + "epoch": 0.72, + "learning_rate": 3.736205777078381e-06, + "loss": 0.3884, + "step": 1448 + }, + { + "epoch": 0.72, + "learning_rate": 3.7235906500440576e-06, + "loss": 0.4015, + "step": 1449 + }, + { + "epoch": 0.72, + "learning_rate": 3.7109919821615546e-06, + "loss": 0.3689, + "step": 1450 + }, 
+ { + "epoch": 0.73, + "learning_rate": 3.6984098064694174e-06, + "loss": 0.3799, + "step": 1451 + }, + { + "epoch": 0.73, + "learning_rate": 3.685844155962931e-06, + "loss": 0.3901, + "step": 1452 + }, + { + "epoch": 0.73, + "learning_rate": 3.673295063594049e-06, + "loss": 0.4099, + "step": 1453 + }, + { + "epoch": 0.73, + "learning_rate": 3.6607625622713005e-06, + "loss": 0.3802, + "step": 1454 + }, + { + "epoch": 0.73, + "learning_rate": 3.6482466848597164e-06, + "loss": 0.3853, + "step": 1455 + }, + { + "epoch": 0.73, + "learning_rate": 3.63574746418072e-06, + "loss": 0.3828, + "step": 1456 + }, + { + "epoch": 0.73, + "learning_rate": 3.6232649330120608e-06, + "loss": 0.3936, + "step": 1457 + }, + { + "epoch": 0.73, + "learning_rate": 3.610799124087725e-06, + "loss": 0.3601, + "step": 1458 + }, + { + "epoch": 0.73, + "learning_rate": 3.5983500700978425e-06, + "loss": 0.3573, + "step": 1459 + }, + { + "epoch": 0.73, + "learning_rate": 3.585917803688603e-06, + "loss": 0.4001, + "step": 1460 + }, + { + "epoch": 0.73, + "learning_rate": 3.5735023574621765e-06, + "loss": 0.3791, + "step": 1461 + }, + { + "epoch": 0.73, + "learning_rate": 3.5611037639766267e-06, + "loss": 0.3672, + "step": 1462 + }, + { + "epoch": 0.73, + "learning_rate": 3.548722055745818e-06, + "loss": 0.4283, + "step": 1463 + }, + { + "epoch": 0.73, + "learning_rate": 3.536357265239333e-06, + "loss": 0.3697, + "step": 1464 + }, + { + "epoch": 0.73, + "learning_rate": 3.5240094248824e-06, + "loss": 0.3795, + "step": 1465 + }, + { + "epoch": 0.73, + "learning_rate": 3.511678567055786e-06, + "loss": 0.3783, + "step": 1466 + }, + { + "epoch": 0.73, + "learning_rate": 3.4993647240957307e-06, + "loss": 0.3738, + "step": 1467 + }, + { + "epoch": 0.73, + "learning_rate": 3.487067928293848e-06, + "loss": 0.3223, + "step": 1468 + }, + { + "epoch": 0.73, + "learning_rate": 3.4747882118970565e-06, + "loss": 0.4047, + "step": 1469 + }, + { + "epoch": 0.73, + "learning_rate": 3.4625256071074776e-06, + "loss": 0.3838, + "step": 1470 + }, + { + "epoch": 0.74, + "learning_rate": 3.450280146082361e-06, + "loss": 0.3525, + "step": 1471 + }, + { + "epoch": 0.74, + "learning_rate": 3.4380518609340076e-06, + "loss": 0.4043, + "step": 1472 + }, + { + "epoch": 0.74, + "learning_rate": 3.4258407837296635e-06, + "loss": 0.3922, + "step": 1473 + }, + { + "epoch": 0.74, + "learning_rate": 3.413646946491458e-06, + "loss": 0.4094, + "step": 1474 + }, + { + "epoch": 0.74, + "learning_rate": 3.4014703811963024e-06, + "loss": 0.4139, + "step": 1475 + }, + { + "epoch": 0.74, + "learning_rate": 3.3893111197758276e-06, + "loss": 0.3647, + "step": 1476 + }, + { + "epoch": 0.74, + "learning_rate": 3.3771691941162755e-06, + "loss": 0.3618, + "step": 1477 + }, + { + "epoch": 0.74, + "learning_rate": 3.3650446360584276e-06, + "loss": 0.4106, + "step": 1478 + }, + { + "epoch": 0.74, + "learning_rate": 3.35293747739753e-06, + "loss": 0.3881, + "step": 1479 + }, + { + "epoch": 0.74, + "learning_rate": 3.3408477498831917e-06, + "loss": 0.4383, + "step": 1480 + }, + { + "epoch": 0.74, + "learning_rate": 3.3287754852193143e-06, + "loss": 0.3771, + "step": 1481 + }, + { + "epoch": 0.74, + "learning_rate": 3.3167207150640003e-06, + "loss": 0.3858, + "step": 1482 + }, + { + "epoch": 0.74, + "learning_rate": 3.304683471029485e-06, + "loss": 0.3392, + "step": 1483 + }, + { + "epoch": 0.74, + "learning_rate": 3.2926637846820366e-06, + "loss": 0.4236, + "step": 1484 + }, + { + "epoch": 0.74, + "learning_rate": 3.280661687541876e-06, + "loss": 0.3373, + "step": 1485 + }, + { 
+ "epoch": 0.74, + "learning_rate": 3.268677211083109e-06, + "loss": 0.424, + "step": 1486 + }, + { + "epoch": 0.74, + "learning_rate": 3.256710386733629e-06, + "loss": 0.4352, + "step": 1487 + }, + { + "epoch": 0.74, + "learning_rate": 3.2447612458750365e-06, + "loss": 0.3318, + "step": 1488 + }, + { + "epoch": 0.74, + "learning_rate": 3.2328298198425556e-06, + "loss": 0.3844, + "step": 1489 + }, + { + "epoch": 0.74, + "learning_rate": 3.2209161399249677e-06, + "loss": 0.3987, + "step": 1490 + }, + { + "epoch": 0.75, + "learning_rate": 3.209020237364505e-06, + "loss": 0.3508, + "step": 1491 + }, + { + "epoch": 0.75, + "learning_rate": 3.197142143356787e-06, + "loss": 0.3412, + "step": 1492 + }, + { + "epoch": 0.75, + "learning_rate": 3.1852818890507255e-06, + "loss": 0.3601, + "step": 1493 + }, + { + "epoch": 0.75, + "learning_rate": 3.1734395055484623e-06, + "loss": 0.3728, + "step": 1494 + }, + { + "epoch": 0.75, + "learning_rate": 3.1616150239052647e-06, + "loss": 0.4014, + "step": 1495 + }, + { + "epoch": 0.75, + "learning_rate": 3.1498084751294523e-06, + "loss": 0.3344, + "step": 1496 + }, + { + "epoch": 0.75, + "learning_rate": 3.1380198901823313e-06, + "loss": 0.3488, + "step": 1497 + }, + { + "epoch": 0.75, + "learning_rate": 3.126249299978086e-06, + "loss": 0.3459, + "step": 1498 + }, + { + "epoch": 0.75, + "learning_rate": 3.1144967353837196e-06, + "loss": 0.4079, + "step": 1499 + }, + { + "epoch": 0.75, + "learning_rate": 3.1027622272189572e-06, + "loss": 0.3557, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_code_gate_load": [ + 207.05, + 170.35, + 182.3, + 156.9, + 179.05, + 184.35, + 193.1, + 166.9 + ], + "eval_code_loss": 0.2623046934604645, + "eval_code_runtime": 1.7777, + "eval_code_samples_per_second": 562.517, + "eval_code_steps_per_second": 35.439, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_orca_gate_load": [ + 501.2, + 343.6, + 407.1, + 397.6, + 349.35, + 418.55, + 371.3, + 353.6 + ], + "eval_orca_loss": 0.35224610567092896, + "eval_orca_runtime": 2.0086, + "eval_orca_samples_per_second": 497.848, + "eval_orca_steps_per_second": 31.364, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_math_gate_load": [ + 318.45, + 216.6, + 235.85, + 226.0, + 251.4, + 258.6, + 275.95, + 249.85 + ], + "eval_math_loss": 0.3041015565395355, + "eval_math_runtime": 1.8624, + "eval_math_samples_per_second": 536.934, + "eval_math_steps_per_second": 33.827, + "step": 1500 + }, + { + "epoch": 0.75, + "eval_sharegpt_gate_load": [ + 1534.6, + 1081.35, + 1305.0, + 1210.7, + 1133.0, + 1357.35, + 1191.1, + 1098.5 + ], + "eval_sharegpt_loss": 0.530078113079071, + "eval_sharegpt_runtime": 2.9848, + "eval_sharegpt_samples_per_second": 335.033, + "eval_sharegpt_steps_per_second": 21.107, + "step": 1500 + }, + { + "epoch": 0.75, + "learning_rate": 3.0910458062561865e-06, + "loss": 0.3699, + "step": 1501 + }, + { + "epoch": 0.75, + "learning_rate": 3.0793475032203513e-06, + "loss": 0.4025, + "step": 1502 + }, + { + "epoch": 0.75, + "learning_rate": 3.0676673487888854e-06, + "loss": 0.3846, + "step": 1503 + }, + { + "epoch": 0.75, + "learning_rate": 3.0560053735916372e-06, + "loss": 0.3701, + "step": 1504 + }, + { + "epoch": 0.75, + "learning_rate": 3.0443616082107753e-06, + "loss": 0.4251, + "step": 1505 + }, + { + "epoch": 0.75, + "learning_rate": 3.032736083180716e-06, + "loss": 0.3944, + "step": 1506 + }, + { + "epoch": 0.75, + "learning_rate": 3.0211288289880404e-06, + "loss": 0.3492, + "step": 1507 + }, + { + "epoch": 0.75, + "learning_rate": 3.009539876071427e-06, + "loss": 0.3408, + 
"step": 1508 + }, + { + "epoch": 0.75, + "learning_rate": 2.997969254821548e-06, + "loss": 0.3257, + "step": 1509 + }, + { + "epoch": 0.76, + "learning_rate": 2.9864169955810085e-06, + "loss": 0.3895, + "step": 1510 + }, + { + "epoch": 0.76, + "learning_rate": 2.974883128644266e-06, + "loss": 0.367, + "step": 1511 + }, + { + "epoch": 0.76, + "learning_rate": 2.9633676842575386e-06, + "loss": 0.3883, + "step": 1512 + }, + { + "epoch": 0.76, + "learning_rate": 2.951870692618739e-06, + "loss": 0.3662, + "step": 1513 + }, + { + "epoch": 0.76, + "learning_rate": 2.940392183877382e-06, + "loss": 0.338, + "step": 1514 + }, + { + "epoch": 0.76, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.3624, + "step": 1515 + }, + { + "epoch": 0.76, + "learning_rate": 2.9174907354426696e-06, + "loss": 0.3262, + "step": 1516 + }, + { + "epoch": 0.76, + "learning_rate": 2.9060678558056876e-06, + "loss": 0.3512, + "step": 1517 + }, + { + "epoch": 0.76, + "learning_rate": 2.8946635791787546e-06, + "loss": 0.4026, + "step": 1518 + }, + { + "epoch": 0.76, + "learning_rate": 2.883277935468254e-06, + "loss": 0.3844, + "step": 1519 + }, + { + "epoch": 0.76, + "learning_rate": 2.8719109545317102e-06, + "loss": 0.3645, + "step": 1520 + }, + { + "epoch": 0.76, + "learning_rate": 2.8605626661776995e-06, + "loss": 0.3448, + "step": 1521 + }, + { + "epoch": 0.76, + "learning_rate": 2.849233100165795e-06, + "loss": 0.405, + "step": 1522 + }, + { + "epoch": 0.76, + "learning_rate": 2.837922286206457e-06, + "loss": 0.3771, + "step": 1523 + }, + { + "epoch": 0.76, + "learning_rate": 2.8266302539609747e-06, + "loss": 0.354, + "step": 1524 + }, + { + "epoch": 0.76, + "learning_rate": 2.8153570330413925e-06, + "loss": 0.3807, + "step": 1525 + }, + { + "epoch": 0.76, + "learning_rate": 2.8041026530104144e-06, + "loss": 0.3702, + "step": 1526 + }, + { + "epoch": 0.76, + "learning_rate": 2.7928671433813392e-06, + "loss": 0.3836, + "step": 1527 + }, + { + "epoch": 0.76, + "learning_rate": 2.78165053361798e-06, + "loss": 0.3682, + "step": 1528 + }, + { + "epoch": 0.76, + "learning_rate": 2.770452853134593e-06, + "loss": 0.3843, + "step": 1529 + }, + { + "epoch": 0.77, + "learning_rate": 2.759274131295787e-06, + "loss": 0.4317, + "step": 1530 + }, + { + "epoch": 0.77, + "learning_rate": 2.7481143974164548e-06, + "loss": 0.3398, + "step": 1531 + }, + { + "epoch": 0.77, + "learning_rate": 2.736973680761702e-06, + "loss": 0.3638, + "step": 1532 + }, + { + "epoch": 0.77, + "learning_rate": 2.7258520105467566e-06, + "loss": 0.3795, + "step": 1533 + }, + { + "epoch": 0.77, + "learning_rate": 2.714749415936904e-06, + "loss": 0.3695, + "step": 1534 + }, + { + "epoch": 0.77, + "learning_rate": 2.7036659260473973e-06, + "loss": 0.3323, + "step": 1535 + }, + { + "epoch": 0.77, + "learning_rate": 2.692601569943407e-06, + "loss": 0.3591, + "step": 1536 + }, + { + "epoch": 0.77, + "learning_rate": 2.6815563766399122e-06, + "loss": 0.3609, + "step": 1537 + }, + { + "epoch": 0.77, + "learning_rate": 2.670530375101641e-06, + "loss": 0.3286, + "step": 1538 + }, + { + "epoch": 0.77, + "learning_rate": 2.6595235942430044e-06, + "loss": 0.347, + "step": 1539 + }, + { + "epoch": 0.77, + "learning_rate": 2.648536062927999e-06, + "loss": 0.3776, + "step": 1540 + }, + { + "epoch": 0.77, + "learning_rate": 2.637567809970143e-06, + "loss": 0.469, + "step": 1541 + }, + { + "epoch": 0.77, + "learning_rate": 2.6266188641324e-06, + "loss": 0.3424, + "step": 1542 + }, + { + "epoch": 0.77, + "learning_rate": 2.6156892541271083e-06, + "loss": 0.384, + "step": 
1543 + }, + { + "epoch": 0.77, + "learning_rate": 2.604779008615895e-06, + "loss": 0.3755, + "step": 1544 + }, + { + "epoch": 0.77, + "learning_rate": 2.593888156209603e-06, + "loss": 0.37, + "step": 1545 + }, + { + "epoch": 0.77, + "learning_rate": 2.583016725468226e-06, + "loss": 0.3906, + "step": 1546 + }, + { + "epoch": 0.77, + "learning_rate": 2.572164744900827e-06, + "loss": 0.3445, + "step": 1547 + }, + { + "epoch": 0.77, + "learning_rate": 2.5613322429654573e-06, + "loss": 0.3819, + "step": 1548 + }, + { + "epoch": 0.77, + "learning_rate": 2.5505192480690865e-06, + "loss": 0.305, + "step": 1549 + }, + { + "epoch": 0.78, + "learning_rate": 2.5397257885675396e-06, + "loss": 0.2964, + "step": 1550 + }, + { + "epoch": 0.78, + "learning_rate": 2.528951892765402e-06, + "loss": 0.3686, + "step": 1551 + }, + { + "epoch": 0.78, + "learning_rate": 2.5181975889159615e-06, + "loss": 0.3582, + "step": 1552 + }, + { + "epoch": 0.78, + "learning_rate": 2.507462905221122e-06, + "loss": 0.3419, + "step": 1553 + }, + { + "epoch": 0.78, + "learning_rate": 2.496747869831345e-06, + "loss": 0.3447, + "step": 1554 + }, + { + "epoch": 0.78, + "learning_rate": 2.48605251084556e-06, + "loss": 0.3513, + "step": 1555 + }, + { + "epoch": 0.78, + "learning_rate": 2.475376856311097e-06, + "loss": 0.3515, + "step": 1556 + }, + { + "epoch": 0.78, + "learning_rate": 2.464720934223619e-06, + "loss": 0.347, + "step": 1557 + }, + { + "epoch": 0.78, + "learning_rate": 2.4540847725270376e-06, + "loss": 0.3591, + "step": 1558 + }, + { + "epoch": 0.78, + "learning_rate": 2.4434683991134476e-06, + "loss": 0.3858, + "step": 1559 + }, + { + "epoch": 0.78, + "learning_rate": 2.432871841823047e-06, + "loss": 0.3432, + "step": 1560 + }, + { + "epoch": 0.78, + "learning_rate": 2.4222951284440776e-06, + "loss": 0.3379, + "step": 1561 + }, + { + "epoch": 0.78, + "learning_rate": 2.411738286712735e-06, + "loss": 0.3651, + "step": 1562 + }, + { + "epoch": 0.78, + "learning_rate": 2.401201344313102e-06, + "loss": 0.411, + "step": 1563 + }, + { + "epoch": 0.78, + "learning_rate": 2.390684328877089e-06, + "loss": 0.3929, + "step": 1564 + }, + { + "epoch": 0.78, + "learning_rate": 2.3801872679843384e-06, + "loss": 0.335, + "step": 1565 + }, + { + "epoch": 0.78, + "learning_rate": 2.36971018916217e-06, + "loss": 0.347, + "step": 1566 + }, + { + "epoch": 0.78, + "learning_rate": 2.3592531198854974e-06, + "loss": 0.36, + "step": 1567 + }, + { + "epoch": 0.78, + "learning_rate": 2.3488160875767717e-06, + "loss": 0.3374, + "step": 1568 + }, + { + "epoch": 0.78, + "learning_rate": 2.3383991196058918e-06, + "loss": 0.287, + "step": 1569 + }, + { + "epoch": 0.79, + "learning_rate": 2.328002243290138e-06, + "loss": 0.3322, + "step": 1570 + }, + { + "epoch": 0.79, + "learning_rate": 2.317625485894113e-06, + "loss": 0.324, + "step": 1571 + }, + { + "epoch": 0.79, + "learning_rate": 2.307268874629649e-06, + "loss": 0.3286, + "step": 1572 + }, + { + "epoch": 0.79, + "learning_rate": 2.296932436655752e-06, + "loss": 0.3224, + "step": 1573 + }, + { + "epoch": 0.79, + "learning_rate": 2.2866161990785228e-06, + "loss": 0.3541, + "step": 1574 + }, + { + "epoch": 0.79, + "learning_rate": 2.2763201889510987e-06, + "loss": 0.3138, + "step": 1575 + }, + { + "epoch": 0.79, + "learning_rate": 2.266044433273562e-06, + "loss": 0.3317, + "step": 1576 + }, + { + "epoch": 0.79, + "learning_rate": 2.2557889589928815e-06, + "loss": 0.3571, + "step": 1577 + }, + { + "epoch": 0.79, + "learning_rate": 2.245553793002849e-06, + "loss": 0.3136, + "step": 1578 + }, + { + 
"epoch": 0.79, + "learning_rate": 2.23533896214399e-06, + "loss": 0.3203, + "step": 1579 + }, + { + "epoch": 0.79, + "learning_rate": 2.2251444932035094e-06, + "loss": 0.3453, + "step": 1580 + }, + { + "epoch": 0.79, + "learning_rate": 2.2149704129152083e-06, + "loss": 0.4132, + "step": 1581 + }, + { + "epoch": 0.79, + "learning_rate": 2.204816747959434e-06, + "loss": 0.368, + "step": 1582 + }, + { + "epoch": 0.79, + "learning_rate": 2.194683524962986e-06, + "loss": 0.3031, + "step": 1583 + }, + { + "epoch": 0.79, + "learning_rate": 2.184570770499056e-06, + "loss": 0.3408, + "step": 1584 + }, + { + "epoch": 0.79, + "learning_rate": 2.1744785110871713e-06, + "loss": 0.3161, + "step": 1585 + }, + { + "epoch": 0.79, + "learning_rate": 2.1644067731931005e-06, + "loss": 0.2998, + "step": 1586 + }, + { + "epoch": 0.79, + "learning_rate": 2.1543555832288056e-06, + "loss": 0.3601, + "step": 1587 + }, + { + "epoch": 0.79, + "learning_rate": 2.1443249675523536e-06, + "loss": 0.343, + "step": 1588 + }, + { + "epoch": 0.79, + "learning_rate": 2.134314952467873e-06, + "loss": 0.306, + "step": 1589 + }, + { + "epoch": 0.8, + "learning_rate": 2.124325564225458e-06, + "loss": 0.3214, + "step": 1590 + }, + { + "epoch": 0.8, + "learning_rate": 2.1143568290211115e-06, + "loss": 0.3413, + "step": 1591 + }, + { + "epoch": 0.8, + "learning_rate": 2.1044087729966856e-06, + "loss": 0.3369, + "step": 1592 + }, + { + "epoch": 0.8, + "learning_rate": 2.0944814222397948e-06, + "loss": 0.3431, + "step": 1593 + }, + { + "epoch": 0.8, + "learning_rate": 2.0845748027837585e-06, + "loss": 0.2723, + "step": 1594 + }, + { + "epoch": 0.8, + "learning_rate": 2.074688940607529e-06, + "loss": 0.326, + "step": 1595 + }, + { + "epoch": 0.8, + "learning_rate": 2.064823861635633e-06, + "loss": 0.2964, + "step": 1596 + }, + { + "epoch": 0.8, + "learning_rate": 2.0549795917380867e-06, + "loss": 0.3481, + "step": 1597 + }, + { + "epoch": 0.8, + "learning_rate": 2.0451561567303378e-06, + "loss": 0.3613, + "step": 1598 + }, + { + "epoch": 0.8, + "learning_rate": 2.0353535823732053e-06, + "loss": 0.3455, + "step": 1599 + }, + { + "epoch": 0.8, + "learning_rate": 2.025571894372794e-06, + "loss": 0.2833, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_code_gate_load": [ + 206.95, + 170.4, + 179.9, + 153.6, + 179.0, + 186.85, + 195.1, + 168.2 + ], + "eval_code_loss": 0.261962890625, + "eval_code_runtime": 1.7896, + "eval_code_samples_per_second": 558.77, + "eval_code_steps_per_second": 35.203, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_orca_gate_load": [ + 498.0, + 341.05, + 410.9, + 396.4, + 351.45, + 419.25, + 369.95, + 355.3 + ], + "eval_orca_loss": 0.3511718809604645, + "eval_orca_runtime": 2.0018, + "eval_orca_samples_per_second": 499.561, + "eval_orca_steps_per_second": 31.472, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_math_gate_load": [ + 314.8, + 213.8, + 239.55, + 221.0, + 254.25, + 260.1, + 278.1, + 251.1 + ], + "eval_math_loss": 0.24335937201976776, + "eval_math_runtime": 1.8471, + "eval_math_samples_per_second": 541.403, + "eval_math_steps_per_second": 34.108, + "step": 1600 + }, + { + "epoch": 0.8, + "eval_sharegpt_gate_load": [ + 1525.0, + 1074.95, + 1316.25, + 1205.0, + 1138.7, + 1360.4, + 1181.75, + 1109.55 + ], + "eval_sharegpt_loss": 0.48750001192092896, + "eval_sharegpt_runtime": 2.9974, + "eval_sharegpt_samples_per_second": 333.624, + "eval_sharegpt_steps_per_second": 21.018, + "step": 1600 + }, + { + "epoch": 0.8, + "learning_rate": 2.0158111183804407e-06, + "loss": 0.3204, + "step": 1601 + }, + { + "epoch": 
0.8, + "learning_rate": 2.0060712799926407e-06, + "loss": 0.2845, + "step": 1602 + }, + { + "epoch": 0.8, + "learning_rate": 1.9963524047509898e-06, + "loss": 0.3192, + "step": 1603 + }, + { + "epoch": 0.8, + "learning_rate": 1.9866545181421016e-06, + "loss": 0.321, + "step": 1604 + }, + { + "epoch": 0.8, + "learning_rate": 1.976977645597552e-06, + "loss": 0.3466, + "step": 1605 + }, + { + "epoch": 0.8, + "learning_rate": 1.967321812493813e-06, + "loss": 0.341, + "step": 1606 + }, + { + "epoch": 0.8, + "learning_rate": 1.9576870441521834e-06, + "loss": 0.2962, + "step": 1607 + }, + { + "epoch": 0.8, + "learning_rate": 1.9480733658387175e-06, + "loss": 0.3237, + "step": 1608 + }, + { + "epoch": 0.8, + "learning_rate": 1.9384808027641666e-06, + "loss": 0.3164, + "step": 1609 + }, + { + "epoch": 0.81, + "learning_rate": 1.9289093800839067e-06, + "loss": 0.3583, + "step": 1610 + }, + { + "epoch": 0.81, + "learning_rate": 1.9193591228978815e-06, + "loss": 0.2978, + "step": 1611 + }, + { + "epoch": 0.81, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.2947, + "step": 1612 + }, + { + "epoch": 0.81, + "learning_rate": 1.9003222051307046e-06, + "loss": 0.2788, + "step": 1613 + }, + { + "epoch": 0.81, + "learning_rate": 1.8908355944716516e-06, + "loss": 0.3126, + "step": 1614 + }, + { + "epoch": 0.81, + "learning_rate": 1.8813702491508956e-06, + "loss": 0.3048, + "step": 1615 + }, + { + "epoch": 0.81, + "learning_rate": 1.8719261939902023e-06, + "loss": 0.2466, + "step": 1616 + }, + { + "epoch": 0.81, + "learning_rate": 1.862503453755502e-06, + "loss": 0.3255, + "step": 1617 + }, + { + "epoch": 0.81, + "learning_rate": 1.8531020531568377e-06, + "loss": 0.3581, + "step": 1618 + }, + { + "epoch": 0.81, + "learning_rate": 1.8437220168482839e-06, + "loss": 0.3437, + "step": 1619 + }, + { + "epoch": 0.81, + "learning_rate": 1.8343633694278895e-06, + "loss": 0.3263, + "step": 1620 + }, + { + "epoch": 0.81, + "learning_rate": 1.825026135437622e-06, + "loss": 0.323, + "step": 1621 + }, + { + "epoch": 0.81, + "learning_rate": 1.8157103393632869e-06, + "loss": 0.2819, + "step": 1622 + }, + { + "epoch": 0.81, + "learning_rate": 1.8064160056344714e-06, + "loss": 0.2967, + "step": 1623 + }, + { + "epoch": 0.81, + "learning_rate": 1.7971431586244814e-06, + "loss": 0.3171, + "step": 1624 + }, + { + "epoch": 0.81, + "learning_rate": 1.7878918226502816e-06, + "loss": 0.3275, + "step": 1625 + }, + { + "epoch": 0.81, + "learning_rate": 1.7786620219724205e-06, + "loss": 0.3351, + "step": 1626 + }, + { + "epoch": 0.81, + "learning_rate": 1.7694537807949707e-06, + "loss": 0.2992, + "step": 1627 + }, + { + "epoch": 0.81, + "learning_rate": 1.7602671232654755e-06, + "loss": 0.3353, + "step": 1628 + }, + { + "epoch": 0.81, + "learning_rate": 1.751102073474873e-06, + "loss": 0.3463, + "step": 1629 + }, + { + "epoch": 0.81, + "learning_rate": 1.7419586554574364e-06, + "loss": 0.3037, + "step": 1630 + }, + { + "epoch": 0.82, + "learning_rate": 1.7328368931907114e-06, + "loss": 0.2644, + "step": 1631 + }, + { + "epoch": 0.82, + "learning_rate": 1.723736810595461e-06, + "loss": 0.2871, + "step": 1632 + }, + { + "epoch": 0.82, + "learning_rate": 1.7146584315355886e-06, + "loss": 0.3261, + "step": 1633 + }, + { + "epoch": 0.82, + "learning_rate": 1.7056017798180824e-06, + "loss": 0.3192, + "step": 1634 + }, + { + "epoch": 0.82, + "learning_rate": 1.69656687919296e-06, + "loss": 0.3082, + "step": 1635 + }, + { + "epoch": 0.82, + "learning_rate": 1.687553753353195e-06, + "loss": 0.3397, + "step": 1636 + }, + { + "epoch": 
0.82, + "learning_rate": 1.6785624259346556e-06, + "loss": 0.3451, + "step": 1637 + }, + { + "epoch": 0.82, + "learning_rate": 1.669592920516049e-06, + "loss": 0.3263, + "step": 1638 + }, + { + "epoch": 0.82, + "learning_rate": 1.660645260618864e-06, + "loss": 0.2625, + "step": 1639 + }, + { + "epoch": 0.82, + "learning_rate": 1.6517194697072903e-06, + "loss": 0.2828, + "step": 1640 + }, + { + "epoch": 0.82, + "learning_rate": 1.6428155711881722e-06, + "loss": 0.3153, + "step": 1641 + }, + { + "epoch": 0.82, + "learning_rate": 1.633933588410952e-06, + "loss": 0.3264, + "step": 1642 + }, + { + "epoch": 0.82, + "learning_rate": 1.6250735446675914e-06, + "loss": 0.2978, + "step": 1643 + }, + { + "epoch": 0.82, + "learning_rate": 1.6162354631925203e-06, + "loss": 0.3242, + "step": 1644 + }, + { + "epoch": 0.82, + "learning_rate": 1.607419367162577e-06, + "loss": 0.3276, + "step": 1645 + }, + { + "epoch": 0.82, + "learning_rate": 1.5986252796969482e-06, + "loss": 0.3074, + "step": 1646 + }, + { + "epoch": 0.82, + "learning_rate": 1.589853223857103e-06, + "loss": 0.3643, + "step": 1647 + }, + { + "epoch": 0.82, + "learning_rate": 1.5811032226467304e-06, + "loss": 0.3557, + "step": 1648 + }, + { + "epoch": 0.82, + "learning_rate": 1.5723752990116948e-06, + "loss": 0.3167, + "step": 1649 + }, + { + "epoch": 0.82, + "learning_rate": 1.5636694758399563e-06, + "loss": 0.2939, + "step": 1650 + }, + { + "epoch": 0.83, + "learning_rate": 1.5549857759615195e-06, + "loss": 0.2991, + "step": 1651 + }, + { + "epoch": 0.83, + "learning_rate": 1.5463242221483742e-06, + "loss": 0.3062, + "step": 1652 + }, + { + "epoch": 0.83, + "learning_rate": 1.5376848371144404e-06, + "loss": 0.2834, + "step": 1653 + }, + { + "epoch": 0.83, + "learning_rate": 1.5290676435154949e-06, + "loss": 0.3087, + "step": 1654 + }, + { + "epoch": 0.83, + "learning_rate": 1.520472663949122e-06, + "loss": 0.3508, + "step": 1655 + }, + { + "epoch": 0.83, + "learning_rate": 1.511899920954656e-06, + "loss": 0.2915, + "step": 1656 + }, + { + "epoch": 0.83, + "learning_rate": 1.5033494370131162e-06, + "loss": 0.2971, + "step": 1657 + }, + { + "epoch": 0.83, + "learning_rate": 1.4948212345471492e-06, + "loss": 0.3288, + "step": 1658 + }, + { + "epoch": 0.83, + "learning_rate": 1.4863153359209693e-06, + "loss": 0.3216, + "step": 1659 + }, + { + "epoch": 0.83, + "learning_rate": 1.4778317634403082e-06, + "loss": 0.3173, + "step": 1660 + }, + { + "epoch": 0.83, + "learning_rate": 1.469370539352345e-06, + "loss": 0.3572, + "step": 1661 + }, + { + "epoch": 0.83, + "learning_rate": 1.460931685845649e-06, + "loss": 0.3071, + "step": 1662 + }, + { + "epoch": 0.83, + "learning_rate": 1.4525152250501362e-06, + "loss": 0.3556, + "step": 1663 + }, + { + "epoch": 0.83, + "learning_rate": 1.4441211790369892e-06, + "loss": 0.294, + "step": 1664 + }, + { + "epoch": 0.83, + "learning_rate": 1.4357495698186186e-06, + "loss": 0.2905, + "step": 1665 + }, + { + "epoch": 0.83, + "learning_rate": 1.427400419348588e-06, + "loss": 0.3211, + "step": 1666 + }, + { + "epoch": 0.83, + "learning_rate": 1.4190737495215746e-06, + "loss": 0.2948, + "step": 1667 + }, + { + "epoch": 0.83, + "learning_rate": 1.4107695821733026e-06, + "loss": 0.2951, + "step": 1668 + }, + { + "epoch": 0.83, + "learning_rate": 1.402487939080479e-06, + "loss": 0.2877, + "step": 1669 + }, + { + "epoch": 0.83, + "learning_rate": 1.3942288419607476e-06, + "loss": 0.3135, + "step": 1670 + }, + { + "epoch": 0.84, + "learning_rate": 1.3859923124726283e-06, + "loss": 0.2892, + "step": 1671 + }, + { + 
"epoch": 0.84, + "learning_rate": 1.3777783722154603e-06, + "loss": 0.286, + "step": 1672 + }, + { + "epoch": 0.84, + "learning_rate": 1.369587042729341e-06, + "loss": 0.2768, + "step": 1673 + }, + { + "epoch": 0.84, + "learning_rate": 1.3614183454950824e-06, + "loss": 0.3297, + "step": 1674 + }, + { + "epoch": 0.84, + "learning_rate": 1.3532723019341376e-06, + "loss": 0.2959, + "step": 1675 + }, + { + "epoch": 0.84, + "learning_rate": 1.3451489334085555e-06, + "loss": 0.3296, + "step": 1676 + }, + { + "epoch": 0.84, + "learning_rate": 1.3370482612209224e-06, + "loss": 0.3014, + "step": 1677 + }, + { + "epoch": 0.84, + "learning_rate": 1.3289703066143112e-06, + "loss": 0.3268, + "step": 1678 + }, + { + "epoch": 0.84, + "learning_rate": 1.3209150907722124e-06, + "loss": 0.2981, + "step": 1679 + }, + { + "epoch": 0.84, + "learning_rate": 1.3128826348184886e-06, + "loss": 0.2967, + "step": 1680 + }, + { + "epoch": 0.84, + "learning_rate": 1.3048729598173248e-06, + "loss": 0.2178, + "step": 1681 + }, + { + "epoch": 0.84, + "learning_rate": 1.296886086773157e-06, + "loss": 0.2973, + "step": 1682 + }, + { + "epoch": 0.84, + "learning_rate": 1.2889220366306276e-06, + "loss": 0.3261, + "step": 1683 + }, + { + "epoch": 0.84, + "learning_rate": 1.2809808302745298e-06, + "loss": 0.2954, + "step": 1684 + }, + { + "epoch": 0.84, + "learning_rate": 1.2730624885297537e-06, + "loss": 0.3454, + "step": 1685 + }, + { + "epoch": 0.84, + "learning_rate": 1.2651670321612264e-06, + "loss": 0.3073, + "step": 1686 + }, + { + "epoch": 0.84, + "learning_rate": 1.2572944818738587e-06, + "loss": 0.2906, + "step": 1687 + }, + { + "epoch": 0.84, + "learning_rate": 1.249444858312502e-06, + "loss": 0.3021, + "step": 1688 + }, + { + "epoch": 0.84, + "learning_rate": 1.2416181820618745e-06, + "loss": 0.2942, + "step": 1689 + }, + { + "epoch": 0.84, + "learning_rate": 1.233814473646524e-06, + "loss": 0.3381, + "step": 1690 + }, + { + "epoch": 0.85, + "learning_rate": 1.226033753530763e-06, + "loss": 0.3046, + "step": 1691 + }, + { + "epoch": 0.85, + "learning_rate": 1.218276042118629e-06, + "loss": 0.3362, + "step": 1692 + }, + { + "epoch": 0.85, + "learning_rate": 1.2105413597538107e-06, + "loss": 0.3149, + "step": 1693 + }, + { + "epoch": 0.85, + "learning_rate": 1.202829726719611e-06, + "loss": 0.2529, + "step": 1694 + }, + { + "epoch": 0.85, + "learning_rate": 1.195141163238892e-06, + "loss": 0.3058, + "step": 1695 + }, + { + "epoch": 0.85, + "learning_rate": 1.1874756894740137e-06, + "loss": 0.3242, + "step": 1696 + }, + { + "epoch": 0.85, + "learning_rate": 1.1798333255267857e-06, + "loss": 0.2882, + "step": 1697 + }, + { + "epoch": 0.85, + "learning_rate": 1.1722140914384162e-06, + "loss": 0.2735, + "step": 1698 + }, + { + "epoch": 0.85, + "learning_rate": 1.1646180071894608e-06, + "loss": 0.2994, + "step": 1699 + }, + { + "epoch": 0.85, + "learning_rate": 1.1570450926997657e-06, + "loss": 0.2764, + "step": 1700 + }, + { + "epoch": 0.85, + "eval_code_gate_load": [ + 207.4, + 170.6, + 179.4, + 154.8, + 177.85, + 186.5, + 195.0, + 168.45 + ], + "eval_code_loss": 0.26079100370407104, + "eval_code_runtime": 1.7851, + "eval_code_samples_per_second": 560.183, + "eval_code_steps_per_second": 35.292, + "step": 1700 + }, + { + "epoch": 0.85, + "eval_orca_gate_load": [ + 499.15, + 341.7, + 406.8, + 396.0, + 351.7, + 421.4, + 368.95, + 356.6 + ], + "eval_orca_loss": 0.35004884004592896, + "eval_orca_runtime": 2.0049, + "eval_orca_samples_per_second": 498.779, + "eval_orca_steps_per_second": 31.423, + "step": 1700 + }, + { + 
"epoch": 0.85, + "eval_math_gate_load": [ + 314.35, + 215.9, + 240.2, + 220.65, + 253.5, + 262.05, + 277.05, + 249.0 + ], + "eval_math_loss": 0.2432861328125, + "eval_math_runtime": 1.8844, + "eval_math_samples_per_second": 530.665, + "eval_math_steps_per_second": 33.432, + "step": 1700 + }, + { + "epoch": 0.85, + "eval_sharegpt_gate_load": [ + 1523.75, + 1077.65, + 1308.6, + 1203.3, + 1140.45, + 1368.2, + 1179.25, + 1110.4 + ], + "eval_sharegpt_loss": 0.4883789122104645, + "eval_sharegpt_runtime": 3.003, + "eval_sharegpt_samples_per_second": 333.001, + "eval_sharegpt_steps_per_second": 20.979, + "step": 1700 + }, + { + "epoch": 0.85, + "learning_rate": 1.1494953678284105e-06, + "loss": 0.2901, + "step": 1701 + }, + { + "epoch": 0.85, + "learning_rate": 1.1419688523736761e-06, + "loss": 0.3298, + "step": 1702 + }, + { + "epoch": 0.85, + "learning_rate": 1.1344655660729676e-06, + "loss": 0.3355, + "step": 1703 + }, + { + "epoch": 0.85, + "learning_rate": 1.1269855286027798e-06, + "loss": 0.323, + "step": 1704 + }, + { + "epoch": 0.85, + "learning_rate": 1.1195287595786352e-06, + "loss": 0.3004, + "step": 1705 + }, + { + "epoch": 0.85, + "learning_rate": 1.1120952785550477e-06, + "loss": 0.258, + "step": 1706 + }, + { + "epoch": 0.85, + "learning_rate": 1.1046851050254504e-06, + "loss": 0.2928, + "step": 1707 + }, + { + "epoch": 0.85, + "learning_rate": 1.0972982584221592e-06, + "loss": 0.294, + "step": 1708 + }, + { + "epoch": 0.85, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.3543, + "step": 1709 + }, + { + "epoch": 0.85, + "learning_rate": 1.0825946234178575e-06, + "loss": 0.3076, + "step": 1710 + }, + { + "epoch": 0.86, + "learning_rate": 1.0752778735754121e-06, + "loss": 0.2811, + "step": 1711 + }, + { + "epoch": 0.86, + "learning_rate": 1.067984527776309e-06, + "loss": 0.2797, + "step": 1712 + }, + { + "epoch": 0.86, + "learning_rate": 1.0607146051465011e-06, + "loss": 0.2882, + "step": 1713 + }, + { + "epoch": 0.86, + "learning_rate": 1.0534681247505107e-06, + "loss": 0.2896, + "step": 1714 + }, + { + "epoch": 0.86, + "learning_rate": 1.0462451055913847e-06, + "loss": 0.2878, + "step": 1715 + }, + { + "epoch": 0.86, + "learning_rate": 1.0390455666106547e-06, + "loss": 0.3366, + "step": 1716 + }, + { + "epoch": 0.86, + "learning_rate": 1.0318695266882696e-06, + "loss": 0.2932, + "step": 1717 + }, + { + "epoch": 0.86, + "learning_rate": 1.024717004642557e-06, + "loss": 0.3224, + "step": 1718 + }, + { + "epoch": 0.86, + "learning_rate": 1.0175880192301713e-06, + "loss": 0.3163, + "step": 1719 + }, + { + "epoch": 0.86, + "learning_rate": 1.010482589146048e-06, + "loss": 0.3059, + "step": 1720 + }, + { + "epoch": 0.86, + "learning_rate": 1.0034007330233487e-06, + "loss": 0.3024, + "step": 1721 + }, + { + "epoch": 0.86, + "learning_rate": 9.963424694334122e-07, + "loss": 0.31, + "step": 1722 + }, + { + "epoch": 0.86, + "learning_rate": 9.893078168857173e-07, + "loss": 0.2936, + "step": 1723 + }, + { + "epoch": 0.86, + "learning_rate": 9.822967938278172e-07, + "loss": 0.244, + "step": 1724 + }, + { + "epoch": 0.86, + "learning_rate": 9.753094186453028e-07, + "loss": 0.362, + "step": 1725 + }, + { + "epoch": 0.86, + "learning_rate": 9.683457096617487e-07, + "loss": 0.3021, + "step": 1726 + }, + { + "epoch": 0.86, + "learning_rate": 9.614056851386743e-07, + "loss": 0.2698, + "step": 1727 + }, + { + "epoch": 0.86, + "learning_rate": 9.544893632754816e-07, + "loss": 0.3254, + "step": 1728 + }, + { + "epoch": 0.86, + "learning_rate": 9.475967622094207e-07, + "loss": 0.3472, + "step": 
1729 + }, + { + "epoch": 0.86, + "learning_rate": 9.407279000155311e-07, + "loss": 0.3558, + "step": 1730 + }, + { + "epoch": 0.87, + "learning_rate": 9.338827947066076e-07, + "loss": 0.2826, + "step": 1731 + }, + { + "epoch": 0.87, + "learning_rate": 9.270614642331377e-07, + "loss": 0.287, + "step": 1732 + }, + { + "epoch": 0.87, + "learning_rate": 9.202639264832669e-07, + "loss": 0.3271, + "step": 1733 + }, + { + "epoch": 0.87, + "learning_rate": 9.134901992827427e-07, + "loss": 0.3017, + "step": 1734 + }, + { + "epoch": 0.87, + "learning_rate": 9.067403003948783e-07, + "loss": 0.2957, + "step": 1735 + }, + { + "epoch": 0.87, + "learning_rate": 9.000142475204965e-07, + "loss": 0.2881, + "step": 1736 + }, + { + "epoch": 0.87, + "learning_rate": 8.933120582978827e-07, + "loss": 0.3096, + "step": 1737 + }, + { + "epoch": 0.87, + "learning_rate": 8.866337503027523e-07, + "loss": 0.3048, + "step": 1738 + }, + { + "epoch": 0.87, + "learning_rate": 8.799793410481871e-07, + "loss": 0.2805, + "step": 1739 + }, + { + "epoch": 0.87, + "learning_rate": 8.733488479845997e-07, + "loss": 0.3241, + "step": 1740 + }, + { + "epoch": 0.87, + "learning_rate": 8.667422884996823e-07, + "loss": 0.2841, + "step": 1741 + }, + { + "epoch": 0.87, + "learning_rate": 8.60159679918372e-07, + "loss": 0.3075, + "step": 1742 + }, + { + "epoch": 0.87, + "learning_rate": 8.536010395027905e-07, + "loss": 0.3022, + "step": 1743 + }, + { + "epoch": 0.87, + "learning_rate": 8.470663844522053e-07, + "loss": 0.2859, + "step": 1744 + }, + { + "epoch": 0.87, + "learning_rate": 8.405557319029911e-07, + "loss": 0.2964, + "step": 1745 + }, + { + "epoch": 0.87, + "learning_rate": 8.340690989285727e-07, + "loss": 0.2967, + "step": 1746 + }, + { + "epoch": 0.87, + "learning_rate": 8.276065025393909e-07, + "loss": 0.3191, + "step": 1747 + }, + { + "epoch": 0.87, + "learning_rate": 8.211679596828481e-07, + "loss": 0.27, + "step": 1748 + }, + { + "epoch": 0.87, + "learning_rate": 8.147534872432761e-07, + "loss": 0.3067, + "step": 1749 + }, + { + "epoch": 0.88, + "learning_rate": 8.083631020418792e-07, + "loss": 0.3003, + "step": 1750 + }, + { + "epoch": 0.88, + "learning_rate": 8.019968208366958e-07, + "loss": 0.3156, + "step": 1751 + }, + { + "epoch": 0.88, + "learning_rate": 7.956546603225601e-07, + "loss": 0.2852, + "step": 1752 + }, + { + "epoch": 0.88, + "learning_rate": 7.893366371310463e-07, + "loss": 0.3485, + "step": 1753 + }, + { + "epoch": 0.88, + "learning_rate": 7.830427678304353e-07, + "loss": 0.3002, + "step": 1754 + }, + { + "epoch": 0.88, + "learning_rate": 7.767730689256614e-07, + "loss": 0.3188, + "step": 1755 + }, + { + "epoch": 0.88, + "learning_rate": 7.705275568582848e-07, + "loss": 0.3151, + "step": 1756 + }, + { + "epoch": 0.88, + "learning_rate": 7.643062480064301e-07, + "loss": 0.2763, + "step": 1757 + }, + { + "epoch": 0.88, + "learning_rate": 7.581091586847522e-07, + "loss": 0.3074, + "step": 1758 + }, + { + "epoch": 0.88, + "learning_rate": 7.519363051443996e-07, + "loss": 0.2607, + "step": 1759 + }, + { + "epoch": 0.88, + "learning_rate": 7.457877035729588e-07, + "loss": 0.333, + "step": 1760 + }, + { + "epoch": 0.88, + "learning_rate": 7.3966337009442e-07, + "loss": 0.3025, + "step": 1761 + }, + { + "epoch": 0.88, + "learning_rate": 7.335633207691362e-07, + "loss": 0.2867, + "step": 1762 + }, + { + "epoch": 0.88, + "learning_rate": 7.274875715937746e-07, + "loss": 0.2828, + "step": 1763 + }, + { + "epoch": 0.88, + "learning_rate": 7.21436138501278e-07, + "loss": 0.2814, + "step": 1764 + }, + { + "epoch": 
0.88, + "learning_rate": 7.154090373608236e-07, + "loss": 0.2994, + "step": 1765 + }, + { + "epoch": 0.88, + "learning_rate": 7.094062839777838e-07, + "loss": 0.2821, + "step": 1766 + }, + { + "epoch": 0.88, + "learning_rate": 7.03427894093679e-07, + "loss": 0.2824, + "step": 1767 + }, + { + "epoch": 0.88, + "learning_rate": 6.974738833861383e-07, + "loss": 0.2706, + "step": 1768 + }, + { + "epoch": 0.88, + "learning_rate": 6.915442674688633e-07, + "loss": 0.2929, + "step": 1769 + }, + { + "epoch": 0.89, + "learning_rate": 6.856390618915775e-07, + "loss": 0.3311, + "step": 1770 + }, + { + "epoch": 0.89, + "learning_rate": 6.797582821399973e-07, + "loss": 0.31, + "step": 1771 + }, + { + "epoch": 0.89, + "learning_rate": 6.739019436357774e-07, + "loss": 0.3079, + "step": 1772 + }, + { + "epoch": 0.89, + "learning_rate": 6.680700617364877e-07, + "loss": 0.307, + "step": 1773 + }, + { + "epoch": 0.89, + "learning_rate": 6.622626517355557e-07, + "loss": 0.3126, + "step": 1774 + }, + { + "epoch": 0.89, + "learning_rate": 6.564797288622371e-07, + "loss": 0.3571, + "step": 1775 + }, + { + "epoch": 0.89, + "learning_rate": 6.507213082815745e-07, + "loss": 0.277, + "step": 1776 + }, + { + "epoch": 0.89, + "learning_rate": 6.449874050943549e-07, + "loss": 0.3365, + "step": 1777 + }, + { + "epoch": 0.89, + "learning_rate": 6.392780343370686e-07, + "loss": 0.2917, + "step": 1778 + }, + { + "epoch": 0.89, + "learning_rate": 6.335932109818754e-07, + "loss": 0.2717, + "step": 1779 + }, + { + "epoch": 0.89, + "learning_rate": 6.279329499365649e-07, + "loss": 0.2477, + "step": 1780 + }, + { + "epoch": 0.89, + "learning_rate": 6.222972660445082e-07, + "loss": 0.3076, + "step": 1781 + }, + { + "epoch": 0.89, + "learning_rate": 6.166861740846297e-07, + "loss": 0.2606, + "step": 1782 + }, + { + "epoch": 0.89, + "learning_rate": 6.11099688771366e-07, + "loss": 0.2695, + "step": 1783 + }, + { + "epoch": 0.89, + "learning_rate": 6.055378247546217e-07, + "loss": 0.3152, + "step": 1784 + }, + { + "epoch": 0.89, + "learning_rate": 6.000005966197387e-07, + "loss": 0.268, + "step": 1785 + }, + { + "epoch": 0.89, + "learning_rate": 5.94488018887448e-07, + "loss": 0.3017, + "step": 1786 + }, + { + "epoch": 0.89, + "learning_rate": 5.890001060138484e-07, + "loss": 0.3279, + "step": 1787 + }, + { + "epoch": 0.89, + "learning_rate": 5.835368723903456e-07, + "loss": 0.2949, + "step": 1788 + }, + { + "epoch": 0.89, + "learning_rate": 5.780983323436374e-07, + "loss": 0.3345, + "step": 1789 + }, + { + "epoch": 0.9, + "learning_rate": 5.726845001356573e-07, + "loss": 0.3232, + "step": 1790 + }, + { + "epoch": 0.9, + "learning_rate": 5.672953899635524e-07, + "loss": 0.2993, + "step": 1791 + }, + { + "epoch": 0.9, + "learning_rate": 5.619310159596358e-07, + "loss": 0.3065, + "step": 1792 + }, + { + "epoch": 0.9, + "learning_rate": 5.565913921913513e-07, + "loss": 0.2778, + "step": 1793 + }, + { + "epoch": 0.9, + "learning_rate": 5.51276532661238e-07, + "loss": 0.2822, + "step": 1794 + }, + { + "epoch": 0.9, + "learning_rate": 5.459864513068991e-07, + "loss": 0.3144, + "step": 1795 + }, + { + "epoch": 0.9, + "learning_rate": 5.407211620009545e-07, + "loss": 0.3007, + "step": 1796 + }, + { + "epoch": 0.9, + "learning_rate": 5.354806785510113e-07, + "loss": 0.2625, + "step": 1797 + }, + { + "epoch": 0.9, + "learning_rate": 5.30265014699628e-07, + "loss": 0.293, + "step": 1798 + }, + { + "epoch": 0.9, + "learning_rate": 5.250741841242735e-07, + "loss": 0.2848, + "step": 1799 + }, + { + "epoch": 0.9, + "learning_rate": 
5.199082004372958e-07, + "loss": 0.2789, + "step": 1800 + }, + { + "epoch": 0.9, + "eval_code_gate_load": [ + 206.3, + 171.0, + 178.4, + 153.6, + 178.55, + 188.05, + 195.05, + 169.05 + ], + "eval_code_loss": 0.24262695014476776, + "eval_code_runtime": 1.786, + "eval_code_samples_per_second": 559.908, + "eval_code_steps_per_second": 35.274, + "step": 1800 + }, + { + "epoch": 0.9, + "eval_orca_gate_load": [ + 498.15, + 341.45, + 405.0, + 396.0, + 352.45, + 421.95, + 368.95, + 358.35 + ], + "eval_orca_loss": 0.3505859375, + "eval_orca_runtime": 2.0051, + "eval_orca_samples_per_second": 498.718, + "eval_orca_steps_per_second": 31.419, + "step": 1800 + }, + { + "epoch": 0.9, + "eval_math_gate_load": [ + 314.35, + 216.7, + 239.55, + 220.1, + 253.35, + 261.95, + 275.8, + 250.9 + ], + "eval_math_loss": 0.24282225966453552, + "eval_math_runtime": 1.865, + "eval_math_samples_per_second": 536.202, + "eval_math_steps_per_second": 33.781, + "step": 1800 + }, + { + "epoch": 0.9, + "eval_sharegpt_gate_load": [ + 1525.35, + 1079.5, + 1303.65, + 1203.5, + 1140.4, + 1370.1, + 1178.8, + 1110.3 + ], + "eval_sharegpt_loss": 0.48857420682907104, + "eval_sharegpt_runtime": 2.9886, + "eval_sharegpt_samples_per_second": 334.604, + "eval_sharegpt_steps_per_second": 21.08, + "step": 1800 + }, + { + "epoch": 0.9, + "learning_rate": 5.147670771858848e-07, + "loss": 0.3078, + "step": 1801 + }, + { + "epoch": 0.9, + "learning_rate": 5.096508278520385e-07, + "loss": 0.3373, + "step": 1802 + }, + { + "epoch": 0.9, + "learning_rate": 5.045594658525232e-07, + "loss": 0.3121, + "step": 1803 + }, + { + "epoch": 0.9, + "learning_rate": 4.994930045388414e-07, + "loss": 0.2671, + "step": 1804 + }, + { + "epoch": 0.9, + "learning_rate": 4.944514571971981e-07, + "loss": 0.3293, + "step": 1805 + }, + { + "epoch": 0.9, + "learning_rate": 4.894348370484648e-07, + "loss": 0.3154, + "step": 1806 + }, + { + "epoch": 0.9, + "learning_rate": 4.844431572481412e-07, + "loss": 0.2988, + "step": 1807 + }, + { + "epoch": 0.9, + "learning_rate": 4.794764308863242e-07, + "loss": 0.2988, + "step": 1808 + }, + { + "epoch": 0.9, + "learning_rate": 4.745346709876786e-07, + "loss": 0.292, + "step": 1809 + }, + { + "epoch": 0.91, + "learning_rate": 4.696178905113913e-07, + "loss": 0.2773, + "step": 1810 + }, + { + "epoch": 0.91, + "learning_rate": 4.6472610235114513e-07, + "loss": 0.2962, + "step": 1811 + }, + { + "epoch": 0.91, + "learning_rate": 4.5985931933508757e-07, + "loss": 0.2758, + "step": 1812 + }, + { + "epoch": 0.91, + "learning_rate": 4.550175542257862e-07, + "loss": 0.3309, + "step": 1813 + }, + { + "epoch": 0.91, + "learning_rate": 4.502008197202068e-07, + "loss": 0.3072, + "step": 1814 + }, + { + "epoch": 0.91, + "learning_rate": 4.454091284496731e-07, + "loss": 0.3043, + "step": 1815 + }, + { + "epoch": 0.91, + "learning_rate": 4.406424929798403e-07, + "loss": 0.2964, + "step": 1816 + }, + { + "epoch": 0.91, + "learning_rate": 4.3590092581065055e-07, + "loss": 0.2578, + "step": 1817 + }, + { + "epoch": 0.91, + "learning_rate": 4.3118443937631094e-07, + "loss": 0.2903, + "step": 1818 + }, + { + "epoch": 0.91, + "learning_rate": 4.26493046045261e-07, + "loss": 0.2921, + "step": 1819 + }, + { + "epoch": 0.91, + "learning_rate": 4.218267581201296e-07, + "loss": 0.3116, + "step": 1820 + }, + { + "epoch": 0.91, + "learning_rate": 4.17185587837714e-07, + "loss": 0.2662, + "step": 1821 + }, + { + "epoch": 0.91, + "learning_rate": 4.125695473689406e-07, + "loss": 0.2796, + "step": 1822 + }, + { + "epoch": 0.91, + "learning_rate": 
4.0797864881883977e-07, + "loss": 0.3705, + "step": 1823 + }, + { + "epoch": 0.91, + "learning_rate": 4.034129042265067e-07, + "loss": 0.265, + "step": 1824 + }, + { + "epoch": 0.91, + "learning_rate": 3.988723255650728e-07, + "loss": 0.3237, + "step": 1825 + }, + { + "epoch": 0.91, + "learning_rate": 3.943569247416801e-07, + "loss": 0.2654, + "step": 1826 + }, + { + "epoch": 0.91, + "learning_rate": 3.8986671359743767e-07, + "loss": 0.3055, + "step": 1827 + }, + { + "epoch": 0.91, + "learning_rate": 3.8540170390740097e-07, + "loss": 0.2912, + "step": 1828 + }, + { + "epoch": 0.91, + "learning_rate": 3.8096190738053815e-07, + "loss": 0.2577, + "step": 1829 + }, + { + "epoch": 0.92, + "learning_rate": 3.7654733565969826e-07, + "loss": 0.3198, + "step": 1830 + }, + { + "epoch": 0.92, + "learning_rate": 3.721580003215808e-07, + "loss": 0.3015, + "step": 1831 + }, + { + "epoch": 0.92, + "learning_rate": 3.67793912876705e-07, + "loss": 0.2554, + "step": 1832 + }, + { + "epoch": 0.92, + "learning_rate": 3.6345508476938296e-07, + "loss": 0.2986, + "step": 1833 + }, + { + "epoch": 0.92, + "learning_rate": 3.591415273776855e-07, + "loss": 0.2911, + "step": 1834 + }, + { + "epoch": 0.92, + "learning_rate": 3.548532520134129e-07, + "loss": 0.2769, + "step": 1835 + }, + { + "epoch": 0.92, + "learning_rate": 3.5059026992206645e-07, + "loss": 0.3182, + "step": 1836 + }, + { + "epoch": 0.92, + "learning_rate": 3.4635259228282256e-07, + "loss": 0.2591, + "step": 1837 + }, + { + "epoch": 0.92, + "learning_rate": 3.421402302084953e-07, + "loss": 0.3009, + "step": 1838 + }, + { + "epoch": 0.92, + "learning_rate": 3.379531947455128e-07, + "loss": 0.2904, + "step": 1839 + }, + { + "epoch": 0.92, + "learning_rate": 3.3379149687388866e-07, + "loss": 0.3221, + "step": 1840 + }, + { + "epoch": 0.92, + "learning_rate": 3.2965514750718964e-07, + "loss": 0.3366, + "step": 1841 + }, + { + "epoch": 0.92, + "learning_rate": 3.255441574925089e-07, + "loss": 0.2937, + "step": 1842 + }, + { + "epoch": 0.92, + "learning_rate": 3.2145853761043844e-07, + "loss": 0.3, + "step": 1843 + }, + { + "epoch": 0.92, + "learning_rate": 3.1739829857504235e-07, + "loss": 0.2915, + "step": 1844 + }, + { + "epoch": 0.92, + "learning_rate": 3.133634510338235e-07, + "loss": 0.2622, + "step": 1845 + }, + { + "epoch": 0.92, + "learning_rate": 3.093540055676958e-07, + "loss": 0.2865, + "step": 1846 + }, + { + "epoch": 0.92, + "learning_rate": 3.053699726909676e-07, + "loss": 0.2946, + "step": 1847 + }, + { + "epoch": 0.92, + "learning_rate": 3.0141136285129825e-07, + "loss": 0.3349, + "step": 1848 + }, + { + "epoch": 0.92, + "learning_rate": 2.974781864296783e-07, + "loss": 0.2425, + "step": 1849 + }, + { + "epoch": 0.93, + "learning_rate": 2.935704537404083e-07, + "loss": 0.2605, + "step": 1850 + }, + { + "epoch": 0.93, + "learning_rate": 2.8968817503105984e-07, + "loss": 0.2733, + "step": 1851 + }, + { + "epoch": 0.93, + "learning_rate": 2.8583136048245697e-07, + "loss": 0.2423, + "step": 1852 + }, + { + "epoch": 0.93, + "learning_rate": 2.820000202086459e-07, + "loss": 0.2926, + "step": 1853 + }, + { + "epoch": 0.93, + "learning_rate": 2.781941642568686e-07, + "loss": 0.242, + "step": 1854 + }, + { + "epoch": 0.93, + "learning_rate": 2.744138026075405e-07, + "loss": 0.3168, + "step": 1855 + }, + { + "epoch": 0.93, + "learning_rate": 2.706589451742181e-07, + "loss": 0.3208, + "step": 1856 + }, + { + "epoch": 0.93, + "learning_rate": 2.669296018035772e-07, + "loss": 0.2808, + "step": 1857 + }, + { + "epoch": 0.93, + "learning_rate": 
2.632257822753881e-07, + "loss": 0.2936, + "step": 1858 + }, + { + "epoch": 0.93, + "learning_rate": 2.5954749630248355e-07, + "loss": 0.2861, + "step": 1859 + }, + { + "epoch": 0.93, + "learning_rate": 2.5589475353073987e-07, + "loss": 0.2806, + "step": 1860 + }, + { + "epoch": 0.93, + "learning_rate": 2.5226756353904925e-07, + "loss": 0.3313, + "step": 1861 + }, + { + "epoch": 0.93, + "learning_rate": 2.486659358392951e-07, + "loss": 0.288, + "step": 1862 + }, + { + "epoch": 0.93, + "learning_rate": 2.450898798763268e-07, + "loss": 0.2649, + "step": 1863 + }, + { + "epoch": 0.93, + "learning_rate": 2.4153940502793185e-07, + "loss": 0.2993, + "step": 1864 + }, + { + "epoch": 0.93, + "learning_rate": 2.380145206048201e-07, + "loss": 0.2831, + "step": 1865 + }, + { + "epoch": 0.93, + "learning_rate": 2.3451523585058756e-07, + "loss": 0.269, + "step": 1866 + }, + { + "epoch": 0.93, + "learning_rate": 2.3104155994170042e-07, + "loss": 0.2863, + "step": 1867 + }, + { + "epoch": 0.93, + "learning_rate": 2.2759350198746978e-07, + "loss": 0.2895, + "step": 1868 + }, + { + "epoch": 0.93, + "learning_rate": 2.24171071030026e-07, + "loss": 0.3511, + "step": 1869 + }, + { + "epoch": 0.94, + "learning_rate": 2.2077427604429435e-07, + "loss": 0.3163, + "step": 1870 + }, + { + "epoch": 0.94, + "learning_rate": 2.1740312593797274e-07, + "loss": 0.2954, + "step": 1871 + }, + { + "epoch": 0.94, + "learning_rate": 2.1405762955151178e-07, + "loss": 0.3207, + "step": 1872 + }, + { + "epoch": 0.94, + "learning_rate": 2.1073779565808471e-07, + "loss": 0.2869, + "step": 1873 + }, + { + "epoch": 0.94, + "learning_rate": 2.0744363296356872e-07, + "loss": 0.296, + "step": 1874 + }, + { + "epoch": 0.94, + "learning_rate": 2.0417515010652032e-07, + "loss": 0.3008, + "step": 1875 + }, + { + "epoch": 0.94, + "learning_rate": 2.009323556581566e-07, + "loss": 0.2822, + "step": 1876 + }, + { + "epoch": 0.94, + "learning_rate": 1.977152581223274e-07, + "loss": 0.2678, + "step": 1877 + }, + { + "epoch": 0.94, + "learning_rate": 1.9452386593549534e-07, + "loss": 0.3009, + "step": 1878 + }, + { + "epoch": 0.94, + "learning_rate": 1.9135818746671587e-07, + "loss": 0.2525, + "step": 1879 + }, + { + "epoch": 0.94, + "learning_rate": 1.8821823101760949e-07, + "loss": 0.342, + "step": 1880 + }, + { + "epoch": 0.94, + "learning_rate": 1.8510400482234848e-07, + "loss": 0.3283, + "step": 1881 + }, + { + "epoch": 0.94, + "learning_rate": 1.8201551704762453e-07, + "loss": 0.3051, + "step": 1882 + }, + { + "epoch": 0.94, + "learning_rate": 1.7895277579264015e-07, + "loss": 0.2982, + "step": 1883 + }, + { + "epoch": 0.94, + "learning_rate": 1.7591578908907724e-07, + "loss": 0.2743, + "step": 1884 + }, + { + "epoch": 0.94, + "learning_rate": 1.7290456490107522e-07, + "loss": 0.2717, + "step": 1885 + }, + { + "epoch": 0.94, + "learning_rate": 1.699191111252241e-07, + "loss": 0.3157, + "step": 1886 + }, + { + "epoch": 0.94, + "learning_rate": 1.6695943559052463e-07, + "loss": 0.3009, + "step": 1887 + }, + { + "epoch": 0.94, + "learning_rate": 1.6402554605838173e-07, + "loss": 0.2954, + "step": 1888 + }, + { + "epoch": 0.94, + "learning_rate": 1.6111745022257873e-07, + "loss": 0.2454, + "step": 1889 + }, + { + "epoch": 0.94, + "learning_rate": 1.5823515570925763e-07, + "loss": 0.325, + "step": 1890 + }, + { + "epoch": 0.95, + "learning_rate": 1.5537867007690111e-07, + "loss": 0.3027, + "step": 1891 + }, + { + "epoch": 0.95, + "learning_rate": 1.5254800081630828e-07, + "loss": 0.2612, + "step": 1892 + }, + { + "epoch": 0.95, + 
"learning_rate": 1.4974315535058016e-07, + "loss": 0.3325, + "step": 1893 + }, + { + "epoch": 0.95, + "learning_rate": 1.469641410350964e-07, + "loss": 0.2978, + "step": 1894 + }, + { + "epoch": 0.95, + "learning_rate": 1.4421096515749855e-07, + "loss": 0.284, + "step": 1895 + }, + { + "epoch": 0.95, + "learning_rate": 1.4148363493766803e-07, + "loss": 0.3069, + "step": 1896 + }, + { + "epoch": 0.95, + "learning_rate": 1.3878215752771264e-07, + "loss": 0.3225, + "step": 1897 + }, + { + "epoch": 0.95, + "learning_rate": 1.361065400119399e-07, + "loss": 0.3019, + "step": 1898 + }, + { + "epoch": 0.95, + "learning_rate": 1.3345678940684615e-07, + "loss": 0.3103, + "step": 1899 + }, + { + "epoch": 0.95, + "learning_rate": 1.30832912661093e-07, + "loss": 0.2691, + "step": 1900 + }, + { + "epoch": 0.95, + "eval_code_gate_load": [ + 205.55, + 171.4, + 178.65, + 153.4, + 178.2, + 188.1, + 195.95, + 168.75 + ], + "eval_code_loss": 0.24282225966453552, + "eval_code_runtime": 1.7813, + "eval_code_samples_per_second": 561.397, + "eval_code_steps_per_second": 35.368, + "step": 1900 + }, + { + "epoch": 0.95, + "eval_orca_gate_load": [ + 497.35, + 342.7, + 406.6, + 396.1, + 351.0, + 421.1, + 369.95, + 357.5 + ], + "eval_orca_loss": 0.34912109375, + "eval_orca_runtime": 2.0183, + "eval_orca_samples_per_second": 495.476, + "eval_orca_steps_per_second": 31.215, + "step": 1900 + }, + { + "epoch": 0.95, + "eval_math_gate_load": [ + 314.85, + 216.8, + 238.95, + 220.8, + 253.1, + 261.85, + 276.2, + 250.15 + ], + "eval_math_loss": 0.24270018935203552, + "eval_math_runtime": 1.8503, + "eval_math_samples_per_second": 540.439, + "eval_math_steps_per_second": 34.048, + "step": 1900 + }, + { + "epoch": 0.95, + "eval_sharegpt_gate_load": [ + 1522.5, + 1078.8, + 1304.65, + 1205.55, + 1138.55, + 1369.8, + 1181.35, + 1110.4 + ], + "eval_sharegpt_loss": 0.4888671934604645, + "eval_sharegpt_runtime": 3.0127, + "eval_sharegpt_samples_per_second": 331.925, + "eval_sharegpt_steps_per_second": 20.911, + "step": 1900 + }, + { + "epoch": 0.95, + "learning_rate": 1.2823491665549193e-07, + "loss": 0.3058, + "step": 1901 + }, + { + "epoch": 0.95, + "learning_rate": 1.2566280820298427e-07, + "loss": 0.3011, + "step": 1902 + }, + { + "epoch": 0.95, + "learning_rate": 1.231165940486234e-07, + "loss": 0.2873, + "step": 1903 + }, + { + "epoch": 0.95, + "learning_rate": 1.2059628086956044e-07, + "loss": 0.3044, + "step": 1904 + }, + { + "epoch": 0.95, + "learning_rate": 1.1810187527502182e-07, + "loss": 0.2932, + "step": 1905 + }, + { + "epoch": 0.95, + "learning_rate": 1.1563338380629618e-07, + "loss": 0.3145, + "step": 1906 + }, + { + "epoch": 0.95, + "learning_rate": 1.1319081293671541e-07, + "loss": 0.3078, + "step": 1907 + }, + { + "epoch": 0.95, + "learning_rate": 1.1077416907163573e-07, + "loss": 0.2436, + "step": 1908 + }, + { + "epoch": 0.95, + "learning_rate": 1.0838345854842447e-07, + "loss": 0.2853, + "step": 1909 + }, + { + "epoch": 0.95, + "learning_rate": 1.0601868763643997e-07, + "loss": 0.3035, + "step": 1910 + }, + { + "epoch": 0.96, + "learning_rate": 1.0367986253701945e-07, + "loss": 0.3109, + "step": 1911 + }, + { + "epoch": 0.96, + "learning_rate": 1.0136698938346012e-07, + "loss": 0.2506, + "step": 1912 + }, + { + "epoch": 0.96, + "learning_rate": 9.90800742410003e-08, + "loss": 0.3005, + "step": 1913 + }, + { + "epoch": 0.96, + "learning_rate": 9.68191231068083e-08, + "loss": 0.284, + "step": 1914 + }, + { + "epoch": 0.96, + "learning_rate": 9.45841419099669e-08, + "loss": 0.2738, + "step": 1915 + }, + { + 
"epoch": 0.96, + "learning_rate": 9.237513651145224e-08, + "loss": 0.3183, + "step": 1916 + }, + { + "epoch": 0.96, + "learning_rate": 9.019211270412275e-08, + "loss": 0.3705, + "step": 1917 + }, + { + "epoch": 0.96, + "learning_rate": 8.80350762127058e-08, + "loss": 0.2912, + "step": 1918 + }, + { + "epoch": 0.96, + "learning_rate": 8.590403269377656e-08, + "loss": 0.334, + "step": 1919 + }, + { + "epoch": 0.96, + "learning_rate": 8.379898773574924e-08, + "loss": 0.3033, + "step": 1920 + }, + { + "epoch": 0.96, + "learning_rate": 8.171994685885698e-08, + "loss": 0.2956, + "step": 1921 + }, + { + "epoch": 0.96, + "learning_rate": 7.966691551514527e-08, + "loss": 0.3074, + "step": 1922 + }, + { + "epoch": 0.96, + "learning_rate": 7.763989908844749e-08, + "loss": 0.3667, + "step": 1923 + }, + { + "epoch": 0.96, + "learning_rate": 7.563890289437825e-08, + "loss": 0.328, + "step": 1924 + }, + { + "epoch": 0.96, + "learning_rate": 7.366393218031564e-08, + "loss": 0.317, + "step": 1925 + }, + { + "epoch": 0.96, + "learning_rate": 7.171499212539124e-08, + "loss": 0.263, + "step": 1926 + }, + { + "epoch": 0.96, + "learning_rate": 6.979208784047454e-08, + "loss": 0.2675, + "step": 1927 + }, + { + "epoch": 0.96, + "learning_rate": 6.78952243681541e-08, + "loss": 0.3353, + "step": 1928 + }, + { + "epoch": 0.96, + "learning_rate": 6.602440668273758e-08, + "loss": 0.2805, + "step": 1929 + }, + { + "epoch": 0.96, + "learning_rate": 6.417963969022389e-08, + "loss": 0.2993, + "step": 1930 + }, + { + "epoch": 0.97, + "learning_rate": 6.236092822829887e-08, + "loss": 0.2835, + "step": 1931 + }, + { + "epoch": 0.97, + "learning_rate": 6.056827706632185e-08, + "loss": 0.2756, + "step": 1932 + }, + { + "epoch": 0.97, + "learning_rate": 5.880169090531351e-08, + "loss": 0.2952, + "step": 1933 + }, + { + "epoch": 0.97, + "learning_rate": 5.7061174377937015e-08, + "loss": 0.3181, + "step": 1934 + }, + { + "epoch": 0.97, + "learning_rate": 5.534673204849572e-08, + "loss": 0.2758, + "step": 1935 + }, + { + "epoch": 0.97, + "learning_rate": 5.365836841291439e-08, + "loss": 0.3501, + "step": 1936 + }, + { + "epoch": 0.97, + "learning_rate": 5.199608789873134e-08, + "loss": 0.285, + "step": 1937 + }, + { + "epoch": 0.97, + "learning_rate": 5.035989486508075e-08, + "loss": 0.3217, + "step": 1938 + }, + { + "epoch": 0.97, + "learning_rate": 4.874979360268928e-08, + "loss": 0.3108, + "step": 1939 + }, + { + "epoch": 0.97, + "learning_rate": 4.716578833386054e-08, + "loss": 0.2918, + "step": 1940 + }, + { + "epoch": 0.97, + "learning_rate": 4.56078832124629e-08, + "loss": 0.2769, + "step": 1941 + }, + { + "epoch": 0.97, + "learning_rate": 4.4076082323920576e-08, + "loss": 0.2785, + "step": 1942 + }, + { + "epoch": 0.97, + "learning_rate": 4.257038968520366e-08, + "loss": 0.3826, + "step": 1943 + }, + { + "epoch": 0.97, + "learning_rate": 4.109080924481479e-08, + "loss": 0.3202, + "step": 1944 + }, + { + "epoch": 0.97, + "learning_rate": 3.963734488278248e-08, + "loss": 0.3273, + "step": 1945 + }, + { + "epoch": 0.97, + "learning_rate": 3.82100004106456e-08, + "loss": 0.2841, + "step": 1946 + }, + { + "epoch": 0.97, + "learning_rate": 3.680877957145112e-08, + "loss": 0.2521, + "step": 1947 + }, + { + "epoch": 0.97, + "learning_rate": 3.543368603973529e-08, + "loss": 0.2639, + "step": 1948 + }, + { + "epoch": 0.97, + "learning_rate": 3.408472342152136e-08, + "loss": 0.2813, + "step": 1949 + }, + { + "epoch": 0.97, + "learning_rate": 3.2761895254306285e-08, + "loss": 0.2856, + "step": 1950 + }, + { + "epoch": 0.98, + 
"learning_rate": 3.1465205007052965e-08, + "loss": 0.2849, + "step": 1951 + }, + { + "epoch": 0.98, + "learning_rate": 3.019465608018024e-08, + "loss": 0.313, + "step": 1952 + }, + { + "epoch": 0.98, + "learning_rate": 2.8950251805553997e-08, + "loss": 0.2965, + "step": 1953 + }, + { + "epoch": 0.98, + "learning_rate": 2.773199544648164e-08, + "loss": 0.2648, + "step": 1954 + }, + { + "epoch": 0.98, + "learning_rate": 2.6539890197695428e-08, + "loss": 0.3016, + "step": 1955 + }, + { + "epoch": 0.98, + "learning_rate": 2.537393918535358e-08, + "loss": 0.2957, + "step": 1956 + }, + { + "epoch": 0.98, + "learning_rate": 2.423414546702807e-08, + "loss": 0.2827, + "step": 1957 + }, + { + "epoch": 0.98, + "learning_rate": 2.312051203169352e-08, + "loss": 0.3606, + "step": 1958 + }, + { + "epoch": 0.98, + "learning_rate": 2.2033041799723877e-08, + "loss": 0.3286, + "step": 1959 + }, + { + "epoch": 0.98, + "learning_rate": 2.0971737622883515e-08, + "loss": 0.2846, + "step": 1960 + }, + { + "epoch": 0.98, + "learning_rate": 1.9936602284318375e-08, + "loss": 0.3424, + "step": 1961 + }, + { + "epoch": 0.98, + "learning_rate": 1.8927638498551502e-08, + "loss": 0.3063, + "step": 1962 + }, + { + "epoch": 0.98, + "learning_rate": 1.7944848911470857e-08, + "loss": 0.2941, + "step": 1963 + }, + { + "epoch": 0.98, + "learning_rate": 1.698823610032929e-08, + "loss": 0.3948, + "step": 1964 + }, + { + "epoch": 0.98, + "learning_rate": 1.605780257373124e-08, + "loss": 0.3256, + "step": 1965 + }, + { + "epoch": 0.98, + "learning_rate": 1.5153550771630498e-08, + "loss": 0.2806, + "step": 1966 + }, + { + "epoch": 0.98, + "learning_rate": 1.4275483065321338e-08, + "loss": 0.2948, + "step": 1967 + }, + { + "epoch": 0.98, + "learning_rate": 1.3423601757436289e-08, + "loss": 0.2675, + "step": 1968 + }, + { + "epoch": 0.98, + "learning_rate": 1.2597909081931702e-08, + "loss": 0.2999, + "step": 1969 + }, + { + "epoch": 0.98, + "learning_rate": 1.179840720409331e-08, + "loss": 0.2986, + "step": 1970 + }, + { + "epoch": 0.99, + "learning_rate": 1.102509822051845e-08, + "loss": 0.3122, + "step": 1971 + }, + { + "epoch": 0.99, + "learning_rate": 1.0277984159122734e-08, + "loss": 0.3172, + "step": 1972 + }, + { + "epoch": 0.99, + "learning_rate": 9.557066979123398e-09, + "loss": 0.315, + "step": 1973 + }, + { + "epoch": 0.99, + "learning_rate": 8.862348571043733e-09, + "loss": 0.3141, + "step": 1974 + }, + { + "epoch": 0.99, + "learning_rate": 8.193830756699773e-09, + "loss": 0.3203, + "step": 1975 + }, + { + "epoch": 0.99, + "learning_rate": 7.551515289203615e-09, + "loss": 0.3238, + "step": 1976 + }, + { + "epoch": 0.99, + "learning_rate": 6.935403852950107e-09, + "loss": 0.2921, + "step": 1977 + }, + { + "epoch": 0.99, + "learning_rate": 6.345498063622391e-09, + "loss": 0.3407, + "step": 1978 + }, + { + "epoch": 0.99, + "learning_rate": 5.781799468177473e-09, + "loss": 0.2725, + "step": 1979 + }, + { + "epoch": 0.99, + "learning_rate": 5.2443095448506674e-09, + "loss": 0.3079, + "step": 1980 + }, + { + "epoch": 0.99, + "learning_rate": 4.733029703146708e-09, + "loss": 0.3236, + "step": 1981 + }, + { + "epoch": 0.99, + "learning_rate": 4.247961283835311e-09, + "loss": 0.3236, + "step": 1982 + }, + { + "epoch": 0.99, + "learning_rate": 3.789105558954509e-09, + "loss": 0.3095, + "step": 1983 + }, + { + "epoch": 0.99, + "learning_rate": 3.3564637317984318e-09, + "loss": 0.3494, + "step": 1984 + }, + { + "epoch": 0.99, + "learning_rate": 2.9500369369195313e-09, + "loss": 0.3102, + "step": 1985 + }, + { + "epoch": 0.99, + 
"learning_rate": 2.5698262401263607e-09, + "loss": 0.2826, + "step": 1986 + }, + { + "epoch": 0.99, + "learning_rate": 2.215832638474691e-09, + "loss": 0.2811, + "step": 1987 + }, + { + "epoch": 0.99, + "learning_rate": 1.888057060274173e-09, + "loss": 0.2664, + "step": 1988 + }, + { + "epoch": 0.99, + "learning_rate": 1.5865003650761268e-09, + "loss": 0.3725, + "step": 1989 + }, + { + "epoch": 0.99, + "learning_rate": 1.3111633436779792e-09, + "loss": 0.3254, + "step": 1990 + }, + { + "epoch": 1.0, + "learning_rate": 1.062046718121046e-09, + "loss": 0.3177, + "step": 1991 + }, + { + "epoch": 1.0, + "learning_rate": 8.391511416816489e-10, + "loss": 0.2967, + "step": 1992 + }, + { + "epoch": 1.0, + "learning_rate": 6.424771988788881e-10, + "loss": 0.3657, + "step": 1993 + }, + { + "epoch": 1.0, + "learning_rate": 4.720254054679796e-10, + "loss": 0.2909, + "step": 1994 + }, + { + "epoch": 1.0, + "learning_rate": 3.277962084369257e-10, + "loss": 0.3087, + "step": 1995 + }, + { + "epoch": 1.0, + "learning_rate": 2.0978998601206558e-10, + "loss": 0.3516, + "step": 1996 + }, + { + "epoch": 1.0, + "learning_rate": 1.1800704765030367e-10, + "loss": 0.3215, + "step": 1997 + }, + { + "epoch": 1.0, + "learning_rate": 5.244763404133046e-11, + "loss": 0.3044, + "step": 1998 + }, + { + "epoch": 1.0, + "learning_rate": 1.311191710651194e-11, + "loss": 0.3368, + "step": 1999 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "loss": 0.2367, + "step": 2000 + }, + { + "epoch": 1.0, + "eval_code_gate_load": [ + 205.2, + 169.7, + 178.05, + 154.0, + 178.95, + 188.05, + 195.85, + 170.2 + ], + "eval_code_loss": 0.2437744140625, + "eval_code_runtime": 1.7981, + "eval_code_samples_per_second": 556.137, + "eval_code_steps_per_second": 35.037, + "step": 2000 + }, + { + "epoch": 1.0, + "eval_orca_gate_load": [ + 497.05, + 341.55, + 406.6, + 396.8, + 351.5, + 422.2, + 370.25, + 356.35 + ], + "eval_orca_loss": 0.34916990995407104, + "eval_orca_runtime": 2.0032, + "eval_orca_samples_per_second": 499.21, + "eval_orca_steps_per_second": 31.45, + "step": 2000 + }, + { + "epoch": 1.0, + "eval_math_gate_load": [ + 314.55, + 216.95, + 239.45, + 221.2, + 253.25, + 261.05, + 276.5, + 249.75 + ], + "eval_math_loss": 0.24287109076976776, + "eval_math_runtime": 1.8382, + "eval_math_samples_per_second": 544.019, + "eval_math_steps_per_second": 34.273, + "step": 2000 + }, + { + "epoch": 1.0, + "eval_sharegpt_gate_load": [ + 1524.05, + 1079.85, + 1307.8, + 1203.45, + 1136.45, + 1369.95, + 1180.15, + 1109.9 + ], + "eval_sharegpt_loss": 0.4881835877895355, + "eval_sharegpt_runtime": 2.9976, + "eval_sharegpt_samples_per_second": 333.601, + "eval_sharegpt_steps_per_second": 21.017, + "step": 2000 + }, + { + "epoch": 1.0, + "step": 2000, + "total_flos": 2.079150933069005e+19, + "train_loss": 0.5031886091232299, + "train_runtime": 25719.4214, + "train_samples_per_second": 9.954, + "train_steps_per_second": 0.078 + } + ], + "logging_steps": 1.0, + "max_steps": 2000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 9999999999999, + "total_flos": 2.079150933069005e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}