{ "best_metric": 0.5249539017677307, "best_model_checkpoint": "post-auto-v3/checkpoint-180", "epoch": 2.0, "eval_steps": 500, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.044444444444444446, "grad_norm": 49.678558349609375, "learning_rate": 7.4074074074074075e-06, "loss": 2.749, "step": 4 }, { "epoch": 0.08888888888888889, "grad_norm": 38.787879943847656, "learning_rate": 1.4814814814814815e-05, "loss": 3.1578, "step": 8 }, { "epoch": 0.13333333333333333, "grad_norm": 42.88180160522461, "learning_rate": 2.2222222222222223e-05, "loss": 2.6539, "step": 12 }, { "epoch": 0.17777777777777778, "grad_norm": 35.309532165527344, "learning_rate": 2.962962962962963e-05, "loss": 1.7708, "step": 16 }, { "epoch": 0.2222222222222222, "grad_norm": 23.5089168548584, "learning_rate": 3.7037037037037037e-05, "loss": 1.1406, "step": 20 }, { "epoch": 0.26666666666666666, "grad_norm": 7.427271842956543, "learning_rate": 4.4444444444444447e-05, "loss": 0.8041, "step": 24 }, { "epoch": 0.3111111111111111, "grad_norm": 11.275213241577148, "learning_rate": 4.9794238683127575e-05, "loss": 0.8478, "step": 28 }, { "epoch": 0.35555555555555557, "grad_norm": 21.598812103271484, "learning_rate": 4.8971193415637865e-05, "loss": 0.885, "step": 32 }, { "epoch": 0.4, "grad_norm": 15.150390625, "learning_rate": 4.814814814814815e-05, "loss": 0.7711, "step": 36 }, { "epoch": 0.4444444444444444, "grad_norm": 8.3656587600708, "learning_rate": 4.732510288065844e-05, "loss": 0.6932, "step": 40 }, { "epoch": 0.4888888888888889, "grad_norm": 6.864919662475586, "learning_rate": 4.650205761316873e-05, "loss": 0.6934, "step": 44 }, { "epoch": 0.5333333333333333, "grad_norm": 20.607200622558594, "learning_rate": 4.567901234567901e-05, "loss": 0.6592, "step": 48 }, { "epoch": 0.5777777777777777, "grad_norm": 5.4164276123046875, "learning_rate": 4.48559670781893e-05, "loss": 0.6815, "step": 52 }, { "epoch": 0.6222222222222222, "grad_norm": 21.185943603515625, "learning_rate": 4.403292181069959e-05, "loss": 0.7281, "step": 56 }, { "epoch": 0.6666666666666666, "grad_norm": 5.345090866088867, "learning_rate": 4.3209876543209875e-05, "loss": 0.7072, "step": 60 }, { "epoch": 0.7111111111111111, "grad_norm": 28.171621322631836, "learning_rate": 4.2386831275720165e-05, "loss": 0.5982, "step": 64 }, { "epoch": 0.7555555555555555, "grad_norm": 18.241920471191406, "learning_rate": 4.1563786008230455e-05, "loss": 0.6613, "step": 68 }, { "epoch": 0.8, "grad_norm": 26.421844482421875, "learning_rate": 4.074074074074074e-05, "loss": 0.8852, "step": 72 }, { "epoch": 0.8444444444444444, "grad_norm": 18.892175674438477, "learning_rate": 3.9917695473251035e-05, "loss": 0.7267, "step": 76 }, { "epoch": 0.8888888888888888, "grad_norm": 15.053618431091309, "learning_rate": 3.909465020576132e-05, "loss": 0.3929, "step": 80 }, { "epoch": 0.9333333333333333, "grad_norm": 45.083675384521484, "learning_rate": 3.82716049382716e-05, "loss": 0.5913, "step": 84 }, { "epoch": 0.9777777777777777, "grad_norm": 16.20758628845215, "learning_rate": 3.74485596707819e-05, "loss": 0.6828, "step": 88 }, { "epoch": 1.0, "eval_accuracy": 0.7166666666666667, "eval_f1_macro": 0.7356755710414246, "eval_f1_micro": 0.7166666666666667, "eval_f1_weighted": 0.714955155503936, "eval_loss": 0.5613141655921936, "eval_precision_macro": 0.7246444246444246, "eval_precision_micro": 0.7166666666666667, "eval_precision_weighted": 0.7248502348502348, "eval_recall_macro": 0.7572316572316572, "eval_recall_micro": 0.7166666666666667, "eval_recall_weighted": 0.7166666666666667, "eval_runtime": 45.0143, "eval_samples_per_second": 3.999, "eval_steps_per_second": 0.267, "step": 90 }, { "epoch": 1.0222222222222221, "grad_norm": 23.50992774963379, "learning_rate": 3.662551440329218e-05, "loss": 0.661, "step": 92 }, { "epoch": 1.0666666666666667, "grad_norm": 16.499202728271484, "learning_rate": 3.580246913580247e-05, "loss": 0.54, "step": 96 }, { "epoch": 1.1111111111111112, "grad_norm": 12.408361434936523, "learning_rate": 3.497942386831276e-05, "loss": 0.4252, "step": 100 }, { "epoch": 1.1555555555555554, "grad_norm": 30.811758041381836, "learning_rate": 3.4156378600823045e-05, "loss": 0.3497, "step": 104 }, { "epoch": 1.2, "grad_norm": 33.299259185791016, "learning_rate": 3.3333333333333335e-05, "loss": 0.3372, "step": 108 }, { "epoch": 1.2444444444444445, "grad_norm": 9.907443046569824, "learning_rate": 3.2510288065843625e-05, "loss": 0.4249, "step": 112 }, { "epoch": 1.2888888888888888, "grad_norm": 26.002504348754883, "learning_rate": 3.168724279835391e-05, "loss": 0.4189, "step": 116 }, { "epoch": 1.3333333333333333, "grad_norm": 28.73817253112793, "learning_rate": 3.08641975308642e-05, "loss": 0.7664, "step": 120 }, { "epoch": 1.3777777777777778, "grad_norm": 26.192493438720703, "learning_rate": 3.0041152263374488e-05, "loss": 0.5389, "step": 124 }, { "epoch": 1.4222222222222223, "grad_norm": 38.437782287597656, "learning_rate": 2.9218106995884775e-05, "loss": 0.5959, "step": 128 }, { "epoch": 1.4666666666666668, "grad_norm": 17.456642150878906, "learning_rate": 2.839506172839506e-05, "loss": 0.474, "step": 132 }, { "epoch": 1.511111111111111, "grad_norm": 14.425949096679688, "learning_rate": 2.757201646090535e-05, "loss": 0.5086, "step": 136 }, { "epoch": 1.5555555555555556, "grad_norm": 20.57114028930664, "learning_rate": 2.6748971193415638e-05, "loss": 0.6384, "step": 140 }, { "epoch": 1.6, "grad_norm": 18.78620719909668, "learning_rate": 2.5925925925925925e-05, "loss": 0.5804, "step": 144 }, { "epoch": 1.6444444444444444, "grad_norm": 33.17873764038086, "learning_rate": 2.510288065843622e-05, "loss": 0.5028, "step": 148 }, { "epoch": 1.6888888888888889, "grad_norm": 26.253496170043945, "learning_rate": 2.4279835390946505e-05, "loss": 0.446, "step": 152 }, { "epoch": 1.7333333333333334, "grad_norm": 22.726823806762695, "learning_rate": 2.345679012345679e-05, "loss": 0.5482, "step": 156 }, { "epoch": 1.7777777777777777, "grad_norm": 13.564722061157227, "learning_rate": 2.2633744855967078e-05, "loss": 0.4052, "step": 160 }, { "epoch": 1.8222222222222222, "grad_norm": 13.872465133666992, "learning_rate": 2.1810699588477368e-05, "loss": 0.3939, "step": 164 }, { "epoch": 1.8666666666666667, "grad_norm": 8.408356666564941, "learning_rate": 2.0987654320987655e-05, "loss": 0.3675, "step": 168 }, { "epoch": 1.911111111111111, "grad_norm": 6.3673624992370605, "learning_rate": 2.016460905349794e-05, "loss": 0.3066, "step": 172 }, { "epoch": 1.9555555555555557, "grad_norm": 11.568488121032715, "learning_rate": 1.934156378600823e-05, "loss": 0.2487, "step": 176 }, { "epoch": 2.0, "grad_norm": 3.9987974166870117, "learning_rate": 1.8518518518518518e-05, "loss": 0.3745, "step": 180 }, { "epoch": 2.0, "eval_accuracy": 0.7722222222222223, "eval_f1_macro": 0.8129276695270152, "eval_f1_micro": 0.7722222222222223, "eval_f1_weighted": 0.7733237593303729, "eval_loss": 0.5249539017677307, "eval_precision_macro": 0.8374953236064346, "eval_precision_micro": 0.7722222222222223, "eval_precision_weighted": 0.775858897618157, "eval_recall_macro": 0.7930180930180931, "eval_recall_micro": 0.7722222222222223, "eval_recall_weighted": 0.7722222222222223, "eval_runtime": 47.7709, "eval_samples_per_second": 3.768, "eval_steps_per_second": 0.251, "step": 180 } ], "logging_steps": 4, "max_steps": 270, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 94720830382080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }