{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18518518518518517, "grad_norm": 9.9296875, "learning_rate": 0.00019983081582712685, "loss": 0.8329, "step": 10 }, { "epoch": 0.37037037037037035, "grad_norm": 15.890625, "learning_rate": 0.00019932383577419432, "loss": 0.8473, "step": 20 }, { "epoch": 0.5555555555555556, "grad_norm": 15.71875, "learning_rate": 0.00019848077530122083, "loss": 0.8181, "step": 30 }, { "epoch": 0.7407407407407407, "grad_norm": 17.453125, "learning_rate": 0.00019730448705798239, "loss": 0.8512, "step": 40 }, { "epoch": 0.9259259259259259, "grad_norm": 18.234375, "learning_rate": 0.0001957989512315489, "loss": 0.8302, "step": 50 }, { "epoch": 1.1111111111111112, "grad_norm": 25.78125, "learning_rate": 0.00019396926207859084, "loss": 0.8281, "step": 60 }, { "epoch": 1.2962962962962963, "grad_norm": 24.984375, "learning_rate": 0.00019182161068802741, "loss": 0.8343, "step": 70 }, { "epoch": 1.4814814814814814, "grad_norm": 24.296875, "learning_rate": 0.00018936326403234125, "loss": 0.7968, "step": 80 }, { "epoch": 1.6666666666666665, "grad_norm": 16.3125, "learning_rate": 0.00018660254037844388, "loss": 0.7891, "step": 90 }, { "epoch": 1.8518518518518519, "grad_norm": 13.578125, "learning_rate": 0.00018354878114129367, "loss": 0.7864, "step": 100 }, { "epoch": 2.037037037037037, "grad_norm": 14.25, "learning_rate": 0.0001802123192755044, "loss": 0.7815, "step": 110 }, { "epoch": 2.2222222222222223, "grad_norm": 14.7734375, "learning_rate": 0.0001766044443118978, "loss": 0.7697, "step": 120 }, { "epoch": 2.4074074074074074, "grad_norm": 11.3125, "learning_rate": 0.00017273736415730488, "loss": 0.7728, "step": 130 }, { "epoch": 2.5925925925925926, "grad_norm": 11.203125, "learning_rate": 0.0001686241637868734, "loss": 0.7413, "step": 140 }, { "epoch": 2.7777777777777777, "grad_norm": 14.8125, "learning_rate": 0.00016427876096865394, "loss": 0.749, "step": 150 }, { "epoch": 2.962962962962963, "grad_norm": 11.5, "learning_rate": 0.00015971585917027862, "loss": 0.7216, "step": 160 }, { "epoch": 3.148148148148148, "grad_norm": 13.109375, "learning_rate": 0.0001549508978070806, "loss": 0.7739, "step": 170 }, { "epoch": 3.3333333333333335, "grad_norm": 13.453125, "learning_rate": 0.00015000000000000001, "loss": 0.733, "step": 180 }, { "epoch": 3.5185185185185186, "grad_norm": 17.84375, "learning_rate": 0.00014487991802004623, "loss": 0.7267, "step": 190 }, { "epoch": 3.7037037037037037, "grad_norm": 16.765625, "learning_rate": 0.0001396079766039157, "loss": 0.7191, "step": 200 }, { "epoch": 3.888888888888889, "grad_norm": 17.515625, "learning_rate": 0.00013420201433256689, "loss": 0.7207, "step": 210 }, { "epoch": 4.074074074074074, "grad_norm": 21.578125, "learning_rate": 0.00012868032327110904, "loss": 0.7718, "step": 220 }, { "epoch": 4.2592592592592595, "grad_norm": 15.7890625, "learning_rate": 0.00012306158707424403, "loss": 0.7487, "step": 230 }, { "epoch": 4.444444444444445, "grad_norm": 15.9140625, "learning_rate": 0.00011736481776669306, "loss": 0.7416, "step": 240 }, { "epoch": 4.62962962962963, "grad_norm": 19.1875, "learning_rate": 0.00011160929141252303, "loss": 0.7517, "step": 250 }, { "epoch": 4.814814814814815, "grad_norm": 20.03125, "learning_rate": 0.00010581448289104758, "loss": 0.7521, "step": 260 }, { "epoch": 5.0, "grad_norm": 20.421875, "learning_rate": 0.0001, "loss": 0.753, "step": 270 }, { "epoch": 5.185185185185185, "grad_norm": 21.109375, "learning_rate": 9.418551710895243e-05, "loss": 0.7609, "step": 280 }, { "epoch": 5.37037037037037, "grad_norm": 17.53125, "learning_rate": 8.839070858747697e-05, "loss": 0.7744, "step": 290 }, { "epoch": 5.555555555555555, "grad_norm": 16.90625, "learning_rate": 8.263518223330697e-05, "loss": 0.7907, "step": 300 }, { "epoch": 5.7407407407407405, "grad_norm": 20.46875, "learning_rate": 7.693841292575598e-05, "loss": 0.8155, "step": 310 }, { "epoch": 5.925925925925926, "grad_norm": 19.984375, "learning_rate": 7.131967672889101e-05, "loss": 0.7919, "step": 320 }, { "epoch": 6.111111111111111, "grad_norm": 20.0625, "learning_rate": 6.579798566743314e-05, "loss": 0.7648, "step": 330 }, { "epoch": 6.296296296296296, "grad_norm": 18.140625, "learning_rate": 6.039202339608432e-05, "loss": 0.7645, "step": 340 }, { "epoch": 6.481481481481482, "grad_norm": 17.40625, "learning_rate": 5.5120081979953785e-05, "loss": 0.7765, "step": 350 }, { "epoch": 6.666666666666667, "grad_norm": 18.484375, "learning_rate": 5.000000000000002e-05, "loss": 0.8114, "step": 360 }, { "epoch": 6.851851851851852, "grad_norm": 21.609375, "learning_rate": 4.50491021929194e-05, "loss": 0.7967, "step": 370 }, { "epoch": 7.037037037037037, "grad_norm": 26.5625, "learning_rate": 4.028414082972141e-05, "loss": 0.8445, "step": 380 }, { "epoch": 7.222222222222222, "grad_norm": 21.703125, "learning_rate": 3.5721239031346066e-05, "loss": 0.8495, "step": 390 }, { "epoch": 7.407407407407407, "grad_norm": 20.984375, "learning_rate": 3.137583621312665e-05, "loss": 0.8274, "step": 400 }, { "epoch": 7.592592592592593, "grad_norm": 19.296875, "learning_rate": 2.7262635842695127e-05, "loss": 0.8176, "step": 410 }, { "epoch": 7.777777777777778, "grad_norm": 21.984375, "learning_rate": 2.339555568810221e-05, "loss": 0.8328, "step": 420 }, { "epoch": 7.962962962962963, "grad_norm": 20.515625, "learning_rate": 1.9787680724495617e-05, "loss": 0.8183, "step": 430 }, { "epoch": 8.148148148148149, "grad_norm": 25.671875, "learning_rate": 1.6451218858706374e-05, "loss": 0.7987, "step": 440 }, { "epoch": 8.333333333333334, "grad_norm": 20.28125, "learning_rate": 1.339745962155613e-05, "loss": 0.7994, "step": 450 }, { "epoch": 8.518518518518519, "grad_norm": 22.828125, "learning_rate": 1.0636735967658784e-05, "loss": 0.7928, "step": 460 }, { "epoch": 8.703703703703704, "grad_norm": 22.375, "learning_rate": 8.178389311972612e-06, "loss": 0.7978, "step": 470 }, { "epoch": 8.88888888888889, "grad_norm": 22.171875, "learning_rate": 6.030737921409169e-06, "loss": 0.8077, "step": 480 }, { "epoch": 9.074074074074074, "grad_norm": 21.28125, "learning_rate": 4.20104876845111e-06, "loss": 0.8003, "step": 490 }, { "epoch": 9.25925925925926, "grad_norm": 18.765625, "learning_rate": 2.6955129420176196e-06, "loss": 0.8012, "step": 500 }, { "epoch": 9.444444444444445, "grad_norm": 19.609375, "learning_rate": 1.5192246987791981e-06, "loss": 0.7831, "step": 510 }, { "epoch": 9.62962962962963, "grad_norm": 17.96875, "learning_rate": 6.761642258056978e-07, "loss": 0.783, "step": 520 }, { "epoch": 9.814814814814815, "grad_norm": 18.890625, "learning_rate": 1.6918417287318245e-07, "loss": 0.8059, "step": 530 }, { "epoch": 10.0, "grad_norm": 28.9375, "learning_rate": 0.0, "loss": 0.7967, "step": 540 }, { "epoch": 10.0, "step": 540, "total_flos": 3.52542013784064e+16, "train_loss": 0.7879019375200624, "train_runtime": 461.2272, "train_samples_per_second": 4.683, "train_steps_per_second": 1.171 } ], "logging_steps": 10, "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.52542013784064e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }