{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13513513513513514, "grad_norm": 10.65625, "learning_rate": 0.00019990989662046818, "loss": 1.0534, "step": 10 }, { "epoch": 0.2702702702702703, "grad_norm": 10.4765625, "learning_rate": 0.00019963974885425266, "loss": 0.9925, "step": 20 }, { "epoch": 0.40540540540540543, "grad_norm": 12.546875, "learning_rate": 0.00019919004352588767, "loss": 0.9846, "step": 30 }, { "epoch": 0.5405405405405406, "grad_norm": 10.9296875, "learning_rate": 0.00019856159103477086, "loss": 0.9861, "step": 40 }, { "epoch": 0.6756756756756757, "grad_norm": 12.125, "learning_rate": 0.00019775552389476864, "loss": 0.9614, "step": 50 }, { "epoch": 0.8108108108108109, "grad_norm": 11.1640625, "learning_rate": 0.0001967732946933499, "loss": 0.9254, "step": 60 }, { "epoch": 0.9459459459459459, "grad_norm": 11.2109375, "learning_rate": 0.00019561667347392508, "loss": 0.9275, "step": 70 }, { "epoch": 1.0810810810810811, "grad_norm": 12.0703125, "learning_rate": 0.00019428774454610843, "loss": 0.9252, "step": 80 }, { "epoch": 1.2162162162162162, "grad_norm": 10.671875, "learning_rate": 0.00019278890272965096, "loss": 0.8997, "step": 90 }, { "epoch": 1.3513513513513513, "grad_norm": 10.0234375, "learning_rate": 0.0001911228490388136, "loss": 0.8997, "step": 100 }, { "epoch": 1.4864864864864864, "grad_norm": 9.828125, "learning_rate": 0.00018929258581495685, "loss": 0.9081, "step": 110 }, { "epoch": 1.6216216216216215, "grad_norm": 9.78125, "learning_rate": 0.00018730141131611882, "loss": 0.88, "step": 120 }, { "epoch": 1.7567567567567568, "grad_norm": 9.625, "learning_rate": 0.00018515291377333112, "loss": 0.8677, "step": 130 }, { "epoch": 1.8918918918918919, "grad_norm": 10.1875, "learning_rate": 0.00018285096492438424, "loss": 0.8999, "step": 140 }, { "epoch": 2.027027027027027, "grad_norm": 11.6640625, "learning_rate": 0.00018039971303669407, "loss": 0.9, "step": 150 }, { "epoch": 2.1621621621621623, "grad_norm": 12.2109375, "learning_rate": 0.00017780357543184397, "loss": 0.912, "step": 160 }, { "epoch": 2.2972972972972974, "grad_norm": 14.1640625, "learning_rate": 0.00017506723052527242, "loss": 0.8953, "step": 170 }, { "epoch": 2.4324324324324325, "grad_norm": 15.015625, "learning_rate": 0.00017219560939545246, "loss": 0.9759, "step": 180 }, { "epoch": 2.5675675675675675, "grad_norm": 15.5546875, "learning_rate": 0.00016919388689775464, "loss": 1.0338, "step": 190 }, { "epoch": 2.7027027027027026, "grad_norm": 16.5, "learning_rate": 0.00016606747233900815, "loss": 1.0389, "step": 200 }, { "epoch": 2.8378378378378377, "grad_norm": 16.9375, "learning_rate": 0.00016282199972956425, "loss": 1.0149, "step": 210 }, { "epoch": 2.972972972972973, "grad_norm": 14.21875, "learning_rate": 0.00015946331763042867, "loss": 0.9946, "step": 220 }, { "epoch": 3.108108108108108, "grad_norm": 11.09375, "learning_rate": 0.00015599747861375955, "loss": 0.9629, "step": 230 }, { "epoch": 3.2432432432432434, "grad_norm": 13.4921875, "learning_rate": 0.00015243072835572318, "loss": 0.9677, "step": 240 }, { "epoch": 3.3783783783783785, "grad_norm": 13.3046875, "learning_rate": 0.00014876949438136347, "loss": 0.9327, "step": 250 }, { "epoch": 3.5135135135135136, "grad_norm": 13.03125, "learning_rate": 0.00014502037448176734, "loss": 0.9566, "step": 260 }, { "epoch": 3.6486486486486487, "grad_norm": 11.9921875, "learning_rate": 0.0001411901248243993, "loss": 0.913, "step": 270 }, { "epoch": 3.7837837837837838, "grad_norm": 10.953125, "learning_rate": 0.00013728564777803088, "loss": 0.9266, "step": 280 }, { "epoch": 3.918918918918919, "grad_norm": 12.15625, "learning_rate": 0.00013331397947420576, "loss": 0.8859, "step": 290 }, { "epoch": 4.054054054054054, "grad_norm": 13.1171875, "learning_rate": 0.00012928227712765504, "loss": 0.8896, "step": 300 }, { "epoch": 4.1891891891891895, "grad_norm": 11.8203125, "learning_rate": 0.00012519780613851254, "loss": 0.8317, "step": 310 }, { "epoch": 4.324324324324325, "grad_norm": 12.53125, "learning_rate": 0.00012106792699957263, "loss": 0.8712, "step": 320 }, { "epoch": 4.45945945945946, "grad_norm": 12.671875, "learning_rate": 0.00011690008203218493, "loss": 0.8436, "step": 330 }, { "epoch": 4.594594594594595, "grad_norm": 15.8515625, "learning_rate": 0.00011270178197468789, "loss": 0.8538, "step": 340 }, { "epoch": 4.72972972972973, "grad_norm": 12.46875, "learning_rate": 0.00010848059244755093, "loss": 0.8787, "step": 350 }, { "epoch": 4.864864864864865, "grad_norm": 11.921875, "learning_rate": 0.00010424412031961484, "loss": 0.8823, "step": 360 }, { "epoch": 5.0, "grad_norm": 11.8203125, "learning_rate": 0.0001, "loss": 0.8839, "step": 370 }, { "epoch": 5.135135135135135, "grad_norm": 11.515625, "learning_rate": 9.57558796803852e-05, "loss": 0.8362, "step": 380 }, { "epoch": 5.27027027027027, "grad_norm": 12.15625, "learning_rate": 9.151940755244912e-05, "loss": 0.8678, "step": 390 }, { "epoch": 5.405405405405405, "grad_norm": 13.109375, "learning_rate": 8.729821802531212e-05, "loss": 0.8372, "step": 400 }, { "epoch": 5.54054054054054, "grad_norm": 12.96875, "learning_rate": 8.309991796781511e-05, "loss": 0.8662, "step": 410 }, { "epoch": 5.675675675675675, "grad_norm": 15.203125, "learning_rate": 7.89320730004274e-05, "loss": 0.9021, "step": 420 }, { "epoch": 5.8108108108108105, "grad_norm": 12.4765625, "learning_rate": 7.48021938614875e-05, "loss": 0.8902, "step": 430 }, { "epoch": 5.945945945945946, "grad_norm": 12.953125, "learning_rate": 7.071772287234497e-05, "loss": 0.852, "step": 440 }, { "epoch": 6.081081081081081, "grad_norm": 14.9609375, "learning_rate": 6.668602052579424e-05, "loss": 0.8743, "step": 450 }, { "epoch": 6.216216216216216, "grad_norm": 12.8828125, "learning_rate": 6.271435222196916e-05, "loss": 0.8844, "step": 460 }, { "epoch": 6.351351351351352, "grad_norm": 13.3984375, "learning_rate": 5.880987517560075e-05, "loss": 0.8924, "step": 470 }, { "epoch": 6.486486486486487, "grad_norm": 14.3046875, "learning_rate": 5.497962551823266e-05, "loss": 0.9041, "step": 480 }, { "epoch": 6.621621621621622, "grad_norm": 15.421875, "learning_rate": 5.123050561863657e-05, "loss": 0.871, "step": 490 }, { "epoch": 6.756756756756757, "grad_norm": 12.7109375, "learning_rate": 4.756927164427685e-05, "loss": 0.8924, "step": 500 }, { "epoch": 6.891891891891892, "grad_norm": 13.765625, "learning_rate": 4.4002521386240466e-05, "loss": 0.9047, "step": 510 }, { "epoch": 7.027027027027027, "grad_norm": 13.09375, "learning_rate": 4.053668236957134e-05, "loss": 0.9083, "step": 520 }, { "epoch": 7.162162162162162, "grad_norm": 13.875, "learning_rate": 3.717800027043576e-05, "loss": 0.8721, "step": 530 }, { "epoch": 7.297297297297297, "grad_norm": 12.09375, "learning_rate": 3.393252766099187e-05, "loss": 0.916, "step": 540 }, { "epoch": 7.4324324324324325, "grad_norm": 12.3671875, "learning_rate": 3.080611310224539e-05, "loss": 0.8714, "step": 550 }, { "epoch": 7.5675675675675675, "grad_norm": 13.2109375, "learning_rate": 2.7804390604547557e-05, "loss": 0.8843, "step": 560 }, { "epoch": 7.702702702702703, "grad_norm": 13.9765625, "learning_rate": 2.493276947472756e-05, "loss": 0.8934, "step": 570 }, { "epoch": 7.837837837837838, "grad_norm": 14.28125, "learning_rate": 2.2196424568156073e-05, "loss": 0.8953, "step": 580 }, { "epoch": 7.972972972972973, "grad_norm": 14.8828125, "learning_rate": 1.9600286963305957e-05, "loss": 0.8728, "step": 590 }, { "epoch": 8.108108108108109, "grad_norm": 13.2890625, "learning_rate": 1.7149035075615794e-05, "loss": 0.8995, "step": 600 }, { "epoch": 8.243243243243244, "grad_norm": 14.7578125, "learning_rate": 1.4847086226668872e-05, "loss": 0.8685, "step": 610 }, { "epoch": 8.378378378378379, "grad_norm": 12.0703125, "learning_rate": 1.2698588683881186e-05, "loss": 0.88, "step": 620 }, { "epoch": 8.513513513513514, "grad_norm": 16.8125, "learning_rate": 1.0707414185043163e-05, "loss": 0.8936, "step": 630 }, { "epoch": 8.64864864864865, "grad_norm": 14.0859375, "learning_rate": 8.87715096118642e-06, "loss": 0.8745, "step": 640 }, { "epoch": 8.783783783783784, "grad_norm": 13.1796875, "learning_rate": 7.211097270349066e-06, "loss": 0.8849, "step": 650 }, { "epoch": 8.91891891891892, "grad_norm": 12.6328125, "learning_rate": 5.71225545389158e-06, "loss": 0.8775, "step": 660 }, { "epoch": 9.054054054054054, "grad_norm": 12.15625, "learning_rate": 4.383326526074916e-06, "loss": 0.8663, "step": 670 }, { "epoch": 9.18918918918919, "grad_norm": 12.9140625, "learning_rate": 3.226705306650113e-06, "loss": 0.8975, "step": 680 }, { "epoch": 9.324324324324325, "grad_norm": 14.265625, "learning_rate": 2.2444761052313856e-06, "loss": 0.8677, "step": 690 }, { "epoch": 9.45945945945946, "grad_norm": 14.46875, "learning_rate": 1.4384089652291543e-06, "loss": 0.8565, "step": 700 }, { "epoch": 9.594594594594595, "grad_norm": 13.4375, "learning_rate": 8.099564741123166e-07, "loss": 0.8619, "step": 710 }, { "epoch": 9.72972972972973, "grad_norm": 13.5859375, "learning_rate": 3.6025114574734785e-07, "loss": 0.8956, "step": 720 }, { "epoch": 9.864864864864865, "grad_norm": 14.1796875, "learning_rate": 9.010337953185843e-08, "loss": 0.8906, "step": 730 }, { "epoch": 10.0, "grad_norm": 13.2109375, "learning_rate": 0.0, "loss": 0.8617, "step": 740 }, { "epoch": 10.0, "step": 740, "total_flos": 4.83113130000384e+16, "train_loss": 0.9043467921179694, "train_runtime": 626.4938, "train_samples_per_second": 4.725, "train_steps_per_second": 1.181 } ], "logging_steps": 10, "max_steps": 740, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 4.83113130000384e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }