{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9918809201623815, "eval_steps": 500, "global_step": 368, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005412719891745603, "grad_norm": 23.5, "learning_rate": 5.405405405405406e-07, "loss": 2.2045, "step": 1 }, { "epoch": 0.02706359945872801, "grad_norm": 19.375, "learning_rate": 2.702702702702703e-06, "loss": 2.2384, "step": 5 }, { "epoch": 0.05412719891745602, "grad_norm": 6.6875, "learning_rate": 5.405405405405406e-06, "loss": 2.1809, "step": 10 }, { "epoch": 0.08119079837618404, "grad_norm": 3.921875, "learning_rate": 8.108108108108109e-06, "loss": 2.0942, "step": 15 }, { "epoch": 0.10825439783491204, "grad_norm": 2.765625, "learning_rate": 1.0810810810810812e-05, "loss": 2.0225, "step": 20 }, { "epoch": 0.13531799729364005, "grad_norm": 3.40625, "learning_rate": 1.3513513513513515e-05, "loss": 1.9498, "step": 25 }, { "epoch": 0.16238159675236807, "grad_norm": 2.703125, "learning_rate": 1.6216216216216218e-05, "loss": 1.8531, "step": 30 }, { "epoch": 0.18944519621109607, "grad_norm": 1.78125, "learning_rate": 1.891891891891892e-05, "loss": 1.7903, "step": 35 }, { "epoch": 0.2165087956698241, "grad_norm": 1.703125, "learning_rate": 1.9995946530314384e-05, "loss": 1.7538, "step": 40 }, { "epoch": 0.2435723951285521, "grad_norm": 1.5078125, "learning_rate": 1.9971187226043746e-05, "loss": 1.7264, "step": 45 }, { "epoch": 0.2706359945872801, "grad_norm": 1.3515625, "learning_rate": 1.9923976226947417e-05, "loss": 1.6995, "step": 50 }, { "epoch": 0.2976995940460081, "grad_norm": 1.28125, "learning_rate": 1.985441983600819e-05, "loss": 1.6901, "step": 55 }, { "epoch": 0.32476319350473615, "grad_norm": 1.34375, "learning_rate": 1.9762674670369757e-05, "loss": 1.6784, "step": 60 }, { "epoch": 0.35182679296346414, "grad_norm": 1.2890625, "learning_rate": 1.9648947308688594e-05, "loss": 1.6738, "step": 65 }, { "epoch": 0.37889039242219213, "grad_norm": 1.328125, "learning_rate": 1.9513493825989664e-05, "loss": 1.6719, "step": 70 }, { "epoch": 0.4059539918809202, "grad_norm": 1.2734375, "learning_rate": 1.9356619217073252e-05, "loss": 1.6617, "step": 75 }, { "epoch": 0.4330175913396482, "grad_norm": 1.2734375, "learning_rate": 1.917867670977126e-05, "loss": 1.6447, "step": 80 }, { "epoch": 0.46008119079837617, "grad_norm": 1.296875, "learning_rate": 1.8980066969599216e-05, "loss": 1.6337, "step": 85 }, { "epoch": 0.4871447902571042, "grad_norm": 1.25, "learning_rate": 1.8761237197594945e-05, "loss": 1.6549, "step": 90 }, { "epoch": 0.5142083897158322, "grad_norm": 1.25, "learning_rate": 1.852268012337514e-05, "loss": 1.6334, "step": 95 }, { "epoch": 0.5412719891745602, "grad_norm": 1.265625, "learning_rate": 1.8264932895677195e-05, "loss": 1.6276, "step": 100 }, { "epoch": 0.5683355886332883, "grad_norm": 1.2421875, "learning_rate": 1.798857587288445e-05, "loss": 1.6326, "step": 105 }, { "epoch": 0.5953991880920162, "grad_norm": 1.203125, "learning_rate": 1.769423131625808e-05, "loss": 1.6334, "step": 110 }, { "epoch": 0.6224627875507442, "grad_norm": 1.234375, "learning_rate": 1.738256198881809e-05, "loss": 1.6327, "step": 115 }, { "epoch": 0.6495263870094723, "grad_norm": 1.3125, "learning_rate": 1.7054269663028232e-05, "loss": 1.6271, "step": 120 }, { "epoch": 0.6765899864682002, "grad_norm": 1.25, "learning_rate": 1.6710093540645056e-05, "loss": 1.6247, "step": 125 }, { "epoch": 0.7036535859269283, "grad_norm": 1.296875, "learning_rate": 1.6350808588288964e-05, "loss": 1.6255, "step": 130 }, { "epoch": 0.7307171853856563, "grad_norm": 1.234375, "learning_rate": 1.597722379248512e-05, "loss": 1.6155, "step": 135 }, { "epoch": 0.7577807848443843, "grad_norm": 1.2421875, "learning_rate": 1.559018033810316e-05, "loss": 1.6162, "step": 140 }, { "epoch": 0.7848443843031123, "grad_norm": 1.21875, "learning_rate": 1.5190549714297303e-05, "loss": 1.6081, "step": 145 }, { "epoch": 0.8119079837618404, "grad_norm": 1.21875, "learning_rate": 1.4779231752211546e-05, "loss": 1.6031, "step": 150 }, { "epoch": 0.8389715832205683, "grad_norm": 1.28125, "learning_rate": 1.4357152598868478e-05, "loss": 1.6155, "step": 155 }, { "epoch": 0.8660351826792964, "grad_norm": 1.2109375, "learning_rate": 1.3925262631803722e-05, "loss": 1.6039, "step": 160 }, { "epoch": 0.8930987821380244, "grad_norm": 1.2265625, "learning_rate": 1.3484534319141592e-05, "loss": 1.608, "step": 165 }, { "epoch": 0.9201623815967523, "grad_norm": 1.1875, "learning_rate": 1.303596002993028e-05, "loss": 1.6036, "step": 170 }, { "epoch": 0.9472259810554804, "grad_norm": 1.2578125, "learning_rate": 1.2580549799667034e-05, "loss": 1.6157, "step": 175 }, { "epoch": 0.9742895805142084, "grad_norm": 1.2734375, "learning_rate": 1.2119329056044533e-05, "loss": 1.601, "step": 180 }, { "epoch": 0.9959404600811907, "eval_loss": 1.6971594095230103, "eval_runtime": 28.8094, "eval_samples_per_second": 15.099, "eval_steps_per_second": 1.909, "step": 184 }, { "epoch": 1.0013531799729365, "grad_norm": 1.21875, "learning_rate": 1.165333631003928e-05, "loss": 1.5923, "step": 185 }, { "epoch": 1.0284167794316643, "grad_norm": 1.2109375, "learning_rate": 1.1183620817540985e-05, "loss": 1.5652, "step": 190 }, { "epoch": 1.0554803788903924, "grad_norm": 1.2265625, "learning_rate": 1.0711240216788036e-05, "loss": 1.5483, "step": 195 }, { "epoch": 1.0825439783491204, "grad_norm": 1.2578125, "learning_rate": 1.0237258146928849e-05, "loss": 1.5504, "step": 200 }, { "epoch": 1.1096075778078485, "grad_norm": 1.21875, "learning_rate": 9.762741853071153e-06, "loss": 1.555, "step": 205 }, { "epoch": 1.1366711772665765, "grad_norm": 1.1796875, "learning_rate": 9.288759783211967e-06, "loss": 1.5659, "step": 210 }, { "epoch": 1.1637347767253043, "grad_norm": 1.2109375, "learning_rate": 8.81637918245902e-06, "loss": 1.5601, "step": 215 }, { "epoch": 1.1907983761840324, "grad_norm": 1.1640625, "learning_rate": 8.346663689960724e-06, "loss": 1.5516, "step": 220 }, { "epoch": 1.2178619756427604, "grad_norm": 1.171875, "learning_rate": 7.880670943955467e-06, "loss": 1.5542, "step": 225 }, { "epoch": 1.2449255751014885, "grad_norm": 1.203125, "learning_rate": 7.419450200332965e-06, "loss": 1.5491, "step": 230 }, { "epoch": 1.2719891745602165, "grad_norm": 1.1640625, "learning_rate": 6.964039970069722e-06, "loss": 1.5564, "step": 235 }, { "epoch": 1.2990527740189446, "grad_norm": 1.1796875, "learning_rate": 6.515465680858412e-06, "loss": 1.5584, "step": 240 }, { "epoch": 1.3261163734776726, "grad_norm": 1.21875, "learning_rate": 6.074737368196279e-06, "loss": 1.5534, "step": 245 }, { "epoch": 1.3531799729364005, "grad_norm": 1.2421875, "learning_rate": 5.642847401131526e-06, "loss": 1.5588, "step": 250 }, { "epoch": 1.3802435723951285, "grad_norm": 1.21875, "learning_rate": 5.220768247788458e-06, "loss": 1.552, "step": 255 }, { "epoch": 1.4073071718538566, "grad_norm": 1.1875, "learning_rate": 4.809450285702697e-06, "loss": 1.5462, "step": 260 }, { "epoch": 1.4343707713125846, "grad_norm": 1.203125, "learning_rate": 4.409819661896839e-06, "loss": 1.5623, "step": 265 }, { "epoch": 1.4614343707713127, "grad_norm": 1.2421875, "learning_rate": 4.022776207514885e-06, "loss": 1.5605, "step": 270 }, { "epoch": 1.4884979702300405, "grad_norm": 1.2265625, "learning_rate": 3.6491914117110405e-06, "loss": 1.5616, "step": 275 }, { "epoch": 1.5155615696887685, "grad_norm": 1.1875, "learning_rate": 3.2899064593549477e-06, "loss": 1.5578, "step": 280 }, { "epoch": 1.5426251691474966, "grad_norm": 1.1953125, "learning_rate": 2.945730336971767e-06, "loss": 1.5482, "step": 285 }, { "epoch": 1.5696887686062246, "grad_norm": 1.21875, "learning_rate": 2.6174380111819144e-06, "loss": 1.5559, "step": 290 }, { "epoch": 1.5967523680649527, "grad_norm": 1.21875, "learning_rate": 2.3057686837419246e-06, "loss": 1.5481, "step": 295 }, { "epoch": 1.6238159675236807, "grad_norm": 1.1640625, "learning_rate": 2.011424127115552e-06, "loss": 1.5412, "step": 300 }, { "epoch": 1.6508795669824088, "grad_norm": 1.234375, "learning_rate": 1.7350671043228072e-06, "loss": 1.5611, "step": 305 }, { "epoch": 1.6779431664411368, "grad_norm": 1.203125, "learning_rate": 1.4773198766248642e-06, "loss": 1.5669, "step": 310 }, { "epoch": 1.7050067658998647, "grad_norm": 1.2109375, "learning_rate": 1.2387628024050557e-06, "loss": 1.5515, "step": 315 }, { "epoch": 1.7320703653585927, "grad_norm": 1.2109375, "learning_rate": 1.0199330304007858e-06, "loss": 1.5633, "step": 320 }, { "epoch": 1.7591339648173205, "grad_norm": 1.234375, "learning_rate": 8.213232902287438e-07, "loss": 1.5542, "step": 325 }, { "epoch": 1.7861975642760486, "grad_norm": 1.2421875, "learning_rate": 6.433807829267491e-07, "loss": 1.5575, "step": 330 }, { "epoch": 1.8132611637347766, "grad_norm": 1.1953125, "learning_rate": 4.865061740103361e-07, "loss": 1.5532, "step": 335 }, { "epoch": 1.8403247631935047, "grad_norm": 1.21875, "learning_rate": 3.510526913114065e-07, "loss": 1.5686, "step": 340 }, { "epoch": 1.8673883626522327, "grad_norm": 1.1953125, "learning_rate": 2.3732532963024468e-07, "loss": 1.5484, "step": 345 }, { "epoch": 1.8944519621109608, "grad_norm": 1.1953125, "learning_rate": 1.4558016399181086e-07, "loss": 1.56, "step": 350 }, { "epoch": 1.9215155615696888, "grad_norm": 1.1953125, "learning_rate": 7.602377305258479e-08, "loss": 1.5479, "step": 355 }, { "epoch": 1.9485791610284169, "grad_norm": 1.1796875, "learning_rate": 2.8812773956256034e-08, "loss": 1.5456, "step": 360 }, { "epoch": 1.975642760487145, "grad_norm": 1.2265625, "learning_rate": 4.053469685617595e-09, "loss": 1.5526, "step": 365 }, { "epoch": 1.9918809201623815, "eval_loss": 1.697079062461853, "eval_runtime": 28.912, "eval_samples_per_second": 15.046, "eval_steps_per_second": 1.902, "step": 368 }, { "epoch": 1.9918809201623815, "step": 368, "total_flos": 1.4968483831454106e+17, "train_loss": 1.6339908747569374, "train_runtime": 3216.6871, "train_samples_per_second": 3.675, "train_steps_per_second": 0.114 } ], "logging_steps": 5, "max_steps": 368, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4968483831454106e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }