|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9918809201623815, |
|
"eval_steps": 500, |
|
"global_step": 368, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005412719891745603, |
|
"grad_norm": 23.5, |
|
"learning_rate": 5.405405405405406e-07, |
|
"loss": 2.2045, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02706359945872801, |
|
"grad_norm": 19.375, |
|
"learning_rate": 2.702702702702703e-06, |
|
"loss": 2.2384, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05412719891745602, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 2.1809, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08119079837618404, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 8.108108108108109e-06, |
|
"loss": 2.0942, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10825439783491204, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 1.0810810810810812e-05, |
|
"loss": 2.0225, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13531799729364005, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 1.3513513513513515e-05, |
|
"loss": 1.9498, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16238159675236807, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 1.6216216216216218e-05, |
|
"loss": 1.8531, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.18944519621109607, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 1.891891891891892e-05, |
|
"loss": 1.7903, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2165087956698241, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 1.9995946530314384e-05, |
|
"loss": 1.7538, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2435723951285521, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.9971187226043746e-05, |
|
"loss": 1.7264, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2706359945872801, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.9923976226947417e-05, |
|
"loss": 1.6995, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2976995940460081, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.985441983600819e-05, |
|
"loss": 1.6901, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.32476319350473615, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.9762674670369757e-05, |
|
"loss": 1.6784, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.35182679296346414, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.9648947308688594e-05, |
|
"loss": 1.6738, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.37889039242219213, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.9513493825989664e-05, |
|
"loss": 1.6719, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4059539918809202, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.9356619217073252e-05, |
|
"loss": 1.6617, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4330175913396482, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.917867670977126e-05, |
|
"loss": 1.6447, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.46008119079837617, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.8980066969599216e-05, |
|
"loss": 1.6337, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.4871447902571042, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.8761237197594945e-05, |
|
"loss": 1.6549, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5142083897158322, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.852268012337514e-05, |
|
"loss": 1.6334, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5412719891745602, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.8264932895677195e-05, |
|
"loss": 1.6276, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5683355886332883, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.798857587288445e-05, |
|
"loss": 1.6326, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5953991880920162, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.769423131625808e-05, |
|
"loss": 1.6334, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6224627875507442, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.738256198881809e-05, |
|
"loss": 1.6327, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6495263870094723, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 1.7054269663028232e-05, |
|
"loss": 1.6271, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6765899864682002, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.6710093540645056e-05, |
|
"loss": 1.6247, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7036535859269283, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.6350808588288964e-05, |
|
"loss": 1.6255, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7307171853856563, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.597722379248512e-05, |
|
"loss": 1.6155, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.7577807848443843, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.559018033810316e-05, |
|
"loss": 1.6162, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7848443843031123, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.5190549714297303e-05, |
|
"loss": 1.6081, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8119079837618404, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.4779231752211546e-05, |
|
"loss": 1.6031, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8389715832205683, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.4357152598868478e-05, |
|
"loss": 1.6155, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.8660351826792964, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.3925262631803722e-05, |
|
"loss": 1.6039, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8930987821380244, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.3484534319141592e-05, |
|
"loss": 1.608, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9201623815967523, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.303596002993028e-05, |
|
"loss": 1.6036, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9472259810554804, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.2580549799667034e-05, |
|
"loss": 1.6157, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.9742895805142084, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.2119329056044533e-05, |
|
"loss": 1.601, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9959404600811907, |
|
"eval_loss": 1.6971594095230103, |
|
"eval_runtime": 28.8094, |
|
"eval_samples_per_second": 15.099, |
|
"eval_steps_per_second": 1.909, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.0013531799729365, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.165333631003928e-05, |
|
"loss": 1.5923, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.0284167794316643, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.1183620817540985e-05, |
|
"loss": 1.5652, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0554803788903924, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.0711240216788036e-05, |
|
"loss": 1.5483, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.0825439783491204, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.0237258146928849e-05, |
|
"loss": 1.5504, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1096075778078485, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 9.762741853071153e-06, |
|
"loss": 1.555, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.1366711772665765, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 9.288759783211967e-06, |
|
"loss": 1.5659, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1637347767253043, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 8.81637918245902e-06, |
|
"loss": 1.5601, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.1907983761840324, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 8.346663689960724e-06, |
|
"loss": 1.5516, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.2178619756427604, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 7.880670943955467e-06, |
|
"loss": 1.5542, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.2449255751014885, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 7.419450200332965e-06, |
|
"loss": 1.5491, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2719891745602165, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 6.964039970069722e-06, |
|
"loss": 1.5564, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.2990527740189446, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 6.515465680858412e-06, |
|
"loss": 1.5584, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.3261163734776726, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 6.074737368196279e-06, |
|
"loss": 1.5534, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.3531799729364005, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 5.642847401131526e-06, |
|
"loss": 1.5588, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3802435723951285, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 5.220768247788458e-06, |
|
"loss": 1.552, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.4073071718538566, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.809450285702697e-06, |
|
"loss": 1.5462, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.4343707713125846, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.409819661896839e-06, |
|
"loss": 1.5623, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.4614343707713127, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.022776207514885e-06, |
|
"loss": 1.5605, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4884979702300405, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.6491914117110405e-06, |
|
"loss": 1.5616, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.5155615696887685, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.2899064593549477e-06, |
|
"loss": 1.5578, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.5426251691474966, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.945730336971767e-06, |
|
"loss": 1.5482, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.5696887686062246, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.6174380111819144e-06, |
|
"loss": 1.5559, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5967523680649527, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.3057686837419246e-06, |
|
"loss": 1.5481, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.6238159675236807, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.011424127115552e-06, |
|
"loss": 1.5412, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6508795669824088, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.7350671043228072e-06, |
|
"loss": 1.5611, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.6779431664411368, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.4773198766248642e-06, |
|
"loss": 1.5669, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.7050067658998647, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.2387628024050557e-06, |
|
"loss": 1.5515, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.7320703653585927, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.0199330304007858e-06, |
|
"loss": 1.5633, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.7591339648173205, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 8.213232902287438e-07, |
|
"loss": 1.5542, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.7861975642760486, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 6.433807829267491e-07, |
|
"loss": 1.5575, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.8132611637347766, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.865061740103361e-07, |
|
"loss": 1.5532, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.8403247631935047, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.510526913114065e-07, |
|
"loss": 1.5686, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.8673883626522327, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.3732532963024468e-07, |
|
"loss": 1.5484, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.8944519621109608, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.4558016399181086e-07, |
|
"loss": 1.56, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.9215155615696888, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 7.602377305258479e-08, |
|
"loss": 1.5479, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.9485791610284169, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.8812773956256034e-08, |
|
"loss": 1.5456, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.975642760487145, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.053469685617595e-09, |
|
"loss": 1.5526, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.9918809201623815, |
|
"eval_loss": 1.697079062461853, |
|
"eval_runtime": 28.912, |
|
"eval_samples_per_second": 15.046, |
|
"eval_steps_per_second": 1.902, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.9918809201623815, |
|
"step": 368, |
|
"total_flos": 1.4968483831454106e+17, |
|
"train_loss": 1.6339908747569374, |
|
"train_runtime": 3216.6871, |
|
"train_samples_per_second": 3.675, |
|
"train_steps_per_second": 0.114 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 368, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4968483831454106e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|