|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9996105008958479, |
|
"eval_steps": 25, |
|
"global_step": 401, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009971177066292747, |
|
"grad_norm": 23.35832977294922, |
|
"learning_rate": 1.9512195121951222e-05, |
|
"loss": 3.1378, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.019942354132585494, |
|
"grad_norm": 7.3288469314575195, |
|
"learning_rate": 3.9024390243902444e-05, |
|
"loss": 2.3678, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.029913531198878244, |
|
"grad_norm": 1.554034948348999, |
|
"learning_rate": 5.853658536585366e-05, |
|
"loss": 1.5701, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03988470826517099, |
|
"grad_norm": 0.8166317343711853, |
|
"learning_rate": 7.804878048780489e-05, |
|
"loss": 1.3583, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04985588533146374, |
|
"grad_norm": 0.7018758058547974, |
|
"learning_rate": 9.75609756097561e-05, |
|
"loss": 1.3179, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05982706239775649, |
|
"grad_norm": 0.5763296484947205, |
|
"learning_rate": 0.00011707317073170732, |
|
"loss": 1.27, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.062319856664329674, |
|
"eval_loss": 1.2523891925811768, |
|
"eval_runtime": 160.09, |
|
"eval_samples_per_second": 25.586, |
|
"eval_steps_per_second": 1.599, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06979823946404923, |
|
"grad_norm": 0.5097940564155579, |
|
"learning_rate": 0.00013658536585365856, |
|
"loss": 1.2733, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07976941653034197, |
|
"grad_norm": 0.48720231652259827, |
|
"learning_rate": 0.00015609756097560978, |
|
"loss": 1.3002, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08974059359663472, |
|
"grad_norm": 0.44958242774009705, |
|
"learning_rate": 0.000175609756097561, |
|
"loss": 1.2172, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09971177066292748, |
|
"grad_norm": 0.459570974111557, |
|
"learning_rate": 0.0001951219512195122, |
|
"loss": 1.1891, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10968294772922023, |
|
"grad_norm": 0.4571148753166199, |
|
"learning_rate": 0.00019833333333333335, |
|
"loss": 1.2378, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11965412479551298, |
|
"grad_norm": 0.4622347056865692, |
|
"learning_rate": 0.00019611111111111112, |
|
"loss": 1.1931, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12463971332865935, |
|
"eval_loss": 1.18223237991333, |
|
"eval_runtime": 159.6761, |
|
"eval_samples_per_second": 25.652, |
|
"eval_steps_per_second": 1.603, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1296253018618057, |
|
"grad_norm": 0.47264593839645386, |
|
"learning_rate": 0.0001938888888888889, |
|
"loss": 1.1967, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13959647892809846, |
|
"grad_norm": 0.443099707365036, |
|
"learning_rate": 0.00019166666666666667, |
|
"loss": 1.1844, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1495676559943912, |
|
"grad_norm": 0.4309901297092438, |
|
"learning_rate": 0.00018944444444444445, |
|
"loss": 1.1734, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15953883306068395, |
|
"grad_norm": 0.4467671513557434, |
|
"learning_rate": 0.00018722222222222222, |
|
"loss": 1.1783, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1695100101269767, |
|
"grad_norm": 0.4321148991584778, |
|
"learning_rate": 0.00018500000000000002, |
|
"loss": 1.1625, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.17948118719326944, |
|
"grad_norm": 0.4267221987247467, |
|
"learning_rate": 0.00018277777777777777, |
|
"loss": 1.1571, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.18695956999298902, |
|
"eval_loss": 1.15470552444458, |
|
"eval_runtime": 159.6542, |
|
"eval_samples_per_second": 25.655, |
|
"eval_steps_per_second": 1.603, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1894523642595622, |
|
"grad_norm": 0.44214680790901184, |
|
"learning_rate": 0.00018055555555555557, |
|
"loss": 1.1463, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.19942354132585496, |
|
"grad_norm": 0.44037795066833496, |
|
"learning_rate": 0.00017833333333333335, |
|
"loss": 1.1477, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2093947183921477, |
|
"grad_norm": 0.4275970160961151, |
|
"learning_rate": 0.00017611111111111112, |
|
"loss": 1.1642, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.21936589545844046, |
|
"grad_norm": 0.5124487280845642, |
|
"learning_rate": 0.0001738888888888889, |
|
"loss": 1.1789, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2293370725247332, |
|
"grad_norm": 0.4456328749656677, |
|
"learning_rate": 0.00017166666666666667, |
|
"loss": 1.1466, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.23930824959102595, |
|
"grad_norm": 0.41848745942115784, |
|
"learning_rate": 0.00016944444444444445, |
|
"loss": 1.1576, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2492794266573187, |
|
"grad_norm": 0.41770660877227783, |
|
"learning_rate": 0.00016722222222222222, |
|
"loss": 1.1543, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2492794266573187, |
|
"eval_loss": 1.1412527561187744, |
|
"eval_runtime": 159.6274, |
|
"eval_samples_per_second": 25.66, |
|
"eval_steps_per_second": 1.604, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2592506037236114, |
|
"grad_norm": 0.4416305720806122, |
|
"learning_rate": 0.000165, |
|
"loss": 1.1251, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.26922178078990416, |
|
"grad_norm": 0.4185228943824768, |
|
"learning_rate": 0.00016277777777777777, |
|
"loss": 1.1442, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2791929578561969, |
|
"grad_norm": 0.4267406463623047, |
|
"learning_rate": 0.00016055555555555558, |
|
"loss": 1.1438, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.28916413492248966, |
|
"grad_norm": 0.414470374584198, |
|
"learning_rate": 0.00015833333333333332, |
|
"loss": 1.1572, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2991353119887824, |
|
"grad_norm": 0.41629141569137573, |
|
"learning_rate": 0.00015611111111111113, |
|
"loss": 1.1439, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.30910648905507515, |
|
"grad_norm": 0.4665207862854004, |
|
"learning_rate": 0.0001538888888888889, |
|
"loss": 1.1336, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3115992833216484, |
|
"eval_loss": 1.1312440633773804, |
|
"eval_runtime": 159.6919, |
|
"eval_samples_per_second": 25.649, |
|
"eval_steps_per_second": 1.603, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3190776661213679, |
|
"grad_norm": 0.44851216673851013, |
|
"learning_rate": 0.00015166666666666668, |
|
"loss": 1.1398, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.32904884318766064, |
|
"grad_norm": 0.46069833636283875, |
|
"learning_rate": 0.00014944444444444445, |
|
"loss": 1.1375, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3390200202539534, |
|
"grad_norm": 0.4321020543575287, |
|
"learning_rate": 0.00014722222222222223, |
|
"loss": 1.143, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.34899119732024614, |
|
"grad_norm": 0.41148263216018677, |
|
"learning_rate": 0.000145, |
|
"loss": 1.1265, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3589623743865389, |
|
"grad_norm": 0.4245862364768982, |
|
"learning_rate": 0.00014277777777777778, |
|
"loss": 1.1034, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.36893355145283163, |
|
"grad_norm": 0.48434320092201233, |
|
"learning_rate": 0.00014055555555555555, |
|
"loss": 1.1183, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.37391913998597803, |
|
"eval_loss": 1.1238796710968018, |
|
"eval_runtime": 159.6817, |
|
"eval_samples_per_second": 25.651, |
|
"eval_steps_per_second": 1.603, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3789047285191244, |
|
"grad_norm": 0.40798208117485046, |
|
"learning_rate": 0.00013833333333333333, |
|
"loss": 1.131, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3888759055854172, |
|
"grad_norm": 0.4265343248844147, |
|
"learning_rate": 0.00013611111111111113, |
|
"loss": 1.1486, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3988470826517099, |
|
"grad_norm": 0.4202587604522705, |
|
"learning_rate": 0.00013388888888888888, |
|
"loss": 1.1156, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4088182597180027, |
|
"grad_norm": 0.4195230305194855, |
|
"learning_rate": 0.00013166666666666668, |
|
"loss": 1.1128, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4187894367842954, |
|
"grad_norm": 0.44741353392601013, |
|
"learning_rate": 0.00012944444444444445, |
|
"loss": 1.1235, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.42876061385058817, |
|
"grad_norm": 0.4431656002998352, |
|
"learning_rate": 0.00012722222222222223, |
|
"loss": 1.1008, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4362389966503077, |
|
"eval_loss": 1.117403268814087, |
|
"eval_runtime": 159.8175, |
|
"eval_samples_per_second": 25.629, |
|
"eval_steps_per_second": 1.602, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4387317909168809, |
|
"grad_norm": 0.40237733721733093, |
|
"learning_rate": 0.000125, |
|
"loss": 1.1171, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.44870296798317366, |
|
"grad_norm": 0.42873087525367737, |
|
"learning_rate": 0.0001227777777777778, |
|
"loss": 1.1132, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4586741450494664, |
|
"grad_norm": 0.42686423659324646, |
|
"learning_rate": 0.00012055555555555555, |
|
"loss": 1.1311, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.46864532211575916, |
|
"grad_norm": 0.4393009543418884, |
|
"learning_rate": 0.00011833333333333334, |
|
"loss": 1.1288, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4786164991820519, |
|
"grad_norm": 0.4396825432777405, |
|
"learning_rate": 0.00011611111111111113, |
|
"loss": 1.0963, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.48858767624834465, |
|
"grad_norm": 0.4490801990032196, |
|
"learning_rate": 0.00011388888888888889, |
|
"loss": 1.116, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4985588533146374, |
|
"grad_norm": 0.4502253532409668, |
|
"learning_rate": 0.00011166666666666668, |
|
"loss": 1.1341, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4985588533146374, |
|
"eval_loss": 1.1130955219268799, |
|
"eval_runtime": 159.9517, |
|
"eval_samples_per_second": 25.608, |
|
"eval_steps_per_second": 1.6, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5085300303809301, |
|
"grad_norm": 0.4383712410926819, |
|
"learning_rate": 0.00010944444444444445, |
|
"loss": 1.1205, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5185012074472228, |
|
"grad_norm": 0.48988428711891174, |
|
"learning_rate": 0.00010722222222222223, |
|
"loss": 1.1409, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5284723845135156, |
|
"grad_norm": 0.4293093979358673, |
|
"learning_rate": 0.000105, |
|
"loss": 1.1176, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5384435615798083, |
|
"grad_norm": 0.45622044801712036, |
|
"learning_rate": 0.00010277777777777778, |
|
"loss": 1.1001, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5484147386461011, |
|
"grad_norm": 0.4486154615879059, |
|
"learning_rate": 0.00010055555555555555, |
|
"loss": 1.0903, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5583859157123938, |
|
"grad_norm": 0.43276774883270264, |
|
"learning_rate": 9.833333333333333e-05, |
|
"loss": 1.1058, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.560878709978967, |
|
"eval_loss": 1.1073756217956543, |
|
"eval_runtime": 159.9944, |
|
"eval_samples_per_second": 25.601, |
|
"eval_steps_per_second": 1.6, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5683570927786866, |
|
"grad_norm": 0.417212575674057, |
|
"learning_rate": 9.611111111111112e-05, |
|
"loss": 1.104, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5783282698449793, |
|
"grad_norm": 0.4814886152744293, |
|
"learning_rate": 9.388888888888889e-05, |
|
"loss": 1.1131, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5882994469112721, |
|
"grad_norm": 0.46115347743034363, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 1.1163, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5982706239775648, |
|
"grad_norm": 0.4268442690372467, |
|
"learning_rate": 8.944444444444446e-05, |
|
"loss": 1.1109, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6082418010438576, |
|
"grad_norm": 0.43017688393592834, |
|
"learning_rate": 8.722222222222223e-05, |
|
"loss": 1.0922, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6182129781101503, |
|
"grad_norm": 0.4273425340652466, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.105, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6231985666432968, |
|
"eval_loss": 1.1038172245025635, |
|
"eval_runtime": 160.1646, |
|
"eval_samples_per_second": 25.574, |
|
"eval_steps_per_second": 1.598, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6281841551764431, |
|
"grad_norm": 0.4336010813713074, |
|
"learning_rate": 8.277777777777778e-05, |
|
"loss": 1.106, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6381553322427358, |
|
"grad_norm": 0.4563632309436798, |
|
"learning_rate": 8.055555555555556e-05, |
|
"loss": 1.1366, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6481265093090286, |
|
"grad_norm": 0.4345051944255829, |
|
"learning_rate": 7.833333333333333e-05, |
|
"loss": 1.1195, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6580976863753213, |
|
"grad_norm": 0.4438092112541199, |
|
"learning_rate": 7.61111111111111e-05, |
|
"loss": 1.1209, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6680688634416141, |
|
"grad_norm": 0.4323583245277405, |
|
"learning_rate": 7.38888888888889e-05, |
|
"loss": 1.0757, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6780400405079068, |
|
"grad_norm": 0.48928600549697876, |
|
"learning_rate": 7.166666666666667e-05, |
|
"loss": 1.0916, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6855184233076264, |
|
"eval_loss": 1.100623607635498, |
|
"eval_runtime": 159.5136, |
|
"eval_samples_per_second": 25.678, |
|
"eval_steps_per_second": 1.605, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6880112175741996, |
|
"grad_norm": 0.4694391191005707, |
|
"learning_rate": 6.944444444444444e-05, |
|
"loss": 1.0973, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6979823946404923, |
|
"grad_norm": 0.4758254289627075, |
|
"learning_rate": 6.722222222222223e-05, |
|
"loss": 1.0915, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7079535717067851, |
|
"grad_norm": 0.44181302189826965, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.0975, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7179247487730778, |
|
"grad_norm": 0.43584778904914856, |
|
"learning_rate": 6.277777777777778e-05, |
|
"loss": 1.1253, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7278959258393706, |
|
"grad_norm": 0.45839089155197144, |
|
"learning_rate": 6.055555555555555e-05, |
|
"loss": 1.1588, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7378671029056633, |
|
"grad_norm": 0.468307763338089, |
|
"learning_rate": 5.833333333333334e-05, |
|
"loss": 1.0903, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7478382799719561, |
|
"grad_norm": 0.43359532952308655, |
|
"learning_rate": 5.6111111111111114e-05, |
|
"loss": 1.0861, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7478382799719561, |
|
"eval_loss": 1.096602201461792, |
|
"eval_runtime": 159.5808, |
|
"eval_samples_per_second": 25.667, |
|
"eval_steps_per_second": 1.604, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7578094570382488, |
|
"grad_norm": 0.4408925771713257, |
|
"learning_rate": 5.388888888888889e-05, |
|
"loss": 1.1312, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7677806341045416, |
|
"grad_norm": 0.441047728061676, |
|
"learning_rate": 5.166666666666667e-05, |
|
"loss": 1.1003, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7777518111708344, |
|
"grad_norm": 0.4418092966079712, |
|
"learning_rate": 4.9444444444444446e-05, |
|
"loss": 1.0769, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.787722988237127, |
|
"grad_norm": 0.43967655301094055, |
|
"learning_rate": 4.722222222222222e-05, |
|
"loss": 1.1025, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7976941653034199, |
|
"grad_norm": 0.4572228193283081, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.089, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8076653423697125, |
|
"grad_norm": 0.4731937050819397, |
|
"learning_rate": 4.277777777777778e-05, |
|
"loss": 1.0935, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8101581366362858, |
|
"eval_loss": 1.0938494205474854, |
|
"eval_runtime": 159.6842, |
|
"eval_samples_per_second": 25.651, |
|
"eval_steps_per_second": 1.603, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8176365194360053, |
|
"grad_norm": 0.4658983051776886, |
|
"learning_rate": 4.055555555555556e-05, |
|
"loss": 1.1127, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.827607696502298, |
|
"grad_norm": 0.4431600272655487, |
|
"learning_rate": 3.8333333333333334e-05, |
|
"loss": 1.1115, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8375788735685908, |
|
"grad_norm": 0.531471848487854, |
|
"learning_rate": 3.611111111111111e-05, |
|
"loss": 1.0852, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8475500506348835, |
|
"grad_norm": 0.43536490201950073, |
|
"learning_rate": 3.388888888888889e-05, |
|
"loss": 1.0834, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8575212277011763, |
|
"grad_norm": 0.4632166624069214, |
|
"learning_rate": 3.1666666666666666e-05, |
|
"loss": 1.1029, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.867492404767469, |
|
"grad_norm": 0.46226751804351807, |
|
"learning_rate": 2.9444444444444448e-05, |
|
"loss": 1.0967, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8724779933006154, |
|
"eval_loss": 1.0916736125946045, |
|
"eval_runtime": 159.6244, |
|
"eval_samples_per_second": 25.66, |
|
"eval_steps_per_second": 1.604, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8774635818337618, |
|
"grad_norm": 0.46160823106765747, |
|
"learning_rate": 2.7222222222222223e-05, |
|
"loss": 1.0907, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8874347589000545, |
|
"grad_norm": 0.4519442915916443, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.1106, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8974059359663473, |
|
"grad_norm": 0.43158119916915894, |
|
"learning_rate": 2.277777777777778e-05, |
|
"loss": 1.0957, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.90737711303264, |
|
"grad_norm": 0.4569935202598572, |
|
"learning_rate": 2.0555555555555555e-05, |
|
"loss": 1.1244, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9173482900989328, |
|
"grad_norm": 0.44798123836517334, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 1.0899, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9273194671652255, |
|
"grad_norm": 0.459721177816391, |
|
"learning_rate": 1.6111111111111115e-05, |
|
"loss": 1.1039, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9347978499649451, |
|
"eval_loss": 1.0899683237075806, |
|
"eval_runtime": 159.8757, |
|
"eval_samples_per_second": 25.62, |
|
"eval_steps_per_second": 1.601, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9372906442315183, |
|
"grad_norm": 0.4426051080226898, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 1.0921, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.947261821297811, |
|
"grad_norm": 0.4699947237968445, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 1.1004, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9572329983641038, |
|
"grad_norm": 0.4296768307685852, |
|
"learning_rate": 9.444444444444445e-06, |
|
"loss": 1.09, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9672041754303965, |
|
"grad_norm": 0.46838051080703735, |
|
"learning_rate": 7.222222222222222e-06, |
|
"loss": 1.1073, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9771753524966893, |
|
"grad_norm": 0.46618735790252686, |
|
"learning_rate": 5e-06, |
|
"loss": 1.1122, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.987146529562982, |
|
"grad_norm": 0.4417199492454529, |
|
"learning_rate": 2.777777777777778e-06, |
|
"loss": 1.0874, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9971177066292748, |
|
"grad_norm": 0.4636310040950775, |
|
"learning_rate": 5.555555555555556e-07, |
|
"loss": 1.1026, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9971177066292748, |
|
"eval_loss": 1.0892398357391357, |
|
"eval_runtime": 159.6704, |
|
"eval_samples_per_second": 25.653, |
|
"eval_steps_per_second": 1.603, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 401, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 512, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.4365963772771697e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|